// ariarm.d (c) Copyright 1994, 1997 P.J.Burwood
// little-endian modifications (c) Copyright 1996 B. Haible
// external routines for arilev1.d
// Processor: ARM in APCS mode
// Assembler-Syntax: ObjAsm under RISC OS, GAS otherwise
// Assumptions: intCsize=32, intDsize=32.
// Parameter passing conventions: APCS means that registers a1-a4 and ip
//   do not have to be preserved across function calls.
// Note: A sequence of up to 4 conditional instructions is used in preference
//   to a branch.


#ifdef __riscos

// ObjAsm syntax
// APCS register names are bound to register numbers with RN.
// a1-a4 and ip need not be preserved across calls (see the file header);
// the routines in this file save v1-v6 and lr on the stack before using them.

a1      RN      0
a2      RN      1
a3      RN      2
a4      RN      3
v1      RN      4
v2      RN      5
v3      RN      6
v4      RN      7
v5      RN      8
v6      RN      9
sl      RN      10
fp      RN      11
ip      RN      12
sp      RN      13
lr      RN      14
pc      RN      15

// Floating point register names, bound with FN.
f0      FN      0
f1      FN      1
f2      FN      2
f3      FN      3
f4      FN      4
f5      FN      5
f6      FN      6
f7      FN      7

// Symbol helper macros used by every routine below:
//   C(x)                 name under which routine x is called from this file
//   EXPORT(x)            export directive for routine x
//   DECLARE_FUNCTION(x)  expands to nothing here
//   GLABEL(x)            entry label of routine x
//   LABEL(x)             label inside a routine
// NOTE(review): in this ObjAsm branch EXPORT(x) names the bare symbol x while
// GLABEL(x) and C(x) produce _x, and LABEL(x) produces _x while the branch
// instructions in this file reference the bare name x. The GAS branch below
// is self-consistent. Confirm against a real ObjAsm build before relying on
// this branch.
#define C(x) _##x
#define EXPORT(x) EXPORT x
#define DECLARE_FUNCTION(x)
#define GLABEL(x) _##x
#define LABEL(x) _##x

        AREA    |C$$code|,CODE,READONLY

#else

// GAS syntax
// The same APCS names are bound to r0-r15 with .req.

a1      .req    r0
a2      .req    r1
a3      .req    r2
a4      .req    r3
v1      .req    r4
v2      .req    r5
v3      .req    r6
v4      .req    r7
v5      .req    r8
v6      .req    r9
// rfp is a second name for r9 (same register as v6).
rfp     .req    r9
sl      .req    r10
fp      .req    r11
ip      .req    r12
sp      .req    r13
lr      .req    r14
pc      .req    r15

// Symbol helper macros (same roles as in the ObjAsm branch):
//   C(x)                 _x, the name used to call routine x
//   EXPORT(x)            makes _x a global symbol
//   DECLARE_FUNCTION(x)  marks _x as a function on NetBSD, nothing elsewhere
//   GLABEL(x)            defines the entry label _x
//   LABEL(x)             defines the routine-internal label x (no prefix)
//   RRX                  spelled in lower case for GAS
//   END                  expands to nothing
#define C(x) _##x
#define EXPORT(x) .global _##x
#if defined(__NetBSD__)
#define DECLARE_FUNCTION(x) .type _##x,%function
#else
#define DECLARE_FUNCTION(x)
#endif
#define GLABEL(x) _##x##:
#define LABEL(x) x##:
#define RRX rrx
#define END

#endif


// HAVE_umull selects the single-instruction UMULL path in mulu32_; it is
// defined when one of the CPU macros tested here is set.
#if defined(__arm7m__) || defined(__arm8__) || defined(__arm9__) || defined(__strongarm__)
  // ARM7M and later have 32x32 -> 64 multiplies which execute in 2-4 clocks.
  #define HAVE_umull
#endif


// Secondary results (high product word, division remainders).
// The "&& 0" disables the first branch, so the #else branch is always taken
// and all three macros below are always defined.
#if defined(__GNUC__) && 0
  // With GNU C, we would like to pass the second return value in a2, don't
  // need a global variable. Unfortunately, the current Acorn gcc crashes if
  // we declare an appropriate local register variable with __asm__.
  // It would be possible to declare the functions as returning a 64-bit
  // result, but given the quality of gcc code dealing with 64-bit entities
  // and the subtleties of 64-bit return values (passed in register or in
  // memory?) we now let it be.
#else
  // Use three global variables.
  // Each macro enables, in the routine concerned, the store of the secondary
  // result through the matching ptr_* word defined further down:
  //   MULU32_HIGH   -> mulu32_high   (written by mulu32_)
  //   DIVU_16_REST  -> divu_16_rest  (written by divu_3216_1616_)
  //   DIVU_32_REST  -> divu_32_rest  (written by divu_6432_3232_)
  #define MULU32_HIGH
  #define DIVU_16_REST
  #define DIVU_32_REST
#endif

// Address words for the global result variables.
// Each ptr_* label holds the address of an external variable. The routines
// below fetch that address with a pc-relative LDR (offset ptr_xxx-.-8) and
// then store the secondary result through it.
#ifdef __riscos

// ObjAsm: IMPORT declares the external variable, DCD emits its address.
#ifdef MULU32_HIGH
ptr_mulu32_high
        IMPORT  mulu32_high
        DCD     mulu32_high
#endif
#ifdef DIVU_16_REST
ptr_divu_16_rest
        IMPORT  divu_16_rest
        DCD     divu_16_rest
#endif
#ifdef DIVU_32_REST
ptr_divu_32_rest
        IMPORT  divu_32_rest
        DCD     divu_32_rest
#endif

#else

// GAS: the variables are referenced with a leading underscore.
// NOTE(review): the GAS ARM directive documentation describes .align 0 as
// padding to a four byte boundary on ARM - confirm for the assembler in use.
#ifdef MULU32_HIGH
ptr_mulu32_high:
        .word   _mulu32_high
        .align  0
#endif
#ifdef DIVU_16_REST
ptr_divu_16_rest:
        .word   _divu_16_rest
        .align  0
#endif
#ifdef DIVU_32_REST
ptr_divu_32_rest:
        .word   _divu_32_rest
        .align  0
#endif

#endif


// extern uint32 mulu32_ (uint32 x, uint32 y);
//       Unsigned 32x32 -> 64 bit multiplication.
//       entry
//               a1 = x
//               a2 = y
//       exit
//               a1 = low32(x*y)
//               a2 = high32(x*y)
//               mulu32_high = high32(x*y)   (only if MULU32_HIGH is defined)
//               a3,a4,ip destroyed
//       Without HAVE_umull the product is assembled from the four 16x16
//       partial products of the 16-bit halves of x and y.
        EXPORT(mulu32_)
        DECLARE_FUNCTION(mulu32_)
GLABEL(mulu32_)
#ifdef HAVE_umull
        MOV     a3,a2                   // free a2 so it can receive the high word
        UMULL   a1,a2,a3,a1             // a2:a1 := a3 * a1
#else
        MOV     ip,a1,LSR #16           // temp := top half of x
        MOV     a3,a2,LSR #16           // hi := top half of y
        BIC     a1,a1,ip,LSL #16        // x  := bottom half of x
        BIC     a2,a2,a3,LSL #16        // y  := bottom half of y
        MUL     a4,a1,a2                // low section of result
        MUL     a2,ip,a2                // ) middle sections
        MUL     a1,a3,a1                // )   of result
        MUL     a3,ip,a3                // high section of result
        ADDS    a2,a2,a1                // add middle sections
                                        // (can't use mla as we need carry)
        ADDCS   a3,a3,#0x10000          // carry from above add: the middle sum
                                        // is weighted 2^16, so its carry out
                                        // lands on bit 16 of the high word
        ADDS    a1,a4,a2,LSL #16        // x is now bottom 32 bits of result
        ADC     a2,a3,a2,LSR #16        // hi is top 32 bits
#endif
#ifdef MULU32_HIGH
        LDR     a3,[pc,#ptr_mulu32_high-.-8]    // a3 := address of mulu32_high
                                        // (pc reads as this instruction + 8)
        STR     a2,[a3,#0]              // mulu32_high := high word
#endif
        MOVS    pc,lr                   // return

// extern uint16 divu_3216_1616_ (uint32 x, uint16 y);
//       Unsigned division of a 32-bit dividend by a 16-bit divisor, done as
//       16 fully unrolled conditional-subtract steps.
//       NOTE(review): presumably requires high16(x) < y so that the quotient
//       fits in 16 bits - confirm against cl_low_div.cc.
//       entry
//               a1 = x
//               a2 = y
//       exit
//               a1 = q = floor(x/y)
//               a2 = r = x-q*y
//               divu_16_rest = r = x-q*y   (only if DIVU_16_REST is defined)
//               a3 destroyed
        EXPORT(divu_3216_1616_)
        DECLARE_FUNCTION(divu_3216_1616_)
GLABEL(divu_3216_1616_)
        // see cl_low_div.cc for algorithm
        // in that notation: a1 = r, a2 = -s.
        // Each step adds the negated, shifted divisor; the carry out of that
        // addition is the quotient bit. When the carry is clear the addition
        // is undone (SUBCC), and ADCS shifts the carry into bit 0 of a1 while
        // starting the next step.
        MOV     a2,a2,LSL#15            // multiply divisor by 2^15
        RSB     a2,a2,#0                // negate divisor
        ADDS    a1,a2,a1                // dividend = dividend + -divisor/2
        SUBCC   a1,a1,a2                // dividend = dividend - -divisor/2
        ADCS    a1,a2,a1,LSL#1          // dividend = dividend*2 + -divisor
                                        // and shift quotient
        SUBCC   a1,a1,a2                // do this another 14 times
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1
        SUBCC   a1,a1,a2                // do the last conditional subtraction
        MOV     a2,a1,LSR#15            // move remainder into a2 and shift
        ADC     a1,a1,a1                // move last bit of quotient in
        MOV     a1,a1,LSL#16            // AND out top 16 bits by shifting up
        MOV     a1,a1,LSR#16            // and back down again
#ifdef DIVU_16_REST
        LDR     a3,[pc,#ptr_divu_16_rest-.-8]   // save rest so can be picked up later
        STR     a2,[a3,#0]              // the result is 16 bits
#endif
        MOVS    pc, lr                  // return

// extern uint32 divu_6432_3232_ (uint32 xhi, uint32 xlo, uint32 y); // -> Quotient q
// extern uint32 divu_32_rest;                                       // -> Rest r
//       Unsigned division of the 64-bit value xhi*2^32+xlo by y, built on
//       two calls to divu_3216_1616_.
//       see cl_low_div.cc for algorithm
//       NOTE(review): presumably requires xhi < y so that the quotient fits
//       in 32 bits - confirm against cl_low_div.cc.
//       entry
//               a1 = xhi (dividend)
//               a2 = xlo (dividend)
//               a3 = y (divisor)
//       exit
//               a1 = 32 bit quotient
//               a2 = 32 bit remainder
//               divu_32_rest = remainder   (only if DIVU_32_REST is defined)
//               a3, a4 destroyed
//       register use
//               v1 = y (normalised in the long path), v2 = xlo
//               v3 = q1 (short path) or shift count s (long path)
//               v4 = r16 / r, v5 = y1_1, v6 = q1 (long path)
        EXPORT(divu_6432_3232_)
        DECLARE_FUNCTION(divu_6432_3232_)
GLABEL(divu_6432_3232_)
        STMFD   sp!, {v1,v2,v3,v4,v5,v6,lr}
        MOV     v2, a2                  // = xlo
        MOV     v1, a3                  // = y
        CMP     a3,#0x10000             // y <= (uint32)(bit(16)-1)
        BCS     divu_6432_3232_l1       // no: y needs more than 16 bits
        // Short path: y fits in 16 bits, so two 32/16 divisions suffice.
        MOV     a2, v2, LSR #16
        ORR     a1, a2, a1, ASL #16     // = highlow32(low16(xhi),high16(xlo))
        MOV     a2, v1
        BL      C(divu_3216_1616_)
        MOV     v3, a1                  // = q1
        MOV     a1, v2, ASL #16
        MOV     a1, a1, LSR #16
        ORR     a1, a1, a2, ASL #16     // = highlow32(r1,low16(xlo))
        MOV     a2, v1
        BL      C(divu_3216_1616_)
        ORR     a1, a1, v3, ASL #16     // = highlow32(q1,q0)
#ifdef DIVU_32_REST
        LDR     a4,[pc,#ptr_divu_32_rest-.-8]
        STR     a2,[a4,#0]              // divu_32_rest = remainder
#endif
        LDMFD   sp!, {v1,v2,v3,v4,v5,v6,pc}^

LABEL(divu_6432_3232_l1)
        // Long path. First normalise y so that its top bit is set, counting
        // the shift in s. The loop of the C algorithm is unrolled into
        // conditional shifts by 16, 8, 4, 2 and 1.
        MOV     v3, #0                  // s = 0
        MOVS    a4, v1, LSR #16         // while ((sint32)y >= 0)
        ADDEQ   v3, v3, #16             //   { y = y<<1; s++; }
        MOVEQ   v1, v1, ASL #16
        MOVS    a4, v1, LSR #24
        ADDEQ   v3, v3, #8
        MOVEQ   v1, v1, ASL #8
        MOVS    a4, v1, LSR #28
        ADDEQ   v3, v3, #4
        MOVEQ   v1, v1, ASL #4
        MOVS    a4, v1, LSR #30
        ADDEQ   v3, v3, #2
        MOVEQ   v1, v1, ASL #2
        MOVS    a4, v1, LSR #31
        ADDEQ   v3, v3, #1
        MOVEQ   v1, v1, ASL #1

        CMPS    v3, #0
        MOVNE   a2, a1, ASL v3          // if (!(s==0))
        RSBNE   a1, v3, #32             //   { xhi = (xhi << s)
        ORRNE   a1, a2, v2, LSR a1      //         | (xlo >> (32-s));
        MOVNE   v2, v2, ASL v3          //     xlo = xlo << s; }
        ADD     a2, v1, #0x10000        // y1_1 = high16(y)+1
        MOVS    v5, a2, LSR #16         // if (y1_1 == 0)
        MOVEQ   v4, a1, ASL #16         // r16 = low16(xhi) * 2^16
        MOVEQ   a1, a1, LSR #16         // q1 = high16(xhi)
        MOVNE   a2, v5
        BLNE    C(divu_3216_1616_)      // divu_3216_1616(xhi,y1_1, q1=,r16=)
        // NOTE(review): the MOVNE below tests the flags set by the MOVS
        // above, so it relies on divu_3216_1616_ returning with the caller's
        // flags restored (its MOVS pc,lr return in 26-bit APCS mode).
        // Confirm for the target mode.
        MOVNE   v4, a2, ASL #16         // r16 = r16 * 2^16
        ORR     v4, v4, v2, LSR #16     // r = highlow32(r16,high16(xlo))
        MOV     a4, v1, ASL #16         // tmp = mulu16(low16(y),q1)
        MOV     a4, a4, LSR #16
        MUL     a3, a4, a1
        RSB     a3, a3, a1, ASL #16     // r2 = highlow32_0(q1) - tmp
        MOV     v6, a1                  // = q1
        ADDS    a1, v4, a3              // r += r2
        ADDCS   v6, v6, #1              // if ( r < r2 ) { q1 += 1
        SUBCS   a1, a1, v1              //                 r -= y }
        CMP     a1, v1                  // if (r >= y)
        ADDCS   v6, v6, #1              //     { q1 += 1
        SUBCS   a1, a1, v1              //       r -= y }
        CMP     v5, #0                  // if (y1_1 == 0)
        MOVEQ   v4, a1, ASL #16         //    { r16 = low16(r) * 2^16
        MOVEQ   a1, a1, LSR #16         //      q0  = high16(r) }
        MOVNE   a2, v5
        BLNE    C(divu_3216_1616_)      // divu_3216_1616(r,y1_1, q0=,r16=)
        // Same flag dependency across the call as noted above.
        MOVNE   v4, a2, ASL #16         // r16 = r16 * 2^16
        MOV     v2, v2, ASL #16
        ORR     v4, v4, v2, LSR #16     // r = highlow32(r16,low16(xlo))
        MOV     a4, v1, ASL #16         // tmp = mulu16(low16(y),q0)
        MOV     a4, a4, LSR #16
        MUL     a3, a4, a1
        RSB     a3, a3, a1, ASL #16     // r2 = highlow32_0(q0) - tmp
        ADDS    v4, v4, a3              // r += r2
        ADDCS   a1, a1, #1              // if ( r < r2 ) { q0 += 1
        SUBCS   v4, v4, v1              //                 r -= y }
        CMP     v4, v1                  // if (r >= y)
        ADDCS   a1, a1, #1              //     { q0 += 1
        SUBCS   v4, v4, v1              //       r -= y }
        MOV     a2, v4, LSR v3          // remainder = r >> s
        ORR     a1, a1, v6, ASL #16     // return highlow32(q1,q0)
#ifdef DIVU_32_REST
        LDR     a3,[pc,#ptr_divu_32_rest-.-8]
        STR     a2,[a3,#0]              // divu_32_rest = remainder
#endif
        LDMFD   sp!, {v1,v2,v3,v4,v5,v6,pc}^

// extern uintD* copy_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
//       Copies count words from sourceptr to destptr, walking upwards.
//       The count mod 4 leftover words go first, one at a time; the rest
//       is moved in groups of four words.
//       entry
//               a1 = source pointer
//               a2 = destination pointer
//               a3 = count of words to store
//       exit
//               a1 = address of last word stored + 1
//               a2 - a4, ip destroyed
        EXPORT(copy_loop_up)            // word aligned copy loop up
        DECLARE_FUNCTION(copy_loop_up)
GLABEL(copy_loop_up)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     copy_loop_up_l1         // none, go to the group copy
        LDR     ip,[a1],#4              // leftover word 1 is always present
        STR     ip,[a2],#4
        CMP     a4,#2                   // GE: 2 or 3 leftovers, GT: exactly 3
        LDRGE   ip,[a1],#4              // leftover word 2
        STRGE   ip,[a2],#4
        LDRGT   ip,[a1],#4              // leftover word 3
        STRGT   ip,[a2],#4
LABEL(copy_loop_up_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQ   a1,a2                   // nothing remains: result is the
        MOVEQS  pc,lr                   //   advanced destination pointer
        STMFD   sp!,{v1,lr}             // v1 and lr become transfer registers
LABEL(copy_loop_up_l2)
        LDMIA   a1!,{a3,v1,ip,lr}       // first group of 4 words
        STMIA   a2!,{a3,v1,ip,lr}
        SUBS    a4,a4,#8                // each pass accounts for two groups
        LDMGEIA a1!,{a3,v1,ip,lr}       // second group, present when the
        STMGEIA a2!,{a3,v1,ip,lr}       //   counter did not go negative
        BGT     copy_loop_up_l2         // words still remain
        MOV     a1,a2                   // result is the advanced destination
        LDMFD   sp!,{v1,pc}^            // restore v1 and return

// extern uintD* copy_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
//       Copies count words from below sourceptr to below destptr, walking
//       downwards (both pointers are pre-decremented).
//       The count mod 4 leftover words go first, one at a time; the rest
//       is moved in groups of four words.
//       entry
//               a1 = source pointer
//               a2 = destination pointer
//               a3 = count of words to store
//       exit
//               a1 = address of last word stored
//               a2 - a4, ip destroyed
        EXPORT(copy_loop_down)          // word aligned copy loop down
        DECLARE_FUNCTION(copy_loop_down)
GLABEL(copy_loop_down)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     copy_loop_down_l1       // none, go to the group copy
        LDR     ip,[a1,#-4]!            // leftover word 1 is always present
        STR     ip,[a2,#-4]!
        CMP     a4,#2                   // GE: 2 or 3 leftovers, GT: exactly 3
        LDRGE   ip,[a1,#-4]!            // leftover word 2
        STRGE   ip,[a2,#-4]!
        LDRGT   ip,[a1,#-4]!            // leftover word 3
        STRGT   ip,[a2,#-4]!
LABEL(copy_loop_down_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQ   a1,a2                   // nothing remains: result is the
        MOVEQS  pc,lr                   //   lowered destination pointer
        STMFD   sp!,{v1,lr}             // v1 and lr become transfer registers
LABEL(copy_loop_down_l2)
        LDMDB   a1!,{a3,v1,ip,lr}       // first group of 4 words
        STMDB   a2!,{a3,v1,ip,lr}
        SUBS    a4,a4,#8                // each pass accounts for two groups
        LDMGEDB a1!,{a3,v1,ip,lr}       // second group, present when the
        STMGEDB a2!,{a3,v1,ip,lr}       //   counter did not go negative
        BGT     copy_loop_down_l2       // words still remain
        MOV     a1,a2                   // result is the lowered destination
        LDMFD   sp!,{v1,pc}^            // restore v1 and return

// extern uintD* clear_loop_up (uintD* destptr, uintC count);
//       Stores count zero words upwards from destptr. Implemented by loading
//       the filler 0 and falling through into fill_loop_up, which must
//       therefore follow immediately.
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//       exit
//               a1 = address of last word stored + 1
//               a2 - a4, ip destroyed
        EXPORT(clear_loop_up)           // word aligned clear loop up
        DECLARE_FUNCTION(clear_loop_up)
GLABEL(clear_loop_up)
        MOV     a3,#0                   // set filler to 0
                                        // and drop into fill_loop_up

// extern uintD* fill_loop_up (uintD* destptr, uintC count, uintD filler);
//       Stores count copies of filler upwards from destptr. The count mod 4
//       leftover words are stored singly, the rest four words at a time.
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//               a3 = word to store
//       exit
//               a1 = address of last word stored + 1
//               a2 - a4, ip destroyed
        EXPORT(fill_loop_up)            // word aligned fill loop up
        DECLARE_FUNCTION(fill_loop_up)
GLABEL(fill_loop_up)
        ANDS    a4,a2,#3                // multiple of 4 words ?
        BEQ     fill_loop_up_l1         // yup, so branch
        CMP     a4,#2                   // store the first 1-3 words
        STR     a3,[a1],#4              // to align the total to a multiple
        STRGE   a3,[a1],#4              // of 4 words
        STRGT   a3,[a1],#4
LABEL(fill_loop_up_l1)
        BICS    a4,a2,#3                // set counter to multiple of 4
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1,lr}             // save work regs
        MOV     v1,a3                   // copy filler to three other
        MOV     ip,a3                   // registers
        MOV     lr,a3
LABEL(fill_loop_up_l2)
        STMIA   a1!,{a3,v1,ip,lr}       // store 4 fillers in one go
        SUBS    a4,a4,#8                // decrement counter by 8
                                        // (one pass covers up to 8 words)
        STMGEIA a1!,{a3,v1,ip,lr}       // if count still positive then store 4
        BGT     fill_loop_up_l2         // more and loop
        LDMFD   sp!,{v1,pc}^            // restore work regs and return


// extern uintD* clear_loop_down (uintD* destptr, uintC count);
//       Stores count zero words downwards from destptr. Implemented by
//       loading the filler 0 and falling through into fill_loop_down, which
//       must therefore follow immediately.
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//       exit
//               a1 = address of last word stored
//               a2 - a4, ip destroyed
        EXPORT(clear_loop_down)         // word aligned clear loop down
        DECLARE_FUNCTION(clear_loop_down)
GLABEL(clear_loop_down)
        MOV     a3,#0                   // set filler to 0
                                        // and drop into fill_loop_down

// extern uintD* fill_loop_down (uintD* destptr, uintC count, uintD filler);
//       Stores count copies of filler downwards from destptr (the pointer is
//       pre-decremented). The count mod 4 leftover words are stored singly,
//       the rest four words at a time.
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//               a3 = word to store
//       exit
//               a1 = address of last word stored
//               a2 - a4, ip destroyed
        EXPORT(fill_loop_down)          // word aligned fill loop down
        DECLARE_FUNCTION(fill_loop_down)
GLABEL(fill_loop_down)
        ANDS    a4,a2,#3                // multiple of 4 words ?
        BEQ     fill_loop_down_l1       // yup, so branch
        CMP     a4,#2                   // store the first 1-3 words
        STR     a3,[a1,#-4]!            // to align the total to a multiple
        STRGE   a3,[a1,#-4]!            // of 4 words
        STRGT   a3,[a1,#-4]!
LABEL(fill_loop_down_l1)
        BICS    a4,a2,#3                // set counter to multiple of 4
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1,lr}             // save work regs
        MOV     v1,a3                   // copy filler to three other
        MOV     ip,a3                   // registers
        MOV     lr,a3
LABEL(fill_loop_down_l2)
        STMDB   a1!,{a3,v1,ip,lr}       // store 4 fillers in one go
        SUBS    a4,a4,#8                // decrement counter by 8
                                        // (one pass covers up to 8 words)
        STMGEDB a1!,{a3,v1,ip,lr}       // if count still positive then store 4
        BGT     fill_loop_down_l2       // more and loop
        LDMFD   sp!,{v1,pc}^            // restore work regs and return

// extern void test_loop_up (uintD* xptr, uintC count);
//       Scans count words upwards from xptr and reports whether any of them
//       is non-zero. (The prototype comment says void, but a1 carries the
//       truth value on exit.)
//       entry
//               a1 = xptr
//               a2 = count of words to be TESTed
//       exit
//               a1 = TRUE if any words are non-zero else FALSE
//               a2 - a4, ip destroyed
        EXPORT(test_loop_up)            // word aligned test loop up
        DECLARE_FUNCTION(test_loop_up)
GLABEL(test_loop_up)
        MOV     ip,a1                   // ip walks the words
        MOV     a1,#1                   // result preset to TRUE
        ANDS    a3,a2,#3                // a3 = number of leftover words (0-3)
        BEQ     test_loop_up_l1         // none, go to the group scan
        LDR     a4,[ip],#4              // leftover word 1
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero: return TRUE
        CMP     a3,#1                   // was that the only leftover word ?
        BEQ     test_loop_up_l1
        LDR     a4,[ip],#4              // leftover word 2
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero: return TRUE
        CMP     a3,#2                   // were there exactly two ?
        BEQ     test_loop_up_l1
        LDR     a4,[ip],#4              // leftover word 3
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero: return TRUE
LABEL(test_loop_up_l1)
        BICS    a4,a2,#3                // a4 = remaining words, multiple of 4
        MOVEQ   a1,#0                   // nothing remains and nothing found:
        MOVEQS  pc,lr                   //   return FALSE
        STMFD   sp!,{v1,lr}             // v1 and lr become scan registers
LABEL(test_loop_up_l2)
        LDMIA   ip!,{a2,a3,v1,lr}       // fetch a group of 4 words
        TEQ     a2,#0                   // the chain stops at the first
        TEQEQ   a3,#0                   //   non-zero word and leaves NE
        TEQEQ   v1,#0
        TEQEQ   lr,#0
        LDMNEFD sp!,{v1,pc}^            // non-zero word seen: return TRUE
        SUBS    a4,a4,#4                // one group done
        BGT     test_loop_up_l2         // more groups remain
        MOV     a1,#0                   // all words were zero: FALSE
        LDMFD   sp!,{v1,pc}^            // restore v1 and return

// extern void test_loop_down (uintD* xptr, uintC count);
//       Scans count words downwards from xptr (the pointer is
//       pre-decremented) and reports whether any of them is non-zero.
//       (The prototype comment says void, but a1 carries the truth value on
//       exit.)
//       entry
//               a1 = xptr
//               a2 = count of words to be TESTed
//       exit
//               a1 = TRUE if any words are non-zero else FALSE
//               a2 - a4, ip destroyed
        EXPORT(test_loop_down)          // word aligned test loop down
        DECLARE_FUNCTION(test_loop_down)
GLABEL(test_loop_down)
        MOV     ip,a1                   // ip walks the words
        MOV     a1,#1                   // result preset to TRUE
        ANDS    a3,a2,#3                // a3 = number of leftover words (0-3)
        BEQ     test_loop_down_l1       // none, go to the group scan
        LDR     a4,[ip,#-4]!            // leftover word 1
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero: return TRUE
        CMP     a3,#1                   // was that the only leftover word ?
        BEQ     test_loop_down_l1
        LDR     a4,[ip,#-4]!            // leftover word 2
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero: return TRUE
        CMP     a3,#2                   // were there exactly two ?
        BEQ     test_loop_down_l1
        LDR     a4,[ip,#-4]!            // leftover word 3
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero: return TRUE
LABEL(test_loop_down_l1)
        BICS    a4,a2,#3                // a4 = remaining words, multiple of 4
        MOVEQ   a1,#0                   // nothing remains and nothing found:
        MOVEQS  pc,lr                   //   return FALSE
        STMFD   sp!,{v1,lr}             // v1 and lr become scan registers
LABEL(test_loop_down_l2)
        LDMDB   ip!,{a2,a3,v1,lr}       // fetch a group of 4 words
        TEQ     a2,#0                   // the chain stops at the first
        TEQEQ   a3,#0                   //   non-zero word and leaves NE
        TEQEQ   v1,#0
        TEQEQ   lr,#0
        LDMNEFD sp!,{v1,pc}^            // non-zero word seen: return TRUE
        SUBS    a4,a4,#4                // one group done
        BGT     test_loop_down_l2       // more groups remain
        MOV     a1,#0                   // all words were zero: FALSE
        LDMFD   sp!,{v1,pc}^            // restore v1 and return

#if CL_DS_BIG_ENDIAN_P

// extern void or_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       ORs count words at yptr into the words at xptr, walking upwards.
//       The count mod 4 leftover words are combined singly, the rest four
//       words at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ORed
//       exit
//               xptr |= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(or_loop_up)              // word aligned or loop up
        DECLARE_FUNCTION(or_loop_up)
GLABEL(or_loop_up)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     or_loop_up_l1           // none, go to the group loop
        CMP     a4,#2                   // LT: 1 leftover, GT: 3 leftovers
                                        // (flags survive the code below)
        LDR     a4,[a2],#4              // leftover word 1
        LDR     ip,[a1]
        ORR     ip,ip,a4
        STR     ip,[a1],#4
        BLT     or_loop_up_l1           // only one leftover word
        LDR     a4,[a2],#4              // leftover word 2 (at least 2 here)
        LDR     ip,[a1]
        ORR     ip,ip,a4
        STR     ip,[a1],#4
        LDRGT   a4,[a2],#4              // leftover word 3
        LDRGT   ip,[a1]
        ORRGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(or_loop_up_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQS  pc,lr                   // nothing remains: return
        STMFD   sp!,{v1-v5,lr}          // free v1-v5 and lr for the group loop
LABEL(or_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 source words
        LDMIA   a1,{v3,v4,v5,lr}        // 4 target words
        SUBS    a4,a4,#4                // count this group; the flags are
                                        // left alone until the BGT below
        ORR     v3,v3,a3                // combine word by word
        ORR     v4,v4,v1
        ORR     v5,v5,v2
        ORR     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        BGT     or_loop_up_l2           // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore v1-v5 and return

#endif

// extern void xor_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       XORs count words at yptr into the words at xptr, walking upwards.
//       The count mod 4 leftover words are combined singly, the rest four
//       words at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be XORed
//       exit
//               xptr ^= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(xor_loop_up)             // word aligned xor loop up
        DECLARE_FUNCTION(xor_loop_up)
GLABEL(xor_loop_up)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     xor_loop_up_l1          // none, go to the group loop
        CMP     a4,#2                   // LT: 1 leftover, GT: 3 leftovers
                                        // (flags survive the code below)
        LDR     a4,[a2],#4              // leftover word 1
        LDR     ip,[a1]
        EOR     ip,ip,a4
        STR     ip,[a1],#4
        BLT     xor_loop_up_l1          // only one leftover word
        LDR     a4,[a2],#4              // leftover word 2 (at least 2 here)
        LDR     ip,[a1]
        EOR     ip,ip,a4
        STR     ip,[a1],#4
        LDRGT   a4,[a2],#4              // leftover word 3
        LDRGT   ip,[a1]
        EORGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(xor_loop_up_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQS  pc,lr                   // nothing remains: return
        STMFD   sp!,{v1-v5,lr}          // free v1-v5 and lr for the group loop
LABEL(xor_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 source words
        LDMIA   a1,{v3,v4,v5,lr}        // 4 target words
        SUBS    a4,a4,#4                // count this group; the flags are
                                        // left alone until the BGT below
        EOR     v3,v3,a3                // combine word by word
        EOR     v4,v4,v1
        EOR     v5,v5,v2
        EOR     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        BGT     xor_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore v1-v5 and return

#if CL_DS_BIG_ENDIAN_P

// extern void and_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       ANDs count words at yptr into the words at xptr, walking upwards.
//       The count mod 4 leftover words are combined singly, the rest four
//       words at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ANDed
//       exit
//               xptr &= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(and_loop_up)             // word aligned and loop up
        DECLARE_FUNCTION(and_loop_up)
GLABEL(and_loop_up)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     and_loop_up_l1          // none, go to the group loop
        CMP     a4,#2                   // LT: 1 leftover, GT: 3 leftovers
                                        // (flags survive the code below)
        LDR     a4,[a2],#4              // leftover word 1
        LDR     ip,[a1]
        AND     ip,ip,a4
        STR     ip,[a1],#4
        BLT     and_loop_up_l1          // only one leftover word
        LDR     a4,[a2],#4              // leftover word 2 (at least 2 here)
        LDR     ip,[a1]
        AND     ip,ip,a4
        STR     ip,[a1],#4
        LDRGT   a4,[a2],#4              // leftover word 3
        LDRGT   ip,[a1]
        ANDGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(and_loop_up_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQS  pc,lr                   // nothing remains: return
        STMFD   sp!,{v1-v5,lr}          // free v1-v5 and lr for the group loop
LABEL(and_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 source words
        LDMIA   a1,{v3,v4,v5,lr}        // 4 target words
        SUBS    a4,a4,#4                // count this group; the flags are
                                        // left alone until the BGT below
        AND     v3,v3,a3                // combine word by word
        AND     v4,v4,v1
        AND     v5,v5,v2
        AND     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        BGT     and_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore v1-v5 and return

// extern void eqv_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       Replaces count words at xptr by the complement of their XOR with the
//       words at yptr, walking upwards. The count mod 4 leftover words are
//       combined singly, the rest four words at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be EQVed
//       exit
//               xptr = ~(xptr ^ yptr) for count words
//               a1 - a4, ip destroyed
        EXPORT(eqv_loop_up)             // word aligned eqv loop up
        DECLARE_FUNCTION(eqv_loop_up)
GLABEL(eqv_loop_up)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     eqv_loop_up_l1          // none, go to the group loop
        CMP     a4,#2                   // LT: 1 leftover, EQ: 2, GT: 3
                                        // (flags survive the code below)
        LDR     a4,[a2],#4              // leftover word 1
        LDR     ip,[a1]
        EOR     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLT     eqv_loop_up_l1          // only one leftover word
        LDR     a4,[a2],#4              // leftover word 2 (at least 2 here)
        LDR     ip,[a1]
        EOR     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLE     eqv_loop_up_l1          // exactly two leftover words
        LDR     a4,[a2],#4              // leftover word 3
        LDR     ip,[a1]
        EOR     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
LABEL(eqv_loop_up_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQS  pc,lr                   // nothing remains: return
        STMFD   sp!,{v1-v5,lr}          // free v1-v5 and lr for the group loop
LABEL(eqv_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 source words
        LDMIA   a1,{v3,v4,v5,lr}        // 4 target words
        SUBS    a4,a4,#4                // count this group; the flags are
                                        // left alone until the BGT below
        EOR     v3,v3,a3                // EQV word by word: XOR, then invert
        MVN     v3,v3
        EOR     v4,v4,v1
        MVN     v4,v4
        EOR     v5,v5,v2
        MVN     v5,v5
        EOR     lr,lr,ip
        MVN     lr,lr
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        BGT     eqv_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore v1-v5 and return

// extern void nand_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       Replaces count words at xptr by the complement of their AND with the
//       words at yptr, walking upwards. The count mod 4 leftover words are
//       combined singly, the rest four words at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be NANDed
//       exit
//               xptr = ~(xptr & yptr) for count words
//               a1 - a4, ip destroyed
        EXPORT(nand_loop_up)            // word aligned nand loop up
        DECLARE_FUNCTION(nand_loop_up)
GLABEL(nand_loop_up)
        ANDS    a4,a3,#3                // a4 = number of leftover words (0-3)
        BEQ     nand_loop_up_l1         // none, go to the group loop
        CMP     a4,#2                   // LT: 1 leftover, EQ: 2, GT: 3
                                        // (flags survive the code below)
        LDR     a4,[a2],#4              // leftover word 1
        LDR     ip,[a1]
        AND     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLT     nand_loop_up_l1         // only one leftover word
        LDR     a4,[a2],#4              // leftover word 2 (at least 2 here)
        LDR     ip,[a1]
        AND     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLE     nand_loop_up_l1         // exactly two leftover words
        LDR     a4,[a2],#4              // leftover word 3
        LDR     ip,[a1]
        AND     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
LABEL(nand_loop_up_l1)
        BICS    a4,a3,#3                // a4 = remaining words, multiple of 4
        MOVEQS  pc,lr                   // nothing remains: return
        STMFD   sp!,{v1-v5,lr}          // free v1-v5 and lr for the group loop
LABEL(nand_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 source words
        LDMIA   a1,{v3,v4,v5,lr}        // 4 target words
        SUBS    a4,a4,#4                // count this group; the flags are
                                        // left alone until the BGT below
        AND     v3,v3,a3                // NAND word by word: AND, then invert
        MVN     v3,v3
        AND     v4,v4,v1
        MVN     v4,v4
        AND     v5,v5,v2
        MVN     v5,v5
        AND     lr,lr,ip
        MVN     lr,lr
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        BGT     nand_loop_up_l2         // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore v1-v5 and return

// extern void nor_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr   (read and overwritten)
//               a2 = yptr   (read only)
//               a3 = number of words to combine
//       exit
//               xptr[i] = ~(xptr[i] | yptr[i]) for i = 0 .. count-1
//               a1 - a4, ip destroyed
//       The (count mod 4) leftover words are done singly, the rest in
//       blocks of four words.
        EXPORT(nor_loop_up)             // x[i] = ~(x[i] | y[i]), ascending addresses
        DECLARE_FUNCTION(nor_loop_up)
GLABEL(nor_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     nor_loop_up_l1          // nothing left over: go to the block loop
        CMP     a4,#2                   // LT: 1 leftover, EQ: 2, GT: 3 (flags stay live)
        LDR     ip,[a1]                 // leftover word 1 of x
        LDR     a4,[a2],#4              // leftover word 1 of y, advance yptr
        ORR     ip,ip,a4
        MVN     ip,ip                   // ip = ~(x | y)
        STR     ip,[a1],#4              // store and advance xptr
        BLT     nor_loop_up_l1          // just one leftover word
        LDRGE   ip,[a1]                 // leftover word 2
        LDRGE   a4,[a2],#4
        ORRGE   ip,ip,a4
        MVNGE   ip,ip
        STRGE   ip,[a1],#4
        BLE     nor_loop_up_l1          // exactly two leftover words
        LDRGT   ip,[a1]                 // leftover word 3
        LDRGT   a4,[a2],#4
        ORRGT   ip,ip,a4
        MVNGT   ip,ip
        STRGT   ip,[a1],#4
LABEL(nor_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full 4-word block: return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr serve as scratch below
LABEL(nor_loop_up_l2)
        LDMIA   a1,{v3,v4,v5,lr}        // four x words (xptr advances at the store)
        LDMIA   a2!,{a3,v1,v2,ip}       // four y words, advance yptr
        ORR     v3,v3,a3                // x | y for each of the four pairs ...
        ORR     v4,v4,v1
        ORR     v5,v5,v2
        ORR     lr,lr,ip
        MVN     v3,v3                   // ... then complement each one: NOR
        MVN     v4,v4
        MVN     v5,v5
        MVN     lr,lr
        STMIA   a1!,{v3,v4,v5,lr}       // write the block back, advance xptr
        SUBS    a4,a4,#4                // four fewer words to go
        BGT     nor_loop_up_l2          // more blocks remain
        LDMFD   sp!,{v1-v5,pc}^         // pop scratch registers and return

// extern void andc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr   (read and overwritten)
//               a2 = yptr   (read only)
//               a3 = number of words to combine
//       exit
//               xptr[i] = xptr[i] & ~yptr[i] for i = 0 .. count-1
//               a1 - a4, ip destroyed
//       BIC computes x & ~y directly, so each word needs one ALU operation.
        EXPORT(andc2_loop_up)           // x[i] = x[i] & ~y[i], ascending addresses
        DECLARE_FUNCTION(andc2_loop_up)
GLABEL(andc2_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     andc2_loop_up_l1        // nothing left over: go to the block loop
        CMP     a4,#2                   // LT: 1 leftover, EQ: 2, GT: 3 (flags stay live)
        LDR     ip,[a1]                 // leftover word 1 of x
        LDR     a4,[a2],#4              // leftover word 1 of y, advance yptr
        BIC     ip,ip,a4                // ip = x & ~y
        STR     ip,[a1],#4              // store and advance xptr
        BLT     andc2_loop_up_l1        // just one leftover word
        LDRGE   ip,[a1]                 // leftover word 2
        LDRGE   a4,[a2],#4
        BICGE   ip,ip,a4
        STRGE   ip,[a1],#4
        LDRGT   ip,[a1]                 // leftover word 3, skipped unless 3 leftovers
        LDRGT   a4,[a2],#4
        BICGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(andc2_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full 4-word block: return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr serve as scratch below
LABEL(andc2_loop_up_l2)
        LDMIA   a1,{v3,v4,v5,lr}        // four x words (xptr advances at the store)
        LDMIA   a2!,{a3,v1,v2,ip}       // four y words, advance yptr
        BIC     v3,v3,a3                // clear in x every bit that is set in y
        BIC     v4,v4,v1
        BIC     v5,v5,v2
        BIC     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the block back, advance xptr
        SUBS    a4,a4,#4                // four fewer words to go
        BGT     andc2_loop_up_l2        // more blocks remain
        LDMFD   sp!,{v1-v5,pc}^         // pop scratch registers and return

// extern void orc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr   (read and overwritten)
//               a2 = yptr   (read only)
//               a3 = count of words to be ORC2ed
//       exit
//               xptr[i] = xptr[i] | ~yptr[i] for i = 0 .. count-1
//               a1 - a4, ip destroyed
//       The y operand is complemented with MVN first and then ORed into x.
        EXPORT(orc2_loop_up)            // x[i] = x[i] | ~y[i], ascending addresses
        DECLARE_FUNCTION(orc2_loop_up)
GLABEL(orc2_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     orc2_loop_up_l1         // nothing left over: go to the block loop
        CMP     a4,#2                   // LT: 1 leftover, EQ: 2, GT: 3 (flags stay live)
        LDR     ip,[a1]                 // leftover word 1 of x
        LDR     a4,[a2],#4              // leftover word 1 of y, advance yptr
        MVN     a4,a4                   // a4 = ~y
        ORR     ip,ip,a4                // ip = x | ~y
        STR     ip,[a1],#4              // store and advance xptr
        BLT     orc2_loop_up_l1         // just one leftover word
        LDRGE   ip,[a1]                 // leftover word 2
        LDRGE   a4,[a2],#4
        MVNGE   a4,a4
        ORRGE   ip,ip,a4
        STRGE   ip,[a1],#4
        BLE     orc2_loop_up_l1         // exactly two leftover words
        LDRGT   ip,[a1]                 // leftover word 3
        LDRGT   a4,[a2],#4
        MVNGT   a4,a4
        ORRGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(orc2_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full 4-word block: return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr serve as scratch below
LABEL(orc2_loop_up_l2)
        LDMIA   a1,{v3,v4,v5,lr}        // four x words (xptr advances at the store)
        LDMIA   a2!,{a3,v1,v2,ip}       // four y words, advance yptr
        MVN     a3,a3                   // complement the four y words ...
        MVN     v1,v1
        MVN     v2,v2
        MVN     ip,ip
        ORR     v3,v3,a3                // ... and OR them into the x words
        ORR     v4,v4,v1
        ORR     v5,v5,v2
        ORR     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the block back, advance xptr
        SUBS    a4,a4,#4                // four fewer words to go
        BGT     orc2_loop_up_l2         // more blocks remain
        LDMFD   sp!,{v1-v5,pc}^         // pop scratch registers and return

// extern void not_loop_up (uintD* xptr, uintC count);
//       entry
//               a1 = xptr   (read and overwritten)
//               a2 = number of words to complement
//       exit
//               xptr[i] = ~xptr[i] for i = 0 .. count-1
//               a1 - a4, ip destroyed
//       The (count mod 4) leftover words are done singly, the rest in
//       blocks of four words. Only lr needs saving for the block loop.
        EXPORT(not_loop_up)             // x[i] = ~x[i], ascending addresses
        DECLARE_FUNCTION(not_loop_up)
GLABEL(not_loop_up)
        ANDS    a3,a2,#3                // a3 = count mod 4
        BEQ     not_loop_up_l1          // nothing left over: go to the block loop
        CMP     a3,#2                   // LT: 1 leftover, EQ: 2, GT: 3 (flags stay live)
        LDR     a3,[a1]                 // leftover word 1
        MVN     a3,a3
        STR     a3,[a1],#4              // store and advance xptr
        BLT     not_loop_up_l1          // just one leftover word
        LDRGE   a3,[a1]                 // leftover word 2
        MVNGE   a3,a3
        STRGE   a3,[a1],#4
        LDRGT   a3,[a1]                 // leftover word 3, skipped unless 3 leftovers
        MVNGT   a3,a3
        STRGT   a3,[a1],#4
LABEL(not_loop_up_l1)
        BICS    a4,a2,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full 4-word block: return
        STMFD   sp!,{lr}                // lr doubles as the fourth data register
LABEL(not_loop_up_l2)
        LDMIA   a1,{a2,a3,ip,lr}        // fetch four words, xptr left where it is
        MVN     lr,lr                   // complement all four
        MVN     ip,ip
        MVN     a3,a3
        MVN     a2,a2
        STMIA   a1!,{a2,a3,ip,lr}       // write the block back, advance xptr
        SUBS    a4,a4,#4                // four fewer words to go
        BGT     not_loop_up_l2          // more blocks remain
        LDMFD   sp!,{pc}^               // pop the return address and return

// extern void and_test_loop_up (uintD* xptr, uintD* yptr, uintC count);
// NOTE(review): the prototype above is written as void, but the routine
//   hands a truth value back in a1 (see exit) - confirm the C declaration.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be AND_TESTed
//       exit
//               a1 = TRUE if any words ANDed together are non-zero else FALSE
//               a2 - a4, ip destroyed
//       Nothing is stored; the scan returns as soon as a non-zero
//       (xptr[i] & yptr[i]) is seen. TRUE is returned as 1, FALSE as 0.
        EXPORT(and_test_loop_up)        // word aligned and_test loop up
        DECLARE_FUNCTION(and_test_loop_up)
GLABEL(and_test_loop_up)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     and_test_loop_up_l1     // yup, so branch
        CMP     a4,#2                   // C set iff 2 or 3 leftover words
        LDR     a4,[a2],#4              // AND_TEST the first 1-3 words
        LDR     ip,[a1],#4              // to align the total to a multiple
        TST     ip,a4                   // of 4 words
        MOVNE   a1,#1                   // return TRUE if AND_TEST ok
        MOVNES  pc,lr
                                        // the code relies on C from the CMP
                                        // surviving the TST above
        BCC     and_test_loop_up_l1     // better to branch than skip instrs.
        LDRGE   a4,[a2],#4              // second leftover word
        LDRGE   ip,[a1],#4
        TSTGE   ip,a4
        MOVNE   a1,#1                   // non-zero AND -> return TRUE
        MOVNES  pc,lr
        ANDS    a4,a3,#3                // flags were consumed by the TST, so
        CMP     a4,#2                   // recompute the leftover count test
        BLE     and_test_loop_up_l1     // better to branch than skip instrs.
        LDRGT   a4,[a2],#4              // third leftover word
        LDRGT   ip,[a1],#4
        TSTGT   ip,a4
        MOVNE   a1,#1                   // non-zero AND -> return TRUE
        MOVNES  pc,lr
LABEL(and_test_loop_up_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // return FALSE
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v6,lr}          // save work regs
        MOV     v6,a1                   // move xptr to v6
        MOV     a1,#1                   // set result to TRUE
LABEL(and_test_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMIA   v6!,{v3,v4,v5,lr}       // load target words
        TST     v3,a3                   // AND_TEST the four words
        TSTEQ   v4,v1                   // each later test runs only while all
        TSTEQ   v5,v2                   // earlier ANDs were zero
        TSTEQ   lr,ip
        LDMNEFD sp!,{v1-v6,pc}^         // some AND non-zero: return TRUE (a1 = 1)
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     and_test_loop_up_l2     // if count still positive then loop
        MOV     a1,#0                   // every AND was zero: result is FALSE
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

#endif

// extern void compare_loop_up (uintD* xptr, uintD* yptr, uintC count);
// NOTE(review): the prototype above is written as void, but the routine
//   hands a three-way result back in a1 (see exit) - confirm the C declaration.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be COMPAREd
//       exit
//               a1 = +1 if first non-equal word in xptr[] and yptr[]
//                       xptr[i] > yptr[i]
//                    -1 if xptr[i] < yptr[i]
//                     0 otherwise
//               a2 - a4, ip destroyed
//       Words are compared as unsigned values (LO/HI conditions), starting
//       at index 0 and moving up in memory.
        EXPORT(compare_loop_up)         // word aligned compare loop up
        DECLARE_FUNCTION(compare_loop_up)
GLABEL(compare_loop_up)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     compare_loop_up_l1      // yup, so branch
        LDR     a4,[a2],#4              // COMPARE the first 1-3 words
        LDR     ip,[a1],#4              // to align the total to a multiple
        CMP     ip,a4                   // of 4 words
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // and return result if not equal
        ANDS    a4,a3,#3                // recompute leftover count (a4 was reused)
        CMP     a4,#2
        BLT     compare_loop_up_l1      // need to branch 'cos PSR used
        LDR     a4,[a2],#4              // second leftover word
        LDR     ip,[a1],#4
        CMP     ip,a4
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // return result if not equal
        ANDS    a4,a3,#3                // recompute leftover count again
        CMP     a4,#2
        BLE     compare_loop_up_l1      // need to branch 'cos PSR used
        LDR     a4,[a2],#4              // third leftover word
        LDR     ip,[a1],#4
        CMP     ip,a4
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // return result if not equal
LABEL(compare_loop_up_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // xptr[] == yptr[] -> 0
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v6,lr}          // save work regs
        MOV     v6,a1                   // move xptr to v6
        MOV     a1,#1                   // set result to +1
LABEL(compare_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMIA   v6!,{v3,v4,v5,lr}       // load test words
        CMP     v3,a3                   // COMPARE the four words
        CMPEQ   v4,v1                   // later compares run only while all
        CMPEQ   v5,v2                   // earlier pairs were equal, so the flags
        CMPEQ   lr,ip                   // describe the first differing pair
        MVNLO   a1,#0                   // x < y -> -1 (a1 already holds +1)
        LDMNEFD sp!,{v1-v6,pc}^         // a pair differed: return a1
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     compare_loop_up_l2      // if count still positive then loop
        MOV     a1,#0                   // all words equal -> 0
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

#if CL_DS_BIG_ENDIAN_P

// extern uintD addto_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be added
//       exit
//               destptr[] = sourceptr[] + destptr[]
//               a1 = last carry
//               a2 - a4, ip destroyed
//       This entry point has no body of its own: it rearranges the arguments
//       into the add_loop_down layout (a2 = second source, a3 = destination,
//       a4 = count), with destptr serving as both second source and
//       destination, and then falls through into add_loop_down, which must
//       therefore follow immediately.
        EXPORT(addto_loop_down)         // word aligned addto loop down
        DECLARE_FUNCTION(addto_loop_down)
GLABEL(addto_loop_down)
                MOV     a4,a3           // set regs for a call
                MOV     a3,a2           // to add_loop_down
                                        // and drop into add_loop_down

// extern uintD add_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be added
//       exit
//               destptr[] = sourceptr1[] + sourceptr2[]
//               a1 = last carry
//               a2 - a4, ip destroyed
//       All three pointers are pre-decremented, so the words processed lie
//       below the addresses passed in. The running carry is kept in the C
//       flag from the first add to the end; the code relies on the
//       instructions in between (TEQ/BICS with immediates, SUB, loads and
//       stores) leaving C alone. The result is 1 for a carry out, else 0.
//       addto_loop_down falls through into this entry point.
        EXPORT(add_loop_down)           // word aligned add loop down
        DECLARE_FUNCTION(add_loop_down)
GLABEL(add_loop_down)
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     add_loop_down_l1        // yup, so branch
        STMFD   sp!,{v6,lr}             // v6 and lr are scratch for the leftovers
        LDR     v6,[a2,#-4]!            // add the first 1-3 words
        LDR     lr,[a1,#-4]!            // to align the total to a multiple
        ADDS    lr,lr,v6                // of 4 words
        STR     lr,[a3,#-4]!
        TEQ     ip,#1                   // exactly one leftover word ?
        BEQ     add_loop_down_l0        // need to branch 'cos PSR used
        LDR     v6,[a2,#-4]!            // second leftover word
        LDR     lr,[a1,#-4]!
        ADCS    lr,lr,v6                // add with the carry from the first word
        STR     lr,[a3,#-4]!
        TEQ     ip,#2                   // exactly two leftover words ?
        BEQ     add_loop_down_l0        // need to branch 'cos PSR used
        LDR     v6,[a2,#-4]!            // third leftover word
        LDR     lr,[a1,#-4]!
        ADCS    lr,lr,v6
        STR     lr,[a3,#-4]!
LABEL(add_loop_down_l0)                 // at least one add has happened
        BICS    a4,a4,#3                // set counter to multiple of 4
        BNE     add_loop_down_l3        // branch if more adds to do
        ADCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
LABEL(add_loop_down_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no adds, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMN     a4,#0                   // clear carry bit
        STMFD   sp!,{v6,lr}             // same stack layout as the leftover path
LABEL(add_loop_down_l3)
        STMFD   sp!,{v1-v5}             // save work regs
LABEL(add_loop_down_l2)
        LDMDB   a2!,{v1,v2,v3,ip}       // load 4 words in one go
        LDMDB   a1!,{v4,v5,v6,lr}       // and 4 words from sourceptr1
        ADCS    lr,lr,ip                // add the four words with carry
        ADCS    v6,v6,v3                // (highest address first, since the
        ADCS    v5,v5,v2                // loop walks down in memory)
        ADCS    v4,v4,v1
        STMDB   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     add_loop_down_l2        // if count non-zero then loop
        ADC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD inc_loop_down (uintD* ptr, uintC count);
//       entry
//               a1 = ptr
//               a2 = count of words to be INCed
//       exit
//               a1 = 0 if any words are non-zero after increment else 1
//                      stop incrementing when first word becomes non-zero
//               a2 - a4, ip destroyed
//       The words lie below ptr and are visited downwards in memory. A word
//       that becomes zero after the increment has wrapped, so the increment
//       is carried into the next word; a1 = 1 therefore means the carry ran
//       off the end of all count words (including count = 0).
        EXPORT(inc_loop_down)           // word aligned inc loop down
        DECLARE_FUNCTION(inc_loop_down)
GLABEL(inc_loop_down)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     inc_loop_down_l1        // yup, so branch
        LDR     a4,[a1,#-4]!            // INC the first word
        ADDS    a4,a4,#1                // align the total to a multiple of 2
        STR     a4,[a1]
        MOVNE   a1,#0                   // set result to 0
        MOVNES  pc,lr                   // return 0 if non-zero result
LABEL(inc_loop_down_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,#1                   // return 1
        MOVEQS  pc,lr                   // if zero then we're done
        MOV     ip,a1                   // move ptr to ip
        MOV     a1,#0                   // set result to 0
        ANDS    a3,a4,#3                // remaining count a multiple of 4 ?
        BEQ     inc_loop_down_l3        // yes: skip the 2-word step
        LDMDB   ip,{a2,a3}              // load 2 words in one go
        ADDS    a3,a3,#1                // INC the two words
        ADDEQS  a2,a2,#1                // stopping when first word non-zero
        STMDB   ip!,{a2,a3}             // store 2 results
        MOVNES  pc,lr                   // return 0 if any result non-zero
        SUBS    a4,a4,#2                // decrement counter by 2
        MOVEQ   a1,#1                   // if finished loop then
        MOVEQS  pc,lr                   // return 1
LABEL(inc_loop_down_l3)                 // now a multiple of 4 words
        STMFD   sp!,{v1,lr}             // save work regs
LABEL(inc_loop_down_l2)
        LDMDB   ip,{a2,a3,v1,lr}        // load 4 words in one go
        ADDS    lr,lr,#1                // INC the four words
        ADDEQS  v1,v1,#1                // stopping when first word non-zero
        ADDEQS  a3,a3,#1
        ADDEQS  a2,a2,#1
        STMDB   ip!,{a2,a3,v1,lr}       // store 4 results
        LDMNEFD sp!,{v1,pc}^            // return 0 if any result non-zero
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     inc_loop_down_l2        // if count still positive then loop
        MOV     a1,#1                   // carry ran off the last word
        LDMFD   sp!,{v1,pc}^            // restore work regs and return 1

// extern uintD sub_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[]
//               a1 = last carry (0 if no borrow out, -1 if a borrow remains)
//               a2 - a4, ip destroyed
//       All three pointers are pre-decremented, so the words processed lie
//       below the addresses passed in. The borrow is kept (inverted) in the
//       C flag from the first subtract to the end; the code relies on the
//       instructions in between (TEQ/BICS with immediates, SUB, loads and
//       stores) leaving C alone.
        EXPORT(sub_loop_down)           // word aligned sub loop down
        DECLARE_FUNCTION(sub_loop_down)
GLABEL(sub_loop_down)                   // must be the exported (global) label
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     sub_loop_down_l1        // yup, so branch
        STMFD   sp!,{v6,lr}             // v6 and lr are scratch for the leftovers
        LDR     v6,[a2,#-4]!            // subtract the first 1-3 words
        LDR     lr,[a1,#-4]!            // to align the total to a multiple
        SUBS    lr,lr,v6                // of 4 words
        STR     lr,[a3,#-4]!
        TEQ     ip,#1                   // exactly one leftover word ?
        BNE     sub_loop_down_l0        // branch if more than one subtract
LABEL(sub_loop_down_l4)                 // drop through for better instr. timings
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
        STMFD   sp!,{v1-v5}             // save work regs
        B       sub_loop_down_l2        // branch if more subtracts to do
LABEL(sub_loop_down_l0)
        LDR     v6,[a2,#-4]!            // second leftover word
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6                // subtract with the borrow so far
        STR     lr,[a3,#-4]!
        TEQ     ip,#2                   // exactly two leftover words ?
        BEQ     sub_loop_down_l4        // need to branch 'cos PSR used
        LDR     v6,[a2,#-4]!            // third leftover word
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6
        STR     lr,[a3,#-4]!
        B       sub_loop_down_l4
LABEL(sub_loop_down_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no subtracts, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMP     a4,#0                   // set carry bit, since a4 > 0
        STMFD   sp!,{v1-v6,lr}          // save work regs
LABEL(sub_loop_down_l2)
        LDMDB   a2!,{v1,v2,v3,ip}       // load 4 words in one go
        LDMDB   a1!,{v4,v5,v6,lr}       // and 4 words from sourceptr1
        SBCS    lr,lr,ip                // subtract the four words with carry
        SBCS    v6,v6,v3                // (highest address first, since the
        SBCS    v5,v5,v2                // loop walks down in memory)
        SBCS    v4,v4,v1
        STMDB   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     sub_loop_down_l2        // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subx_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//               [sp] = carry
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[] - incoming borrow
//               a1 = last carry (0 if no borrow out, -1 if a borrow remains)
//               a2 - a4, ip destroyed
//       Same scheme as sub_loop_down, except that the initial state of the
//       C flag is derived from the fifth argument: RSBS 0 - carry leaves
//       C set (no borrow) only when carry is zero.
        EXPORT(subx_loop_down)          // word aligned xsub loop down
        DECLARE_FUNCTION(subx_loop_down)
GLABEL(subx_loop_down)                  // must be the exported (global) label
        LDR     ip,[sp]                 // get starting value of carry
LABEL(subx_loop_down_lsub)
        RSBS    ip,ip,#0                // set carry in PSR
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     subx_loop_down_l1       // yup, so branch
        STMFD   sp!,{v6,lr}             // v6 and lr are scratch for the leftovers
        LDR     v6,[a2,#-4]!            // subtract the first 1-3 words
        LDR     lr,[a1,#-4]!            // to align the total to a multiple
        SBCS    lr,lr,v6                // of 4 words
        STR     lr,[a3,#-4]!
        TEQ     ip,#1                   // exactly one leftover word ?
        BNE     subx_loop_down_l0       // branch if more than one subtract
LABEL(subx_loop_down_l4)                // drop through for better instr. timings
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
        STMFD   sp!,{v1-v5}             // save work regs
        B       subx_loop_down_l2       // branch if more subtracts to do
LABEL(subx_loop_down_l0)
        LDR     v6,[a2,#-4]!            // second leftover word
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6                // subtract with the borrow so far
        STR     lr,[a3,#-4]!
        TEQ     ip,#2                   // exactly two leftover words ?
        BEQ     subx_loop_down_l4       // need to branch 'cos PSR used
        LDR     v6,[a2,#-4]!            // third leftover word
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6
        STR     lr,[a3,#-4]!
        B       subx_loop_down_l4
LABEL(subx_loop_down_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v6,lr}          // save work regs
LABEL(subx_loop_down_l2)
        LDMDB   a2!,{v1,v2,v3,ip}       // load 4 words in one go
        LDMDB   a1!,{v4,v5,v6,lr}       // and 4 words from sourceptr1
        SBCS    lr,lr,ip                // subtract the four words with carry
        SBCS    v6,v6,v3                // (highest address first, since the
        SBCS    v5,v5,v2                // loop walks down in memory)
        SBCS    v4,v4,v1
        STMDB   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     subx_loop_down_l2       // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subfrom_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be subtracted
//       exit
//               destptr[] = destptr[] - sourceptr[]
//               a1 = last carry
//               a2 - a4, ip destroyed
//       Both pointers are pre-decremented, so the words processed lie below
//       the addresses passed in. The borrow is kept (inverted) in the C flag
//       throughout; the result is 0 if no borrow remains and -1 otherwise
//       (SBC of 0 - 0 with that flag).
        EXPORT(subfrom_loop_down)       // word aligned subfrom loop down
        DECLARE_FUNCTION(subfrom_loop_down)
GLABEL(subfrom_loop_down)
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     subfrom_loop_down_l1    // yup, so branch
        STMFD   sp!,{lr}                // lr is scratch for the leftover words
        LDR     a4,[a1,#-4]!            // subtract the first 1-3 words
        LDR     lr,[a2,#-4]!            // to align the total to a multiple
        SUBS    lr,lr,a4                // of 4 words
        STR     lr,[a2]                 // destptr was already stepped by the load
        TEQ     ip,#1                   // exactly one leftover word ?
        BNE     subfrom_loop_down_l0    // branch if more than one subtract
LABEL(subfrom_loop_down_l4)             // drop through for better instr. timings
        BICS    a4,a3,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{pc}^               // and return
        STMFD   sp!,{v1-v5}             // save work regs
        B       subfrom_loop_down_l2    // branch if more subtracts to do
LABEL(subfrom_loop_down_l0)
        LDR     a4,[a1,#-4]!            // second leftover word
        LDR     lr,[a2,#-4]!
        SBCS    lr,lr,a4                // subtract with the borrow so far
        STR     lr,[a2]
        TEQ     ip,#2                   // exactly two leftover words ?
        BEQ     subfrom_loop_down_l4    // need to branch 'cos PSR used
        LDR     a4,[a1,#-4]!            // third leftover word
        LDR     lr,[a2,#-4]!
        SBCS    lr,lr,a4
        STR     lr,[a2]
        B       subfrom_loop_down_l4
LABEL(subfrom_loop_down_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no subtracts, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMP     a4,#0                   // set carry bit, since a4 > 0
        STMFD   sp!,{v1-v5,lr}          // save work regs
LABEL(subfrom_loop_down_l2)
        LDMDB   a1!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMDB   a2,{v3,v4,v5,lr}        // and from destptr
        SBCS    lr,lr,ip                // subtract the four words with carry
        SBCS    v5,v5,v2                // (highest address first, since the
        SBCS    v4,v4,v1                // loop walks down in memory)
        SBCS    v3,v3,a3
        STMDB   a2!,{v3,v4,v5,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     subfrom_loop_down_l2    // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern uintD dec_loop_down (uintD* ptr, uintC count);
//       entry
//               a1 = ptr
//               a2 = count of words to be DECed
//       exit
//               a1 = 0 if any words are non-zero before decrement else -1
//                      stop decrementing when first word is non-zero
//               a2 - a4, ip destroyed
//       The words lie below ptr and are visited downwards in memory. SUBS
//       of 1 leaves C clear only when the word was zero, i.e. when a borrow
//       must be taken from the next word; a1 = -1 therefore means the borrow
//       ran off the end of all count words (including count = 0).
        EXPORT(dec_loop_down)           // word aligned dec loop down
        DECLARE_FUNCTION(dec_loop_down)
GLABEL(dec_loop_down)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     dec_loop_down_l1        // yup, so branch
        LDR     a4,[a1,#-4]!            // DEC the first word
        SUBS    a4,a4,#1                // align the total to a multiple of 2
        STR     a4,[a1]
        MOVCS   a1,#0                   // set result to 0
        MOVCSS  pc,lr                   // return 0 if no borrow (word was non-zero)
LABEL(dec_loop_down_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MVNEQ   a1,#0                   // return -1
        MOVEQS  pc,lr                   // if zero then we're done
        MOV     ip,a1                   // move ptr to ip
        MOV     a1,#0                   // set result to 0
        ANDS    a3,a4,#3                // remaining count a multiple of 4 ?
        BEQ     dec_loop_down_l3        // yes: skip the 2-word step
        LDMDB   ip,{a2,a3}              // load 2 words in one go
        SUBS    a3,a3,#1                // DEC the two words
        SUBCCS  a2,a2,#1                // stopping when first word non-zero
        STMDB   ip!,{a2,a3}             // store 2 results
        MOVCSS  pc,lr                   // return 0 once the borrow is absorbed
        SUBS    a4,a4,#2                // decrement counter by 2
        MVNEQ   a1,#0                   // if finished loop then
        MOVEQS  pc,lr                   // return -1
LABEL(dec_loop_down_l3)                 // now a multiple of 4 words
        STMFD   sp!,{v1,lr}             // save work regs
LABEL(dec_loop_down_l2)
        LDMDB   ip,{a2,a3,v1,lr}        // load 4 words in one go
        SUBS    lr,lr,#1                // DEC the four words
        SUBCCS  v1,v1,#1                // stopping when first word non-zero
        SUBCCS  a3,a3,#1
        SUBCCS  a2,a2,#1
        STMDB   ip!,{a2,a3,v1,lr}       // store 4 results
        LDMCSFD sp!,{v1,pc}^            // return 0 if any carry
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     dec_loop_down_l2        // if count still positive then loop
        MVN     a1,#0                   // borrow ran off the last word
        LDMFD   sp!,{v1,pc}^            // restore work regs and return -1

// extern uintD neg_loop_down (uintD* ptr, uintC count);
//       Negates (two's complement) the count words lying just below ptr,
//       working downwards in memory: zero words are skipped, the first
//       non-zero word is negated, and every remaining word is inverted.
//       entry
//               a1 = ptr
//               a2 = count of words. The long integer is to be NEGated
//       exit
//               ptr[] = -ptr[] for count words
//               a1 = last carry: 0 if count = 0 or every word was zero,
//                    -1 otherwise
//               a2 - a4, ip destroyed
        EXPORT(neg_loop_down)           // word aligned neg loop down
        DECLARE_FUNCTION(neg_loop_down)
GLABEL(neg_loop_down)
        CMPS    a2,#0                   // count = 0 ?
        MOVEQ   a1,#0                   // yup, so return 0
        MOVEQS  pc,lr
LABEL(neg_loop_down_l1)                 // skip all the zero words first
        LDR     a3,[a1,#-4]!            // compare words against zero
        CMPS    a3,#0                   // downwards in memory
        BNE     neg_loop_down_l2        // non-zero, so negate rest of words
        SUBS    a2,a2,#1                // reduce count of words
        BNE     neg_loop_down_l1        // more ?, so loop
        MOV     a1,#0                   // every word was zero: return 0
        MOVS    pc,lr
LABEL(neg_loop_down_l2)
        RSB     a3,a3,#0                // first non-zero word = -word
        STR     a3,[a1]
        SUBS    a2,a2,#1                // a2 = words still to be inverted
        MVNEQ   a1,#0                   // done ? -> return -1
        MOVEQS  pc,lr
                                        // now NOT rest of the words
        ANDS    a3,a2,#3                // multiple of 4 words ?
        BEQ     neg_loop_down_l3        // yup, so branch
        CMP     a3,#2                   // NOT the first 1-3 words; the flags
        LDR     a3,[a1,#-4]!            // of this CMP pick how many, because
        MVN     a3,a3                   // a3 is reused for the data; this
        STR     a3,[a1]                 // aligns the rest to a multiple of 4
        BLT     neg_loop_down_l3        // better to branch than skip instrs.
        LDRGE   a3,[a1,#-4]!            // second word (remainder 2 or 3)
        MVNGE   a3,a3
        STRGE   a3,[a1]
        LDRGT   a3,[a1,#-4]!            // third word (remainder 3 only)
        MVNGT   a3,a3
        STRGT   a3,[a1]
LABEL(neg_loop_down_l3)
        BICS    a4,a2,#3                // set counter to multiple of 4
        MVNEQ   a1,#0                   // set result to -1
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{lr}                // save work regs
LABEL(neg_loop_down_l4)
        LDMDB   a1,{a2,a3,ip,lr}        // load 4 words in one go,NO writeback
        MVN     a2,a2                   // NOT the four words
        MVN     a3,a3
        MVN     ip,ip
        MVN     lr,lr
        STMDB   a1!,{a2,a3,ip,lr}       // store 4 results, move ptr down 16
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     neg_loop_down_l4        // if count still positive then loop
        MVN     a1,#0                   // set result to -1
        LDMFD   sp!,{pc}^               // restore work regs and return -1

// extern uintD shift1left_loop_down (uintD* ptr, uintC count);
//       Shifts the count words lying just below ptr left by one bit,
//       working downwards in memory. The carry flag links successive words,
//       so it must stay intact between the shifts: the ANDS/BICS/TEQ used
//       for counting are relied upon not to disturb it.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted left
//       exit
//               a1 = carry out from last shift left (0 or 1)
//               a2 - a4, ip destroyed
        EXPORT(shift1left_loop_down)    // word aligned shift1left loop down
        DECLARE_FUNCTION(shift1left_loop_down)
GLABEL(shift1left_loop_down)
        CMN     a1,#0                   // clear carry bit (a1 + 0 cannot carry)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     shift1left_loop_down_l1 // yup, so branch
        LDR     a4,[a1,#-4]!            // shift left the first word
        ADDS    a4,a4,a4                // C := bit shifted out of it
        STR     a4,[a1]
LABEL(shift1left_loop_down_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        ADCEQ   a1,a4,a4                // if zero set result to C (a4 is 0)
        MOVEQS  pc,lr                   // and return
        ANDS    a3,a4,#3                // multiple of 4 words ?
        BEQ     shift1left_loop_down_l3 // yup, so branch
        LDMDB   a1,{a2,a3}              // load 2 words in one go
        ADCS    a3,a3,a3                // shift left the two words, the one
        ADCS    a2,a2,a2                // at the higher address (a3) first
        STMDB   a1!,{a2,a3}             // store 2 results
        BICS    a4,a4,#2                // decrement counter by 2
        ADCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        MOVEQS  pc,lr                   // and return
LABEL(shift1left_loop_down_l3)          // now a multiple of 4 words
        STMFD   sp!,{lr}                // save work regs
LABEL(shift1left_loop_down_l2)
        LDMDB   a1,{a2,a3,ip,lr}        // load 4 words in one go
        ADCS    lr,lr,lr                // shift left the four words
        ADCS    ip,ip,ip
        ADCS    a3,a3,a3
        ADCS    a2,a2,a2
        STMDB   a1!,{a2,a3,ip,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4 (no S: the
        TEQ     a4,#0                   // carry is still needed); are we done ?
        BNE     shift1left_loop_down_l2 // if count non-zero then loop
        ADC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{pc}^               // restore work regs and return carry

// extern uintD shiftleft_loop_down (uintD* ptr, uintC count, uintC i, uintD carry);
//       Shifts the count words lying just below ptr left by i bits, working
//       downwards in memory. The bits shifted out of each word are ORed
//       into the bottom of the next word; carry supplies them for the first.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted left
//               a3 = size of left shift
//               a4 = value to ORR in for first shift
//       exit
//               a1 = shift out from last shift left
//               a2 - a4, ip destroyed
        EXPORT(shiftleft_loop_down)     // word aligned shiftleft loop down
        DECLARE_FUNCTION(shiftleft_loop_down)
GLABEL(shiftleft_loop_down)
        STMFD   sp!,{v6,lr}             // v6 and lr are used as work regs
        RSB     v6,a3,#32               // size of complementary right shift
        ANDS    ip,a2,#3                // multiple of 4 words ?
        BEQ     shiftleft_loop_down_l1  // yup, so branch
        LDR     lr,[a1,#-4]!            // shiftleft the first 1-3 words
        ORR     a4,a4,lr,ASL a3         // to align the total to a multiple
        STR     a4,[a1,#0]              // of 4 words
        MOV     a4,lr,LSR v6            // a4 = bits shifted out = next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 words
        BLT     shiftleft_loop_down_l1  // better to branch than skip instrs.
        LDRGE   lr,[a1,#-4]!            // second word (remainder 2 or 3)
        ORRGE   a4,a4,lr,ASL a3
        STRGE   a4,[a1,#0]
        MOVGE   a4,lr,LSR v6
        LDRGT   lr,[a1,#-4]!            // third word (remainder 3 only)
        ORRGT   a4,a4,lr,ASL a3
        STRGT   a4,[a1,#0]
        MOVGT   a4,lr,LSR v6
LABEL(shiftleft_loop_down_l1)
        BICS    ip,a2,#3                // set counter to multiple of 4
        MOVEQ   a1,a4                   // if zero then we're done
        LDMEQFD sp!,{v6,pc}^            // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftleft_loop_down_l2)
        LDMDB   a1,{a2,v1,v2,v3}        // load 4 words in one go (v3 is the
                                        // word at the highest address)
        ORR     lr,a4,v3,ASL a3         // shiftleft the four words
        MOV     a4,v3,LSR v6            // keep carry in a4
        ORR     v3,a4,v2,ASL a3         // and store results up a register
        MOV     a4,v2,LSR v6            // to regs v1-v3,lr
        ORR     v2,a4,v1,ASL a3
        MOV     a4,v1,LSR v6
        ORR     v1,a4,a2,ASL a3
        MOV     a4,a2,LSR v6
        STMDB   a1!,{v1,v2,v3,lr}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftleft_loop_down_l2  // if count still positive then loop
        MOV     a1,a4                   // result = last shift out
        LDMFD   sp!,{v1-v3,v6,pc}^      // restore work regs and return

// extern uintD shiftleftcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i);
//       Reads count words downwards from just below sourceptr, shifts them
//       left by i bits and writes the results downwards from just below
//       destptr. The carry into the first word is 0.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be shifted left
//               a4 = size of left shift
//       exit
//               a1 = shift out from last shift left
//               a2 - a4, ip destroyed
        EXPORT(shiftleftcopy_loop_down) // word aligned shiftleftcopy loop down
        DECLARE_FUNCTION(shiftleftcopy_loop_down)
GLABEL(shiftleftcopy_loop_down)
        STMFD   sp!,{v5,v6,lr}          // v5, v6 and lr are used as work regs
        MOV     v5,#0                   // initial shift carry
        RSB     v6,a4,#32               // size of complementary right shift
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     shiftleftcopy_loop_down_l1      // yup, so branch
        LDR     lr,[a1,#-4]!            // shiftleft the first 1-3 words
        ORR     v5,v5,lr,ASL a4         // to align the total to a multiple
        STR     v5,[a2,#-4]!            // of 4 words
        MOV     v5,lr,LSR v6            // v5 = bits shifted out = next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 words
        BLT     shiftleftcopy_loop_down_l1      // better to branch than skip instrs.
        LDRGE   lr,[a1,#-4]!            // second word (remainder 2 or 3)
        ORRGE   v5,v5,lr,ASL a4
        STRGE   v5,[a2,#-4]!
        MOVGE   v5,lr,LSR v6
        LDRGT   lr,[a1,#-4]!            // third word (remainder 3 only)
        ORRGT   v5,v5,lr,ASL a4
        STRGT   v5,[a2,#-4]!
        MOVGT   v5,lr,LSR v6
LABEL(shiftleftcopy_loop_down_l1)
        BICS    ip,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,v5                   // if zero then we're done
        LDMEQFD sp!,{v5,v6,pc}^         // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftleftcopy_loop_down_l2)
        LDMDB   a1!,{a3,v1,v2,v3}       // load 4 words in one go (source ptr
                                        // written back; v3 is the highest word)
        ORR     lr,v5,v3,ASL a4         // shiftleft the four words
        MOV     v5,v3,LSR v6            // keep carry in v5
        ORR     v3,v5,v2,ASL a4         // and store results up a register
        MOV     v5,v2,LSR v6            // to regs v1-v3,lr
        ORR     v2,v5,v1,ASL a4
        MOV     v5,v1,LSR v6
        ORR     v1,v5,a3,ASL a4
        MOV     v5,a3,LSR v6
        STMDB   a2!,{v1,v2,v3,lr}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftleftcopy_loop_down_l2      // if count still positive then loop
        MOV     a1,v5                   // result = last shift out
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

// extern uintD shift1right_loop_up (uintD* ptr, uintC count, uintD carry);
//       Shifts count words, starting at ptr and working upwards in memory,
//       right by one bit. The carry flag links successive words, so it must
//       stay intact between the shifts: the ANDS/BICS/TEQ used for counting
//       are relied upon not to disturb it.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = carry (bit 0 is shifted into the top of the first word)
//       exit
//               a1 = carry out from last shift right, always delivered in
//                    bit 31 (0 or 0x80000000), whichever exit path is taken
//               a2 - a4, ip destroyed
        EXPORT(shift1right_loop_up)     // word aligned shift1right loop up
        DECLARE_FUNCTION(shift1right_loop_up)
GLABEL(shift1right_loop_up)
        MOVS    a3,a3,LSR #1            // set carry from bit 0 of a3
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     shift1right_loop_up_l1  // yup, so branch
        LDR     a4,[a1]                 // shift right the first word
        MOVS    a4,a4,RRX
        STR     a4,[a1],#4
LABEL(shift1right_loop_up_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,a4,RRX               // if zero set result to C (a4 is 0)
        MOVEQS  pc,lr                   // and return
        ANDS    a3,a4,#3                // multiple of 4 words ?
        BEQ     shift1right_loop_up_l3  // yup, so branch
        LDMIA   a1,{a2,a3}              // load 2 words in one go
        MOVS    a2,a2,RRX               // shift right the two words
        MOVS    a3,a3,RRX
        STMIA   a1!,{a2,a3}             // store 2 results
        BICS    a4,a4,#2                // decrement counter by 2
        MOVEQ   a1,a4,RRX               // set result to Carry in bit 31 (a4
                                        // is 0), same form as the other exits
        MOVEQS  pc,lr                   // and return
LABEL(shift1right_loop_up_l3)           // now a multiple of 4 words
        STMFD   sp!,{lr}                // save work regs
LABEL(shift1right_loop_up_l2)
        LDMIA   a1,{a2,a3,ip,lr}        // load 4 words in one go
        MOVS    a2,a2,RRX               // shift right the four words
        MOVS    a3,a3,RRX
        MOVS    ip,ip,RRX
        MOVS    lr,lr,RRX
        STMIA   a1!,{a2,a3,ip,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4 (no S: the
        TEQ     a4,#0                   // carry is still needed); are we done ?
        BNE     shift1right_loop_up_l2  // if count non-zero then loop
        MOV     a1,a4,RRX               // set result to Carry (a4 is 0)
        LDMFD   sp!,{pc}^               // restore work regs and return carry

// extern uintD shiftright_loop_up (uintD* ptr, uintC count, uintC i);
//       Shifts count words, starting at ptr and working upwards in memory,
//       right by i bits. The bits shifted out of each word are ORed into
//       the top of the next word; the carry into the first word is 0.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftright_loop_up)      // word aligned shiftright loop up
        DECLARE_FUNCTION(shiftright_loop_up)
GLABEL(shiftright_loop_up)
        STMFD   sp!,{v6,lr}             // v6 and lr are used as work regs
        MOV     a4,#0                   // initial shift carry
        RSB     v6,a3,#32               // size of complementary left shift
LABEL(shiftright_loop_up_l0)            // shiftrightsigned_loop_up joins here
                                        // with v6,lr pushed and a4,v6 set up
        ANDS    ip,a2,#3                // multiple of 4 words ?
        BEQ     shiftright_loop_up_l1   // yup, so branch
        LDR     lr,[a1]                 // shiftright the first 1-3 words
        ORR     a4,a4,lr,LSR a3         // to align the total to a multiple
        STR     a4,[a1],#4              // of 4 words
        MOV     a4,lr,ASL v6            // a4 = bits shifted out = next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 words
        BLT     shiftright_loop_up_l1   // better to branch than skip instrs.
        LDRGE   lr,[a1]                 // second word (remainder 2 or 3)
        ORRGE   a4,a4,lr,LSR a3
        STRGE   a4,[a1],#4
        MOVGE   a4,lr,ASL v6
        LDRGT   lr,[a1]                 // third word (remainder 3 only)
        ORRGT   a4,a4,lr,LSR a3
        STRGT   a4,[a1],#4
        MOVGT   a4,lr,ASL v6
LABEL(shiftright_loop_up_l1)
        BICS    ip,a2,#3                // set counter to multiple of 4
        MOVEQ   a1,a4                   // if zero then we're done
        LDMEQFD sp!,{v6,pc}^            // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftright_loop_up_l2)
        LDMIA   a1,{v1,v2,v3,lr}        // load 4 words in one go (v1 is the
                                        // word at the lowest address)
        ORR     a2,a4,v1,LSR a3         // shiftright the four words
        MOV     a4,v1,ASL v6            // keep carry in a4
        ORR     v1,a4,v2,LSR a3         // and store results down a register
        MOV     a4,v2,ASL v6            // to regs a2,v1-v3
        ORR     v2,a4,v3,LSR a3
        MOV     a4,v3,ASL v6
        ORR     v3,a4,lr,LSR a3
        MOV     a4,lr,ASL v6
        STMIA   a1!,{a2,v1,v2,v3}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftright_loop_up_l2   // if count still positive then loop
        MOV     a1,a4                   // result = last shift out
        LDMFD   sp!,{v1-v3,v6,pc}^      // restore work regs and return

// extern uintD shiftrightsigned_loop_up (uintD* ptr, uintC count, uintC i);
//       Sign-propagating variant of shiftright_loop_up: the i bits shifted
//       into the top of the first word are copies of that word's bit 31.
//       Only the initial carry is prepared here; the shifting itself is
//       done by the shiftright_loop_up code.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right signed
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftrightsigned_loop_up)// word aligned shiftrightsigned loop up
        DECLARE_FUNCTION(shiftrightsigned_loop_up)
GLABEL(shiftrightsigned_loop_up)
        STMFD   sp!,{v6,lr}             // same saved regs as shiftright_loop_up
        LDR     lr,[a1]                 // peek at the first word, ptr unchanged
        RSB     v6,a3,#32               // size of complementary left shift
        MOV     a4,lr,ASR #31           // a4 = 0 or -1, the replicated sign
        MOV     a4,a4,LSL v6            // carry = sign in bits 31->(32-i)
        B       shiftright_loop_up_l0   // use right shift code now

// extern uintD shiftrightcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry);
//       Reads count words upwards from sourceptr, shifts them right by i
//       bits and writes the results upwards from destptr. The low bits of
//       carry are moved to the top of a word and ORed into the first result.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be shifted right
//               a4 = size of right shift
//               [sp] = carry for first shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftrightcopy_loop_up)  // word aligned shiftrightcopy loop up
        DECLARE_FUNCTION(shiftrightcopy_loop_up)
GLABEL(shiftrightcopy_loop_up)
        STMFD   sp!,{v5,v6,lr}          // 3 regs pushed = 12 bytes, so the
        LDR     v5,[sp,#12]             // stacked carry argument is at sp+12
        RSB     v6,a4,#32               // size of complementary left shift
        MOV     v5,v5,ASL v6            // initial shift carry, moved to the top
LABEL(shiftrightcopy_loop_up_l0)
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     shiftrightcopy_loop_up_l1       // yup, so branch
        LDR     lr,[a1],#4              // shiftright the first 1-3 words
        ORR     v5,v5,lr,LSR a4         // to align the total to a multiple
        STR     v5,[a2],#4              // of 4 words
        MOV     v5,lr,ASL v6            // v5 = bits shifted out = next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 words
        BLT     shiftrightcopy_loop_up_l1       // better to branch than skip instrs.
        LDRGE   lr,[a1],#4              // second word (remainder 2 or 3)
        ORRGE   v5,v5,lr,LSR a4
        STRGE   v5,[a2],#4
        MOVGE   v5,lr,ASL v6
        LDRGT   lr,[a1],#4              // third word (remainder 3 only)
        ORRGT   v5,v5,lr,LSR a4
        STRGT   v5,[a2],#4
        MOVGT   v5,lr,ASL v6
LABEL(shiftrightcopy_loop_up_l1)
        BICS    ip,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,v5                   // if zero then we're done
        LDMEQFD sp!,{v5,v6,pc}^         // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftrightcopy_loop_up_l2)
        LDMIA   a1!,{v1,v2,v3,lr}       // load 4 words in one go (v1 is the
                                        // word at the lowest address)
        ORR     a3,v5,v1,LSR a4         // shiftright the four words
        MOV     v5,v1,ASL v6            // keep carry in v5
        ORR     v1,v5,v2,LSR a4         // and store results down a register
        MOV     v5,v2,ASL v6            // to regs a3,v1-v3
        ORR     v2,v5,v3,LSR a4
        MOV     v5,v3,ASL v6
        ORR     v3,v5,lr,LSR a4
        MOV     v5,lr,ASL v6
        STMIA   a2!,{a3,v1,v2,v3}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftrightcopy_loop_up_l2       // if count still positive then loop
        MOV     a1,v5                   // result = last shift out
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

#ifndef HAVE_umull
// mulu32_64_vregs
//       Internal helper (not exported), built only when HAVE_umull is not
//       defined: 32x32 -> 64 bit unsigned multiply assembled from four
//       16x16 products. Called with BL, so the caller's lr is overwritten
//       by the call; a1 is left unchanged.
//       entry
//               a1 = x
//               ip = y
//       exit
//               v1 = low32(x*y)
//               ip = high32(x*y)
//               v2,v3,v4 destroyed
LABEL(mulu32_64_vregs)
        MOV     v1,a1,LSR #16           // temp := top half of x
        MOV     v2,ip,LSR #16           // hi := top half of y
        BIC     v3,a1,v1,LSL #16        // x  := bottom half of x
        BIC     ip,ip,v2,LSL #16        // y  := bottom half of y
        MUL     v4,v3,ip                // low section of result (xlo*ylo)
        MUL     ip,v1,ip                // ) middle sections    (xhi*ylo)
        MUL     v3,v2,v3                // )   of result        (yhi*xlo)
        MUL     v2,v1,v2                // high section of result (xhi*yhi)
        ADDS    ip,ip,v3                // add middle sections
                                        // (can't use mla as we need carry)
        ADDCS   v2,v2,#0x10000          // carry from above add; the middle
                                        // sum is weighted by 2^16, so its
                                        // carry is worth 2^16 in the high word
        ADDS    v1,v4,ip,LSL #16        // x is now bottom 32 bits of result
        ADC     ip,v2,ip,LSR #16        // hi is top 32 bits
        MOVS    pc,lr
#endif

// extern uintD mulusmall_loop_down (uintD digit, uintD* ptr, uintC len, uintD newdigit);
//       Multiplies, in place, the len words lying just below ptr by digit,
//       working downwards in memory; newdigit is added in as the carry
//       into the first word.
//       entry
//               a1 = digit
//               a2 = ptr
//               a3 = count of words to be multiplied down
//               a4 = new digit = carry
//       exit
//               a1 = final carry of multiply (= newdigit when len is 0)
//               a2 - a4, ip destroyed
        EXPORT(mulusmall_loop_down)
        DECLARE_FUNCTION(mulusmall_loop_down)
GLABEL(mulusmall_loop_down)
        CMP     a3,#0                   // len = 0 ?
        MOVEQ   a1,a4                   // yup, so the carry is the result
        MOVEQS  pc,lr
#ifdef HAVE_umull
        STMFD   sp!,{v1,lr}
LABEL(mulusmall_loop_down_l1)
        LDR     ip,[a2,#-4]!
        UMULL   v1,ip,a1,ip             // muluD(digit,*--ptr,hi=,lo=)
        ADDS    v1,v1,a4                // lo += carry
        ADC     a4,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a2,#0]              // *ptr = lo
        SUBS    a3,a3,#1                // len--
        BNE     mulusmall_loop_down_l1  // until len==0
        MOV     a1,a4                   // return carry
        LDMFD   sp!,{v1,pc}^
#else
        STMFD   sp!,{v1-v2,lr}
LABEL(mulusmall_loop_down_l1)
        LDR     ip,[a2,#-4]!

//      BL      mulu32_64_vregs         // muluD(digit,*--ptr,hi=,lo=)
// replaced by multiplication of a small x = a1 and a big y = ip :
// NOTE(review): x is used unsplit below, so the two products are exact only
// if x*(16-bit half) fits in 32 bits - presumably digit < 2^16; confirm
// against the callers.
        MOV     v1,ip,LSR #16           // top half of y
        BIC     ip,ip,v1,LSL #16        // bottom half of y
        MUL     v2,a1,v1                // middle section of result
        MUL     v1,a1,ip                // low section of result
        MOV     ip,#0                   // high section of result
        ADDS    v1,v1,v2,LSL #16        // bottom 32 bits of result
        ADC     ip,ip,v2,LSR #16        // top 32 bits of result

        ADDS    v1,v1,a4                // lo += carry
        ADC     a4,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a2,#0]              // *ptr = lo
        SUBS    a3,a3,#1                // len--
        BNE     mulusmall_loop_down_l1  // until len==0
        MOV     a1,a4                   // return carry
        LDMFD   sp!,{v1-v2,pc}^
#endif

// extern void mulu_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
//       Multiplies the len words lying just below sourceptr by digit,
//       working downwards in memory, and writes len+1 words downwards from
//       just below destptr: the len low product words, then the final carry.
//       There is no test for len = 0; the loop body always runs at least
//       once.
//       entry
//               a1 = digit
//               a2 = sourceptr
//               a3 = destptr
//               a4 = count of words to be multiplied down
//       exit
//               a1 - a4, ip destroyed
        EXPORT(mulu_loop_down)
        DECLARE_FUNCTION(mulu_loop_down)
GLABEL(mulu_loop_down)
#ifdef HAVE_umull
        STMFD   sp!,{v1,v5,lr}
        MOV     v5,#0                   // carry = 0
LABEL(mulu_loop_down_l1)
        LDR     ip,[a2,#-4]!
        UMULL   v1,ip,a1,ip             // muluD(digit,*--sourceptr,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3,#-4]!            // *--destptr = lo
        SUBS    a4,a4,#1                // len--
        BNE     mulu_loop_down_l1       // until len==0
        STR     v5,[a3,#-4]!            // *--destptr = carry
        LDMFD   sp!,{v1,v5,pc}^
#else
        STMFD   sp!,{v1-v5,lr}          // v1-v4 are used by mulu32_64_vregs,
                                        // lr is overwritten by the BL below
        MOV     v5,#0                   // carry = 0
LABEL(mulu_loop_down_l1)
        LDR     ip,[a2,#-4]!
        BL      mulu32_64_vregs         // muluD(digit,*--sourceptr,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3,#-4]!            // *--destptr = lo
        SUBS    a4,a4,#1                // len--
        BNE     mulu_loop_down_l1       // until len==0
        STR     v5,[a3,#-4]!            // *--destptr = carry
        LDMFD   sp!,{v1-v5,pc}^
#endif

// extern uintD muluadd_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
//       Multiply-accumulate: for each of the len words lying just below
//       sourceptr (working downwards), digit * word plus the running carry
//       is added to the corresponding word below destptr, in place.
//       There is no test for len = 0; the loop body always runs at least
//       once.
//       entry
//               a1 = digit
//               a2 = sourceptr
//               a3 = destptr
//               a4 = count of words to be multiplied added down
//       exit
//               a1 = final carry
//               a2 - a4, ip destroyed
        EXPORT(muluadd_loop_down)
        DECLARE_FUNCTION(muluadd_loop_down)
GLABEL(muluadd_loop_down)
#ifdef HAVE_umull
        STMFD   sp!,{v1,v5,lr}
        MOV     v5,#0                   // carry = 0
LABEL(muluadd_loop_down_l1)
        LDR     ip,[a2,#-4]!
        UMULL   v1,ip,a1,ip             // muluD(digit,*--sourceptr,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADCCS   ip,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     v5,[a3,#-4]!            // carry = *--destptr
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3,#0]              // *destptr = lo
        SUBS    a4,a4,#1                // len--
        BNE     muluadd_loop_down_l1    // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1,v5,pc}^
#else
        STMFD   sp!,{v1-v5,lr}          // v1-v4 are used by mulu32_64_vregs,
                                        // lr is overwritten by the BL below
        MOV     v5,#0                   // carry = 0
LABEL(muluadd_loop_down_l1)
        LDR     ip,[a2,#-4]!
        BL      mulu32_64_vregs         // muluD(digit,*--sourceptr,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADCCS   ip,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     v5,[a3,#-4]!            // carry = *--destptr
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3,#0]              // *destptr = lo
        SUBS    a4,a4,#1                // len--
        BNE     muluadd_loop_down_l1    // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1-v5,pc}^
#endif

// extern uintD mulusub_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
//       Multiply-subtract: for each of the len words lying just below
//       sourceptr (working downwards), digit * word plus the running carry
//       is subtracted from the corresponding word below destptr, in place.
//       A borrow from the subtraction is folded into the running carry.
//       There is no test for len = 0; the loop body always runs at least
//       once.
//       entry
//               a1 = digit
//               a2 = sourceptr
//               a3 = destptr
//               a4 = count of words to be multiplied subtracted down
//       exit
//               a1 = final carry
//               a2 - a4, ip destroyed
        EXPORT(mulusub_loop_down)
        DECLARE_FUNCTION(mulusub_loop_down)
GLABEL(mulusub_loop_down)
#ifdef HAVE_umull
        STMFD   sp!,{v1,v5,lr}
        MOV     v5,#0                   // carry = 0
LABEL(mulusub_loop_down_l1)
        LDR     ip,[a2,#-4]!
        UMULL   v1,ip,a1,ip             // muluD(digit,*--sourceptr,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     ip,[a3,#-4]!            // carry = *--destptr
        SUBS    ip,ip,v1                // C clear here means a borrow
        STR     ip,[a3,#0]              // *destptr = carry - lo
        ADDCC   v5,v5,#1                // if (carry<lo) { hi += 1 }; carry=hi
        SUBS    a4,a4,#1                // len--
        BNE     mulusub_loop_down_l1    // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1,v5,pc}^
#else
        STMFD   sp!,{v1-v5,lr}          // v1-v4 are used by mulu32_64_vregs,
                                        // lr is overwritten by the BL below
        MOV     v5,#0                   // carry = 0
LABEL(mulusub_loop_down_l1)
        LDR     ip,[a2,#-4]!
        BL      mulu32_64_vregs         // muluD(digit,*--sourceptr,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     ip,[a3,#-4]!            // carry = *--destptr
        SUBS    ip,ip,v1                // C clear here means a borrow
        STR     ip,[a3,#0]              // *destptr = carry - lo
        ADDCC   v5,v5,#1                // if (carry<lo) { hi += 1 }; carry=hi
        SUBS    a4,a4,#1                // len--
        BNE     mulusub_loop_down_l1    // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1-v5,pc}^
#endif

#endif

#if !CL_DS_BIG_ENDIAN_P

// extern void or_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       ORs the count words lying just below yptr into the count words
//       lying just below xptr, working downwards in memory. 1-3 words are
//       handled singly first, then the rest four at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ORed
//       exit
//               xptr |= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(or_loop_down)            // word aligned or loop down
        DECLARE_FUNCTION(or_loop_down)
GLABEL(or_loop_down)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     or_loop_down_l1         // yup, so branch
        CMP     a4,#2                   // OR the first 1-3 words; the flags
        LDR     a4,[a2,#-4]!            // of this CMP pick how many, because
        LDR     ip,[a1,#-4]!            // a4 is reused for the data; this
        ORR     ip,ip,a4                // aligns the rest to a multiple of
        STR     ip,[a1]                 // 4 words
        BLT     or_loop_down_l1         // better to branch than skip instrs.
        LDRGE   a4,[a2,#-4]!            // second word (remainder 2 or 3)
        LDRGE   ip,[a1,#-4]!
        ORRGE   ip,ip,a4
        STRGE   ip,[a1]
        LDRGT   a4,[a2,#-4]!            // third word (remainder 3 only)
        LDRGT   ip,[a1,#-4]!
        ORRGT   ip,ip,a4
        STRGT   ip,[a1]
LABEL(or_loop_down_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v5,lr}          // save work regs
LABEL(or_loop_down_l2)
        LDMDB   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMDB   a1,{v3,v4,v5,lr}        // load target words
        ORR     v3,v3,a3                // OR the four words, pairing the
        ORR     v4,v4,v1                // registers in ascending register
        ORR     v5,v5,v2                // number, i.e. in memory order
        ORR     lr,lr,ip
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     or_loop_down_l2         // if count still positive then loop
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void xor_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       XORs the count words lying just below yptr into the count words
//       lying just below xptr, working downwards in memory. 1-3 words are
//       handled singly first, then the rest four at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be XORed
//       exit
//               xptr ^= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(xor_loop_down)           // word aligned xor loop down
        DECLARE_FUNCTION(xor_loop_down)
GLABEL(xor_loop_down)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     xor_loop_down_l1        // yup, so branch
        CMP     a4,#2                   // XOR the first 1-3 words; the flags
        LDR     a4,[a2,#-4]!            // of this CMP pick how many, because
        LDR     ip,[a1,#-4]!            // a4 is reused for the data; this
        EOR     ip,ip,a4                // aligns the rest to a multiple of
        STR     ip,[a1]                 // 4 words
        BLT     xor_loop_down_l1        // better to branch than skip instrs.
        LDRGE   a4,[a2,#-4]!            // second word (remainder 2 or 3)
        LDRGE   ip,[a1,#-4]!
        EORGE   ip,ip,a4
        STRGE   ip,[a1]
        LDRGT   a4,[a2,#-4]!            // third word (remainder 3 only)
        LDRGT   ip,[a1,#-4]!
        EORGT   ip,ip,a4
        STRGT   ip,[a1]
LABEL(xor_loop_down_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v5,lr}          // save work regs
LABEL(xor_loop_down_l2)
        LDMDB   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMDB   a1,{v3,v4,v5,lr}        // load target words
        EOR     v3,v3,a3                // XOR the four words, pairing the
        EOR     v4,v4,v1                // registers in ascending register
        EOR     v5,v5,v2                // number, i.e. in memory order
        EOR     lr,lr,ip
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     xor_loop_down_l2        // if count still positive then loop
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void and_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       ANDs the count words lying just below yptr into the count words
//       lying just below xptr, working downwards in memory. 1-3 words are
//       handled singly first, then the rest four at a time.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ANDed
//       exit
//               xptr &= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(and_loop_down)           // word aligned and loop down
        DECLARE_FUNCTION(and_loop_down)
GLABEL(and_loop_down)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     and_loop_down_l1        // yup, so branch
        CMP     a4,#2                   // AND the first 1-3 words; the flags
        LDR     a4,[a2,#-4]!            // of this CMP pick how many, because
        LDR     ip,[a1,#-4]!            // a4 is reused for the data; this
        AND     ip,ip,a4                // aligns the rest to a multiple of
        STR     ip,[a1]                 // 4 words
        BLT     and_loop_down_l1        // better to branch than skip instrs.
        LDRGE   a4,[a2,#-4]!            // second word (remainder 2 or 3)
        LDRGE   ip,[a1,#-4]!
        ANDGE   ip,ip,a4
        STRGE   ip,[a1]
        LDRGT   a4,[a2,#-4]!            // third word (remainder 3 only)
        LDRGT   ip,[a1,#-4]!
        ANDGT   ip,ip,a4
        STRGT   ip,[a1]
LABEL(and_loop_down_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v5,lr}          // save work regs
LABEL(and_loop_down_l2)
        LDMDB   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMDB   a1,{v3,v4,v5,lr}        // load target words
        AND     v3,v3,a3                // AND the four words, pairing the
        AND     v4,v4,v1                // registers in ascending register
        AND     v5,v5,v2                // number, i.e. in memory order
        AND     lr,lr,ip
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     and_loop_down_l2        // if count still positive then loop
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void eqv_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       In-place bitwise equivalence (complement of XOR) of two word
//       arrays, walking downwards: both pointers are pre-decremented.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be EQVed
//       exit
//               xptr[] = ~(xptr[] ^ yptr[]) for count words
//               a1 - a4, ip destroyed
        EXPORT(eqv_loop_down)           // word aligned eqv loop down
        DECLARE_FUNCTION(eqv_loop_down)
GLABEL(eqv_loop_down)
        ANDS    a4,a3,#3                // a4 = count mod 4 = odd words to peel off
        BEQ     eqv_loop_down_l1        // none, go straight to the 4-word loop
        CMP     a4,#2                   // keep 1 / 2 / 3 in the flags as LT / EQ / GT
        LDR     ip,[a1,#-4]!            // first odd word: step both pointers down
        LDR     a4,[a2,#-4]!
        EOR     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
        BLT     eqv_loop_down_l1        // there was only one odd word
        LDR     ip,[a1,#-4]!            // second odd word (GE always holds past the BLT)
        LDR     a4,[a2,#-4]!
        EOR     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
        BLE     eqv_loop_down_l1        // there were exactly two odd words
        LDR     ip,[a1,#-4]!            // third odd word (GT always holds past the BLE)
        LDR     a4,[a2,#-4]!
        EOR     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
LABEL(eqv_loop_down_l1)
        BICS    a4,a3,#3                // a4 = words left, a multiple of 4
        MOVEQS  pc,lr                   // nothing left, so return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr are needed as work regs
LABEL(eqv_loop_down_l2)
        LDMDB   a1,{v3,v4,v5,lr}        // 4 target words, a1 is moved by the store
        LDMDB   a2!,{a3,v1,v2,ip}       // 4 source words, a2 steps down 4 words
        EOR     lr,lr,ip                // EQV the four words: first XOR each pair
        EOR     v5,v5,v2
        EOR     v4,v4,v1
        EOR     v3,v3,a3
        MVN     lr,lr                   // then complement each result
        MVN     v5,v5
        MVN     v4,v4
        MVN     v3,v3
        SUBS    a4,a4,#4                // count down by 4, STMDB keeps the flags
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results, a1 steps down 4 words
        BGT     eqv_loop_down_l2        // loop while words remain
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void nand_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       In-place bitwise NAND of two word arrays, walking downwards:
//       both pointers are pre-decremented.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be NANDed
//       exit
//               xptr[] = ~(xptr[] & yptr[]) for count words
//               a1 - a4, ip destroyed
        EXPORT(nand_loop_down)          // word aligned nand loop down
        DECLARE_FUNCTION(nand_loop_down)
GLABEL(nand_loop_down)
        ANDS    a4,a3,#3                // a4 = count mod 4 = odd words to peel off
        BEQ     nand_loop_down_l1       // none, go straight to the 4-word loop
        CMP     a4,#2                   // keep 1 / 2 / 3 in the flags as LT / EQ / GT
        LDR     ip,[a1,#-4]!            // first odd word: step both pointers down
        LDR     a4,[a2,#-4]!
        AND     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
        BLT     nand_loop_down_l1       // there was only one odd word
        LDR     ip,[a1,#-4]!            // second odd word (GE always holds past the BLT)
        LDR     a4,[a2,#-4]!
        AND     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
        BLE     nand_loop_down_l1       // there were exactly two odd words
        LDR     ip,[a1,#-4]!            // third odd word (GT always holds past the BLE)
        LDR     a4,[a2,#-4]!
        AND     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
LABEL(nand_loop_down_l1)
        BICS    a4,a3,#3                // a4 = words left, a multiple of 4
        MOVEQS  pc,lr                   // nothing left, so return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr are needed as work regs
LABEL(nand_loop_down_l2)
        LDMDB   a1,{v3,v4,v5,lr}        // 4 target words, a1 is moved by the store
        LDMDB   a2!,{a3,v1,v2,ip}       // 4 source words, a2 steps down 4 words
        AND     lr,lr,ip                // NAND the four words: first AND each pair
        AND     v5,v5,v2
        AND     v4,v4,v1
        AND     v3,v3,a3
        MVN     lr,lr                   // then complement each result
        MVN     v5,v5
        MVN     v4,v4
        MVN     v3,v3
        SUBS    a4,a4,#4                // count down by 4, STMDB keeps the flags
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results, a1 steps down 4 words
        BGT     nand_loop_down_l2       // loop while words remain
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void nor_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       In-place bitwise NOR of two word arrays, walking downwards:
//       both pointers are pre-decremented.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be NORed
//       exit
//               xptr[] = ~(xptr[] | yptr[]) for count words
//               a1 - a4, ip destroyed
        EXPORT(nor_loop_down)           // word aligned nor loop down
        DECLARE_FUNCTION(nor_loop_down)
GLABEL(nor_loop_down)
        ANDS    a4,a3,#3                // a4 = count mod 4 = odd words to peel off
        BEQ     nor_loop_down_l1        // none, go straight to the 4-word loop
        CMP     a4,#2                   // keep 1 / 2 / 3 in the flags as LT / EQ / GT
        LDR     ip,[a1,#-4]!            // first odd word: step both pointers down
        LDR     a4,[a2,#-4]!
        ORR     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
        BLT     nor_loop_down_l1        // there was only one odd word
        LDR     ip,[a1,#-4]!            // second odd word (GE always holds past the BLT)
        LDR     a4,[a2,#-4]!
        ORR     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
        BLE     nor_loop_down_l1        // there were exactly two odd words
        LDR     ip,[a1,#-4]!            // third odd word (GT always holds past the BLE)
        LDR     a4,[a2,#-4]!
        ORR     ip,a4,ip
        MVN     ip,ip
        STR     ip,[a1]
LABEL(nor_loop_down_l1)
        BICS    a4,a3,#3                // a4 = words left, a multiple of 4
        MOVEQS  pc,lr                   // nothing left, so return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr are needed as work regs
LABEL(nor_loop_down_l2)
        LDMDB   a1,{v3,v4,v5,lr}        // 4 target words, a1 is moved by the store
        LDMDB   a2!,{a3,v1,v2,ip}       // 4 source words, a2 steps down 4 words
        ORR     lr,lr,ip                // NOR the four words: first OR each pair
        ORR     v5,v5,v2
        ORR     v4,v4,v1
        ORR     v3,v3,a3
        MVN     lr,lr                   // then complement each result
        MVN     v5,v5
        MVN     v4,v4
        MVN     v3,v3
        SUBS    a4,a4,#4                // count down by 4, STMDB keeps the flags
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results, a1 steps down 4 words
        BGT     nor_loop_down_l2        // loop while words remain
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void andc2_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       In-place AND of one word array with the complement of another
//       (bit clear), walking downwards: both pointers are pre-decremented.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ANDC2ed
//       exit
//               xptr[] = xptr[] & ~yptr[] for count words
//               a1 - a4, ip destroyed
        EXPORT(andc2_loop_down)         // word aligned andc2 loop down
        DECLARE_FUNCTION(andc2_loop_down)
GLABEL(andc2_loop_down)
        ANDS    a4,a3,#3                // a4 = count mod 4 = odd words to peel off
        BEQ     andc2_loop_down_l1      // none, go straight to the 4-word loop
        CMP     a4,#2                   // keep 1 / 2 / 3 in the flags as LT / EQ / GT
        LDR     ip,[a1,#-4]!            // first odd word: step both pointers down
        LDR     a4,[a2,#-4]!
        BIC     ip,ip,a4                // operand order matters: target & ~source
        STR     ip,[a1]
        BLT     andc2_loop_down_l1      // there was only one odd word
        LDR     ip,[a1,#-4]!            // second odd word (GE always holds past the BLT)
        LDR     a4,[a2,#-4]!
        BIC     ip,ip,a4
        STR     ip,[a1]
        LDRGT   ip,[a1,#-4]!            // third odd word only when count mod 4 = 3
        LDRGT   a4,[a2,#-4]!
        BICGT   ip,ip,a4
        STRGT   ip,[a1]
LABEL(andc2_loop_down_l1)
        BICS    a4,a3,#3                // a4 = words left, a multiple of 4
        MOVEQS  pc,lr                   // nothing left, so return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr are needed as work regs
LABEL(andc2_loop_down_l2)
        LDMDB   a1,{v3,v4,v5,lr}        // 4 target words, a1 is moved by the store
        LDMDB   a2!,{a3,v1,v2,ip}       // 4 source words, a2 steps down 4 words
        BIC     lr,lr,ip                // ANDC2 the four pairs (independent of
        BIC     v5,v5,v2                // each other)
        BIC     v4,v4,v1
        BIC     v3,v3,a3
        SUBS    a4,a4,#4                // count down by 4, STMDB keeps the flags
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results, a1 steps down 4 words
        BGT     andc2_loop_down_l2      // loop while words remain
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void orc2_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       In-place OR of one word array with the complement of another,
//       walking downwards: both pointers are pre-decremented.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ORC2ed
//       exit
//               xptr[] = xptr[] | ~yptr[] for count words
//               a1 - a4, ip destroyed
        EXPORT(orc2_loop_down)          // word aligned orc2 loop down
        DECLARE_FUNCTION(orc2_loop_down)
GLABEL(orc2_loop_down)
        ANDS    a4,a3,#3                // a4 = count mod 4 = odd words to peel off
        BEQ     orc2_loop_down_l1       // none, go straight to the 4-word loop
        CMP     a4,#2                   // keep 1 / 2 / 3 in the flags as LT / EQ / GT
        LDR     a4,[a2,#-4]!            // first odd word: fetch and complement
        MVN     a4,a4                   // the source word,
        LDR     ip,[a1,#-4]!            // then OR it into the target word
        ORR     ip,a4,ip
        STR     ip,[a1]
        BLT     orc2_loop_down_l1       // there was only one odd word
        LDR     a4,[a2,#-4]!            // second odd word (GE always holds past the BLT)
        MVN     a4,a4
        LDR     ip,[a1,#-4]!
        ORR     ip,a4,ip
        STR     ip,[a1]
        BLE     orc2_loop_down_l1       // there were exactly two odd words
        LDR     a4,[a2,#-4]!            // third odd word (GT always holds past the BLE)
        MVN     a4,a4
        LDR     ip,[a1,#-4]!
        ORR     ip,a4,ip
        STR     ip,[a1]
LABEL(orc2_loop_down_l1)
        BICS    a4,a3,#3                // a4 = words left, a multiple of 4
        MOVEQS  pc,lr                   // nothing left, so return
        STMFD   sp!,{v1-v5,lr}          // v1-v5 and lr are needed as work regs
LABEL(orc2_loop_down_l2)
        LDMDB   a1,{v3,v4,v5,lr}        // 4 target words, a1 is moved by the store
        LDMDB   a2!,{a3,v1,v2,ip}       // 4 source words, a2 steps down 4 words
        MVN     ip,ip                   // ORC2 the four words: first complement
        MVN     v2,v2                   // every source word
        MVN     v1,v1
        MVN     a3,a3
        ORR     lr,lr,ip                // then OR each into its target word
        ORR     v5,v5,v2
        ORR     v4,v4,v1
        ORR     v3,v3,a3
        SUBS    a4,a4,#4                // count down by 4, STMDB keeps the flags
        STMDB   a1!,{v3,v4,v5,lr}       // store 4 results, a1 steps down 4 words
        BGT     orc2_loop_down_l2       // loop while words remain
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern void not_loop_down (uintD* xptr, uintC count);
//       In-place bitwise complement of a word array, walking downwards:
//       the pointer is pre-decremented.
//       entry
//               a1 = xptr
//               a2 = count of words to be NOTed
//       exit
//               xptr[] = ~xptr[] for count words
//               a1 - a4, ip destroyed
        EXPORT(not_loop_down)           // word aligned not loop down
        DECLARE_FUNCTION(not_loop_down)
GLABEL(not_loop_down)
        ANDS    a3,a2,#3                // a3 = count mod 4 = odd words to peel off
        BEQ     not_loop_down_l1        // none, go straight to the 4-word loop
        CMP     a3,#2                   // keep 1 / 2 / 3 in the flags as LT / EQ / GT
        LDR     a3,[a1,#-4]!            // first odd word: step down, complement,
        MVN     a3,a3                   // store back in place
        STR     a3,[a1]
        BLT     not_loop_down_l1        // there was only one odd word
        LDR     a3,[a1,#-4]!            // second odd word (GE always holds past the BLT)
        MVN     a3,a3
        STR     a3,[a1]
        LDRGT   a3,[a1,#-4]!            // third odd word only when count mod 4 = 3
        MVNGT   a3,a3
        STRGT   a3,[a1]
LABEL(not_loop_down_l1)
        BICS    a4,a2,#3                // a4 = words left, a multiple of 4
        MOVEQS  pc,lr                   // nothing left, so return
        STMFD   sp!,{lr}                // lr is needed as a fourth work reg
LABEL(not_loop_down_l2)
        LDMDB   a1,{a2,a3,ip,lr}        // 4 words, a1 is moved by the store below
        MVN     lr,lr                   // complement all four (independent)
        MVN     ip,ip
        MVN     a3,a3
        MVN     a2,a2
        SUBS    a4,a4,#4                // count down by 4, STMDB keeps the flags
        STMDB   a1!,{a2,a3,ip,lr}       // store 4 results, a1 steps down 4 words
        BGT     not_loop_down_l2        // loop while words remain
        LDMFD   sp!,{pc}^               // pop the saved lr into pc to return

// extern void and_test_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       Tests whether xptr[] and yptr[] have any set bit in common. Both
//       pointers are pre-decremented, so the arrays are walked downwards,
//       and the routine returns as soon as a common bit is found.
//       NOTE(review): a result is returned in a1, so the C prototype
//       presumably has a non-void return type - confirm against arilev1.d.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be AND_TESTed
//       exit
//               a1 = TRUE (1) if any pair of words ANDed together is non-zero
//                    else FALSE (0)
//               a2 - a4, ip destroyed
        EXPORT(and_test_loop_down)      // word aligned and_test loop down
        DECLARE_FUNCTION(and_test_loop_down)
GLABEL(and_test_loop_down)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     and_test_loop_down_l1   // yup, so branch
        CMP     a4,#2                   // C := (count mod 4 >= 2), before a4 is reused
        LDR     a4,[a2,#-4]!            // AND_TEST the first 1-3 words
        LDR     ip,[a1,#-4]!            // to align the total to a multiple
        TST     ip,a4                   // of 4 words
        MOVNE   a1,#1                   // return TRUE if AND_TEST ok
        MOVNES  pc,lr
                                        // C is still the one from the CMP: TST with
                                        // a plain register operand does not alter it
        BCC     and_test_loop_down_l1   // only one odd word, so on to the main loop
        LDRGE   a4,[a2,#-4]!            // second odd word
        LDRGE   ip,[a1,#-4]!
        TSTGE   ip,a4
        MOVNE   a1,#1                   // common bit found: return TRUE
        MOVNES  pc,lr
        ANDS    a4,a3,#3                // the TSTs have changed N and Z, so redo
        CMP     a4,#2                   // the (count mod 4) comparison
        BLE     and_test_loop_down_l1   // better to branch than skip instrs.
        LDRGT   a4,[a2,#-4]!            // third odd word (count mod 4 = 3)
        LDRGT   ip,[a1,#-4]!
        TSTGT   ip,a4
        MOVNE   a1,#1                   // common bit found: return TRUE
        MOVNES  pc,lr
LABEL(and_test_loop_down_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // return FALSE
        MOVEQS  pc,lr                   // if zero then we are done
        STMFD   sp!,{v1-v6,lr}          // save work regs
        MOV     v6,a1                   // move xptr to v6
        MOV     a1,#1                   // set result to TRUE
LABEL(and_test_loop_down_l2)
        LDMDB   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMDB   v6!,{v3,v4,v5,lr}       // load target words
        TST     v3,a3                   // AND_TEST the four words
        TSTEQ   v4,v1                   // each TSTEQ runs only while every earlier
        TSTEQ   v5,v2                   // pair had no bit in common
        TSTEQ   lr,ip
        LDMNEFD sp!,{v1-v6,pc}^         // common bit found: return TRUE (a1 is 1)
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     and_test_loop_down_l2   // if count still positive then loop
        MOV     a1,#0                   // no common bit anywhere: result FALSE
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern void compare_loop_down (uintD* xptr, uintD* yptr, uintC count);
//       Compares two word arrays word by word as unsigned values. Both
//       pointers are pre-decremented, so the arrays are walked downwards,
//       and the first pair of words that differ decides the result.
//       NOTE(review): a result is returned in a1, so the C prototype
//       presumably has a non-void return type - confirm against arilev1.d.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be COMPAREd
//       exit
//               a1 = +1 if first non-equal word in xptr[] and yptr[]
//                       xptr[i] > yptr[i]
//                    -1 if xptr[i] < yptr[i]
//                     0 otherwise
//               a2 - a4, ip destroyed
        EXPORT(compare_loop_down)       // word aligned compare loop down
        DECLARE_FUNCTION(compare_loop_down)
GLABEL(compare_loop_down)
        ANDS    a4,a3,#3                // multiple of 4 words ?
        BEQ     compare_loop_down_l1    // yup, so branch
        LDR     a4,[a2,#-4]!            // COMPARE the first 1-3 words
        LDR     ip,[a1,#-4]!            // to align the total to a multiple
        CMP     ip,a4                   // of 4 words
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // and return result if not equal
        ANDS    a4,a3,#3                // a4 and the flags were used by the compare,
        CMP     a4,#2                   // so recompute (count mod 4) and test it
        BLT     compare_loop_down_l1    // need to branch 'cos PSR used
        LDR     a4,[a2,#-4]!            // second odd word
        LDR     ip,[a1,#-4]!
        CMP     ip,a4
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // and return result if not equal
        ANDS    a4,a3,#3                // recompute (count mod 4) once more
        CMP     a4,#2
        BLE     compare_loop_down_l1    // need to branch 'cos PSR used
        LDR     a4,[a2,#-4]!            // third odd word (count mod 4 = 3)
        LDR     ip,[a1,#-4]!
        CMP     ip,a4
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // and return result if not equal
LABEL(compare_loop_down_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // xptr[] == yptr[] -> 0
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v6,lr}          // save work regs
        MOV     v6,a1                   // move xptr to v6
        MOV     a1,#1                   // set result to +1
LABEL(compare_loop_down_l2)
        LDMDB   a2!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMDB   v6!,{v3,v4,v5,lr}       // load test words
        CMP     lr,ip                   // COMPARE the four words, starting with the
        CMPEQ   v5,v2                   // pair from the highest address (the next
        CMPEQ   v4,v1                   // one in downward order); each CMPEQ runs
        CMPEQ   v3,a3                   // only while all earlier pairs were equal
        MVNLO   a1,#0                   // x < y -> -1 (a1 already holds +1)
        LDMNEFD sp!,{v1-v6,pc}^         // a pair differed: return +1 or -1
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     compare_loop_down_l2    // if count still positive then loop
        MOV     a1,#0                   // all words equal: result 0
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD addto_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
//       Multi-word add into destptr[], walking upwards. Implemented by
//       rearranging the arguments and falling through into add_loop_up
//       with sourceptr2 = destptr.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be added
//       exit
//               destptr[] = sourceptr[] + destptr[]
//               a1 = last carry
//               a2 - a4, ip destroyed
        EXPORT(addto_loop_up)           // word aligned addto loop up
        DECLARE_FUNCTION(addto_loop_up)
GLABEL(addto_loop_up)
                MOV     a4,a3           // set regs for a call
                MOV     a3,a2           // to add_loop_up
                                        // and drop into add_loop_up

// extern uintD add_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
//       Multi-word add with carry propagation, walking upwards (all three
//       pointers are post-incremented). The running carry is kept in the
//       C flag for the whole routine, so the code relies on every
//       instruction between two ADCS leaving C untouched.
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be added
//       exit
//               destptr[] = sourceptr1[] + sourceptr2[]
//               a1 = last carry (0 or 1; 0 when count is 0)
//               a2 - a4, ip destroyed
        EXPORT(add_loop_up)             // word aligned add loop up
        DECLARE_FUNCTION(add_loop_up)
GLABEL(add_loop_up)
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     add_loop_up_l1          // yup, so branch
        STMFD   sp!,{v6,lr}             // v6 and lr serve as work regs for the odd words
        LDR     v6,[a2],#4              // add the first 1-3 words
        LDR     lr,[a1],#4              // to align the total to a multiple
        ADDS    lr,lr,v6                // of 4 words
        STR     lr,[a3],#4
        TEQ     ip,#1                   // just one odd word ? (carry must survive this)
        BEQ     add_loop_up_l0          // need to branch 'cos PSR used
        LDR     v6,[a2],#4              // second odd word, adding in the carry
        LDR     lr,[a1],#4
        ADCS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#2                   // just two odd words ?
        BEQ     add_loop_up_l0          // need to branch 'cos PSR used
        LDR     v6,[a2],#4              // third odd word, adding in the carry
        LDR     lr,[a1],#4
        ADCS    lr,lr,v6
        STR     lr,[a3],#4
LABEL(add_loop_up_l0)                   // at least one add has happened
        BICS    a4,a4,#3                // set counter to multiple of 4
        BNE     add_loop_up_l3          // branch if more adds to do
        ADCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
LABEL(add_loop_up_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no adds, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMN     a4,#0                   // clear carry bit (a4 + 0 cannot carry)
        STMFD   sp!,{v6,lr}             // same partial frame as on the odd-word path
LABEL(add_loop_up_l3)
        STMFD   sp!,{v1-v5}             // save work regs (frame is now v1-v6,lr)
LABEL(add_loop_up_l2)
        LDMIA   a2!,{v1,v2,v3,ip}       // load 4 words from sourceptr2 in one go
        LDMIA   a1!,{v4,v5,v6,lr}       // and 4 words from sourceptr1
        ADCS    v4,v4,v1                // add the four words with carry
        ADCS    v5,v5,v2
        ADCS    v6,v6,v3
        ADCS    lr,lr,ip
        STMIA   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     add_loop_up_l2          // if count non-zero then loop
        ADC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD inc_loop_up (uintD* ptr, uintC count);
//       Adds 1 to the word at ptr and, for as long as the incremented word
//       wrapped round to zero, carries on into the next word upwards.
//       entry
//               a1 = ptr
//               a2 = count of words to be INCed
//       exit
//               a1 = 0 if any words are non-zero after increment else 1
//                      stop incrementing when first word becomes non-zero
//                      (so 1 is also returned when count is 0)
//               a2 - a4, ip destroyed
        EXPORT(inc_loop_up)             // word aligned inc loop up
        DECLARE_FUNCTION(inc_loop_up)
GLABEL(inc_loop_up)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     inc_loop_up_l1          // yup, so branch
        LDR     a4,[a1]                 // INC the first word
        ADDS    a4,a4,#1                // align the total to a multiple of 2
        STR     a4,[a1],#4
        MOVNE   a1,#0                   // set result to 0
        MOVNES  pc,lr                   // return 0 if non-zero result
LABEL(inc_loop_up_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,#1                   // return 1
        MOVEQS  pc,lr                   // if zero then we're done
        MOV     ip,a1                   // move ptr to ip
        MOV     a1,#0                   // set result to 0
        ANDS    a3,a4,#3                // a4 is even: is it also a multiple of 4 ?
        BEQ     inc_loop_up_l3          // yes, straight to the 4-word loop
        LDMIA   ip,{a2,a3}              // load 2 words in one go
        ADDS    a2,a2,#1                // INC the two words
        ADDEQS  a3,a3,#1                // stopping when first word non-zero
        STMIA   ip!,{a2,a3}             // store 2 results (a3 unchanged if not INCed)
        MOVNES  pc,lr                   // return 0 if any result non-zero
        SUBS    a4,a4,#2                // decrement counter by 2
        MOVEQ   a1,#1                   // if finished loop then
        MOVEQS  pc,lr                   // return 1
LABEL(inc_loop_up_l3)                   // now a multiple of 4 words
        STMFD   sp!,{v1,lr}             // save work regs
LABEL(inc_loop_up_l2)
        LDMIA   ip,{a2,a3,v1,lr}        // load 4 words in one go
        ADDS    a2,a2,#1                // INC the four words
        ADDEQS  a3,a3,#1                // stopping when first word non-zero
        ADDEQS  v1,v1,#1
        ADDEQS  lr,lr,#1
        STMIA   ip!,{a2,a3,v1,lr}       // store 4 results (skipped words go back unchanged)
        LDMNEFD sp!,{v1,pc}^            // return 0 if any result non-zero
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     inc_loop_up_l2          // if count still positive then loop
        MOV     a1,#1                   // every word wrapped to zero: result 1
        LDMFD   sp!,{v1,pc}^            // restore work regs and return 1

// extern uintD sub_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
//       Multi-word subtract with borrow propagation, walking upwards (all
//       three pointers are post-incremented). The running borrow is kept
//       in the C flag (C set = no borrow), so the code relies on every
//       instruction between two SBCS leaving C untouched.
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[]
//               a1 = last carry (0 if no borrow is left over, -1 if one is;
//                    0 when count is 0)
//               a2 - a4, ip destroyed
        EXPORT(sub_loop_up)             // word aligned sub loop up
        DECLARE_FUNCTION(sub_loop_up)
GLABEL(sub_loop_up)
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     sub_loop_up_l1          // yup, so branch
        STMFD   sp!,{v6,lr}             // v6 and lr serve as work regs for the odd words
        LDR     v6,[a2],#4              // subtract the first 1-3 words
        LDR     lr,[a1],#4              // to align the total to a multiple
        SUBS    lr,lr,v6                // of 4 words
        STR     lr,[a3],#4
        TEQ     ip,#1                   // just one odd word ? (carry must survive this)
        BNE     sub_loop_up_l0          // branch if more than one subtract
LABEL(sub_loop_up_l4)                   // drop through for better instr. timings
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
        STMFD   sp!,{v1-v5}             // save work regs (frame is now v1-v6,lr)
        B       sub_loop_up_l2          // branch if more subtracts to do
LABEL(sub_loop_up_l0)
        LDR     v6,[a2],#4              // second odd word, with the borrow so far
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#2                   // just two odd words ?
        BEQ     sub_loop_up_l4          // need to branch 'cos PSR used
        LDR     v6,[a2],#4              // third odd word, with the borrow so far
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        B       sub_loop_up_l4
LABEL(sub_loop_up_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no subtracts, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMP     a4,#0                   // set carry bit, since a4 > 0
        STMFD   sp!,{v1-v6,lr}          // save work regs
LABEL(sub_loop_up_l2)
        LDMIA   a2!,{v1,v2,v3,ip}       // load 4 words from sourceptr2 in one go
        LDMIA   a1!,{v4,v5,v6,lr}       // and 4 words from sourceptr1
        SBCS    v4,v4,v1                // subtract the four words with carry
        SBCS    v5,v5,v2
        SBCS    v6,v6,v3
        SBCS    lr,lr,ip
        STMIA   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     sub_loop_up_l2          // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subx_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
//       Same as sub_loop_up, but starting with an incoming borrow: the
//       fifth argument is read from the stack and turned into the initial
//       state of the C flag (zero -> no borrow, non-zero -> borrow).
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//               [sp] = carry
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[] (less the incoming borrow)
//               a1 = last carry (0 if no borrow is left over, -1 if one is)
//               a2 - a4, ip destroyed
        EXPORT(subx_loop_up)            // word aligned subx loop up
        DECLARE_FUNCTION(subx_loop_up)
GLABEL(subx_loop_up)
        LDR     ip,[sp]                 // get starting value of carry
LABEL(subx_loop_up_lsub)                // entered here, ip holds the carry value
        RSBS    ip,ip,#0                // set carry in PSR (C set only if ip was 0)
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     subx_loop_up_l1         // yup, so branch
        STMFD   sp!,{v6,lr}             // v6 and lr serve as work regs for the odd words
        LDR     v6,[a2],#4              // subtract the first 1-3 words
        LDR     lr,[a1],#4              // to align the total to a multiple
        SBCS    lr,lr,v6                // of 4 words
        STR     lr,[a3],#4
        TEQ     ip,#1                   // just one odd word ? (carry must survive this)
        BNE     subx_loop_up_l0         // branch if more than one subtract
LABEL(subx_loop_up_l4)                  // drop through for better instr. timings
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
        STMFD   sp!,{v1-v5}             // save work regs (frame is now v1-v6,lr)
        B       subx_loop_up_l2         // branch if more subtracts to do
LABEL(subx_loop_up_l0)
        LDR     v6,[a2],#4              // second odd word, with the borrow so far
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#2                   // just two odd words ?
        BEQ     subx_loop_up_l4         // need to branch 'cos PSR used
        LDR     v6,[a2],#4              // third odd word, with the borrow so far
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        B       subx_loop_up_l4
LABEL(subx_loop_up_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v6,lr}          // save work regs
LABEL(subx_loop_up_l2)
        LDMIA   a2!,{v1,v2,v3,ip}       // load 4 words from sourceptr2 in one go
        LDMIA   a1!,{v4,v5,v6,lr}       // and 4 words from sourceptr1
        SBCS    v4,v4,v1                // subtract the four words with carry
        SBCS    v5,v5,v2
        SBCS    v6,v6,v3
        SBCS    lr,lr,ip
        STMIA   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     subx_loop_up_l2         // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subfrom_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
//       Multi-word subtract of sourceptr[] from destptr[] in place, with
//       borrow propagation, walking upwards. The running borrow is kept in
//       the C flag (C set = no borrow), so the code relies on every
//       instruction between two SBCS leaving C untouched.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be subtracted
//       exit
//               destptr[] = destptr[] - sourceptr[]
//               a1 = last carry (0 if no borrow is left over, -1 if one is;
//                    0 when count is 0)
//               a2 - a4, ip destroyed
        EXPORT(subfrom_loop_up)         // word aligned subfrom loop up
        DECLARE_FUNCTION(subfrom_loop_up)
GLABEL(subfrom_loop_up)
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     subfrom_loop_up_l1      // yup, so branch
        STMFD   sp!,{lr}                // lr serves as a work reg for the odd words
        LDR     a4,[a1],#4              // subtract the first 1-3 words
        LDR     lr,[a2]                 // to align the total to a multiple
        SUBS    lr,lr,a4                // of 4 words
        STR     lr,[a2],#4              // result replaces the dest word, then step on
        TEQ     ip,#1                   // just one odd word ? (carry must survive this)
        BNE     subfrom_loop_up_l0      // branch if more than one subtract
LABEL(subfrom_loop_up_l4)                    // drop through for better instr. timings
        BICS    a4,a3,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{pc}^               // and return
        STMFD   sp!,{v1-v5}             // save work regs (frame is now v1-v5,lr)
        B       subfrom_loop_up_l2      // branch if more subtracts to do
LABEL(subfrom_loop_up_l0)
        LDR     a4,[a1],#4              // second odd word, with the borrow so far
        LDR     lr,[a2]
        SBCS    lr,lr,a4
        STR     lr,[a2],#4
        TEQ     ip,#2                   // just two odd words ?
        BEQ     subfrom_loop_up_l4      // need to branch 'cos PSR used
        LDR     a4,[a1],#4              // third odd word, with the borrow so far
        LDR     lr,[a2]
        SBCS    lr,lr,a4
        STR     lr,[a2],#4
        B       subfrom_loop_up_l4
LABEL(subfrom_loop_up_l1)
        BICS    a4,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no subtracts, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMP     a4,#0                   // set carry bit, since a4 > 0
        STMFD   sp!,{v1-v5,lr}          // save work regs
LABEL(subfrom_loop_up_l2)
        LDMIA   a1!,{a3,v1,v2,ip}       // load 4 words in one go
        LDMIA   a2,{v3,v4,v5,lr}        // and from destptr (a2 is moved by the store)
        SBCS    v3,v3,a3                // subtract the four words with carry
        SBCS    v4,v4,v1
        SBCS    v5,v5,v2
        SBCS    lr,lr,ip
        STMIA   a2!,{v3,v4,v5,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     subfrom_loop_up_l2      // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern uintD dec_loop_up (uintD* ptr, uintC count);
// Subtracts 1 from the count-word number stored upwards from ptr. The
// borrow ripples towards higher addresses only while the words read are
// zero; the first word that was non-zero absorbs it and ends the work.
//       entry
//               a1 = ptr
//               a2 = count of words to be DECed
//       exit
//               a1 = 0 if any words are non-zero before decrement else -1
//                      stop decrementing when first word is non-zero
//                      (count = 0 returns -1)
//               a2 - a4, ip destroyed
// Strategy: peel one word if count is odd, then one pair if the remainder
// is not a multiple of 4, then process four words per loop iteration.
// After SUBS/SUBCCS, C set means "no borrow", i.e. that word was non-zero.
        EXPORT(dec_loop_up)             // word aligned dec loop up
        DECLARE_FUNCTION(dec_loop_up)
GLABEL(dec_loop_up)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     dec_loop_up_l1          // yup, so branch
        LDR     a4,[a1]                 // DEC the first word
        SUBS    a4,a4,#1                // align the total to a multiple of 2
        STR     a4,[a1],#4              // write back, advance ptr
        MOVCS   a1,#0                   // set result to 0
        MOVCSS  pc,lr                   // return 0 if there was no borrow
LABEL(dec_loop_up_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MVNEQ   a1,#0                   // return -1
        MOVEQS  pc,lr                   // if zero then we're done
        MOV     ip,a1                   // move ptr to ip
        MOV     a1,#0                   // set result to 0
        ANDS    a3,a4,#3                // remaining count a multiple of 4 ?
        BEQ     dec_loop_up_l3          // yes, go straight to the main loop
        LDMIA   ip,{a2,a3}              // load 2 words in one go
        SUBS    a2,a2,#1                // DEC the two words
        SUBCCS  a3,a3,#1                // stopping when first word non-zero
        STMIA   ip!,{a2,a3}             // store 2 results
        MOVCSS  pc,lr                   // return 0 if any result non-zero
        SUBS    a4,a4,#2                // decrement counter by 2
        MVNEQ   a1,#0                   // if finished loop then
        MOVEQS  pc,lr                   // return -1
LABEL(dec_loop_up_l3)                   // now a multiple of 4 words
        STMFD   sp!,{v1,lr}             // save work regs
LABEL(dec_loop_up_l2)
        LDMIA   ip,{a2,a3,v1,lr}        // load 4 words in one go
        SUBS    a2,a2,#1                // DEC the four words
        SUBCCS  a3,a3,#1                // stopping when first word non-zero
        SUBCCS  v1,v1,#1
        SUBCCS  lr,lr,#1
        STMIA   ip!,{a2,a3,v1,lr}       // store 4 results (untouched words
                                        // are simply rewritten unchanged)
        LDMCSFD sp!,{v1,pc}^            // borrow absorbed: return 0 (a1 is 0)
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     dec_loop_up_l2          // if count still positive then loop
        MVN     a1,#0                   // every word was zero: result -1
        LDMFD   sp!,{v1,pc}^            // restore work regs and return -1

// extern void neg_loop_up (uintD* ptr, uintC count);
// NOTE(review): the prototype above says void, yet every exit path below
// loads a1 with 0 or -1 - confirm against the C declaration.
// Two's complement negation in place, working upwards from ptr: leading
// zero words are left alone, the first non-zero word is replaced by
// 0 - word, and every word after it is bitwise complemented.
//       entry
//               a1 = ptr
//               a2 = count of words. The long integer is to be NEGated
//       exit
//               ptr[] = -ptr[] for count words
//               a1 = last carry: 0 if all words were zero (or count = 0),
//                    -1 otherwise
//               a2 - a4, ip destroyed
        EXPORT(neg_loop_up)             // word aligned neg loop up
        DECLARE_FUNCTION(neg_loop_up)
GLABEL(neg_loop_up)
        CMPS    a2,#0                   // count = 0 ?
        MOVEQ   a1,#0                   // yup, so return 0
        MOVEQS  pc,lr
LABEL(neg_loop_up_l1)                   // skip all the zero words first
        LDR     a3,[a1],#4              // compare words against zero
        CMPS    a3,#0                   // upwards in memory
        BNE     neg_loop_up_l2          // non-zero, so negate rest of words
        SUBS    a2,a2,#1                // reduce count of words
        BNE     neg_loop_up_l1          // more ?, so loop
        MOV     a1,#0                   // all words were zero: return 0
        MOVS    pc,lr
LABEL(neg_loop_up_l2)
        RSB     a3,a3,#0                // first non-zero word = -word
        STR     a3,[a1,#-4]             // a1 was already advanced past it
        SUBS    a2,a2,#1                // one word fewer left to process
        MVNEQ   a1,#0                   // done ? -> return -1
        MOVEQS  pc,lr
                                        // now NOT rest of the words
        ANDS    a3,a2,#3                // multiple of 4 words ?
        BEQ     neg_loop_up_l3          // yup, so branch
        CMP     a3,#2                   // NOT the first 1-3 words
                                        // (flags: LT = 1, EQ = 2, GT = 3 words;
                                        // they survive the LDR/MVN/STR below)
        LDR     a3,[a1]                 // to align the total to a multiple
        MVN     a3,a3                   // of 4 words
        STR     a3,[a1],#4
        BLT     neg_loop_up_l3          // better to branch than skip instrs.
        LDRGE   a3,[a1]                 // second odd word (2 or 3 left over)
        MVNGE   a3,a3
        STRGE   a3,[a1],#4
        LDRGT   a3,[a1]                 // third odd word (3 left over)
        MVNGT   a3,a3
        STRGT   a3,[a1],#4
LABEL(neg_loop_up_l3)
        BICS    a4,a2,#3                // set counter to multiple of 4
        MVNEQ   a1,#0                   // set result to -1
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{lr}                // save work regs
LABEL(neg_loop_up_l4)
        LDMIA   a1,{a2,a3,ip,lr}        // load 4 words in one go,NO writeback
        MVN     a2,a2                   // NOT the four words
        MVN     a3,a3
        MVN     ip,ip
        MVN     lr,lr
        STMIA   a1!,{a2,a3,ip,lr}       // store 4 results
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     neg_loop_up_l4          // if count still positive then loop
        MVN     a1,#0                   // set result to -1
        LDMFD   sp!,{pc}^               // restore work regs and return -1

// extern uintD shift1left_loop_up (uintD* ptr, uintC count);
// Shifts the count-word number stored upwards from ptr left by one bit in
// place. The bit leaving each word travels in the processor carry flag into
// the next higher word (ADCS x,x,x = 2*x + C).
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted left
//       exit
//               a1 = carry out from last shift left (0 or 1;
//                    0 when count = 0)
//               a2 - a4, ip destroyed
// The counter arithmetic between the shifts only uses instructions that
// leave C alone (BICS/ANDS/TEQ with plain immediates, SUB without S).
        EXPORT(shift1left_loop_up)      // word aligned shift1left loop up
        DECLARE_FUNCTION(shift1left_loop_up)
GLABEL(shift1left_loop_up)
        CMN     a1,#0                   // clear carry bit (a1 + 0 never carries)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     shift1left_loop_up_l1   // yup, so branch
        LDR     a4,[a1]                 // shift left the first word
        ADDS    a4,a4,a4                // C := its old top bit
        STR     a4,[a1],#4
LABEL(shift1left_loop_up_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        ADCEQ   a1,a4,a4                // if zero set result to C (a4 is 0)
        MOVEQS  pc,lr                   // and return
        ANDS    a3,a4,#3                // multiple of 4 words ?
        BEQ     shift1left_loop_up_l3   // yup, so branch
        LDMIA   a1,{a2,a3}              // load 2 words in one go
        ADCS    a2,a2,a2                // shift left the two words
        ADCS    a3,a3,a3
        STMIA   a1!,{a2,a3}             // store 2 results
        BICS    a4,a4,#2                // decrement counter by 2
        ADCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        MOVEQS  pc,lr                   // and return
LABEL(shift1left_loop_up_l3)            // now a multiple of 4 words
        STMFD   sp!,{lr}                // save work regs
LABEL(shift1left_loop_up_l2)
        LDMIA   a1,{a2,a3,ip,lr}        // load 4 words in one go
        ADCS    a2,a2,a2                // shift left the four words
        ADCS    a3,a3,a3
        ADCS    ip,ip,ip
        ADCS    lr,lr,lr
        STMIA   a1!,{a2,a3,ip,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4
        TEQ     a4,#0                   // are we done ?
        BNE     shift1left_loop_up_l2   // if count non-zero then loop
        ADC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{pc}^               // restore work regs and return carry

// extern uintD shiftleft_loop_up (uintD* ptr, uintC count, uintC i, uintD carry);
// Shifts the count-word number stored upwards from ptr left by i bits in
// place. Each stored word is (word << i) ORed with the bits that fell out
// of the top of the previous word; the supplied carry plays that role for
// the first word.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted left
//               a3 = size of left shift
//               a4 = value to ORR in for first shift
//       exit
//               a1 = shift out from last shift left
//                    (the incoming a4 unchanged when count = 0)
//               a2 - a4, ip destroyed
// Register roles: v6 = 32 - i, a4 = running shift-out, ip = word counter.
        EXPORT(shiftleft_loop_up)       // word aligned shiftleft loop up
        DECLARE_FUNCTION(shiftleft_loop_up)
GLABEL(shiftleft_loop_up)
        STMFD   sp!,{v6,lr}
        RSB     v6,a3,#32               // size of complementary right shift
        ANDS    ip,a2,#3                // multiple of 4 words ?
        BEQ     shiftleft_loop_up_l1    // yup, so branch
        LDR     lr,[a1]                 // shiftleft the first 1-3 words
        ORR     a4,a4,lr,ASL a3         // to align the total to a multiple
        STR     a4,[a1],#4              // of 4 words
        MOV     a4,lr,LSR v6            // bits shifted out of this word
        CMP     ip,#2                   // LT = 1, EQ = 2, GT = 3 odd words
        BLT     shiftleft_loop_up_l1    // better to branch than skip instrs.
        LDRGE   lr,[a1]                 // second odd word
        ORRGE   a4,a4,lr,ASL a3
        STRGE   a4,[a1],#4
        MOVGE   a4,lr,LSR v6
        LDRGT   lr,[a1]                 // third odd word
        ORRGT   a4,a4,lr,ASL a3
        STRGT   a4,[a1],#4
        MOVGT   a4,lr,LSR v6
LABEL(shiftleft_loop_up_l1)
        BICS    ip,a2,#3                // set counter to multiple of 4
        MOVEQ   a1,a4                   // if zero then we're done
        LDMEQFD sp!,{v6,pc}^            // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftleft_loop_up_l2)
        LDMIA   a1,{v1,v2,v3,lr}        // load 4 words in one go
        ORR     a2,a4,v1,ASL a3         // shiftleft the four words
        MOV     a4,v1,LSR v6            // keep carry in a4
        ORR     v1,a4,v2,ASL a3         // and store results down a register
        MOV     a4,v2,LSR v6            // to regs a2,v1-v3
        ORR     v2,a4,v3,ASL a3
        MOV     a4,v3,LSR v6
        ORR     v3,a4,lr,ASL a3
        MOV     a4,lr,LSR v6
        STMIA   a1!,{a2,v1,v2,v3}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftleft_loop_up_l2    // if count still positive then loop
        MOV     a1,a4                   // result = last shift out
        LDMFD   sp!,{v1-v3,v6,pc}^      // restore work regs and return

#endif

// extern uintD shiftleftcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i);
// Reads count words upwards from sourceptr, shifts the number left by i
// bits and writes the result upwards from destptr. The first word gets
// zero bits shifted in (the running carry starts at 0).
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be shifted left
//               a4 = size of left shift
//       exit
//               a1 = shift out from last shift left (0 when count = 0)
//               a2 - a4, ip destroyed
// Register roles: v6 = 32 - i, v5 = running shift-out, ip = word counter.
        EXPORT(shiftleftcopy_loop_up)   // word aligned shiftleftcopy loop up
        DECLARE_FUNCTION(shiftleftcopy_loop_up)
GLABEL(shiftleftcopy_loop_up)
        STMFD   sp!,{v5,v6,lr}
        MOV     v5,#0                   // initial shift carry
        RSB     v6,a4,#32               // size of complementary right shift
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     shiftleftcopy_loop_up_l1 // yup, so branch
        LDR     lr,[a1],#4              // shiftleft the first 1-3 words
        ORR     v5,v5,lr,ASL a4         // to align the total to a multiple
        STR     v5,[a2],#4              // of 4 words
        MOV     v5,lr,LSR v6            // bits shifted out of this word
        CMP     ip,#2                   // LT = 1, EQ = 2, GT = 3 odd words
        BLT     shiftleftcopy_loop_up_l1 // better to branch than skip instrs.
        LDRGE   lr,[a1],#4              // second odd word
        ORRGE   v5,v5,lr,ASL a4
        STRGE   v5,[a2],#4
        MOVGE   v5,lr,LSR v6
        LDRGT   lr,[a1],#4              // third odd word
        ORRGT   v5,v5,lr,ASL a4
        STRGT   v5,[a2],#4
        MOVGT   v5,lr,LSR v6
LABEL(shiftleftcopy_loop_up_l1)
        BICS    ip,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,v5                   // if zero then we're done
        LDMEQFD sp!,{v5,v6,pc}^         // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftleftcopy_loop_up_l2)
        LDMIA   a1!,{v1,v2,v3,lr}       // load 4 words in one go
        ORR     a3,v5,v1,ASL a4         // shiftleft the four words
        MOV     v5,v1,LSR v6            // keep carry in v5
        ORR     v1,v5,v2,ASL a4         // and store results down a register
        MOV     v5,v2,LSR v6            // to regs a3,v1-v3
        ORR     v2,v5,v3,ASL a4
        MOV     v5,v3,LSR v6
        ORR     v3,v5,lr,ASL a4
        MOV     v5,lr,LSR v6
        STMIA   a2!,{a3,v1,v2,v3}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftleftcopy_loop_up_l2 // if count still positive then loop
        MOV     a1,v5                   // result = last shift out
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

#if !CL_DS_BIG_ENDIAN_P

// extern uintD shift1right_loop_down (uintD* ptr, uintC count, uintD carry);
// Shifts the count-word number stored downwards from ptr (first word at
// ptr[-1]) right by one bit in place. The bit leaving each word travels in
// the processor carry flag into the top of the next lower word (RRX).
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = carry (only bit 0 is used; it enters the top of the
//                    first word)
//       exit
//               a1 = carry out from last shift right, delivered in bit 31
//                    (0 or 0x80000000) on every exit path
//               a2 - a4, ip destroyed
// The counter arithmetic between the shifts only uses instructions that
// leave C alone (BICS/ANDS/TEQ with plain immediates, SUB without S).
        EXPORT(shift1right_loop_down)   // word aligned shift1right loop down
        DECLARE_FUNCTION(shift1right_loop_down)
GLABEL(shift1right_loop_down)
        MOVS    a3,a3,LSR #1            // set carry from bit 0 of a3
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     shift1right_loop_down_l1 // yup, so branch
        LDR     a4,[a1,#-4]!            // shift right the first word
        MOVS    a4,a4,RRX               // C -> bit 31, bit 0 -> C
        STR     a4,[a1]
LABEL(shift1right_loop_down_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,a4,RRX               // if zero set result to C (a4 is 0)
        MOVEQS  pc,lr                   // and return
        ANDS    a3,a4,#3                // multiple of 4 words ?
        BEQ     shift1right_loop_down_l3 // yup, so branch
        LDMDB   a1,{a2,a3}              // load 2 words in one go
        MOVS    a3,a3,RRX               // shift right the two words
        MOVS    a2,a2,RRX               // (higher address first)
        STMDB   a1!,{a2,a3}             // store 2 results
        BICS    a4,a4,#2                // decrement counter by 2
        MOVEQ   a1,a4,RRX               // set result to C in bit 31 (a4 is 0),
                                        // same encoding as the other exits
        MOVEQS  pc,lr                   // and return
LABEL(shift1right_loop_down_l3)         // now a multiple of 4 words
        STMFD   sp!,{lr}                // save work regs
LABEL(shift1right_loop_down_l2)
        LDMDB   a1,{a2,a3,ip,lr}        // load 4 words in one go
        MOVS    lr,lr,RRX               // shift right the four words
        MOVS    ip,ip,RRX               // (highest address first)
        MOVS    a3,a3,RRX
        MOVS    a2,a2,RRX
        STMDB   a1!,{a2,a3,ip,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4
        TEQ     a4,#0                   // are we done ?
        BNE     shift1right_loop_down_l2 // if count non-zero then loop
        MOV     a1,a4,RRX               // set result to Carry (a4 is 0)
        LDMFD   sp!,{pc}^               // restore work regs and return carry

// extern uintD shiftright_loop_down (uintD* ptr, uintC count, uintC i);
// Shifts the count-word number stored downwards from ptr (first word at
// ptr[-1]) right by i bits in place. Each stored word is (word >> i) ORed
// with the low bits of the previous (higher-address) word moved to the top;
// zero bits enter the first word.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right (in the top bits)
//               a2 - a4, ip destroyed
// Register roles: v6 = 32 - i, a4 = running shift-out, ip = word counter.
// shiftright_loop_down_l0 is also the entry used by
// shiftrightsigned_loop_down, which arrives with {v6,lr} already pushed,
// v6 set up and a4 holding its own initial carry.
        EXPORT(shiftright_loop_down)    // word aligned shiftright loop down
        DECLARE_FUNCTION(shiftright_loop_down)
GLABEL(shiftright_loop_down)
        STMFD   sp!,{v6,lr}
        MOV     a4,#0                   // initial shift carry
        RSB     v6,a3,#32               // size of complementary left shift
LABEL(shiftright_loop_down_l0)
        ANDS    ip,a2,#3                // multiple of 4 words ?
        BEQ     shiftright_loop_down_l1 // yup, so branch
        LDR     lr,[a1,#-4]!            // shiftright the first 1-3 words
        ORR     a4,a4,lr,LSR a3         // to align the total to a multiple
        STR     a4,[a1]                 // of 4 words
        MOV     a4,lr,ASL v6            // bits shifted out of this word
        CMP     ip,#2                   // LT = 1, EQ = 2, GT = 3 odd words
        BLT     shiftright_loop_down_l1 // better to branch than skip instrs.
        LDRGE   lr,[a1,#-4]!            // second odd word
        ORRGE   a4,a4,lr,LSR a3
        STRGE   a4,[a1]
        MOVGE   a4,lr,ASL v6
        LDRGT   lr,[a1,#-4]!            // third odd word
        ORRGT   a4,a4,lr,LSR a3
        STRGT   a4,[a1]
        MOVGT   a4,lr,ASL v6
LABEL(shiftright_loop_down_l1)
        BICS    ip,a2,#3                // set counter to multiple of 4
        MOVEQ   a1,a4                   // if zero then we're done
        LDMEQFD sp!,{v6,pc}^            // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftright_loop_down_l2)
        LDMDB   a1,{a2,v1,v2,v3}        // load 4 words in one go
                                        // (v3 = highest address, done first)
        ORR     lr,a4,v3,LSR a3         // shiftright the four words
        MOV     a4,v3,ASL v6            // keep carry in a4
        ORR     v3,a4,v2,LSR a3         // and store results up a register
        MOV     a4,v2,ASL v6            // to regs v1-v3,lr
        ORR     v2,a4,v1,LSR a3
        MOV     a4,v1,ASL v6
        ORR     v1,a4,a2,LSR a3
        MOV     a4,a2,ASL v6
        STMDB   a1!,{v1,v2,v3,lr}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftright_loop_down_l2 // if count still positive then loop
        MOV     a1,a4                   // result = last shift out
        LDMFD   sp!,{v1-v3,v6,pc}^      // restore work regs and return

// extern uintD shiftrightsigned_loop_down (uintD* ptr, uintC count, uintC i);
// Sign-propagating variant of shiftright_loop_down. Only the initial carry
// differs: it is built here from the sign of the word at ptr[-1], after
// which control joins the shared code at shiftright_loop_down_l0.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right signed
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftrightsigned_loop_down)// word aligned shiftrightsigned loop down
        DECLARE_FUNCTION(shiftrightsigned_loop_down)
GLABEL(shiftrightsigned_loop_down)
        STMFD   sp!,{v6,lr}             // same frame as shiftright_loop_down
        LDR     lr,[a1,#-4]             // fetch the first (topmost) word
        RSB     v6,a3,#32               // v6 := 32 - i, complementary shift
        MOV     a4,lr,ASR #31           // a4 := 0 or -1, copies of the sign
        MOV     a4,a4,LSL v6            // keep the copies in bits 31..(32-i)
                                        // only; all-zero/all-one input makes
                                        // this equal to a4 AND (a4 LSL v6)
        B       shiftright_loop_down_l0 // continue in the plain right shift

// extern uintD shiftrightcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry);
// Reads count words downwards from sourceptr (first word at sourceptr[-1]),
// shifts the number right by i bits and writes the result downwards from
// destptr. The low bits of the supplied carry are moved to the top
// (carry << (32-i)) and ORed into the first word written.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be shifted right
//               a4 = size of right shift
//               [sp] = carry for first shift
//       exit
//               a1 = shift out from last shift right (in the top bits)
//               a2 - a4, ip destroyed
// Register roles: v6 = 32 - i, v5 = running shift-out, ip = word counter.
        EXPORT(shiftrightcopy_loop_down)// word aligned shiftrightcopy loop down
        DECLARE_FUNCTION(shiftrightcopy_loop_down)
GLABEL(shiftrightcopy_loop_down)
        STMFD   sp!,{v5,v6,lr}
        LDR     v5,[sp,#12]             // initial shift carry (fifth argument,
                                        // now 12 bytes up after the 3 pushes)
        RSB     v6,a4,#32               // size of complementary left shift
        MOV     v5,v5,ASL v6            // position carry bits at the top
LABEL(shiftrightcopy_loop_down_l0)
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     shiftrightcopy_loop_down_l1 // yup, so branch
        LDR     lr,[a1,#-4]!            // shiftright the first 1-3 words
        ORR     v5,v5,lr,LSR a4         // to align the total to a multiple
        STR     v5,[a2,#-4]!            // of 4 words
        MOV     v5,lr,ASL v6            // bits shifted out of this word
        CMP     ip,#2                   // LT = 1, EQ = 2, GT = 3 odd words
        BLT     shiftrightcopy_loop_down_l1 // better to branch than skip instrs.
        LDRGE   lr,[a1,#-4]!            // second odd word
        ORRGE   v5,v5,lr,LSR a4
        STRGE   v5,[a2,#-4]!
        MOVGE   v5,lr,ASL v6
        LDRGT   lr,[a1,#-4]!            // third odd word
        ORRGT   v5,v5,lr,LSR a4
        STRGT   v5,[a2,#-4]!
        MOVGT   v5,lr,ASL v6
LABEL(shiftrightcopy_loop_down_l1)
        BICS    ip,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,v5                   // if zero then we're done
        LDMEQFD sp!,{v5,v6,pc}^         // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftrightcopy_loop_down_l2)
        LDMDB   a1!,{a3,v1,v2,v3}       // load 4 words in one go
                                        // (v3 = highest address, done first)
        ORR     lr,v5,v3,LSR a4         // shiftright the four words
        MOV     v5,v3,ASL v6            // keep carry in v5
        ORR     v3,v5,v2,LSR a4         // and store results up a register
        MOV     v5,v2,ASL v6            // to regs v1-v3,lr
        ORR     v2,v5,v1,LSR a4
        MOV     v5,v1,ASL v6
        ORR     v1,v5,a3,LSR a4
        MOV     v5,a3,ASL v6
        STMDB   a2!,{v1,v2,v3,lr}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftrightcopy_loop_down_l2 // if count still positive then loop
        MOV     a1,v5                   // result = last shift out
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

#ifndef HAVE_umull
// mulu32_64_vregs
// Internal helper: unsigned 32 x 32 -> 64 bit product assembled from four
// 16 x 16 partial products, for processors without a long multiply.
// With x = xh:xl and y = yh:yl (16 bit halves):
//   x*y = xh*yh * 2^32 + (xh*yl + yh*xl) * 2^16 + xl*yl
//       entry
//               a1 = x
//               ip = y
//       exit
//               v1 = low32(x*y)
//               ip = high32(x*y)
//               v2,v3,v4 destroyed
LABEL(mulu32_64_vregs)
        MOV     v2,ip,LSR #16           // v2 := yh
        MOV     v1,a1,LSR #16           // v1 := xh
        BIC     ip,ip,v2,LSL #16        // ip := yl
        BIC     v3,a1,v1,LSL #16        // v3 := xl
        MUL     v4,v3,ip                // v4 := xl*yl  (low partial product)
        MUL     v3,v2,v3                // v3 := yh*xl  ) the two middle
        MUL     ip,v1,ip                // ip := xh*yl  )   partial products
        MUL     v2,v1,v2                // v2 := xh*yh  (high partial product)
        ADDS    ip,v3,ip                // ip := sum of the middle products
                                        // (MLA is no use: the carry is needed)
        ADDCS   v2,v2,#0x10000          // a carry there is worth 2^48
        ADDS    v1,v4,ip,LSL #16        // v1 := low 32 bits of the product
        ADC     ip,v2,ip,LSR #16        // ip := high 32 bits of the product
        MOVS    pc,lr
#endif

// extern uintD mulusmall_loop_up (uintD digit, uintD* ptr, uintC len, uintD newdigit);
// Multiplies the len-word number stored upwards from ptr by digit in place,
// adding newdigit into the lowest word; the high half of each product is
// carried into the next word.
//       entry
//               a1 = digit
//               a2 = ptr
//               a3 = count of words to be multiplied up
//               a4 = new digit = carry
//       exit
//               a1 = final carry of multiply (the incoming a4 when len = 0)
//               a2 - a4, ip destroyed
// NOTE(review): the non-UMULL path forms digit * (16 bit half) with a
// 32 bit MUL, so it presumably relies on digit being below 2^16 - confirm
// against the callers.
        EXPORT(mulusmall_loop_up)
        DECLARE_FUNCTION(mulusmall_loop_up)
GLABEL(mulusmall_loop_up)
        CMP     a3,#0                   // nothing to multiply ?
        MOVEQ   a1,a4                   // then the carry is returned as is
        MOVEQS  pc,lr
#ifdef HAVE_umull
        STMFD   sp!,{v1,lr}
LABEL(mulusmall_loop_up_l1)
        LDR     ip,[a2]
        UMULL   v1,ip,a1,ip             // muluD(digit,*ptr,hi=,lo=)
        ADDS    v1,v1,a4                // lo += carry
        ADC     a4,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a2],#4              // *ptr++ = lo
        SUBS    a3,a3,#1                // len--
        BNE     mulusmall_loop_up_l1    // until len==0
        MOV     a1,a4                   // return carry
        LDMFD   sp!,{v1,pc}^
#else
        STMFD   sp!,{v1-v2,lr}
LABEL(mulusmall_loop_up_l1)
        LDR     ip,[a2]

//      BL      mulu32_64_vregs         // muluD(digit,*ptr,hi=,lo=)
// replaced by multiplication of a small x = a1 and a big y = ip :
        MOV     v1,ip,LSR #16           // top half of y
        BIC     ip,ip,v1,LSL #16        // bottom half of y
        MUL     v2,a1,v1                // middle section of result
        MUL     v1,a1,ip                // low section of result
        MOV     ip,#0                   // high section of result
        ADDS    v1,v1,v2,LSL #16        // bottom 32 bits of result
        ADC     ip,ip,v2,LSR #16        // top 32 bits of result

        ADDS    v1,v1,a4                // lo += carry
        ADC     a4,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a2],#4              // *ptr++ = lo
        SUBS    a3,a3,#1                // len--
        BNE     mulusmall_loop_up_l1    // until len==0
        MOV     a1,a4                   // return carry
        LDMFD   sp!,{v1-v2,pc}^
#endif

// extern void mulu_loop_up (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// Multiplies the len-word number read upwards from sourceptr by digit and
// writes the len+1 word product upwards from destptr: len low words from
// the loop, then the final carry as one extra word.
//       entry
//               a1 = digit
//               a2 = sourceptr
//               a3 = destptr
//               a4 = count of words to be multiplied up
//       exit
//               a1 - a4, ip destroyed
// The loop tests the count only after the first pass and there is no guard
// for len = 0, so callers must pass len >= 1.
        EXPORT(mulu_loop_up)
        DECLARE_FUNCTION(mulu_loop_up)
GLABEL(mulu_loop_up)
#ifdef HAVE_umull
        STMFD   sp!,{v1,v5,lr}
        MOV     v5,#0                   // carry = 0
LABEL(mulu_loop_up_l1)
        LDR     ip,[a2],#4
        UMULL   v1,ip,a1,ip             // muluD(digit,*sourceptr++,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3],#4              // *destptr++ = lo
        SUBS    a4,a4,#1                // len--
        BNE     mulu_loop_up_l1         // until len==0
        STR     v5,[a3],#4              // *destptr++ = carry
        LDMFD   sp!,{v1,v5,pc}^
#else
        STMFD   sp!,{v1-v5,lr}          // helper below destroys v1-v4
        MOV     v5,#0                   // carry = 0
LABEL(mulu_loop_up_l1)
        LDR     ip,[a2],#4
        BL      mulu32_64_vregs         // muluD(digit,*sourceptr++,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3],#4              // *destptr++ = lo
        SUBS    a4,a4,#1                // len--
        BNE     mulu_loop_up_l1         // until len==0
        STR     v5,[a3],#4              // *destptr++ = carry
        LDMFD   sp!,{v1-v5,pc}^
#endif

// extern void muluadd_loop_up (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// NOTE(review): the prototype above says void, yet the code ends with
// "MOV a1,v5 // return carry" - confirm against the C declaration.
// Multiply-accumulate: for len words going upwards,
//   dest[k] += digit * source[k] + carry, with the overflow carried on.
//       entry
//               a1 = digit
//               a2 = sourceptr
//               a3 = destptr
//               a4 = count of words to be multiplied added up
//       exit
//               a1 = final carry
//               a2 - a4, ip destroyed
// The loop tests the count only after the first pass and there is no guard
// for len = 0, so callers must pass len >= 1.
        EXPORT(muluadd_loop_up)
        DECLARE_FUNCTION(muluadd_loop_up)
GLABEL(muluadd_loop_up)
#ifdef HAVE_umull
        STMFD   sp!,{v1,v5,lr}
        MOV     v5,#0                   // carry = 0
LABEL(muluadd_loop_up_l1)
        LDR     ip,[a2],#4
        UMULL   v1,ip,a1,ip             // muluD(digit,*sourceptr++,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADCCS   ip,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     v5,[a3]                 // carry = *destptr
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3],#4              // *destptr++ = lo
        SUBS    a4,a4,#1                // len--
        BNE     muluadd_loop_up_l1      // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1,v5,pc}^
#else
        STMFD   sp!,{v1-v5,lr}          // helper below destroys v1-v4
        MOV     v5,#0                   // carry = 0
LABEL(muluadd_loop_up_l1)
        LDR     ip,[a2],#4
        BL      mulu32_64_vregs         // muluD(digit,*sourceptr++,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADCCS   ip,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     v5,[a3]                 // carry = *destptr
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 }; carry=hi
        STR     v1,[a3],#4              // *destptr++ = lo
        SUBS    a4,a4,#1                // len--
        BNE     muluadd_loop_up_l1      // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1-v5,pc}^
#endif

// extern void mulusub_loop_up (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// NOTE(review): the prototype above says void, yet the code ends with
// "MOV a1,v5 // return carry" - confirm against the C declaration.
// Multiply-subtract: for len words going upwards,
//   dest[k] -= digit * source[k] + carry, with the high product word plus
//   any borrow carried on to the next word.
//       entry
//               a1 = digit
//               a2 = sourceptr
//               a3 = destptr
//               a4 = count of words to be multiplied subtracted up
//       exit
//               a1 = final carry
//               a2 - a4, ip destroyed
// The loop tests the count only after the first pass and there is no guard
// for len = 0, so callers must pass len >= 1.
        EXPORT(mulusub_loop_up)
        DECLARE_FUNCTION(mulusub_loop_up)
GLABEL(mulusub_loop_up)
#ifdef HAVE_umull
        STMFD   sp!,{v1,v5,lr}
        MOV     v5,#0                   // carry = 0
LABEL(mulusub_loop_up_l1)
        LDR     ip,[a2],#4
        UMULL   v1,ip,a1,ip             // muluD(digit,*sourceptr++,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     ip,[a3]                 // carry = *destptr
        SUBS    ip,ip,v1                // C clear = borrow (*destptr < lo)
        STR     ip,[a3],#4              // *destptr++ = carry - lo
        ADDCC   v5,v5,#1                // if (carry<lo) { hi += 1 }; carry=hi
        SUBS    a4,a4,#1                // len--
        BNE     mulusub_loop_up_l1      // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1,v5,pc}^
#else
        STMFD   sp!,{v1-v5,lr}          // helper below destroys v1-v4
        MOV     v5,#0                   // carry = 0
LABEL(mulusub_loop_up_l1)
        LDR     ip,[a2],#4
        BL      mulu32_64_vregs         // muluD(digit,*sourceptr++,hi=,lo=)
        ADDS    v1,v1,v5                // lo += carry
        ADC     v5,ip,#0                // if (lo<carry) { hi += 1 };
        LDR     ip,[a3]                 // carry = *destptr
        SUBS    ip,ip,v1                // C clear = borrow (*destptr < lo)
        STR     ip,[a3],#4              // *destptr++ = carry - lo
        ADDCC   v5,v5,#1                // if (carry<lo) { hi += 1 }; carry=hi
        SUBS    a4,a4,#1                // len--
        BNE     mulusub_loop_up_l1      // until len==0
        MOV     a1,v5                   // return carry
        LDMFD   sp!,{v1-v5,pc}^
#endif

#endif

// extern void shiftxor_loop_up (uintD* xptr, const uintD* yptr, uintC count, uintC i);
// XORs the count-word number at yptr, shifted left by i bits, into the
// number at xptr, walking both arrays upwards:
//   for k = 0 .. count-1:   xptr[k]   ^= yptr[k] << i
//                           xptr[k+1] ^= yptr[k] >> (32-i)
// Note that xptr[count] is read and written back as well; for count = 0
// the routine just reloads and rewrites xptr[0] unchanged.
// NOTE(review): presumably 0 <= i < 32 - confirm against the callers.
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be shifted left
//               a4 = size of left shift
//       exit
//               a1 - a4, ip destroyed
// Register roles:
//   ip = current x word, already combined with the bits shifted out of the
//        previous y word; it is stored only once the low bits of the next
//        y word have been merged in
//   lr = 32 - i, the complementary right shift count
//   v6 = count mod 4 (prologue), v5 = current y word (prologue)
// Structure: a prologue handles count mod 4 words one at a time, then the
// main loop handles the remaining words four at a time.
        EXPORT(shiftxor_loop_up)        // word aligned shiftxor loop up
        DECLARE_FUNCTION(shiftxor_loop_up)
GLABEL(shiftxor_loop_up)
        STMFD   sp!,{v5,v6,lr}
        RSB     lr,a4,#32               // size of complementary right shift
        LDR     ip,[a1]                 // get first *xptr
        ANDS    v6,a3,#3                // multiple of 4 words ?
        BEQ     shiftxor_loop_up_l1     // yup, so branch
        // 1st odd word (count mod 4 >= 1)
        LDR     v5,[a2],#4              // get *yptr++
        EOR     ip,ip,v5,ASL a4         // x word ^= low part of y << i
        STR     ip,[a1],#4              // save finished *xptr++
        LDR     ip,[a1]                 // get next *xptr
        EOR     ip,ip,v5,LSR lr         // next x word ^= bits shifted out of y
        CMP     v6,#2                   // flags stay live down to the GT block
        BLT     shiftxor_loop_up_l1     // better to branch than skip instrs.
        // 2nd odd word (count mod 4 >= 2); none of these instructions
        // changes the flags set by the CMP above
        LDR     v5,[a2],#4              // get *yptr++
        EOR     ip,ip,v5,ASL a4         // x word ^= low part of y << i
        STR     ip,[a1],#4              // save finished *xptr++
        LDR     ip,[a1]                 // get next *xptr
        EOR     ip,ip,v5,LSR lr         // next x word ^= bits shifted out of y
        // 3rd odd word, executed only if count mod 4 == 3 (GT from the CMP)
        LDRGT   v5,[a2],#4              // get *yptr++
        EORGT   ip,ip,v5,ASL a4         // x word ^= low part of y << i
        STRGT   ip,[a1],#4              // save finished *xptr++
        LDRGT   ip,[a1]                 // get next *xptr
        EORGT   ip,ip,v5,LSR lr         // next x word ^= bits shifted out of y
LABEL(shiftxor_loop_up_l1)
        BICS    a3,a3,#3                // set counter to multiple of 4
        STREQ   ip,[a1]                 // nothing left: flush the pending x word
        LDMEQFD sp!,{v5,v6,pc}^         // return if done
        STMFD   sp!,{v1-v4}             // save work regs
                                        // (stack now holds v1-v4,v5,v6,lr)
LABEL(shiftxor_loop_up_l2)
        LDMIA   a2!,{v3,v4,v5,v6}       // load 4 words yptr[0..3] in one go
        EOR     v1,ip,v3,ASL a4         // new x[0] = pending word ^ (y[0] << i)
        LDR     v2,[a1,#4]              // x[1]
        EOR     v2,v2,v3,LSR lr         //   ^= bits shifted out of y[0]
        EOR     v2,v2,v4,ASL a4         //   ^= y[1] << i
        LDR     v3,[a1,#8]              // x[2]; v3 (y[0]) is no longer needed
        EOR     v3,v3,v4,LSR lr         //   ^= bits shifted out of y[1]
        EOR     v3,v3,v5,ASL a4         //   ^= y[2] << i
        LDR     v4,[a1,#12]             // x[3]; v4 (y[1]) is no longer needed
        EOR     v4,v4,v5,LSR lr         //   ^= bits shifted out of y[2]
        EOR     v4,v4,v6,ASL a4         //   ^= y[3] << i
        STMIA   a1!,{v1,v2,v3,v4}       // store 4 words xptr[0..3] in one go
        LDR     ip,[a1]                 // next x word becomes the pending word
        EOR     ip,ip,v6,LSR lr         //   ^= bits shifted out of y[3]
        SUBS    a3,a3,#4                // decrement counter by 4
        BGT     shiftxor_loop_up_l2
        STR     ip,[a1]                 // flush the pending word, xptr[count]
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

        END


// syntax highlighted by Code2HTML, v. 0.9.1