; irrednas.s: IBM PC assembler routines for irredg (below version 2.00) ; using nasm ; ; Copyright (C) 2000 Richard P. Brent. ; ; These routines come with absolutely no warranty. ; For conditions of use see the GNU General Public License ; and the comments in the calling program irred.c (version 3.10 or above). ; ; Equivalent to routines in irredasm.s except for prefetching ; Use these routines on P-III and above ; (for earlier processors use irredasm.s/irredasm.o) ; ; Richard Brent (rpb@comlab.ox.ac.uk), last revised 20000717, ; correction 20000816 (fortunately this does not change the results) ; ; Changes made by Carlo Wood (20021104): ; - Renamed GLOBAL names to have the prefix libecc_ in order to avoid ; global namespace collisions. ; - Added type and size of global symbols, necessary for a shared library. ; - Removed the assembly routine `reducer'. ; - Changed "unsigned long" in the comments by uint32_t. ; - Added a parameter to squarem() in order to seperate input and output vectors. ; - Removed an erroneous rest on the 'q1' parameter of squarem and changed its ; meaning to be 'the number of digits' (removed an 'inc ecx' at the start of ; of the function). ; ; Assemble with ; ; nasm -f elf -o irrednas.o irrednas.s ; ; nasm can be downloaded from http://sourceforge.net/projects/nasm/ ;######################################## squarem ############################# ; ; void libecc_squarem(uint32_t* in, uint32_t* out, int digits); ; ; Version of square for IBM PC, using MMX instructions. ; ; Based on gas code by Samuli Larvala (Samuli.Larvala@hut.fi) 20000629 ; Converted by Richard Brent 20000706 ; Seperated input and output buffer and changed meaning of the `digits' ; parameter to 'number of digits of `in'' by Carlo Wood 20021104 ; SECTION .data align 16 C0: dd 0x0F0F0F0F ; These are really three 64-bit dd 0x0F0F0F0F ; constants C1: dd 0x33333333 dd 0x33333333 C2: dd 0x55555555 dd 0x55555555 SECTION .text align 16 global libecc_squarem:function (libecc_squarem_end - libecc_squarem) libecc_squarem: ; src:DWORD, len:DWORD push ebp mov ebp, esp push ebx push ecx mov ecx, [ebp+16] ; q1 mov ebx, [ebp+12] ; out mov eax, [ebp+8] ; in movq mm0, [C0] ; Load constants movq mm1, [C1] movq mm2, [C2] movq mm3, [ecx*4+eax-8] ; Load first value to square test ecx, 1 ; Test if ecx is odd jz sqr_mainloop ; If not, skip following code movq mm4, mm3 ; ecx was odd, do one 32-bit square psrlq mm3, 4 punpckhbw mm4, mm3 pand mm4, mm0 ; 0x0H0H0H0H0H0H0H0H movq mm5, mm4 psllq mm4, 2 por mm5, mm4 movq mm3, [ecx*4+eax-12] pand mm5, mm1 movq mm4, mm5 psllq mm5, 1 por mm4, mm5 pand mm4, mm2 movq [ecx*8+ebx-8], mm4 dec ecx jz sqr_exit align 4 sqr_mainloop: ; mm0 = 0F0F0F0F0F0F0F0F ; mm1 = 3333333333333333 ; mm2 = 5555555555555555 ; mm3 = ABCDEFGHabcdefgh movq mm4, mm3 ; mm4 = ABCDEFGHabcdefgh psrlq mm3, 4 ; mm3 = 0ABCDEFGHabcdefg movq mm5, mm4 ; mm5 = ABCDEFGHabcdefgh punpckhbw mm4, mm3 ; mm4 = 0AABBCCDDEEFFGGH punpcklbw mm5, mm3 ; mm5 = Haabbccddeeffggh pand mm4, mm0 ; mm4 = 0A0B0C0D0E0F0G0H pand mm5, mm0 ; mm5 = 0a0b0c0d0e0f0g0h movq mm3, [ecx*4+eax-16] ; Preload next 64-bits movq mm7, mm4 movq mm6, mm5 psllq mm4, 2 psllq mm5, 2 por mm7, mm4 prefetcht0 [ecx*4+eax-48] ; Prefetch a (four ahead) por mm6, mm5 pand mm7, mm1 pand mm6, mm1 movq mm4, mm7 movq mm5, mm6 psllq mm7, 1 psllq mm6, 1 por mm4, mm7 por mm5, mm6 pand mm4, mm2 movq [ecx*8+ebx-8], mm4 ; Store H pand mm5, mm2 movq [ecx*8+ebx-16], mm5 ; Store L add ecx, byte -2 jnz sqr_mainloop sqr_exit: pop ecx pop ebx pop ebp emms ; Clear MMX state ret libecc_squarem_end: ;######################################## reducem ############################# ; ; uint32_t libecc_reducem(uint32_t* a, uint32_t* b, uint32_t* c, int r1, int r2, int kt); ; ; Version of reducea for IBM PC, using MMX instructions. ; ; Based on gas code by Samuli Larvala (Samuli.Larvala@hut.fi) 20000629 ; Converted by Richard Brent 20000706 ; ; Allows r1 and/or r2 zero (unlike the C code) ; SECTION .text align 16 global libecc_reducem:function (libecc_reducem_end - libecc_reducem) libecc_reducem: ; aptr:DWORD, bptr:DWORD, cptr:DWORD, r1:DWORD, r2:DWORD, kt:DWORD ; 36 40 44 48 52 56 pusha ; lazy, but overhead small mov edx, [esp+56] ; kt pxor mm2, mm2 ; old (here in case early exit) mov ecx, [esp+48] ; r1 test edx, edx ; counter (j) jz near rdc_exit inc edx mov eax, [esp+52] ; r2 mov esi, [esp+36] ; aptr mov ebx, [esp+40] ; bptr mov edi, [esp+44] ; cptr movd mm0, ecx ; mm0 = r1 movd mm1, eax ; mm1 = r2 xor ecx, byte 63 ; We are working with 64-bit words xor eax, byte 63 ; so compute 64-r1, 64-r2 inc ecx ; The case r1 or r2 zero is OK inc eax ; because MMX does shift by 64 ; correctly, i.e. gives zero result ! test edx, 1 ; test if length edx is odd jz rdc_mainloop movd mm4, [esi+edx*4-4] ; a[j] (was movq, fixed 20000816) movq mm5, [ebx+edx*4-4] ; b[j] movq mm6, [edi+edx*4-4] ; c[j] movq mm3, mm4 psrlq mm4, mm0 ; (new >> r1) movq mm2, mm3 psrlq mm3, mm1 ; (new >> r2) pxor mm5, mm4 ; b[j] xor (new >> r1) pxor mm6, mm3 ; c[j] xor (new >> r2) movq [ebx+edx*4-4], mm5 ; store new b[j] movq [edi+edx*4-4], mm6 ; store new c[j] dec edx jz rdc_exit align 4 rdc_mainloop: movq mm3, [esi+edx*4-8] ; a[j] movd mm5, ecx ; r1c movq mm7, mm2 ; old prefetcht0 [edx*4+esi-40] ; prefetch a (four ahead) psllq mm2, mm5 ; old << r1c (shift by 64 is OK) movq mm6, mm3 ; new movq mm4, [ebx+edx*4-8] ; b[j] psrlq mm3, mm0 ; new >> r1 prefetcht0 [edx*4+ebx-32] ; prefetch b (four ahead) movq mm5, [edi+edx*4-8] ; c[j] por mm3, mm2 ; (new>>r1) or (old<>r1) or (old<> r2 movq [ebx+edx*4-8], mm4 ; store b[j] por mm6, mm7 pxor mm5, mm6 movq [edi+edx*4-8], mm5 ; store c[j] add edx, byte -2 jnz rdc_mainloop rdc_exit: popa movd eax, mm2 ; return last new emms ; clear MMX state ret libecc_reducem_end: ;######################################## reduce2 ############################# ; ; Fast modulo reduction of polynomial Q(x) (mod 2) ; ; Based on gas code by Samuli Larvala (Samuli.Larvala@hut.fi) 20000626 ; ; Modified by Richard Brent (rpb@comlab.ox.ac.uk) 20000629 ; to use 32-bit memory XOR, which may be faster in cases where the second ; and third arguments are not on 8-byte boundaries. ; Converted to nasm 20000705. ; ; Allows r1 and/or r2 to be zero (unlike C code) ; ; aptr:DWORD, bptr:DWORD, cptr:DWORD, r1:DWORD, r2:DWORD, kt:DWORD ; 36 40 44 48 52 56 ; ; Assumes r-s > 63 (as for 64-bit version) ; ; Equivalent to following C code except unrolled twice (and assumes this ; does not cause any problems because of overlap of arrays a, b, c - hence ; the restriction r-s > 63), and the assembler does "proper" shifts ; if r1 or r2 is zero whereas C does not. ; ; uint32_t reducea(uint32_t* a, uint32_t* b, uint32_t* c, int r1, int r2, int kt) ; { ; uint32_t d1, d2 = 0; ; for (int j = kt; j >= 0; --j) ; { ; d1 = d2; ; d2 = a[j]; ; b[j] ^= (d2 >> r1) | (d1 << (32 - r1)); ; c[j] ^= (d2 >> r2) | (d1 << (32 - r2)); ; } ; return new; ; } ; SECTION .text align 16 global libecc_reduce2:function (libecc_reduce2_end - libecc_reduce2) libecc_reduce2: pusha ; lazy, but overhead small mov edx, [esp+56] ; kt pxor mm2, mm2 ; old (here in case early exit) mov ecx, [esp+48] ; r1 test edx, edx ; counter (j) jz near rdc2_exit inc edx mov eax, [esp+52] ; r2 mov esi, [esp+36] ; aptr mov ebx, [esp+40] ; bptr mov edi, [esp+44] ; cptr movd mm0, ecx ; mm0 = r1 movd mm1, eax ; mm1 = r2 xor ecx, byte 63 ; We are working with 64-bit words xor eax, byte 63 ; so need 64-r1 not 32-r1, etc. inc ecx ; r1c = 64 - r1 inc eax ; r2c = 64 - r2 test edx, 1 ; test if length edx is odd jz rdc2_init movd mm4, [edx*4+esi-4] ; a[j] (this one was OK) movq mm5, [edx*4+ebx-4] ; b[j] movq mm6, [edx*4+edi-4] ; c[j] movq mm3, mm4 psrlq mm4, mm0 ; (new >> r1) movq mm2, mm3 psrlq mm3, mm1 ; (new >> r2) pxor mm5, mm4 ; b[j] xor (new >> r1) pxor mm6, mm3 ; c[j] xor (new >> r2) movq [edx*4+ebx-4], mm5 ; store new b[j] movq [edx*4+edi-4], mm6 ; store new c[j] dec edx jz rdc2_exit ; Following loop optimised as suggested by Samuli 2000712, ; note that instructions are grouped in triplets rdc2_init: lea ebx,[ebx+edx*4-4] ; see comments after rdc2_mainloop lea edi,[edi+edx*4-4] movd mm5, ecx ; r1c (in range 33..64) movd mm4, eax ; r2c align 4 rdc2_mainloop: movq mm3, [edx*4+esi-8] ; new = (a[j+1], a[j]) movq mm7, mm2 ; old prefetcht0 [edx*4+esi-40] ; prefetch a (four ahead) psllq mm2, mm5 ; old << r1c (shift by 64 is OK) movq mm6, mm3 ; new psrlq mm3, mm0 ; new >> r1 por mm3, mm2 ; (new>>r1) | (old<> r2 por mm6, mm7 ; (new>>r2) | (old<>r1) | (old<>r2) | (old<