; irrednas.s: IBM PC assembler routines for irredg (below version 2.00)
;             using nasm
;
; Copyright (C) 2000 Richard P. Brent.
;
; These routines come with absolutely no warranty.
; For conditions of use see the GNU General Public License
; and the comments in the calling program irred.c (version 3.10 or above).
;
; Equivalent to routines in irredasm.s except for prefetching
; Use these routines on P-III and above 
; (for earlier processors use irredasm.s/irredasm.o)
;
; Richard Brent (rpb@comlab.ox.ac.uk), last revised 20000717,
; correction 20000816 (fortunately this does not change the results)	
;
; Changes made by Carlo Wood (20021104):
; - Renamed GLOBAL names to have the prefix libecc_ in order to avoid
;   global namespace collisions.
; - Added type and size of global symbols, necessary for a shared library.
; - Removed the assembly routine `reducer'.
; - Changed "unsigned long" in the comments by uint32_t.
; - Added a parameter to squarem() in order to seperate input and output vectors.
; - Removed an erroneous rest on the 'q1' parameter of squarem and changed its
;   meaning to be 'the number of digits' (removed an 'inc ecx' at the start of
;   of the function).
;
; Assemble with
;
;     nasm -f elf -o irrednas.o irrednas.s
;
; nasm can be downloaded from http://sourceforge.net/projects/nasm/

;######################################## squarem #############################
;
; void libecc_squarem(uint32_t* in, uint32_t* out, int digits);
;
; Version of square for IBM PC, using MMX instructions.
;
; Based on gas code by Samuli Larvala (Samuli.Larvala@hut.fi) 20000629
; Converted by Richard Brent 20000706
; Seperated input and output buffer and changed meaning of the `digits'
; parameter to 'number of digits of `in'' by Carlo Wood 20021104
;
	SECTION .data
	align 16

C0:	dd 0x0F0F0F0F			; These are really three 64-bit
	dd 0x0F0F0F0F			; constants
C1:	dd 0x33333333
	dd 0x33333333
C2:	dd 0x55555555
	dd 0x55555555
	
	SECTION .text
	align 16
global	libecc_squarem:function (libecc_squarem_end - libecc_squarem)

libecc_squarem:	
; src:DWORD, len:DWORD

	push ebp
	mov  ebp, esp
	push ebx
	push ecx
	
	mov  ecx, [ebp+16]		; q1
	mov  ebx, [ebp+12]		; out
	mov  eax, [ebp+8]		; in


	movq mm0, [C0]			; Load constants
	movq mm1, [C1]
	movq mm2, [C2]

	movq mm3, [ecx*4+eax-8]		; Load first value to square


	test ecx, 1			; Test if ecx is odd
	jz   sqr_mainloop		; If not, skip following code
	
	movq mm4, mm3			; ecx was odd, do one 32-bit square
	psrlq mm3, 4
	
	punpckhbw mm4, mm3
	pand mm4, mm0			; 0x0H0H0H0H0H0H0H0H

	movq mm5, mm4
	psllq mm4, 2
	
	por  mm5, mm4
	movq mm3, [ecx*4+eax-12]
	pand mm5, mm1
	
	movq mm4, mm5
	psllq mm5, 1

	por  mm4, mm5
	pand mm4, mm2

	movq [ecx*8+ebx-8], mm4
	dec  ecx
	jz   sqr_exit

	align 4

sqr_mainloop:

	; mm0 = 0F0F0F0F0F0F0F0F
	; mm1 = 3333333333333333
	; mm2 = 5555555555555555
	; mm3 = ABCDEFGHabcdefgh	

	movq mm4, mm3			; mm4 = ABCDEFGHabcdefgh
	psrlq mm3, 4			; mm3 = 0ABCDEFGHabcdefg

	movq mm5, mm4			; mm5 = ABCDEFGHabcdefgh
	punpckhbw mm4, mm3		; mm4 = 0AABBCCDDEEFFGGH
	
	punpcklbw mm5, mm3		; mm5 = Haabbccddeeffggh
	pand mm4, mm0			; mm4 = 0A0B0C0D0E0F0G0H

	pand mm5, mm0			; mm5 = 0a0b0c0d0e0f0g0h

	movq mm3, [ecx*4+eax-16]	; Preload next 64-bits
	movq mm7, mm4

	movq mm6, mm5
	psllq mm4, 2
	
	psllq mm5, 2
	por  mm7, mm4
 
	prefetcht0 [ecx*4+eax-48]	; Prefetch a (four ahead) 

	por  mm6, mm5
	pand mm7, mm1

	pand mm6, mm1
	movq mm4, mm7
	
	movq mm5, mm6
	psllq mm7, 1
	
	psllq mm6, 1
	por  mm4, mm7
	
	por  mm5, mm6
	pand mm4, mm2
	
	movq [ecx*8+ebx-8], mm4		; Store H
	pand mm5, mm2

	movq [ecx*8+ebx-16], mm5	; Store L

	add  ecx, byte -2
	jnz  sqr_mainloop

sqr_exit:
	pop ecx
	pop ebx
	pop ebp
	emms				; Clear MMX state
	ret

libecc_squarem_end:

;######################################## reducem #############################
;
; uint32_t libecc_reducem(uint32_t* a, uint32_t* b, uint32_t* c, int r1, int r2, int kt);
;
; Version of reducea for IBM PC, using MMX instructions.
;
; Based on gas code by Samuli Larvala (Samuli.Larvala@hut.fi) 20000629
; Converted by Richard Brent 20000706
;
; Allows r1 and/or r2 zero (unlike the C code)
;
	SECTION .text
	align 16
global	libecc_reducem:function (libecc_reducem_end - libecc_reducem)

libecc_reducem:	
; aptr:DWORD, bptr:DWORD, cptr:DWORD, r1:DWORD, r2:DWORD, kt:DWORD
;  36          40          44          48        52        56

	pusha				; lazy, but overhead small

	mov edx, [esp+56]		; kt
	pxor mm2, mm2			; old	(here in case early exit)
	mov ecx, [esp+48]		; r1
	test edx, edx			; counter (j)
	jz   near rdc_exit

	inc edx
	mov eax, [esp+52]		; r2
	mov esi, [esp+36]		; aptr
	mov ebx, [esp+40]		; bptr
	mov edi, [esp+44]		; cptr
	movd mm0, ecx			; mm0 = r1
	movd mm1, eax			; mm1 = r2

	xor ecx, byte 63		; We are working with 64-bit words
	xor eax, byte 63		; so compute 64-r1, 64-r2
	inc ecx				; The case r1 or r2 zero is OK
	inc eax				; because MMX does shift by 64
					; correctly, i.e. gives zero result !

	test edx, 1			; test if length edx is odd
	jz   rdc_mainloop

	movd mm4, [esi+edx*4-4]		; a[j] (was movq, fixed 20000816)
	movq mm5, [ebx+edx*4-4]		; b[j] 
	movq mm6, [edi+edx*4-4]		; c[j] 

	movq mm3, mm4
	psrlq mm4, mm0			; (new >> r1)
	movq mm2, mm3
	psrlq mm3, mm1			; (new >> r2)

	pxor mm5, mm4			; b[j] xor (new >> r1)
	pxor mm6, mm3			; c[j] xor (new >> r2)

	movq [ebx+edx*4-4], mm5		; store new b[j]
	movq [edi+edx*4-4], mm6		; store new c[j]
	dec  edx
	jz   rdc_exit

	align 4

rdc_mainloop:
	movq mm3, [esi+edx*4-8]		; a[j]
	movd mm5, ecx			; r1c
	movq mm7, mm2			; old

        prefetcht0 [edx*4+esi-40]       ; prefetch a (four ahead)

	psllq mm2, mm5			; old << r1c (shift by 64 is OK)
	movq mm6, mm3			; new
	movq mm4, [ebx+edx*4-8]		; b[j]

	psrlq mm3, mm0			; new >> r1
        prefetcht0 [edx*4+ebx-32]       ; prefetch b (four ahead)
	movq mm5, [edi+edx*4-8]		; c[j]
	por  mm3, mm2			; (new>>r1) or (old<<r1c)

	movd mm2, eax			; r2c
        prefetcht0 [edx*4+edi-32]       ; prefetch c (four ahead)

	psllq mm7, mm2			; old << r2c (shift by 64 is OK)
	pxor mm4, mm3			; b[j] xor ((new>>r1) or (old<<r1c))

	movq mm2, mm6			; new (new becomes old)
	psrlq mm6, mm1			; new >> r2

	movq [ebx+edx*4-8], mm4		; store b[j]
	por  mm6, mm7
	pxor mm5, mm6

	movq [edi+edx*4-8], mm5		; store c[j]
	add edx, byte -2
	jnz  rdc_mainloop

rdc_exit:
	popa
	movd eax, mm2			; return last new
	emms				; clear MMX state
	ret

libecc_reducem_end:

;######################################## reduce2 #############################
; 
; Fast modulo reduction of polynomial Q(x) (mod 2)
;
; Based on gas code by Samuli Larvala (Samuli.Larvala@hut.fi) 20000626
;
; Modified by Richard Brent (rpb@comlab.ox.ac.uk) 20000629
; to use 32-bit memory XOR, which may be faster in cases where the second
; and third arguments are not on 8-byte boundaries.
; Converted to nasm 20000705.
;
; Allows r1 and/or r2 to be zero (unlike C code)
;
; aptr:DWORD, bptr:DWORD, cptr:DWORD, r1:DWORD, r2:DWORD, kt:DWORD
;  36          40          44          48        52        56
;
; Assumes r-s > 63 (as for 64-bit version)
;
; Equivalent to following C code except unrolled twice (and assumes this
; does not cause any problems because of overlap of arrays a, b, c - hence
; the restriction r-s > 63), and the assembler does "proper" shifts
; if r1 or r2 is zero whereas C does not.
;
; uint32_t reducea(uint32_t* a, uint32_t* b, uint32_t* c, int r1, int r2, int kt)	
; {
;   uint32_t d1, d2 = 0;
;   for (int j = kt; j >= 0; --j)
;   {
;     d1 = d2;
;     d2 = a[j];
;     b[j] ^= (d2 >> r1) | (d1 << (32 - r1));
;     c[j] ^= (d2 >> r2) | (d1 << (32 - r2));
;   }
;   return new;  
; }  
;

	SECTION .text
	align 16
global	libecc_reduce2:function (libecc_reduce2_end - libecc_reduce2)

libecc_reduce2:	
	pusha				; lazy, but overhead small

	mov edx, [esp+56]		; kt
	pxor mm2, mm2			; old	(here in case early exit)
  	mov ecx, [esp+48]		; r1
	test edx, edx			; counter (j)
	jz near rdc2_exit

	inc edx
	mov eax, [esp+52]		; r2
	mov esi, [esp+36]		; aptr
	mov ebx, [esp+40]		; bptr
	mov edi, [esp+44]		; cptr
	movd mm0, ecx			; mm0 = r1
	movd mm1, eax			; mm1 = r2

	xor ecx, byte 63		; We are working with 64-bit words
	xor eax, byte 63		; so need 64-r1 not 32-r1, etc.
	inc ecx				; r1c = 64 - r1
	inc eax				; r2c = 64 - r2

	test edx, 1			; test if length edx is odd
	jz   rdc2_init

	movd mm4, [edx*4+esi-4]		; a[j] (this one was OK)
	movq mm5, [edx*4+ebx-4]		; b[j]
	movq mm6, [edx*4+edi-4]		; c[j]

	movq  mm3, mm4
	psrlq mm4, mm0			; (new >> r1)
	movq  mm2, mm3
	psrlq mm3, mm1			; (new >> r2)

	pxor  mm5, mm4			; b[j] xor (new >> r1)
	pxor  mm6, mm3			; c[j] xor (new >> r2)

	movq  [edx*4+ebx-4], mm5	; store new b[j]
	movq  [edx*4+edi-4], mm6	; store new c[j]
	dec   edx
	jz    rdc2_exit

; Following loop optimised as suggested by Samuli 2000712,
; note that instructions are grouped in triplets

rdc2_init:
	lea  ebx,[ebx+edx*4-4]		; see comments after rdc2_mainloop
	lea  edi,[edi+edx*4-4]

	movd mm5, ecx			; r1c (in range 33..64)
	movd mm4, eax			; r2c

	align 4

rdc2_mainloop:

	movq mm3, [edx*4+esi-8]		; new = (a[j+1], a[j])
	movq mm7, mm2			; old
	prefetcht0 [edx*4+esi-40]	; prefetch a (four ahead)

	psllq mm2, mm5			; old << r1c (shift by 64 is OK)
	movq  mm6, mm3			; new
	psrlq mm3, mm0			; new >> r1

	por   mm3, mm2			; (new>>r1) | (old<<r1c)
	psllq mm7, mm4			; old << r2c (shift by 64 is OK)
	movq  mm2, mm6			; new (new becomes old)

	psrlq mm6, mm1			; new >> r2
	por   mm6, mm7			; (new>>r2) | (old<<r2c)
	movd  ebp, mm3			; low 32 bits

	psrlq mm3, 32			; shift right 32
	movd  ecx, mm3			; other 32 bits
	movd  eax, mm6			; low 32 bits

	xor   [ebx], ecx		; b[j+1] ^= etc
	prefetcht0 [ebx-32]		; prefetch b (four ahead)
	sub   ebx, byte 8		

	xor   [ebx+4], ebp		; b[j] ^= (new>>r1) | (old<<r1c)
	psrlq mm6, 32			; shift right 32
	movd  ecx, mm6			; other 32 bits

	xor   [edi], ecx		; c[j+1] ^= etc
	prefetcht0 [edi-32]		; prefetch c (four ahead)
	sub   edi, byte 8		

	xor   [edi+4], eax		; c[j] ^= (new>>r2) | (old<<r2c)
	add   edx, byte -2		; loop counter j -= 2
	jnz   rdc2_mainloop		; end inner loop

; I only added edx to ebx and edi since with esi, we have just modified edx
; (at the end of the loop) so we should get it for free. Also, adding an
; extra instruction to modify esi would add a new decoding cycle. Samuli

rdc2_exit:

	popa
	movd eax, mm2			; return last new (32 bits)
	emms				; clear MMX registers
	ret

libecc_reduce2_end:

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;