dnl  Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is a modified part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`config.m4')
        TEXT
	GLOBL GSYM_PREFIX`'ecm_redc3
	TYPE(GSYM_PREFIX`'ecm_redc3,`function')

GSYM_PREFIX`'ecm_redc3:
	push	%ebp					# Push registers
	push	%edi
	push	%esi
	push	%ebx
	subl	$16, %esp				# SF: 2 Cpt + Jump +1

	movl	44(%esp), %ecx                          # Read size
	movl	36(%esp), %edi				# Read Dest Ptr
	movl	%ecx, (%esp)				# Save counter
        cmpl    $5, %ecx
        jae     Unroll		
Loop:	
		movl	48(%esp), %eax			# Read invm
	        movl    40(%esp), %esi                  # Read Source Ptr
		mull	(%edi)				# Dest[0] * invm
		movl	%edi, 36(%esp)			# Save new Dest
		movl	44(%esp), %ecx			# Read Size (2)
		xorl	%ebx, %ebx			# Initial Carry
		movl	%eax, %ebp			# Multiplier
InnerLoop:
		        # esi:	  Source
		        # edi:	  Dest
			# ebp:	  Multiplier
			# ecx:	  Counter
		        movl    (%esi), %eax		# U1
			addl    $4, %edi		# V1
			mull    %ebp			# U2
			addl    $4, %esi		# V2
			addl    %ebx, %eax		# U3
		        adcl    $0, %edx		# U4
			addl    %eax, -4(%edi)		# V4
			adcl    $0, %edx		# U5
			decl    %ecx			# V5
			movl    %edx, %ebx		# U6
			jnz     InnerLoop		# V6
		movl	36(%esp), %edi
		movl    %ebx, (%edi)                    # Save final carry
		decl	(%esp)
		lea	4(%edi), %edi			# Advance Dest
		jnz     Loop				# Loop
End:
	addl	$16, %esp
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret
	
Unroll:
# %ecx Read size // %edi Dest Ptr
	# Precalcul du saut 
	movl    %ecx, %edx
        decl    %ecx
	subl    $2, %edx
	negl    %ecx	
	shrl    $4, %edx
	andl    $15, %ecx
	movl    %edx, 8(%esp)				# Org Cpt of 4(%esp)
	movl    %ecx, %edx
	shll    $4, %edx
	negl    %ecx
        leal    UnrollEntry (%edx, %ecx,1), %edx
	movl	%ecx, 44(%esp)				# (-size)%16
	movl	%edx, 12(%esp)				# Org PC inside	

UnrollLoop:	
                movl    48(%esp), %eax                  # Read invm
                movl    40(%esp), %esi                  # Read Source Ptr
                mull    (%edi)                          # Dest[0] * invm
                movl    %edi, 36(%esp)                  # Save new Dest
                movl    44(%esp), %ecx                  # Read Size %16
		movl    8(%esp), %edx			# Read InnerLoop Cpt
	        movl    %eax, %ebp                      # Set Multiplier
		movl	%edx, 4(%esp)			# Set InnerLoop Cpt
	
		# First mull and set initial carry
	        movl    (%esi), %eax
	        leal    4(%esi,%ecx,4), %esi
	        mull    %ebp
		leal    (%edi,%ecx,4), %edi
	        movl    %edx, %ebx
	
		# Do the Jump inside the unrolling loop
		# And set up the registers differently if odd
	        movl    12(%esp), %edx
	        testl   $1, %ecx
	        movl    %eax, %ecx
		cmovnz  %ebx, %ecx
	        cmovnz  %eax, %ebx	
	        jmp     *%edx
		
		        # eax   scratch
			# ebx   carry hi
			# ecx   carry lo
			# edx   scratch
			# esi   src
			# edi   dst
			# ebp   multiplier

	       .align  32, 0x90
UnrollInnerLoop:	
		addl    $64, %edi
UnrollEntry:	
#	        movl    0(%esi), %eax # Can't use this instruction
	        .byte   0x8b,0x46,0x00
	        mull    %ebp
#	        addl    %ecx, 0(%edi) # Can't use this instruction
	        .byte   0x01,0x4f,0x00
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    4(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 4(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    8(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 8(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    12(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 12(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    16(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 16(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    20(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 20(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    24(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 24(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx
	
	        movl    28(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 28(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    32(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 32(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    36(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 36(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    40(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 40(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    44(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 44(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    48(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 48(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    52(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 52(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        movl    56(%esi), %eax
	        mull    %ebp
	        addl    %ecx, 56(%edi)
	        adcl    %eax, %ebx
	        movl    %edx, %ecx
	        adcl    $0, %ecx

	        movl    60(%esi), %eax
	        mull    %ebp
	        addl    %ebx, 60(%edi)
	        adcl    %eax, %ecx
	        movl    %edx, %ebx
	        adcl    $0, %ebx

	        decl    4(%esp)
	        leal    64(%esi), %esi
	        jns     UnrollInnerLoop

	        addl    %ecx, 64(%edi)
	        movl    36(%esp), %edi
	        adcl    $0, %ebx
                movl    %ebx, (%edi)                    # Save final carry
                decl    (%esp)
                lea     4(%edi), %edi                   # Advance Dest
                jnz     UnrollLoop                      # Loop
End2:	
        addl    $16, %esp
        pop     %ebx
        pop     %esi
        pop     %edi
        pop     %ebp
        ret


syntax highlighted by Code2HTML, v. 0.9.1