dnl  x86 mpn_mod_1 -- mpn by limb remainder.

dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C 486     42 approx, maybe
C P5      44
C P6      39
C K6      20
C K7      41
C P4      58


C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C
C Essentially this code is the same as the division based part of
C mpn/generic/mod_1.c, but has the advantage that we get the desired divl
C instruction even when gcc is not being used (where longlong.h only has the
C rather slow generic C udiv_qrnnd().
C
C A test is done to see if the high limb is less than the divisor, and if so
C one less div is done.  A div is between 20 and 40 cycles on the various
C x86s, so assuming high<divisor about half the time, then this test saves
C half that amount.  The branch misprediction penalty on each chip is less
C than half a div.
C
C
C Notes for K6:
C
C Back-to-back div instructions take 20 cycles, the same as the loop here,
C so it seems there's nothing to gain by rearranging.  Pairing the mov and
C loop instructions was found to gain nothing.  Normally we use a loop
C instruction rather than decl/jnz, but it gains nothing here.
C
C A multiply-by-inverse is used in mpn/x86/k6/pre_mod_1.asm, but it saves
C only 2 c/l so currently we haven't bothered with the same for mpn_mod_1.
C If an inverse takes about 40 cycles for normalized or perhaps 60 for
C unnormalized (due to bsfl being slow on k6) then the threshold would be at
C least 20 or 30 limbs.
C

defframe(PARAM_CARRY,  16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)

	TEXT

	ALIGN(16)
PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%esi		FRAME_pushl()

	orl	%ecx, %ecx
	jz	L(done_zero)

	movl	PARAM_DIVISOR, %esi
	movl	-4(%ebx,%ecx,4), %eax	C src high limb

	cmpl	%esi, %eax

	sbbl	%edx, %edx		C -1 if high<divisor

	addl	%edx, %ecx		C skip one division if high<divisor
	jz	L(done_eax)

	andl	%eax, %edx		C carry if high<divisor


L(top):
	C eax	scratch (quotient)
	C ebx	src
	C ecx	counter
	C edx	carry (remainder)
	C esi	divisor
	C edi
	C ebp

	movl	-4(%ebx,%ecx,4), %eax

	divl	%esi

	decl	%ecx
	jnz	L(top)


	movl	%edx, %eax
L(done_eax):
	popl	%esi

	popl	%ebx

	ret

EPILOGUE()


	C This code located after mpn_mod_1, so the jump to L(top) here is
	C back and hence will be predicted as taken.  (size==0 is considered
	C unlikely.)

	ALIGN(16)
PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%esi		FRAME_pushl()

	movl	PARAM_DIVISOR, %esi
	orl	%ecx, %ecx

	movl	PARAM_CARRY, %edx
	jnz	L(top)

	popl	%esi
	movl	%edx, %eax

	popl	%ebx

	ret


	C This code is for mpn_mod_1, but is positioned here to save some
	C space in the alignment padding.
	C
L(done_zero):
	popl	%esi
	xorl	%eax, %eax

	popl	%ebx

	ret

EPILOGUE()


syntax highlighted by Code2HTML, v. 0.9.1