#
#  mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y,
#                 const mp_limb_t m, mp_limb_t inv_m)
#
#  Compute z := x*y mod m, in Montgomery representation, where x, y < m
#  and m is n limb wide.  inv_m is the less significant limb of the
#  inverse of m modulo 2^(n*GMP_LIMB_BITS)
#
#  The result might be unreduced (larger than m) but becomes reduced
#  after subtracting m. The calling function should take care of that.
#
#  We use a temporary space for unreduced product on the stack.
#  Therefore, this can not be used for large integers (anyway, the
#  algorithm is quadratic).
#
#  WARNING: z is only n limbs but since it might be unreduced, there
#  could be a carry that does not fit in z. This carry is returned.

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc1
	TYPE(GSYM_PREFIX`'mulredc1,`function')

GSYM_PREFIX`'mulredc1:
# Stack:
#    inv_m  20(%esp)
#    m      16
#    y      12(%esp)
#    x      8
#    z      4(%esp)

	movl	12(%esp), %eax
	mull	8(%esp)
	movl	%edx, 12(%esp)
	movl	%eax, 8(%esp)   # store xy in [8(%esp):12(%esp)]
	mull	20(%esp)          # compute u
	mull	16(%esp)         # compute u*m
	addl	8(%esp), %eax       # eax is 0, now (carry is important)
	adcl	12(%esp), %edx
	movl	4(%esp), %ecx
	movl    %edx, (%ecx)
	adcl	$0, %eax
	ret



syntax highlighted by Code2HTML, v. 0.9.1