dnl  IA-64 mpn_popcount.

dnl  Copyright 2000, 2001, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

dnl  Runs at 1 cycle/limb on the Itanium.  That is the peak performance for the
dnl  popcnt instruction, so this is optimal code.  It should be straightforward
dnl  to write mpn_hamdist with the same awesome performance.

include(`../config.m4')

dnl  Usage: ABI32(text)
dnl
dnl  Expands to text when HAVE_ABI_32 is defined and to nothing otherwise.
dnl  It is used below to emit the pointer and size extension instructions
dnl  only for the 32-bit ABI.  m4_assert_onearg is defined outside this
dnl  file; presumably it checks that exactly one argument was given --
dnl  TODO confirm.
define(ABI32,
m4_assert_onearg()
`ifdef(`HAVE_ABI_32',`$1')')

C INPUT PARAMETERS
C sp = r32	source limb pointer
C n = r33	number of limbs

ASM_START()
C mpn_popcount
C
C Accumulates in r8 the popcnt of each of the n 8-byte limbs read from the
C address in r32, and returns with that total in r8.
C
C Register use:
C   r2       saved copy of ar.lc, written back before each br.ret
C   r8       running total
C   r16-r19  limbs loaded from memory
C   r20-r23  popcnt results (r22 and r23 first hold n mod 4 and n / 4)
C
C Structure: a one-limb loop (.Loop0) consumes n mod 4 limbs, then the
C remaining n / 4 groups of four limbs go through a pipelined loop (.Loop)
C in which the loads run one group ahead of the popcnt and the popcnt runs
C one group ahead of the add into r8.
PROLOGUE(mpn_popcount)
	.prologue
	.save	ar.lc, r2
ABI32(`		addp4	r32 = 0, r32')	C M  src extend
		mov	r2 = ar.lc	C I0
ABI32(`		zxt4	r33 = r33')	C I1  size extend
		;;
	.body

C Split n: r22 = n mod 4 limbs for .Loop0, r23 = n / 4 groups of four.
		and	r22 = 3, r33
		shr.u	r23 = r33, 2	;;
		mov	ar.lc = r22
		mov	r8 = 0		;;
C br.cloop branches and decrements ar.lc only while ar.lc is nonzero, so
C with n mod 4 = 0 this falls through and .Loop0 is skipped.
		br.cloop.dpnt	.Loop0	;;
		br	.L0
C One limb per iteration: load, popcnt, add to the total.
.Loop0:		ld8	r16 = [r32], 8	;;
		popcnt	r20 = r16	;;
		add	r8 = r8, r20
		br.cloop.dptk	.Loop0	;;

C Four-limb part.  With n / 4 = 0 nothing is left: restore ar.lc and return.
.L0:		mov	ar.lc = r23	;;
		br.cloop.dptk	.L1	;;
		mov	ar.lc = r2
		br.ret.sptk.many b0	;;
C Load the first group of four limbs.
.L1:		ld8	r16 = [r32], 8	;;
		ld8	r17 = [r32], 8	;;
		ld8	r18 = [r32], 8	;;
		ld8	r19 = [r32], 8	;;
		br.cloop.dptk	.L2    ;;
C Only one group in total: finish it in .Ldone1.
		br		.Ldone1	;;
C popcnt the first group while loading the second group.
.L2:
		popcnt	r20 = r16
		ld8	r16 = [r32], 8	;;
		popcnt	r21 = r17
		ld8	r17 = [r32], 8	;;
		popcnt	r22 = r18
		ld8	r18 = [r32], 8	;;
		popcnt	r23 = r19
		ld8	r19 = [r32], 8	;;
		br.cloop.dptk	.Loop  ;;
C Exactly two groups: drain them in .Ldone0.
		br		.Ldone0

C Steady state, four limbs per iteration.  Each step adds a popcnt produced
C by the previous pass, takes the popcnt of a limb loaded by the previous
C pass, and reloads the limb register just consumed.
.Loop:		add	r8 = r8, r20
		popcnt	r20 = r16
		ld8	r16 = [r32], 8	;;
		add	r8 = r8, r21
		popcnt	r21 = r17
		ld8	r17 = [r32], 8	;;
		add	r8 = r8, r22
		popcnt	r22 = r18
		ld8	r18 = [r32], 8	;;
		add	r8 = r8, r23
		popcnt	r23 = r19
		ld8	r19 = [r32], 8
		br.cloop.dptk	.Loop	;;

C Drain after .L2 or .Loop: r20-r23 hold popcnts not yet added and r16-r19
C hold limbs not yet counted.  No further loads are issued.
.Ldone0:
		add	r8 = r8, r20
		popcnt	r20 = r16	;;
		add	r8 = r8, r21
		popcnt	r21 = r17	;;
		add	r8 = r8, r22
		popcnt	r22 = r18	;;
		add	r8 = r8, r23
		popcnt	r23 = r19	;;
C Sum the last four popcnts pairwise, then fold both halves into r8.
		add	r21 = r21, r20
		add	r23 = r23, r22	;;
		add	r8 = r8, r21	;;
		add	r8 = r8, r23
		mov	ar.lc = r2
		br.ret.sptk.many b0

C Drain after .L1: r16-r19 hold the only group of four limbs and no popcnt
C result is waiting to be added.
.Ldone1:
		popcnt	r20 = r16
		popcnt	r21 = r17
		popcnt	r22 = r18
		popcnt	r23 = r19	;;
		add	r21 = r21, r20
		add	r23 = r23, r22	;;
		add	r8 = r8, r21	;;
		add	r8 = r8, r23
		mov	ar.lc = r2
		br.ret.sptk.many b0
EPILOGUE(mpn_popcount)
ASM_END()


dnl  syntax highlighted by Code2HTML, v. 0.9.1