dnl IA-64 mpn_popcount. dnl Copyright 2000, 2001, 2006 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. dnl Runs at 1 cycle/limb on the Itanium. That is the peak performance for the dnl popcnt instruction, so this is optimal code. It should be straightforward dnl to write mpn_hamdist with the same awesome performance. 
include(`../config.m4')

define(ABI32,
m4_assert_onearg()
`ifdef(`HAVE_ABI_32',`$1')')

C mpn_popcount -- add up the popcnt of every limb of an n-limb vector.
C
C INPUT PARAMETERS
C sp = r32	address of the first limb, advanced by 8 after every load
C n = r33	number of limbs
C
C RESULT
C r8		running total of set bits, holds the final sum at each return
C
C REGISTER USAGE
C r2		copy of the incoming ar.lc, written back before each return
C r16-r19	limbs loaded from memory
C r20-r23	popcnt results, r22 and r23 first hold n mod 4 and n / 4
C
C STRUCTURE
C 1. .Loop0 handles n mod 4 limbs, one limb per iteration.
C 2. The n / 4 remaining groups of four limbs go through a software pipeline:
C      .L1	load group 1
C      .L2	popcnt group 1, load group 2
C      .Loop	add counts of group k-1, popcnt group k, load group k+1
C      .Ldone0	drain when two or more groups were loaded
C      .Ldone1	drain when exactly one group was loaded
C    A group is only loaded after a br.cloop has confirmed it exists, so no
C    limb beyond sp[n-1] is read.
C
C br.cloop is taken, and decrements ar.lc, while ar.lc is nonzero.  With
C n = 0 neither entry branch is taken and r8 = 0 is returned.

ASM_START()
PROLOGUE(mpn_popcount)
	.prologue
ABI32(`		addp4	r32 = 0, r32')	C M src extend
C .save describes the instruction that follows it, so it must sit directly in
C front of the ar.lc copy.  Placing it ahead of the ABI32 addp4 would record
C the save one slot too early in 32-bit ABI builds.
	.save	ar.lc, r2
		mov	r2 = ar.lc	C I0 keep caller ar.lc
ABI32(`		zxt4	r33 = r33')	C I1 size extend
		;;
	.body
		and	r22 = 3, r33		C r22 = n mod 4
		shr.u	r23 = r33, 2	;;	C r23 = n / 4
		mov	ar.lc = r22
		mov	r8 = 0		;;	C clear the running total
		br.cloop.dpnt	.Loop0	;;	C any leftover limbs?
		br		.L0

C One limb per iteration, executed n mod 4 times in total.
.Loop0:		ld8	r16 = [r32], 8	;;
		popcnt	r20 = r16	;;
		add	r8 = r8, r20
		br.cloop.dptk	.Loop0	;;

C Set up the count of four-limb groups.
.L0:		mov	ar.lc = r23	;;
		br.cloop.dptk	.L1	;;	C at least one group?
		mov	ar.lc = r2		C none, put back ar.lc and return
		br.ret.sptk.many b0	;;

C Pipeline fill, stage 1: load the first group.
.L1:		ld8	r16 = [r32], 8	;;
		ld8	r17 = [r32], 8	;;
		ld8	r18 = [r32], 8	;;
		ld8	r19 = [r32], 8	;;
		br.cloop.dptk	.L2	;;	C a second group exists?
		br		.Ldone1	;;

C Pipeline fill, stage 2: count the first group while loading the second.
C From here on r22 and r23 hold popcnt results, the group count lives in ar.lc.
.L2:		popcnt	r20 = r16
		ld8	r16 = [r32], 8	;;
		popcnt	r21 = r17
		ld8	r17 = [r32], 8	;;
		popcnt	r22 = r18
		ld8	r18 = [r32], 8	;;
		popcnt	r23 = r19
		ld8	r19 = [r32], 8	;;
		br.cloop.dptk	.Loop	;;	C a third group exists?
		br		.Ldone0

C Steady state.  In each instruction group the add consumes the count made one
C pass earlier, the popcnt consumes the limb loaded one pass earlier, and the
C ld8 fetches a limb for the next pass.
.Loop:		add	r8 = r8, r20
		popcnt	r20 = r16
		ld8	r16 = [r32], 8	;;
		add	r8 = r8, r21
		popcnt	r21 = r17
		ld8	r17 = [r32], 8	;;
		add	r8 = r8, r22
		popcnt	r22 = r18
		ld8	r18 = [r32], 8	;;
		add	r8 = r8, r23
		popcnt	r23 = r19
		ld8	r19 = [r32], 8
		br.cloop.dptk	.Loop	;;

C Drain after two or more groups: add the pending counts, count the last
C loaded group, then fold those four counts into r8.
.Ldone0:	add	r8 = r8, r20
		popcnt	r20 = r16	;;
		add	r8 = r8, r21
		popcnt	r21 = r17	;;
		add	r8 = r8, r22
		popcnt	r22 = r18	;;
		add	r8 = r8, r23
		popcnt	r23 = r19	;;
		add	r21 = r21, r20
		add	r23 = r23, r22	;;
		add	r8 = r8, r21	;;
		add	r8 = r8, r23
		mov	ar.lc = r2		C put back caller ar.lc
		br.ret.sptk.many b0

C Drain after exactly one group: nothing is pending, count and sum the four
C loaded limbs.
.Ldone1:	popcnt	r20 = r16
		popcnt	r21 = r17
		popcnt	r22 = r18
		popcnt	r23 = r19	;;
		add	r21 = r21, r20
		add	r23 = r23, r22	;;
		add	r8 = r8, r21	;;
		add	r8 = r8, r23
		mov	ar.lc = r2		C put back caller ar.lc
		br.ret.sptk.many b0
EPILOGUE(mpn_popcount)
ASM_END()