dnl IA-64 mpn_Xshift.

dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C This code runs at 2 cycles/limb for large operands on the Itanium. It needs
C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The
C main loop here is not great; it is overscheduled with respect to the shr.u
C instructions, and this actually turns out to give considerably more complex
C wind down code. The code runs slowly for operands with <= 8 limbs, since we
C have a non-scheduled loop for that case. We also have a primitive loop for
C the unrolling edge, and as a consequence of the main loop stupidity it is
C executed 1-4 steps instead of 0-3 steps.

C By having 63 separate loops using the shrp instruction, we could easily reach
C 1 cycle/limb. Such loops would require a less deep software pipeline, since
C shrp, unlike shl/shr.u, has a plain one cycle latency.

C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34
C cnt = r35

C Operation-dependent definitions, selected by OPERATION_lshift or
C OPERATION_rshift:
C   FSH   shift applied with count cnt to the current limb
C   BSH   opposite shift, applied with count 64-cnt to the following limb
C   UPD   post-increment used on both pointers: -8 for lshift, 8 for rshift
C   func  name of the function being defined
C Every result limb is FSH(current limb, cnt) | BSH(next limb, 64-cnt); the
C final result limb is FSH(last limb, cnt) alone.
ifdef(`OPERATION_lshift',`
	define(`FSH',`shl')
	define(`BSH',`shr.u')
	define(`UPD',`-8')
	define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
	define(`FSH',`shr.u')
	define(`BSH',`shl')
	define(`UPD',`8')
	define(`func',`mpn_rshift')
')

ASM_START()
PROLOGUE(func)
	.prologue
C With HAVE_ABI_32, convert both pointers with addp4, sign-extend n and
C zero-extend cnt from 32 bits.
ifdef(`HAVE_ABI_32',
`	addp4	r32 = 0, r32
	addp4	r33 = 0, r33
	sxt4	r34 = r34
	zxt4	r35 = r35
	;;
')
	add	r34 = -1, r34		C r34 = n-1
	sub	r31 = 64, r35		C r31 = 64-cnt, the count used by BSH
	.save	ar.lc, r2
	mov	r2 = ar.lc;;		C keep ar.lc in r2; restored before each return
	.body
	cmp.leu	p6, p7 = 8,r34		C p6 = (8 <= n-1), unsigned compare
C For lshift, move both pointers to limb n-1 (byte offset 8*(n-1)); the
C operands are then walked downwards since UPD is -8.
ifdef(`OPERATION_lshift',`
	shladd	r33 = r34, 3, r33
	shladd	r32 = r34, 3, r32;;
')
	ld8	r19 = [r33], UPD ;;	C r19 = first source limb
	BSH	r8 = r19, r31		C function return value
  (p6)	br.dptk	.Lbig			C n-1 >= 8: use the pipelined code

C
C Code for small operands. Not an optimization for the Itanium, it is here
C just to simplify the general case.
C
C ar.lc = n-1.  When that is zero (n = 1) the br.cloop falls through and the
C single limb is shifted, stored, and the function returns.  Otherwise
C .Loops stores n-1 combined limbs and the code after it stores the last one.
C
	mov	ar.lc = r34;;
	br.cloop.dptk .Loops
	FSH	r26 = r19, r35 ;;
	st8	[r32] = r26
	mov	ar.lc = r2
	br.ret.sptk.many b0
.Loops:
	ld8	r16 = [r33], UPD	C r16 = next source limb
	FSH	r26 = r19, r35 ;;	C r26 = FSH(current limb, cnt)
	BSH	r27 = r16, r31 ;;	C r27 = BSH(next limb, 64-cnt)
	{ .mib;	nop.b 0;; }		C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }		C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }		C delay to save 6 cycles...
	or	r27 = r27, r26		C combine the two halves
	mov	r19 = r16 ;;		C next limb becomes the current limb
	st8	[r32] = r27, UPD
	br.cloop.dptk .Loops
	FSH	r26 = r19, r35 ;;	C last limb: only the FSH half exists
	st8	[r32] = r26
	mov	ar.lc = r2
	br.ret.sptk.many b0

C
C Code for operands with >8 limbs. An edge loop and a very deep software
C pipeline.
C
C r15 = (n-1) mod 4 and r14 = (n-1)/4.  The edge loop .Loop0 runs r15+1 times
C (1 to 4 limbs), using the same body as .Loops.  That leaves 4*r14 result
C limbs for the unrolled code, with the current limb still held in r19.
C
.Lbig:	and	r15 = 3, r34
	shr.u	r14 = r34, 2 ;;
	mov	ar.lc = r15
.Loop0:
	ld8	r16 = [r33], UPD
	FSH	r26 = r19, r35 ;;
	BSH	r27 = r16, r31 ;;
	{ .mib;	nop.b 0;; }		C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }		C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }		C delay to save 6 cycles...
	or	r27 = r27, r26
	mov	r19 = r16 ;;
	st8	[r32] = r27, UPD
	br.cloop.dptk .Loop0

C r14 >= 2 on this path because n-1 >= 8.  The pipeline fill together with
C the wind-down code produces 8 result limbs, so .Loop is set up to run
C r14-2 times (possibly zero times).
.Lunroll:
	add	r14 = -2, r14 ;;
	mov	ar.lc = r14

C Pipeline fill, part 1: start loading limbs into r16, r17, r18, r19 and
C begin the first shifts.  Nothing is stored yet.
.Lphase1:
	{ .mmi
	ld8	r16 = [r33], UPD ;;
	}
	{ .mmi
	ld8	r17 = [r33], UPD ;;
	}
	{ .mmi
	ld8	r18 = [r33], UPD
	FSH	r26 = r19, r35 ;;
	}
	{ .mmi
	ld8	r19 = [r33], UPD
	BSH	r27 = r16, r31 ;;
	}
	{ .mib
	FSH	r20 = r16, r35
	}

C Pipeline fill, part 2: two more loads and the first or.  Enter .Loop if
C ar.lc is nonzero, otherwise go straight to the wind-down code.
.Lphase2:
	{ .mmi
	ld8	r16 = [r33], UPD
	BSH	r21 = r17, r31
	}
	{ .mib
	FSH	r22 = r17, r35 ;;
	}
	{ .mmi
	ld8	r17 = [r33], UPD
	BSH	r23 = r18, r31
	}
	{ .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35
	br.cloop.dptk .Loop
	}
	br.sptk	.Lend2

C Steady state: each pass loads 4 limbs and stores 4 result limbs.  Loaded
C limbs rotate through r16-r19.  Partial results are held in the pairs
C r26/r27, r20/r21, r22/r23 and r24/r25 (FSH half / BSH half); each or merges
C a pair into its BSH register, which is stored by a later bundle.
.Loop:
	{ .mmi
	st8	[r32] = r27, UPD
	ld8	r18 = [r33], UPD
	BSH	r25 = r19, r31
	}
	{ .mib
	or	r21 = r21, r20
	FSH	r26 = r19, r35 ;;
	}
	{ .mmi
	st8	[r32] = r21, UPD
	ld8	r19 = [r33], UPD
	BSH	r27 = r16, r31
	}
	{ .mib
	or	r23 = r23, r22
	FSH	r20 = r16, r35 ;;
	}
	{ .mmi
	st8	[r32] = r23, UPD
	ld8	r16 = [r33], UPD
	BSH	r21 = r17, r31
	}
	{ .mib
	or	r25 = r25, r24
	FSH	r22 = r17, r35 ;;
	}
	{ .mmi
	st8	[r32] = r25, UPD
	ld8	r17 = [r33], UPD
	BSH	r23 = r18, r31
	}
	{ .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35
	br.cloop.sptk .Loop;;
	}

C Wind-down: one final load (into r18), then drain the pipeline with 8
C stores.  The very last store writes r24, the FSH half of the last limb,
C without an or and without a pointer update.
.Lend2:
	{ .mmi
	st8	[r32] = r27, UPD
	ld8	r18 = [r33], UPD
	BSH	r25 = r19, r31
	}
	{ .mib
	or	r21 = r21, r20
	FSH	r26 = r19, r35 ;;
	}
	{ .mmi
	st8	[r32] = r21, UPD
	BSH	r27 = r16, r31
	}
	{ .mib
	or	r23 = r23, r22
	FSH	r20 = r16, r35 ;;
	}
	{ .mmi
	st8	[r32] = r23, UPD
	BSH	r21 = r17, r31
	}
	{ .mib
	or	r25 = r25, r24
	FSH	r22 = r17, r35 ;;
	}
	{ .mmi
	st8	[r32] = r25, UPD
	BSH	r23 = r18, r31
	}
	{ .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35 ;;
	}
	{ .mmi
	st8	[r32] = r27, UPD
	}
	{ .mib
	or	r21 = r21, r20 ;;
	}
	{ .mmi
	st8	[r32] = r21, UPD
	}
	{ .mib
	or	r23 = r23, r22 ;;
	}
	{ .mmi
	st8	[r32] = r23, UPD;;
	}
	{ .mmi
	st8	[r32] = r24
	}
	mov	ar.lc = r2		C restore the saved ar.lc
	br.ret.sptk.many b0
EPILOGUE(func)
ASM_END()