dnl IA-64 mpn_Xshift.
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C This code runs at 2 cycles/limb for large operands on the Itanium. It needs
C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The
C main loop here is not great; it is overscheduled with respect to the shr.u
C instructions, and this actually turns out to give considerably more complex
C wind down code. The code runs slowly for operands with <= 8 limbs, since we
C have a non-scheduled loop for that case. We also have a primitive loop for
C the unrolling edge, and as a consequence of the main loop stupidity it is
C executed 1-4 steps instead of 0-3 steps.
C By having 63 separate loops using the shrp instruction, we could easily reach
C 1 cycle/limb. Such loops would require a less deep software pipeline, since
C shrp, unlike shl/shr.u, has a plain one-cycle latency.
C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34
C cnt = r35
C Macro set for the left shift build.
C   FSH   shift applied with the count cnt to the limb being produced
C   BSH   opposite shift applied with 64 - cnt to the adjacent limb to obtain
C         the bits that cross the limb boundary
C   UPD   post-increment in bytes used on every load and store; -8 makes the
C         pointers step downwards through the operands
C   func  name of the function being defined
ifdef(`OPERATION_lshift',`
define(`FSH',`shl')
define(`BSH',`shr.u')
define(`UPD',`-8')
define(`func',`mpn_lshift')
')
C Macro set for the right shift build.
C   FSH   shift applied with the count cnt to the limb being produced
C   BSH   opposite shift applied with 64 - cnt to the adjacent limb to obtain
C         the bits that cross the limb boundary
C   UPD   post-increment in bytes used on every load and store; 8 makes the
C         pointers step upwards through the operands
C   func  name of the function being defined
ifdef(`OPERATION_rshift',`
define(`FSH',`shr.u')
define(`BSH',`shl')
define(`UPD',`8')
define(`func',`mpn_rshift')
')
ASM_START()
C func is mpn_lshift or mpn_rshift according to the OPERATION_ symbol in effect.
C Register usage in the body:
C   r32  rp, store pointer          r33  sp, load pointer
C   r34  n - 1                      r35  cnt
C   r31  64 - cnt                   r8   return value
C   r2   incoming value of ar.lc, restored before every return
PROLOGUE(func)
.prologue
C 32-bit ABI only: convert both pointers with addp4, sign extend n and zero
C extend cnt from 32 bits.
ifdef(`HAVE_ABI_32',
` addp4 r32 = 0, r32
addp4 r33 = 0, r33
sxt4 r34 = r34
zxt4 r35 = r35
;;
')
C r34 becomes n - 1 and r31 the complementary shift count 64 - cnt.
add r34 = -1, r34
sub r31 = 64, r35
C ar.lc is used as the loop counter below, so keep the incoming value in r2.
.save ar.lc, r2
mov r2 = ar.lc;;
.body
C p6 is set when 8 <= n - 1 as an unsigned compare and selects the .Lbig path.
cmp.leu p6, p7 = 8,r34
C For lshift, advance both pointers by 8 * r34 bytes so that processing starts
C at the last limb; UPD is -8 there and the pointers then step downwards.
ifdef(`OPERATION_lshift',`
shladd r33 = r34, 3, r33
shladd r32 = r34, 3, r32;;
')
C Load the first limb into r19.  BSH of it by 64 - cnt gives the bits that are
C shifted out of the operand, which is what the function returns in r8.
ld8 r19 = [r33], UPD ;;
BSH r8 = r19, r31 C function return value
(p6) br.dptk .Lbig
C
C Code for small operands. Not an optimization for the Itanium, it is here
C just to simplify the general case.
C
C ar.lc = n - 1.  The br.cloop falls through only when that is zero, that is
C for a single limb, in which case the result is just FSH of that limb.
mov ar.lc = r34;;
br.cloop.dptk .Loops
FSH r26 = r19, r35 ;;
st8 [r32] = r26
mov ar.lc = r2
br.ret.sptk.many b0
C One limb per pass: r19 is the current limb and r16 the next one.  The stored
C value is FSH of the current limb merged with BSH of the next limb, after
C which the next limb becomes the current one.
.Loops:
ld8 r16 = [r33], UPD
FSH r26 = r19, r35 ;;
BSH r27 = r16, r31 ;;
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
or r27 = r27, r26
mov r19 = r16 ;;
st8 [r32] = r27, UPD
br.cloop.dptk .Loops
C Last limb: there is no following limb to merge in, so store FSH of it alone.
FSH r26 = r19, r35 ;;
st8 [r32] = r26
mov ar.lc = r2
br.ret.sptk.many b0
C
C Code for operands with >8 limbs. An edge loop and a very deep software
C pipeline.
C
C r15 = r34 mod 4 drives the edge loop; r14 = r34 / 4 counts groups of four
C limbs for the unrolled code.
.Lbig: and r15 = 3, r34
shr.u r14 = r34, 2 ;;
mov ar.lc = r15
C Edge loop, same body as .Loops.  With ar.lc = r15 it runs r15 + 1 times,
C leaving 4 * r14 limbs still to be stored.
.Loop0:
ld8 r16 = [r33], UPD
FSH r26 = r19, r35 ;;
BSH r27 = r16, r31 ;;
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
or r27 = r27, r26
mov r19 = r16 ;;
st8 [r32] = r27, UPD
br.cloop.dptk .Loop0
C The fill code and the drain code together account for two groups of four
C limbs, so the unrolled loop proper runs r14 - 2 times.
.Lunroll:
add r14 = -2, r14 ;;
mov ar.lc = r14
C Pipeline fill: .Lphase1 and .Lphase2 issue six loads and the first shifts,
C and no stores yet.
.Lphase1:
{ .mmi
ld8 r16 = [r33], UPD ;;
} { .mmi
ld8 r17 = [r33], UPD ;;
} { .mmi
ld8 r18 = [r33], UPD
FSH r26 = r19, r35 ;;
} { .mmi
ld8 r19 = [r33], UPD
BSH r27 = r16, r31 ;;
} { .mib
FSH r20 = r16, r35
}
.Lphase2:
{ .mmi
ld8 r16 = [r33], UPD
BSH r21 = r17, r31
} { .mib
FSH r22 = r17, r35 ;;
} { .mmi
ld8 r17 = [r33], UPD
BSH r23 = r18, r31
} { .mib
or r27 = r27, r26
FSH r24 = r18, r35
br.cloop.dptk .Loop
}
C ar.lc was zero: no full loop passes are needed, go straight to the drain
C code.
br.sptk .Lend2
C Main loop, four limbs per pass.  r16-r19 hold loaded limbs; r20, r22, r24 and
C r26 hold FSH results; r21, r23, r25 and r27 hold BSH results and, after the
C or, the merged limbs that get stored.
.Loop:
{ .mmi
st8 [r32] = r27, UPD
ld8 r18 = [r33], UPD
BSH r25 = r19, r31
} { .mib
or r21 = r21, r20
FSH r26 = r19, r35 ;;
} { .mmi
st8 [r32] = r21, UPD
ld8 r19 = [r33], UPD
BSH r27 = r16, r31
} { .mib
or r23 = r23, r22
FSH r20 = r16, r35 ;;
} { .mmi
st8 [r32] = r23, UPD
ld8 r16 = [r33], UPD
BSH r21 = r17, r31
} { .mib
or r25 = r25, r24
FSH r22 = r17, r35 ;;
} { .mmi
st8 [r32] = r25, UPD
ld8 r17 = [r33], UPD
BSH r23 = r18, r31
} { .mib
or r27 = r27, r26
FSH r24 = r18, r35
br.cloop.sptk .Loop;;
}
C Pipeline drain: one final load, then the outstanding shifts, merges and
C eight stores.  The very last store is r24, FSH of the final limb alone.
.Lend2:
{ .mmi
st8 [r32] = r27, UPD
ld8 r18 = [r33], UPD
BSH r25 = r19, r31
} { .mib
or r21 = r21, r20
FSH r26 = r19, r35 ;;
} { .mmi
st8 [r32] = r21, UPD
BSH r27 = r16, r31
} { .mib
or r23 = r23, r22
FSH r20 = r16, r35 ;;
} { .mmi
st8 [r32] = r23, UPD
BSH r21 = r17, r31
} { .mib
or r25 = r25, r24
FSH r22 = r17, r35 ;;
} { .mmi
st8 [r32] = r25, UPD
BSH r23 = r18, r31
} { .mib
or r27 = r27, r26
FSH r24 = r18, r35 ;;
}
{ .mmi
st8 [r32] = r27, UPD
} { .mib
or r21 = r21, r20 ;;
} { .mmi
st8 [r32] = r21, UPD
} { .mib
or r23 = r23, r22 ;;
} { .mmi
st8 [r32] = r23, UPD;;
} { .mmi
st8 [r32] = r24
}
C Restore the incoming ar.lc and return; the result is already in r8.
mov ar.lc = r2
br.ret.sptk.many b0
EPILOGUE(func)
ASM_END()
dnl syntax highlighted by Code2HTML, v. 0.9.1