lh | 9ed821d | 2023-04-07 01:36:19 -0700 | [diff] [blame] | 1 | # Alpha 21064 __mpn_lshift -- shift a limb vector left by cnt bits. |
| 2 | |
| 3 | # Copyright (C) 1994-2015 Free Software Foundation, Inc. |
| 4 | |
| 5 | # This file is part of the GNU MP Library. |
| 6 | |
| 7 | # The GNU MP Library is free software; you can redistribute it and/or modify |
| 8 | # it under the terms of the GNU Lesser General Public License as published by |
| 9 | # the Free Software Foundation; either version 2.1 of the License, or (at your |
| 10 | # option) any later version. |
| 11 | |
| 12 | # The GNU MP Library is distributed in the hope that it will be useful, but |
| 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 14 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| 15 | # License for more details. |
| 16 | |
| 17 | # You should have received a copy of the GNU Lesser General Public License |
| 18 | # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>. |
| 19 | |
| 20 | |
| 21 | # INPUT PARAMETERS |
| 22 | # res_ptr r16 |
| 23 | # s1_ptr r17 |
| 24 | # size r18 |
| 25 | # cnt r19 |
| 26 | |
| 27 | # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, |
| 28 | # it would take 4 cycles/limb. It should be possible to get down to 3 |
| 29 | # cycles/limb since both ldq and stq can be paired with the other used |
| 30 | # instructions. But there are many restrictions in the 21064 pipeline that |
| 31 | # make it hard, if not impossible, to get down to 3 cycles/limb: |
| 32 | |
| 33 | # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. |
| 34 | # 2. Only aligned instruction pairs can be paired. |
| 35 | # 3. The store buffer or silo might not be able to deal with the bandwidth. |
| 36 | |
| 37 | .set noreorder # keep the hand-scheduled instruction order exactly as written |
| 38 | .set noat # assembler must not use the $at temporary behind our back |
| 39 | .text |
| 40 | .align 3 |
| 41 | .globl __mpn_lshift |
| 42 | .ent __mpn_lshift |
| 43 | __mpn_lshift: |
| 44 | .frame $30,0,$26,0 # zero-size frame; return address stays in $26 |
| 45 | # Setup: both vectors are walked from the most significant limb downwards. |
| 46 | s8addq $18,$17,$17 # make r17 point at end of s1 (s1_ptr + 8*size) |
| 47 | ldq $4,-8($17) # load first (most significant) limb into r4 |
| 48 | subq $17,8,$17 # r17 now addresses the limb just loaded |
| 49 | subq $31,$19,$7 # r7 = 0 - cnt; used as the right-shift count below |
| 50 | s8addq $18,$16,$16 # make r16 point at end of RES (res_ptr + 8*size) |
| 51 | subq $18,1,$18 # r18 = size - 1 = limbs still to be loaded |
| 52 | and $18,4-1,$20 # number of limbs in first loop: (size-1) mod 4 |
| 53 | srl $4,$7,$0 # compute function result: r0 = first limb >> r7. NOTE(review): relies on srl using only the low 6 bits of the count, so -cnt acts as 64-cnt; presumably cnt is 1..63 -- confirm |
| 54 | # If size-1 is already a multiple of 4, go straight to the unrolled loop. |
| 55 | beq $20,.L0 |
| 56 | subq $18,$20,$18 # r18 = limbs left for the unrolled loop (a multiple of 4) |
| 57 | # Loop0: one limb per iteration, r20 times. r4 holds the previous (higher) limb. |
| 58 | .align 3 |
| 59 | .Loop0: |
| 60 | ldq $3,-8($17) # r3 = next lower source limb |
| 61 | subq $16,8,$16 # step result pointer down one limb |
| 62 | subq $17,8,$17 # step source pointer down one limb |
| 63 | subq $20,1,$20 # one fewer limb left for this loop |
| 64 | sll $4,$19,$5 # r5 = higher limb << cnt |
| 65 | srl $3,$7,$6 # r6 = lower limb >> r7 (the bits that carry into the higher limb) |
| 66 | bis $3,$3,$4 # r4 = r3: the lower limb is the higher limb of the next step |
| 67 | bis $5,$6,$8 # merge both parts into one result limb |
| 68 | stq $8,0($16) # store result limb |
| 69 | bne $20,.Loop0 |
| 70 | |
| 71 | .L0: beq $18,.Lend # no full groups of 4 limbs left |
| 72 | # Loop: 4 limbs per iteration. r3 and r4 alternate as lower/higher limb, so no register copy is needed. |
| 73 | .align 3 |
| 74 | .Loop: ldq $3,-8($17) # r3 = 1st lower limb of this group |
| 75 | subq $16,32,$16 # result pointer down 4 limbs; stores below use offsets 24,16,8,0 |
| 76 | subq $18,4,$18 # 4 fewer limbs remaining |
| 77 | sll $4,$19,$5 # r5 = higher limb (r4) << cnt |
| 78 | srl $3,$7,$6 # r6 = carry-in bits from r3 |
| 79 | |
| 80 | ldq $4,-16($17) # r4 = 2nd lower limb |
| 81 | sll $3,$19,$1 # r1 = r3 << cnt |
| 82 | bis $5,$6,$8 |
| 83 | stq $8,24($16) # store 1st result limb of the group |
| 84 | srl $4,$7,$2 # r2 = carry-in bits from r4 |
| 85 | |
| 86 | ldq $3,-24($17) # r3 = 3rd lower limb |
| 87 | sll $4,$19,$5 # r5 = r4 << cnt |
| 88 | bis $1,$2,$8 |
| 89 | stq $8,16($16) # store 2nd result limb |
| 90 | srl $3,$7,$6 # r6 = carry-in bits from r3 |
| 91 | |
| 92 | ldq $4,-32($17) # r4 = 4th lower limb; stays live for the next iteration or the tail |
| 93 | sll $3,$19,$1 # r1 = r3 << cnt |
| 94 | bis $5,$6,$8 |
| 95 | stq $8,8($16) # store 3rd result limb |
| 96 | srl $4,$7,$2 # r2 = carry-in bits from r4 |
| 97 | |
| 98 | subq $17,32,$17 # source pointer down 4 limbs |
| 99 | bis $1,$2,$8 |
| 100 | stq $8,0($16) # store 4th result limb |
| 101 | |
| 102 | bgt $18,.Loop # repeat while limbs remain |
| 103 | # Tail: r4 is the least significant source limb; nothing is shifted in below it. |
| 104 | .Lend: sll $4,$19,$8 # r8 = lowest limb << cnt |
| 105 | stq $8,-8($16) # store it one limb below the last store (res_ptr[0]) |
| 106 | ret $31,($26),1 # return through $26 with the result in r0 |
| 107 | .end __mpn_lshift |