#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
| 8 | |
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;
| 36 | |
# The two positional arguments are forwarded, in this order, to
# arm-xlate.pl on the command line built below.
$flavour=shift;
$output=shift;

# Look for arm-xlate.pl next to this script first, then under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail right away if the pipe cannot be set up instead of
# printing into a dead handle and only noticing at the final close.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Argument registers: poly1305_blocks takes (ctx,inp,len,padbit);
# poly1305_emit reuses the second and third as (mac,nonce).
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

# Scratch registers x4-x14 used by the scalar code.
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
| 52 | |
# First chunk of generated assembly: symbol declarations followed by the
# scalar (base 2^64) routines poly1305_init, poly1305_blocks and
# poly1305_emit.
$code.=<<___;
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

// poly1305_init: x0 = context, x1 = key pointer, x2 = two-slot function table.
// Zeroes bytes 0..31 of the context (hash value plus the is_base2_26 flag).
// With a null key pointer it returns 0 right after that. Otherwise it clamps
// the 16 key bytes, saves them at context+32, writes the blocks/emit entry
// points selected by the ARMV7_NEON capability bit into the table at x2
// (as 32-bit values under __ILP32__) and returns 1.
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	// fetch the stored offset to OPENSSL_armcap_P, then the label address
#ifdef __ILP32__
	ldrsw	$t1,.LOPENSSL_armcap_P
#else
	ldr	$t1,.LOPENSSL_armcap_P
#endif
	adr	$t0,.LOPENSSL_armcap_P

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
	ldr	w17,[$t0,$t1]		// capability word, tested below
#ifdef __ARMEB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	stp	$r0,$r1,[$ctx,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	$d0,poly1305_blocks
	adr	$r0,poly1305_blocks_neon
	adr	$d1,poly1305_emit
	adr	$r1,poly1305_emit_neon

	// bit clear (eq) keeps the scalar pair, bit set picks the NEON pair
	csel	$d0,$d0,$r0,eq
	csel	$d1,$d1,$r1,eq

#ifdef __ILP32__
	stp	w12,w13,[$len]
#else
	stp	$d0,$d1,[$len]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init
| 118 | |
// poly1305_blocks: x0 = context, x1 = input, x2 = length, x3 = padbit.
// The length is rounded down to a multiple of 16; nothing is done if that
// leaves zero. Each 16-byte block is added to the hash h0:h1:h2 (padbit goes
// into the top limb), the sum is multiplied by the key r0:r1 and folded back
// so that only the two lowest bits of the top limb carry over. The hash is
// read from and written back to context+0..23, the key is read at context+32.
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$r0,$r1,[$ctx,#32]	// load key value
	ldr	$h2,[$ctx,#16]
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef __ARMEB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	// bits 2 and up of d2 are folded back in multiplied by 5:
	// (d2 with low two bits cleared) + (d2 >> 2) == 5*(d2 >> 2)
	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	str	$h2,[$ctx,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
| 182 | |
// poly1305_emit: x0 = context, x1 = output (mac), x2 = nonce.
// Loads the base 2^64 hash from context+0..23, computes hash+5 and keeps its
// low 128 bits instead of the hash when the sum reaches bit 130 or above,
// then adds the 128-bit nonce (carry out of bit 127 is dropped) and stores
// the 16-byte result at x1.
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldr	$h2,[$ctx,#16]
	ldp	$t0,$t1,[$nonce]	// load nonce

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef __ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef __ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# NEON register allocation for the vector code path.
#   v0-v8    key-power limbs R0..R4 and their S1..S4 companions (.4s views)
#   v9-v13   limbs of input blocks 0 and 1 (.2s views)
#   v14-v18  limbs of input blocks 2 and 3 (.2s views)
#   v19-v23  64-bit accumulators (.2d views)
#   v24-v28  hash limbs (.2s views)
#   v29-v31  two temporaries and the limb mask (no arrangement suffix)
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) =
    map { sprintf("v%d.4s",$_) } 0..8;
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) =
    map { sprintf("v%d.2s",$_) } 9..13;
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) =
    map { sprintf("v%d.2s",$_) } 14..18;
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) =
    map { sprintf("v%d.2d",$_) } 19..23;
my ($H0,$H1,$H2,$H3,$H4) =
    map { sprintf("v%d.2s",$_) } 24..28;
my ($T0,$T1,$MASK) =
    map { sprintf("v%d",$_) } 29..31;

# General-purpose registers used only by the NEON path.
my $in2   = "x16";
my $zeros = "x17";
my $is_base2_26 = $zeros;	# borrow
| 223 | |
# Second chunk of generated assembly: the NEON code path and its helpers.
$code.=<<___;
// poly1305_mult: local helper, register-only, reached with bl.
// Multiplies the value in h0:h1:h2 by the key r0:r1 (s1 must already hold
// r1 + (r1 >> 2)) and folds the result back, leaving it in h0:h1:h2 with
// only the two lowest bits of the top limb carried over.
// Overwrites t0, t1, d0, d1 and d2; does not touch memory.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
| 261 | |
// poly1305_splat: local helper, reached with bl.
// Splits the value in h0:h1:h2 into five base 2^26 limbs and stores nine
// 32-bit words at a 16-byte stride starting at x0, in the order
// r0, r1, s1, r2, s2, r3, s3, r4, s4, where each s value is 5 times the
// matching r limb. Overwrites x12-x16; h0:h1:h2 are left untouched.
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat
| 288 | |
// poly1305_blocks_neon: x0 = context, x1 = input, x2 = length, x3 = padbit.
// While the hash is still stored in base 2^64 (is_base2_26 at context+24 is
// zero) inputs shorter than 128 bytes are passed on to poly1305_blocks.
// Otherwise the hash is kept as five 26-bit limbs in 32-bit words at
// context+0..19 and the main loop consumes 64 bytes per iteration using the
// table of key powers that starts at context+48.
// An odd 16-byte block at the front is always processed with the scalar
// poly1305_mult helper first.
// Stack frame: 80 bytes holding x29/x30 at the bottom and d8-d15 above.
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.hs	.Lblocks_neon
	cbz	$is_base2_26,poly1305_blocks

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	$len,$len,#-16
	b.eq	.Lno_data_neon

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	// odd number of 16-byte blocks: handle the first one in base 2^64
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

	and	$t0,$d2,#-4		// ... so reduce
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef __ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// reload the return address saved in the frame

	// a zero padbit sends the hash back to base 2^64 storage
	cbz	$padbit,.Lstore_base2_64_neon

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cbnz	$len,.Leven_neon

	stp	w10,w11,[$ctx]		// store hash value base 2^26
	stp	w12,w13,[$ctx,#8]
	str	w14,[$ctx,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	// hash is still in base 2^64: optionally absorb one odd block, then
	// convert the hash to base 2^26 and build the table of key powers
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	// each power is written 4 bytes below the previous one, so r^4 ends
	// up in the first 32-bit word of every 16-byte table entry
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	mov	x4,#1
	str	x4,[$ctx,#-24]		// set is_base2_26
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	// hash limbs are already in x10-x14; the second input pointer falls
	// back to the zero block when fewer than 64 bytes remain
	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[$in2],#48

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	// flags still come from the subs of 64 above: skip the loop when no
	// more than 64 bytes were left at that point
	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	// \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	// \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	// flags are those of the subs at the top of this iteration
	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	// flags are still those of the adds before .Long_tail
	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	st1	{$ACC4}[0],[$ctx]

.Lno_data_neon:
	ldr	x29,[sp],#80		// restore x29 and release the 80-byte frame
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon
| 871 | |
// poly1305_emit_neon: x0 = context, x1 = output (mac), x2 = nonce.
// Branches to poly1305_emit when is_base2_26 (context+24) is zero. Otherwise
// it reads the five 32-bit base 2^26 limbs at context+0..19, rebuilds the
// base 2^64 value, folds the bits above the two lowest ones of the top limb
// back in, and then finishes with the same hash+5 selection and nonce
// addition as poly1305_emit.
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cbz	$is_base2_26,poly1305_emit

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$h2,$h2,xzr		// can be partially reduced...

	ldp	$t0,$t1,[$nonce]	// load nonce

	and	$d0,$h2,#-4		// ... so reduce
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef __ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef __ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon
| 923 | |
.align	5
// 32 zero bytes. poly1305_blocks_neon points its second input pointer here
// when fewer than 64 bytes remain, so the inp[2:3] loads read zeros.
.Lzeros:
.long	0,0,0,0,0,0,0,0
// Distance from this label to OPENSSL_armcap_P (32-bit under __ILP32__,
// 64-bit otherwise). poly1305_init adds it to the address of this label to
// load the capability word.
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
| 936 | |
# Post-process the generated assembly line by line before it is printed
# into the translator pipe.  The placeholder register arrangements used in
# the templates above are rewritten to what each instruction needs.  Only
# the first matching rule of the chain is applied to a line; the lane-index
# fix-up afterwards is attempted on every line.
for my $line (split("\n",$code)) {
	if ($line =~ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/) {
		# shrn destination rewritten to .2s
	}
	elsif ($line =~ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/) {
		# fmov vN.<arr>,xM rewritten to fmov dN,xM
	}
	elsif ($line =~ m/\bdup\b/) {
		$line =~ s/\.[24]s/.2d/g;
	}
	elsif ($line =~ m/\b(eor|and)/) {
		$line =~ s/\.[248][sdh]/.16b/g;
	}
	elsif ($line =~ m/\bum(ul|la)l\b/) {
		$line =~ s/\.4s/.2s/g;
	}
	elsif ($line =~ m/\bum(ul|la)l2\b/) {
		$line =~ s/\.2s/.4s/g;
	}
	elsif ($line =~ m/\bst[1-4]\s+{[^}]+}\[/) {
		$line =~ s/\.[24]d/.s/g;
	}

	# drop the lane count in front of an element index: .4s[ becomes .s[
	$line =~ s/\.[124]([sd])\[/.$1\[/;

	print $line,"\n";
}
close STDOUT or die "error closing STDOUT: $!";