arch/arm/crypto/sha512-armv4.pl

   1 #!/usr/bin/env perl
   2 # SPDX-License-Identifier: GPL-2.0
   3
   4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
   5 # has relicensed it under the GPLv2. Therefore this program is free software;
   6 # you can redistribute it and/or modify it under the terms of the GNU General
   7 # Public License version 2 as published by the Free Software Foundation.
   8 #
   9 # The original headers, including the original license headers, are
  10 # included below for completeness.
  11
  12 # ====================================================================
  13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  14 # project. The module is, however, dual licensed under OpenSSL and
  15 # CRYPTOGAMS licenses depending on where you obtain it. For further
  16 # details see http://www.openssl.org/~appro/cryptogams/.
  17 # ====================================================================
  18
  19 # SHA512 block procedure for ARMv4. September 2007.
  20
  21 # This code is ~4.5 (four and a half) times faster than code generated
  22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
  23 # Xscale PXA250 core].
  24 #
  25 # July 2010.
  26 #
  27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
  28 # Cortex A8 core and ~40 cycles per processed byte.
  29
  30 # February 2011.
  31 #
  32 # Profiler-assisted and platform-specific optimization resulted in 7%
  33 # improvement on Cortex A8 core and ~38 cycles per byte.
  34
  35 # March 2011.
  36 #
  37 # Add NEON implementation. On Cortex A8 it was measured to process
  38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
  39
  40 # August 2012.
  41 #
  42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
  43 # terms it's 22.6 cycles per byte, which is a disappointing result.
  44 # Technical writers asserted that 3-way S4 pipeline can sustain
  45 # multiple NEON instructions per cycle, but dual NEON issue could
  46 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
  47 # for further details. As a side note, Cortex-A15 processes one byte in
  48 # 16 cycles.
  49
  50 # Byte order [in]dependence. =========================================
  51 #
  52 # Originally caller was expected to maintain specific *dword* order in
  53 # h[0-7], namely with most significant dword at *lower* address, which
  54 # was reflected in below two parameters as 0 and 4. Now caller is
  55 # expected to maintain native byte order for whole 64-bit values.
  56 $hi="HI";    # name of the cpp macro (0 or 4, set under __ARMEL__ in the emitted preamble) for the high 32-bit word offset
  57 $lo="LO";    # name of the cpp macro (0 or 4, set under __ARMEL__ in the emitted preamble) for the low 32-bit word offset
  58 # ====================================================================
  59
  60 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  61 open STDOUT,">$output";
  62
  63 $ctx="r0";      # parameter block
  64 $inp="r1";
  65 $len="r2";
  66
  67 $Tlo="r3";
  68 $Thi="r4";
  69 $Alo="r5";
  70 $Ahi="r6";
  71 $Elo="r7";
  72 $Ehi="r8";
  73 $t0="r9";
  74 $t1="r10";
  75 $t2="r11";
  76 $t3="r12";
  77 ############    r13 is stack pointer
  78 $Ktbl="r14";
  79 ############    r15 is program counter
  80
  81 $Aoff=8*0;
  82 $Boff=8*1;
  83 $Coff=8*2;
  84 $Doff=8*3;
  85 $Eoff=8*4;
  86 $Foff=8*5;
  87 $Goff=8*6;
  88 $Hoff=8*7;
  89 $Xoff=8*8;
  90
  91 sub BODY_00_15() {
     # Append the assembly for one SHA-512 round of the integer-only code
     # path to $code.
     #
     # Argument ($magic): the value compared against the low byte of K[i].lo.
     #   On a match the emitted code sets bit 0 of $Ktbl as an end-of-phase
     #   marker; the closing "tst $Ktbl,#1" leaves the flags for the caller's
     #   conditional branch.  The callers in this file pass 0x94 and 0x17,
     #   the low bytes of the 16th and of the 80th K512 entry.
     #
     # The emitted code expects the current message word in $Tlo/$Thi (it is
     # stored at [sp,#$Xoff]), a in $Alo/$Ahi and e in $Elo/$Ehi; b, c, d, f,
     # g and h are read from the stack frame.  Every round ends with
     # "sub sp,sp,#8", so the frame slides down by one 64-bit slot per round.
     #
     # NOTE(review): the empty prototype "()" declares a sub without
     # arguments although one is taken via shift; the calls in this file use
     # the "&BODY_00_15(...)" form, which does not apply prototypes.
  92 my $magic = shift;
  93 $code.=<<___;
  94         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
  95         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
  96         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
  97         mov     $t0,$Elo,lsr#14
  98         str     $Tlo,[sp,#$Xoff+0]
  99         mov     $t1,$Ehi,lsr#14
 100         str     $Thi,[sp,#$Xoff+4]
 101         eor     $t0,$t0,$Ehi,lsl#18
 102         ldr     $t2,[sp,#$Hoff+0]       @ h.lo
 103         eor     $t1,$t1,$Elo,lsl#18
 104         ldr     $t3,[sp,#$Hoff+4]       @ h.hi
 105         eor     $t0,$t0,$Elo,lsr#18
 106         eor     $t1,$t1,$Ehi,lsr#18
 107         eor     $t0,$t0,$Ehi,lsl#14
 108         eor     $t1,$t1,$Elo,lsl#14
 109         eor     $t0,$t0,$Ehi,lsr#9
 110         eor     $t1,$t1,$Elo,lsr#9
 111         eor     $t0,$t0,$Elo,lsl#23
 112         eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
 113         adds    $Tlo,$Tlo,$t0
 114         ldr     $t0,[sp,#$Foff+0]       @ f.lo
 115         adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
 116         ldr     $t1,[sp,#$Foff+4]       @ f.hi
 117         adds    $Tlo,$Tlo,$t2
 118         ldr     $t2,[sp,#$Goff+0]       @ g.lo
 119         adc     $Thi,$Thi,$t3           @ T += h
 120         ldr     $t3,[sp,#$Goff+4]       @ g.hi
 121
 122         eor     $t0,$t0,$t2
 123         str     $Elo,[sp,#$Eoff+0]
 124         eor     $t1,$t1,$t3
 125         str     $Ehi,[sp,#$Eoff+4]
 126         and     $t0,$t0,$Elo
 127         str     $Alo,[sp,#$Aoff+0]
 128         and     $t1,$t1,$Ehi
 129         str     $Ahi,[sp,#$Aoff+4]
 130         eor     $t0,$t0,$t2
 131         ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
 132         eor     $t1,$t1,$t3             @ Ch(e,f,g)
 133         ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi
 134
 135         adds    $Tlo,$Tlo,$t0
 136         ldr     $Elo,[sp,#$Doff+0]      @ d.lo
 137         adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
 138         ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
 139         adds    $Tlo,$Tlo,$t2
 140         and     $t0,$t2,#0xff
 141         adc     $Thi,$Thi,$t3           @ T += K[i]
 142         adds    $Elo,$Elo,$Tlo
 143         ldr     $t2,[sp,#$Boff+0]       @ b.lo
 144         adc     $Ehi,$Ehi,$Thi          @ d += T
 145         teq     $t0,#$magic
 146
 147         ldr     $t3,[sp,#$Coff+0]       @ c.lo
 148 #if __ARM_ARCH__>=7
 149         it      eq                      @ Thumb2 thing, sanity check in ARM
 150 #endif
 151         orreq   $Ktbl,$Ktbl,#1
 152         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 153         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 154         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
 155         mov     $t0,$Alo,lsr#28
 156         mov     $t1,$Ahi,lsr#28
 157         eor     $t0,$t0,$Ahi,lsl#4
 158         eor     $t1,$t1,$Alo,lsl#4
 159         eor     $t0,$t0,$Ahi,lsr#2
 160         eor     $t1,$t1,$Alo,lsr#2
 161         eor     $t0,$t0,$Alo,lsl#30
 162         eor     $t1,$t1,$Ahi,lsl#30
 163         eor     $t0,$t0,$Ahi,lsr#7
 164         eor     $t1,$t1,$Alo,lsr#7
 165         eor     $t0,$t0,$Alo,lsl#25
 166         eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
 167         adds    $Tlo,$Tlo,$t0
 168         and     $t0,$Alo,$t2
 169         adc     $Thi,$Thi,$t1           @ T += Sigma0(a)
 170
 171         ldr     $t1,[sp,#$Boff+4]       @ b.hi
 172         orr     $Alo,$Alo,$t2
 173         ldr     $t2,[sp,#$Coff+4]       @ c.hi
 174         and     $Alo,$Alo,$t3
 175         and     $t3,$Ahi,$t1
 176         orr     $Ahi,$Ahi,$t1
 177         orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
 178         and     $Ahi,$Ahi,$t2
 179         adds    $Alo,$Alo,$Tlo
 180         orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
 181         sub     sp,sp,#8
 182         adc     $Ahi,$Ahi,$Thi          @ h += T
 183         tst     $Ktbl,#1
 184         add     $Ktbl,$Ktbl,#8
 185 ___
 186 }
 187 $code=<<___;    # start of the generated text: cpp preamble, K512 table, integer-only entry point, input word load
 188 #ifndef __KERNEL__
 189 # include "arm_arch.h"
 190 # define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
 191 # define VFP_ABI_POP    vldmia  sp!,{d8-d15}
 192 #else
 193 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
 194 # define __ARM_MAX_ARCH__ 7
 195 # define VFP_ABI_PUSH
 196 # define VFP_ABI_POP
 197 #endif
 198
 199 #ifdef __ARMEL__
 200 # define LO 0
 201 # define HI 4
 202 # define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
 203 #else
 204 # define HI 0
 205 # define LO 4
 206 # define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
 207 #endif
 208
 209 .text
 210 #if __ARM_ARCH__<7
 211 .code   32
 212 #else
 213 .syntax unified
 214 # ifdef __thumb2__
 215 #  define adrl adr
 216 .thumb
 217 # else
 218 .code   32
 219 # endif
 220 #endif
 221
 222 .type   K512,%object
 223 .align  5
 224 K512:
 225 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
 226 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
 227 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
 228 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
 229 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
 230 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
 231 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
 232 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
 233 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
 234 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
 235 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
 236 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
 237 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
 238 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
 239 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
 240 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
 241 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
 242 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
 243 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
 244 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
 245 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
 246 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
 247 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
 248 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
 249 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
 250 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
 251 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
 252 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
 253 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
 254 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
 255 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
 256 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
 257 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
 258 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
 259 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
 260 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
 261 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
 262 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
 263 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
 264 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 265 .size   K512,.-K512
 266 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 267 .LOPENSSL_armcap:
 268 .word   OPENSSL_armcap_P-sha512_block_data_order
 269 .skip   32-4
 270 #else
 271 .skip   32
 272 #endif
 273
 274 .global sha512_block_data_order
 275 .type   sha512_block_data_order,%function
 276 sha512_block_data_order:
 277 .Lsha512_block_data_order:
 278 #if __ARM_ARCH__<7
 279         sub     r3,pc,#8                @ sha512_block_data_order
 280 #else
 281         adr     r3,.Lsha512_block_data_order
 282 #endif
 283 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 284         ldr     r12,.LOPENSSL_armcap
 285         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
 286         tst     r12,#1
 287         bne     .LNEON
 288 #endif
 289         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
 290         stmdb   sp!,{r4-r12,lr}
 291         sub     $Ktbl,r3,#672           @ K512
 292         sub     sp,sp,#9*8
 293
 294         ldr     $Elo,[$ctx,#$Eoff+$lo]
 295         ldr     $Ehi,[$ctx,#$Eoff+$hi]
 296         ldr     $t0, [$ctx,#$Goff+$lo]
 297         ldr     $t1, [$ctx,#$Goff+$hi]
 298         ldr     $t2, [$ctx,#$Hoff+$lo]
 299         ldr     $t3, [$ctx,#$Hoff+$hi]
 300 .Loop:
 301         str     $t0, [sp,#$Goff+0]
 302         str     $t1, [sp,#$Goff+4]
 303         str     $t2, [sp,#$Hoff+0]
 304         str     $t3, [sp,#$Hoff+4]
 305         ldr     $Alo,[$ctx,#$Aoff+$lo]
 306         ldr     $Ahi,[$ctx,#$Aoff+$hi]
 307         ldr     $Tlo,[$ctx,#$Boff+$lo]
 308         ldr     $Thi,[$ctx,#$Boff+$hi]
 309         ldr     $t0, [$ctx,#$Coff+$lo]
 310         ldr     $t1, [$ctx,#$Coff+$hi]
 311         ldr     $t2, [$ctx,#$Doff+$lo]
 312         ldr     $t3, [$ctx,#$Doff+$hi]
 313         str     $Tlo,[sp,#$Boff+0]
 314         str     $Thi,[sp,#$Boff+4]
 315         str     $t0, [sp,#$Coff+0]
 316         str     $t1, [sp,#$Coff+4]
 317         str     $t2, [sp,#$Doff+0]
 318         str     $t3, [sp,#$Doff+4]
 319         ldr     $Tlo,[$ctx,#$Foff+$lo]
 320         ldr     $Thi,[$ctx,#$Foff+$hi]
 321         str     $Tlo,[sp,#$Foff+0]
 322         str     $Thi,[sp,#$Foff+4]
 323
 324 .L00_15:
 325 #if __ARM_ARCH__<7
 326         ldrb    $Tlo,[$inp,#7]
 327         ldrb    $t0, [$inp,#6]
 328         ldrb    $t1, [$inp,#5]
 329         ldrb    $t2, [$inp,#4]
 330         ldrb    $Thi,[$inp,#3]
 331         ldrb    $t3, [$inp,#2]
 332         orr     $Tlo,$Tlo,$t0,lsl#8
 333         ldrb    $t0, [$inp,#1]
 334         orr     $Tlo,$Tlo,$t1,lsl#16
 335         ldrb    $t1, [$inp],#8
 336         orr     $Tlo,$Tlo,$t2,lsl#24
 337         orr     $Thi,$Thi,$t3,lsl#8
 338         orr     $Thi,$Thi,$t0,lsl#16
 339         orr     $Thi,$Thi,$t1,lsl#24
 340 #else
 341         ldr     $Tlo,[$inp,#4]
 342         ldr     $Thi,[$inp],#8
 343 #ifdef __ARMEL__
 344         rev     $Tlo,$Tlo
 345         rev     $Thi,$Thi
 346 #endif
 347 #endif
 348 ___
     # Notes on the template that ends above:
     #  - K512 holds 80 64-bit constants (40 WORD64 rows of two); WORD64 emits
     #    the two 32-bit halves in the order matching the LO/HI offsets chosen
     #    for the target byte order.
     #  - r3 receives the address of the function entry.  The table (80*8 =
     #    640 bytes) and the 32 bytes emitted after it lie directly in front
     #    of that label, hence "sub $Ktbl,r3,#672" to reach K512.
     #  - "sub sp,sp,#9*8" reserves nine 64-bit slots: a..h at $Aoff..$Hoff
     #    and the current message word at $Xoff.
     #  - .L00_15 loads one 64-bit big-endian input word into $Thi:$Tlo, byte
     #    by byte when __ARM_ARCH__<7, otherwise with two ldr (plus rev under
     #    __ARMEL__), and advances $inp by 8.
 349         &BODY_00_15(0x94);
     # The round body emitted by the call above belongs to the .L00_15 loop:
     # "beq .L00_15" below repeats it until the body has set bit 0 of $Ktbl
     # (the 0x94 match), after which the flag is cleared again with bic.
     # The .L16_79 code then builds the next message word in $Thi:$Tlo from
     # the sliding stack frame: sigma0 of the word at slot 16-1, sigma1 of the
     # word at slot 16-14, plus the words at slots 16-9 and 16.
     # The `...` expressions are offset arithmetic that is evaluated by the
     # post-processing step at the end of this script.
 350 $code.=<<___;
 351         tst     $Ktbl,#1
 352         beq     .L00_15
 353         ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
 354         ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
 355         bic     $Ktbl,$Ktbl,#1
 356 .L16_79:
 357         @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
 358         @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
 359         @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
 360         mov     $Tlo,$t0,lsr#1
 361         ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
 362         mov     $Thi,$t1,lsr#1
 363         ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
 364         eor     $Tlo,$Tlo,$t1,lsl#31
 365         eor     $Thi,$Thi,$t0,lsl#31
 366         eor     $Tlo,$Tlo,$t0,lsr#8
 367         eor     $Thi,$Thi,$t1,lsr#8
 368         eor     $Tlo,$Tlo,$t1,lsl#24
 369         eor     $Thi,$Thi,$t0,lsl#24
 370         eor     $Tlo,$Tlo,$t0,lsr#7
 371         eor     $Thi,$Thi,$t1,lsr#7
 372         eor     $Tlo,$Tlo,$t1,lsl#25
 373
 374         @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
 375         @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
 376         @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
 377         mov     $t0,$t2,lsr#19
 378         mov     $t1,$t3,lsr#19
 379         eor     $t0,$t0,$t3,lsl#13
 380         eor     $t1,$t1,$t2,lsl#13
 381         eor     $t0,$t0,$t3,lsr#29
 382         eor     $t1,$t1,$t2,lsr#29
 383         eor     $t0,$t0,$t2,lsl#3
 384         eor     $t1,$t1,$t3,lsl#3
 385         eor     $t0,$t0,$t2,lsr#6
 386         eor     $t1,$t1,$t3,lsr#6
 387         ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
 388         eor     $t0,$t0,$t3,lsl#26
 389
 390         ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
 391         adds    $Tlo,$Tlo,$t0
 392         ldr     $t0,[sp,#`$Xoff+8*16`+0]
 393         adc     $Thi,$Thi,$t1
 394
 395         ldr     $t1,[sp,#`$Xoff+8*16`+4]
 396         adds    $Tlo,$Tlo,$t2
 397         adc     $Thi,$Thi,$t3
 398         adds    $Tlo,$Tlo,$t0
 399         adc     $Thi,$Thi,$t1
 400 ___
 401         &BODY_00_15(0x17);
     # Second copy of the round body, this time inside the .L16_79 loop; 0x17
     # is the low byte of the last K512 entry.  The conditional instructions
     # that follow use the flags left by the "tst $Ktbl,#1" that ends the
     # round body: while the marker bit is clear they reload the schedule
     # word at slot 16-1 and branch back to .L16_79.
     #
     # After the last round the working variables are added to the context
     # in $ctx.  "add sp,sp,#640" undoes the 80 per-round "sub sp,sp,#8" and
     # "sub $Ktbl,$Ktbl,#640" rewinds the constant pointer (80*8 bytes)
     # before the next input block is processed.
     #
     # The "bx lr" in the pre-ARMv5 return path is rewritten into a literal
     # .word by the post-processing step at the end of this script.
 402 $code.=<<___;
 403 #if __ARM_ARCH__>=7
 404         ittt    eq                      @ Thumb2 thing, sanity check in ARM
 405 #endif
 406         ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
 407         ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
 408         beq     .L16_79
 409         bic     $Ktbl,$Ktbl,#1
 410
 411         ldr     $Tlo,[sp,#$Boff+0]
 412         ldr     $Thi,[sp,#$Boff+4]
 413         ldr     $t0, [$ctx,#$Aoff+$lo]
 414         ldr     $t1, [$ctx,#$Aoff+$hi]
 415         ldr     $t2, [$ctx,#$Boff+$lo]
 416         ldr     $t3, [$ctx,#$Boff+$hi]
 417         adds    $t0,$Alo,$t0
 418         str     $t0, [$ctx,#$Aoff+$lo]
 419         adc     $t1,$Ahi,$t1
 420         str     $t1, [$ctx,#$Aoff+$hi]
 421         adds    $t2,$Tlo,$t2
 422         str     $t2, [$ctx,#$Boff+$lo]
 423         adc     $t3,$Thi,$t3
 424         str     $t3, [$ctx,#$Boff+$hi]
 425
 426         ldr     $Alo,[sp,#$Coff+0]
 427         ldr     $Ahi,[sp,#$Coff+4]
 428         ldr     $Tlo,[sp,#$Doff+0]
 429         ldr     $Thi,[sp,#$Doff+4]
 430         ldr     $t0, [$ctx,#$Coff+$lo]
 431         ldr     $t1, [$ctx,#$Coff+$hi]
 432         ldr     $t2, [$ctx,#$Doff+$lo]
 433         ldr     $t3, [$ctx,#$Doff+$hi]
 434         adds    $t0,$Alo,$t0
 435         str     $t0, [$ctx,#$Coff+$lo]
 436         adc     $t1,$Ahi,$t1
 437         str     $t1, [$ctx,#$Coff+$hi]
 438         adds    $t2,$Tlo,$t2
 439         str     $t2, [$ctx,#$Doff+$lo]
 440         adc     $t3,$Thi,$t3
 441         str     $t3, [$ctx,#$Doff+$hi]
 442
 443         ldr     $Tlo,[sp,#$Foff+0]
 444         ldr     $Thi,[sp,#$Foff+4]
 445         ldr     $t0, [$ctx,#$Eoff+$lo]
 446         ldr     $t1, [$ctx,#$Eoff+$hi]
 447         ldr     $t2, [$ctx,#$Foff+$lo]
 448         ldr     $t3, [$ctx,#$Foff+$hi]
 449         adds    $Elo,$Elo,$t0
 450         str     $Elo,[$ctx,#$Eoff+$lo]
 451         adc     $Ehi,$Ehi,$t1
 452         str     $Ehi,[$ctx,#$Eoff+$hi]
 453         adds    $t2,$Tlo,$t2
 454         str     $t2, [$ctx,#$Foff+$lo]
 455         adc     $t3,$Thi,$t3
 456         str     $t3, [$ctx,#$Foff+$hi]
 457
 458         ldr     $Alo,[sp,#$Goff+0]
 459         ldr     $Ahi,[sp,#$Goff+4]
 460         ldr     $Tlo,[sp,#$Hoff+0]
 461         ldr     $Thi,[sp,#$Hoff+4]
 462         ldr     $t0, [$ctx,#$Goff+$lo]
 463         ldr     $t1, [$ctx,#$Goff+$hi]
 464         ldr     $t2, [$ctx,#$Hoff+$lo]
 465         ldr     $t3, [$ctx,#$Hoff+$hi]
 466         adds    $t0,$Alo,$t0
 467         str     $t0, [$ctx,#$Goff+$lo]
 468         adc     $t1,$Ahi,$t1
 469         str     $t1, [$ctx,#$Goff+$hi]
 470         adds    $t2,$Tlo,$t2
 471         str     $t2, [$ctx,#$Hoff+$lo]
 472         adc     $t3,$Thi,$t3
 473         str     $t3, [$ctx,#$Hoff+$hi]
 474
 475         add     sp,sp,#640
 476         sub     $Ktbl,$Ktbl,#640
 477
 478         teq     $inp,$len
 479         bne     .Loop
 480
 481         add     sp,sp,#8*9              @ destroy frame
 482 #if __ARM_ARCH__>=5
 483         ldmia   sp!,{r4-r12,pc}
 484 #else
 485         ldmia   sp!,{r4-r12,lr}
 486         tst     lr,#1
 487         moveq   pc,lr                   @ be binary compatible with V4, yet
 488         bx      lr                      @ interoperable with Thumb ISA:-)
 489 #endif
 490 .size   sha512_block_data_order,.-sha512_block_data_order
 491 ___
 492
 493 {
 494 my @Sigma0=(28,34,39);    # rotate counts of Sigma0
 495 my @Sigma1=(14,18,41);    # rotate counts of Sigma1
 496 my @sigma0=(1, 8, 7);     # sigma0: two rotate counts, then a plain shift count
 497 my @sigma1=(19,61,6);     # sigma1: two rotate counts, then a plain shift count
 498
 499 my $Ktbl="r3";            # lexical that shadows the outer $Ktbl inside this block
 500 my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch
 501
 502 my @X=map("d$_",(0..15));                                # message schedule words live in d0-d15
 503 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));     # working variables in d16-d23; $A..$H are assigned without "my"
 504
 505 sub NEON_00_15() {
     # Append the NEON code for one SHA-512 round to $code.
     # Arguments: the round number $i, followed by the eight register names
     # that currently play the roles a..h.
     #
     # The "#if $i<16" / "#if $i>0" lines inside the templates are part of
     # the emitted text: $i is interpolated here, but the condition itself is
     # left for the C preprocessor to evaluate when the output is assembled.
     #
     # Adding Maj to h is deferred (see the commented-out vadd that ends the
     # second template).  The following round performs it as
     # "vadd.i64 $a,$Maj"; that names the same register because the callers
     # rotate the register list by one position between rounds.
     #
     # NOTE(review): the "()" prototype does not match the arguments taken;
     # the calls in this file use the "&NEON_00_15(...)" form, which does not
     # apply prototypes.
 506 my $i=shift;
 507 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
 508 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));   # temps; $Maj is d30, the eighth name (d31) is dropped
 509
     # Opening instructions of the round.  Skipped for even rounds >= 16,
     # where NEON_16_79 has already emitted the equivalent instructions
     # interleaved with its message-schedule update.
 510 $code.=<<___ if ($i<16 || $i&1);
 511         vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
 512 #if $i<16
 513         vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
 514 #endif
 515         vshr.u64        $t1,$e,#@Sigma1[1]
 516 #if $i>0
 517          vadd.i64       $a,$Maj                 @ h+=Maj from the past
 518 #endif
 519         vshr.u64        $t2,$e,#@Sigma1[2]
 520 ___
     # Remainder of the round, emitted for every round.
 521 $code.=<<___;
 522         vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
 523         vsli.64         $t0,$e,#`64-@Sigma1[0]`
 524         vsli.64         $t1,$e,#`64-@Sigma1[1]`
 525         vmov            $Ch,$e
 526         vsli.64         $t2,$e,#`64-@Sigma1[2]`
 527 #if $i<16 && defined(__ARMEL__)
 528         vrev64.8        @X[$i],@X[$i]
 529 #endif
 530         veor            $t1,$t0
 531         vbsl            $Ch,$f,$g               @ Ch(e,f,g)
 532         vshr.u64        $t0,$a,#@Sigma0[0]
 533         veor            $t2,$t1                 @ Sigma1(e)
 534         vadd.i64        $T1,$Ch,$h
 535         vshr.u64        $t1,$a,#@Sigma0[1]
 536         vsli.64         $t0,$a,#`64-@Sigma0[0]`
 537         vadd.i64        $T1,$t2
 538         vshr.u64        $t2,$a,#@Sigma0[2]
 539         vadd.i64        $K,@X[$i%16]
 540         vsli.64         $t1,$a,#`64-@Sigma0[1]`
 541         veor            $Maj,$a,$b
 542         vsli.64         $t2,$a,#`64-@Sigma0[2]`
 543         veor            $h,$t0,$t1
 544         vadd.i64        $T1,$K
 545         vbsl            $Maj,$c,$b              @ Maj(a,b,c)
 546         veor            $h,$t2                  @ Sigma0(a)
 547         vadd.i64        $d,$T1
 548         vadd.i64        $Maj,$T1
 549         @ vadd.i64      $h,$Maj
 550 ___
 551 }
 552
 553 sub NEON_16_79() {
     # Append the NEON code for one round with index 16 or above to $code.
     # Takes the same arguments as NEON_00_15: the round number followed by
     # the eight register names for a..h.
     #
     # Odd rounds emit only the plain round body.  Even rounds first update
     # two message-schedule words at once (the schedule viewed as q0-q7),
     # interleaved with the opening instructions of the round body, and then
     # emit the rest of the round through NEON_00_15.
     #
     # NOTE(review): same "()" prototype mismatch as NEON_00_15; the calls in
     # this file use the "&" form.
 554 my $i=shift;
 555
 556 if ($i&1)       { &NEON_00_15($i,@_); return; }
 557
 558 # 2x-vectorized, therefore runs every 2nd round
 559 my @X=map("q$_",(0..7));                        # view @X as 128-bit vector
 560 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));     # temps
 561 my ($d0,$d1,$d2) = map("d$_",(24..26));         # temps from NEON_00_15
 562 my $e=@_[4];                                    # $e from NEON_00_15
 563 $i /= 2;        # index into the q-register view; NEON_00_15 is called with 2*$i below
 564 $code.=<<___;
 565         vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
 566         vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
 567          vadd.i64       @_[0],d30                       @ h+=Maj from the past
 568         vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
 569         vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
 570         vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
 571         vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
 572         veor            $s1,$t0
 573         vshr.u64        $t0,$s0,#@sigma0[0]
 574         veor            $s1,$t1                         @ sigma1(X[i+14])
 575         vshr.u64        $t1,$s0,#@sigma0[1]
 576         vadd.i64        @X[$i%8],$s1
 577         vshr.u64        $s1,$s0,#@sigma0[2]
 578         vsli.64         $t0,$s0,#`64-@sigma0[0]`
 579         vsli.64         $t1,$s0,#`64-@sigma0[1]`
 580         vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
 581         veor            $s1,$t0
 582         vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
 583         vadd.i64        @X[$i%8],$s0
 584         vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
 585         veor            $s1,$t1                         @ sigma0(X[i+1])
 586         vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
 587         vadd.i64        @X[$i%8],$s1
 588 ___
 589         &NEON_00_15(2*$i,@_);
 590 }
 591
 592 $code.=<<___;    # prologue of sha512_block_data_order_neon (also reached through the .LNEON label)
 593 #if __ARM_MAX_ARCH__>=7
 594 .arch   armv7-a
 595 .fpu    neon
 596
 597 .global sha512_block_data_order_neon
 598 .type   sha512_block_data_order_neon,%function
 599 .align  4
 600 sha512_block_data_order_neon:
 601 .LNEON:
 602         dmb                             @ errata #451034 on early Cortex A8
 603         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
 604         VFP_ABI_PUSH
 605         adrl    $Ktbl,K512
 606         vldmia  $ctx,{$A-$H}            @ load context
 607 .Loop_neon:
 608 ___
     # Rounds 0..15, fully unrolled.  @V is rotated by one position after each
     # round so that the register roles shift; $i is not declared with "my"
     # and keeps its value for the loop further down.
 609 for($i=0;$i<16;$i++)    { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
 610 $code.=<<___;
 611         mov             $cnt,#4
 612 .L16_79_neon:
 613         subs            $cnt,#1
 614 ___
     # Sixteen more unrolled rounds ($i = 16..31) form the body of the
     # .L16_79_neon loop, which the emitted counter runs four times.
 615 for(;$i<32;$i++)        { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
     # Epilogue: apply the last deferred Maj addition, add the previous
     # context (loaded into d24-d31) to the working variables, store the
     # result and rewind $Ktbl by 640 bytes.  "ret" is turned into "bx lr" by
     # the post-processing step at the end of this script.
 616 $code.=<<___;
 617         bne             .L16_79_neon
 618
 619          vadd.i64       $A,d30          @ h+=Maj from the past
 620         vldmia          $ctx,{d24-d31}  @ load context to temp
 621         vadd.i64        q8,q12          @ vectorized accumulate
 622         vadd.i64        q9,q13
 623         vadd.i64        q10,q14
 624         vadd.i64        q11,q15
 625         vstmia          $ctx,{$A-$H}    @ save context
 626         teq             $inp,$len
 627         sub             $Ktbl,#640      @ rewind K512
 628         bne             .Loop_neon
 629
 630         VFP_ABI_POP
 631         ret                             @ bx lr
 632 .size   sha512_block_data_order_neon,.-sha512_block_data_order_neon
 633 #endif
 634 ___
 635 }
 636 $code.=<<___;    # trailer: identification string, alignment, and the OPENSSL_armcap_P common symbol for non-kernel builds
 637 .asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 638 .align  2
 639 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 640 .comm   OPENSSL_armcap_P,4,4
 641 #endif
 642 ___
 643
 644 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 645 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 646 $code =~ s/\bret\b/bx   lr/gm;
 647
 648 open SELF,$0;
 649 while(<SELF>) {
 650         next if (/^#!/);
 651         last if (!s/^#/@/ and !/^$/);
 652         print;
 653 }
 654 close SELF;
 655
 656 print $code;
 657 close STDOUT; # enforce flush