2 # SPDX-License-Identifier: GPL-2.0
4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
5 # has relicensed it under the GPLv2. Therefore this program is free software;
6 # you can redistribute it and/or modify it under the terms of the GNU General
7 # Public License version 2 as published by the Free Software Foundation.
9 # The original headers, including the original license headers, are
10 # included below for completeness.
12 # ====================================================================
13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # project. The module is, however, dual licensed under OpenSSL and
15 # CRYPTOGAMS licenses depending on where you obtain it. For further
16 # details see https://www.openssl.org/~appro/cryptogams/.
17 # ====================================================================
19 # GHASH for PowerISA v2.07.
23 # Accurate performance measurements are problematic, because it's
24 # always virtualized setup with possibly throttled processor.
25 # Relative comparison is therefore more informative. This initial
26 # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
27 # faster than "4-bit" integer-only compiler-generated 64-bit code.
28 # "Initial version" means that there is room for further improvement.
# Dispatch on $flavour: one branch for names containing "64", one for names
# containing "32" (the branch bodies are not part of this excerpt); any other
# flavour aborts the script.
33 if ($flavour =~ /64/) {
39 } elsif ($flavour =~ /32/) {
45 } else { die "nonsense $flavour"; }
# Locate the perlasm translator: look next to this script first, then in the
# shared perlasm directory two levels up.  $dir keeps the trailing path
# separator captured from $0, so it can be prepended directly.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.  The check
# must use low-precedence "or": with "||" the die was attached to the (always
# true) command string, so a failed open went unreported.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# Symbolic names for the registers used by the assembly text below.

# General-purpose registers r3..r6 carry the argument block.
my ($Xip, $Htbl, $inp, $len) = map { "r$_" } 3 .. 6;

# Vector registers v0..v19.
my ($Xl, $Xm, $Xh, $IN) = map { "v$_" } 0 .. 3;
my ($zero, $t0, $t1, $t2, $xC2, $H, $Hh, $Hl, $lemask) = map { "v$_" } 4 .. 12;
my ($Xl1, $Xm1, $Xh1, $IN1, $H2, $H2h, $H2l) = map { "v$_" } 13 .. 19;

# Three more temporaries that share registers with $Hl, $H and $Hh.
my ($t4, $t5, $t6) = ($Hl, $H, $Hh);
# gcm_init_p10 (name taken from the .size directive at the end; the entry
# label and several instructions are not part of this excerpt).  Per the
# inline notes it loads H, builds the 0xc2....01 constant, derives the
# "twisted" H, splits it into halves and stores the pre-computed table.
74 lvx_u
$H,0,r4
# load H
76 le?addi r7
,r7
,0x8 # need a vperm start with 08
79 le?vxor
5,5,6 # set a b-endian mask
# Build the 0xc2....01 constant step by step (intermediate values are noted
# on each instruction).
82 vspltisb
$xC2,-16 # 0xf0
84 vaddubm
$xC2,$xC2,$xC2 # 0xe0
85 vxor
$zero,$zero,$zero
86 vor
$xC2,$xC2,$t0 # 0xe1
87 vsldoi
$xC2,$xC2,$zero,15 # 0xe1...
88 vsldoi
$t1,$zero,$t0,1 # ...1
89 vaddubm
$xC2,$xC2,$xC2 # 0xc2...
91 vor
$xC2,$xC2,$t1 # 0xc2....01
# Twist H using the broadcast carry bit, then split it into low/high halves.
92 vspltb
$t1,$H,0 # most significant byte
94 vsrab
$t1,$t1,$t2 # broadcast carry bit
96 vxor
$H,$H,$t1 # twisted H
98 vsldoi
$H,$H,$H,8 # twist even more ...
99 vsldoi
$xC2,$zero,$xC2,8 # 0xc2.0
100 vsldoi
$Hl,$zero,$H,8 # ... and split
101 vsldoi
$Hh,$H,$zero,8
103 stvx_u
$xC2,0,r3
# save pre-computed table
111 .byte
0,12,0x14,0,0,0,2,0
113 .size
.gcm_init_p10
,.-.gcm_init_p10
# gcm_init_htable: like the routine above it loads H, builds the 0xc2....01
# constant and twists/splits H, then (per the inline notes) also computes and
# stores H^2, H^3 and H^4.  Several instructions are not part of this excerpt.
115 .globl
.gcm_init_htable
122 lvx_u
$H,0,r4
# load H
# Build the 0xc2....01 constant step by step.
124 vspltisb
$xC2,-16 # 0xf0
126 vaddubm
$xC2,$xC2,$xC2 # 0xe0
127 vxor
$zero,$zero,$zero
128 vor
$xC2,$xC2,$t0 # 0xe1
129 vsldoi
$xC2,$xC2,$zero,15 # 0xe1...
130 vsldoi
$t1,$zero,$t0,1 # ...1
131 vaddubm
$xC2,$xC2,$xC2 # 0xc2...
133 vor
$xC2,$xC2,$t1 # 0xc2....01
# Shift H left by one, fold in the broadcast carry bit, then split into halves.
134 vspltb
$t1,$H,0 # most significant byte
135 vsl
$H,$H,$t0 # H<<=1
136 vsrab
$t1,$t1,$t2 # broadcast carry bit
138 vxor
$IN,$H,$t1 # twisted H
140 vsldoi
$H,$IN,$IN,8 # twist even more ...
141 vsldoi
$xC2,$zero,$xC2,8 # 0xc2.0
142 vsldoi
$Hl,$zero,$H,8 # ... and split
143 vsldoi
$Hh,$H,$zero,8
145 stvx_u
$xC2,0,r3
# save pre-computed table
# Multiply H by itself (partial products as noted on each instruction), then
# reduce in two phases; the result is stored as H^2.
153 vpmsumd
$Xl,$IN,$Hl # H.lo·H.lo
154 vpmsumd
$Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
155 vpmsumd
$Xh,$IN,$Hh # H.hi·H.hi
157 vpmsumd
$t2,$Xl,$xC2 # 1st reduction phase
159 vsldoi
$t0,$Xm,$zero,8
160 vsldoi
$t1,$zero,$Xm,8
167 vsldoi
$t1,$Xl,$Xl,8 # 2nd reduction phase
172 vsldoi
$H2,$IN1,$IN1,8
173 vsldoi
$H2l,$zero,$H2,8
174 vsldoi
$H2h,$H2,$zero,8
176 stvx_u
$H2l,r8
,r3
# save H^2
# Two multiplications are interleaved from here on: H by H^2 and H^2 by H^2
# (see the inline notes); their reduced results are stored as H^3 and H^4.
183 vpmsumd
$Xl,$IN,$H2l # H.lo·H^2.lo
184 vpmsumd
$Xl1,$IN1,$H2l # H^2.lo·H^2.lo
185 vpmsumd
$Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
186 vpmsumd
$Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
187 vpmsumd
$Xh,$IN,$H2h # H.hi·H^2.hi
188 vpmsumd
$Xh1,$IN1,$H2h # H^2.hi·H^2.hi
190 vpmsumd
$t2,$Xl,$xC2 # 1st reduction phase
191 vpmsumd
$t6,$Xl1,$xC2 # 1st reduction phase
193 vsldoi
$t0,$Xm,$zero,8
194 vsldoi
$t1,$zero,$Xm,8
195 vsldoi
$t4,$Xm1,$zero,8
196 vsldoi
$t5,$zero,$Xm1,8
203 vsldoi
$Xl1,$Xl1,$Xl1,8
207 vsldoi
$t1,$Xl,$Xl,8 # 2nd reduction phase
208 vsldoi
$t5,$Xl1,$Xl1,8 # 2nd reduction phase
210 vpmsumd
$Xl1,$Xl1,$xC2
217 vsldoi
$H2,$Xl1,$Xl1,8
218 vsldoi
$Hl,$zero,$H,8
219 vsldoi
$Hh,$H,$zero,8
220 vsldoi
$H2l,$zero,$H2,8
221 vsldoi
$H2h,$H2,$zero,8
223 stvx_u
$Hl,r8
,r3
# save H^3
229 stvx_u
$H2l,r8
,r3
# save H^4
236 .byte
0,12,0x14,0,0,0,2,0
238 .size
.gcm_init_htable
,.-.gcm_init_htable
# gcm_gmult_p10: loads Xi and the pre-computed table, multiplies Xi by H
# (three partial products plus a two-phase reduction, per the inline notes)
# and writes the result back to Xi.  Little-endian builds permute Xi on load
# and on store.  Several instructions are not part of this excerpt.
240 .globl
.gcm_gmult_p10
247 lvx_u
$IN,0,$Xip # load Xi
249 lvx_u
$Hl,r8
,$Htbl # load pre-computed table
250 le?lvsl
$lemask,r0
,r0
254 le?vxor
$lemask,$lemask,$t0
256 le?vperm
$IN,$IN,$IN,$lemask
257 vxor
$zero,$zero,$zero
# Partial products of Xi and H.
259 vpmsumd
$Xl,$IN,$Hl # H.lo·Xi.lo
260 vpmsumd
$Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
261 vpmsumd
$Xh,$IN,$Hh # H.hi·Xi.hi
263 vpmsumd
$t2,$Xl,$xC2 # 1st phase
265 vsldoi
$t0,$Xm,$zero,8
266 vsldoi
$t1,$zero,$Xm,8
273 vsldoi
$t1,$Xl,$Xl,8 # 2nd phase
278 le?vperm
$Xl,$Xl,$Xl,$lemask
279 stvx_u
$Xl,0,$Xip # write out Xi
284 .byte
0,12,0x14,0,0,0,2,0
286 .size
.gcm_gmult_p10
,.-.gcm_gmult_p10
# gcm_ghash_p10: loads Xi and the pre-computed table, then repeats the
# multiply-and-reduce sequence in a loop (see the branch back to Loop, taken
# until the length counter borrows) and finally writes Xi back.  Little-endian
# builds permute the vectors on load and on store.  Several instructions,
# including the Loop label itself, are not part of this excerpt.
288 .globl
.gcm_ghash_p10
295 lvx_u
$Xl,0,$Xip # load Xi
297 lvx_u
$Hl,r8
,$Htbl # load pre-computed table
298 le?lvsl
$lemask,r0
,r0
302 le?vxor
$lemask,$lemask,$t0
304 le?vperm
$Xl,$Xl,$Xl,$lemask
305 vxor
$zero,$zero,$zero
310 le?vperm
$IN,$IN,$IN,$lemask
# Partial products, interleaved with the borrow bookkeeping for the loop.
317 vpmsumd
$Xl,$IN,$Hl # H.lo·Xi.lo
318 subfe
. r0
,r0
,r0
# borrow?-1:0
319 vpmsumd
$Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
321 vpmsumd
$Xh,$IN,$Hh # H.hi·Xi.hi
324 vpmsumd
$t2,$Xl,$xC2 # 1st phase
326 vsldoi
$t0,$Xm,$zero,8
327 vsldoi
$t1,$zero,$Xm,8
336 vsldoi
$t1,$Xl,$Xl,8 # 2nd phase
338 le?vperm
$IN,$IN,$IN,$lemask
342 beq Loop
# did $len-=16 borrow?
345 le?vperm
$Xl,$Xl,$Xl,$lemask
346 stvx_u
$Xl,0,$Xip # write out Xi
351 .byte
0,12,0x14,0,0,0,4,0
353 .size
.gcm_ghash_p10
,.-.gcm_ghash_p10
355 .asciz
"GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
# Walk the accumulated assembly text in $code one line at a time.  Flavours
# whose name ends in "le" take the little-endian branch; the branch bodies
# and the end of the loop are not part of this excerpt.
359 foreach (split("\n",$code)) {
360 if ($flavour =~ /le$/o) { # little-endian
# STDOUT is a pipe to the translator, so closing it flushes the generated
# code and waits for the child.  A false return means the flush failed or the
# translator exited non-zero; report it instead of exiting successfully.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush