Merge tag 'trace-printf-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace...
[drm/drm-misc.git] / arch / powerpc / crypto / ghashp10-ppc.pl
blob27a6b0bec645f9ae4e457fcbdbe0bccda07dc71e
1 #!/usr/bin/env perl
2 # SPDX-License-Identifier: GPL-2.0
4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
5 # has relicensed it under the GPLv2. Therefore this program is free software;
6 # you can redistribute it and/or modify it under the terms of the GNU General
7 # Public License version 2 as published by the Free Software Foundation.
9 # The original headers, including the original license headers, are
10 # included below for completeness.
12 # ====================================================================
13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # project. The module is, however, dual licensed under OpenSSL and
15 # CRYPTOGAMS licenses depending on where you obtain it. For further
16 # details see https://www.openssl.org/~appro/cryptogams/.
17 # ====================================================================
19 # GHASH for PowerISA v2.07.
21 # July 2014
23 # Accurate performance measurements are problematic, because it's
24 # always virtualized setup with possibly throttled processor.
25 # Relative comparison is therefore more informative. This initial
26 # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
27 # faster than "4-bit" integer-only compiler-generated 64-bit code.
28 # "Initial version" means that there is room for further improvement.
# Command-line arguments: the perlasm flavour and the output file name.
# Both are handed on unchanged to ppc-xlate.pl further down.
$flavour=shift;
$output =shift;

# Select the word size and the matching store-with-update / load / store
# mnemonics from the flavour string; anything that is neither 64- nor
# 32-bit is rejected.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Locate the translator: first next to this script, then in ../../perlasm/
# relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator.
# The low-precedence "or" is required: with "||" the die() became part of
# the command-string expression (always true) and a failed open() went
# unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# Register map used by the assembly template below.
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
# Extra temporaries for gcm_init_htable; they alias (and therefore clobber)
# the $Hl/$H/$Hh registers.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);

# Assembly template.  Lines prefixed with "le?" / "be?" are endian-specific
# and are resolved when the template is printed.
#
# .gcm_init_p10     Loads H from r4, derives the "twisted" H and stores a
#                   table at r3: 0x00 the 0xc2 reduction constant, 0x10 H.lo,
#                   0x20 H, 0x30 H.hi.
# .gcm_init_htable  Same first four entries (note: H is not byte-permuted on
#                   little-endian here, unlike .gcm_init_p10), followed by
#                   H^2 at 0x40-0x60, H^3 at 0x70-0x90 and H^4 at 0xa0-0xc0.
# .gcm_gmult_p10    Loads Xi from $Xip and the table from $Htbl, multiplies
#                   and reduces in two phases, then writes Xi back.
# .gcm_ghash_p10    Like gmult, but first folds in input from $inp, consuming
#                   16 bytes per loop iteration until $len is exhausted.
#
# Every routine saves SPR 256 in $vrsave on entry, restores it on exit and
# returns with blr before its trailing .long/.byte table.
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p10
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p10,.-.gcm_init_p10

.globl	.gcm_init_htable
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90

	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	vpmsumd		$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	vpmsumd		$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	vpmsumd		$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vsldoi		$t4,$Xm1,$zero,8
	vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	vxor		$Xl1,$Xl1,$t4
	vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vpmsumd		$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	stvx_u		$H2l,r8,r3		# save H^4
	stvx_u		$H2,r9,r3
	stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_htable,.-.gcm_init_htable

.globl	.gcm_gmult_p10
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p10,.-.gcm_gmult_p10

.globl	.gcm_ghash_p10
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p10,.-.gcm_ghash_p10

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___
# Resolve the endian-specific instruction prefixes and hand the template,
# line by line, to the translator listening on STDOUT:
#   little-endian flavour (name ends in "le"): "le?" is stripped so the
#       instruction is kept, "be?" is turned into the comment "#be#";
#   any other flavour: "le?" becomes "#le#", "be?" is stripped.
# At most one substitution is applied per line.
my $little_endian = ($flavour =~ /le$/);	# same for every line, test once

foreach (split("\n",$code)) {
	if ($little_endian) {
		s/le\?//	or
		s/be\?/#be#/;
	} else {
		s/le\?/#le#/	or
		s/be\?//;
	}
	print $_,"\n";
}

# STDOUT is a pipe to the translator: close() flushes it and waits for the
# child, returning false if the write failed or the child exited non-zero.
# Fail loudly rather than leave a truncated or missing output file behind.
close STDOUT or die "error closing STDOUT: $! (child status $?)";