// third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S
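//
// GHASH for ARMv8 Crypto Extensions: carry-less multiplication in
// GF(2^128) via PMULL/PMULL2, as used by AES-GCM (CRYPTOGAMS; see the
// .byte string at the end of the file).
//
// The entry points are called from C. As a rough sketch of the calling
// contract (prototypes paraphrased from BoringSSL's GCM code, not
// verified against this exact revision):
//
//	void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
//	void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
//	void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
//	                  const uint8_t *inp, size_t len);
//
// Arguments arrive in x0..x3 in that order; len is in bytes and is a
// multiple of the 16-byte block size.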
#if defined(__aarch64__)
#include <openssl/arm_arch.h>

.text
#if !defined(__clang__)
.arch	armv8-a+crypto
#endif
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]
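	//The block above computes the "twisted" H: in effect H is shifted
	//left by one bit across the full 128-bit value, and if the bit
	//shifted out was set, the 0xc2...01 constant (the GHASH polynomial
	//x^128+x^7+x^2+x+1 in bit-reflected form) is XORed in. The twist
	//lets pmull's carry-less products line up with GCM's bit-reflected
	//convention without reflecting every input block.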

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d
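	//Karatsuba, one level, in GF(2)[x] where addition is XOR: for
	//a = a1*x^64 ^ a0 and b = b1*x^64 ^ b0,
	//  a*b = a1*b1*x^128 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 ^ a0*b0
	//so three 64x64 pmulls suffice: v0 = lo*lo, v2 = hi*hi and
	//v1 = (hi^lo)*(hi^lo); the middle term is recovered just below.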

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b
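	//The two "phases" fold the 256-bit product in v2:v0 back to 128
	//bits modulo the GHASH polynomial: each phase multiplies the low
	//64 bits of the running value by the 0xc2... constant in v19 and
	//XORs the result into the upper half, so after two folds nothing
	//remains below bit 128. v22 now holds the twisted H^2.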

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]	//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
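//gcm_gmult_v8 multiplies one 16-byte hash value by H: Xi = Xi*H in
//GF(2^128). x0 points at Xi (updated in place), x1 at the Htable set up
//by gcm_init_v8; only Htable[0] (twisted H) and Htable[1] (the packed
//Karatsuba half-sums) are read here.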
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
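//gcm_ghash_v8 absorbs len bytes of input into the running hash: for
//each 16-byte block, Xi = (Xi ^ inp[i])*H. The main loop processes two
//blocks per iteration by aggregating powers of H,
//	Xi = ((Xi ^ I[i])*H^2) ^ (I[i+1]*H),
//which is algebraically identical to two single-block steps but keeps
//both pmull products in flight at once.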
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
					//"[rotated]" means that
					//loaded value would have
					//to be rotated in order to
					//make it appear as in
					//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16			//x12 is used as post-
					//increment for input pointer;
					//as loop is modulo-scheduled
					//x12 is zeroed just in time
					//to preclude overstepping
					//inp[len], which means that
					//last block[s] are actually
					//loaded twice, but last
					//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8
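	//The loop is modulo-scheduled: the H·I[i+1] products for the next
	//iteration (v4, v6 and the v17 half-sum) are started above and at
	//the bottom of the loop body, so each pass reduces the previous
	//block pair while the multiplies for the next pair are already in
	//flight; the leftover partials are re-combined after loop exit.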

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo		//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there were at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
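	//Single trailing block: one more Xi = (Xi ^ inp)*H step, using the
	//plain (non-aggregated) Karatsuba multiply below.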
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
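//The .byte sequence below spells out, in ASCII, the NUL-terminated
//string "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>".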
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif