io_uring: ensure finish_wait() is always called in __io_uring_task_cancel()
[linux/fpc-iii.git] / arch / powerpc / lib / copyuser_64.S
blobdb8719a14846da9897bac1f21a3430684f15d4bb
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
4  */
5 #include <asm/processor.h>
6 #include <asm/ppc_asm.h>
7 #include <asm/export.h>
8 #include <asm/asm-compat.h>
9 #include <asm/feature-fixups.h>
/*
 * SELFTEST_CASE selects which CPU variant the test_feature assignments
 * further down in this file evaluate true for.  It defaults to 0 unless
 * the build has already defined it.
 */
11 #ifndef SELFTEST_CASE
12 /* 0 == most CPUs, 1 == POWER6, 2 == Cell */
13 #define SELFTEST_CASE   0
14 #endif
/*
 * Endian-neutral names for the doubleword shifts: sLd moves data towards
 * the lower-addressed byte and sHd towards the higher-addressed byte,
 * so they map onto sld/srd one way on big-endian and the other way on
 * little-endian.
 */
16 #ifdef __BIG_ENDIAN__
17 #define sLd sld         /* Shift towards low-numbered address. */
18 #define sHd srd         /* Shift towards high-numbered address. */
19 #else
20 #define sLd srd         /* Shift towards low-numbered address. */
21 #define sHd sld         /* Shift towards high-numbered address. */
22 #endif
24 /*
25  * These macros are used to generate exception table entries.
26  * The exception handlers below use the original arguments
27  * (stored on the stack) and the point where we're up to in
28  * the destination buffer, i.e. the address of the first
29  * unmodified byte.  Generally r3 points into the destination
30  * buffer, but the first unmodified byte is at a variable
31  * offset from r3.  In the code below, the symbol r3_offset
32  * is set to indicate the current offset at each point in
33  * the code.  This offset is then used as a negative offset
34  * from the exception handler code, and those instructions
35  * before the exception handlers are addi instructions that
36  * adjust r3 to point to the correct place.
37  */
/*
 * lex: place an exception-table entry at the current location, i.e. for
 * the instruction that follows the macro on the same line (a load).
 * The fixup address is .Lld_exc minus the current value of the
 * assembler symbol r3_offset.
 */
38         .macro  lex             /* exception handler for load */
39 100:    EX_TABLE(100b, .Lld_exc - r3_offset)
40         .endm
/*
 * stex: place an exception-table entry at the current location, i.e. for
 * the instruction that follows the macro on the same line (a store).
 * The fixup address is .Lst_exc minus the current value of the
 * assembler symbol r3_offset.
 */
42         .macro  stex            /* exception handler for store */
43 100:    EX_TABLE(100b, .Lst_exc - r3_offset)
44         .endm
/*
 * __copy_tofrom_user: exported entry point (see EXPORT_SYMBOL at the end
 * of the file).  Under CONFIG_PPC_BOOK3S_64 its first instruction is a
 * feature-section alternative: either a nop, which falls through into
 * __copy_tofrom_user_base, or a branch to __copy_tofrom_user_power7.
 * NOTE(review): which alternative is live depends on CPU_FTR_VMX_COPY and
 * on the feature-fixup macros, which are defined outside this file.
 */
46         .align  7
47 _GLOBAL_TOC(__copy_tofrom_user)
48 #ifdef CONFIG_PPC_BOOK3S_64
49 BEGIN_FTR_SECTION
50         nop
51 FTR_SECTION_ELSE
52         b       __copy_tofrom_user_power7
53 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
54 #endif
/*
 * __copy_tofrom_user_base: generic copy, dispatching on size and alignment.
 * In:  r3 = destination, r4 = source, r5 = byte count.  All three are
 *      saved at -24/-16/-8(r1) so the fault handlers can recover them.
 * Out: r3 = 0 on the success paths; the fault handlers return the number
 *      of bytes not copied.
 * Dispatch: exactly 4096 bytes with both pointers 4kB aligned goes to
 * .Lcopy_page_4K; fewer than 16 bytes goes to .Lshort_copy; otherwise
 * .Ldst_unaligned or .Ldst_aligned.
 */
55 _GLOBAL(__copy_tofrom_user_base)
56         /* first check for a 4kB copy on a 4kB boundary */
57         cmpldi  cr1,r5,16               /* cr1 = len vs 16, tested by the blt below */
58         cmpdi   cr6,r5,4096             /* cr6.eq = (len == 4096) */
59         or      r0,r3,r4
60         neg     r6,r3           /* LS 3 bits = # bytes to 8-byte dest bdry */
61         andi.   r0,r0,4095              /* cr0.eq = dest and src both 4kB aligned */
62         std     r3,-24(r1)              /* save dest */
63         crand   cr0*4+2,cr0*4+2,cr6*4+2 /* cr0.eq &= cr6.eq */
64         std     r4,-16(r1)              /* save src */
65         std     r5,-8(r1)               /* save len */
66         dcbt    0,r4
67         beq     .Lcopy_page_4K          /* aligned and exactly 4096 bytes */
68         andi.   r6,r6,7                 /* r6 = bytes to 8-byte dest boundary; cr0.eq if 0 */
69         PPC_MTOCRF(0x01,r5)             /* low bits of len into cr7 for the tail tests */
70         blt     cr1,.Lshort_copy        /* fewer than 16 bytes */
71 /* Below we want to nop out the bne if we're on a CPU that has the
72  * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
73  * cleared.
74  * At the time of writing the only CPU that has this combination of bits
75  * set is Power6.
76  */
77 test_feature = (SELFTEST_CASE == 1)
78 BEGIN_FTR_SECTION
79         nop
80 FTR_SECTION_ELSE
81         bne     .Ldst_unaligned
82 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
83                     CPU_FTR_UNALIGNED_LD_STD)
/*
 * Main copy path, reached with r3 = destination, r4 = source, r5 = bytes
 * still to copy, cr1 = (r5 vs 16) and cr7 = low bits of r5.
 *
 * r3 is biased by -16 on entry (r3_offset = 16) so that the software
 * pipelined loop can be entered either at 21: (r3_offset = 0, data in
 * r9/r8) or at its midpoint 22: (r3_offset = 16, data in r7/r6).
 * Each trip of the loop moves 32 bytes; 72: drains the last 16 bytes
 * held in r9/r8.  .Ldo_tail then copies 8/4/2/1 bytes as selected by
 * the cr7 bits.  Returns r3 = 0.
 *
 * r3_offset must always equal the distance from r3 to the first
 * unmodified destination byte, because lex/stex bake it into the
 * exception-table fixup address.
 */
84 .Ldst_aligned:
85         addi    r3,r3,-16
86 r3_offset = 16
87 test_feature = (SELFTEST_CASE == 0)
88 BEGIN_FTR_SECTION
89         andi.   r0,r4,7                 /* r0 = source offset within a doubleword */
90         bne     .Lsrc_unaligned
91 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
92         blt     cr1,.Ldo_tail           /* if < 16 bytes to copy */
93         srdi    r0,r5,5                 /* r0 = number of 32-byte chunks */
94         cmpdi   cr1,r0,0
95 lex;    ld      r7,0(r4)
96 lex;    ld      r6,8(r4)
97         addi    r4,r4,16
98         mtctr   r0
99         andi.   r0,r5,0x10              /* is there an odd 16-byte piece? */
100         beq     22f                     /* no: enter the loop at its midpoint */
101         addi    r3,r3,16
102 r3_offset = 0
103         addi    r4,r4,-16
104         mr      r9,r7
105         mr      r8,r6
106         beq     cr1,72f                 /* no 32-byte chunks: just store r9/r8 */
107 21:
108 lex;    ld      r7,16(r4)
109 lex;    ld      r6,24(r4)
110         addi    r4,r4,32
111 stex;   std     r9,0(r3)
112 r3_offset = 8
113 stex;   std     r8,8(r3)
114 r3_offset = 16
115 22:
116 lex;    ld      r9,0(r4)
117 lex;    ld      r8,8(r4)
118 stex;   std     r7,16(r3)
119 r3_offset = 24
120 stex;   std     r6,24(r3)
121         addi    r3,r3,32
122 r3_offset = 0
123         bdnz    21b
124 72:
125 stex;   std     r9,0(r3)
126 r3_offset = 8
127 stex;   std     r8,8(r3)
128 r3_offset = 16
129         andi.   r5,r5,0xf               /* any tail bytes left? */
130         beq+    3f
131         addi    r4,r4,16
132 .Ldo_tail:
133         addi    r3,r3,16                /* remove the 16-byte bias */
134 r3_offset = 0
135         bf      cr7*4+0,246f            /* 8-byte piece */
136 lex;    ld      r9,0(r4)
137         addi    r4,r4,8
138 stex;   std     r9,0(r3)
139         addi    r3,r3,8
140 246:    bf      cr7*4+1,1f              /* 4-byte piece */
141 lex;    lwz     r9,0(r4)
142         addi    r4,r4,4
143 stex;   stw     r9,0(r3)
144         addi    r3,r3,4
145 1:      bf      cr7*4+2,2f              /* 2-byte piece */
146 lex;    lhz     r9,0(r4)
147         addi    r4,r4,2
148 stex;   sth     r9,0(r3)
149         addi    r3,r3,2
150 2:      bf      cr7*4+3,3f              /* final byte */
151 lex;    lbz     r9,0(r4)
152 stex;   stb     r9,0(r3)
153 3:      li      r3,0                    /* success: nothing left uncopied */
154         blr
/*
 * Copy with an 8-byte aligned destination but a source that is not
 * 8-byte aligned.  Entered from .Ldst_aligned with r0 = source & 7 and
 * r3 biased by -16 (r3_offset = 16).
 *
 * r4 is rounded down to a doubleword boundary, and every destination
 * doubleword is assembled from two neighbouring source doublewords:
 * one shifted by r10 = 8 * r0 bits with sLd, the other by
 * r11 = 64 - r10 bits with sHd, then OR-ed together.
 *
 * Local labels: 28: is the entry used when cr7 bit 0 is set (one more
 * doubleword, see the load/store counts below); 1:/2: form the main
 * loop; 78:, 79: and 5: are the drain points for the stores still
 * pending; 6:/7: handle the final sub-doubleword tail, which is stored
 * out of r9 in 4/2/1-byte pieces.  Returns r3 = 0.
 */
156 .Lsrc_unaligned:
157 r3_offset = 16
158         srdi    r6,r5,3                 /* r6 = len / 8 */
159         addi    r5,r5,-16
160         subf    r4,r0,r4                /* round source down to a doubleword boundary */
161         srdi    r7,r5,4                 /* loop count = (len - 16) / 16 */
162         sldi    r10,r0,3                /* r10 = source misalignment in bits */
163         cmpldi  cr6,r6,3
164         andi.   r5,r5,7                 /* cr0 is tested by the bne 6f further down */
165         mtctr   r7
166         subfic  r11,r10,64              /* r11 = 64 - r10 */
167         add     r5,r5,r0
168         bt      cr7*4+0,28f
170 lex;    ld      r9,0(r4)        /* 3+2n loads, 2+2n stores */
171 lex;    ld      r0,8(r4)
172         sLd     r6,r9,r10
173 lex;    ldu     r9,16(r4)
174         sHd     r7,r0,r11
175         sLd     r8,r0,r10
176         or      r7,r7,r6
177         blt     cr6,79f
178 lex;    ld      r0,8(r4)
179         b       2f
181 28:
182 lex;    ld      r0,0(r4)        /* 4+2n loads, 3+2n stores */
183 lex;    ldu     r9,8(r4)
184         sLd     r8,r0,r10
185         addi    r3,r3,-8
186 r3_offset = 24
187         blt     cr6,5f
188 lex;    ld      r0,8(r4)
189         sHd     r12,r9,r11
190         sLd     r6,r9,r10
191 lex;    ldu     r9,16(r4)
192         or      r12,r8,r12
193         sHd     r7,r0,r11
194         sLd     r8,r0,r10
195         addi    r3,r3,16
196 r3_offset = 8
197         beq     cr6,78f
199 1:      or      r7,r7,r6
200 lex;    ld      r0,8(r4)
201 stex;   std     r12,8(r3)
202 r3_offset = 16
203 2:      sHd     r12,r9,r11
204         sLd     r6,r9,r10
205 lex;    ldu     r9,16(r4)
206         or      r12,r8,r12
207 stex;   stdu    r7,16(r3)
208 r3_offset = 8
209         sHd     r7,r0,r11
210         sLd     r8,r0,r10
211         bdnz    1b
213 78:
214 stex;   std     r12,8(r3)
215 r3_offset = 16
216         or      r7,r7,r6
217 79:
218 stex;   std     r7,16(r3)
219 r3_offset = 24
220 5:      sHd     r12,r9,r11
221         or      r12,r8,r12
222 stex;   std     r12,24(r3)
223 r3_offset = 32
224         bne     6f                      /* tail bytes remain */
225         li      r3,0
226         blr
227 6:      cmpwi   cr1,r5,8
228         addi    r3,r3,32
229 r3_offset = 0
230         sLd     r9,r9,r10
231         ble     cr1,7f                  /* tail is already entirely in r9 */
232 lex;    ld      r0,8(r4)
233         sHd     r7,r0,r11
234         or      r9,r7,r9
235 7:
236         bf      cr7*4+1,1f              /* 4-byte piece */
237 #ifdef __BIG_ENDIAN__
238         rotldi  r9,r9,32
239 #endif
240 stex;   stw     r9,0(r3)
241 #ifdef __LITTLE_ENDIAN__
242         rotrdi  r9,r9,32
243 #endif
244         addi    r3,r3,4
245 1:      bf      cr7*4+2,2f              /* 2-byte piece */
246 #ifdef __BIG_ENDIAN__
247         rotldi  r9,r9,16
248 #endif
249 stex;   sth     r9,0(r3)
250 #ifdef __LITTLE_ENDIAN__
251         rotrdi  r9,r9,16
252 #endif
253         addi    r3,r3,2
254 2:      bf      cr7*4+3,3f              /* final byte */
255 #ifdef __BIG_ENDIAN__
256         rotldi  r9,r9,8
257 #endif
258 stex;   stb     r9,0(r3)
259 #ifdef __LITTLE_ENDIAN__
260         rotrdi  r9,r9,8
261 #endif
262 3:      li      r3,0
263         blr
/*
 * Destination is not 8-byte aligned: copy r6 (1..7) bytes in 1/2/4-byte
 * pieces, selected by the bits of r6 placed in cr7, to bring it up to a
 * doubleword boundary.  r3 and r4 are left untouched while this runs and
 * r7 holds the number of bytes copied so far, which is why faults here
 * go to the _r7 handler entries (they add r7 to r3 first).  Afterwards
 * cr7 is reloaded from the reduced length, cr1 already holds (length vs
 * 16), both pointers are advanced by r6 and control joins .Ldst_aligned.
 */
265 .Ldst_unaligned:
266 r3_offset = 0
267         PPC_MTOCRF(0x01,r6)             /* put #bytes to 8B bdry into cr7 */
268         subf    r5,r6,r5                /* r5 = length left after alignment */
269         li      r7,0                    /* r7 = bytes copied so far */
270         cmpldi  cr1,r5,16
271         bf      cr7*4+3,1f              /* 1-byte piece */
272 100:    EX_TABLE(100b, .Lld_exc_r7)
273         lbz     r0,0(r4)
274 100:    EX_TABLE(100b, .Lst_exc_r7)
275         stb     r0,0(r3)
276         addi    r7,r7,1
277 1:      bf      cr7*4+2,2f              /* 2-byte piece */
278 100:    EX_TABLE(100b, .Lld_exc_r7)
279         lhzx    r0,r7,r4
280 100:    EX_TABLE(100b, .Lst_exc_r7)
281         sthx    r0,r7,r3
282         addi    r7,r7,2
283 2:      bf      cr7*4+1,3f              /* 4-byte piece */
284 100:    EX_TABLE(100b, .Lld_exc_r7)
285         lwzx    r0,r7,r4
286 100:    EX_TABLE(100b, .Lst_exc_r7)
287         stwx    r0,r7,r3
288 3:      PPC_MTOCRF(0x01,r5)             /* cr7 = low bits of the remaining length */
289         add     r4,r6,r4
290         add     r3,r6,r3
291         b       .Ldst_aligned
/*
 * Copies of fewer than 16 bytes: move 8 (as two words), 4, 2 and 1 bytes
 * as selected by the cr7 bits, which the caller loaded from the low bits
 * of the length.  r3 always points at the first unmodified destination
 * byte when a tagged instruction runs, hence r3_offset = 0 throughout.
 * Returns r3 = 0.
 */
293 .Lshort_copy:
294 r3_offset = 0
295         bf      cr7*4+0,1f              /* 8-byte piece, done as two words */
296 lex;    lwz     r0,0(r4)
297 lex;    lwz     r9,4(r4)
298         addi    r4,r4,8
299 stex;   stw     r0,0(r3)
300 stex;   stw     r9,4(r3)
301         addi    r3,r3,8
302 1:      bf      cr7*4+1,2f              /* 4-byte piece */
303 lex;    lwz     r0,0(r4)
304         addi    r4,r4,4
305 stex;   stw     r0,0(r3)
306         addi    r3,r3,4
307 2:      bf      cr7*4+2,3f              /* 2-byte piece */
308 lex;    lhz     r0,0(r4)
309         addi    r4,r4,2
310 stex;   sth     r0,0(r3)
311         addi    r3,r3,2
312 3:      bf      cr7*4+3,4f              /* final byte */
313 lex;    lbz     r0,0(r4)
314 stex;   stb     r0,0(r3)
315 4:      li      r3,0
316         blr
318 /*
319  * exception handlers follow
320  * we have to return the number of bytes not copied
321  * for an exception on a load, we retry the remainder byte-by-byte (see .Lld_exc)
322  * Note that the number of bytes of instructions for adjusting r3 needs
323  * to equal the amount of the adjustment, due to the trick of using
324  * .Lld_exc - r3_offset as the handler address.
325  */
/*
 * Load-fault entry points that first correct r3.
 * .Lld_exc_r7 is used by .Ldst_unaligned: it adds the running byte count
 * in r7 to r3 and branches to .Lld_exc.
 * The three addi/nop pairs that follow are reached through
 * ".Lld_exc - r3_offset": each pair is 8 bytes of code and adds 8 to r3,
 * so entering 8, 16 or 24 bytes before .Lld_exc adds exactly that much
 * and then falls through into .Lld_exc.
 */
327 .Lld_exc_r7:
328         add     r3,r3,r7
329         b       .Lld_exc
331         /* adjust by 24 */
332         addi    r3,r3,8
333         nop
334         /* adjust by 16 */
335         addi    r3,r3,8
336         nop
337         /* adjust by 8 */
338         addi    r3,r3,8
339         nop
341 /*
342  * Here we have had a fault on a load and r3 points to the first
343  * unmodified byte of the destination.  We use the original arguments
344  * and r3 to work out how much wasn't copied.  Since we load some
345  * distance ahead of the stores, we continue copying byte-by-byte until
346  * we hit the load fault again in order to copy as much as possible.
347  */
/*
 * Rebuild the copy state from the arguments saved at entry:
 * r6 = bytes already copied (r3 minus the original destination),
 * r4 = source address of the next byte, r5 = bytes still to copy.
 */
348 .Lld_exc:
349         ld      r6,-24(r1)              /* original destination pointer */
350         ld      r4,-16(r1)              /* original source pointer */
351         ld      r5,-8(r1)               /* original number of bytes */
352         subf    r6,r6,r3                /* r6 = bytes already copied */
353         add     r4,r4,r6                /* r4 = next source byte */
354         subf    r5,r6,r5        /* #bytes left to go */
356 /*
357  * first see if we can copy any more bytes before hitting another exception
358  */
/*
 * Byte-at-a-time retry of the remaining r5 bytes, counted in ctr.
 * A second load fault goes to .Ldone, which returns what is left in ctr;
 * a store fault goes to .Lst_exc (r3_offset = 0).
 */
359         mtctr   r5
360 r3_offset = 0
361 100:    EX_TABLE(100b, .Ldone)
362 43:     lbz     r0,0(r4)
363         addi    r4,r4,1
364 stex;   stb     r0,0(r3)
365         addi    r3,r3,1
366         bdnz    43b
367         li      r3,0            /* huh? all copied successfully this time? */
368         blr
370 /*
371  * here we have trapped again, amount remaining is in ctr.
372  */
/* Fixup target for the retry loop's load: return the count left in ctr. */
373 .Ldone:
374         mfctr   r3                      /* r3 = bytes not copied */
375         blr
377 /*
378  * exception handlers for stores: we need to work out how many bytes
379  * weren't copied, and we may need to copy some more.
380  * Note that the number of bytes of instructions for adjusting r3 needs
381  * to equal the amount of the adjustment, due to the trick of using
382  * .Lst_exc - r3_offset as the handler address.
383  */
/*
 * Store-fault entry points that first correct r3.
 * .Lst_exc_r7 is used by .Ldst_unaligned: it adds the running byte count
 * in r7 to r3 and branches to .Lst_exc.
 * The instructions that follow are reached through
 * ".Lst_exc - r3_offset" and fall through into .Lst_exc.  The bytes of
 * code between an entry point and .Lst_exc equal the amount added to r3;
 * the last 8-byte step is split into two 4-byte addi instructions so
 * that an entry 4 bytes before .Lst_exc is valid as well.
 */
384 .Lst_exc_r7:
385         add     r3,r3,r7
386         b       .Lst_exc
388         /* adjust by 24 */
389         addi    r3,r3,8
390         nop
391         /* adjust by 16 */
392         addi    r3,r3,8
393         nop
394         /* adjust by 8 */
395         addi    r3,r3,4
396         /* adjust by 4 */
397         addi    r3,r3,4
/*
 * Store-fault handler.  On entry r3 = first unmodified destination byte.
 * r7 is set to the end of the destination (original pointer + length).
 * While r3 is not 8-byte aligned and is still below r7, one more byte is
 * copied; a fault on either the load or the store of that byte ends the
 * loop at 19:.  Returns r3 = r7 - r3, the number of bytes not copied.
 */
398 .Lst_exc:
399         ld      r6,-24(r1)      /* original destination pointer */
400         ld      r4,-16(r1)      /* original source pointer */
401         ld      r5,-8(r1)       /* original number of bytes */
402         add     r7,r6,r5                /* r7 = end of destination */
403         /*
404          * If the destination pointer isn't 8-byte aligned,
405          * we may have got the exception as a result of a
406          * store that overlapped a page boundary, so we may be
407          * able to copy a few more bytes.
408          */
409 17:     andi.   r0,r3,7
410         beq     19f                     /* aligned: nothing more to try */
411         subf    r8,r6,r3        /* #bytes copied */
412 100:    EX_TABLE(100b,19f)
413         lbzx    r0,r8,r4
414 100:    EX_TABLE(100b,19f)
415         stb     r0,0(r3)
416         addi    r3,r3,1
417         cmpld   r3,r7
418         blt     17b                     /* keep going until the end of the buffer */
419 19:     subf    r3,r3,r7        /* #bytes not copied in r3 */
420         blr
422 /*
423  * Routine to copy a whole page of data, optimized for POWER4.
424  * On POWER4 it is more than 50% faster than the simple loop
425  * above (following the .Ldst_aligned label).
426  */
/*
 * exc: place an exception-table entry at the current location, i.e. for
 * the instruction that follows the macro on the same line, with
 * .Labort as the fixup address.
 */
427         .macro  exc
428 100:    EX_TABLE(100b, .Labort)
429         .endm
/*
 * Copy exactly 4096 bytes; the caller has checked that r3 and r4 are
 * both 4kB aligned.  r20-r31 are saved at negative offsets from r1
 * (r1 itself is not moved) and restored before returning.
 *
 * r3 is biased by -8 because the stores use 8(3)-style offsets.
 * r5 counts the remaining work in 32-byte units (minus one).
 *
 * Outer loop 0: each pass handles six interleaved streams placed 128
 * bytes apart (offsets 0, 128, ... 640), 128 bytes per stream, so 768
 * bytes = 24 units per pass.  The inner loop 1: runs ctr = 5 times,
 * moving 24 bytes per stream per trip; the six stores after it drain
 * the pipeline.  cr0, set by "cmpwi r5,24" before the inner loop, is
 * what "bge 0b" tests.
 * Loop 3: then copies the rest 32 bytes per trip, with a final 32-byte
 * drain.
 *
 * Every load and store is tagged with exc, so a fault anywhere goes to
 * .Labort.  Returns r3 = 0.
 */
430 .Lcopy_page_4K:
431         std     r31,-32(1)
432         std     r30,-40(1)
433         std     r29,-48(1)
434         std     r28,-56(1)
435         std     r27,-64(1)
436         std     r26,-72(1)
437         std     r25,-80(1)
438         std     r24,-88(1)
439         std     r23,-96(1)
440         std     r22,-104(1)
441         std     r21,-112(1)
442         std     r20,-120(1)
443         li      r5,4096/32 - 1
444         addi    r3,r3,-8
445         li      r0,5
446 0:      addi    r5,r5,-24               /* this pass consumes 24 units of 32 bytes */
447         mtctr   r0
448 exc;    ld      r22,640(4)
449 exc;    ld      r21,512(4)
450 exc;    ld      r20,384(4)
451 exc;    ld      r11,256(4)
452 exc;    ld      r9,128(4)
453 exc;    ld      r7,0(4)
454 exc;    ld      r25,648(4)
455 exc;    ld      r24,520(4)
456 exc;    ld      r23,392(4)
457 exc;    ld      r10,264(4)
458 exc;    ld      r8,136(4)
459 exc;    ldu     r6,8(4)
460         cmpwi   r5,24                   /* cr0 for the bge 0b after the inner loop */
461 1:
462 exc;    std     r22,648(3)
463 exc;    std     r21,520(3)
464 exc;    std     r20,392(3)
465 exc;    std     r11,264(3)
466 exc;    std     r9,136(3)
467 exc;    std     r7,8(3)
468 exc;    ld      r28,648(4)
469 exc;    ld      r27,520(4)
470 exc;    ld      r26,392(4)
471 exc;    ld      r31,264(4)
472 exc;    ld      r30,136(4)
473 exc;    ld      r29,8(4)
474 exc;    std     r25,656(3)
475 exc;    std     r24,528(3)
476 exc;    std     r23,400(3)
477 exc;    std     r10,272(3)
478 exc;    std     r8,144(3)
479 exc;    std     r6,16(3)
480 exc;    ld      r22,656(4)
481 exc;    ld      r21,528(4)
482 exc;    ld      r20,400(4)
483 exc;    ld      r11,272(4)
484 exc;    ld      r9,144(4)
485 exc;    ld      r7,16(4)
486 exc;    std     r28,664(3)
487 exc;    std     r27,536(3)
488 exc;    std     r26,408(3)
489 exc;    std     r31,280(3)
490 exc;    std     r30,152(3)
491 exc;    stdu    r29,24(3)
492 exc;    ld      r25,664(4)
493 exc;    ld      r24,536(4)
494 exc;    ld      r23,408(4)
495 exc;    ld      r10,280(4)
496 exc;    ld      r8,152(4)
497 exc;    ldu     r6,24(4)
498         bdnz    1b
499 exc;    std     r22,648(3)
500 exc;    std     r21,520(3)
501 exc;    std     r20,392(3)
502 exc;    std     r11,264(3)
503 exc;    std     r9,136(3)
504 exc;    std     r7,8(3)
505         addi    r4,r4,640               /* skip over the other five streams */
506         addi    r3,r3,648
507         bge     0b
508         mtctr   r5                      /* remaining 32-byte trips for loop 3 */
509 exc;    ld      r7,0(4)
510 exc;    ld      r8,8(4)
511 exc;    ldu     r9,16(4)
512 3:
513 exc;    ld      r10,8(4)
514 exc;    std     r7,8(3)
515 exc;    ld      r7,16(4)
516 exc;    std     r8,16(3)
517 exc;    ld      r8,24(4)
518 exc;    std     r9,24(3)
519 exc;    ldu     r9,32(4)
520 exc;    stdu    r10,32(3)
521         bdnz    3b
523 exc;    ld      r10,8(4)
524 exc;    std     r7,8(3)
525 exc;    std     r8,16(3)
526 exc;    std     r9,24(3)
527 exc;    std     r10,32(3)
528 9:      ld      r20,-120(1)
529         ld      r21,-112(1)
530         ld      r22,-104(1)
531         ld      r23,-96(1)
532         ld      r24,-88(1)
533         ld      r25,-80(1)
534         ld      r26,-72(1)
535         ld      r27,-64(1)
536         ld      r28,-56(1)
537         ld      r29,-48(1)
538         ld      r30,-40(1)
539         ld      r31,-32(1)
540         li      r3,0
541         blr
543 /*
544  * on an exception, reset to the beginning and jump back into the
545  * standard __copy_tofrom_user
546  */
/*
 * Fault fixup for the whole-page copy: restore r20-r31 from their save
 * slots, reload the original destination and source saved at entry,
 * set the length back to 4096 and redo the copy through .Ldst_aligned,
 * whose own handlers work out how many bytes could not be copied.
 */
547 .Labort:
548         ld      r20,-120(1)
549         ld      r21,-112(1)
550         ld      r22,-104(1)
551         ld      r23,-96(1)
552         ld      r24,-88(1)
553         ld      r25,-80(1)
554         ld      r26,-72(1)
555         ld      r27,-64(1)
556         ld      r28,-56(1)
557         ld      r29,-48(1)
558         ld      r30,-40(1)
559         ld      r31,-32(1)
560         ld      r3,-24(r1)              /* original destination pointer */
561         ld      r4,-16(r1)              /* original source pointer */
562         li      r5,4096                 /* the page path is only taken for 4096 bytes */
563         b       .Ldst_aligned
564 EXPORT_SYMBOL(__copy_tofrom_user)