/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04
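
/* This file appears designed to build both in the kernel proper and in
 * a standalone test harness; the non-__KERNEL__ path supplies the ASI
 * and %fprs constants that the kernel headers would otherwise provide.
 */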

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:
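
/* Note: because of the annulled branch, the %fprs write above executes
 * only when FPRS_FEF was not already set, avoiding the costly ASR
 * access in the common case.
 */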
#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifdef __KERNEL__
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif
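
/* The second argument to EX_LD/EX_ST names the exception continuation
 * (one of the NG4_retl_* stubs below) to run if the access faults; for
 * plain in-kernel memcpy the macros expand to the bare instruction.
 */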

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif
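
/* STORE_INIT uses the block-init ASI: the destination cache line is
 * allocated without first being read from memory, which is why the
 * large-copy path below first aligns the destination to 64 bytes.
 */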

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_asi_fp:
	VISExitHalf
__restore_asi:
	retl
	 wr	%g0, ASI_AIUS, %asi
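
/* Exception continuations.  Each stub computes, in %o0, the number of
 * bytes that remained uncopied when the faulting access was issued
 * (%o2 plus the loop counter named in the stub), then branches to
 * __restore_asi; the _fp variants also pop the FPU state on the way.
 */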
ENTRY(NG4_retl_o2)
	ba,pt	%xcc, __restore_asi
	 mov	%o2, %o0
ENDPROC(NG4_retl_o2)
ENTRY(NG4_retl_o2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 1, %o0
ENDPROC(NG4_retl_o2_plus_1)
ENTRY(NG4_retl_o2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 4, %o0
ENDPROC(NG4_retl_o2_plus_4)
ENTRY(NG4_retl_o2_plus_o5)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5)
ENTRY(NG4_retl_o2_plus_o5_plus_4)
	add	%o5, 4, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_4)
ENTRY(NG4_retl_o2_plus_o5_plus_8)
	add	%o5, 8, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_8)
ENTRY(NG4_retl_o2_plus_o5_plus_16)
	add	%o5, 16, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_16)
ENTRY(NG4_retl_o2_plus_o5_plus_24)
	add	%o5, 24, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_24)
ENTRY(NG4_retl_o2_plus_o5_plus_32)
	add	%o5, 32, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_32)
ENTRY(NG4_retl_o2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG4_retl_o2_plus_g1)
ENTRY(NG4_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG4_retl_o2_plus_g1_plus_1)
ENTRY(NG4_retl_o2_plus_g1_plus_8)
	add	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG4_retl_o2_plus_g1_plus_8)
ENTRY(NG4_retl_o2_plus_o4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4)
ENTRY(NG4_retl_o2_plus_o4_plus_8)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_8)
ENTRY(NG4_retl_o2_plus_o4_plus_16)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_16)
ENTRY(NG4_retl_o2_plus_o4_plus_24)
	add	%o4, 24, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_24)
ENTRY(NG4_retl_o2_plus_o4_plus_32)
	add	%o4, 32, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_32)
ENTRY(NG4_retl_o2_plus_o4_plus_40)
	add	%o4, 40, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_40)
ENTRY(NG4_retl_o2_plus_o4_plus_48)
	add	%o4, 48, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_48)
ENTRY(NG4_retl_o2_plus_o4_plus_56)
	add	%o4, 56, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_56)
ENTRY(NG4_retl_o2_plus_o4_plus_64)
	add	%o4, 64, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_64)
ENTRY(NG4_retl_o2_plus_o4_fp)
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_8_fp)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_8_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_16_fp)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_16_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_24_fp)
	add	%o4, 24, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_24_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_32_fp)
	add	%o4, 32, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_32_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_40_fp)
	add	%o4, 40, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_40_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_48_fp)
	add	%o4, 48, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_48_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_56_fp)
	add	%o4, 56, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_56_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_64_fp)
	add	%o4, 64, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_64_fp)
#endif

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop
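
	/* %o3 preserves the original %o0; .Lexit hands it back as the
	 * destination pointer per the memcpy contract.
	 */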

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)

51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
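
	/* The read stream is primed up to 0x200 bytes ahead of the
	 * source; the copy loops below issue one more prefetch at
	 * +0x200 per 64-byte iteration to maintain that distance.
	 */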

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08), NG4_retl_o2_plus_g1_plus_8)

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o4)
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2), NG4_retl_o2_plus_o4)
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3), NG4_retl_o2_plus_o4_plus_64)
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_64)
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5), NG4_retl_o2_plus_o4_plus_64)
	EX_ST(STORE_INIT(%g1, %o0), NG4_retl_o2_plus_o4_plus_64)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_56)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2), NG4_retl_o2_plus_o4_plus_48)
	EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_48)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3), NG4_retl_o2_plus_o4_plus_40)
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_40)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_32)
	EX_ST(STORE_INIT(%o5, %o0), NG4_retl_o2_plus_o4_plus_32)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_24)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_16)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	membar		#StoreLoad | #StoreStore
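
	/* Initializing stores are not ordered like normal stores, so
	 * publish them with a membar before any following access can
	 * observe the destination.
	 */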

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
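
	/* alignaddr rounds %o1 down to an 8-byte boundary (into %g1) and
	 * records the misalignment in %gsr; each faligndata below then
	 * extracts one destination-aligned doubleword from a pair of
	 * adjacent source doublewords.
	 */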
	add		%o1, 0x40, %o1
	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), NG4_retl_o2_plus_o4)
1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), NG4_retl_o2_plus_o4)
	subcc		%o4, 0x40, %o4
	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), NG4_retl_o2_plus_o4_plus_64)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), NG4_retl_o2_plus_o4_plus_64)
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE(std, %f16, %o0 + 0x00), NG4_retl_o2_plus_o4_plus_64)
	EX_ST_FP(STORE(std, %f18, %o0 + 0x08), NG4_retl_o2_plus_o4_plus_56)
	EX_ST_FP(STORE(std, %f20, %o0 + 0x10), NG4_retl_o2_plus_o4_plus_48)
	EX_ST_FP(STORE(std, %f22, %o0 + 0x18), NG4_retl_o2_plus_o4_plus_40)
	EX_ST_FP(STORE(std, %f24, %o0 + 0x20), NG4_retl_o2_plus_o4_plus_32)
	EX_ST_FP(STORE(std, %f26, %o0 + 0x28), NG4_retl_o2_plus_o4_plus_24)
	EX_ST_FP(STORE(std, %f28, %o0 + 0x30), NG4_retl_o2_plus_o4_plus_16)
	EX_ST_FP(STORE(std, %f30, %o0 + 0x38), NG4_retl_o2_plus_o4_plus_8)
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
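
	/* Reloading %f0 from %g1 + 0x40 before the last faligndata lets
	 * the final output doubleword straddle into the next iteration's
	 * first source word, software-pipelining the loop.
	 */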
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	 or		%o0, %o1, %g2
#endif
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), NG4_retl_o2_plus_o5)
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), NG4_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), NG4_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), NG4_retl_o2_plus_o5_plus_8)
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08), NG4_retl_o2_plus_o5_plus_8)
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2)
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_4)
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)
2:	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), NG4_retl_o2)
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), NG4_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,pt		%icc, .Lsmall_unaligned
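
	/* The loop above handles a source that is only byte-aligned:
	 * the previous aligned doubleword is kept left-shifted in %o4
	 * and OR-ed with the right-shifted next doubleword, so every
	 * store is a full aligned stx.
	 */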

.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00), NG4_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1), NG4_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01), NG4_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1), NG4_retl_o2)
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02), NG4_retl_o2)

.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_o5_plus_4)
	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1)
	ba,a,pt		%icc, .Lexit
	.size		FUNC_NAME, .-FUNC_NAME