1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* NGmemcpy.S: Niagara optimized memcpy.
4 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */
/* NOTE(review): this listing is a partial extract -- the original
 * file's line numbers are embedded at the start of every line, and
 * many lines (the #else/#endif halves of the conditional blocks
 * below, several comment terminators, some instructions) are missing.
 * Comments here describe only what the visible lines establish.
 */
8 #include <linux/linkage.h>
10 #include <asm/thread_info.h>
/* Scratch global register and %asi-restore helper.  Two variants are
 * visible (original lines 11-13 vs 16-17); presumably they were
 * separated by a preprocessor conditional (user-copy vs plain memcpy
 * build) that is not visible in this extract -- TODO confirm against
 * the complete source.
 */
11 #define GLOBAL_SPARE %g7
12 #define RESTORE_ASI(TMP) \
13 ldub [%g6 + TI_CURRENT_DS], TMP; \
16 #define GLOBAL_SPARE %g5
17 #define RESTORE_ASI(TMP) \
/* Size of the register-window save area for the "save" instruction;
 * again two variants (128 vs 64 bytes), presumably selected by a
 * missing conditional.
 */
22 #define SAVE_AMOUNT 128
24 #define SAVE_AMOUNT 64
/* ASI used for cache-line-initializing block stores (Niagara's
 * block-init quad-load/store alternate space).
 */
28 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
/* LOAD: plain form vs alternate-space form (type##a with immediate
 * ASI 0x80); presumably chosen by a missing #if for user-copy builds.
 */
41 #define LOAD(type,addr,dest) type [addr], dest
43 #define LOAD(type,addr,dest) type##a [addr] 0x80, dest
/* LOAD_TWIN: 16-byte "twin" load via ldda with the block-init quad
 * ASI.  dest1 appears unused in the expansion; presumably the ldda
 * destination pair implies it -- TODO confirm.
 */
48 #define LOAD_TWIN(addr_reg,dest0,dest1) \
49 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
53 #define STORE(type,src,addr) type src, [addr]
/* STORE_INIT: cache-line-initializing store.  Real Niagara uses stxa
 * through %asi (set to STORE_ASI at runtime); the simulation fallback
 * is a plain stx.  The matching #else/#endif lines are not visible in
 * this extract.
 */
57 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
58 #define STORE_INIT(src,addr) stxa src, [addr] %asi
60 #define STORE_INIT(src,addr) stx src, [addr + 0x00]
65 #define FUNC_NAME NGmemcpy
/* Declare %g2/%g3 as scratch so the assembler accepts their use
 * (required by the SPARC64 ABI's .register convention).
 */
76 .register %g2,#scratch
77 .register %g3,#scratch
81 #define EX_RETVAL(x) x
/* Switch %asi to ASI_AIUS (user secondary space); presumably part of
 * a user-copy preamble or of the __restore_asi stub whose label line
 * is missing from this extract -- TODO confirm.
 */
84 wr %g0, ASI_AIUS, %asi
/* Exception-return stubs for the fault handlers of the copy loops
 * below.  Each stub's name encodes the number of bytes that remained
 * uncopied when the fault hit (e.g. NG_ret_i2_plus_g1_minus_8 means
 * %i2 + %g1 - 8).  In the complete file each branch to __restore_asi
 * carries a delay-slot instruction computing that count into the
 * return register; those delay-slot lines are missing from this
 * extract -- do NOT read these branches as delay-slot-free.
 * TODO confirm against the complete source.
 */
86 ENTRY(NG_ret_i2_plus_i4_plus_1)
87 ba,pt %xcc, __restore_asi
89 ENDPROC(NG_ret_i2_plus_i4_plus_1)
90 ENTRY(NG_ret_i2_plus_g1)
91 ba,pt %xcc, __restore_asi
93 ENDPROC(NG_ret_i2_plus_g1)
94 ENTRY(NG_ret_i2_plus_g1_minus_8)
96 ba,pt %xcc, __restore_asi
98 ENDPROC(NG_ret_i2_plus_g1_minus_8)
99 ENTRY(NG_ret_i2_plus_g1_minus_16)
101 ba,pt %xcc, __restore_asi
103 ENDPROC(NG_ret_i2_plus_g1_minus_16)
104 ENTRY(NG_ret_i2_plus_g1_minus_24)
106 ba,pt %xcc, __restore_asi
108 ENDPROC(NG_ret_i2_plus_g1_minus_24)
109 ENTRY(NG_ret_i2_plus_g1_minus_32)
111 ba,pt %xcc, __restore_asi
113 ENDPROC(NG_ret_i2_plus_g1_minus_32)
114 ENTRY(NG_ret_i2_plus_g1_minus_40)
116 ba,pt %xcc, __restore_asi
118 ENDPROC(NG_ret_i2_plus_g1_minus_40)
119 ENTRY(NG_ret_i2_plus_g1_minus_48)
121 ba,pt %xcc, __restore_asi
123 ENDPROC(NG_ret_i2_plus_g1_minus_48)
124 ENTRY(NG_ret_i2_plus_g1_minus_56)
126 ba,pt %xcc, __restore_asi
128 ENDPROC(NG_ret_i2_plus_g1_minus_56)
129 ENTRY(NG_ret_i2_plus_i4)
130 ba,pt %xcc, __restore_asi
132 ENDPROC(NG_ret_i2_plus_i4)
133 ENTRY(NG_ret_i2_plus_i4_minus_8)
135 ba,pt %xcc, __restore_asi
137 ENDPROC(NG_ret_i2_plus_i4_minus_8)
138 ENTRY(NG_ret_i2_plus_8)
139 ba,pt %xcc, __restore_asi
141 ENDPROC(NG_ret_i2_plus_8)
142 ENTRY(NG_ret_i2_plus_4)
143 ba,pt %xcc, __restore_asi
145 ENDPROC(NG_ret_i2_plus_4)
146 ENTRY(NG_ret_i2_plus_1)
147 ba,pt %xcc, __restore_asi
149 ENDPROC(NG_ret_i2_plus_1)
150 ENTRY(NG_ret_i2_plus_g1_plus_1)
152 ba,pt %xcc, __restore_asi
154 ENDPROC(NG_ret_i2_plus_g1_plus_1)
/* Orphan branch: its ENTRY(...)/ENDPROC(...) lines (presumably the
 * NG_ret_i2 stub) are missing from this extract -- TODO confirm.
 */
156 ba,pt %xcc, __restore_asi
159 ENTRY(NG_ret_i2_and_7_plus_i4)
161 ba,pt %xcc, __restore_asi
163 ENDPROC(NG_ret_i2_and_7_plus_i4)
/* FUNC_NAME (NGmemcpy): Niagara-optimized memcpy.
 * In:  %i0 = dst, %i1 = src, %i2 = len.
 * The epilogues below return EX_RETVAL(%i0), i.e. the original dst.
 * NOTE(review): this extract drops many original lines (branch delay
 * slots, loop branches, some labels, comment terminators), so the
 * control flow visible here is incomplete.  Comments added below
 * describe only the visible instructions; missing pieces are hedged.
 */
169 .type FUNC_NAME,#function
170 FUNC_NAME: /* %i0=dst, %i1=src, %i2=len */
172 save %sp, -SAVE_AMOUNT, %sp
184 /* 2 blocks (128 bytes) is the minimum we can do the block
185 * copy with. We need to ensure that we'll iterate at least
186 * once in the block copy loop. At worst we'll need to align
187 * the destination to a 64-byte boundary which can chew up
188 * to (64 - 1) bytes from the length before we perform the
197 * %i2: len (known to be >= 128)
199 * The block copy loops will use %i4/%i5,%g2/%g3 as
200 * temporaries while copying the data.
 */
/* Prefetch the first source line and select the block-init store ASI
 * for the STORE_INIT stores below.
 */
203 LOAD(prefetch, %i1, #one_read)
204 wr %g0, STORE_ASI, %asi
206 /* Align destination on 64-byte boundary. */
207 andcc %o0, (64 - 1), %i4
210 sub %g0, %i4, %i4 ! bytes to align dst
/* Byte-at-a-time alignment copy; on fault, %i2 + %i4 + 1 bytes remain
 * (hence the NG_ret_i2_plus_i4_plus_1 handler).  The loop's branch
 * and pointer-increment lines are missing from this extract.
 */
213 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
214 EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
219 /* If the source is on a 16-byte boundary we can do
220 * the direct block copy loop. If it is 8-byte aligned
221 * we can do the 16-byte loads offset by -8 bytes and the
222 * init stores offset by one register.
224 * If the source is not even 8-byte aligned, we need to do
225 * shifting and masking (basically integer faligndata).
227 * The careful bit with init stores is that if we store
228 * to any part of the cache line we have to store the whole
229 * cacheline else we can end up with corrupt L2 cache line
230 * contents. Since the loop works on 64-bytes of 64-byte
231 * aligned store data at a time, this is easy to ensure.
 */
/* %i4 = src alignment within 16 bytes; %g1 = bytes handled by the
 * block loop (len rounded down to 64); %i2 = leftover tail bytes.
 */
234 andcc %i1, (16 - 1), %i4
235 andn %i2, (64 - 1), %g1 ! block copy loop iterator
237 sub %i2, %g1, %i2 ! final sub-block copy bytes
243 /* Neither 8-byte nor 16-byte aligned, shift and mask. */
/* GLOBAL_SPARE = (misalignment & 7) * 8 = left-shift bit count;
 * %i5 presumably holds 64 beforehand so that %i5 - GLOBAL_SPARE is
 * the complementary right-shift count -- TODO confirm, the line
 * initializing %i5 is missing from this extract.
 */
244 and %i4, 0x7, GLOBAL_SPARE
245 sll GLOBAL_SPARE, 3, GLOBAL_SPARE
247 EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
248 sub %i5, GLOBAL_SPARE, %i5
/* Merge three 64-bit words across a misaligned boundary: WORD1/WORD2
 * become the two aligned output words built from bits of WORD1..WORD3
 * using the PRE_SHIFT/POST_SHIFT counts computed above (an integer
 * faligndata).  TMP is clobbered.
 */
257 #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
258 sllx WORD1, POST_SHIFT, WORD1; \
259 srlx WORD2, PRE_SHIFT, TMP; \
260 sllx WORD2, POST_SHIFT, WORD2; \
261 or WORD1, TMP, WORD1; \
262 srlx WORD3, PRE_SHIFT, TMP; \
263 or WORD2, TMP, WORD2;
/* Shift/mask block loop, variant 8: one 64-byte cache line per
 * iteration; twin loads at %o4/%o5/%o7/%i3 offsets, merged pairs
 * stored with line-initializing stores at 0x00..0x38.  Fault labels
 * step down by 8 as each store retires.  The loop branch/update lines
 * are missing from this extract.
 */
265 8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
266 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
267 LOAD(prefetch, %i1 + %i3, #one_read)
269 EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
270 EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
272 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
273 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
275 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
276 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
278 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
279 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
281 EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
282 EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
284 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
286 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
288 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
289 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
/* Shift/mask block loop, variant 9: same structure as 8: but with the
 * word rotation offset by one register (different source phase).
 */
298 9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
299 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
300 LOAD(prefetch, %i1 + %i3, #one_read)
302 EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
303 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
305 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
306 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
308 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
309 EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
311 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
312 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
314 EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
315 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
317 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
319 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
321 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
322 EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
331 10: /* Destination is 64-byte aligned, source was only 8-byte
332 * aligned but it has been subtracted by 8 and we perform
333 * one twin load ahead, then add 8 back into source when
334 * we finish the loop.
 */
336 EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
341 1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
342 LOAD(prefetch, %i1 + %o1, #one_read)
343 EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line
344 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
345 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
346 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
347 EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
348 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
349 EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
350 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
351 EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
353 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
354 EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
362 50: /* Destination is 64-byte aligned, and source is 16-byte
 * aligned (rest of this comment is missing from the extract).
 * Direct twin-load / init-store loop, no shifting needed.
 */
369 1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
370 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
371 LOAD(prefetch, %i1 + %o1, #one_read)
372 EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line
373 EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
374 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
375 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
376 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
377 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
379 EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
380 EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
381 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
382 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
391 /* %i2 contains any final bytes still needed to be copied
392 * over. If anything is left, we copy it one byte at a time.
 */
401 70: /* 16 < len <= 64 */
/* 16-bytes-per-iteration copy; %i3 presumably holds dst-src so that
 * %i1 + %i3 addresses the destination -- TODO confirm, the line
 * computing %i3 is missing from this extract.
 */
408 1: subcc %i4, 0x10, %i4
409 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
411 EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
413 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
415 EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
/* Tail: copy a remaining 8-byte chunk if bit 3 of len is set... */
418 73: andcc %i2, 0x8, %g0
422 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
423 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
/* ...then a 4-byte chunk if bit 2 is set. */
425 1: andcc %i2, 0x4, %g0
429 EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
430 EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
/* Byte-at-a-time copy loop (loop branch lines missing here). */
446 EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
447 EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
/* Misaligned doubleword path: load source doublewords and emit
 * aligned stores (shift/merge lines missing from this extract).
 */
463 EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
468 EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
472 EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
485 80: /* 0 < len <= 16 */
/* Word-at-a-time copy for the small case. */
492 EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
493 EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
/* Epilogue: restore caller's window, return original dst in %o0.
 * Presumably this sits in the delay slot of a missing return branch.
 */
498 restore EX_RETVAL(%i0), %g0, %o0
/* Final byte loop for len not a multiple of 4, then epilogue. */
503 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
504 EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
508 restore EX_RETVAL(%i0), %g0, %o0
510 .size FUNC_NAME, .-FUNC_NAME