/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */
#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF	0x04
/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi; avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:
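
/* Illustrative C sketch of FPU_ENTER above; read_fprs()/write_fprs()
 * are hypothetical stand-ins for the rd/wr instructions:
 *
 *	fprs = read_fprs();
 *	if (!(fprs & FPRS_FEF))		// write %fprs only when the
 *		write_fprs(FPRS_FEF);	// FPU is not already enabled
 *
 * Skipping the write when FPRS_FEF is already set is what avoids the
 * ~50 cycle ASR access cost described above.
 */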
#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif
#ifdef __KERNEL__
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif
#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif
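
/* When this file is built as the body of the copy_{to,from}_user
 * routines, the wrappers override EX_LD/EX_ST to attach exception
 * table entries to each access and EX_RETVAL to compute the return
 * value on a fault; for the plain memcpy build they collapse to
 * identity and NON_USER_COPY is defined, selecting the cheaper VIS
 * entry path below.
 */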
#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif
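
/* STORE_INIT is an initializing store: when a full 64-byte line is
 * about to be overwritten, the block-init ASI lets the chip allocate
 * the cache line without first fetching its old contents, eliminating
 * the read-for-ownership traffic in the large-copy loop below.
 */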
#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif
#ifndef XCC
#define XCC xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop
.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
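
	/* These eight prefetches prime the read stream out to 512
	 * bytes ahead of the source pointer; the 64-byte loops below
	 * then issue one prefetch per iteration to hold that distance.
	 */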
	/* Check if we can use the straight fully aligned
	 * loop, or whether we need the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))
.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
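
	/* One iteration of the loop below moves a whole 64-byte line
	 * through integer registers, interleaving the loads with the
	 * initializing stores so each store overlaps the latency of a
	 * later load.
	 */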
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
	membar		#StoreLoad | #StoreStore
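
	/* Initializing stores are more weakly ordered than ordinary
	 * loads and stores, so a membar is needed before anyone may
	 * safely observe the freshly written destination lines.
	 */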
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0
.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
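
	/* alignaddr rounds %o1 down to an 8-byte boundary (result in
	 * %g1) and latches the misalignment in %gsr; each faligndata
	 * below then extracts one aligned 8-byte window from two
	 * adjacent doublewords.  As a rough C sketch, with
	 * shift = 8 * (src & 7):
	 *
	 *	out = (prev << shift) | (next >> (64 - shift));
	 */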
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
	VISExitHalf
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned
#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	 or		%o0, %o1, %g2
#endif
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
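
	/* The loop below reads aligned doublewords and rebuilds the
	 * unaligned stream with shifts: %g1 holds the left shift
	 * (src offset * 8) and %g2 = 64 - %g1 the right shift.
	 * Roughly, in C:
	 *
	 *	*dst++ = (cur << g1) | (next >> g2);
	 */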
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,pt		%icc, .Lsmall_unaligned
.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))
.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt		%o2, .Lexit
	 nop
.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt		%icc, .Lexit
	.size		FUNC_NAME, .-FUNC_NAME