/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
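
/* STORE_INIT below issues its stores through STORE_ASI.  With the
 * block-init ASI, a 64-byte-aligned run of such stores that completely
 * overwrites a destination cache line avoids having to fetch that line
 * from memory first; the SIMULATE_* fallback simply uses the normal
 * primary address space instead.
 */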

#define EX_RETVAL(x)	x

#define LOAD(type,addr,dest)	type [addr], dest

#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif

#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI

#define FUNC_NAME	NG4memcpy
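
/* Note that this file also serves as a template: the NG4copy_from_user.S
 * and NG4copy_to_user.S wrappers redefine FUNC_NAME and the
 * LOAD/STORE/EX_LD/EX_ST macros before including it, so that every
 * access gets an exception-table entry for user-space faults.  Built on
 * its own, EX_LD/EX_ST (like EX_RETVAL above) are identity wrappers.
 */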

	.register	%g2,#scratch
	.register	%g3,#scratch

	.type		FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
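	/* The body is split by length: the shortest copies fall through
	 * to simple byte/word loops, medium copies use plain 8-byte
	 * integer moves, and only copies of at least 0x80 bytes take the
	 * .Llarge path below with its prefetching, initializing stores
	 * and (for unaligned sources) VIS machinery.
	 */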

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	EX_ST(STORE(stb, %g2, %o0 - 0x01))
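
	/* Prefetch well ahead (up to 0x200 bytes) of the source stream
	 * before entering the main loop; #n_reads_strong requests the
	 * strong, multiple-reads variant of the prefetch.
	 */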
51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	bne,pn		%icc, .Llarge_src_unaligned

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	brz,pt		%g1, .Llarge_aligned

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
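	/* The main loop moves 64 bytes per pass: the loads run ahead of
	 * the corresponding stores and the two streams are interleaved
	 * to keep the pipeline busy, while STORE_INIT goes through the
	 * block-init ASI so destination lines that are completely
	 * overwritten need not be fetched from memory first.
	 */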
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	EX_ST(STORE_INIT(%g2, %o0))
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	EX_ST(STORE_INIT(%g2, %o0))
	EX_ST(STORE_INIT(%g3, %o0))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
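
	/* The initializing stores above are only weakly ordered with
	 * respect to other accesses, so order them ahead of whatever
	 * loads and stores follow before leaving this path.
	 */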
	membar		#StoreLoad | #StoreStore

	ble,pn		%icc, .Lsmall_unaligned
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
	VISEntryHalf
	alignaddr	%o1, %g0, %g1
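	/* alignaddr rounds %o1 down to an 8-byte boundary (into %g1) and
	 * latches the byte offset in %gsr; each faligndata below then
	 * extracts the right 8 source bytes out of two adjacent
	 * doublewords, so the stores themselves stay fully aligned.
	 */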
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
	VISExitHalf

	ble,pn		%icc, .Lsmall_unaligned
	ba,a,pt		%icc, .Lmedium_unaligned

.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	bne,pn		%icc, .Lmedium_unaligned
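	/* Medium-sized copies: no deep prefetching or initializing
	 * stores, just plain 8-byte loads and stores moving 32 bytes per
	 * iteration of the loop below, with the leftover handled a
	 * doubleword, then a word, then single bytes at a time.
	 */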
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
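	/* Mop up the remainder: up to 0x18 bytes in 8-byte steps, then
	 * at most one 4-byte word, leaving anything smaller to the
	 * trailing byte copy code.
	 */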
2:	andcc		%o2, 0x18, %o5
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	EX_ST(STORE(stw, %g1, %o0 - 0x04))

.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	EX_ST(STORE(stb, %g2, %o0 - 0x01))

	brz,pn		%g1, .Lmedium_noprefetch

	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	andn		%o2, 0x08 - 1, %o5
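	/* Unaligned-source medium copy: read naturally aligned
	 * doublewords and OR the bytes carried over in %o4 with the high
	 * bytes of the next doubleword (the srlx below), so that every
	 * store remains an aligned 8-byte stx.
	 */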
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))

	ba,pt		%icc, .Lsmall_unaligned
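
	/* At most three bytes remain to be copied below. */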
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	EX_ST(STORE(stb, %g1, %o0 + 0x02))
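
	/* Small copies: word-aligned ones go 4 bytes at a time, anything
	 * else drops through to the byte loop at .Lsmall_unaligned.
	 */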
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5

	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	EX_ST(STORE(stw, %g1, %o0 - 0x04))

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	EX_ST(STORE(stb, %g1, %o0 - 0x01))
	.size		FUNC_NAME, .-FUNC_NAME