/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
/* An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   and the order of the two source operands of each shrp instruction
   must be reversed.  */
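
/* To make the byte shuffling in the misaligned loops below easier to
   follow, here is the combine step in C (illustrative only, not part
   of the build).  'lo' and 'hi' are two adjacent 8-byte-aligned source
   words and sh1 = 8 * (src % 8); on little endian, "shrp out = hi, lo, sh1"
   yields the low 64 bits of the 128-bit value hi:lo shifted right by sh1:

      #include <stdint.h>

      static inline uint64_t
      combine_le (uint64_t lo, uint64_t hi, unsigned sh1)  // 0 < sh1 < 64
      {
        return (lo >> sh1) | (hi << (64 - sh1));
      }

   On big endian the wanted bytes sit at the other end of each word, so
   the expression becomes (lo << sh1) | (hi >> (64 - sh1)); that is why
   sh1 would have to be replaced by 64 - sh1 and the shrp source
   operands swapped, as noted above.  */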
#define LFETCH_DIST     500

#define ALIGN_UNROLL_no 4 // no. of elements
#define ALIGN_UNROLL_sh 2 // (shift amount)

#define Nrot            ((4*(MEMLAT+2) + 7) & ~7)
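/* Nrot reserves room for the four rotating register arrays (r, s, q
   and t, each about MEMLAT+2 deep) declared with .rotr below, rounded
   up to the next multiple of 8 because rotating general registers can
   only be allocated in groups of 8.  */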
#elif defined(USE_INT)
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this code.  */
# define ALIGN(n) { nop 0 }
#else
# define ALIGN(n) .align n
#endif
#if defined(USE_LFETCH)
#define LOOP(shift) \
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
(p[0]) lfetch.nt1 [ptr1], 16 ; \
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
(p[0]) lfetch.nt1 [ptr2], 16 ; \
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
br.ctop.sptk.many .loop##shift \
br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
#define LOOP(shift) \
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
br.ctop.sptk.many .loop##shift \
br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
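
/* For reference, a rough C rendering (illustrative only, not built
   here) of what one trip of LOOP(shift) does once the software
   pipeline is full.  The rotating registers and predicates hide the
   MEMLAT-iteration distance between each load and the shrp/st8 that
   consume it, and ar.ec drains the last in-flight values; with that
   flattened away, every trip combines the word carried over from the
   previous trip ('carry', preloaded into s[1] before the loop is
   entered) with two freshly loaded words and stores 16 bytes:

      #include <stdint.h>
      #include <stddef.h>

      static void
      loop_shift_flat (uint64_t *dest, const uint64_t *asrc,
                       size_t trips, uint64_t carry, unsigned shift)
      {
        for (size_t i = 0; i < trips; i++)   // shift is 8, 16, ..., 56
          {
            uint64_t r = asrc[2 * i];        // (p[0]) ld8 r[0] = [asrc], 8
            uint64_t s = asrc[2 * i + 1];    // (p[0]) ld8 s[0] = [asrc], 8
            dest[2 * i]     = (carry >> shift) | (r << (64 - shift)); // shrp tmp3
            dest[2 * i + 1] = (r >> shift) | (s << (64 - shift));     // shrp tmp4
            carry = s;
          }
      }
*/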
alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
.rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
.rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
mov ret0 = in0 // return value = dest
movi0 saved_pr = pr // save the predicate registers
and tmp4 = 7, in0 // check if destination is aligned
mov dest = in0 // dest
cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
.save ar.lc, saved_lc
movi0 saved_lc = ar.lc // save the loop counter
cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
#if defined(USE_LFETCH)
shr.u elemcnt = len, 3 // elemcnt = len / 8
cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
sub loopcnt = 7, tmp4 // loopcnt = (bytes needed to align dest) - 1
(p_scr) br.cond.dptk.many .dest_aligned
ld1 tmp2 = [src], 1 //
sub len = len, loopcnt, 1 // len -= loopcnt + 1
movi0 ar.lc = loopcnt //
cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
.l0: // ---------------------------- // L0: Align dest on 8-byte boundary
st1 [dest] = tmp2, 1 //
(p_scr) ld1 tmp2 = [src], 1 //
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
add loopcnt = -1, loopcnt
br.cloop.dptk.few .l0 //
and tmp4 = 7, src // ready for alignment check
shr.u elemcnt = len, 3 // elemcnt = len / 8
cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned
tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
} { .mib // is not 16B aligned
add ptr2 = LFETCH_DIST, dest // prefetch address
add ptr1 = LFETCH_DIST, src
(p_scr) br.cond.dptk.many .src_not_aligned
// The optimal case, when dest and src are aligned
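/* Rough C outline of this path (illustrative only, not built here):
   if src is 8-byte but not 16-byte aligned, one extra word is copied
   first so the paired loads in .l1 see a 16-byte aligned source; the
   kernel then moves ALIGN_UNROLL_no words (32 bytes) per pipelined
   trip, and whatever is left falls through to .copy_full_words and
   .copy_bytes.  The "enough elements to qualify" check that branches
   straight to .copy_full_words is omitted:

      #include <stdint.h>
      #include <stddef.h>

      static void
      copy_aligned_outline (uint64_t *dest, const uint64_t *src,
                            size_t elemcnt)          // elemcnt = len / 8
      {
        if ((((uintptr_t) src) & 8) != 0 && elemcnt != 0)
          {
            *dest++ = *src++;                        // the "extra" word (p_xtr)
            elemcnt--;
          }
        size_t trips = elemcnt >> 2;                 // >> ALIGN_UNROLL_sh
        for (size_t t = 0; t < trips; t++, dest += 4, src += 4)
          {
            dest[0] = src[0];                        // .l1: ld8/ldfp8 + st8,
            dest[1] = src[1];                        // loads running MEMLAT
            dest[2] = src[2];                        // stages ahead of stores
            dest[3] = src[3];
          }
        // elemcnt % 4 full words remain for .copy_full_words,
        // then len % 8 bytes for .copy_bytes.
      }
*/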
.pred.rel "mutex",p_xtr,p_nxtr
(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
movi0 pr.rot = 1 << 16 // set rotating predicates
(p_scr) br.cond.dpnt.many .copy_full_words
(p_xtr) load tempreg = [src], 8
(p_xtr) add elemcnt = -1, elemcnt
movi0 ar.ec = MEMLAT + 1 // set the epilog counter
(p_xtr) add len = -8, len //
add asrc = 16, src // one bank apart (for USE_INT)
shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
add loopcnt = -1, loopcnt
(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
movi0 ar.lc = loopcnt // set the loop counter
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
.l1: // ------------------------------- // L1: Everything a multiple of 8
#if defined(USE_LFETCH)
(p[0]) lfetch.nt1 [ptr2],32
(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16
(p[0]) add len = -32, len
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
#if defined(USE_LFETCH)
(p[0]) lfetch.nt1 [ptr1],32
(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16
(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
br.ctop.dptk.many .l1
#elif defined(USE_INT)
.l1: // ------------------------------- // L1: Everything a multiple of 8
(p[0]) load the_r[0] = [src], 8
(p[0]) load the_q[0] = [asrc], 8
(p[0]) add len = -32, len
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
(p[0]) load the_s[0] = [src], 24
(p[0]) load the_t[0] = [asrc], 24
(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
(p[0]) lfetch.nt1 [ptr2],32
(p[0]) lfetch.nt1 [ptr1],32
br.ctop.dptk.many .l1
cmp.gt p_scr, p0 = 8, len // is len < 8?
shr.u elemcnt = len, 3 // elemcnt = len / 8
(p_scr) br.cond.dpnt.many .copy_bytes
load tempreg = [src], 8
add loopcnt = -1, elemcnt //
cmp.ne p_scr, p0 = 0, loopcnt //
mov ar.lc = loopcnt //
.l2: // ------------------------------- // L2: Max 4 words copied separately
store [dest] = tempreg, 8
(p_scr) load tempreg = [src], 8 //
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
add loopcnt = -1, loopcnt
br.cloop.dptk.few .l2
cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
add loopcnt = -1, len // loopcnt = len - 1
(p_scr) br.cond.spnt .restore_and_exit
movi0 ar.lc = loopcnt
cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
.l3: // ------------------------------- // L3: Final byte move
(p_scr) ld1 tmp2 = [src], 1
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
add loopcnt = -1, loopcnt
br.cloop.dptk.few .l3
movi0 pr = saved_pr, -1 // restore the predicate registers
movi0 ar.lc = saved_lc // restore the loop counter
cmp.gt p_scr, p0 = 16, len
and sh1 = 7, src // sh1 = src % 8
shr.u loopcnt = len, 4 // element-cnt = len / 16
add tmp4 = @ltoff(.table), gp
add tmp3 = @ltoff(.loop56), gp
(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
and asrc = -8, src // asrc = src & -8 -- align src for loop
add loopcnt = -1, loopcnt // loopcnt--
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
ld8 ptable = [tmp4] // ptable = &table
ld8 ploop56 = [tmp3] // ploop56 = &loop56
and tmp2 = -16, len // tmp2 = len & -16
add tmp3 = ptable, sh1 // tmp3 = &table + sh1
add src = src, tmp2 // src += len & (-16)
movi0 ar.lc = loopcnt // set LC
ld8 tmp4 = [tmp3] // tmp4 = loop offset
sub len = len, tmp2 // len -= len & (-16)
movi0 ar.ec = MEMLAT + 2 // one more pass needed
ld8 s[1] = [asrc], 8 // preload
sub loopaddr = ploop56, tmp4 // loopaddr = &loop56 - loop offset
movi0 pr.rot = 1 << 16 // set rotating predicates
br b6 // jump to the appropriate loop
libc_hidden_builtin_def (memcpy)
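
/* Branch-offset table used by .src_not_aligned: entry n holds the
   distance from .loop56 back to the LOOP(8*n) kernel, so the branch
   target is formed above as loopaddr = ploop56 - table[src % 8] (the
   table is indexed with sh1 = 8 * (src % 8), one data8 entry per
   possible source misalignment).  The first entry is a dummy: a source
   that is already 8-byte aligned never takes this path.  */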
data8 0 // dummy entry
data8 .loop56 - .loop8
data8 .loop56 - .loop16
data8 .loop56 - .loop24
data8 .loop56 - .loop32
data8 .loop56 - .loop40
data8 .loop56 - .loop48
data8 .loop56 - .loop56