initial commit: a mess of assembly code
[fmap.git] / x86_64_sse2_x87 / pbf / blk / primblk / primgrp / dense / text.s
blob789aabf9459bba45db7f73d3090c766f95eb07d3
1 ; vim: set filetype=fasm foldmethod=marker commentstring=;%s colorcolumn=101 :
2 ; XXX: We actually used this code unit as a training ground. Expect it to be very overkill.
3 dense: namespace dense
4 ;{{{ init_once -------------------------------------------------------------------------------------
5 align_nops
6 init_once:
7 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:INIT_ONCE'
8 mov rbp, rsp
9 and rsp, not 0xf
11 xor edi, edi
12 mov rsi, nodes.bytes_n
13 mov rdx, 1b or 10b ; PROT_READ | PROT_WRITE
14 mov r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
15 xor r8d, r8d
16 xor r9d, r9d
17 mov rax, linux.mmap
18 syscall
19 cmp rax, linux.errno.last
20 jae .err_nodes_mmap_failed
21 mov [nodes], rax
22 mov [nodes.finish], rax
24 lea rdi, [nodes.msg.idx_nodes_n]
25 mov rsi, nodes.idx.n
26 call qword [libc.printf]
28 xor edi, edi
29 mov rsi, nodes.idx.bytes_n
30 mov rdx, 1b or 10b ; PROT_READ | PROT_WRITE
31 mov r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
32 xor r8d, r8d
33 xor r9d, r9d
34 mov rax, linux.mmap
35 syscall
36 cmp rax, linux.errno.last
37 jae .err_nodes_idx_mmap_failed
38 mov [nodes.idx], rax
39 add rax, nodes.idx.slot_t.bytes_n
40 mov [nodes.idx.finish], rax ; next availabe idx slot
42 stc ; return true
43 mov rsp, rbp
44 ret
45 .err_nodes_mmap_failed:
46 mov rdi, 2 ; stderr
47 mov rsi, rax ; errno
48 lea rsi, [nodes.msg.mmap_failed]
49 call qword [libc.dprintf]
50 clc ; return false
51 mov rsp, rbp
52 ret
53 .err_nodes_idx_mmap_failed:
54 mov rdi, 2 ; stderr
55 mov rsi, rax ; errno
56 lea rsi, [nodes.msg.idx_mmap_failed]
57 call qword [libc.dprintf]
58 clc ; return false
59 mov rsp, rbp
60 ret
61 ;}}} init_once -------------------------------------------------------------------------------------
62 ; local memcpy ABI
63 define memcpy_dst rbx ; memcpy_dst(out) = memcpy_dst(in) + memcpy_bytes_n
64 define memcpy_src r11 ; clobbered, don't care
65 define memcpy_bytes_n r9 ; clobbered, don't care
66 define memcpy_link rcx ; the return addr
67 ;===================================================================================================
68 ;{{{ parse -----------------------------------------------------------------------------------------
69 ; scratch ~ rax rcx r11 r9
70 ; from primgrp
71 define dense_p rbx ; in
72 define dense_p_finish r13 ; in
73 define flags r15
74 define granularity rbp ; f64
75 define strtbl_idx_p r8
76 define lat_of_lon_of xmm3 ; f64/f64
77 ; local
78 define id_p r12
79 define id_p_finish r14
80 define lat_p r10
81 define lon_p rdx
82 define keys_vals_p rsi ; can be 0
83 align_nops
84 parse:
85 xor keys_vals_p, keys_vals_p ; = 0 because may not have any (see specs)
86 align_nops
87 .next_key:
88 cmp dense_p, dense_p_finish
89 je unserialize
90 varint_load r11, dense_p ; = key
91 mov r9, r11 ; = key copy
92 shr r11, 3 ; = field num
93 cmp r11, 1
94 je .id_found
95 cmp r11, 8
96 je .lat_found
97 cmp r11, 9
98 je .lon_found
99 cmp r11, 10
100 je .keys_vals_found
101 and r9, 111b ; = field type
102 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:MSG:FIELD %lu OF TYPE %lu', r11, r9
103 val_skip dense_p, r9, r11
104 jmp .next_key
105 align_nops
106 .id_found:
107 varint_load r11, dense_p ; = packed_ids_sz
108 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:ID:start = %p:size = %lu(0x%lx) bytes', dense_p, r11, r11
109 mov id_p, dense_p
110 add dense_p, r11 ; dense_p + packed_ids_sz
111 mov id_p_finish, dense_p
112 jmp .next_key
113 align_nops
114 .lat_found:
115 varint_load r11, dense_p ;= packed_lat_sz
116 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:LAT:start = %p:size = %lu(0x%lx) bytes', dense_p, r11, r11
117 mov lat_p, dense_p
118 add dense_p, r11 ; dense_p + packed_lat_sz
119 jmp .next_key
120 align_nops
121 .lon_found:
122 varint_load r11, dense_p ;= packed_lon_sz
123 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:LON:start = %p:size = %lu(0x%lx) bytes', dense_p, r11, r11
124 mov lon_p, dense_p
125 add dense_p, r11 ; p + packed_lon_sz
126 jmp .next_key
127 align_nops
128 .keys_vals_found:
129 varint_load r11, dense_p ;= packed_keys_vals_sz
130 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:KEYS_VALS:start = %p:size = %lu(0x%lx) bytes', dense_p, r11, r11
131 mov keys_vals_p, dense_p
132 add dense_p, r11 ; dense_p + packed_keys_vals_sz
133 jmp .next_key
134 purge dense_p_finish
135 purge dense_p
136 ;}}} parse -----------------------------------------------------------------------------------------
137 ;{{{ unserialize -----------------------------------------------------------------------------------
138 ; scratch ~ rax rcx r11 r9 xmm7 xmm6 xmm5
139 ; from primgrp
140 ;define flags r15
141 ;define granularity rbp ; f64
142 ;define strtbl_idx_p r8
143 ;define lat_of_lon_of xmm3 ; f64/f64
144 ; local from parse:
145 ;define id_p r12
146 ;define id_p_finish r14
147 ;define lat_p r10
148 ;define lon_p rdx
149 ;define keys_vals_p rsi ; can be 0
150 ; local
151 define node_p rbx ; reuse dense_p
152 define prev_id r13 ; reuse dense_p_finish
153 define node_start_p rdi
154 define granularity_xmm xmm2
155 define prev_lat_prev_lon xmm1
156 define one_one xmm0
157 ; PERF
158 tsc_nodes TSC.STORAGE 'TSC:DENSE:%lu'
159 align_nops
160 unserialize:
161 mov node_p, [nodes.finish]
162 if ~SYSTEM_LEGACY
163 prefetchw [node_p] ; 2 cls should be very common
164 prefetchw [node_p + 64]
165 end if
166 pxor one_one, one_one ; construct the constant
167 pcmpeqd one_one, one_one ; xmm = 0xfffff... (128bits)
168 psrlq one_one, 63
170 xor prev_id, prev_id ; = 0
171 pxor prev_lat_prev_lon, prev_lat_prev_lon ; = 0
173 movq granularity_xmm, granularity
174 punpcklqdq granularity_xmm, granularity_xmm
175 ; PERF
176 TSC.SAMPLE tsc_nodes, start
177 align_nops
178 .next_id:
179 cmp id_p, id_p_finish
180 je .epilog
181 inc qword [nodes.n]
182 ;---------------------------------------------------------------------------------------------------
183 ; no-branch vector zigzag decoding on sse (no avx2 coze our dev laptops)
184 ; encoded mod 2 == 1 : < 0 : -(encoded/2) - 1 = decoded
185 ; encoded mod 2 == 0 ; >= 0 : encoded/2 = decoded
186 if ~SYSTEM_LEGACY
187 varint_load r11, lat_p
188 pinsrq xmm5, r11, 1 ; lat_lon int64/int64
189 varint_load r11, lon_p
190 pinsrq xmm5, r11, 0 ; lat_lon int64/int64
191 else
192 varint_load r11, lat_p
193 movq xmm7, r11
194 varint_load r11, lon_p
195 movq xmm5, r11 ; lat_lon int64/int64
196 punpcklqdq xmm5, xmm7 ; = lat_lon int64/int64
197 end if
198 movdqa xmm7, xmm5
199 psrlq xmm5, 1 ; i >> 1, i = lat_lon int64/int64
200 pand xmm7, one_one ; i & 1, i = lat_lon int64/int64
201 xorps xmm6, xmm6 ; = 0
202 psubq xmm6, xmm7 ; 0 - (i & 1) = - (i & 1)
203 pxor xmm5, xmm6 ; (i >> 1) ^ -(i & 1) = DELTA GEO OF granularity units for lat_lon int64/int64
204 paddq prev_lat_prev_lon, xmm5 ; = PREV GEO OF = GEO OF granularity units
205 ;---------------------------------------------------------------------------------------------------
206 ; we can switch now from int64/int64 to f64/f64 since we finished decoding GEO coords
208 ; lon granularity units int64->f64 -- START
209 movq rax, prev_lat_prev_lon
210 cvtsi2sd xmm7, rax
211 ; lon granularity units int64->f64 -- END
213 ; lat granularity units int64->f64 -- START
214 if ~SYSTEM_LEGACY
215 pextrq rax, prev_lat_prev_lon, 1 ; = lat granularity units
216 else
217 movdqa xmm6, prev_lat_prev_lon
218 punpckhqdq xmm6, xmm6 ; DESTRUCTIVE:duplicate xmm high dq in xmm low dq, namely lat
219 movq rax, xmm6 ; = lat granularity units
220 end if
221 cvtsi2sd xmm6, rax
222 ; lat granularity units int64->f64 -- END
224 unpcklpd xmm7, xmm6 ; = lat_lon f64/f64
226 mulpd xmm7, granularity_xmm ; lat_lon granularity units -> lat_lon nanodegrees f64/f64
227 addpd xmm7, lat_of_lon_of ; lat_lon full nanodegrees f64/f64
229 ; cannot use non-temporal store here due to mis-alignment
230 movupd xword [node_p + nodes.node_t.geo], xmm7 ; little endian storage
231 ;---------------------------------------------------------------------------------------------------
232 mov node_start_p, node_p ; save the start of the node since are going to modify node_p
234 test keys_vals_p, keys_vals_p
235 jnz keys_vals_cpy ; unlikely because (key/val)s are more likely on ways/relations
236 ; insert an empty key/terminator while updating node_p
237 add node_p, nodes.node_t.keys_vals + 1 ; room for the 0 terminator (= empty key)
238 mov byte [node_p - 1], 0 ; insert the 0 terminator
239 align_nops
240 .node_next_compute:
241 ; here, node_p points of the finish byte, aka the next node
242 mov qword [node_start_p + nodes.node_t.next], node_p
243 mov [nodes.finish], node_p ; save the finish/next node
244 ;---------------------------------------------------------------------------------------------------
245 ; decode the id
246 varint_load r11, id_p ; load/skip the raw zigzag delta_id
247 mov rax, r11
248 shr r11, 1 ; i >> 1
249 and rax, 1 ; i & 1
250 neg rax ; -(i & 1)
251 xor r11, rax ; (i >> 1) ^ -(i & 1) ; = delta_id
252 add prev_id, r11 ; = prev_id = id
253 ;{{{ idx_insert_node -------------------------------------------------------------------------------
254 ; scratch ~ rax xmm7 xmm6
255 ; from primgrp
256 ;define flags r15
257 ;define granularity rbp ; f64
258 ;define strtbl_idx_p r8
259 ;define lat_of_lon_of xmm3 ; f64/f64
260 ; local from parse:
261 ;define id_p r12
262 ;define id_p_finish r14
263 ;define lat_p r10
264 ;define lon_p rdx
265 ;define keys_vals_p rsi ; can be 0
266 ; local from unserialize
267 ;define node_p rbx
268 ;define prev_id r13 ; = id
269 ;define node_start_p rdi
270 ;define granularity_xmm xmm2
271 ;define prev_lat_prev_lon xmm1
272 ;define one_one xmm0
273 ;local
274 define id prev_id
275 define idx_finish_p r9
276 define id_bit_idx keys_vals_p ; spill
277 define bit_val_select rcx
278 define bit_val_select_b cl
279 define id_bit_idx_msb lon_p ; spill
280 define idx_slot_p r11
281 idx_insert_node:
282 mov idx_slot_p, [nodes.idx]
283 mov idx_finish_p, [nodes.idx.finish]
284 movq xmm7, id_bit_idx_msb ; spill
285 movq xmm6, id_bit_idx ; spill
286 TRACE_NODE_IDX 'id=%#016lx nodes.idx=%p nodes.finish=%p', id, idx_slot_p, idx_finish_p
287 xor id_bit_idx, id_bit_idx
288 xor bit_val_select, bit_val_select
289 mov id_bit_idx_msb, -1 ; this is to make it work with id = 0
290 bsr id_bit_idx_msb, id ; if id = 0, id_bit_idx_msb is untouched namely -1
291 TRACE_NODE_IDX 'id_bit_idx_msb=%lu', id_bit_idx_msb
292 align_nops
293 .next_id_bit:
294 TRACE_NODE_IDX 'id_bit_idx=%lu', id_bit_idx
295 cmp id_bit_idx_msb, id_bit_idx
296 jb .insert_here
297 bt id, id_bit_idx
298 setc bit_val_select_b ; bit_val_select = 1 if id[id_bit_idx] = 1, else 0
299 TRACE_NODE_IDX 'bit_val_select=%lu', bit_val_select
300 mov rax, [idx_slot_p + 8 * bit_val_select]
301 TRACE_NODE_IDX 'slot=%p bit slot=%p', idx_slot_p, rax
302 test rax, rax
303 jnz .idx_existing_slot
304 TRACE_NODE_IDX 'non existing slot using finish=%p', idx_finish_p
305 mov rax, idx_finish_p
306 add idx_finish_p, nodes.idx.slot_t.bytes_n ; we could zero the mem here, but mmap does it for us
307 if ~SYSTEM_LEGACY
308 prefetchw [idx_finish_p + 64 * 2] ; try to get ready
309 prefetchw [idx_finish_p + 64 * 3]
310 end if
311 mov [idx_slot_p + 8 * bit_val_select], rax
312 align_nops
313 .idx_existing_slot:
314 mov idx_slot_p, rax
315 TRACE_NODE_IDX 'next_slot=%p', idx_slot_p
316 inc id_bit_idx
318 jmp .next_id_bit
319 align_nops
320 .insert_here:
321 if TRACE_NODE_IDX_ENABLED
322 mov rax, [idx_slot_p + nodes.idx.slot_t.node]
323 test rax, rax
324 jz .node_slot_available
325 TRACE_NODE_IDX 'WARNING: overwritting an existing node=%p', rax
326 .node_slot_available:
327 TRACE_NODE_IDX 'inserting node=%p in slot=%p', node_start_p, idx_slot_p
328 end if
329 mov [idx_slot_p + nodes.idx.slot_t.node], node_start_p
330 mov [nodes.idx.finish], idx_finish_p
331 movq id_bit_idx_msb, xmm7 ; unspill
332 movq id_bit_idx, xmm6 ; unspill
333 purge idx_slot_p
334 purge id_bit_idx_msb
335 purge bit_val_select_b
336 purge bit_val_select
337 purge id_bit_idx
338 purge idx_finish_p
339 purge id
340 ;---------------------------------------------------------------------------------------------------
341 ; XXX: if one day we need to be backed by a file we will need to DONT_NEED "madvise" the
342 ; the current node memory
343 jmp unserialize.next_id
344 ;}}} idx_insert_node -------------------------------------------------------------------------------
345 align_nops
346 unserialize.epilog:
347 ; PERF -- START
348 TSC.SAMPLE tsc_nodes, stop
349 TSC.PRINT tsc_nodes
350 ; PERF -- STOP
351 jmp primgrp.parse.return_from_dense
352 ;}}} unserialize -----------------------------------------------------------------------------------
353 ;{{{ keys_vals_cpy(unlikely) ----------------------------------------------------------------------
354 ; scratch ~ rax rcx r11 r9 xmm7 xmm6 xmm5 xmm4
355 ; from primgrp
356 ;define flags r15
357 ;define granularity rbp ; f64
358 ;define strtbl_idx_p r8
359 ;define lat_of_lon_of xmm3 ; f64/f64
360 ; local from parse:
361 ;define id_p r12
362 ;define id_p_finish r14
363 ;define lat_p r10
364 ;define lon_p rdx
365 ;define keys_vals_p rsi ; can be 0
366 ; local from unserialize
367 ;define node_p rbx
368 ;define prev_id r13
369 ;define node_start_p rdi
370 ;define granularity_xmm xmm2
371 ;define prev_lat_prev_lon xmm1
372 ;define one_one xmm0
373 align_nops
374 keys_vals_cpy:
375 add node_p, nodes.node_t.keys_vals ; = key_dst_addr
376 align_nops
377 .next_key_val:
378 varint_load r11, keys_vals_p ; load/skip the int32 key_id/end marker
379 test r11, r11 ; int32 key_id == 0 ?
380 jz .epilog ; no more key=val for this node
381 mov r11, [strtbl_idx_p + 8 * r11] ; = key_src_addr
382 varint_load r9, r11 ; r9 = key_bytes_n, r11 = key_src_addr
383 mov [node_p], r9b ; store the byte of key_bytes_n at the start of the str
384 inc node_p
385 ; memcpy_dst = node_p(rbx), memcpy_src = key_src_addr(r11), memcpy_bytes_n = key_bytes_n(r9)
386 lea memcpy_link, [.key_cpy_done] ; where memcpy will jmp back
387 jmp memcpy ; node_p = node_p + key_bytes_n
388 align_nops
389 .key_cpy_done:
390 ; node_p = val_dst_addr
391 varint_load r11, keys_vals_p ; load/skip the int32 val_id
392 mov r11, [strtbl_idx_p + 8 * r11] ; = val_src_addr
393 varint_load r9, r11; r9 = val_bytes_n, r11 = val_src_addr
394 mov [node_p], r9w ; store the short of val_bytes_n at the start of the str
395 add node_p, 2
396 ; memcpy_dst = node_p(rbx), memcpy_src = val_src_addr(r11), memcpy_bytes_n = val_bytes_n(r9)
397 lea memcpy_link, [.next_key_val] ; where memcpy will jmp back
398 jmp memcpy ; node_p = node_p + val_bytes_n
399 align_nops
400 .epilog:
401 mov byte [node_p], 0 ; insert the 0-sized key terminator
402 inc node_p ; node_p now points on the next node, aka current node finish byte
403 jmp unserialize.node_next_compute
404 purge one_one
405 purge prev_lat_prev_lon
406 purge granularity_xmm
407 purge node_start_p
408 purge prev_id
409 purge node_p
410 purge keys_vals_p
411 purge lon_p
412 purge lat_p
413 purge id_p_finish
414 purge id_p
415 purge lat_of_lon_of
416 purge strtbl_idx_p
417 purge strtbl_idx_p
418 purge granularity
419 purge flags
420 ;}}} keys_vals_cpy ---------------------------------------------------------------------------------
421 ;{{{ local memcpy(sse) -----------------------------------------------------------------------------
422 ; XXX: lddqu performs aligned reads, carefull where you use this, but should be fine with classic
423 ; paginated memory.
424 ; scratch ~ rax xmm7 xmm6 xmm5 xmm4
425 align_nops
426 memcpy:
427 cmp memcpy_bytes_n, 16 * 4
428 jb .below64
429 ; since we can be cache line mis-aligned, speculatively do prefetch 2 cls ahead
430 prefetchnta [memcpy_src + 64 * 2] ; speculative non-temporal "3rd" cache line ahead
431 prefetchnta [memcpy_src + 64 * 3] ; speculative non-temporal "4th" cache line ahead
432 if ~SYSTEM_LEGACY
433 prefetchw [memcpy_dst + 64 * 2]
434 prefetchw [memcpy_dst + 64 * 3]
435 end if
436 lddqu xmm7, [memcpy_src + 16 * 0]
437 lddqu xmm6, [memcpy_src + 16 * 1]
438 lddqu xmm5, [memcpy_src + 16 * 2]
439 lddqu xmm4, [memcpy_src + 16 * 3]
441 movdqu [memcpy_dst + 16 * 0], xmm7
442 movdqu [memcpy_dst + 16 * 1], xmm6
443 movdqu [memcpy_dst + 16 * 2], xmm5
444 movdqu [memcpy_dst + 16 * 3], xmm4
446 add memcpy_dst, 16 * 4
447 sub memcpy_bytes_n, 16 * 4
448 jz .done
449 add memcpy_src, 16 * 4
450 jmp memcpy
451 align_nops
452 .below64:
453 cmp memcpy_bytes_n, 16 * 3
454 jb .below48
456 lddqu xmm7, [memcpy_src + 16 * 0]
457 lddqu xmm6, [memcpy_src + 16 * 1]
458 lddqu xmm5, [memcpy_src + 16 * 2]
460 movdqu [memcpy_dst + 16 * 0], xmm7
461 movdqu [memcpy_dst + 16 * 1], xmm6
462 movdqu [memcpy_dst + 16 * 2], xmm5
464 add memcpy_dst, 16 * 3
465 sub memcpy_bytes_n, 16 * 3
466 jz .done
467 add memcpy_src, 16 * 3
468 ; fall-thru
469 align_nops
470 .below48:
471 cmp memcpy_bytes_n, 16 * 2
472 jb .below32
474 lddqu xmm7, [memcpy_src + 16 * 0]
475 lddqu xmm6, [memcpy_src + 16 * 1]
477 movdqu [memcpy_dst + 16 * 0], xmm7
478 movdqu [memcpy_dst + 16 * 1], xmm6
480 add memcpy_dst, 16 * 2
481 sub memcpy_bytes_n, 16 * 2
482 jz .done
483 add memcpy_src, 16 * 2
484 ; fall-thru
485 align_nops
486 .below32:
487 cmp memcpy_bytes_n, 16 * 1
488 jb .below16
490 lddqu xmm7, [memcpy_src + 16 * 0]
491 movdqu [memcpy_dst + 16 * 0], xmm7
493 add memcpy_dst, 16 * 1
494 sub memcpy_bytes_n, 16 * 1
495 jz .done
496 add memcpy_src, 16 * 1
497 ; fall-thru
498 align_nops
499 .below16:
500 cmp memcpy_bytes_n, 8 * 1
501 jb .below8
502 mov rax, [memcpy_src + 8 * 0]
503 mov [memcpy_dst + 8 * 0], rax
505 add memcpy_dst, 8 * 1
506 sub memcpy_bytes_n, 8 * 1
507 jz .done
508 add memcpy_src, 8 * 1
509 ; fall-thru
510 align_nops
511 .below8:
512 cmp memcpy_bytes_n, 7
513 je .cpy7
514 cmp memcpy_bytes_n, 6
515 je .cpy6
516 cmp memcpy_bytes_n, 5
517 je .cpy5
518 cmp memcpy_bytes_n, 4
519 je .cpy4
520 cmp memcpy_bytes_n, 3
521 je .cpy3
522 cmp memcpy_bytes_n, 2
523 je .cpy2
524 cmp memcpy_bytes_n, 1
525 je .cpy1
526 jmp memcpy_link
527 align_nops
528 .cpy7:
529 mov al, [memcpy_src]
530 mov [memcpy_dst], al
531 inc memcpy_src
532 inc memcpy_dst
533 align_nops
534 .cpy6:
535 mov al, [memcpy_src]
536 mov [memcpy_dst], al
537 inc memcpy_src
538 inc memcpy_dst
539 align_nops
540 .cpy5:
541 mov al, [memcpy_src]
542 mov [memcpy_dst], al
543 inc memcpy_src
544 inc memcpy_dst
545 align_nops
546 .cpy4:
547 mov al, [memcpy_src]
548 mov [memcpy_dst], al
549 inc memcpy_src
550 inc memcpy_dst
551 align_nops
552 .cpy3:
553 mov al, [memcpy_src]
554 mov [memcpy_dst], al
555 inc memcpy_src
556 inc memcpy_dst
557 align_nops
558 .cpy2:
559 mov al, [memcpy_src]
560 mov [memcpy_dst], al
561 inc memcpy_src
562 inc memcpy_dst
563 align_nops
564 .cpy1:
565 mov al, [memcpy_src]
566 mov [memcpy_dst], al
567 inc memcpy_src
568 inc memcpy_dst
569 align_nops
570 .done:
571 jmp memcpy_link
572 ;}}} local memcpy(sse) -----------------------------------------------------------------------------
573 ;===================================================================================================
574 purge memcpy_dst
575 purge memcpy_src
576 purge memcpy_bytes_n
577 purge memcpy_link
578 ;{{{ macros ----------------------------------------------------------------------------------------
579 if TRACE_NODE_IDX_ENABLED
580 macro TRACE_NODE_IDX fmt, regs&
581 TRACE_PREFIX 'NODE_IDX', fmt, regs
582 end macro
583 else
584 macro TRACE_NODE_IDX fmt, regs&
585 end macro
586 end if
587 ;}}} macros ----------------------------------------------------------------------------------------
588 end namespace ; dense