1 ; vim
: set filetype
=fasm foldmethod
=marker commentstring
=;
%s colorcolumn
=101 :
2 ; XXX
: We actually used this code unit as
a training ground. Expect it to
be very overkill.
4 ;
{{{ init_once
-------------------------------------------------------------------------------------
7 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:INIT_ONCE'
12 mov rsi
, nodes.bytes_n
13 mov rdx
, 1b or 10b ; PROT_READ | PROT_WRITE
14 mov
r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
19 cmp rax
, linux.errno.last
20 jae
.err_nodes_mmap_failed
22 mov
[nodes.finish
], rax
24 lea rdi
, [nodes.msg.idx_nodes_n
]
26 call qword
[libc.printf
]
29 mov rsi
, nodes.idx.bytes_n
30 mov rdx
, 1b or 10b ; PROT_READ | PROT_WRITE
31 mov
r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
36 cmp rax
, linux.errno.last
37 jae
.err_nodes_idx_mmap_failed
39 add rax
, nodes.idx.slot_t.bytes_n
40 mov
[nodes.idx.finish
], rax ; next availabe idx slot
45 .err_nodes_mmap_failed:
48 lea rsi
, [nodes.msg.mmap_failed
]
49 call qword
[libc.dprintf
]
53 .err_nodes_idx_mmap_failed:
56 lea rsi
, [nodes.msg.idx_mmap_failed
]
57 call qword
[libc.dprintf
]
61 ;
}}} init_once
-------------------------------------------------------------------------------------
63 define memcpy_dst rbx ; memcpy_dst
(out
) = memcpy_dst
(in
) + memcpy_bytes_n
64 define memcpy_src
r11 ; clobbered
, don
't care
65 define memcpy_bytes_n r9 ; clobbered, don't care
66 define memcpy_link rcx ; the return addr
67 ;
===================================================================================================
68 ;
{{{ parse
-----------------------------------------------------------------------------------------
69 ; scratch ~ rax rcx
r11 r9
71 define dense_p rbx ; in
72 define dense_p_finish
r13 ; in
74 define granularity rbp ; f64
75 define strtbl_idx_p
r8
76 define lat_of_lon_of xmm3 ; f64
/f64
79 define id_p_finish
r14
82 define keys_vals_p rsi ; can
be 0
85 xor keys_vals_p
, keys_vals_p ;
= 0 because may
not have any
(see specs
)
88 cmp dense_p
, dense_p_finish
90 varint_load
r11, dense_p ;
= key
91 mov
r9, r11 ;
= key copy
92 shr
r11, 3 ;
= field num
101 and r9, 111b ;
= field type
102 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:MSG:FIELD %lu OF TYPE %lu', r11, r9
103 val_skip dense_p
, r9, r11
107 varint_load
r11, dense_p ;
= packed_ids_sz
108 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:ID:start = %p:size = %lu(0x%lx) bytes', dense_p
, r11, r11
110 add dense_p
, r11 ; dense_p
+ packed_ids_sz
111 mov id_p_finish
, dense_p
115 varint_load
r11, dense_p ;
= packed_lat_sz
116 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:LAT:start = %p:size = %lu(0x%lx) bytes', dense_p
, r11, r11
118 add dense_p
, r11 ; dense_p
+ packed_lat_sz
122 varint_load
r11, dense_p ;
= packed_lon_sz
123 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:LON:start = %p:size = %lu(0x%lx) bytes', dense_p
, r11, r11
125 add dense_p
, r11 ; p
+ packed_lon_sz
129 varint_load
r11, dense_p ;
= packed_keys_vals_sz
130 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:DENSE:KEYS_VALS:start = %p:size = %lu(0x%lx) bytes', dense_p
, r11, r11
131 mov keys_vals_p
, dense_p
132 add dense_p
, r11 ; dense_p
+ packed_keys_vals_sz
136 ;
}}} parse
-----------------------------------------------------------------------------------------
137 ;
{{{ unserialize
-----------------------------------------------------------------------------------
138 ; scratch ~ rax rcx
r11 r9 xmm7 xmm6 xmm5
141 ;define granularity rbp ; f64
142 ;define strtbl_idx_p
r8
143 ;define lat_of_lon_of xmm3 ; f64
/f64
146 ;define id_p_finish
r14
149 ;define keys_vals_p rsi ; can
be 0
151 define node_p rbx ; reuse dense_p
152 define prev_id
r13 ; reuse dense_p_finish
153 define node_start_p rdi
154 define granularity_xmm xmm2
155 define prev_lat_prev_lon xmm1
158 tsc_nodes TSC.STORAGE
'TSC:DENSE:%lu'
161 mov node_p
, [nodes.finish
]
163 prefetchw
[node_p
] ;
2 cls should
be very common
164 prefetchw
[node_p
+ 64]
166 pxor one_one
, one_one ; construct the constant
167 pcmpeqd one_one
, one_one ; xmm
= 0xfffff... (128bits)
170 xor prev_id
, prev_id ;
= 0
171 pxor prev_lat_prev_lon
, prev_lat_prev_lon ;
= 0
173 movq granularity_xmm
, granularity
174 punpcklqdq granularity_xmm
, granularity_xmm
176 TSC.SAMPLE tsc_nodes
, start
179 cmp id_p
, id_p_finish
182 ;
---------------------------------------------------------------------------------------------------
183 ; no-branch vector zigzag decoding on sse
(no avx2 coze our dev laptops
)
184 ; encoded mod
2 == 1 : < 0 : -(encoded
/2) - 1 = decoded
185 ; encoded mod
2 == 0 ;
>= 0 : encoded
/2 = decoded
187 varint_load
r11, lat_p
188 pinsrq xmm5
, r11, 1 ; lat_lon int64
/int64
189 varint_load
r11, lon_p
190 pinsrq xmm5
, r11, 0 ; lat_lon int64
/int64
192 varint_load
r11, lat_p
194 varint_load
r11, lon_p
195 movq xmm5
, r11 ; lat_lon int64
/int64
196 punpcklqdq xmm5
, xmm7 ;
= lat_lon int64
/int64
199 psrlq xmm5
, 1 ; i
>> 1, i
= lat_lon int64
/int64
200 pand xmm7
, one_one ; i
& 1, i
= lat_lon int64
/int64
201 xorps xmm6
, xmm6 ;
= 0
202 psubq xmm6
, xmm7 ;
0 - (i
& 1) = - (i
& 1)
203 pxor xmm5
, xmm6 ;
(i
>> 1) ^
-(i
& 1) = DELTA GEO OF granularity units for lat_lon int64
/int64
204 paddq prev_lat_prev_lon
, xmm5 ;
= PREV GEO OF
= GEO OF granularity units
205 ;
---------------------------------------------------------------------------------------------------
206 ; we can switch now from int64
/int64 to f64
/f64 since we finished decoding GEO coords
208 ; lon granularity units int64-
>f64
-- START
209 movq rax
, prev_lat_prev_lon
211 ; lon granularity units int64-
>f64
-- END
213 ; lat granularity units int64-
>f64
-- START
215 pextrq rax
, prev_lat_prev_lon
, 1 ;
= lat granularity units
217 movdqa xmm6
, prev_lat_prev_lon
218 punpckhqdq xmm6
, xmm6 ; DESTRUCTIVE
:duplicate xmm high dq in xmm low dq
, namely lat
219 movq rax
, xmm6 ;
= lat granularity units
222 ; lat granularity units int64-
>f64
-- END
224 unpcklpd xmm7
, xmm6 ;
= lat_lon f64
/f64
226 mulpd xmm7
, granularity_xmm ; lat_lon granularity units
-> lat_lon nanodegrees f64
/f64
227 addpd xmm7
, lat_of_lon_of ; lat_lon full nanodegrees f64
/f64
229 ; cannot use non-temporal store here due to mis-alignment
230 movupd xword
[node_p
+ nodes.node_t.geo
], xmm7 ; little endian storage
231 ;
---------------------------------------------------------------------------------------------------
232 mov node_start_p
, node_p ; save the start of the node since are going to modify node_p
234 test keys_vals_p
, keys_vals_p
235 jnz keys_vals_cpy ; unlikely because
(key
/val
)s are more likely on ways
/relations
236 ; insert an empty key
/terminator while updating node_p
237 add node_p
, nodes.node_t.keys_vals
+ 1 ; room for the
0 terminator
(= empty key
)
238 mov byte
[node_p
- 1], 0 ; insert the
0 terminator
241 ; here
, node_p points of the finish byte
, aka the next node
242 mov qword
[node_start_p
+ nodes.node_t.next
], node_p
243 mov
[nodes.finish
], node_p ; save the finish
/next node
244 ;
---------------------------------------------------------------------------------------------------
246 varint_load
r11, id_p ; load
/skip the raw zigzag delta_id
251 xor r11, rax ;
(i
>> 1) ^
-(i
& 1) ;
= delta_id
252 add prev_id
, r11 ;
= prev_id
= id
253 ;
{{{ idx_insert_node
-------------------------------------------------------------------------------
254 ; scratch ~ rax xmm7 xmm6
257 ;define granularity rbp ; f64
258 ;define strtbl_idx_p
r8
259 ;define lat_of_lon_of xmm3 ; f64
/f64
262 ;define id_p_finish
r14
265 ;define keys_vals_p rsi ; can
be 0
266 ; local from unserialize
268 ;define prev_id
r13 ;
= id
269 ;define node_start_p rdi
270 ;define granularity_xmm xmm2
271 ;define prev_lat_prev_lon xmm1
275 define idx_finish_p
r9
276 define id_bit_idx keys_vals_p ; spill
277 define bit_val_select rcx
278 define bit_val_select_b cl
279 define id_bit_idx_msb lon_p ; spill
280 define idx_slot_p
r11
282 mov idx_slot_p
, [nodes.idx
]
283 mov idx_finish_p
, [nodes.idx.finish
]
284 movq xmm7
, id_bit_idx_msb ; spill
285 movq xmm6
, id_bit_idx ; spill
286 TRACE_NODE_IDX
'id=%#016lx nodes.idx=%p nodes.finish=%p', id
, idx_slot_p
, idx_finish_p
287 xor id_bit_idx
, id_bit_idx
288 xor bit_val_select
, bit_val_select
289 mov id_bit_idx_msb
, -1 ; this is to make it work with id
= 0
290 bsr id_bit_idx_msb
, id ; if id
= 0, id_bit_idx_msb is untouched namely
-1
291 TRACE_NODE_IDX
'id_bit_idx_msb=%lu', id_bit_idx_msb
294 TRACE_NODE_IDX
'id_bit_idx=%lu', id_bit_idx
295 cmp id_bit_idx_msb
, id_bit_idx
298 setc bit_val_select_b ; bit_val_select
= 1 if id
[id_bit_idx
] = 1, else
0
299 TRACE_NODE_IDX
'bit_val_select=%lu', bit_val_select
300 mov rax
, [idx_slot_p
+ 8 * bit_val_select
]
301 TRACE_NODE_IDX
'slot=%p bit slot=%p', idx_slot_p
, rax
303 jnz
.idx_existing_slot
304 TRACE_NODE_IDX
'non existing slot using finish=%p', idx_finish_p
305 mov rax
, idx_finish_p
306 add idx_finish_p
, nodes.idx.slot_t.bytes_n ; we could zero the mem here
, but mmap does it for us
308 prefetchw
[idx_finish_p
+ 64 * 2] ; try to get ready
309 prefetchw
[idx_finish_p
+ 64 * 3]
311 mov
[idx_slot_p
+ 8 * bit_val_select
], rax
315 TRACE_NODE_IDX
'next_slot=%p', idx_slot_p
321 if TRACE_NODE_IDX_ENABLED
322 mov rax
, [idx_slot_p
+ nodes.idx.slot_t.node
]
324 jz
.node_slot_available
325 TRACE_NODE_IDX
'WARNING: overwritting an existing node=%p', rax
326 .node_slot_available:
327 TRACE_NODE_IDX
'inserting node=%p in slot=%p', node_start_p
, idx_slot_p
329 mov
[idx_slot_p
+ nodes.idx.slot_t.node
], node_start_p
330 mov
[nodes.idx.finish
], idx_finish_p
331 movq id_bit_idx_msb
, xmm7 ; unspill
332 movq id_bit_idx
, xmm6 ; unspill
335 purge bit_val_select_b
340 ;
---------------------------------------------------------------------------------------------------
341 ; XXX
: if one day we need to
be backed by
a file we will need to DONT_NEED
"madvise" the
342 ; the current node memory
343 jmp unserialize.next_id
344 ;
}}} idx_insert_node
-------------------------------------------------------------------------------
348 TSC.SAMPLE tsc_nodes
, stop
351 jmp primgrp.parse.return_from_dense
352 ;
}}} unserialize
-----------------------------------------------------------------------------------
353 ;
{{{ keys_vals_cpy
(unlikely
) ----------------------------------------------------------------------
354 ; scratch ~ rax rcx
r11 r9 xmm7 xmm6 xmm5 xmm4
357 ;define granularity rbp ; f64
358 ;define strtbl_idx_p
r8
359 ;define lat_of_lon_of xmm3 ; f64
/f64
362 ;define id_p_finish
r14
365 ;define keys_vals_p rsi ; can
be 0
366 ; local from unserialize
369 ;define node_start_p rdi
370 ;define granularity_xmm xmm2
371 ;define prev_lat_prev_lon xmm1
375 add node_p
, nodes.node_t.keys_vals ;
= key_dst_addr
378 varint_load
r11, keys_vals_p ; load
/skip the int32 key_id
/end marker
379 test
r11, r11 ; int32 key_id
== 0 ?
380 jz
.epilog ; no more key=val for this node
381 mov
r11, [strtbl_idx_p
+ 8 * r11] ;
= key_src_addr
382 varint_load
r9, r11 ;
r9 = key_bytes_n
, r11 = key_src_addr
383 mov
[node_p
], r9b ; store the byte of key_bytes_n at the start of the str
385 ; memcpy_dst
= node_p
(rbx
), memcpy_src
= key_src_addr
(r11), memcpy_bytes_n
= key_bytes_n
(r9)
386 lea memcpy_link
, [.key_cpy_done] ; where memcpy will jmp back
387 jmp memcpy ; node_p
= node_p
+ key_bytes_n
390 ; node_p
= val_dst_addr
391 varint_load
r11, keys_vals_p ; load
/skip the int32 val_id
392 mov
r11, [strtbl_idx_p
+ 8 * r11] ;
= val_src_addr
393 varint_load
r9, r11;
r9 = val_bytes_n
, r11 = val_src_addr
394 mov
[node_p
], r9w ; store the short of val_bytes_n at the start of the str
396 ; memcpy_dst
= node_p
(rbx
), memcpy_src
= val_src_addr
(r11), memcpy_bytes_n
= val_bytes_n
(r9)
397 lea memcpy_link
, [.next_key_val] ; where memcpy will jmp back
398 jmp memcpy ; node_p
= node_p
+ val_bytes_n
401 mov byte
[node_p
], 0 ; insert the
0-sized key terminator
402 inc node_p ; node_p now points on the next node
, aka current node finish byte
403 jmp unserialize.node_next_compute
405 purge prev_lat_prev_lon
406 purge granularity_xmm
420 ;
}}} keys_vals_cpy
---------------------------------------------------------------------------------
421 ;
{{{ local memcpy
(sse
) -----------------------------------------------------------------------------
422 ; XXX
: lddqu performs aligned reads
, carefull where you use this
, but should
be fine with classic
424 ; scratch ~ rax xmm7 xmm6 xmm5 xmm4
427 cmp memcpy_bytes_n
, 16 * 4
429 ; since we can
be cache line mis-aligned
, speculatively do prefetch
2 cls ahead
430 prefetchnta
[memcpy_src
+ 64 * 2] ; speculative non-temporal
"3rd" cache line ahead
431 prefetchnta
[memcpy_src
+ 64 * 3] ; speculative non-temporal
"4th" cache line ahead
433 prefetchw
[memcpy_dst
+ 64 * 2]
434 prefetchw
[memcpy_dst
+ 64 * 3]
436 lddqu xmm7
, [memcpy_src
+ 16 * 0]
437 lddqu xmm6
, [memcpy_src
+ 16 * 1]
438 lddqu xmm5
, [memcpy_src
+ 16 * 2]
439 lddqu xmm4
, [memcpy_src
+ 16 * 3]
441 movdqu
[memcpy_dst
+ 16 * 0], xmm7
442 movdqu
[memcpy_dst
+ 16 * 1], xmm6
443 movdqu
[memcpy_dst
+ 16 * 2], xmm5
444 movdqu
[memcpy_dst
+ 16 * 3], xmm4
446 add memcpy_dst
, 16 * 4
447 sub memcpy_bytes_n
, 16 * 4
449 add memcpy_src
, 16 * 4
453 cmp memcpy_bytes_n
, 16 * 3
456 lddqu xmm7
, [memcpy_src
+ 16 * 0]
457 lddqu xmm6
, [memcpy_src
+ 16 * 1]
458 lddqu xmm5
, [memcpy_src
+ 16 * 2]
460 movdqu
[memcpy_dst
+ 16 * 0], xmm7
461 movdqu
[memcpy_dst
+ 16 * 1], xmm6
462 movdqu
[memcpy_dst
+ 16 * 2], xmm5
464 add memcpy_dst
, 16 * 3
465 sub memcpy_bytes_n
, 16 * 3
467 add memcpy_src
, 16 * 3
471 cmp memcpy_bytes_n
, 16 * 2
474 lddqu xmm7
, [memcpy_src
+ 16 * 0]
475 lddqu xmm6
, [memcpy_src
+ 16 * 1]
477 movdqu
[memcpy_dst
+ 16 * 0], xmm7
478 movdqu
[memcpy_dst
+ 16 * 1], xmm6
480 add memcpy_dst
, 16 * 2
481 sub memcpy_bytes_n
, 16 * 2
483 add memcpy_src
, 16 * 2
487 cmp memcpy_bytes_n
, 16 * 1
490 lddqu xmm7
, [memcpy_src
+ 16 * 0]
491 movdqu
[memcpy_dst
+ 16 * 0], xmm7
493 add memcpy_dst
, 16 * 1
494 sub memcpy_bytes_n
, 16 * 1
496 add memcpy_src
, 16 * 1
500 cmp memcpy_bytes_n
, 8 * 1
502 mov rax
, [memcpy_src
+ 8 * 0]
503 mov
[memcpy_dst
+ 8 * 0], rax
505 add memcpy_dst
, 8 * 1
506 sub memcpy_bytes_n
, 8 * 1
508 add memcpy_src
, 8 * 1
512 cmp memcpy_bytes_n
, 7
514 cmp memcpy_bytes_n
, 6
516 cmp memcpy_bytes_n
, 5
518 cmp memcpy_bytes_n
, 4
520 cmp memcpy_bytes_n
, 3
522 cmp memcpy_bytes_n
, 2
524 cmp memcpy_bytes_n
, 1
572 ;
}}} local memcpy
(sse
) -----------------------------------------------------------------------------
573 ;
===================================================================================================
578 ;
{{{ macros
----------------------------------------------------------------------------------------
579 if TRACE_NODE_IDX_ENABLED
580 macro TRACE_NODE_IDX fmt
, regs
&
581 TRACE_PREFIX
'NODE_IDX', fmt
, regs
584 macro TRACE_NODE_IDX fmt
, regs
&
587 ;
}}} macros
----------------------------------------------------------------------------------------
588 end namespace ; dense