1 ; vim
: set filetype
=fasm foldmethod
=marker commentstring
=;
%s colorcolumn
=101 :
2 ; XXX
: We actually used this code unit as
a training ground. Expect it to
be very overkill.
4 ;
{{{ init_once
-------------------------------------------------------------------------------------
7 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:INIT_ONCE'
12 mov rsi
, array.bytes_n
13 mov rdx
, 1b or 10b ; PROT_READ | PROT_WRITE
14 mov
r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
19 cmp rax
, linux.errno.last
20 jae
.err_ways_mmap_failed
22 mov
[array.finish
], rax
24 lea rdi
, [msg.idx_ways_n
]
26 call qword
[libc.printf
]
29 mov rsi
, array.idx.bytes_n
30 mov rdx
, 1b or 10b ; PROT_READ | PROT_WRITE
31 mov
r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
36 cmp rax
, linux.errno.last
37 jae
.err_ways_idx_mmap_failed
40 mov
[array.idx.finish
], rax ; next availabe idx slot
45 .err_ways_mmap_failed:
48 lea rsi
, [msg.mmap_failed
]
49 call qword
[libc.dprintf
]
53 .err_ways_idx_mmap_failed:
56 lea rsi
, [msg.idx_mmap_failed
]
57 call qword
[libc.dprintf
]
61 ;
}}} init_once
-------------------------------------------------------------------------------------
63 define memcpy_dst rbx ; memcpy_dst
(out
) = memcpy_dst
(in
) + memcpy_bytes_n
64 define memcpy_src
r11 ; clobbered
, don
't care
65 define memcpy_bytes_n r9 ; clobbered, don't care
66 define memcpy_link rcx ; the return addr
67 ;
===================================================================================================
68 ;
{{{ parse
-----------------------------------------------------------------------------------------
69 ; scratch ~ rax rcx
r11 r9
71 define ways_p rbx ; in
72 define ways_p_finish
r13 ; in
73 define flags
r15 ; in
/out
74 define strtbl_idx_p
r8
75 define lat_of_lon_of xmm3 ; f64
/f64
78 define keys_p
r14 ; can
be 0
79 define keys_p_d r14d ; can
be 0
80 define keys_p_finish rbp
81 define vals_p
r10 ; num of vals is num of keys
82 define refs_p rdx ; presume it can
be 0
83 define refs_p_d edx ; presume it can
be 0
84 define refs_finish_p rsi
87 xor keys_p_d
, keys_p_d ; if no
(key
/val
)
88 xor refs_p_d
, refs_p_d ; presume we can have no refs
(should
not happen though
)
91 ;
"wayS" is plural because of the primgrp field name
, but there is actually only one
"way"
92 ; which will
be repeated
93 cmp ways_p
, ways_p_finish
95 varint_load
r11, ways_p ; msg key
96 mov
r9, r11 ;
= msg key copy
97 shr
r11, 3 ;
= field num
106 and r9, 111b ; field type
107 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:MSG:FIELD %lu OF TYPE %lu', r11, r9
108 val_skip ways_p
, r9, r11
112 varint_load id
, ways_p
116 varint_load
r11, ways_p ; keys_sz
118 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:KEYS:start = %p:size = %lu(0x%lx) bytes', keys_p
, r11, r11
120 mov keys_p_finish
, ways_p
124 varint_load
r11, ways_p ; vals_sz
126 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:VALS:start = %p:size = %lu(0x%lx) bytes', vals_p
, r11, r11
131 varint_load
r11, ways_p ; refs_sz
133 TRACE
'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:REFS:start = %p:size = %lu(0x%lx) bytes', refs_p
, r11, r11
135 mov refs_finish_p
, ways_p
139 ;
}}} parse
-----------------------------------------------------------------------------------------
140 ;
{{{ unserialize
-----------------------------------------------------------------------------------
141 ; scratch ~ rax rcx
r11 r9
143 ;define flags
r15 ; in
/out
144 ;define strtbl_idx_p
r8
145 ;define lat_of_lon_of xmm3 ; f64
/f64
148 ;define keys_p
r14 ; can
be 0
149 ;define keys_p_d r14d ; can
be 0
150 ;define keys_p_finish rbp
151 ;define vals_p
r10 ; num of vals is num of keys
152 ;define refs_p rdx ; presume it can
be 0
153 ;define refs_p_d edx ; presume it can
be 0
154 ;define refs_finish_p rsi
157 define way_start_p
r13
160 mov way_p
, [array.finish
]
161 mov way_start_p
, way_p ; save the start of the way since are going to modify way_p
163 ;
{{{ keys_vals_cpy
----------------------------------------------------------------------------------
165 add way_p
, array.way_t.keys_vals ; way_p points on the start of the keys_vals section
167 jz unserialize.keys_vals_done
171 cmp keys_p
, keys_p_finish
172 je unserialize.keys_vals_done
174 varint_load
r11, keys_p ;
= key_id
, keys_p points on next key_id
175 mov
r11, [strtbl_idx_p
+ 8 * r11] ;
= key_src_addr
176 varint_load
r9, r11 ;
r9 = key_bytes_n
, r11 = key_src_addr
177 mov
[way_p
], r9b ; insert the byte containing the sz of the key str in bytes
179 ; memcpy_dst
= way_p
(rbx
), memcpy_src
= key_src_addr
(r11), memcpy_bytes_n
= key_bytes_n
(r9)
180 lea memcpy_link
, [.key_cpy_done] ; where memcpy will jmp back
181 jmp memcpy ; way_p
= way_p
+ key_bytes_n
184 varint_load
r11, vals_p ;
= val_id
, vals_p points on next val_id
185 mov
r11, [strtbl_idx_p
+ 8 * r11] ;
= val_src_addr
186 varint_load
r9, r11;
r9 = val_bytes_n
, r11 = val_src_addr
187 mov
[way_p
], r9w ; insert the short containing the sz of the val str in bytes
189 ; memcpy_dst
= way_p
(rbx
), memcpy_src
= val_src_addr
(r11), memcpy_bytes_n
= val_bytes_n
(r9)
190 lea memcpy_link
, [.next_key_val] ; where memcpy will jmp back
191 jmp memcpy ; way_p
= way_p
+ val_bytes_n
192 ;
}}} keys_val_cpy
----------------------------------------------------------------------------------
194 unserialize.keys_vals_done
:
195 mov byte
[way_p
], 0 ; insert the
0-szed key
/terminator
196 inc way_p ; here way_p points on the start of the section of ptrs on nodes
197 mov
[way_start_p
+ array.way_t.nodes
], way_p
199 jz unserialize.refs_solved ; unlikely
204 ;
{{{ refs_solve
------------------------------------------------------------------------------------
205 ; XXX
: it is expected to have missing nodes in ways-
->do
0 their ptrs
206 ; scratch ~ rax rcx
r11
208 ;define flags
r15 ; in
/out
209 ;define strtbl_idx_p
r8
210 ;define lat_of_lon_of xmm3 ; f64
/f64
213 ;define refs_p rdx ; presume it can
be 0
214 ;define refs_p_d edx ; presume it can
be 0
215 ;define refs_finish_p rsi
216 ; local from unserialize
218 ;define way_start_p
r13
221 define prev_ref_d r14d
223 define bit_val_select
r10
224 define bit_val_select_b r10b
225 define ref_bit_idx_msb rcx
226 define ref_bit_idx_msb_d ecx
229 xor prev_ref_d
, prev_ref_d ; prev_ref
= 0
232 ; decode the id
----------------------------------------------------------------------------
233 varint_load
r11, refs_p ; load
/skip the raw zigzag delta_refs
238 xor r11, rax ;
(i
>> 1) ^
-(i
& 1) ;
= delta_id
239 add prev_ref
, r11 ;
= prev_ref
= ref
241 ; lookup the node ptr in the nodes idx
-----------------------------------------------------
242 xor bit_val_select
, bit_val_select ; bit_val_select
= 0
243 mov idx_slot_p
, [primgrp.dense.nodes.idx
]
244 mov ref_bit_idx_msb
, -1 ; this is to make it work with ref
= 0
245 bsr ref_bit_idx_msb
, ref ; if ref
= 0, ref_bit_idx_msb is untouched namely
-1
246 inc ref_bit_idx_msb_d ; store the idx
+ 1, 32bits because of the following jecxz
249 jecxz
.idx_slot_lookup_done ; bit_idx + 1 is stored in cx
251 setc bit_val_select_b
252 mov idx_slot_p
, [idx_slot_p
+ 8 * bit_val_select
]
253 test idx_slot_p
, idx_slot_p ; if the idx slot in
0, we have
a missing node
254 jz
.missing_slot_node ; unlikely
256 prefetcht0
[idx_slot_p
]
259 dec ref_bit_idx_msb_d
261 .missing_slot_node: ; unlikely
263 jmp
.next_way_node_ptr
265 .idx_slot_lookup_done:
266 mov rax
, [idx_slot_p
+ dense.nodes.idx.slot_t.node
] ; XXX
: can
be 0
268 .next_way_node_ptr: ; unlikely
269 add way_p
, 8 ; points on way storage for next node ptr
270 cmp refs_p
, refs_finish_p
273 purge ref_bit_idx_msb_d
274 purge ref_bit_idx_msb
275 purge bit_val_select_b
280 ;
}}} refs_solve
------------------------------------------------------------------------------------
281 unserialize.refs_solved
: ; unlikely don
't align_nops
282 ; here way_p points on the start of the next/finish way
283 mov [way_start_p + array.way_t.next], way_p
284 mov [array.finish], way_p
286 ;{{{ idx_insert_node -------------------------------------------------------------------------------
289 ;define flags r15 ; in/out
290 ;define strtbl_idx_p r8
291 ;define lat_of_lon_of xmm3 ; f64/f64
294 ;define refs_p rdx ; presume it can be 0
295 ;define refs_p_d edx ; presume it can be 0
296 ;define refs_finish_p rsi
297 ; local from unserialize
298 ;define way_start_p r13
300 define idx_finish_p rbx
301 define id_bit_idx r14
302 define id_bit_idx_d r14d
303 define bit_val_select rcx ; we use rcx here because we may use rcx compressed instructions
304 define bit_val_select_d ecx
305 define bit_val_select_b cl
306 define id_bit_idx_msb rbp
309 mov idx_slot_p, [array.idx]
310 mov idx_finish_p, [array.idx.finish]
311 TRACE_WAYS_IDX 'id
=%#016lx nodes.idx=%p nodes.finish=%p', id, idx_slot_p, idx_finish_p
312 xor id_bit_idx_d
, id_bit_idx_d
313 xor bit_val_select_d
, bit_val_select_d
314 mov id_bit_idx_msb
, -1 ; this is to make it work with id
= 0
315 bsr id_bit_idx_msb
, id ; if id
= 0, id_bit_idx_msb is untouched namely
-1
316 TRACE_WAYS_IDX
'id_bit_idx_msb=%lu', id_bit_idx_msb
319 TRACE_WAYS_IDX
'id_bit_idx=%lu', id_bit_idx
320 cmp id_bit_idx_msb
, id_bit_idx
323 setc bit_val_select_b ; bit_val_select
= 1 if id
[id_bit_idx
] = 1, else
0
324 TRACE_WAYS_IDX
'bit_val_select=%lu', bit_val_select
325 mov rax
, [idx_slot_p
+ 8 * bit_val_select
]
326 TRACE_WAYS_IDX
'slot=%p bit slot=%p', idx_slot_p
, rax
328 jnz
.idx_existing_slot
329 TRACE_WAYS_IDX
'non existing slot using finish=%p', idx_finish_p
330 mov rax
, idx_finish_p
331 add idx_finish_p
, array.idx.slot_t.bytes_n ; we could zero the mem here
, but mmap does it for us
333 prefetchw
[idx_finish_p
+ 64 * 2] ; try to get ready
334 prefetchw
[idx_finish_p
+ 64 * 3]
336 mov
[idx_slot_p
+ 8 * bit_val_select
], rax
340 TRACE_WAYS_IDX
'next_slot=%p', idx_slot_p
346 if TRACE_WAYS_IDX_ENABLED
347 mov rax
, [idx_slot_p
+ array.idx.slot_t.way
]
349 jz
.way_slot_available
350 TRACE_WAYS_IDX
'WARNING: overwritting an existing way=%p', rax
352 TRACE_WAYS_IDX
'inserting way=%p in slot=%p', way_start_p
, idx_slot_p
354 mov
[idx_slot_p
+ array.idx.slot_t.way
], way_start_p
355 mov
[array.idx.finish
], idx_finish_p
356 ; XXX
: if one day we need to
be backed by
a file we will need to DONT_NEED
"madvise" the
357 ; the current way memory
360 purge bit_val_select_b
361 purge bit_val_select_d
366 ;
}}} idx_insert_node
-------------------------------------------------------------------------------
367 jmp primgrp.parse.return_from_ways
376 ;
}}} unserialize
-----------------------------------------------------------------------------------
377 ;
{{{ local memcpy
(sse
) -----------------------------------------------------------------------------
378 ; XXX
: lddqu performs aligned reads
, carefull where you use this
, but should
be fine with classic
380 ; scratch ~ rax xmm7 xmm6 xmm5 xmm4
383 cmp memcpy_bytes_n
, 16 * 4
385 ; since we can
be cache line mis-aligned
, speculatively do prefetch
2 cls ahead
386 prefetchnta
[memcpy_src
+ 64 * 2] ; speculative non-temporal
"3rd" cache line ahead
387 prefetchnta
[memcpy_src
+ 64 * 3] ; speculative non-temporal
"4th" cache line ahead
389 prefetchw
[memcpy_dst
+ 64 * 2]
390 prefetchw
[memcpy_dst
+ 64 * 3]
392 lddqu xmm7
, [memcpy_src
+ 16 * 0]
393 lddqu xmm6
, [memcpy_src
+ 16 * 1]
394 lddqu xmm5
, [memcpy_src
+ 16 * 2]
395 lddqu xmm4
, [memcpy_src
+ 16 * 3]
397 movdqu
[memcpy_dst
+ 16 * 0], xmm7
398 movdqu
[memcpy_dst
+ 16 * 1], xmm6
399 movdqu
[memcpy_dst
+ 16 * 2], xmm5
400 movdqu
[memcpy_dst
+ 16 * 3], xmm4
402 add memcpy_dst
, 16 * 4
403 sub memcpy_bytes_n
, 16 * 4
405 add memcpy_src
, 16 * 4
409 cmp memcpy_bytes_n
, 16 * 3
412 lddqu xmm7
, [memcpy_src
+ 16 * 0]
413 lddqu xmm6
, [memcpy_src
+ 16 * 1]
414 lddqu xmm5
, [memcpy_src
+ 16 * 2]
416 movdqu
[memcpy_dst
+ 16 * 0], xmm7
417 movdqu
[memcpy_dst
+ 16 * 1], xmm6
418 movdqu
[memcpy_dst
+ 16 * 2], xmm5
420 add memcpy_dst
, 16 * 3
421 sub memcpy_bytes_n
, 16 * 3
423 add memcpy_src
, 16 * 3
427 cmp memcpy_bytes_n
, 16 * 2
430 lddqu xmm7
, [memcpy_src
+ 16 * 0]
431 lddqu xmm6
, [memcpy_src
+ 16 * 1]
433 movdqu
[memcpy_dst
+ 16 * 0], xmm7
434 movdqu
[memcpy_dst
+ 16 * 1], xmm6
436 add memcpy_dst
, 16 * 2
437 sub memcpy_bytes_n
, 16 * 2
439 add memcpy_src
, 16 * 2
443 cmp memcpy_bytes_n
, 16 * 1
446 lddqu xmm7
, [memcpy_src
+ 16 * 0]
447 movdqu
[memcpy_dst
+ 16 * 0], xmm7
449 add memcpy_dst
, 16 * 1
450 sub memcpy_bytes_n
, 16 * 1
452 add memcpy_src
, 16 * 1
456 cmp memcpy_bytes_n
, 8 * 1
458 mov rax
, [memcpy_src
+ 8 * 0]
459 mov
[memcpy_dst
+ 8 * 0], rax
461 add memcpy_dst
, 8 * 1
462 sub memcpy_bytes_n
, 8 * 1
464 add memcpy_src
, 8 * 1
468 cmp memcpy_bytes_n
, 7
470 cmp memcpy_bytes_n
, 6
472 cmp memcpy_bytes_n
, 5
474 cmp memcpy_bytes_n
, 4
476 cmp memcpy_bytes_n
, 3
478 cmp memcpy_bytes_n
, 2
480 cmp memcpy_bytes_n
, 1
528 ;
}}} local memcpy
(sse
) -----------------------------------------------------------------------------
529 ;
===================================================================================================
534 ;
{{{ macros
----------------------------------------------------------------------------------------
535 if TRACE_WAYS_IDX_ENABLED
536 macro TRACE_WAYS_IDX fmt
, regs
&
537 TRACE_PREFIX
'WAYS_IDX', fmt
, regs
540 macro TRACE_WAYS_IDX fmt
, regs
&
543 ;
}}} macros
----------------------------------------------------------------------------------------