initial commit: a mess of assembly code
[fmap.git] / x86_64_sse2_x87 / pbf / blk / primblk / primgrp / ways / text.s
blob3be820f0794eb34423db48ca5bde8b45c8726394
1 ; vim: set filetype=fasm foldmethod=marker commentstring=;%s colorcolumn=101 :
2 ; XXX: We actually used this code unit as a training ground. Expect it to be very overkill.
3 ways: namespace ways
4 ;{{{ init_once -------------------------------------------------------------------------------------
5 align_nops
6 init_once:
7 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:INIT_ONCE'
8 mov rbp, rsp
9 and rsp, not 0xf
11 xor edi, edi
12 mov rsi, array.bytes_n
13 mov rdx, 1b or 10b ; PROT_READ | PROT_WRITE
14 mov r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
15 xor r8d, r8d
16 xor r9d, r9d
17 mov rax, linux.mmap
18 syscall
19 cmp rax, linux.errno.last
20 jae .err_ways_mmap_failed
21 mov [array], rax
22 mov [array.finish], rax
24 lea rdi, [msg.idx_ways_n]
25 mov rsi, array.idx.n
26 call qword [libc.printf]
28 xor edi, edi
29 mov rsi, array.idx.bytes_n
30 mov rdx, 1b or 10b ; PROT_READ | PROT_WRITE
31 mov r10, 0x22 ; MAP_PRIVATE | MAP_ANONYMOUS
32 xor r8d, r8d
33 xor r9d, r9d
34 mov rax, linux.mmap
35 syscall
36 cmp rax, linux.errno.last
37 jae .err_ways_idx_mmap_failed
38 mov [array.idx], rax
39 add rax, 8 * 3
40 mov [array.idx.finish], rax ; next availabe idx slot
42 stc ; return true
43 mov rsp, rbp
44 ret
45 .err_ways_mmap_failed:
46 mov rdi, 2 ; stderr
47 mov rsi, rax ; errno
48 lea rsi, [msg.mmap_failed]
49 call qword [libc.dprintf]
50 clc ; return false
51 mov rsp, rbp
52 ret
53 .err_ways_idx_mmap_failed:
54 mov rdi, 2 ; stderr
55 mov rsi, rax ; errno
56 lea rsi, [msg.idx_mmap_failed]
57 call qword [libc.dprintf]
58 clc ; return false
59 mov rsp, rbp
60 ret
61 ;}}} init_once -------------------------------------------------------------------------------------
62 ; local memcpy ABI
63 define memcpy_dst rbx ; memcpy_dst(out) = memcpy_dst(in) + memcpy_bytes_n
64 define memcpy_src r11 ; clobbered, don't care
65 define memcpy_bytes_n r9 ; clobbered, don't care
66 define memcpy_link rcx ; the return addr
67 ;===================================================================================================
68 ;{{{ parse -----------------------------------------------------------------------------------------
69 ; scratch ~ rax rcx r11 r9
70 ; from primgrp
71 define ways_p rbx ; in
72 define ways_p_finish r13 ; in
73 define flags r15 ; in/out
74 define strtbl_idx_p r8
75 define lat_of_lon_of xmm3 ; f64/f64
76 ; local
77 define id r12
78 define keys_p r14 ; can be 0
79 define keys_p_d r14d ; can be 0
80 define keys_p_finish rbp
81 define vals_p r10 ; num of vals is num of keys
82 define refs_p rdx ; presume it can be 0
83 define refs_p_d edx ; presume it can be 0
84 define refs_finish_p rsi
85 align_nops
86 parse:
87 xor keys_p_d, keys_p_d ; if no (key/val)
88 xor refs_p_d, refs_p_d ; presume we can have no refs (should not happen though)
89 align_nops
90 .next_msg_key:
91 ; "wayS" is plural because of the primgrp field name, but there is actually only one "way"
92 ; which will be repeated
93 cmp ways_p, ways_p_finish
94 je unserialize
95 varint_load r11, ways_p ; msg key
96 mov r9, r11 ; = msg key copy
97 shr r11, 3 ; = field num
98 cmp r11, 1
99 je .id_found
100 cmp r11, 2
101 je .keys_found
102 cmp r11, 3
103 je .vals_found
104 cmp r11, 8
105 je .refs_found
106 and r9, 111b ; field type
107 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:MSG:FIELD %lu OF TYPE %lu', r11, r9
108 val_skip ways_p, r9, r11
109 jmp .next_msg_key
110 align_nops
111 .id_found:
112 varint_load id, ways_p
113 jmp .next_msg_key
114 align_nops
115 .keys_found:
116 varint_load r11, ways_p ; keys_sz
117 mov keys_p, ways_p
118 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:KEYS:start = %p:size = %lu(0x%lx) bytes', keys_p, r11, r11
119 add ways_p, r11
120 mov keys_p_finish, ways_p
121 jmp .next_msg_key
122 align_nops
123 .vals_found:
124 varint_load r11, ways_p ; vals_sz
125 mov vals_p, ways_p
126 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:VALS:start = %p:size = %lu(0x%lx) bytes', vals_p, r11, r11
127 add ways_p, r11
128 jmp .next_msg_key
129 align_nops
130 .refs_found:
131 varint_load r11, ways_p ; refs_sz
132 mov refs_p, ways_p
133 TRACE 'PBF:BLK:PRIMBLK:PRIMGRP:WAYS:REFS:start = %p:size = %lu(0x%lx) bytes', refs_p, r11, r11
134 add ways_p, r11
135 mov refs_finish_p, ways_p
136 jmp .next_msg_key
137 purge ways_p_finish
138 purge ways_p
139 ;}}} parse -----------------------------------------------------------------------------------------
140 ;{{{ unserialize -----------------------------------------------------------------------------------
141 ; scratch ~ rax rcx r11 r9
142 ; from primgrp
143 ;define flags r15 ; in/out
144 ;define strtbl_idx_p r8
145 ;define lat_of_lon_of xmm3 ; f64/f64
146 ; local from parse
147 ;define id r12
148 ;define keys_p r14 ; can be 0
149 ;define keys_p_d r14d ; can be 0
150 ;define keys_p_finish rbp
151 ;define vals_p r10 ; num of vals is num of keys
152 ;define refs_p rdx ; presume it can be 0
153 ;define refs_p_d edx ; presume it can be 0
154 ;define refs_finish_p rsi
155 ; local
156 define way_p rbx
157 define way_start_p r13
158 align_nops
159 unserialize:
160 mov way_p, [array.finish]
161 mov way_start_p, way_p ; save the start of the way since are going to modify way_p
163 ;{{{ keys_vals_cpy ----------------------------------------------------------------------------------
164 keys_vals_cpy:
165 add way_p, array.way_t.keys_vals ; way_p points on the start of the keys_vals section
166 test keys_p, keys_p
167 jz unserialize.keys_vals_done
168 jmp .entry
169 align_nops
170 .next_key_val:
171 cmp keys_p, keys_p_finish
172 je unserialize.keys_vals_done
173 .entry:
174 varint_load r11, keys_p ; = key_id, keys_p points on next key_id
175 mov r11, [strtbl_idx_p + 8 * r11] ; = key_src_addr
176 varint_load r9, r11 ; r9 = key_bytes_n, r11 = key_src_addr
177 mov [way_p], r9b ; insert the byte containing the sz of the key str in bytes
178 inc way_p
179 ; memcpy_dst = way_p(rbx), memcpy_src = key_src_addr(r11), memcpy_bytes_n = key_bytes_n(r9)
180 lea memcpy_link, [.key_cpy_done] ; where memcpy will jmp back
181 jmp memcpy ; way_p = way_p + key_bytes_n
182 align_nops
183 .key_cpy_done:
184 varint_load r11, vals_p ; = val_id, vals_p points on next val_id
185 mov r11, [strtbl_idx_p + 8 * r11] ; = val_src_addr
186 varint_load r9, r11; r9 = val_bytes_n, r11 = val_src_addr
187 mov [way_p], r9w ; insert the short containing the sz of the val str in bytes
188 add way_p, 2
189 ; memcpy_dst = way_p(rbx), memcpy_src = val_src_addr(r11), memcpy_bytes_n = val_bytes_n(r9)
190 lea memcpy_link, [.next_key_val] ; where memcpy will jmp back
191 jmp memcpy ; way_p = way_p + val_bytes_n
192 ;}}} keys_val_cpy ----------------------------------------------------------------------------------
193 align_nops
194 unserialize.keys_vals_done:
195 mov byte [way_p], 0 ; insert the 0-szed key/terminator
196 inc way_p ; here way_p points on the start of the section of ptrs on nodes
197 mov [way_start_p + array.way_t.nodes], way_p
198 test refs_p, refs_p
199 jz unserialize.refs_solved ; unlikely
200 purge vals_p
201 purge keys_p_finish
202 purge keys_p_d
203 purge keys_p
204 ;{{{ refs_solve ------------------------------------------------------------------------------------
205 ; XXX: it is expected to have missing nodes in ways-->do 0 their ptrs
206 ; scratch ~ rax rcx r11
207 ; from primgrp
208 ;define flags r15 ; in/out
209 ;define strtbl_idx_p r8
210 ;define lat_of_lon_of xmm3 ; f64/f64
211 ; local from parse
212 ;define id r12
213 ;define refs_p rdx ; presume it can be 0
214 ;define refs_p_d edx ; presume it can be 0
215 ;define refs_finish_p rsi
216 ; local from unserialize
217 ;define way_p rbx
218 ;define way_start_p r13
219 ; local
220 define prev_ref r14
221 define prev_ref_d r14d
222 define ref rbp
223 define bit_val_select r10
224 define bit_val_select_b r10b
225 define ref_bit_idx_msb rcx
226 define ref_bit_idx_msb_d ecx
227 define idx_slot_p r9
228 refs_solve:
229 xor prev_ref_d, prev_ref_d ; prev_ref = 0
230 align_nops
231 .next_ref:
232 ; decode the id ----------------------------------------------------------------------------
233 varint_load r11, refs_p ; load/skip the raw zigzag delta_refs
234 mov rax, r11
235 shr r11, 1 ; i >> 1
236 and rax, 1 ; i & 1
237 neg rax ; -(i & 1)
238 xor r11, rax ; (i >> 1) ^ -(i & 1) ; = delta_id
239 add prev_ref, r11 ; = prev_ref = ref
240 mov ref, prev_ref
241 ; lookup the node ptr in the nodes idx -----------------------------------------------------
242 xor bit_val_select, bit_val_select ; bit_val_select = 0
243 mov idx_slot_p, [primgrp.dense.nodes.idx]
244 mov ref_bit_idx_msb, -1 ; this is to make it work with ref = 0
245 bsr ref_bit_idx_msb, ref ; if ref = 0, ref_bit_idx_msb is untouched namely -1
246 inc ref_bit_idx_msb_d ; store the idx + 1, 32bits because of the following jecxz
247 align_nops
248 .next_ref_bit:
249 jecxz .idx_slot_lookup_done ; bit_idx + 1 is stored in cx
250 bt ref, 0
251 setc bit_val_select_b
252 mov idx_slot_p, [idx_slot_p + 8 * bit_val_select]
253 test idx_slot_p, idx_slot_p ; if the idx slot in 0, we have a missing node
254 jz .missing_slot_node ; unlikely
255 if ~SYSTEM_LEGACY
256 prefetcht0 [idx_slot_p]
257 end if
258 shr ref, 1
259 dec ref_bit_idx_msb_d
260 jmp .next_ref_bit
261 .missing_slot_node: ; unlikely
262 mov qword [way_p], 0
263 jmp .next_way_node_ptr
264 align_nops
265 .idx_slot_lookup_done:
266 mov rax, [idx_slot_p + dense.nodes.idx.slot_t.node] ; XXX: can be 0
267 mov [way_p], rax
268 .next_way_node_ptr: ; unlikely
269 add way_p, 8 ; points on way storage for next node ptr
270 cmp refs_p, refs_finish_p
271 jne .next_ref
272 purge idx_slot_p
273 purge ref_bit_idx_msb_d
274 purge ref_bit_idx_msb
275 purge bit_val_select_b
276 purge bit_val_select
277 purge ref
278 purge prev_ref_d
279 purge prev_ref
280 ;}}} refs_solve ------------------------------------------------------------------------------------
281 unserialize.refs_solved: ; unlikely don't align_nops
282 ; here way_p points on the start of the next/finish way
283 mov [way_start_p + array.way_t.next], way_p
284 mov [array.finish], way_p
285 purge way_p
286 ;{{{ idx_insert_node -------------------------------------------------------------------------------
287 ; scratch ~ rax
288 ; from primgrp
289 ;define flags r15 ; in/out
290 ;define strtbl_idx_p r8
291 ;define lat_of_lon_of xmm3 ; f64/f64
292 ; local from parse
293 ;define id r12
294 ;define refs_p rdx ; presume it can be 0
295 ;define refs_p_d edx ; presume it can be 0
296 ;define refs_finish_p rsi
297 ; local from unserialize
298 ;define way_start_p r13
299 ; local
300 define idx_finish_p rbx
301 define id_bit_idx r14
302 define id_bit_idx_d r14d
303 define bit_val_select rcx ; we use rcx here because we may use rcx compressed instructions
304 define bit_val_select_d ecx
305 define bit_val_select_b cl
306 define id_bit_idx_msb rbp
307 define idx_slot_p r9
308 idx_insert_node:
309 mov idx_slot_p, [array.idx]
310 mov idx_finish_p, [array.idx.finish]
311 TRACE_WAYS_IDX 'id=%#016lx nodes.idx=%p nodes.finish=%p', id, idx_slot_p, idx_finish_p
312 xor id_bit_idx_d, id_bit_idx_d
313 xor bit_val_select_d, bit_val_select_d
314 mov id_bit_idx_msb, -1 ; this is to make it work with id = 0
315 bsr id_bit_idx_msb, id ; if id = 0, id_bit_idx_msb is untouched namely -1
316 TRACE_WAYS_IDX 'id_bit_idx_msb=%lu', id_bit_idx_msb
317 align_nops
318 .next_id_bit:
319 TRACE_WAYS_IDX 'id_bit_idx=%lu', id_bit_idx
320 cmp id_bit_idx_msb, id_bit_idx
321 jb .insert_here
322 bt id, id_bit_idx
323 setc bit_val_select_b ; bit_val_select = 1 if id[id_bit_idx] = 1, else 0
324 TRACE_WAYS_IDX 'bit_val_select=%lu', bit_val_select
325 mov rax, [idx_slot_p + 8 * bit_val_select]
326 TRACE_WAYS_IDX 'slot=%p bit slot=%p', idx_slot_p, rax
327 test rax, rax
328 jnz .idx_existing_slot
329 TRACE_WAYS_IDX 'non existing slot using finish=%p', idx_finish_p
330 mov rax, idx_finish_p
331 add idx_finish_p, array.idx.slot_t.bytes_n ; we could zero the mem here, but mmap does it for us
332 if ~SYSTEM_LEGACY
333 prefetchw [idx_finish_p + 64 * 2] ; try to get ready
334 prefetchw [idx_finish_p + 64 * 3]
335 end if
336 mov [idx_slot_p + 8 * bit_val_select], rax
337 align_nops
338 .idx_existing_slot:
339 mov idx_slot_p, rax
340 TRACE_WAYS_IDX 'next_slot=%p', idx_slot_p
341 inc id_bit_idx
343 jmp .next_id_bit
344 align_nops
345 .insert_here:
346 if TRACE_WAYS_IDX_ENABLED
347 mov rax, [idx_slot_p + array.idx.slot_t.way]
348 test rax, rax
349 jz .way_slot_available
350 TRACE_WAYS_IDX 'WARNING: overwritting an existing way=%p', rax
351 .way_slot_available:
352 TRACE_WAYS_IDX 'inserting way=%p in slot=%p', way_start_p, idx_slot_p
353 end if
354 mov [idx_slot_p + array.idx.slot_t.way], way_start_p
355 mov [array.idx.finish], idx_finish_p
356 ; XXX: if one day we need to be backed by a file we will need to DONT_NEED "madvise" the
357 ; the current way memory
358 purge idx_slot_p
359 purge id_bit_idx_msb
360 purge bit_val_select_b
361 purge bit_val_select_d
362 purge bit_val_select
363 purge id_bit_idx_d
364 purge id_bit_idx
365 purge idx_finish_p
366 ;}}} idx_insert_node -------------------------------------------------------------------------------
367 jmp primgrp.parse.return_from_ways
368 purge way_start_p
369 purge refs_finish_p
370 purge refs_p_d
371 purge refs_p
372 purge lat_of_lon_of
373 purge strtbl_idx_p
374 purge id
375 purge flags
376 ;}}} unserialize -----------------------------------------------------------------------------------
377 ;{{{ local memcpy(sse) -----------------------------------------------------------------------------
378 ; XXX: lddqu performs aligned reads, carefull where you use this, but should be fine with classic
379 ; paginated memory.
380 ; scratch ~ rax xmm7 xmm6 xmm5 xmm4
381 align_nops
382 memcpy:
383 cmp memcpy_bytes_n, 16 * 4
384 jb .below64
385 ; since we can be cache line mis-aligned, speculatively do prefetch 2 cls ahead
386 prefetchnta [memcpy_src + 64 * 2] ; speculative non-temporal "3rd" cache line ahead
387 prefetchnta [memcpy_src + 64 * 3] ; speculative non-temporal "4th" cache line ahead
388 if ~SYSTEM_LEGACY
389 prefetchw [memcpy_dst + 64 * 2]
390 prefetchw [memcpy_dst + 64 * 3]
391 end if
392 lddqu xmm7, [memcpy_src + 16 * 0]
393 lddqu xmm6, [memcpy_src + 16 * 1]
394 lddqu xmm5, [memcpy_src + 16 * 2]
395 lddqu xmm4, [memcpy_src + 16 * 3]
397 movdqu [memcpy_dst + 16 * 0], xmm7
398 movdqu [memcpy_dst + 16 * 1], xmm6
399 movdqu [memcpy_dst + 16 * 2], xmm5
400 movdqu [memcpy_dst + 16 * 3], xmm4
402 add memcpy_dst, 16 * 4
403 sub memcpy_bytes_n, 16 * 4
404 jz .done
405 add memcpy_src, 16 * 4
406 jmp memcpy
407 align_nops
408 .below64:
409 cmp memcpy_bytes_n, 16 * 3
410 jb .below48
412 lddqu xmm7, [memcpy_src + 16 * 0]
413 lddqu xmm6, [memcpy_src + 16 * 1]
414 lddqu xmm5, [memcpy_src + 16 * 2]
416 movdqu [memcpy_dst + 16 * 0], xmm7
417 movdqu [memcpy_dst + 16 * 1], xmm6
418 movdqu [memcpy_dst + 16 * 2], xmm5
420 add memcpy_dst, 16 * 3
421 sub memcpy_bytes_n, 16 * 3
422 jz .done
423 add memcpy_src, 16 * 3
424 ; fall-thru
425 align_nops
426 .below48:
427 cmp memcpy_bytes_n, 16 * 2
428 jb .below32
430 lddqu xmm7, [memcpy_src + 16 * 0]
431 lddqu xmm6, [memcpy_src + 16 * 1]
433 movdqu [memcpy_dst + 16 * 0], xmm7
434 movdqu [memcpy_dst + 16 * 1], xmm6
436 add memcpy_dst, 16 * 2
437 sub memcpy_bytes_n, 16 * 2
438 jz .done
439 add memcpy_src, 16 * 2
440 ; fall-thru
441 align_nops
442 .below32:
443 cmp memcpy_bytes_n, 16 * 1
444 jb .below16
446 lddqu xmm7, [memcpy_src + 16 * 0]
447 movdqu [memcpy_dst + 16 * 0], xmm7
449 add memcpy_dst, 16 * 1
450 sub memcpy_bytes_n, 16 * 1
451 jz .done
452 add memcpy_src, 16 * 1
453 ; fall-thru
454 align_nops
455 .below16:
456 cmp memcpy_bytes_n, 8 * 1
457 jb .below8
458 mov rax, [memcpy_src + 8 * 0]
459 mov [memcpy_dst + 8 * 0], rax
461 add memcpy_dst, 8 * 1
462 sub memcpy_bytes_n, 8 * 1
463 jz .done
464 add memcpy_src, 8 * 1
465 ; fall-thru
466 align_nops
467 .below8:
468 cmp memcpy_bytes_n, 7
469 je .cpy7
470 cmp memcpy_bytes_n, 6
471 je .cpy6
472 cmp memcpy_bytes_n, 5
473 je .cpy5
474 cmp memcpy_bytes_n, 4
475 je .cpy4
476 cmp memcpy_bytes_n, 3
477 je .cpy3
478 cmp memcpy_bytes_n, 2
479 je .cpy2
480 cmp memcpy_bytes_n, 1
481 je .cpy1
482 jmp memcpy_link
483 align_nops
484 .cpy7:
485 mov al, [memcpy_src]
486 mov [memcpy_dst], al
487 inc memcpy_src
488 inc memcpy_dst
489 align_nops
490 .cpy6:
491 mov al, [memcpy_src]
492 mov [memcpy_dst], al
493 inc memcpy_src
494 inc memcpy_dst
495 align_nops
496 .cpy5:
497 mov al, [memcpy_src]
498 mov [memcpy_dst], al
499 inc memcpy_src
500 inc memcpy_dst
501 align_nops
502 .cpy4:
503 mov al, [memcpy_src]
504 mov [memcpy_dst], al
505 inc memcpy_src
506 inc memcpy_dst
507 align_nops
508 .cpy3:
509 mov al, [memcpy_src]
510 mov [memcpy_dst], al
511 inc memcpy_src
512 inc memcpy_dst
513 align_nops
514 .cpy2:
515 mov al, [memcpy_src]
516 mov [memcpy_dst], al
517 inc memcpy_src
518 inc memcpy_dst
519 align_nops
520 .cpy1:
521 mov al, [memcpy_src]
522 mov [memcpy_dst], al
523 inc memcpy_src
524 inc memcpy_dst
525 align_nops
526 .done:
527 jmp memcpy_link
528 ;}}} local memcpy(sse) -----------------------------------------------------------------------------
529 ;===================================================================================================
530 purge memcpy_link
531 purge memcpy_bytes_n
532 purge memcpy_src
533 purge memcpy_dst
534 ;{{{ macros ----------------------------------------------------------------------------------------
535 if TRACE_WAYS_IDX_ENABLED
536 macro TRACE_WAYS_IDX fmt, regs&
537 TRACE_PREFIX 'WAYS_IDX', fmt, regs
538 end macro
539 else
540 macro TRACE_WAYS_IDX fmt, regs&
541 end macro
542 end if
543 ;}}} macros ----------------------------------------------------------------------------------------
544 end namespace ; ways