revert between 56095 -> 55830 in arch
[AROS.git] / workbench / libs / z / contrib / masmx64 / gvmat64.asm
blob9879c28b908c94d5b0ee598b283da13d26bc8dcf
1 ;uInt longest_match_x64(
2 ; deflate_state *s,
3 ; IPos cur_match); /* current match */
5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
6 ; (AMD64 on Athlon 64, Opteron, Phenom
7 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
8 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
10 ; File written by Gilles Vollant, by converting to assembly the longest_match
11 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
13 ; and by taking inspiration on asm686 with masm, optimised assembly code
14 ; from Brian Raiter, written 1998
16 ; This software is provided 'as-is', without any express or implied
17 ; warranty. In no event will the authors be held liable for any damages
18 ; arising from the use of this software.
20 ; Permission is granted to anyone to use this software for any purpose,
21 ; including commercial applications, and to alter it and redistribute it
22 ; freely, subject to the following restrictions:
24 ; 1. The origin of this software must not be misrepresented; you must not
25 ; claim that you wrote the original software. If you use this software
26 ; in a product, an acknowledgment in the product documentation would be
27 ; appreciated but is not required.
28 ; 2. Altered source versions must be plainly marked as such, and must not be
29 ; misrepresented as being the original software
30 ; 3. This notice may not be removed or altered from any source distribution.
34 ; http://www.zlib.net
35 ; http://www.winimage.com/zLibDll
36 ; http://www.muppetlabs.com/~breadbox/software/assembly.html
38 ; to compile this file for infozip Zip, I use option:
39 ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
41 ; to compile this file for zLib, I use option:
42 ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
43 ; Be carrefull to adapt zlib1222add below to your version of zLib
44 ; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
45 ; value of zlib1222add later)
47 ; This file compile with Microsoft Macro Assembler (x64) for AMD64
49 ; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
51 ; (you can get Windows WDK with ml64 for AMD64 from
52 ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
56 ;uInt longest_match(s, cur_match)
57 ; deflate_state *s;
58 ; IPos cur_match; /* current match */
59 .code
60 longest_match PROC
63 ;LocalVarsSize equ 88
64 LocalVarsSize equ 72
66 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
67 ; free register : r14,r15
68 ; register can be saved : rsp
70 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
71 ; low word: s->wmask
72 ;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
73 ;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
74 ;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
75 ;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
76 ;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
77 ;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
78 ;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
79 IFDEF INFOZIP
80 ELSE
81 nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size
82 ENDIF
84 save_rdi equ rsp + 24 - LocalVarsSize
85 save_rsi equ rsp + 32 - LocalVarsSize
86 save_rbx equ rsp + 40 - LocalVarsSize
87 save_rbp equ rsp + 48 - LocalVarsSize
88 save_r12 equ rsp + 56 - LocalVarsSize
89 save_r13 equ rsp + 64 - LocalVarsSize
90 ;save_r14 equ rsp + 72 - LocalVarsSize
91 ;save_r15 equ rsp + 80 - LocalVarsSize
94 ; summary of register usage
95 ; scanend ebx
96 ; scanendw bx
97 ; chainlenwmask edx
98 ; curmatch rsi
99 ; curmatchd esi
100 ; windowbestlen r8
101 ; scanalign r9
102 ; scanalignd r9d
103 ; window r10
104 ; bestlen r11
105 ; bestlend r11d
106 ; scanstart r12d
107 ; scanstartw r12w
108 ; scan r13
109 ; nicematch r14d
110 ; limit r15
111 ; limitd r15d
112 ; prev rcx
114 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib
115 ; in the deflate_state structure since the asm code was first written
116 ; (if you compile with zlib 1.0.4 or older, remove the +4).
117 ; Note : these value are good with a 8 bytes boundary pack structure
120 MAX_MATCH equ 258
121 MIN_MATCH equ 3
122 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
125 ;;; Offsets for fields in the deflate_state structure. These numbers
126 ;;; are calculated from the definition of deflate_state, with the
127 ;;; assumption that the compiler will dword-align the fields. (Thus,
128 ;;; changing the definition of deflate_state could easily cause this
129 ;;; program to crash horribly, without so much as a warning at
130 ;;; compile time. Sigh.)
132 ; all the +zlib1222add offsets are due to the addition of fields
133 ; in zlib in the deflate_state structure since the asm code was first written
134 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
135 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
136 ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
139 IFDEF INFOZIP
141 _DATA SEGMENT
142 COMM window_size:DWORD
143 ; WMask ; 7fff
144 COMM window:BYTE:010040H
145 COMM prev:WORD:08000H
146 ; MatchLen : unused
147 ; PrevMatch : unused
148 COMM strstart:DWORD
149 COMM match_start:DWORD
150 ; Lookahead : ignore
151 COMM prev_length:DWORD ; PrevLen
152 COMM max_chain_length:DWORD
153 COMM good_match:DWORD
154 COMM nice_match:DWORD
155 prev_ad equ OFFSET prev
156 window_ad equ OFFSET window
157 nicematch equ nice_match
158 _DATA ENDS
159 WMask equ 07fffh
161 ELSE
163 IFNDEF zlib1222add
164 zlib1222add equ 8
165 ENDIF
166 dsWSize equ 56+zlib1222add+(zlib1222add/2)
167 dsWMask equ 64+zlib1222add+(zlib1222add/2)
168 dsWindow equ 72+zlib1222add
169 dsPrev equ 88+zlib1222add
170 dsMatchLen equ 128+zlib1222add
171 dsPrevMatch equ 132+zlib1222add
172 dsStrStart equ 140+zlib1222add
173 dsMatchStart equ 144+zlib1222add
174 dsLookahead equ 148+zlib1222add
175 dsPrevLen equ 152+zlib1222add
176 dsMaxChainLen equ 156+zlib1222add
177 dsGoodMatch equ 172+zlib1222add
178 dsNiceMatch equ 176+zlib1222add
180 window_size equ [ rcx + dsWSize]
181 WMask equ [ rcx + dsWMask]
182 window_ad equ [ rcx + dsWindow]
183 prev_ad equ [ rcx + dsPrev]
184 strstart equ [ rcx + dsStrStart]
185 match_start equ [ rcx + dsMatchStart]
186 Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
187 prev_length equ [ rcx + dsPrevLen]
188 max_chain_length equ [ rcx + dsMaxChainLen]
189 good_match equ [ rcx + dsGoodMatch]
190 nice_match equ [ rcx + dsNiceMatch]
191 ENDIF
193 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
195 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
196 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
198 ; All registers must be preserved across the call, except for
199 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
203 ;;; Save registers that the compiler may be using, and adjust esp to
204 ;;; make room for our stack frame.
207 ;;; Retrieve the function arguments. r8d will hold cur_match
208 ;;; throughout the entire function. edx will hold the pointer to the
209 ;;; deflate_state structure during the function's setup (before
210 ;;; entering the main loop.
212 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
214 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
216 mov [save_rdi],rdi
217 mov [save_rsi],rsi
218 mov [save_rbx],rbx
219 mov [save_rbp],rbp
220 IFDEF INFOZIP
221 mov r8d,ecx
222 ELSE
223 mov r8d,edx
224 ENDIF
225 mov [save_r12],r12
226 mov [save_r13],r13
227 ; mov [save_r14],r14
228 ; mov [save_r15],r15
231 ;;; uInt wmask = s->w_mask;
232 ;;; unsigned chain_length = s->max_chain_length;
233 ;;; if (s->prev_length >= s->good_match) {
234 ;;; chain_length >>= 2;
235 ;;; }
237 mov edi, prev_length
238 mov esi, good_match
239 mov eax, WMask
240 mov ebx, max_chain_length
241 cmp edi, esi
242 jl LastMatchGood
243 shr ebx, 2
244 LastMatchGood:
246 ;;; chainlen is decremented once beforehand so that the function can
247 ;;; use the sign flag instead of the zero flag for the exit test.
248 ;;; It is then shifted into the high word, to make room for the wmask
249 ;;; value, which it will always accompany.
251 dec ebx
252 shl ebx, 16
253 or ebx, eax
255 ;;; on zlib only
256 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
258 IFDEF INFOZIP
259 mov [chainlenwmask], ebx
260 ; on infozip nice_match = [nice_match]
261 ELSE
262 mov eax, nice_match
263 mov [chainlenwmask], ebx
264 mov r10d, Lookahead
265 cmp r10d, eax
266 cmovnl r10d, eax
267 mov [nicematch],r10d
268 ENDIF
270 ;;; register Bytef *scan = s->window + s->strstart;
271 mov r10, window_ad
272 mov ebp, strstart
273 lea r13, [r10 + rbp]
275 ;;; Determine how many bytes the scan ptr is off from being
276 ;;; dword-aligned.
278 mov r9,r13
279 neg r13
280 and r13,3
282 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
283 ;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
284 IFDEF INFOZIP
285 mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
286 ELSE
287 mov eax, window_size
288 sub eax, MIN_LOOKAHEAD
289 ENDIF
290 xor edi,edi
291 sub ebp, eax
293 mov r11d, prev_length
295 cmovng ebp,edi
297 ;;; int best_len = s->prev_length;
300 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
302 lea rsi,[r10+r11]
304 ;;; register ush scan_start = *(ushf*)scan;
305 ;;; register ush scan_end = *(ushf*)(scan+best_len-1);
306 ;;; Posf *prev = s->prev;
308 movzx r12d,word ptr [r9]
309 movzx ebx, word ptr [r9 + r11 - 1]
311 mov rdi, prev_ad
313 ;;; Jump into the main loop.
315 mov edx, [chainlenwmask]
317 cmp bx,word ptr [rsi + r8 - 1]
318 jz LookupLoopIsZero
320 LookupLoop1:
321 and r8d, edx
323 movzx r8d, word ptr [rdi + r8*2]
324 cmp r8d, ebp
325 jbe LeaveNow
326 sub edx, 00010000h
327 js LeaveNow
329 LoopEntry1:
330 cmp bx,word ptr [rsi + r8 - 1]
331 jz LookupLoopIsZero
333 LookupLoop2:
334 and r8d, edx
336 movzx r8d, word ptr [rdi + r8*2]
337 cmp r8d, ebp
338 jbe LeaveNow
339 sub edx, 00010000h
340 js LeaveNow
342 LoopEntry2:
343 cmp bx,word ptr [rsi + r8 - 1]
344 jz LookupLoopIsZero
346 LookupLoop4:
347 and r8d, edx
349 movzx r8d, word ptr [rdi + r8*2]
350 cmp r8d, ebp
351 jbe LeaveNow
352 sub edx, 00010000h
353 js LeaveNow
355 LoopEntry4:
357 cmp bx,word ptr [rsi + r8 - 1]
358 jnz LookupLoop1
359 jmp LookupLoopIsZero
362 ;;; do {
363 ;;; match = s->window + cur_match;
364 ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
365 ;;; *(ushf*)match != scan_start) continue;
366 ;;; [...]
367 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
368 ;;; && --chain_length != 0);
370 ;;; Here is the inner loop of the function. The function will spend the
371 ;;; majority of its time in this loop, and majority of that time will
372 ;;; be spent in the first ten instructions.
374 ;;; Within this loop:
375 ;;; ebx = scanend
376 ;;; r8d = curmatch
377 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
378 ;;; esi = windowbestlen - i.e., (window + bestlen)
379 ;;; edi = prev
380 ;;; ebp = limit
382 LookupLoop:
383 and r8d, edx
385 movzx r8d, word ptr [rdi + r8*2]
386 cmp r8d, ebp
387 jbe LeaveNow
388 sub edx, 00010000h
389 js LeaveNow
391 LoopEntry:
393 cmp bx,word ptr [rsi + r8 - 1]
394 jnz LookupLoop1
395 LookupLoopIsZero:
396 cmp r12w, word ptr [r10 + r8]
397 jnz LookupLoop1
400 ;;; Store the current value of chainlen.
401 mov [chainlenwmask], edx
403 ;;; Point edi to the string under scrutiny, and esi to the string we
404 ;;; are hoping to match it up with. In actuality, esi and edi are
405 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
406 ;;; initialized to -(MAX_MATCH_8 - scanalign).
408 lea rsi,[r8+r10]
409 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
410 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
411 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
413 prefetcht1 [rsi+rdx]
414 prefetcht1 [rdi+rdx]
417 ;;; Test the strings for equality, 8 bytes at a time. At the end,
418 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
420 ;;; We already know at this point that the first three bytes of the
421 ;;; strings match each other, and they can be safely passed over before
422 ;;; starting the compare loop. So what this code does is skip over 0-3
423 ;;; bytes, as much as necessary in order to dword-align the edi
424 ;;; pointer. (rsi will still be misaligned three times out of four.)
426 ;;; It should be confessed that this loop usually does not represent
427 ;;; much of the total running time. Replacing it with a more
428 ;;; straightforward "rep cmpsb" would not drastically degrade
429 ;;; performance.
432 LoopCmps:
433 mov rax, [rsi + rdx]
434 xor rax, [rdi + rdx]
435 jnz LeaveLoopCmps
437 mov rax, [rsi + rdx + 8]
438 xor rax, [rdi + rdx + 8]
439 jnz LeaveLoopCmps8
442 mov rax, [rsi + rdx + 8+8]
443 xor rax, [rdi + rdx + 8+8]
444 jnz LeaveLoopCmps16
446 add rdx,8+8+8
448 jnz short LoopCmps
449 jmp short LenMaximum
450 LeaveLoopCmps16: add rdx,8
451 LeaveLoopCmps8: add rdx,8
452 LeaveLoopCmps:
454 test eax, 0000FFFFh
455 jnz LenLower
457 test eax,0ffffffffh
459 jnz LenLower32
461 add rdx,4
462 shr rax,32
463 or ax,ax
464 jnz LenLower
466 LenLower32:
467 shr eax,16
468 add rdx,2
469 LenLower: sub al, 1
470 adc rdx, 0
471 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
472 ;;; then automatically accept it as the best possible match and leave.
474 lea rax, [rdi + rdx]
475 sub rax, r9
476 cmp eax, MAX_MATCH
477 jge LenMaximum
479 ;;; If the length of the match is not longer than the best match we
480 ;;; have so far, then forget it and return to the lookup loop.
481 ;///////////////////////////////////
483 cmp eax, r11d
484 jg LongerMatch
486 lea rsi,[r10+r11]
488 mov rdi, prev_ad
489 mov edx, [chainlenwmask]
490 jmp LookupLoop
492 ;;; s->match_start = cur_match;
493 ;;; best_len = len;
494 ;;; if (len >= nice_match) break;
495 ;;; scan_end = *(ushf*)(scan+best_len-1);
497 LongerMatch:
498 mov r11d, eax
499 mov match_start, r8d
500 cmp eax, [nicematch]
501 jge LeaveNow
503 lea rsi,[r10+rax]
505 movzx ebx, word ptr [r9 + rax - 1]
506 mov rdi, prev_ad
507 mov edx, [chainlenwmask]
508 jmp LookupLoop
510 ;;; Accept the current string, with the maximum possible length.
512 LenMaximum:
513 mov r11d,MAX_MATCH
514 mov match_start, r8d
516 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
517 ;;; return s->lookahead;
519 LeaveNow:
520 IFDEF INFOZIP
521 mov eax,r11d
522 ELSE
523 mov eax, Lookahead
524 cmp r11d, eax
525 cmovng eax, r11d
526 ENDIF
528 ;;; Restore the stack and return from whence we came.
531 mov rsi,[save_rsi]
532 mov rdi,[save_rdi]
533 mov rbx,[save_rbx]
534 mov rbp,[save_rbp]
535 mov r12,[save_r12]
536 mov r13,[save_r13]
537 ; mov r14,[save_r14]
538 ; mov r15,[save_r15]
541 ret 0
542 ; please don't remove this string !
543 ; Your can freely use gvmat64 in any free or commercial app
544 ; but it is far better don't remove the string in the binary!
545 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
546 longest_match ENDP
548 match_init PROC
549 ret 0
550 match_init ENDP