build: update gnulib submodule to latest
[gzip.git] / lib / match.c
blob2cef62866c9e87ffa1351085ab570619376ae008
1 /* match.s -- optional optimized asm version of longest match in deflate.c
3 Copyright (C) 2002, 2006, 2009-2024 Free Software Foundation, Inc.
4 Copyright (C) 1992-1993 Jean-loup Gailly
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 * The 68020 version has been written by Francesco Potortì <pot@cnuce.cnr.it>
22 * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
23 * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
24 * Kristoffer Eriksson <ske@pkmab.se>
26 * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
27 * Unwind directives and some reformatting for better readability added by
28 * David Mosberger-Tang <davidm@hpl.hp.com>.
31 /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
32 * external symbols with an underline character '_'.
34 #ifdef NO_UNDERLINE
35 # define _prev prev
36 # define _window window
37 # define _match_start match_start
38 # define _prev_length prev_length
39 # define _good_match good_match
40 # define _nice_match nice_match
41 # define _strstart strstart
42 # define _max_chain_length max_chain_length
44 # define _match_init match_init
45 # define _longest_match longest_match
46 #endif
48 #ifdef DYN_ALLOC
49 error: DYN_ALLOC not yet supported in match.s
50 #endif
52 /* On x86-64, Sun C 5.13 (Oracle Solaris Studio 12.4) 'cc -E -m64'
53 defines i386 when compiling .s or .S files! Luckily it also
54 defines __x86_64__. See Bug#23133. */
55 #if ((defined i386 || defined _I386 || defined __i386 || defined __i386__) \
56 && !defined __x86_64__)
58 /* This version is for 386 Unix or OS/2 in 32 bit mode.
59 * Warning: it uses the AT&T syntax: mov source,dest
60 * This file is only optional. If you want to force the C version,
61 * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
62 * If you have reduced WSIZE in gzip.h, then change its value below.
63 * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
66 .file "match.S"
68 #define MAX_MATCH 258
69 #define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */
70 #define MIN_MATCH 3
71 #define WSIZE $32768
72 #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
74 .globl _match_init
75 .globl _longest_match
77 .text
79 _match_init:
80 ret
82 /*-----------------------------------------------------------------------
83 * Set match_start to the longest match starting at the given string and
84 * return its length. Matches shorter or equal to prev_length are discarded,
85 * in which case the result is equal to prev_length and match_start is
86 * garbage.
87 * IN assertions: cur_match is the head of the hash chain for the current
88 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
91 _longest_match: /* int longest_match(cur_match) */
93 #define cur_match 20(%esp)
94 /* return address */ /* esp+16 */
95 push %ebp /* esp+12 */
96 push %edi /* esp+8 */
97 push %esi /* esp+4 */
98 push %ebx /* esp */
101 * match equ esi
102 * scan equ edi
103 * chain_length equ ebp
104 * best_len equ ebx
105 * limit equ edx
107 mov cur_match,%esi
108 mov _max_chain_length,%ebp /* chain_length = max_chain_length */
109 mov _strstart,%edi
110 mov %edi,%edx
111 sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */
112 jae limit_ok
113 sub %edx,%edx /* limit = NIL */
114 limit_ok:
115 add $2+_window,%edi /* edi = offset(window+strstart+2) */
116 mov _prev_length,%ebx /* best_len = prev_length */
117 movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
118 movw -2(%edi),%cx /* cx = scan[0..1] */
119 cmp _good_match,%ebx /* do we have a good match already? */
120 jb do_scan
121 shr $2,%ebp /* chain_length >>= 2 */
122 jmp do_scan
124 .align 4
125 long_loop:
126 /* at this point, edi == scan+2, esi == cur_match */
127 movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
128 movw -2(%edi),%cx /* cx = scan[0..1] */
129 short_loop:
131 * at this point, di == scan+2, si == cur_match,
132 * ax = scan[best_len-1..best_len] and cx = scan[0..1]
134 and WSIZE-1, %esi
135 movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */
136 /* top word of esi is still 0 */
137 cmp %edx,%esi /* cur_match <= limit ? */
138 jbe the_end
139 dec %ebp /* --chain_length */
140 jz the_end
141 do_scan:
142 cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
143 jne short_loop
144 cmpw _window(%esi),%cx /* check min_match_length match */
145 jne short_loop
147 lea _window+2(%esi),%esi /* si = match */
148 mov %edi,%eax /* ax = scan+2 */
149 mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */
150 rep; cmpsw /* loop until mismatch */
151 je maxmatch /* match of length MAX_MATCH? */
152 mismatch:
153 movb -2(%edi),%cl /* mismatch on first or second byte? */
154 subb -2(%esi),%cl /* cl = 0 if first bytes equal */
155 xchg %edi,%eax /* edi = scan+2, eax = end of scan */
156 sub %edi,%eax /* eax = len */
157 sub %eax,%esi /* esi = cur_match + 2 + offset(window) */
158 sub $2+_window,%esi /* esi = cur_match */
159 subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */
160 adc $0,%eax /* eax = carry ? len+1 : len */
161 cmp %ebx,%eax /* len > best_len ? */
162 jle long_loop
163 mov %esi,_match_start /* match_start = cur_match */
164 mov %eax,%ebx /* ebx = best_len = len */
165 cmp _nice_match,%eax /* len >= nice_match ? */
166 jl long_loop
167 the_end:
168 mov %ebx,%eax /* result = eax = best_len */
169 pop %ebx
170 pop %esi
171 pop %edi
172 pop %ebp
174 maxmatch:
175 cmpsb
176 jmp mismatch
178 #else
180 /* ======================== 680x0 version ================================= */
182 #if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
183 # ifndef mc68000
184 # define mc68000
185 # endif
186 #endif
188 #if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
189 # ifndef mc68020
190 # define mc68020
191 # endif
192 #endif
194 #if defined(mc68020) || defined(mc68000)
196 #if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
197 # define UNALIGNED_OK
198 #endif
200 #ifdef sysV68 /* Try Motorola Delta style */
202 # define GLOBAL(symbol) global symbol
203 # define TEXT text
204 # define FILE(filename) file filename
205 # define invert_maybe(src,dst) dst,src
206 # define imm(data) &data
207 # define reg(register) %register
209 # define addl add.l
210 # define addql addq.l
211 # define blos blo.b
212 # define bhis bhi.b
213 # define bras bra.b
214 # define clrl clr.l
215 # define cmpmb cmpm.b
216 # define cmpw cmp.w
217 # define cmpl cmp.l
218 # define lslw lsl.w
219 # define lsrl lsr.l
220 # define movel move.l
221 # define movew move.w
222 # define moveb move.b
223 # define moveml movem.l
224 # define subl sub.l
225 # define subw sub.w
226 # define subql subq.l
228 # define IndBase(bd,An) (bd,An)
229 # define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
230 # define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
231 # define predec(An) -(An)
232 # define postinc(An) (An)+
234 #else /* default style (Sun 3, NeXT, Atari) */
236 # define GLOBAL(symbol) .globl symbol
237 # define TEXT .text
238 # define FILE(filename) .even
239 # define invert_maybe(src,dst) src,dst
240 # if defined(sun) || defined(mc68k)
241 # define imm(data) #data
242 # else
243 # define imm(data) \#data
244 # endif
245 # define reg(register) register
247 # define blos bcss
248 # if defined(sun) || defined(mc68k)
249 # define movel movl
250 # define movew movw
251 # define moveb movb
252 # endif
253 # define IndBase(bd,An) An@(bd)
254 # define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
255 # define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
256 # define predec(An) An@-
257 # define postinc(An) An@+
259 #endif /* styles */
261 #define Best_Len reg(d0) /* unsigned */
262 #define Cur_Match reg(d1) /* Ipos */
263 #define Loop_Counter reg(d2) /* int */
264 #define Scan_Start reg(d3) /* unsigned short */
265 #define Scan_End reg(d4) /* unsigned short */
266 #define Limit reg(d5) /* IPos */
267 #define Chain_Length reg(d6) /* unsigned */
268 #define Scan_Test reg(d7)
269 #define Scan reg(a0) /* *uch */
270 #define Match reg(a1) /* *uch */
271 #define Prev_Address reg(a2) /* *Pos */
272 #define Scan_Ini reg(a3) /* *uch */
273 #define Match_Ini reg(a4) /* *uch */
274 #define Stack_Pointer reg(sp)
276 #define MAX_MATCH 258
277 #define MIN_MATCH 3
278 #define WSIZE 32768
279 #define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1)
281 GLOBAL (_match_init)
282 GLOBAL (_longest_match)
284 TEXT
286 FILE ("match.S")
288 _match_init:
291 /*-----------------------------------------------------------------------
292 * Set match_start to the longest match starting at the given string and
293 * return its length. Matches shorter or equal to prev_length are discarded,
294 * in which case the result is equal to prev_length and match_start is
295 * garbage.
296 * IN assertions: cur_match is the head of the hash chain for the current
297 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
300 /* int longest_match (cur_match) */
302 #ifdef UNALIGNED_OK
303 # define pushreg 15928 /* d2-d6/a2-a4 */
304 # define popreg 7292
305 #else
306 # define pushreg 16184 /* d2-d7/a2-a4 */
307 # define popreg 7420
308 #endif
310 _longest_match:
311 movel IndBase(4,Stack_Pointer),Cur_Match
312 moveml imm(pushreg),predec(Stack_Pointer)
313 movel _max_chain_length,Chain_Length
314 movel _prev_length,Best_Len
315 movel imm(_prev),Prev_Address
316 movel imm(_window+MIN_MATCH),Match_Ini
317 movel _strstart,Limit
318 movel Match_Ini,Scan_Ini
319 addl Limit,Scan_Ini
320 subw imm(MAX_DIST),Limit
321 bhis L__limit_ok
322 clrl Limit
323 L__limit_ok:
324 cmpl invert_maybe(_good_match,Best_Len)
325 blos L__length_ok
326 lsrl imm(2),Chain_Length
327 L__length_ok:
328 subql imm(1),Chain_Length
329 #ifdef UNALIGNED_OK
330 movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
331 movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
332 #else
333 moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
334 lslw imm(8),Scan_Start
335 moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
336 moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
337 lslw imm(8),Scan_End
338 moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
339 #endif
340 bras L__do_scan
342 L__long_loop:
343 #ifdef UNALIGNED_OK
344 movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
345 #else
346 moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
347 lslw imm(8),Scan_End
348 moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
349 #endif
351 L__short_loop:
352 lslw imm(1),Cur_Match
353 movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
354 cmpw invert_maybe(Limit,Cur_Match)
355 dbls Chain_Length,L__do_scan
356 bras L__return
358 L__do_scan:
359 movel Match_Ini,Match
360 addl Cur_Match,Match
361 #ifdef UNALIGNED_OK
362 cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
363 bne L__short_loop
364 cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
365 bne L__short_loop
366 #else
367 moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
368 lslw imm(8),Scan_Test
369 moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
370 cmpw invert_maybe(Scan_Test,Scan_End)
371 bne L__short_loop
372 moveb IndBase(-MIN_MATCH,Match),Scan_Test
373 lslw imm(8),Scan_Test
374 moveb IndBase(-MIN_MATCH+1,Match),Scan_Test
375 cmpw invert_maybe(Scan_Test,Scan_Start)
376 bne L__short_loop
377 #endif
379 movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
380 movel Scan_Ini,Scan
381 L__scan_loop:
382 cmpmb postinc(Match),postinc(Scan)
383 dbne Loop_Counter,L__scan_loop
385 subl Scan_Ini,Scan
386 addql imm(MIN_MATCH-1),Scan
387 cmpl invert_maybe(Best_Len,Scan)
388 bls L__short_loop
389 movel Scan,Best_Len
390 movel Cur_Match,_match_start
391 cmpl invert_maybe(_nice_match,Best_Len)
392 blos L__long_loop
393 L__return:
394 moveml postinc(Stack_Pointer),imm(popreg)
397 #else
399 # if defined (__ia64__)
401 /* ======================== ia64 version ================================= */
404 * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
406 * Optimized for McKinley, but with Merced-compatibility, such as
407 * MIB+MIB, used wherever possible.
409 * Copyright: Sverre Jarp (HP Labs) 2001-2002
411 * See deflate.c for c-version
412 * Version 2 - Optimize the outer loop
415 #include <endian.h>
417 #if __BYTE_ORDER == ____BIG_ENDIAN
418 #define first shl
419 #define second shr.u
420 #define count czx1.l
421 #else
422 #define first shr.u
423 #define second shl
424 #define count czx1.r
425 #endif
427 // 24 rotating register (r32 - r55)
429 #define s_vmatch0 r32
430 #define s_vmatch1 r33
431 #define s_vmatbst r34
432 #define s_vmatbst1 r35
433 #define s_amatblen r36
435 #define s_tm1 r56
436 #define s_tm2 r57
437 #define s_tm3 r58
438 #define s_tm4 r59
439 #define s_tm5 r60
440 #define s_tm6 r61
441 #define s_tm7 r62
442 #define s_tm8 r63
444 #define s_vlen r31
445 #define s_vstrstart r30
446 #define s_vchainlen r29
447 #define s_awinbest r28
448 #define s_vcurmatch r27
449 #define s_vlimit r26
450 #define s_vscanend r25
451 #define s_vscanend1 r24
452 #define s_anicematch r23
453 #define s_vscan0 r22
454 #define s_vscan1 r21
456 #define s_aprev r20
457 #define s_awindow r19
458 #define s_amatchstart r18
459 #define s_ascan r17
460 #define s_amatch r16
461 #define s_wmask r15
462 #define s_ascanend r14
464 #define s_vspec_cmatch r11 // next iteration
465 #define s_lcsave r10
466 #define s_prsave r9
467 #define s_vbestlen r8 // return register
469 #define s_vscan3 r3
470 #define s_vmatch3 r2
472 #define p_no p2
473 #define p_yes p3
474 #define p_shf p4 //
475 #define p_bn2 p5 // Use in loop (indicating bestlen != 2)
477 #define p_nbs p9 // not new best_len
478 #define p_nnc p10 // not nice_length
479 #define p_ll p11
480 #define p_end p12
482 #define MAX_MATCH 258
483 #define MIN_MATCH 4
484 #define WSIZE 32768
485 #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
487 #define R_INPUT 1
488 #define R_LOCAL 31
489 #define R_OUTPUT 0
490 #define R_ROTATING 24
491 #define MLAT 3
492 #define SHLAT 2
494 #define mova mov
495 #define movi0 mov
496 #define cgtu cmp.gt.unc
497 #define cgeu cmp.ge.unc
498 #define cneu cmp.ne.unc
500 .global longest_match
501 .proc longest_match
502 .align 32
503 longest_match:
504 // -- Cycle: 0
505 .prologue
506 {.mmi
507 alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
508 .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
509 shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
510 .rotp lc[MLAT+SHLAT+2]
511 mova s_vspec_cmatch=in0 // cur_match from input register
512 add s_tm1=@gprel(strstart),gp // a(a(strstart))
513 }{.mmi
514 add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
515 add s_tm5=@ltoff(window),gp // a(a(window))
516 add s_tm6=@ltoff(prev),gp // a(a(prev))
518 }{.mmb // Cycle: 1
519 ld4 s_vstrstart=[s_tm1] // strstart
520 ld4 s_vbestlen=[s_tm3] // best_len = prev_length
521 brp.loop.imp .cmploop,.cmploop+48
522 }{.mli
523 add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
524 movl s_wmask=WSIZE-1
526 }{.mmi // Cycle: 2
527 ld8 s_aprev=[s_tm6] // a(prev)
528 ld8 s_awindow=[s_tm5] // a(window)
529 .save pr, s_prsave
530 movi0 s_prsave=pr // save predicates
531 }{.mmi
532 add s_tm4=@gprel(good_match),gp // a(a(good_match))
533 add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
534 add s_tm8=@ltoff(match_start),gp // a(match_start)
536 }{.mmi // Cycle: 3
537 ld8 s_anicematch=[s_tm7] // a(nice_match)
538 ld8 s_amatchstart=[s_tm8] // a(match_start)
539 .save ar.lc, s_lcsave
540 movi0 s_lcsave=ar.lc // save loop count register
541 }{.mmi
542 .body
543 add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
544 cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
545 mova s_vcurmatch=s_vspec_cmatch
547 }{.mmi // Cycle: 4
548 ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
549 ld4 s_tm4=[s_tm4] // v(good_match)
550 add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
551 }{.mmi
552 sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
553 add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
554 and s_vspec_cmatch =s_vspec_cmatch,s_wmask
556 }{.mmi // Cycle: 5
557 add s_amatblen=s_amatch,s_vbestlen //
558 cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
559 add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
560 }{.mmi
561 ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
562 ld1 s_vmatch0=[s_amatch],1
563 cgtu p0,p_no=s_vlimit,r0 // is result positive ?
565 }{.mmi // Cycle: 6
566 ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
567 ld1.nt1 s_vmatch1=[s_amatch],2
568 add s_awinbest=s_awindow,s_vbestlen //
570 }{.mmi // Cycle: 7
571 ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
572 ld1.nt1 s_vmatbst=[s_amatblen],-1
573 (p_no) mova s_vlimit=r0
575 }{.mmi // Cycle: 8
576 (p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
577 (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
578 shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
579 }{.mmi
580 cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
582 }{.mmi // Cycle: 9
583 ld1.nt1 s_vscan3=[s_ascan]
584 ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
585 mova s_vlen=3
586 }{.mmi
587 (p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
589 }{.mmi // Cycle: 10
590 ld1.nt1 s_vmatch3=[s_amatch]
591 // p_ll switched on as soon as we get a mismatch:
592 cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
593 cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
594 }{.mib
595 cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
596 (p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
597 (p_ll) br.cond.dpnt.many .test_more
601 .next_iter:
602 {.mmi // Cycle 0
603 add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
604 mov s_vcurmatch=s_vspec_cmatch // current value
605 add s_vchainlen=-1,s_vchainlen // --chain_length
606 }{.mib
607 cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
608 and s_vspec_cmatch=s_vspec_cmatch,s_wmask
609 (p_end) br.cond.dptk.many .terminate
611 }{.mmi // Cycle 1
612 ld1 s_vmatch0=[s_amatch],1 // load match[0]
613 // compute prev[cur_match]:
614 shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
615 cmp.eq.unc p_end,p0=s_vchainlen,r0
616 } {.mib
617 nop.m 0
618 add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match
619 (p_end) br.cond.dptk.many .terminate
621 }{.mmi // Cycle 2 (short)
622 ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
624 }{.mmi // Cycle 3 (short)
625 ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len]
626 cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false'
628 }{.mmi // Cycle 4 (short)
629 // load match[1] - - note: match += 3 (in total):
630 ld1.nt1 s_vmatch1=[s_amatch],2
632 // Cycle 5 (short)
633 (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1]
634 }{.mib // Here we (MOST LIKELY) pay a L2-fetch stall
635 // p_ll switched on as soon as we get a mismatch:
636 cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
637 cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
638 (p_ll) br.cond.dptk.many .next_iter
640 }{.mmi // Cycle 6
641 ld1.nt1 s_vmatch3=[s_amatch]
642 mova s_vlen=3
643 nop.i 0
644 }{.mib
645 cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
646 (p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
647 (p_ll) br.cond.dptk.many .next_iter
651 // We have passed the first hurdle - Are there additional matches ???
653 .test_more:
654 {.mmi // Cycle 0
655 and s_tm3=7,s_ascan // get byte offset
656 and s_tm4=7,s_amatch // get byte offset
657 movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual
658 }{.mib
659 cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ?
660 (p_no) br.cond.dptk.many .only3
662 }{.mmi // Cycle 1
663 and s_tm1=-8,s_ascan // get aligned address
664 shladd s_tm3=s_tm3,3,r0
665 movi0 ar.lc=31 // 32 times around the loop (8B at a time)
666 }{.mib
667 and s_tm2=-8,s_amatch // get aligned address
668 shladd s_tm4=s_tm4,3,r0
669 nop.b 0
671 }{.mmi // Cycle 2
672 ld8.nt1 scan[1]=[s_tm1],8 // load first chunk
673 sub s_tm5=64,s_tm3 // 64 - amount
674 movi0 pr.rot=1<<16
675 }{.mmi
676 ld8.nt1 match[1]=[s_tm2],8 // load first chunk
677 sub s_tm6=64,s_tm4 // 64 - amount
678 add s_vlen=-8,s_vlen // will be updated at least once
681 .align 32
682 .cmploop:
683 {.mmi // Cycle 0
684 (lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk
685 (lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen
686 (lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3
687 }{.mib
688 (lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
689 (lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4
690 (p_no) br.cond.dpnt.many .mismatch
692 }{.mii // Cycle 1
693 (lc[0]) ld8 match[0]=[s_tm2],8
694 // shift left(le) or right(be):
695 (lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5
696 (lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6
697 }{.mmb
698 (lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
699 (lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
700 br.ctop.dptk.many .cmploop
702 }{.mfi
703 mov s_vlen=258
704 nop.f 0
705 }{.mfi
706 nop.f 0 // realign
709 .mismatch:
710 {.mii // Cycle 0 (short)
711 (p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character
712 nop.i 0
714 // Cycle 1 (short)
715 (p_no) count s_tm1=s_tm2
717 }{.mib // Cycle 2 (short)
718 (p_no) add s_vlen=s_vlen,s_tm1 // effective length
719 nop.i 0
720 clrrrb
724 .only3:
725 {.mib // Cycle 0 (short)
726 cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ?
727 (p_nbs) br.cond.dpnt.many .next_iter // if not, reiterate
729 }{.mmi // Cycle 1 (short)
730 ld4 s_tm7=[s_anicematch] // nice_match
731 st4 [s_amatchstart]= s_vcurmatch
732 add s_ascanend=s_ascan,s_vlen // reset with best_len
734 }{.mmi // Cycle 2 (short)
735 mova s_vbestlen=s_vlen
736 add s_ascanend=-3,s_ascanend // remember extra offset
738 }{.mmi // Cycle 3 (short)
739 ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
740 add s_awinbest=s_awindow,s_vbestlen // update with new best_len
741 cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2
743 }{.mib // Cycle 4 (short)
744 // scan_end1=scan[best_len-1] NB: s_ascanend reset:
745 ld1.nt1 s_vscanend1=[s_ascanend],1
746 cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match
747 (p_nnc) br.cond.dptk.many .next_iter
750 .terminate:
751 {.mii // Cycle 0/1
752 nop.m 0
753 movi0 ar.lc=s_lcsave
754 movi0 pr=s_prsave,-1
755 }{.mbb
756 nop.m 0
757 nop.b 0
758 br.ret.sptk.many rp // ret0 is identical to best_len
761 .endp
763 .global match_init
764 .proc match_init
765 match_init:
766 sub ret0=ret0,ret0
767 br.ret.sptk.many rp
768 .endp
770 # else
771 error: this asm version is for 386 or 680x0 or ia64 only
772 # endif /* __ia64__ */
773 #endif /* mc68000 || mc68020 */
774 #endif /* i386 || _I386 */