1 ; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2 ; 2018-02-06: Igor Pavlov : Public domain
4 ; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
5 ; function for check at link time.
6 ; That code is tightly coupled with LzmaDec_TryDummy()
7 ; and with other functions in LzmaDec.c file.
8 ; CLzmaDec structure, (probs) array layout, input and output of
9 ; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
13 ; .err <x64_IS_REQUIRED>
; Opens the segment _TEXT$LZMADECOPT: 64-byte alignment, class 'CODE'.
20 _TEXT$LZMADECOPT
SEGMENT ALIGN(64) 'CODE'
; MY_ALIGN takes one required argument (num); its body is not visible in this excerpt.
22 MY_ALIGN
macro num:req
39 ; _LZMA_SIZE_OPT equ 1
; Probability load/store forms. The macro headers that enclose these lines
; are not visible in this excerpt (presumably the PLOAD / PSTORE helpers
; used throughout the file - confirm against the full source).
; 32-bit form: load dest from the dword at [mem].
46 mov dest
, dword ptr [mem
]
; 32-bit form: store src to the dword at [mem].
49 mov dword ptr [mem
], src
; 16-bit form: zero-extending load of the word at [mem] into dest.
54 movzx dest
, word ptr [mem
]
; 16-bit form: the stored operand is the name built by @CatStr(src, _W),
; i.e. src with "_W" appended.
57 mov word ptr [mem
], @CatStr(src
, _W
)
; Scaling constants derived from PSHIFT (PSHIFT is defined outside this excerpt):
;   PMULT      = 1 << PSHIFT
;   PMULT_HALF = 1 << (PSHIFT - 1)
;   PMULT_2    = 1 << (PSHIFT + 1)
61 PMULT
equ (1 SHL PSHIFT
)
62 PMULT_HALF
equ (1 SHL (PSHIFT
- 1))
63 PMULT_2
equ (1 SHL (PSHIFT
+ 1))
67 ; x1 pbPos / (prob) TREE
68 ; x2 probBranch / prm (MATCHED) / pbPos / cnt
72 ; x6 t1 NORM_CALC / probs_state / dist
73 ; x7 t0 NORM_CALC / prob2 IF_BIT_1
75 ; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg
76 ; x10 kBitModelTotal_reg
78 ; x12 offs (MATCHED) / dic / len_temp
80 ; x14 bit (MATCHED) / dicPos
; kBitModelTotal_reg is an alias for x10; LzmaDec_DecodeReal_3 loads it with kBitModelTotal.
91 kBitModelTotal_reg
equ x10
; probs_state_R is an alias for t1_R, so both names refer to the same register.
124 probs_state_R
equ t1_R
; Bit-model constants.
;   kNumBitModelTotalBits = 11
;   kBitModelTotal        = 1 << kNumBitModelTotalBits (2048)
144 kNumBitModelTotalBits
equ 11
145 kBitModelTotal
equ (1 SHL kNumBitModelTotalBits
)
;   kBitModelOffset = (1 << kNumMoveBits) - 1   (kNumMoveBits is defined outside this excerpt)
147 kBitModelOffset
equ ((1 SHL kNumMoveBits
) - 1)
;   kTopValue = 1 << 24
148 kTopValue
equ (1 SHL 24)
151 ; movzx t0, BYTE PTR [buf]
; Load the byte at [buf] into cod_L. The surrounding instructions (and the
; enclosing macro header, if any) are not visible in this excerpt.
153 mov cod_L
, BYTE PTR [buf
]
168 ; ---------- Branch MACROS ----------
; UPDATE_0 probsArray, probOffset, probDisp   (all three arguments required)
; Visible body: moves the probability held in probBranch towards
; kBitModelTotal_reg and stores it back:
;   prob2       = (kBitModelTotal_reg - probBranch) >> kNumMoveBits
;   probBranch += prob2
;   PSTORE probBranch -> probOffset * 1 + probsArray + probDisp * PMULT
; Overwrites prob2 and probBranch.
170 UPDATE_0
macro probsArray:req
, probOffset:req
, probDisp:req
171 mov prob2
, kBitModelTotal_reg
172 sub prob2
, probBranch
173 shr prob2
, kNumMoveBits
174 add probBranch
, prob2
175 PSTORE probBranch
, probOffset
* 1 + probsArray
+ probDisp
* PMULT
; UPDATE_1 probsArray, probOffset, probDisp   (all three arguments required)
; Visible body: lowers the probability held in probBranch and stores it:
;   prob2        = probBranch
;   probBranch >>= kNumMoveBits
;   prob2       -= probBranch
;   PSTORE prob2 -> probOffset * 1 + probsArray + probDisp * PMULT
; Overwrites prob2 and probBranch. The instructions between the macro header
; and the first visible line are not shown in this excerpt.
179 UPDATE_1
macro probsArray:req
, probOffset:req
, probDisp:req
183 mov prob2
, probBranch
184 shr probBranch
, kNumMoveBits
185 sub prob2
, probBranch
186 PSTORE prob2
, probOffset
* 1 + probsArray
+ probDisp
* PMULT
; CMP_COD probsArray, probOffset, probDisp   (all three arguments required)
; Visible body:
;   PLOAD probBranch <- probOffset * 1 + probsArray + probDisp * PMULT
;   range = (range >> kNumBitModelTotalBits) * probBranch
; The instructions between the load and the shift, and everything after the
; multiply (presumably the compare the name refers to), are not visible here.
190 CMP_COD
macro probsArray:req
, probOffset:req
, probDisp:req
191 PLOAD probBranch
, probOffset
* 1 + probsArray
+ probDisp
* PMULT
194 shr range
, kNumBitModelTotalBits
195 imul range
, probBranch
; IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel   (all required)
; Expands CMP_COD with the first three arguments. The use of toLabel
; (presumably a conditional jump) is not visible in this excerpt.
200 IF_BIT_1_NOUP
macro probsArray:req
, probOffset:req
, probDisp:req
, toLabel:req
201 CMP_COD probsArray
, probOffset
, probDisp
; IF_BIT_1 probsArray, probOffset, probDisp, toLabel   (all required)
; Expands IF_BIT_1_NOUP with all four arguments, then UPDATE_0 on the same
; probability slot (the first three arguments).
206 IF_BIT_1
macro probsArray:req
, probOffset:req
, probDisp:req
, toLabel:req
207 IF_BIT_1_NOUP probsArray
, probOffset
, probDisp
, toLabel
208 UPDATE_0 probsArray
, probOffset
, probDisp
; IF_BIT_0_NOUP probsArray, probOffset, probDisp, toLabel   (all required)
; Expands CMP_COD with the first three arguments. The use of toLabel
; (presumably a conditional jump) is not visible in this excerpt.
212 IF_BIT_0_NOUP
macro probsArray:req
, probOffset:req
, probDisp:req
, toLabel:req
213 CMP_COD probsArray
, probOffset
, probDisp
218 ; ---------- CMOV MACROS ----------
; NORM_CALC prob   (argument required)
; Only one body line is visible: range >>= kNumBitModelTotalBits.
; The remaining instructions are not shown in this excerpt.
220 NORM_CALC
macro prob:req
223 shr range
, kNumBitModelTotalBits
; PUP prob, probPtr   (both arguments required)
; The macro body is not visible in this excerpt; only the original note
; about sar is retained below.
231 PUP
macro prob:req
, probPtr:req
233 ; only sar works for both 16/32 bit prob modes
; PUP_SUB prob, probPtr, symSub   (all three arguments required)
; The macro body is not visible in this excerpt.
240 PUP_SUB
macro prob:req
, probPtr:req
, symSub:req
; PUP_COD prob, probPtr, symSub   (all three arguments required)
; Visible body: t0 is set to kBitModelOffset, then replaced by
; kBitModelTotal_reg when the carry flag is set (cmovb); the instruction
; that sets the flags is not visible in this excerpt. Finally PUP_SUB is
; expanded with the same three arguments.
246 PUP_COD
macro prob:req
, probPtr:req
, symSub:req
247 mov t0
, kBitModelOffset
250 cmovb t0
, kBitModelTotal_reg
251 PUP_SUB prob
, probPtr
, symSub
; BIT_0 prob, probNext   (both arguments required)
; Visible body:
;   prob     <- [probs + 1 * PMULT]
;   probNext <- [probs + 1 * PMULT_2]
255 BIT_0
macro prob:req
, probNext:req
256 PLOAD prob
, probs
+ 1 * PMULT
257 PLOAD probNext
, probs
+ 1 * PMULT_2
;   t0       <- [probs + 1 * PMULT_2 + PMULT]
262 PLOAD t0
, probs
+ 1 * PMULT_2
+ PMULT
; t0 = kBitModelOffset, or kBitModelTotal_reg when the carry flag is set
; (the flag-setting instruction is not visible in this excerpt).
264 mov t0
, kBitModelOffset
266 cmovb t0
, kBitModelTotal_reg
; Expand PUP_SUB for prob at probs + 1 * PMULT with symSub = 0 - 1.
268 PUP_SUB prob
, probs
+ 1 * PMULT
, 0 - 1
; BIT_1 prob, probNext   (both arguments required)
; Visible body:
;   probNext <- [probs + sym_R * PMULT_2]
;   t0       <- [probs + sym_R * PMULT + PMULT]
;   PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
; Instructions between these lines are not visible in this excerpt.
272 BIT_1
macro prob:req
, probNext:req
273 PLOAD probNext
, probs
+ sym_R
* PMULT_2
279 PLOAD t0
, probs
+ sym_R
* PMULT
+ PMULT
281 PUP_COD prob
, probs
+ t1_R
* PMULT_HALF
, 0 - 1
; BIT_2 prob, symSub   (both arguments required)
; Visible body: PUP_COD prob, probs + t1_R * PMULT_HALF, symSub.
; Earlier body lines are not visible in this excerpt.
285 BIT_2
macro prob:req
, symSub:req
291 PUP_COD prob
, probs
+ t1_R
* PMULT_HALF
, symSub
295 ; ---------- MATCHED LITERAL ----------
; Matched-literal decoding fragments. The macro headers that enclose these
; three groups are not visible in this excerpt.
;
; Group 1: offs = 256 * PMULT; match <<= (PSHIFT + 1).
298 mov offs
, 256 * PMULT
299 shl match
, (PSHIFT
+ 1)
; x1 <- [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]; prm = that same address.
302 PLOAD x1
, probs
+ 256 * PMULT
+ bit_R
* 1 + 1 * PMULT
303 lea prm
, [probs
+ 256 * PMULT
+ bit_R
* 1 + 1 * PMULT
]
304 ; lea prm, [probs + 256 * PMULT + 1 * PMULT]
; t0 = kBitModelOffset, or kBitModelTotal_reg when the carry flag is set
; (the flag-setting instruction is not visible in this excerpt).
314 mov t0
, kBitModelOffset
316 cmovb t0
, kBitModelTotal_reg
; Expand PUP_SUB for x1 through prm with symSub = -2-1.
318 PUP_SUB x1
, prm
, -2-1
; Group 2: prm = probs + offs_R; x1 <- [prm + sym_R * PMULT];
; then PUP_COD x1, prm + t1_R * PMULT_HALF, -1.
324 lea prm
, [probs
+ offs_R
* 1]
326 PLOAD x1
, prm
+ sym_R
* PMULT
336 PUP_COD x1
, prm
+ t1_R
* PMULT_HALF
, - 1
; Group 3: same address computation and load as group 2, but PUP_COD is
; given symSub = 256 - 1.
342 lea prm
, [probs
+ offs_R
* 1]
344 PLOAD x1
, prm
+ sym_R
* PMULT
350 PUP_COD x1
, prm
+ t1_R
* PMULT_HALF
, 256 - 1
354 ; ---------- REVERSE BITS ----------
; REV_0 prob, probNext   (both arguments required)
; Visible body:
;   probNext <- [sym2_R]
;   t0       <- [probs + 3 * PMULT], then overwritten by the constant selection below
;   t1_R      = probs + 3 * PMULT
;   PUP prob, probs + 1 * PMULT
; Instructions between these lines are not visible in this excerpt.
356 REV_0
macro prob:req
, probNext:req
357 ; PLOAD prob, probs + 1 * PMULT
358 ; lea sym2_R, [probs + 2 * PMULT]
359 ; PLOAD probNext, probs + 2 * PMULT
360 PLOAD probNext
, sym2_R
365 PLOAD t0
, probs
+ 3 * PMULT
; t0 = kBitModelOffset, or kBitModelTotal_reg when the carry flag is set
; (the flag-setting instruction is not visible in this excerpt).
368 mov t0
, kBitModelOffset
369 cmovb t0
, kBitModelTotal_reg
370 lea t1_R
, [probs
+ 3 * PMULT
]
372 PUP prob
, probs
+ 1 * PMULT
; REV_1 prob, probNext, step   (all three arguments required)
; Visible body:
;   sym2_R  += step * PMULT
;   probNext <- [sym2_R]
;   t0       <- [sym2_R + step * PMULT], then overwritten by the constant selection below
;   t1_R      = sym2_R + step * PMULT
;   PUP prob, t1_R - step * PMULT_2
; Instructions between these lines are not visible in this excerpt.
376 REV_1
macro prob:req
, probNext:req
, step:req
377 add sym2_R
, step
* PMULT
378 PLOAD probNext
, sym2_R
383 PLOAD t0
, sym2_R
+ step
* PMULT
; t0 = kBitModelOffset, or kBitModelTotal_reg when the carry flag is set
; (the flag-setting instruction is not visible in this excerpt).
386 mov t0
, kBitModelOffset
387 cmovb t0
, kBitModelTotal_reg
388 lea t1_R
, [sym2_R
+ step
* PMULT
]
390 PUP prob
, t1_R
- step
* PMULT_2
; REV_2 prob, step   (both arguments required)
; Visible body: t0 = kBitModelOffset, or kBitModelTotal_reg when the carry
; flag is set (flag-setting instruction not visible), followed by
; PUP prob, probs + sym2_R * PMULT. The use of step is not visible here.
394 REV_2
macro prob:req
, step:req
405 mov t0
, kBitModelOffset
406 cmovb t0
, kBitModelTotal_reg
407 PUP prob
, probs
+ sym2_R
* PMULT
; REV_1_VAR prob   (argument required)
; Visible body: t0_R = sym_R + sym2_R, and later t0 = kBitModelOffset,
; replaced by kBitModelTotal_reg when the carry flag is set. The
; instructions that consume t0_R and set the flags are not visible here.
411 REV_1_VAR
macro prob:req
419 lea t0_R
, [sym_R
+ sym2_R
]
421 mov t0
, kBitModelOffset
423 ; mov t1, kBitModelTotal
425 cmovb t0
, kBitModelTotal_reg
; LIT_PROBS lpMaskParam   (argument required)
; Visible body:
;   probs_state_R += pbPos_R
;   sym            = sym_R + 2 * sym_R   (i.e. sym * 3, matching the "* 3" in the C expression below)
;   probs         += Literal * PMULT
;   UPDATE_0 probs_state_R, 0, IsMatch
; The use of lpMaskParam is not visible in this excerpt.
433 LIT_PROBS
macro lpMaskParam:req
434 ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
439 add probs_state_R
, pbPos_R
441 lea sym
, dword ptr[sym_R
+ 2 * sym_R
]
442 add probs
, Literal
* PMULT
445 UPDATE_0 probs_state_R
, 0, IsMatch
; Length-coder sizes. kNumPosBitsMax, kLenNumLowBits and LenLow are defined
; outside this excerpt.
;   kNumPosStatesMax   = 1 << kNumPosBitsMax
;   kLenNumLowSymbols  = 1 << kLenNumLowBits
;   kLenNumHighBits    = 8
;   kLenNumHighSymbols = 1 << kLenNumHighBits (256)
;   kNumLenProbs       = 2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols
452 kNumPosStatesMax
equ (1 SHL kNumPosBitsMax
)
455 kLenNumLowSymbols
equ (1 SHL kLenNumLowBits
)
456 kLenNumHighBits
equ 8
457 kLenNumHighSymbols
equ (1 SHL kLenNumHighBits
)
458 kNumLenProbs
equ (2 * kLenNumLowSymbols
* kNumPosStatesMax
+ kLenNumHighSymbols
)
; Offsets inside one length coder, relative to LenLow:
;   LenChoice2 = LenLow + kLenNumLowSymbols
;   LenHigh    = LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax
462 LenChoice2
equ (LenLow
+ kLenNumLowSymbols
)
463 LenHigh
equ (LenLow
+ 2 * kLenNumLowSymbols
* kNumPosStatesMax
)
; Distance-coder constants. kNumAlignBits and kMatchMinLen are defined
; outside this excerpt.
;   kStartPosModelIndex = 4
;   kEndPosModelIndex   = 14
;   kNumFullDistances   = 1 << (kEndPosModelIndex >> 1) (128)
469 kStartPosModelIndex
equ 4
470 kEndPosModelIndex
equ 14
471 kNumFullDistances
equ (1 SHL (kEndPosModelIndex
SHR 1))
;   kNumPosSlotBits     = 6
;   kNumLenToPosStates  = 4
;   kAlignTableSize     = 1 << kNumAlignBits
473 kNumPosSlotBits
equ 6
474 kNumLenToPosStates
equ 4
477 kAlignTableSize
equ (1 SHL kNumAlignBits
)
;   kMatchSpecLenStart  = kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols
480 kMatchSpecLenStart
equ (kMatchMinLen
+ kLenNumLowSymbols
* 2 + kLenNumHighSymbols
)
; Probability-table layout. Each offset is the previous table's offset plus
; that table's size, starting from SpecPos = -kStartOffset (-1664), so the
; offsets are relative to a point kStartOffset entries past the start of
; the table (presumably the probs_1664 pointer loaded in
; LzmaDec_DecodeReal_3 - confirm against LzmaDec.c).
; kNumStates and kNumStates2 are defined outside this excerpt.
482 kStartOffset
equ 1664
483 SpecPos
equ (-kStartOffset
)
484 IsRep0Long
equ (SpecPos
+ kNumFullDistances
)
485 RepLenCoder
equ (IsRep0Long
+ (kNumStates2
SHL kNumPosBitsMax
))
486 LenCoder
equ (RepLenCoder
+ kNumLenProbs
)
487 IsMatch
equ (LenCoder
+ kNumLenProbs
)
488 kAlign
equ (IsMatch
+ (kNumStates2
SHL kNumPosBitsMax
))
489 IsRep
equ (kAlign
+ kAlignTableSize
)
490 IsRepG0
equ (IsRep
+ kNumStates
)
491 IsRepG1
equ (IsRepG0
+ kNumStates
)
492 IsRepG2
equ (IsRepG1
+ kNumStates
)
493 PosSlot
equ (IsRepG2
+ kNumStates
)
494 Literal
equ (PosSlot
+ (kNumLenToPosStates
SHL kNumPosSlotBits
))
; NUM_BASE_PROBS = Literal + kStartOffset: the Literal offset measured from
; the start of the table. It is checked against 1984 right after this block.
495 NUM_BASE_PROBS
equ (Literal
+ kStartOffset
)
; Assembly-time layout checks.
; First .err: reports Stop_Compiling_Bad_LZMA_kAlign; the condition that
; guards it is not visible in this excerpt.
498 .err
<Stop_Compiling_Bad_LZMA_kAlign
>
; Second check: assembly is stopped with Stop_Compiling_Bad_LZMA_PROBS when
; NUM_BASE_PROBS is not equal to 1984 (the closing endif is not visible here).
501 if NUM_BASE_PROBS ne
1984
502 .err
<Stop_Compiling_Bad_LZMA_PROBS
>
; Structure fragments. The first two fields belong to a structure whose
; header is not visible in this excerpt (presumably CLzmaDec_Asm, since both
; names are read through GLOB_2 / GLOB - confirm against the full source).
; PTR_FIELD is defined outside this excerpt.
519 dicPos_Spec PTR_FIELD
; processedPos_Spec: one uninitialised dword.
524 processedPos_Spec
dd ?
; CLzmaDec_Asm_Loc: structure addressed through LOC_0 / LOC. Only its
; dicPos_Spec field is visible in this excerpt.
535 CLzmaDec_Asm_Loc
struct
554 dicPos_Spec PTR_FIELD
559 CLzmaDec_Asm_Loc
ends
; Field-access prefixes; each is followed by a field name at the point of use.
;   GLOB_2 : CLzmaDec_Asm field addressed through sym_R
;   GLOB   : CLzmaDec_Asm field addressed through r1
;   LOC_0  : CLzmaDec_Asm_Loc field addressed through r0
;   LOC    : CLzmaDec_Asm_Loc field addressed through RSP
562 GLOB_2
equ [sym_R
].CLzmaDec_Asm.
563 GLOB
equ [r1
].CLzmaDec_Asm.
564 LOC_0
equ [r0
].CLzmaDec_Asm_Loc.
565 LOC
equ [RSP
].CLzmaDec_Asm_Loc.
; RESTORE_VAR name
; The macro body is not visible in this excerpt; it is expanded with
; (remainLen) near the end of LzmaDec_DecodeReal_3.
574 RESTORE_VAR
macro name
; IsMatchBranch_Pre reg
; Visible body:
;   pbPos         = (LOC pbMask & processedPos) << (kLenNumLowBits + 1 + PSHIFT)
;   probs_state_R = probs + state_R
; The reg parameter is not referenced in the visible lines.
581 IsMatchBranch_Pre
macro reg
582 ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
583 mov pbPos
, LOC pbMask
584 and pbPos
, processedPos
585 shl pbPos
, (kLenNumLowBits
+ 1 + PSHIFT
)
586 lea probs_state_R
, [probs
+ state_R
]
; IsMatchBranch reg
; Visible body: IF_BIT_1 on the IsMatch slot selected by probs_state_R and
; pbPos_R, with IsMatch_label as the toLabel argument. The line before it
; is not visible; the reg parameter is not referenced in the visible lines.
590 IsMatchBranch
macro reg
592 IF_BIT_1 probs_state_R
, pbPos_R
, IsMatch
, IsMatch_label
; CheckLimits reg
; Visible body: compares buf with LOC bufLimit and dicPos with LOC limit.
; The branches taken on those comparisons are not visible in this excerpt;
; the reg parameter is not referenced in the visible lines.
596 CheckLimits
macro reg
597 cmp buf
, LOC bufLimit
599 cmp dicPos
, LOC limit
605 ; RSP is (16x + 8) bytes aligned in WIN64-x64
606 ; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
; Names for the three parameters of LzmaDec_DecodeReal_3, in order:
; REG_PARAM_0..REG_PARAM_2 are defined outside this excerpt.
608 PARAM_lzma
equ REG_PARAM_0
609 PARAM_limit
equ REG_PARAM_1
610 PARAM_bufLimit
equ REG_PARAM_2
; LzmaDec_DecodeReal_3 - entry and state setup (visible lines only; several
; original lines between these are not present in this excerpt).
; MY_PROC is given the procedure name and the value 3; MY_PROC and
; MY_PUSH_PRESERVED_REGS are defined outside this excerpt.
613 MY_PROC LzmaDec_DecodeReal_3
, 3
614 MY_PUSH_PRESERVED_REGS
; r0 = RSP - SIZEOF CLzmaDec_Asm_Loc: base of the local-state structure
; addressed through LOC_0.
616 lea r0
, [RSP
- (SIZEOF CLzmaDec_Asm_Loc
)]
; Save r5 in Old_RSP (RSP is reloaded from this slot at exit) and record
; the parameters in the local structure.
620 mov LOC_0 Old_RSP
, r5
621 mov LOC_0 lzmaPtr
, PARAM_lzma
623 mov LOC_0 remainLen
, 0 ; remainLen must be ZERO
625 mov LOC_0 bufLimit
, PARAM_bufLimit
626 mov sym_R
, PARAM_lzma
; CLzmaDec_Asm pointer for GLOB_2
627 mov dic
, GLOB_2 dic_Spec
629 mov LOC_0 limit
, PARAM_limit
; Copy dicPos and dic from the decoder structure into registers and locals.
636 mov dicPos
, GLOB_2 dicPos_Spec
638 mov LOC_0 dicPos_Spec
, dicPos
639 mov LOC_0 dic_Spec
, dic
647 ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
648 ; unsigned lc = p->prop.lc;
649 ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
664 ; mov probs, GLOB_2 probs_Spec
665 ; add probs, kStartOffset SHL PSHIFT
; probs is taken from the probs_1664 field and cached in LOC probs_Spec.
666 mov probs
, GLOB_2 probs_1664
667 mov LOC_0 probs_Spec
, probs
; Cache dicBufSize and checkDicSize in the local structure.
669 mov t0_R
, GLOB_2 dicBufSize
670 mov LOC_0 dicBufSize
, t0_R
672 mov x1
, GLOB_2 checkDicSize
673 mov LOC_0 checkDicSize
, x1
; Load the remaining decoder state into registers.
675 mov processedPos
, GLOB_2 processedPos_Spec
677 mov state
, GLOB_2 state_Spec
680 mov buf
, GLOB_2 buf_Spec
681 mov range
, GLOB_2 range_Spec
682 mov cod
, GLOB_2 code_Spec
; kBitModelTotal_reg holds the constant kBitModelTotal from here on.
683 mov kBitModelTotal_reg
, kBitModelTotal
686 ; if (processedPos != 0 || checkDicSize != 0)
; sym = zero-extended byte at [t0_R - 1]; how t0_R is computed is not
; visible in this excerpt.
693 movzx sym
, byte ptr[t0_R
- 1]
; Compare state with kNumLitStates * PMULT (the branch is not visible here).
699 cmp state
, kNumLitStates
* PMULT
706 ; ---------- LITERAL ----------
; Literal path (visible lines only): x1 <- [probs + 1 * PMULT].
715 PLOAD x1
, probs
+ 1 * PMULT
738 ; mov dic, LOC dic_Spec
; Reload probs from the cached LOC probs_Spec and store sym_L at [dicPos].
739 mov probs
, LOC probs_Spec
741 mov byte ptr[dicPos
], sym_L
; Test the IsMatch slot without updating it; lit_start is the toLabel argument.
746 IF_BIT_0_NOUP probs_state_R
, pbPos_R
, IsMatch
, lit_start
750 ; ---------- MATCHES ----------
; Match path (visible lines only): apply UPDATE_1 to the IsMatch slot, then
; IF_BIT_1 on the IsRep slot with IsRep_label as the toLabel argument.
753 UPDATE_1 probs_state_R
, pbPos_R
, IsMatch
754 IF_BIT_1 probs_state_R
, 0, IsRep
, IsRep_label
; probs += LenCoder * PMULT; state += kNumStates * PMULT.
756 add probs
, LenCoder
* PMULT
757 add state
, kNumStates
* PMULT
759 ; ---------- LEN DECODE ----------
; Length decode (visible lines only).
; First choice: len_temp = 8 - 1 - kMatchMinLen, test [probs] with
; len_mid_0 as the toLabel argument.
761 mov len_temp
, 8 - 1 - kMatchMinLen
762 IF_BIT_0_NOUP probs
, 0, 0, len_mid_0
; Second choice: advance probs by 1 << (kLenNumLowBits + PSHIFT),
; len_temp = -1 - kMatchMinLen, test again with the same toLabel.
764 add probs
, (1 SHL (kLenNumLowBits
+ PSHIFT
))
765 mov len_temp
, -1 - kMatchMinLen
766 IF_BIT_0_NOUP probs
, 0, 0, len_mid_0
; Otherwise move probs to LenHigh (undoing the previous advance) and load
; x1 <- [probs + 1 * PMULT].
768 add probs
, LenHigh
* PMULT
- (1 SHL (kLenNumLowBits
+ PSHIFT
))
770 PLOAD x1
, probs
+ 1 * PMULT
; len_temp = (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen.
779 mov len_temp
, (kLenNumHighSymbols
- kLenNumLowSymbols
* 2) - 1 - kMatchMinLen
; Reload probs from LOC probs_Spec and compare state with
; kNumStates * PMULT (the branch is not visible here).
790 mov probs
, LOC probs_Spec
791 cmp state
, kNumStates
* PMULT
795 ; ---------- DECODE DISTANCE ----------
796 ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
; Distance decode (visible lines only).
; t0 = 3 + kMatchMinLen and sym is compared with the same value; the
; instruction that uses the comparison is not visible in this excerpt.
798 mov t0
, 3 + kMatchMinLen
799 cmp sym
, 3 + kMatchMinLen
; probs += PosSlot * PMULT - (kMatchMinLen << (kNumPosSlotBits + PSHIFT));
; t0 <<= (kNumPosSlotBits + PSHIFT).
801 add probs
, PosSlot
* PMULT
- (kMatchMinLen
SHL (kNumPosSlotBits
+ PSHIFT
))
802 shl t0
, (kNumPosSlotBits
+ PSHIFT
)
806 ; mov LOC remainLen, sym
811 PLOAD x1
, probs
+ 1 * PMULT
; Reload probs and compare x1 with 32 + kEndPosModelIndex / 2 (the branch
; is not visible here).
834 mov probs
, LOC probs_Spec
835 cmp x1
, 32 + kEndPosModelIndex
/ 2
838 ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
839 sub x1
, (32 + 1 + kNumAlignBits
)
840 ; distance = (2 | (distance & 1));
; x2 <- [probs + 1 * PMULT]; sym <<= kNumAlignBits + 1; sym2_R = probs + 2 * PMULT.
842 PLOAD x2
, probs
+ 1 * PMULT
843 shl sym
, kNumAlignBits
+ 1
844 lea sym2_R
, [probs
+ 2 * PMULT
]
847 ; lea t1, [sym_R + (1 SHL kNumAlignBits)]
848 ; cmp range, kTopValue
851 ; ---------- DIRECT DISTANCE ----------
; Direct-distance bits and post-match state update (visible lines only).
; sym = r2 + sym_R * 2 + 1.
864 lea sym
, dword ptr [r2
+ sym_R
* 2 + 1]
; t1 = sym_R + (1 << kNumAlignBits).
873 lea t1
, [sym_R
+ (1 SHL kNumAlignBits
)]
; Jump to direct_loop on above-or-equal; the compare that sets the flags is
; not visible in this excerpt.
875 jae near ptr direct_loop
876 ; we align for 32 here with "near ptr" command above
883 ; distance <<= kNumAlignBits;
891 ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
; t0 = LOC checkDicSize, replaced by processedPos when ZF is set (the
; flag-setting instruction is not visible in this excerpt).
893 mov t0
, LOC checkDicSize
895 cmove t0
, processedPos
902 ; rep0 = distance + 1;
909 ; mov sym, LOC remainLen
915 ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
; The compare is done first; the two candidate values are then loaded into
; state and t0 (mov does not change the flags). The conditional move that
; selects between them is not visible in this excerpt.
916 cmp state
, (kNumStates
+ kNumLitStates
) * PMULT
917 mov state
, kNumLitStates
* PMULT
918 mov t0
, (kNumLitStates
+ 3) * PMULT
922 ; ---------- COPY MATCH ----------
925 ; len += kMatchMinLen;
926 ; add sym, kMatchMinLen
928 ; if ((rem = limit - dicPos) == 0)
930 ; p->dicPos = dicPos;
931 ; return SZ_ERROR_DATA;
937 ; curLen = ((rem < len) ? (unsigned)rem : len);
939 ; cmovae cnt_R, sym_R ; 64-bit
; Copy match (visible lines only).
; cnt = sym on above-or-equal; the compare that sets the flags is not
; visible in this excerpt.
940 cmovae cnt
, sym
; 32-bit
942 mov dic
, LOC dic_Spec
947 ; processedPos += curLen;
948 add processedPos
, cnt
; Save sym in LOC remainLen.
951 mov LOC remainLen
, sym
955 ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
959 mov r1
, LOC dicBufSize
965 ; if (curLen <= dicBufSize - pos)
967 ; ---------- COPY MATCH FAST ----------
968 ; Byte *dest = dic + dicPos;
970 ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
974 ; const Byte *lim = dest + curLen;
; sym = zero-extended byte at [t0_R].
976 movzx sym
, byte ptr[t0_R
]
979 ; lea r1, [dicPos - 1]
; Store sym_L at [cnt_R + dicPos] and load the next byte from
; [cnt_R + t0_R] (the loop control around these lines is not visible here).
994 mov byte ptr[cnt_R
* 1 + dicPos
], sym_L
995 movzx sym
, byte ptr[cnt_R
* 1 + t0_R
]
; Store sym_L at [dicPos].
1001 mov byte ptr[dicPos
], sym_L
; Test the IsMatch slot without updating it; IsMatch_label is the toLabel argument.
1007 IF_BIT_1_NOUP probs_state_R
, pbPos_R
, IsMatch
, IsMatch_label
1011 ; ---------- LITERAL MATCHED ----------
; Literal after a match (visible lines only).
; Expand LIT_PROBS with LOC lpMask as its argument.
1013 LIT_PROBS LOC lpMask
1015 ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1017 ; mov dic, LOC dic_Spec
; Save dicPos in LOC dicPos_Spec; it is reloaded from there further down.
1018 mov LOC dicPos_Spec
, dicPos
1020 ; state -= (state < 10) ? 3 : 6;
; t0 = state_R - 6 * PMULT (lea leaves the flags untouched), state -= 3 * PMULT,
; then state is compared with 7 * PMULT. The conditional move that selects
; between state and t0 is not visible in this excerpt.
1021 lea t0
, [state_R
- 6 * PMULT
]
1022 sub state
, 3 * PMULT
1023 cmp state
, 7 * PMULT
; dicPos += LOC dicBufSize (the preceding instructions are not visible here).
1029 add dicPos
, LOC dicBufSize
; t0_R = LOC dicBufSize when the carry flag is set (the flag-setting
; instruction is not visible in this excerpt).
1034 cmovb t0_R
, LOC dicBufSize
; match = zero-extended byte at [dic + dicPos].
1037 movzx match
, byte ptr[dic
+ dicPos
* 1]
; Variant assembled when _LZMA_SIZE_OPT is defined (the else/endif lines are
; not visible in this excerpt): offs = 256 * PMULT, match <<= (PSHIFT + 1).
1039 ifdef _LZMA_SIZE_OPT
1041 mov offs
, 256 * PMULT
1042 shl match
, (PSHIFT
+ 1)
; Reload probs and dicPos from the local structure and store sym_L at [dicPos].
1065 mov probs
, LOC probs_Spec
1067 ; mov dic, LOC dic_Spec
1068 mov dicPos
, LOC dicPos_Spec
1069 mov byte ptr[dicPos
], sym_L
; Test the IsMatch slot without updating it; IsMatch_label is the toLabel argument.
1074 IF_BIT_1_NOUP probs_state_R
, pbPos_R
, IsMatch
, IsMatch_label
; lpMask_reg = LOC lpMask; state -= 3 * PMULT.
1076 mov lpMask_reg
, LOC lpMask
1077 sub state
, 3 * PMULT
1082 ; ---------- REP 0 LITERAL ----------
; Rep0 single-byte literal (visible lines only).
; Apply UPDATE_0 to the IsRep0Long slot.
1085 UPDATE_0 probs_state_R
, pbPos_R
, IsRep0Long
1087 ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
; dic = LOC dic_Spec; probBranch = LOC rep0.
1088 mov dic
, LOC dic_Spec
1090 mov probBranch
, LOC rep0
; probs -= RepLenCoder * PMULT.
1093 sub probs
, RepLenCoder
* PMULT
1095 ; state = state < kNumLitStates ? 9 : 11;
; t0_R -= probBranch_R, then t0_R += LOC dicBufSize; how t0_R is initialised
; and whether the add is conditional is not visible in this excerpt.
1099 sub t0_R
, probBranch_R
1101 add t0_R
, LOC dicBufSize
; sym = zero-extended byte at [dic + t0_R].
1103 movzx sym
, byte ptr[dic
+ t0_R
* 1]
; Rep-match selection (visible lines only).
; Apply UPDATE_1 to the IsRep slot.
1109 UPDATE_1 probs_state_R
, 0, IsRep
1111 ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1112 ; So we don't check it here.
1114 ; mov t0, processedPos
1115 ; or t0, LOC checkDicSize
1118 ; state = state < kNumLitStates ? 8 : 11;
; The compare comes first; state is then preset to 8 * PMULT and replaced
; by 11 * PMULT (held in probBranch) on above-or-equal. mov does not change
; the flags, so cmovae still sees the result of the cmp.
1119 cmp state
, kNumLitStates
* PMULT
1120 mov state
, 8 * PMULT
1121 mov probBranch
, 11 * PMULT
1122 cmovae state
, probBranch
1124 ; prob = probs + RepLenCoder;
1125 add probs
, RepLenCoder
* PMULT
; Chain of tests on IsRepG0, IsRep0Long, IsRepG1 and IsRepG2; each names
; its own toLabel. The labels themselves and the code between the tests
; are not visible in this excerpt.
1127 IF_BIT_1 probs_state_R
, 0, IsRepG0
, IsRepG0_label
1128 IF_BIT_0_NOUP probs_state_R
, pbPos_R
, IsRep0Long
, IsRep0Short_label
1129 UPDATE_1 probs_state_R
, pbPos_R
, IsRep0Long
1134 UPDATE_1 probs_state_R
, 0, IsRepG0
1139 IF_BIT_1 probs_state_R
, 0, IsRepG1
, IsRepG1_label
1145 UPDATE_1 probs_state_R
, 0, IsRepG1
1149 IF_BIT_1 probs_state_R
, 0, IsRepG2
, IsRepG2_label
1155 UPDATE_1 probs_state_R
, 0, IsRepG2
1163 ; ---------- SPEC SHORT DISTANCE ----------
; Short-distance (SpecPos) decoding (visible lines only).
; sym_R = probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT.
1171 lea sym_R
, [probs
+ sym_R
* PMULT
+ SpecPos
* PMULT
+ 1 * PMULT
]
; sym2 = PMULT, used as the step per the original trailing comment.
1172 mov sym2
, PMULT
; step
; Reload probs from LOC probs_Spec and subtract SpecPos * PMULT from sym.
1179 mov probs
, LOC probs_Spec
1181 sub sym
, SpecPos
* PMULT
1188 ; ---------- COPY MATCH CROSS ----------
1191 ; r1 - len to dicBufSize
1192 ; cnt_R - total copy len
; Copy match crossing the dictionary end (visible lines only; the loop
; control and pointer adjustments between these lines are not shown).
; t1_R = t0_R, described as srcPos by the original trailing comment.
1194 mov t1_R
, t0_R
; srcPos
1196 mov r1
, LOC dicBufSize
;
; sym = zero-extended byte at [t1_R + t0_R]; store sym_L at [cnt_R + dicPos].
1199 movzx sym
, byte ptr[t1_R
* 1 + t0_R
]
1201 mov byte ptr[cnt_R
* 1 + dicPos
], sym_L
; sym = zero-extended byte at [t0_R].
1206 movzx sym
, byte ptr[t0_R
]
; Exit paths and state write-back (visible lines only; labels and branches
; between these lines are not shown in this excerpt).
; Store len_temp in LOC remainLen.
1214 mov LOC remainLen
, len_temp
; Compare sym with 0FFFFFFFFh (-1); the branch is not visible here.
1220 cmp sym
, 0FFFFFFFFh
; -1
; LOC remainLen = kMatchSpecLenStart; state -= kNumStates * PMULT.
1223 mov LOC remainLen
, kMatchSpecLenStart
1224 sub state
, kNumStates
* PMULT
; Subtract LOC dic_Spec from dicPos, then write the decoder state back to
; the CLzmaDec_Asm structure through GLOB (addressed via r1; the load of
; r1 is not visible in this excerpt).
1234 sub dicPos
, LOC dic_Spec
1235 mov GLOB dicPos_Spec
, dicPos
1236 mov GLOB buf_Spec
, buf
1237 mov GLOB range_Spec
, range
1238 mov GLOB code_Spec
, cod
1240 mov GLOB state_Spec
, state
1241 mov GLOB processedPos_Spec
, processedPos
; Expand RESTORE_VAR for remainLen (macro body not visible in this excerpt).
1243 RESTORE_VAR
(remainLen
)
; Reload RSP from the Old_RSP slot written at entry, then expand
; MY_POP_PRESERVED_REGS (defined outside this excerpt).
1251 mov RSP
, LOC Old_RSP
1253 MY_POP_PRESERVED_REGS
; Closes the _TEXT$LZMADECOPT segment opened at the top of the file.
1256 _TEXT$LZMADECOPT
ENDS