#ifdef USE_ROLL_ASM /* { */

#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */

#ifdef __APPLE__ /* Mach-O prepends an underscore to C symbol names. */
#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
#endif
.intel_syntax noprefix

        .globl get_checksum1_avx2_asm
# rdi = *buf, esi = len, edx = i, rcx = *ps1, r8 = *ps2
get_checksum1_avx2_asm:
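        # A rough scalar equivalent of what this routine computes (a sketch,
        # assuming rsync's usual rolling-checksum definition, with buf treated
        # as signed char):
        #
        #     for (; i < len; i++) {
        #         s1 += buf[i] + CHAR_OFFSET;
        #         s2 += s1;
        #     }
        #
        # The hot loop below does this 64 bytes at a time.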
        vmovd   xmm6, [rcx]             # load *ps1
        lea     eax, [rsi-128]          # at least 128 bytes to process?
        cmp     edx, eax
        jg      .exit                   # too short: skip the vector loop.
        lea     rax, .mul_T2[rip]       # the T2 multiplication constants table
        vmovntdqa ymm7, [rax]           # load the T2 multiplication constants
        vmovntdqa ymm12, [rax+32]       # from memory.
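        # ymm7/ymm12 now hold per-position byte multipliers (from .mul_T2),
        # used below to weight each byte's contribution to s2 within a
        # 64-byte block.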
        vpcmpeqd ymm15, ymm15, ymm15    # set all elements to -1.

#if CHAR_OFFSET != 0
        mov     eax, 32*CHAR_OFFSET
        vmovd   xmm10, eax
        vpbroadcastd ymm10, xmm10
        mov     eax, 528*CHAR_OFFSET
        vmovd   xmm13, eax
        vpbroadcastd ymm13, xmm13
#endif
        vpabsb  ymm15, ymm15            # set all byte size elements to 1.
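        # ymm15 now holds 1 in every byte.  vpmaddubsw treats its first source
        # as unsigned bytes and its second as signed bytes, so multiplying by
        # this all-ones vector simply sums adjacent pairs of (signed) input
        # bytes into 16-bit words: the s1 partial sums.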
        add     rdi, rdx                # start at buf[i].
        vmovdqu ymm2, [rdi]             # preload the first 64 bytes.
        vmovdqu ymm3, [rdi+32]
        and     esi, ~63                # only needed during final reduction,
                                        # done here to avoid a longer nop for
                                        # alignment below.
        shr     rsi, 6                  # loop count in 64-byte blocks
                                        # (64-bit form: longer opcode for alignment).
        add     rdi, 64
        vpxor   xmm1, xmm1, xmm1        # reset both partial sums accumulators.
        vpxor   xmm4, xmm4, xmm4
        .p2align 4                      # should fit into the LSD (loop stream detector) allocation queue.
.loop:
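        # Per 64-byte iteration, roughly: ymm6 accumulates the s1 partial
        # sums, ymm1 accumulates the position-weighted (T2) s2 partial sums,
        # and ymm4 accumulates a copy of ymm6 each pass so the final reduction
        # can fold 64 * "s1 so far" into s2 for every later block.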
        vpmaddubsw ymm0, ymm15, ymm2    # s1 partial sums
        vpmaddubsw ymm5, ymm15, ymm3
        vmovdqu ymm8, [rdi]             # preload the next
        vmovdqu ymm9, [rdi+32]          # 64 bytes.
        add     rdi, 64
        vpaddd  ymm4, ymm4, ymm6        # accumulate s1-so-far for the final s2 reduction.
        vpaddw  ymm5, ymm5, ymm0
        vpsrld  ymm0, ymm5, 16          # fold the word sums pairwise
        vpaddw  ymm5, ymm0, ymm5        # into the low half of each dword
        vpaddd  ymm6, ymm5, ymm6        # and accumulate.
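        # The fold above leaves a stale word sum in the upper 16 bits of each
        # dword lane; that appears to be harmless because the final rsync
        # checksum keeps only the low 16 bits of s1 (and of s2), so anything
        # that lands at bit 16 or above drops out.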
        vpmaddubsw ymm2, ymm7, ymm2     # s2 partial sums
        vpmaddubsw ymm3, ymm12, ymm3
        prefetcht0 [rdi+384]            # prefetch 6 cachelines ahead.
        vpaddw  ymm3, ymm2, ymm3
        vpsrld  ymm2, ymm3, 16          # fold the word sums into dwords
        vpaddd  ymm3, ymm2, ymm3
        vpaddd  ymm1, ymm1, ymm3        # and accumulate.

#if CHAR_OFFSET != 0
        vpaddd  ymm6, ymm10, ymm6       #  32*CHAR_OFFSET
        vpaddd  ymm1, ymm13, ymm1       # 528*CHAR_OFFSET
#endif
        vmovdqa ymm2, ymm8              # move the next 64 bytes
        vmovdqa ymm3, ymm9              # into the right registers.
        sub     esi, 1
        jnz     .loop
        # now we reduce the partial sums.
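        # Roughly: combine the block-carry term 64*ymm4 with the intra-block
        # sums in ymm1 to form the s2 partials, then horizontally add the
        # dword lanes of the s1 (ymm6) and s2 vectors, folding 256 bits down
        # to 128 and finally to a single dword each.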
        vpslld  ymm3, ymm4, 6           # 64 * (running s1 copies) for s2
        vpsrldq ymm2, ymm6, 4

        vpaddd  ymm0, ymm3, ymm1        # ymm0 = s2 partials
        vpaddd  ymm6, ymm2, ymm6
        vpsrlq  ymm3, ymm0, 32

        vpsrldq ymm2, ymm6, 8
        vpaddd  ymm0, ymm3, ymm0
        vpsrldq ymm3, ymm0, 8
        vpaddd  ymm6, ymm2, ymm6
        vpaddd  ymm0, ymm3, ymm0
        vextracti128 xmm2, ymm6, 0x1
        vextracti128 xmm1, ymm0, 0x1
        vpaddd  xmm6, xmm2, xmm6
        vmovd   [rcx], xmm6             # save s1 back to *ps1.
        vpaddd  xmm1, xmm1, xmm0
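        # xmm1 now holds the folded s2 partial sums, ready to be folded into
        # *ps2 (via r8) by the rest of the routine.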
#endif /* } USE_ROLL_ASM */