modify SAVE_XMM for potential 64bit use
[libvpx.git] / vp8 / common / x86 / idctllm_sse2.asm
blob34a7e18aea727ec7175f57de50d3832d4a13df2a
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff           - 0
;   short *dequant          - 1
;   unsigned char *pre      - 2
;   unsigned char *dst      - 3
;   int dst_stride          - 4
;   int blk_stride          - 5
; )
;
; DC-only reconstruction of an 8-pixel-wide, 4-row area:
;   * the words at qcoeff+0 and qcoeff+32 (presumably the DC terms of two
;     adjacent 4x4 blocks) are each multiplied by dequant[0], and 4 bytes
;     at both qcoeff locations are then zeroed;
;   * the first product is broadcast to word lanes 0-3 and the second to
;     lanes 4-7, and every lane becomes (x + 4) >> 3;
;   * that vector is added to four 8-byte rows of pre (blk_stride apart),
;     saturated to unsigned bytes and written as four 8-byte rows of dst
;     (dst_stride apart).
; Every pre row is loaded before the first dst store.
global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rdx, arg(1)                 ; dequant

    ; lane 0 <- word at qcoeff+0, lane 4 <- word at qcoeff+32
    movd        xmm4, [rax]
    pinsrw      xmm4, [rax+32], 4

    ; dequant[0] into lanes 0 and 4
    movd        xmm5, [rdx]
    pinsrw      xmm5, [rdx], 4

    pmullw      xmm4, xmm5

    ; xmm5 stays zero from here on: used to clear the coefficients and
    ; as the zero operand for unpacking / packing
    pxor        xmm5, xmm5
    movd        [rax], xmm5
    movd        [rax+32], xmm5

    ; broadcast lane 0 over lanes 0-3 and lane 4 over lanes 4-7
    pshuflw     xmm4, xmm4, 00000000b
    pshufhw     xmm4, xmm4, 00000000b

    ; (x + 4) >> 3
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    mov         rax, arg(2)                 ; pre
    movsxd      rcx, dword ptr arg(5)       ; blk_stride

    ; fetch the four predictor rows
    movq        xmm0, [rax]
    movq        xmm1, [rax+rcx]
    movq        xmm2, [rax+2*rcx]
    lea         rcx, [3*rcx]
    movq        xmm3, [rax+rcx]

    mov         rax, arg(3)                 ; dst
    movsxd      rdx, dword ptr arg(4)       ; dst_stride

    ; row 0: widen bytes to words, add the DC term, saturate back to bytes
    punpcklbw   xmm0, xmm5
    paddw       xmm0, xmm4
    packuswb    xmm0, xmm5

    ; row 1
    punpcklbw   xmm1, xmm5
    paddw       xmm1, xmm4
    packuswb    xmm1, xmm5

    ; row 2
    punpcklbw   xmm2, xmm5
    paddw       xmm2, xmm4
    packuswb    xmm2, xmm5

    ; row 3
    punpcklbw   xmm3, xmm5
    paddw       xmm3, xmm4
    packuswb    xmm3, xmm5

    ; write the four reconstructed rows
    movq        [rax], xmm0
    movq        [rax + rdx], xmm1

    lea         rax, [rax + 2*rdx]

    movq        [rax], xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void idct_dequant_full_2x_sse2
; (
;   qcoeff      - 0     (64 bytes read with aligned loads, then zeroed)
;   dequant     - 1     (32 bytes read, applied to both halves of qcoeff)
;   pre         - 2     (four 8-byte rows, blk_stride apart)
;   dst         - 3     (four 8-byte rows, dst_stride apart)
;   dst_stride  - 4
;   blk_stride  - 5
; )
; Argument list inferred from the arg() uses below; the C types are
; presumably those of idct_dequant_0_2x_sse2 -- confirm against the
; prototype.
;
; Dequantizes two 16-coefficient blocks, runs a two-pass 4x4 inverse
; transform on both at once (block 0 in word lanes 0-3, block 1 in lanes
; 4-7), adds the result to the predictor rows, and stores the saturated
; bytes to dst.
global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only arg(0)..arg(5) are read below, yet 7 is passed
    ; here -- confirm the count against SHADOW_ARGS_TO_STACK's definition.
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)             ; qcoeff
    mov         rsi, arg(2)             ; pre
    mov         rdi, arg(3)             ; dst
    movsxd      rcx, dword ptr arg(5)   ; blk_stride

    ; Zero out xmm7, used just below to clear the coefficient buffer
    pxor        xmm7, xmm7

    mov         rdx, arg(1)             ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0, [rax]             ; block 0, rows 0-1
    movdqa      xmm2, [rax+16]          ; block 0, rows 2-3
    movdqa      xmm1, [rax+32]          ; block 1, rows 0-1
    movdqa      xmm3, [rax+48]          ; block 1, rows 2-3

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same 16 dequant words for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; repack so block 0 row x and block 1 row x are together:
    ; afterwards xmmN = row N, block 0 in lanes 0-3, block 1 in lanes 4-7
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; first pass
    psubw       xmm0, xmm2              ; b1 = 0-2
    paddw       xmm2, xmm2              ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0              ; a1 = 0+2

    ; pmulhw is signed and x_s1sqr2 is negative as a signed word, so the
    ; product is ip1 * (k - 1); adding ip1 back yields ip1 * k
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5              ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5              ; d1
    movdqa      xmm6, xmm2              ; a1

    ; Output numbering follows the transpose below, which consumes
    ; xmm2, xmm0, xmm4, xmm6 as rows 0, 1, 2, 3 in that order.
    movdqa      xmm4, xmm0              ; b1
    paddw       xmm2, xmm3              ; row 0 = a1 + d1

    paddw       xmm4, xmm7              ; row 2 = b1 + c1
    psubw       xmm0, xmm7              ; row 1 = b1 - c1

    psubw       xmm6, xmm3              ; row 3 = a1 - d1

    ; transpose for the second pass
    ; (labels are BEE: B = block, EE = element index; highest lane first)
    movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003

    ; swap the middle dwords so each block's four words are contiguous
    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2              ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0              ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5              ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5              ; d1

    ; the rounding bias goes into a1 and b1 once, so all four outputs
    ; below carry the +4 ahead of the >> 3
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2              ; a1

    movdqa      xmm4, xmm0              ; b1
    paddw       xmm2, xmm3              ; out 0 = a1 + d1

    paddw       xmm4, xmm7              ; out 2 = b1 + c1
    psubw       xmm0, xmm7              ; out 1 = b1 - c1

    psubw       xmm6, xmm3              ; out 3 = a1 - d1
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003

    ; xmm0..xmm3 = output rows 0..3 (block 0 low half, block 1 high half)
    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 was used as scratch above; re-zero it for unpacking / packing
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movq        xmm4, [rsi]
    movq        xmm5, [rsi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rsi+2*rcx]
    lea         rcx, [3*rcx]
    movq        xmm5, [rsi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx, dword ptr arg(4)   ; dst_stride

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ; return explicitly; without it execution would run on into whatever
    ; follows this function
    ret

;void idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff           - 0
;   short *dequant          - 1
;   unsigned char *pre      - 2
;   unsigned char *dst      - 3
;   int dst_stride          - 4
;   short *dc               - 5
; )
;
; DC-only reconstruction of an 8-pixel-wide, 4-row area using two DC
; words taken from dc[0] and dc[1]:
;   * dc[0] is broadcast to word lanes 0-3 and dc[1] to lanes 4-7, then
;     every lane becomes (x + 4) >> 3;
;   * that vector is added to four 8-byte predictor rows read from pre at
;     a fixed 16-byte spacing, saturated to unsigned bytes and written as
;     four 8-byte rows of dst (dst_stride apart).
; qcoeff and dequant are not read by this function.
global sym(idct_dequant_dc_0_2x_sse2)
sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): the signature above lists 6 arguments, yet 7 is passed
    ; here -- confirm the count against SHADOW_ARGS_TO_STACK's definition.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    mov         rsi, arg(2)             ; pre
    mov         rdi, arg(3)             ; dst
    mov         rdx, arg(5)             ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5, xmm5

    ; load 2 dc words at once (2 x 16 bits = one doubleword)
    movd        xmm4, [rdx]

    ; Load up predict blocks
    movq        xmm0, [rsi]
    movq        xmm1, [rsi+16]
    movq        xmm2, [rsi+32]
    movq        xmm3, [rsi+48]

    ; Duplicate and expand dc across:
    ; dc0 dc1 .. ..  ->  dc0 dc0 dc1 dc1  ->  dc0 x4, dc1 x4
    punpcklwd   xmm4, xmm4
    punpckldq   xmm4, xmm4

    ; Round and downshift: (dc + 4) >> 3
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx, dword ptr arg(4)   ; dst_stride

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ; return explicitly; without it execution would run on into whatever
    ; follows this function
    ret

;void idct_dequant_dc_full_2x_sse2
; (
;   qcoeff      - 0     (64 bytes read with aligned loads, then zeroed)
;   dequant     - 1     (32 bytes read, applied to both halves of qcoeff)
;   pre         - 2     (four 8-byte rows at a fixed 16-byte spacing)
;   dst         - 3     (four 8-byte rows, dst_stride apart)
;   dst_stride  - 4
;   dc          - 5     (two words: dc[0] for block 0, dc[1] for block 1)
; )
; Argument list inferred from the arg() uses below; the C types are
; presumably those of idct_dequant_dc_0_2x_sse2 -- confirm against the
; prototype.
;
; Dequantizes two 16-coefficient blocks, overwrites element 0 of each
; with the supplied dc words, runs a two-pass 4x4 inverse transform on
; both at once (block 0 in word lanes 0-3, block 1 in lanes 4-7), adds
; the result to the predictor rows, and stores the saturated bytes to dst.
global sym(idct_dequant_dc_full_2x_sse2)
sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only arg(0)..arg(5) are read below, yet 7 is passed
    ; here -- confirm the count against SHADOW_ARGS_TO_STACK's definition.
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)             ; qcoeff
    mov         rsi, arg(2)             ; pre
    mov         rdi, arg(3)             ; dst

    ; Zero out xmm7, used just below to clear the coefficient buffer
    pxor        xmm7, xmm7

    mov         rdx, arg(1)             ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0, [rax]             ; block 0, rows 0-1
    movdqa      xmm2, [rax+16]          ; block 0, rows 2-3
    movdqa      xmm1, [rax+32]          ; block 1, rows 0-1
    movdqa      xmm3, [rax+48]          ; block 1, rows 2-3

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same 16 dequant words for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; DC component
    mov         rdx, arg(5)

    ; repack so block 0 row x and block 1 row x are together:
    ; afterwards xmmN = row N, block 0 in lanes 0-3, block 1 in lanes 4-7
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; insert DC component: element 0 of block 0 (lane 0) and of block 1
    ; (lane 4) are replaced by dc[0] and dc[1]
    pinsrw      xmm0, [rdx], 0
    pinsrw      xmm0, [rdx+2], 4

    ; first pass
    psubw       xmm0, xmm2              ; b1 = 0-2
    paddw       xmm2, xmm2              ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0              ; a1 = 0+2

    ; pmulhw is signed and x_s1sqr2 is negative as a signed word, so the
    ; product is ip1 * (k - 1); adding ip1 back yields ip1 * k
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5              ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5              ; d1
    movdqa      xmm6, xmm2              ; a1

    ; Output numbering follows the transpose below, which consumes
    ; xmm2, xmm0, xmm4, xmm6 as rows 0, 1, 2, 3 in that order.
    movdqa      xmm4, xmm0              ; b1
    paddw       xmm2, xmm3              ; row 0 = a1 + d1

    paddw       xmm4, xmm7              ; row 2 = b1 + c1
    psubw       xmm0, xmm7              ; row 1 = b1 - c1

    psubw       xmm6, xmm3              ; row 3 = a1 - d1

    ; transpose for the second pass
    ; (labels are BEE: B = block, EE = element index; highest lane first)
    movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003

    ; swap the middle dwords so each block's four words are contiguous
    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2              ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0              ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5              ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5              ; d1

    ; the rounding bias goes into a1 and b1 once, so all four outputs
    ; below carry the +4 ahead of the >> 3
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2              ; a1

    movdqa      xmm4, xmm0              ; b1
    paddw       xmm2, xmm3              ; out 0 = a1 + d1

    paddw       xmm4, xmm7              ; out 2 = b1 + c1
    psubw       xmm0, xmm7              ; out 1 = b1 - c1

    psubw       xmm6, xmm3              ; out 3 = a1 - d1
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003

    ; xmm0..xmm3 = output rows 0..3 (block 0 low half, block 1 high half)
    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 was used as scratch above; re-zero it for unpacking / packing
    pxor        xmm7, xmm7

    ; Load up predict blocks (rows at a fixed 16-byte spacing)
    movq        xmm4, [rsi]
    movq        xmm5, [rsi+16]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rsi+32]
    movq        xmm5, [rsi+48]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx, dword ptr arg(4)   ; dst_stride

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ; return explicitly; without it execution would run on into the
    ; read-only data that follows
    ret

; Constant tables for the routines above. Each table is eight identical
; words and 16-byte aligned, so it can serve directly as the memory
; operand of paddw / pmulhw.
SECTION_RODATA

; Rounding bias added before the final arithmetic shift right by 3.
align 16
fours:
    times 8 dw 0x0004

; 0x8A8C = 35468 ~= sqrt(2) * sin(pi/8) * 65536.
; As a signed word this is negative, so pmulhw produces x * (k - 1); the
; code adds x back afterwards to obtain x * k.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C

; 0x4E7B = 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536.
; The "less 1" is restored by the paddw that follows each pmulhw.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B