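; Provenance (from the gitweb blob page this listing was captured from):
;   commit subject shown in page header: Merge "documentation: minor cosmetics"
;   repository / path: libvpx.git / vp8 / common / x86 / idctllm_sse2.asm
;   blob: edee1578e2607ec0b1311665a68b504f6c620fb1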
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; DC-only reconstruction of two blocks processed side by side.
;   * The first coefficient of each block (word at qcoeff+0 and word at
;     qcoeff+32 bytes) is multiplied by dequant[0].
;   * The first 4 bytes of each block's coefficients are then zeroed.
;   * (dc + 4) >> 3 is added to four rows of 8 prediction bytes read from
;     pre (row pitch blk_stride): low 4 pixels get block 0's value, high 4
;     pixels get block 1's value.
;   * The sums are saturated to unsigned bytes and written as four 8-byte
;     rows to dst (row pitch dst_stride).
; Scratch registers: rax, rcx, rdx, xmm0-xmm5, xmm7.
; NOTE(review): xmm7 is callee-saved under the Microsoft x64 ABI and is not
; preserved here - confirm how x86_abi_support.asm / callers handle that.
global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

    mov         rdx, arg(1)                 ; dequant
    mov         rax, arg(0)                 ; qcoeff

    ; Zero out xmm7, for use unpacking
    pxor        xmm7, xmm7

    ; words 0-1 of block 0 / of dequant land in lanes 0-1
    movd        xmm4, [rax]
    movd        xmm5, [rdx]

    ; lane 4 <- DC of block 1 (32 bytes further on) and dequant[0] again
    pinsrw      xmm4, [rax+32], 4
    pinsrw      xmm5, [rdx], 4

    pmullw      xmm4, xmm5                  ; lane 0 / lane 4 = dequantized DCs

    ; clear coeffs (4 bytes at the start of each block)
    movd        [rax], xmm7
    movd        [rax+32], xmm7

    ; broadcast lane 0 over the low four words and lane 4 over the high
    ; four words (two shuffles stand in for a single pshufb)
    pshuflw     xmm4, xmm4, 00000000b
    pshufhw     xmm4, xmm4, 00000000b

    mov         rax, arg(2)                 ; pre
    paddw       xmm4, [GLOBAL(fours)]       ; round ...

    movsxd      rcx, dword ptr arg(5)       ; blk_stride
    psraw       xmm4, 3                     ; ... and scale: (dc + 4) >> 3

    ; four rows of 8 prediction bytes
    movq        xmm0, [rax]
    movq        xmm1, [rax+rcx]
    movq        xmm2, [rax+2*rcx]
    lea         rcx, [3*rcx]
    movq        xmm3, [rax+rcx]

    ; widen prediction bytes to words
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm1, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm3, xmm7

    mov         rax, arg(3)                 ; dst
    movsxd      rdx, dword ptr arg(4)       ; dst_stride

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out
    movq        [rax], xmm0
    movq        [rax + rdx], xmm1

    lea         rax, [rax + 2*rdx]

    movq        [rax], xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; Full dequantize + inverse transform + reconstruction of two blocks at once.
;   * Loads 64 bytes of coefficients (two blocks of 16 words) from qcoeff,
;     zeroes them in memory, and multiplies each block by the 16 words at
;     dequant.
;   * Runs the two-pass butterfly on both blocks in parallel (block 0 in the
;     low four words of each register, block 1 in the high four words),
;     with a transpose between and after the passes; the second pass
;     rounds with +4 and shifts right by 3.
;   * Adds four rows of 8 prediction bytes from pre (row pitch blk_stride),
;     saturates to unsigned bytes and writes four 8-byte rows to dst
;     (row pitch dst_stride).
; qcoeff and dequant are accessed with movdqa / SSE memory operands and so
; must be 16-byte aligned.
; Scratch registers: rax, rcx, rdx, xmm0-xmm7 (rsi/rdi are saved/restored).
; NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64 ABI and
; are not preserved here - confirm how x86_abi_support.asm / callers handle it.
; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although only arg(0)..arg(5)
; are read; kept as in the original.
global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rsi, arg(2)                 ; pre
    mov         rdi, arg(3)                 ; dst
    movsxd      rcx, dword ptr arg(5)       ; blk_stride

    ; Zero out xmm7, for use unpacking
    pxor        xmm7, xmm7

    mov         rdx, arg(1)                 ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0, [rax]                 ; block 0, coeffs 0-7
    movdqa      xmm2, [rax+16]              ; block 0, coeffs 8-15
    movdqa      xmm1, [rax+32]              ; block 1, coeffs 0-7
    movdqa      xmm3, [rax+48]              ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same dequant table for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; first pass
    ; pmulhw keeps the high 16 bits of a signed 16x16 product; adding the
    ; multiplicand back afterwards yields the product with the unsigned
    ; 0x8A8C factor, and turns the "less1" factor into (factor + 1).
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2                  ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ;0

    paddw       xmm4, xmm7                  ;1
    psubw       xmm0, xmm7                  ;2

    psubw       xmm6, xmm3                  ;3

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4

    paddw       xmm3, xmm5                  ; d1

    ; biasing a1 and b1 by 4 puts the rounding term into all four outputs
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ;0

    paddw       xmm4, xmm7                  ;1
    psubw       xmm0, xmm7                  ;2

    psubw       xmm6, xmm3                  ;3
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 was used as a temporary above; re-zero it for unpack/pack
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movq        xmm4, [rsi]
    movq        xmm5, [rsi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rsi+2*rcx]
    lea         rcx, [3*rcx]
    movq        xmm5, [rsi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx, dword ptr arg(4)       ; dst_stride

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   short *dc           - 5
; )
;
; DC-only reconstruction of two blocks whose DC values are supplied
; separately: the two consecutive words at dc are used directly, and
; qcoeff / dequant are never read or written by this routine.
;   * dc[0] is spread over the low four words and dc[1] over the high four
;     words, then rounded and scaled as (dc + 4) >> 3.
;   * The result is added to four rows of 8 prediction bytes read from pre
;     at a fixed 16-byte row pitch, saturated to unsigned bytes and written
;     as four 8-byte rows to dst (row pitch dst_stride).
; Scratch registers: rdx, xmm0-xmm4, xmm7 (rsi/rdi are saved/restored).
; NOTE(review): xmm7 is callee-saved under the Microsoft x64 ABI and is not
; preserved here - confirm how x86_abi_support.asm / callers handle that.
; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although the signature lists
; six arguments; kept as in the original.
global sym(idct_dequant_dc_0_2x_sse2)
sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    mov         rsi, arg(2)                 ; pre
    mov         rdi, arg(3)                 ; dst
    mov         rdx, arg(5)                 ; dc

    ; Zero out xmm7, for use unpacking
    pxor        xmm7, xmm7

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4, [rdx]

    ; Load up predict blocks
    movq        xmm0, [rsi]
    movq        xmm1, [rsi+16]
    movq        xmm2, [rsi+32]
    movq        xmm3, [rsi+48]

    ; Duplicate and expand dc across
    punpcklwd   xmm4, xmm4                  ; .. .. .. .. dc1 dc1 dc0 dc0
    punpckldq   xmm4, xmm4                  ; dc1 dc1 dc1 dc1 dc0 dc0 dc0 dc0

    ; Rounding to dequant and downshift
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm1, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm3, xmm7

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx, dword ptr arg(4)       ; dst_stride

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   short *dc           - 5
; )
;
; Full dequantize + inverse transform + reconstruction of two blocks at
; once, with each block's DC coefficient taken from dc instead of qcoeff.
;   * Loads 64 bytes of coefficients (two blocks of 16 words) from qcoeff,
;     zeroes them in memory, and multiplies each block by the 16 words at
;     dequant.
;   * Overwrites coefficient 0 of block 0 with dc[0] and coefficient 0 of
;     block 1 with dc[1] (inserted after dequantization, so the dc values
;     are not multiplied by dequant).
;   * Runs the two-pass butterfly on both blocks in parallel (block 0 in the
;     low four words of each register, block 1 in the high four words),
;     with a transpose between and after the passes; the second pass
;     rounds with +4 and shifts right by 3.
;   * Adds four rows of 8 prediction bytes read from pre at a fixed 16-byte
;     row pitch, saturates to unsigned bytes and writes four 8-byte rows to
;     dst (row pitch dst_stride).
; qcoeff and dequant are accessed with movdqa / SSE memory operands and so
; must be 16-byte aligned.
; Scratch registers: rax, rdx, xmm0-xmm7 (rsi/rdi are saved/restored).
; NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64 ABI and
; are not preserved here - confirm how x86_abi_support.asm / callers handle it.
; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although only arg(0)..arg(5)
; are read; kept as in the original.
global sym(idct_dequant_dc_full_2x_sse2)
sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rsi, arg(2)                 ; pre
    mov         rdi, arg(3)                 ; dst

    ; Zero out xmm7, for use unpacking
    pxor        xmm7, xmm7

    mov         rdx, arg(1)                 ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0, [rax]                 ; block 0, coeffs 0-7
    movdqa      xmm2, [rax+16]              ; block 0, coeffs 8-15
    movdqa      xmm1, [rax+32]              ; block 1, coeffs 0-7
    movdqa      xmm3, [rax+48]              ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same dequant table for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; DC component
    mov         rdx, arg(5)

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; insert DC component (lane 0 = block 0, lane 4 = block 1)
    pinsrw      xmm0, [rdx], 0
    pinsrw      xmm0, [rdx+2], 4

    ; first pass
    ; pmulhw keeps the high 16 bits of a signed 16x16 product; adding the
    ; multiplicand back afterwards yields the product with the unsigned
    ; 0x8A8C factor, and turns the "less1" factor into (factor + 1).
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2                  ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ;0

    paddw       xmm4, xmm7                  ;1
    psubw       xmm0, xmm7                  ;2

    psubw       xmm6, xmm3                  ;3

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4

    paddw       xmm3, xmm5                  ; d1

    ; biasing a1 and b1 by 4 puts the rounding term into all four outputs
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ;0

    paddw       xmm4, xmm7                  ;1
    psubw       xmm0, xmm7                  ;2

    psubw       xmm6, xmm3                  ;3
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 was used as a temporary above; re-zero it for unpack/pack
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movq        xmm4, [rsi]
    movq        xmm5, [rsi+16]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rsi+32]
    movq        xmm5, [rsi+48]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx, dword ptr arg(4)       ; dst_stride

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
; Each constant is replicated across all eight 16-bit lanes and 16-byte
; aligned so it can be used directly as an SSE2 memory operand.

; Rounding bias added before the arithmetic shift right by 3.
align 16
fours:
    times 8 dw 0x0004

; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536.
; As a signed word this is negative, so code using it with pmulhw adds the
; multiplicand back afterwards to obtain the intended unsigned product.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C

; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536.
; The "less 1" is restored by adding the multiplicand after pmulhw.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B