1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 ; Use of this source code is governed by a BSD-style license that can be
3 ; found in the LICENSE file.
5 %include "media/base/simd/media_export.asm"
6 %include "third_party/x86inc/x86inc.asm"
9 ; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM processors.
; Instruction sets this file may use: SSE, SSE2, SSE3, and SSSE3. SSE2 must be
; listed because the code relies on SSE2 integer instructions (movdqu, pshufd,
; pshuflw, paddw on XMM registers); SSSE3 covers pmaddubsw.
13 CPU SSE, SSE2, SSE3, SSSE3
16 ; XMM registers representing constants. We must not use these registers as
17 ; destination operands.
18 ; for (int i = 0; i < 16; i += 4) {
19 ; xmm7.b[i] = 25; xmm7.b[i+1] = 2; xmm7.b[i+2] = 66; xmm7.b[i+3] = 0;
20 ; xmm6.b[i] = 0; xmm6.b[i+1] = 127; xmm6.b[i+2] = 0; xmm6.b[i+3] = 0;
21 ; xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
22 ; xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
; Symbolic names for the XMM registers that hold the conversion constants.
; The visible code only ever reads them as source operands.
; Y coefficients, first half: per-pixel bytes {25, 2, 66, 0}.
25 %define XMM_CONST_Y0 xmm7
; Y coefficients, second half: per-pixel bytes {0, 127, 0, 0}.
26 %define XMM_CONST_Y1 xmm6
; U coefficients: per-pixel bytes {112, -74, -38, 0}.
27 %define XMM_CONST_U xmm5
; V coefficients: per-pixel bytes {-18, -94, 112, 0}.
28 %define XMM_CONST_V xmm4
; NOTE(review): the contents of xmm3 are not set in the visible code; the name
; and its use with paddw suggest 128 in every word -- confirm where it is
; initialized.
29 %define XMM_CONST_128 xmm3
32 ; LOAD_XMM %1 (xmm), %2 (imm32)
33 ; Loads an immediate value to an XMM register.
34 ; %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; Shuffle control 0 selects dword 0 for every destination lane, so %1.d[0] is
; replicated into %1.d[1], %1.d[2] and %1.d[3].
39 pshufd
%1, %1, 00000000B
43 ; UNPACKRGB %1 (xmm), %2 (imm8)
44 ; Unpacks one RGB pixel in the specified XMM register.
45 ; for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
47 ; for (int i = %2 - 1; i >= 0; --i) %1.b[i] = %1.b[i];
59 ; READ_ARGB %1 (xmm), %2 (imm)
60 ; Read the specified number of ARGB (or RGB) pixels from the source and store
61 ; them to the destination xmm register. If the input format is RGB, we read RGB
62 ; pixels and convert them to ARGB pixels. (For this case, the alpha values of
63 ; the output pixels become 0.)
69 ; Read ARGB pixels from the source. (This macro assumes the input buffer may
70 ; not be aligned to a 16-byte boundary.)
; ARGB input. All three loads use the address ARGBq + WIDTHq * 8.
; NOTE(review): the 4 * 2 scale presumably means 4 bytes per pixel and two
; pixels per WIDTHq unit -- confirm against the code that sets WIDTHq.
; 4-byte load: one ARGB pixel.
72 movd
%1, DWORD [ARGBq
+ WIDTHq
* 4 * 2]
; 8-byte load: two ARGB pixels.
74 movq
%1, QWORD [ARGBq
+ WIDTHq
* 4 * 2]
; 16-byte load: four ARGB pixels. movdqu is used because the address may not
; be 16-byte aligned.
76 movdqu
%1, DQWORD
[ARGBq
+ WIDTHq
* 4 * 2]
; Assembly-time failure for any other pixel count.
78 %error unsupported number of pixels.
83 ; Read RGB pixels from the source and convert them to ARGB pixels.
85 ; Read one RGB pixel and convert it to one ARGB pixel.
86 ; Save the WIDTH register to xmm1. (This macro clobbers it.)
89 ; Read three bytes from the source into TEMPd, then copy them to the
90 ; destination xmm register.
; WIDTHq = WIDTHq * 3, so that WIDTHq * 2 below is a byte offset at three bytes
; per pixel.
91 lea WIDTHq
, [WIDTHq
+ WIDTHq
* 2]
; TEMPd = third byte of the pixel, zero-extended.
92 movzx TEMPd
, BYTE [ARGBq
+ WIDTHq
* 2 + 2]
; Low 16 bits of TEMP = first two bytes of the pixel. A 16-bit write leaves
; the upper bits of the register unchanged.
94 mov TEMPw
, WORD [ARGBq
+ WIDTHq
* 2]
97 ; Restore the WIDTH register.
100 ; Read two RGB pixels and convert them to two ARGB pixels.
101 ; Read six bytes from the source to the destination xmm register.
; TEMPq = TEMPq * 3.
; NOTE(review): TEMPq presumably holds a copy of WIDTHq at this point; the
; copy is not visible here -- confirm.
103 lea TEMPq
, [TEMPq
+ TEMPq
* 2]
; Source bytes 0..3 go to the low dword of %1.
104 movd
%1, DWORD [ARGBq
+ TEMPq
* 2]
; Source bytes 4..5 are inserted into word lane 3 of %1.
105 pinsrw
%1, WORD [ARGBq
+ TEMPq
* 2 + 4], 3
107 ; Fill the alpha values of these RGB pixels with 0 and convert them to two ARGB pixels.
111 ; Read four RGB pixels and convert them to four ARGB pixels.
112 ; Read twelve bytes from the source to the destination xmm register.
; TEMPq = TEMPq * 3.
; NOTE(review): TEMPq presumably holds a copy of WIDTHq at this point; the
; copy is not visible here -- confirm.
114 lea TEMPq
, [TEMPq
+ TEMPq
* 2]
; Source bytes 0..7 go to the low qword of %1.
115 movq
%1, QWORD [ARGBq
+ TEMPq
* 2]
; Source bytes 8..11 go to the low dword of xmm1; movd zeroes the rest of xmm1.
116 movd xmm1
, DWORD [ARGBq
+ TEMPq
* 2 + 8]
; %1 = { %1.d[0], %1.d[1], xmm1.d[0], xmm1.d[1] }: the twelve source bytes end
; up contiguous in bytes 0..11 of %1.
117 shufps
%1, xmm1
, 01000100B
119 ; Fill the alpha values of these RGB pixels with 0 and convert them to four ARGB pixels.
; Unpack at byte index 11 (= 4 + 4 + 3).
123 UNPACKRGB
%1, 4 + 4 + 3
; Assembly-time failure for any other pixel count.
125 %error unsupported number of pixels.
; Assembly-time failure when PIXELSIZE has an unsupported value.
129 %error unsupported PIXELSIZE value.
135 ; CALC_Y %1 (xmm), %2 (xmm)
136 ; Calculates four Y values from four ARGB pixels stored in %2.
137 ; %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
138 ; %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
139 ; %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
140 ; %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
143 ; To avoid signed saturation, we divide this conversion formula into two
144 ; formulae and store their results into two XMM registers %1 and xmm2.
145 ; %1.w[0] = 25 * %2.b[0] + 2 * %2.b[1] + 66 * %2.b[2] + 0 * %2.b[3];
146 ; %1.w[1] = 25 * %2.b[4] + 2 * %2.b[5] + 66 * %2.b[6] + 0 * %2.b[7];
147 ; %1.w[2] = 25 * %2.b[8] + 2 * %2.b[9] + 66 * %2.b[10] + 0 * %2.b[11];
148 ; %1.w[3] = 25 * %2.b[12] + 2 * %2.b[13] + 66 * %2.b[14] + 0 * %2.b[15];
149 ; xmm2.w[0] = 0 * %2.b[0] + 127 * %2.b[1] + 0 * %2.b[2] + 0 * %2.b[3];
150 ; xmm2.w[1] = 0 * %2.b[4] + 127 * %2.b[5] + 0 * %2.b[6] + 0 * %2.b[7];
151 ; xmm2.w[2] = 0 * %2.b[8] + 127 * %2.b[9] + 0 * %2.b[10] + 0 * %2.b[11];
152 ; xmm2.w[3] = 0 * %2.b[12] + 127 * %2.b[13] + 0 * %2.b[14] + 0 * %2.b[15];
; pmaddubsw multiplies the unsigned bytes of its first operand by the signed
; bytes of its second operand and adds adjacent products into signed words.
; The coefficients must therefore fit in a signed byte, which is why the G
; weight 129 is split into 2 (XMM_CONST_Y0) and 127 (XMM_CONST_Y1).
154 pmaddubsw
%1, XMM_CONST_Y0
; Same multiply-add against the second coefficient set; the result goes to
; xmm2.
157 pmaddubsw xmm2
, XMM_CONST_Y1
160 ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
161 ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
162 ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
163 ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
; Copy the constant into xmm2, which serves as a scratch register here.
165 movdqa xmm2
, XMM_CONST_128
174 ; INIT_UV %1 (r32), %2 (reg), %3 (imm)
; The loads below are assembled only when SUBSAMPLING == 1 and LINE == 1.
; NOTE(review): they presumably fetch U/V values already stored at
; [%2 + WIDTHq] so that CALC_UV can average with them -- confirm.
178 %if SUBSAMPLING
== 1 && LINE
== 1
; One or two pixels: a single byte is loaded.
179 %if
%3 == 1 ||
%3 == 2
; %1 = byte at [%2 + WIDTHq], zero-extended.
180 movzx %1, BYTE [%2 + WIDTHq
]
; %1 = 16-bit word at [%2 + WIDTHq], zero-extended.
; NOTE(review): the condition selecting this load is not visible here.
182 movzx %1, WORD [%2 + WIDTHq
]
; Assembly-time failure for any other pixel count.
184 %error unsupported number of pixels.
191 ; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
192 ; Calculates two U (or V) values from four ARGB pixels stored in %2.
193 ; if %3 == XMM_CONST_U
195 ; %1.b[0] = ToByte((112 * B(0) - 74 * G(0) - 38 * R(0) + 128) / 256 + 128);
196 ; %1.b[0] = ToByte((112 * B(0) - 74 * G(0) - 38 * R(0) + 128) / 256 + 128);
197 ; %1.b[1] = ToByte((112 * B(2) - 74 * G(2) - 38 * R(2) + 128) / 256 + 128);
198 ; %1.b[1] = ToByte((112 * B(2) - 74 * G(2) - 38 * R(2) + 128) / 256 + 128);
200 ; %1.b[0] = ToByte((112 * B(0) - 74 * G(0) - 38 * R(0) + 128) / 256 + 128);
201 ; %1.b[1] = ToByte((112 * B(2) - 74 * G(2) - 38 * R(2) + 128) / 256 + 128);
203 ; if %3 == XMM_CONST_V
204 ; %1.b[0] = ToByte((-18 * B(0) - 94 * G(0) + 112 * R(0) + 128) / 256 + 128);
205 ; %1.b[1] = ToByte((-18 * B(2) - 94 * G(2) + 112 * R(2) + 128) / 256 + 128);
208 ; for (int i = 0; i < 4; ++i) {
210 ; for (int j = 0; j < 4; ++j)
211 ; %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
218 ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
219 ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
220 ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
221 ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
; xmm2 low words = { %1.w[1], %1.w[0], %1.w[3], %1.w[2] }: each adjacent pair
; of the low four words of %1 is swapped.
222 pshuflw xmm2
, %1, 10110001B
226 ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
227 ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
; Compact words 0 and 2 into word lanes 0 and 1 (lanes 2 and 3 receive the
; same two values again).
228 pshuflw
%1, %1, 10001000B
; First addition of the constant.
; NOTE(review): the two paddw instructions presumably correspond to the two
; "+ 128" terms in the formula above; the division between them is not
; visible here.
229 paddw
%1, XMM_CONST_128
; Second addition of the constant.
231 paddw
%1, XMM_CONST_128
; The averaging described below applies only when SUBSAMPLING == 1 and
; LINE == 1.
234 %if SUBSAMPLING
== 1 && LINE
== 1
235 ; %1.b[0] = (%1.b[0] + %3.b[0] + 1) / 2;
236 ; %1.b[1] = (%1.b[1] + %3.b[1] + 1) / 2;
243 ; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
; Parameters for this inclusion of convert_rgb_to_yuv_ssse3.inc.
; SYMBOL is presumably the name given to the generated routine.
249 %define SYMBOL ConvertARGBToYUVRow_SSSE3
; SUBSAMPLING = 0: the "SUBSAMPLING == 1" paths in this file are not assembled.
251 %define SUBSAMPLING
0
; NOTE(review): LINE is tested by %if blocks in this file, but no definition
; of it is visible for this inclusion -- confirm it is defined before the
; %include.
253 %include "convert_rgb_to_yuv_ssse3.inc"
256 ; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
; Parameters for this inclusion of convert_rgb_to_yuv_ssse3.inc.
; SYMBOL is presumably the name given to the generated routine.
262 %define SYMBOL ConvertRGBToYUVRow_SSSE3
; SUBSAMPLING = 0: the "SUBSAMPLING == 1" paths in this file are not assembled.
264 %define SUBSAMPLING
0
; NOTE(review): no PIXELSIZE or LINE definition is visible for this
; inclusion, although this is the RGB variant -- confirm both are defined
; before the %include.
266 %include "convert_rgb_to_yuv_ssse3.inc"
269 ; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
; Parameters for this inclusion of convert_rgb_to_yuv_ssse3.inc.
; SYMBOL is presumably the name given to the generated routine.
275 %define SYMBOL ConvertARGBToYUVEven_SSSE3
; SUBSAMPLING = 1: the "SUBSAMPLING == 1 && LINE == 1" paths in this file
; additionally depend on LINE.
277 %define SUBSAMPLING
1
; NOTE(review): the visible parameters are identical to the Odd variant, so
; LINE presumably distinguishes the two; its definition is not visible here
; -- confirm it is defined before the %include.
279 %include "convert_rgb_to_yuv_ssse3.inc"
282 ; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
; Parameters for this inclusion of convert_rgb_to_yuv_ssse3.inc.
; SYMBOL is presumably the name given to the generated routine.
288 %define SYMBOL ConvertARGBToYUVOdd_SSSE3
; SUBSAMPLING = 1: the "SUBSAMPLING == 1 && LINE == 1" paths in this file
; additionally depend on LINE.
290 %define SUBSAMPLING
1
; NOTE(review): the visible parameters are identical to the Even variant, so
; LINE presumably distinguishes the two; its definition is not visible here
; -- confirm it is defined before the %include.
292 %include "convert_rgb_to_yuv_ssse3.inc"
295 ; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
; Parameters for this inclusion of convert_rgb_to_yuv_ssse3.inc.
; SYMBOL is presumably the name given to the generated routine.
301 %define SYMBOL ConvertRGBToYUVEven_SSSE3
; SUBSAMPLING = 1: the "SUBSAMPLING == 1 && LINE == 1" paths in this file
; additionally depend on LINE.
303 %define SUBSAMPLING
1
; NOTE(review): no PIXELSIZE or LINE definition is visible for this
; inclusion, although this is the RGB, even-line variant -- confirm both are
; defined before the %include.
305 %include "convert_rgb_to_yuv_ssse3.inc"
308 ; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
; Parameters for this inclusion of convert_rgb_to_yuv_ssse3.inc.
; SYMBOL is presumably the name given to the generated routine.
314 %define SYMBOL ConvertRGBToYUVOdd_SSSE3
; SUBSAMPLING = 1: the "SUBSAMPLING == 1 && LINE == 1" paths in this file
; additionally depend on LINE.
316 %define SUBSAMPLING
1
; NOTE(review): no PIXELSIZE or LINE definition is visible for this
; inclusion, although this is the RGB, odd-line variant -- confirm both are
; defined before the %include.
318 %include "convert_rgb_to_yuv_ssse3.inc"