Revert "Merged all Chromoting Host code into remoting_core.dll (Windows)."
[chromium-blink-merge.git] / third_party / qcms / src / transform-sse1.c
blobaaee1bf22e00e73569f70561d56185a6a33cd30e
1 // qcms
2 // Copyright (C) 2009 Mozilla Foundation
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining
5 // a copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the Software
9 // is furnished to do so, subject to the following conditions:
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
16 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 #include <xmmintrin.h>
24 #include "qcmsint.h"
26 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
27 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
28 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
29 static const ALIGN float floatScaleX4[4] =
30 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
31 static const ALIGN float clampMaxValueX4[4] =
32 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
34 void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
35 unsigned char *src,
36 unsigned char *dest,
37 size_t length,
38 qcms_format_type output_format)
40 unsigned int i;
41 float (*mat)[4] = transform->matrix;
42 char input_back[32];
43 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
44 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
45 * because they don't work on stack variables. gcc 4.4 does do the right thing
46 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
47 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
48 /* share input and output locations to save having to keep the
49 * locations in separate registers */
50 uint32_t const * output = (uint32_t*)input;
52 /* deref *transform now to avoid it in loop */
53 const float *igtbl_r = transform->input_gamma_table_r;
54 const float *igtbl_g = transform->input_gamma_table_g;
55 const float *igtbl_b = transform->input_gamma_table_b;
57 /* deref *transform now to avoid it in loop */
58 const uint8_t *otdata_r = &transform->output_table_r->data[0];
59 const uint8_t *otdata_g = &transform->output_table_g->data[0];
60 const uint8_t *otdata_b = &transform->output_table_b->data[0];
62 /* input matrix values never change */
63 const __m128 mat0 = _mm_load_ps(mat[0]);
64 const __m128 mat1 = _mm_load_ps(mat[1]);
65 const __m128 mat2 = _mm_load_ps(mat[2]);
67 /* these values don't change, either */
68 const __m128 max = _mm_load_ps(clampMaxValueX4);
69 const __m128 min = _mm_setzero_ps();
70 const __m128 scale = _mm_load_ps(floatScaleX4);
72 /* working variables */
73 __m128 vec_r, vec_g, vec_b, result;
74 const int r_out = output_format.r;
75 const int b_out = output_format.b;
77 /* CYA */
78 if (!length)
79 return;
81 /* one pixel is handled outside of the loop */
82 length--;
84 /* setup for transforming 1st pixel */
85 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
86 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
87 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
88 src += 3;
90 /* transform all but final pixel */
92 for (i=0; i<length; i++)
94 /* position values from gamma tables */
95 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
96 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
97 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
99 /* gamma * matrix */
100 vec_r = _mm_mul_ps(vec_r, mat0);
101 vec_g = _mm_mul_ps(vec_g, mat1);
102 vec_b = _mm_mul_ps(vec_b, mat2);
104 /* crunch, crunch, crunch */
105 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
106 vec_r = _mm_max_ps(min, vec_r);
107 vec_r = _mm_min_ps(max, vec_r);
108 result = _mm_mul_ps(vec_r, scale);
110 /* store calc'd output tables indices */
111 *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
112 result = _mm_movehl_ps(result, result);
113 *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ;
115 /* load for next loop while store completes */
116 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
117 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
118 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
119 src += 3;
121 /* use calc'd indices to output RGB values */
122 dest[r_out] = otdata_r[output[0]];
123 dest[1] = otdata_g[output[1]];
124 dest[b_out] = otdata_b[output[2]];
125 dest += 3;
128 /* handle final (maybe only) pixel */
130 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
131 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
132 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
134 vec_r = _mm_mul_ps(vec_r, mat0);
135 vec_g = _mm_mul_ps(vec_g, mat1);
136 vec_b = _mm_mul_ps(vec_b, mat2);
138 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
139 vec_r = _mm_max_ps(min, vec_r);
140 vec_r = _mm_min_ps(max, vec_r);
141 result = _mm_mul_ps(vec_r, scale);
143 *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
144 result = _mm_movehl_ps(result, result);
145 *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
147 dest[r_out] = otdata_r[output[0]];
148 dest[1] = otdata_g[output[1]];
149 dest[b_out] = otdata_b[output[2]];
151 _mm_empty();
154 void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
155 unsigned char *src,
156 unsigned char *dest,
157 size_t length,
158 qcms_format_type output_format)
160 unsigned int i;
161 float (*mat)[4] = transform->matrix;
162 char input_back[32];
163 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
164 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
165 * because they don't work on stack variables. gcc 4.4 does do the right thing
166 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
167 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
168 /* share input and output locations to save having to keep the
169 * locations in separate registers */
170 uint32_t const * output = (uint32_t*)input;
172 /* deref *transform now to avoid it in loop */
173 const float *igtbl_r = transform->input_gamma_table_r;
174 const float *igtbl_g = transform->input_gamma_table_g;
175 const float *igtbl_b = transform->input_gamma_table_b;
177 /* deref *transform now to avoid it in loop */
178 const uint8_t *otdata_r = &transform->output_table_r->data[0];
179 const uint8_t *otdata_g = &transform->output_table_g->data[0];
180 const uint8_t *otdata_b = &transform->output_table_b->data[0];
182 /* input matrix values never change */
183 const __m128 mat0 = _mm_load_ps(mat[0]);
184 const __m128 mat1 = _mm_load_ps(mat[1]);
185 const __m128 mat2 = _mm_load_ps(mat[2]);
187 /* these values don't change, either */
188 const __m128 max = _mm_load_ps(clampMaxValueX4);
189 const __m128 min = _mm_setzero_ps();
190 const __m128 scale = _mm_load_ps(floatScaleX4);
192 /* working variables */
193 __m128 vec_r, vec_g, vec_b, result;
194 const int r_out = output_format.r;
195 const int b_out = output_format.b;
196 unsigned char alpha;
198 /* CYA */
199 if (!length)
200 return;
202 /* one pixel is handled outside of the loop */
203 length--;
205 /* setup for transforming 1st pixel */
206 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
207 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
208 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
209 alpha = src[3];
210 src += 4;
212 /* transform all but final pixel */
214 for (i=0; i<length; i++)
216 /* position values from gamma tables */
217 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
218 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
219 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
221 /* gamma * matrix */
222 vec_r = _mm_mul_ps(vec_r, mat0);
223 vec_g = _mm_mul_ps(vec_g, mat1);
224 vec_b = _mm_mul_ps(vec_b, mat2);
226 /* store alpha for this pixel; load alpha for next */
227 dest[3] = alpha;
228 alpha = src[3];
230 /* crunch, crunch, crunch */
231 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
232 vec_r = _mm_max_ps(min, vec_r);
233 vec_r = _mm_min_ps(max, vec_r);
234 result = _mm_mul_ps(vec_r, scale);
236 /* store calc'd output tables indices */
237 *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
238 result = _mm_movehl_ps(result, result);
239 *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
241 /* load gamma values for next loop while store completes */
242 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
243 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
244 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
245 src += 4;
247 /* use calc'd indices to output RGB values */
248 dest[r_out] = otdata_r[output[0]];
249 dest[1] = otdata_g[output[1]];
250 dest[b_out] = otdata_b[output[2]];
251 dest += 4;
254 /* handle final (maybe only) pixel */
256 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
257 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
258 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
260 vec_r = _mm_mul_ps(vec_r, mat0);
261 vec_g = _mm_mul_ps(vec_g, mat1);
262 vec_b = _mm_mul_ps(vec_b, mat2);
264 dest[3] = alpha;
266 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
267 vec_r = _mm_max_ps(min, vec_r);
268 vec_r = _mm_min_ps(max, vec_r);
269 result = _mm_mul_ps(vec_r, scale);
271 *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
272 result = _mm_movehl_ps(result, result);
273 *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
275 dest[r_out] = otdata_r[output[0]];
276 dest[1] = otdata_g[output[1]];
277 dest[b_out] = otdata_b[output[2]];
279 _mm_empty();