// qcms
// Copyright (C) 2009 Mozilla Foundation
// Copyright (C) 2015 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <emmintrin.h>

#include "qcmsint.h"

/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
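/* Note: clamping to CLAMPMAXVAL before scaling by FLOATSCALE keeps the
 * converted result within [0, PRECACHE_OUTPUT_SIZE - 1], i.e. a valid index
 * into the output precache tables. */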

void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length,
                                          qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
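    /* (&input_back[16] points into the middle of the 32-byte scratch buffer;
     * masking with ~0xf rounds it down to a 16-byte boundary while still
     * leaving at least 16 usable bytes.) */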
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
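    /* output_format supplies the byte offsets of red and blue in each output
     * pixel, so the same code path can emit either RGB(A) or BGR(A) order;
     * green always lands at byte 1. */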

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
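        /* (_mm_cvtps_epi32 rounds to the nearest integer under the default
         * MXCSR rounding mode, and the aligned store is safe because `output`
         * was forced onto a 16-byte boundary above.) */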

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}

void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length,
                                           qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
    unsigned char alpha;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next */
        dest[3] = alpha;
        alpha = src[3];

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}
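
/* Tetrahedral interpolation in the device CLUT: each source pixel is mapped to
 * lattice (grid) coordinates; the fractional parts select one of the six
 * tetrahedra that partition the enclosing lattice cell (chosen by the ordering
 * of rx, ry, rz), and the output is interpolated from that tetrahedron's four
 * lattice points. */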
void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
                                              unsigned char *src,
                                              unsigned char *dest,
                                              size_t length,
                                              qcms_format_type output_format)
{
    const int r_out = output_format.r;
    const int b_out = output_format.b;

    size_t i;

    const int xy_len = 1;
    const int x_len = transform->grid_size;
    const int len = x_len * x_len;

    const __m128 __clut_stride = _mm_set_ps((float)(3 * xy_len), (float)(3 * x_len), (float)(3 * len), 0);
    const __m128 __grid_scaled = _mm_set1_ps((1.0f / 255.0f) * (transform->grid_size - 1));
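    /* __grid_scaled maps 8-bit input [0,255] onto lattice coordinates
     * [0, grid_size - 1]; __clut_stride holds the per-axis lattice strides in
     * floats (each lattice point occupies 3 floats in the CLUT, hence the
     * factor of 3), red varying slowest and blue fastest. */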

    const __m128 __255 = _mm_set1_ps(255.0f);
    const __m128 __one = _mm_set1_ps(1.0f);
    const __m128 __000 = _mm_setzero_ps();

    const float* r_table = transform->r_clut;
    const float* g_table = transform->g_clut;
    const float* b_table = transform->b_clut;

    int i3, i2, i1, i0;

    __m128 c3;
    __m128 c2;
    __m128 c1;
    __m128 c0;

    __m128 in;

    __m128 xyz_r;
    __m128 xyz_0;
    __m128 xyz_n;

    ALIGN float xyz_r_f[4];
    ALIGN int xyz_0_i[4];
    ALIGN int xyz_n_i[4];

    __m128i result;

#define TETRA_SRC_RGB(r, g, b) _mm_set_ps((float)b, (float)g, (float)r, 0.f)
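/* _mm_set_ps() takes its arguments high-lane-first, so the pixel lands in
 * lanes 1..3 as (r, g, b) with lane 0 unused; __clut_stride and the rx/ry/rz,
 * x0.., xn.. index macros below follow the same lane layout. */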

    for (i = 0; i < length; ++i) {
        // compute input point in cube lattice (grid) co-ordinates
        in = _mm_mul_ps(TETRA_SRC_RGB(src[0], src[1], src[2]), __grid_scaled);

        // floor: convert to int (truncate), convert back to float
        xyz_0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));

        // ceil: where in is greater than xyz_0 = floor(in), add 1
        xyz_n = _mm_add_ps(xyz_0, _mm_and_ps(_mm_cmpgt_ps(in, xyz_0), __one));

        // compute the input point relative to the sub-cube origin
        xyz_r = _mm_sub_ps(in, xyz_0);

#define rx (xyz_r_f[1])
#define ry (xyz_r_f[2])
#define rz (xyz_r_f[3])

        _mm_store_ps(xyz_r_f, xyz_r);

#define x0 (xyz_0_i[1])
#define y0 (xyz_0_i[2])
#define z0 (xyz_0_i[3])

        xyz_0 = _mm_mul_ps(xyz_0, __clut_stride);
        _mm_store_si128((__m128i*) xyz_0_i, _mm_cvtps_epi32(xyz_0));

#define xn (xyz_n_i[1])
#define yn (xyz_n_i[2])
#define zn (xyz_n_i[3])

        xyz_n = _mm_mul_ps(xyz_n, __clut_stride);
        _mm_store_si128((__m128i*) xyz_n_i, _mm_cvtps_epi32(xyz_n));

        dest[3] = src[3];
        src += 4;

#define SET_I0_AND_PREFETCH_CLUT() \
        _mm_prefetch((char*)&(r_table[i0 = x0 + y0 + z0]), _MM_HINT_T0)

#if !defined(_MSC_VER)
        SET_I0_AND_PREFETCH_CLUT();
#endif

#define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
        c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
        c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
        c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
        c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
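
        /* The branches below pick the tetrahedron of the lattice cell that
         * contains the input point (one of six, chosen by the ordering of
         * rx, ry, rz), fetch its four lattice points c0..c3, and then convert
         * them in place into the deltas used for the interpolation. */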
        if (rx >= ry) {

#if defined(_MSC_VER)
            SET_I0_AND_PREFETCH_CLUT();
#endif
            if (ry >= rz) {             // rx >= ry && ry >= rz

                i3 = yn + (i1 = xn);
                i1 += i0 - x0;
                i2 = i3 + z0;
                i3 += zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c3 = _mm_sub_ps(c3, c2);
                c2 = _mm_sub_ps(c2, c1);
                c1 = _mm_sub_ps(c1, c0);

            } else if (rx >= rz) {      // rx >= rz && rz >= ry

                i3 = zn + (i1 = xn);
                i1 += i0 - x0;
                i2 = i3 + yn;
                i3 += y0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c2 = _mm_sub_ps(c2, c3);
                c3 = _mm_sub_ps(c3, c1);
                c1 = _mm_sub_ps(c1, c0);

            } else {                    // rz > rx && rx >= ry

                i2 = xn + (i3 = zn);
                i3 += i0 - z0;
                i1 = i2 + y0;
                i2 += yn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c2 = _mm_sub_ps(c2, c1);
                c1 = _mm_sub_ps(c1, c3);
                c3 = _mm_sub_ps(c3, c0);
            }
        } else {

#if defined(_MSC_VER)
            SET_I0_AND_PREFETCH_CLUT();
#endif
            if (rx >= rz) {             // ry > rx && rx >= rz

                i3 = xn + (i2 = yn);
                i2 += i0 - y0;
                i1 = i3 + z0;
                i3 += zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c3 = _mm_sub_ps(c3, c1);
                c1 = _mm_sub_ps(c1, c2);
                c2 = _mm_sub_ps(c2, c0);

            } else if (ry >= rz) {      // ry >= rz && rz > rx

                i3 = zn + (i2 = yn);
                i2 += i0 - y0;
                i1 = i3 + xn;
                i3 += x0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c1 = _mm_sub_ps(c1, c3);
                c3 = _mm_sub_ps(c3, c2);
                c2 = _mm_sub_ps(c2, c0);

            } else {                    // rz > ry && ry > rx

                i2 = yn + (i3 = zn);
                i3 += i0 - z0;
                i1 = i2 + xn;
                i2 += x0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c1 = _mm_sub_ps(c1, c2);
                c2 = _mm_sub_ps(c2, c3);
                c3 = _mm_sub_ps(c3, c0);
            }
        }

        // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz

        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(1, 1, 1, 1));
        c1 = _mm_mul_ps(c1, in);
        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(2, 2, 2, 2));
        c2 = _mm_mul_ps(c2, in);
        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(3, 3, 3, 3));
        c3 = _mm_mul_ps(c3, in);

        in = _mm_add_ps(c3, c2);
        in = _mm_add_ps(in, c1);
        in = _mm_add_ps(in, c0);

        // clamp to [0.0..1.0] and scale by 255

        in = _mm_max_ps(in, __000);
        in = _mm_min_ps(in, __one);
        in = _mm_mul_ps(in, __255);

        result = _mm_cvtps_epi32(in); // convert to int (rounding)
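
        /* SSE2 has no 32-bit extract, so read the low 16 bits of each 32-bit
         * lane with _mm_extract_epi16 (even word indices); the values fit in
         * 8 bits after the clamp and scale above. */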
        dest[r_out] = (unsigned char) _mm_extract_epi16(result, 2);
        dest[1] = (unsigned char) _mm_extract_epi16(result, 4);
        dest[b_out] = (unsigned char) _mm_extract_epi16(result, 6);

        dest += 4;
    }
}