// Copyright (C) 2009 Mozilla Foundation
// Copyright (C) 2015 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include <emmintrin.h>

#include "qcmsint.h"
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
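/* CLAMPMAXVAL is (N - 1) / N for N = PRECACHE_OUTPUT_SIZE, so a clamped value
 * multiplied by FLOATSCALE can never exceed N - 1; every index computed below
 * therefore stays inside the precache output tables. */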
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE };
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL };
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length,
                                          qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
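    /* For illustration: if &input_back were 0x1009, &input_back[16] is 0x1019
     * and masking with ~0xf gives 0x1010, which is 16-byte aligned and still
     * inside the 32-byte buffer (addresses here are hypothetical). */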
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;
    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];
    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max   = _mm_load_ps(clampMaxValueX4);
    const __m128 min   = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);
    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
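    /* output_format carries the byte offsets of R and B within a pixel, so the
     * same loop can write either RGB(A) or BGR(A); G is always at offset 1 */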
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;
    /* transform all but final pixel */

    for (i = 0; i < length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
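        /* each shuffle with selector 0 broadcasts lane 0, so every lane of
         * vec_r/vec_g/vec_b now holds that channel's gamma-corrected value */

        /* gamma * matrix */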
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);
        /* crunch, crunch, crunch */
        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r  = _mm_max_ps(min, vec_r);
        vec_r  = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);
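        /* the add completes the 3x3 matrix multiply (sum of the three scaled
         * columns); clamping to [0, CLAMPMAXVAL] and scaling by FLOATSCALE
         * turns each channel into an in-range output-table index */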
        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;
        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1]     = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 3;
    }
    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r  = _mm_max_ps(min, vec_r);
    vec_r  = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);
    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1]     = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length,
                                           qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;
    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];
    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max   = _mm_load_ps(clampMaxValueX4);
    const __m128 min   = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);
    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
    unsigned char alpha;
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;
    /* transform all but final pixel */

    for (i = 0; i < length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);
        /* store alpha for this pixel; load alpha for next */
        dest[3] = alpha;
        alpha   = src[3];
        /* crunch, crunch, crunch */
        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r  = _mm_max_ps(min, vec_r);
        vec_r  = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);
        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;
        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1]     = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 4;
    }
    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r  = _mm_max_ps(min, vec_r);
    vec_r  = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);
    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1]     = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}
void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
                                              unsigned char *src,
                                              unsigned char *dest,
                                              size_t length,
                                              qcms_format_type output_format)
{
    const int r_out = output_format.r;
    const int b_out = output_format.b;

    size_t i;
    const int xy_len = 1;
    const int x_len  = transform->grid_size;
    const int len    = x_len * x_len;

    const __m128 __clut_stride = _mm_set_ps((float)(3 * xy_len), (float)(3 * x_len), (float)(3 * len), 0);
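    /* per-axis index strides into the CLUT lattice, pre-placed in the lanes
     * used below: stepping x moves len (= grid_size^2) samples, y moves x_len,
     * z moves xy_len (= 1); the factor 3 reflects interleaved RGB samples */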
    const __m128 __grid_scaled = _mm_set1_ps((1.0f / 255.0f) * (transform->grid_size - 1));

    const __m128 __255 = _mm_set1_ps(255.0f);
    const __m128 __one = _mm_set1_ps(1.0f);
    const __m128 __000 = _mm_setzero_ps();
    const float* r_table = transform->r_clut;
    const float* g_table = transform->g_clut;
    const float* b_table = transform->b_clut;

    int i0, i1, i2, i3;

    __m128 c0, c1, c2, c3;
    __m128 in, xyz_0, xyz_n, xyz_r;

    __m128i result;
    ALIGN float xyz_r_f[4];
    ALIGN int   xyz_0_i[4];
    ALIGN int   xyz_n_i[4];
#define TETRA_SRC_RGB(r, g, b) _mm_set_ps((float)b, (float)g, (float)r, 0.f)
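    /* _mm_set_ps fills lanes high-to-low, so lane 0 stays unused and r, g, b
     * land in lanes 1, 2, 3, matching the rx/ry/rz and x0/y0/z0 accessors
     * defined inside the loop */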
    for (i = 0; i < length; ++i) {
        // compute input point in cube lattice (grid) co-ordinates
        in = _mm_mul_ps(TETRA_SRC_RGB(src[0], src[1], src[2]), __grid_scaled);
        // floor: convert to int (truncate), convert back to float
        xyz_0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));
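        // (truncation equals floor here because the scaled input is never negative)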
        // ceil: where in is greater than xyz_0 = floor(in), add 1
        xyz_n = _mm_add_ps(xyz_0, _mm_and_ps(_mm_cmpgt_ps(in, xyz_0), __one));
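        // _mm_cmpgt_ps produces an all-ones mask in each qualifying lane, so
        // ANDing with __one injects exactly 1.0f where rounding up is needed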
        // compute the input point relative to the sub-cube origin
        xyz_r = _mm_sub_ps(in, xyz_0);
#define rx (xyz_r_f[1])
#define ry (xyz_r_f[2])
#define rz (xyz_r_f[3])

        _mm_store_ps(xyz_r_f, xyz_r);
#define x0 (xyz_0_i[1])
#define y0 (xyz_0_i[2])
#define z0 (xyz_0_i[3])

        xyz_0 = _mm_mul_ps(xyz_0, __clut_stride);
        _mm_store_si128((__m128i*) xyz_0_i, _mm_cvtps_epi32(xyz_0));
#define xn (xyz_n_i[1])
#define yn (xyz_n_i[2])
#define zn (xyz_n_i[3])

        xyz_n = _mm_mul_ps(xyz_n, __clut_stride);
        _mm_store_si128((__m128i*) xyz_n_i, _mm_cvtps_epi32(xyz_n));
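        /* x0/y0/z0 and xn/yn/zn now hold the near- and far-corner offsets of
         * the enclosing sub-cube, already multiplied by the CLUT strides, so
         * corner indices below are formed by plain addition */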
#define SET_I0_AND_PREFETCH_CLUT() \
    _mm_prefetch((char*)&(r_table[i0 = x0 + y0 + z0]), _MM_HINT_T0)

#if !defined(_MSC_VER)
        SET_I0_AND_PREFETCH_CLUT();
#endif
#define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
    c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
    c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
    c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
    c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
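        /* c0..c3 fetch four corners of the tetrahedron containing (rx, ry, rz);
         * each branch below selects the tetrahedron from the ordering of rx,
         * ry, rz and rewrites c1..c3 as corner-to-corner deltas for the
         * weighted sum at the end of the loop */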
        if (rx >= ry) {

#if defined(_MSC_VER)
            SET_I0_AND_PREFETCH_CLUT();
#endif
            if (ry >= rz) {         // rx >= ry && ry >= rz
                i3 = xn + yn + zn;
                i2 = xn + yn + z0;
                i1 = xn + y0 + z0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c3 = _mm_sub_ps(c3, c2);
                c2 = _mm_sub_ps(c2, c1);
                c1 = _mm_sub_ps(c1, c0);
            } else if (rx >= rz) {  // rx >= rz && rz >= ry

                i3 = xn + y0 + zn;
                i2 = xn + yn + zn;
                i1 = xn + y0 + z0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c2 = _mm_sub_ps(c2, c3);
                c3 = _mm_sub_ps(c3, c1);
                c1 = _mm_sub_ps(c1, c0);
            } else {                // rz > rx && rx >= ry

                i3 = x0 + y0 + zn;
                i2 = xn + yn + zn;
                i1 = xn + y0 + zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c2 = _mm_sub_ps(c2, c1);
                c1 = _mm_sub_ps(c1, c3);
                c3 = _mm_sub_ps(c3, c0);
            }
        } else {

#if defined(_MSC_VER)
            SET_I0_AND_PREFETCH_CLUT();
#endif
            if (rx >= rz) {         // ry > rx && rx >= rz
                i3 = xn + yn + zn;
                i2 = x0 + yn + z0;
                i1 = xn + yn + z0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c3 = _mm_sub_ps(c3, c1);
                c1 = _mm_sub_ps(c1, c2);
                c2 = _mm_sub_ps(c2, c0);
            } else if (ry >= rz) {  // ry >= rz && rz > rx

                i3 = x0 + yn + zn;
                i2 = x0 + yn + z0;
                i1 = xn + yn + zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c1 = _mm_sub_ps(c1, c3);
                c3 = _mm_sub_ps(c3, c2);
                c2 = _mm_sub_ps(c2, c0);
            } else {                // rz > ry && ry > rx

                i3 = x0 + y0 + zn;
                i2 = x0 + yn + zn;
                i1 = xn + yn + zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c1 = _mm_sub_ps(c1, c2);
                c2 = _mm_sub_ps(c2, c3);
                c3 = _mm_sub_ps(c3, c0);
            }
        }
        // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz

        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(1, 1, 1, 1));
        c1 = _mm_mul_ps(c1, in);
        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(2, 2, 2, 2));
        c2 = _mm_mul_ps(c2, in);
        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(3, 3, 3, 3));
        c3 = _mm_mul_ps(c3, in);

        in = _mm_add_ps(c3, c2);
        in = _mm_add_ps(in, c1);
        in = _mm_add_ps(in, c0);
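        // i.e. out = c0 + c1*rx + c2*ry + c3*rz: the tetrahedral blend of the
        // corner deltas computed above (each shuffle broadcasts rx, ry or rz)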
        // clamp to [0.0..1.0] and scale by 255

        in = _mm_max_ps(in, __000);
        in = _mm_min_ps(in, __one);
        in = _mm_mul_ps(in, __255);
        result = _mm_cvtps_epi32(in); // convert to int (rounding)
        dest[r_out] = (unsigned char) _mm_extract_epi16(result, 2);
        dest[1]     = (unsigned char) _mm_extract_epi16(result, 4);
        dest[b_out] = (unsigned char) _mm_extract_epi16(result, 6);
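        /* the low 16 bits of 32-bit lanes 1, 2 and 3 sit at even epi16
         * positions 2, 4 and 6; the values are already clamped to 0..255,
         * so the low byte of each extract is the final channel value */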