// Copyright (C) 2009 Mozilla Foundation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 #include <emmintrin.h>
26 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
27 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
28 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
29 static const ALIGN
float floatScaleX4
[4] =
30 { FLOATSCALE
, FLOATSCALE
, FLOATSCALE
, FLOATSCALE
};
31 static const ALIGN
float clampMaxValueX4
[4] =
32 { CLAMPMAXVAL
, CLAMPMAXVAL
, CLAMPMAXVAL
, CLAMPMAXVAL
};
/*
 * qcms_transform_data_rgb_out_lut_sse2: SSE2 path that sends each pixel
 * through the per-channel input gamma tables, the transform matrix, and the
 * per-channel output lookup tables.
 *
 * Per pixel, as visible below:
 *   1. src[0], src[1] and src[2] index igtbl_r, igtbl_g and igtbl_b to fetch
 *      one float each.
 *   2. Each float is broadcast to all four lanes and multiplied by the
 *      matching matrix row (mat[0], mat[1], mat[2]); the three products are
 *      summed.
 *   3. The sum is clamped to [0, clampMaxValueX4], multiplied by floatScaleX4
 *      and converted to 32-bit integers.
 *   4. Those integers index otdata_r, otdata_g and otdata_b; the resulting
 *      bytes are written to dest[r_out], dest[1] and dest[b_out].
 *
 * transform     - supplies matrix, input_gamma_table_{r,g,b} and
 *                 output_table_{r,g,b}->data.
 * output_format - its .r and .b members give the destination byte offsets of
 *                 the red and blue results; green always goes to dest[1].
 *
 * NOTE(review): src, dest, length, i and input_back are used below, but their
 * declarations, the function and loop braces, and any advancing of src/dest
 * are not visible in this view of the file -- confirm against the full source
 * before relying on this description.
 */
34 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform
*transform
,
38 qcms_format_type output_format
)
/* mat[k] is row k of the matrix, viewed as four floats. */
41 float (*mat
)[4] = transform
->matrix
;
43 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
 * because they don't work on stack variables. gcc 4.4 does do the right thing
 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
/* Rounds the address of input_back[16] down to a multiple of 16.
 * NOTE(review): this stays inside input_back only if the buffer holds at
 * least 32 bytes; its declaration is not visible here -- confirm. */
47 float const * input
= (float*)(((uintptr_t)&input_back
[16]) & ~0xf);
48 /* share input and output locations to save having to keep the
 * locations in separate registers */
50 uint32_t const * output
= (uint32_t*)input
;
52 /* deref *transform now to avoid it in loop */
53 const float *igtbl_r
= transform
->input_gamma_table_r
;
54 const float *igtbl_g
= transform
->input_gamma_table_g
;
55 const float *igtbl_b
= transform
->input_gamma_table_b
;
57 /* deref *transform now to avoid it in loop */
58 const uint8_t *otdata_r
= &transform
->output_table_r
->data
[0];
59 const uint8_t *otdata_g
= &transform
->output_table_g
->data
[0];
60 const uint8_t *otdata_b
= &transform
->output_table_b
->data
[0];
62 /* input matrix values never change */
63 const __m128 mat0
= _mm_load_ps(mat
[0]);
64 const __m128 mat1
= _mm_load_ps(mat
[1]);
65 const __m128 mat2
= _mm_load_ps(mat
[2]);
67 /* these values don't change, either */
68 const __m128 max
= _mm_load_ps(clampMaxValueX4
);
69 const __m128 min
= _mm_setzero_ps();
70 const __m128 scale
= _mm_load_ps(floatScaleX4
);
72 /* working variables */
73 __m128 vec_r
, vec_g
, vec_b
, result
;
/* destination byte offsets for the red and blue results */
74 const int r_out
= output_format
.r
;
75 const int b_out
= output_format
.b
;
81 /* one pixel is handled outside of the loop */
/* NOTE(review): the statement this comment introduces (presumably an
 * adjustment of length) is not visible in this view -- confirm. */
84 /* setup for transforming 1st pixel */
85 vec_r
= _mm_load_ss(&igtbl_r
[src
[0]]);
86 vec_g
= _mm_load_ss(&igtbl_g
[src
[1]]);
87 vec_b
= _mm_load_ss(&igtbl_b
[src
[2]]);
90 /* transform all but final pixel */
/* NOTE(review): no loop braces are visible; presumably everything up to the
 * dest[] writes below forms the loop body -- confirm. */
92 for (i
=0; i
<length
; i
++)
94 /* position values from gamma tables */
/* shuffle mask 0 copies lane 0 into all four lanes */
95 vec_r
= _mm_shuffle_ps(vec_r
, vec_r
, 0);
96 vec_g
= _mm_shuffle_ps(vec_g
, vec_g
, 0);
97 vec_b
= _mm_shuffle_ps(vec_b
, vec_b
, 0);
/* scale each matrix row by its broadcast gamma value */
100 vec_r
= _mm_mul_ps(vec_r
, mat0
);
101 vec_g
= _mm_mul_ps(vec_g
, mat1
);
102 vec_b
= _mm_mul_ps(vec_b
, mat2
);
104 /* crunch, crunch, crunch */
/* sum the three products, clamp to [min, max], then scale */
105 vec_r
= _mm_add_ps(vec_r
, _mm_add_ps(vec_g
, vec_b
));
106 vec_r
= _mm_max_ps(min
, vec_r
);
107 vec_r
= _mm_min_ps(max
, vec_r
);
108 result
= _mm_mul_ps(vec_r
, scale
);
110 /* store calc'd output tables indices */
111 _mm_store_si128((__m128i
*)output
, _mm_cvtps_epi32(result
));
113 /* load for next loop while store completes */
114 vec_r
= _mm_load_ss(&igtbl_r
[src
[0]]);
115 vec_g
= _mm_load_ss(&igtbl_g
[src
[1]]);
116 vec_b
= _mm_load_ss(&igtbl_b
[src
[2]]);
119 /* use calc'd indices to output RGB values */
120 dest
[r_out
] = otdata_r
[output
[0]];
121 dest
[1] = otdata_g
[output
[1]];
122 dest
[b_out
] = otdata_b
[output
[2]];
126 /* handle final (maybe only) pixel */
/* same broadcast / multiply / sum / clamp / scale / lookup sequence as
 * above, without loading a following pixel */
128 vec_r
= _mm_shuffle_ps(vec_r
, vec_r
, 0);
129 vec_g
= _mm_shuffle_ps(vec_g
, vec_g
, 0);
130 vec_b
= _mm_shuffle_ps(vec_b
, vec_b
, 0);
132 vec_r
= _mm_mul_ps(vec_r
, mat0
);
133 vec_g
= _mm_mul_ps(vec_g
, mat1
);
134 vec_b
= _mm_mul_ps(vec_b
, mat2
);
136 vec_r
= _mm_add_ps(vec_r
, _mm_add_ps(vec_g
, vec_b
));
137 vec_r
= _mm_max_ps(min
, vec_r
);
138 vec_r
= _mm_min_ps(max
, vec_r
);
139 result
= _mm_mul_ps(vec_r
, scale
);
141 _mm_store_si128((__m128i
*)output
, _mm_cvtps_epi32(result
));
143 dest
[r_out
] = otdata_r
[output
[0]];
144 dest
[1] = otdata_g
[output
[1]];
145 dest
[b_out
] = otdata_b
[output
[2]];
/*
 * qcms_transform_data_rgba_out_lut_sse2: SSE2 path whose visible statements
 * match the rgb variant. Each pixel goes through the per-channel input gamma
 * tables, the transform matrix, and the per-channel output lookup tables.
 *
 * Per pixel, as visible below:
 *   1. src[0], src[1] and src[2] index igtbl_r, igtbl_g and igtbl_b to fetch
 *      one float each.
 *   2. Each float is broadcast to all four lanes and multiplied by the
 *      matching matrix row (mat[0], mat[1], mat[2]); the three products are
 *      summed.
 *   3. The sum is clamped to [0, clampMaxValueX4], multiplied by floatScaleX4
 *      and converted to 32-bit integers.
 *   4. Those integers index otdata_r, otdata_g and otdata_b; the resulting
 *      bytes are written to dest[r_out], dest[1] and dest[b_out].
 *
 * transform     - supplies matrix, input_gamma_table_{r,g,b} and
 *                 output_table_{r,g,b}->data.
 * output_format - its .r and .b members give the destination byte offsets of
 *                 the red and blue results; green always goes to dest[1].
 *
 * NOTE(review): src, dest, length, i and input_back are used below, but their
 * declarations, the function and loop braces, any advancing of src/dest, and
 * the alpha handling mentioned in a comment below are not visible in this
 * view of the file -- confirm against the full source.
 */
148 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform
*transform
,
152 qcms_format_type output_format
)
/* mat[k] is row k of the matrix, viewed as four floats. */
155 float (*mat
)[4] = transform
->matrix
;
157 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
 * because they don't work on stack variables. gcc 4.4 does do the right thing
 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
/* Rounds the address of input_back[16] down to a multiple of 16.
 * NOTE(review): this stays inside input_back only if the buffer holds at
 * least 32 bytes; its declaration is not visible here -- confirm. */
161 float const * input
= (float*)(((uintptr_t)&input_back
[16]) & ~0xf);
162 /* share input and output locations to save having to keep the
 * locations in separate registers */
164 uint32_t const * output
= (uint32_t*)input
;
166 /* deref *transform now to avoid it in loop */
167 const float *igtbl_r
= transform
->input_gamma_table_r
;
168 const float *igtbl_g
= transform
->input_gamma_table_g
;
169 const float *igtbl_b
= transform
->input_gamma_table_b
;
171 /* deref *transform now to avoid it in loop */
172 const uint8_t *otdata_r
= &transform
->output_table_r
->data
[0];
173 const uint8_t *otdata_g
= &transform
->output_table_g
->data
[0];
174 const uint8_t *otdata_b
= &transform
->output_table_b
->data
[0];
176 /* input matrix values never change */
177 const __m128 mat0
= _mm_load_ps(mat
[0]);
178 const __m128 mat1
= _mm_load_ps(mat
[1]);
179 const __m128 mat2
= _mm_load_ps(mat
[2]);
181 /* these values don't change, either */
182 const __m128 max
= _mm_load_ps(clampMaxValueX4
);
183 const __m128 min
= _mm_setzero_ps();
184 const __m128 scale
= _mm_load_ps(floatScaleX4
);
186 /* working variables */
187 __m128 vec_r
, vec_g
, vec_b
, result
;
/* destination byte offsets for the red and blue results */
188 const int r_out
= output_format
.r
;
189 const int b_out
= output_format
.b
;
196 /* one pixel is handled outside of the loop */
/* NOTE(review): the statement this comment introduces (presumably an
 * adjustment of length) is not visible in this view -- confirm. */
199 /* setup for transforming 1st pixel */
200 vec_r
= _mm_load_ss(&igtbl_r
[src
[0]]);
201 vec_g
= _mm_load_ss(&igtbl_g
[src
[1]]);
202 vec_b
= _mm_load_ss(&igtbl_b
[src
[2]]);
206 /* transform all but final pixel */
/* NOTE(review): no loop braces are visible; presumably everything up to the
 * dest[] writes below forms the loop body -- confirm. */
208 for (i
=0; i
<length
; i
++)
210 /* position values from gamma tables */
/* shuffle mask 0 copies lane 0 into all four lanes */
211 vec_r
= _mm_shuffle_ps(vec_r
, vec_r
, 0);
212 vec_g
= _mm_shuffle_ps(vec_g
, vec_g
, 0);
213 vec_b
= _mm_shuffle_ps(vec_b
, vec_b
, 0);
/* scale each matrix row by its broadcast gamma value */
216 vec_r
= _mm_mul_ps(vec_r
, mat0
);
217 vec_g
= _mm_mul_ps(vec_g
, mat1
);
218 vec_b
= _mm_mul_ps(vec_b
, mat2
);
220 /* store alpha for this pixel; load alpha for next */
/* NOTE(review): the alpha store/load this comment describes is not visible
 * in this view -- confirm against the full source. */
224 /* crunch, crunch, crunch */
/* sum the three products, clamp to [min, max], then scale */
225 vec_r
= _mm_add_ps(vec_r
, _mm_add_ps(vec_g
, vec_b
));
226 vec_r
= _mm_max_ps(min
, vec_r
);
227 vec_r
= _mm_min_ps(max
, vec_r
);
228 result
= _mm_mul_ps(vec_r
, scale
);
230 /* store calc'd output tables indices */
231 _mm_store_si128((__m128i
*)output
, _mm_cvtps_epi32(result
));
233 /* load gamma values for next loop while store completes */
234 vec_r
= _mm_load_ss(&igtbl_r
[src
[0]]);
235 vec_g
= _mm_load_ss(&igtbl_g
[src
[1]]);
236 vec_b
= _mm_load_ss(&igtbl_b
[src
[2]]);
239 /* use calc'd indices to output RGB values */
240 dest
[r_out
] = otdata_r
[output
[0]];
241 dest
[1] = otdata_g
[output
[1]];
242 dest
[b_out
] = otdata_b
[output
[2]];
246 /* handle final (maybe only) pixel */
/* same broadcast / multiply / sum / clamp / scale / lookup sequence as
 * above, without loading a following pixel */
248 vec_r
= _mm_shuffle_ps(vec_r
, vec_r
, 0);
249 vec_g
= _mm_shuffle_ps(vec_g
, vec_g
, 0);
250 vec_b
= _mm_shuffle_ps(vec_b
, vec_b
, 0);
252 vec_r
= _mm_mul_ps(vec_r
, mat0
);
253 vec_g
= _mm_mul_ps(vec_g
, mat1
);
254 vec_b
= _mm_mul_ps(vec_b
, mat2
);
258 vec_r
= _mm_add_ps(vec_r
, _mm_add_ps(vec_g
, vec_b
));
259 vec_r
= _mm_max_ps(min
, vec_r
);
260 vec_r
= _mm_min_ps(max
, vec_r
);
261 result
= _mm_mul_ps(vec_r
, scale
);
263 _mm_store_si128((__m128i
*)output
, _mm_cvtps_epi32(result
));
265 dest
[r_out
] = otdata_r
[output
[0]];
266 dest
[1] = otdata_g
[output
[1]];
267 dest
[b_out
] = otdata_b
[output
[2]];