//  Copyright (C) 2009 Mozilla Foundation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include <xmmintrin.h>

#include "qcmsint.h"  /* qcms_transform, qcms_format_type, ALIGN, PRECACHE_OUTPUT_SIZE */
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE };
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL };
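/*
 * Note on the constants above: the matrix results are clamped to
 * [0, CLAMPMAXVAL] and then multiplied by FLOATSCALE, so the integer
 * indices computed below are intended to land in
 * [0, PRECACHE_OUTPUT_SIZE - 1] and keep the output table lookups in bounds.
 */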
void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length,
                                          qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;
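    /* &input_back[16] rounded down to a 16-byte boundary lands somewhere in the
     * first 16 bytes of input_back, so the 16 bytes written through 'output'
     * below always stay inside the 32-byte buffer. */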
    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];
    /* input matrix values never change */
    const __m128 mat0  = _mm_load_ps(mat[0]);
    const __m128 mat1  = _mm_load_ps(mat[1]);
    const __m128 mat2  = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max   = _mm_load_ps(clampMaxValueX4);
    const __m128 min   = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);
    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
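    /* r_out/b_out are the byte offsets of red and blue in the destination
     * pixel, so the same code path serves both RGB and BGR output orders. */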
    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;
    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;
    /* transform all but final pixel */

    for (i = 0; i < length; i++)
    {
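        /* Each iteration broadcasts the gamma-corrected r, g and b values
         * across a vector, scales one row of the stored matrix with each,
         * and sums the three products:
         *     result[j] = r*mat[0][j] + g*mat[1][j] + b*mat[2][j]
         * The sum is then clamped to [0, CLAMPMAXVAL] and scaled by
         * FLOATSCALE to produce output-table indices. */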
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r  = _mm_max_ps(min, vec_r);
        vec_r  = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);
        /* store calc'd output tables indices */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
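        /* _mm_cvtps_pi32 converts two floats at a time into an MMX register
         * (SSE1 has no 4-wide float-to-int conversion into an XMM register),
         * which is why the function ends with _mm_empty() to clear the MMX
         * state before any later floating-point code. */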
        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;
        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1]     = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 3;
    }
    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r  = _mm_max_ps(min, vec_r);
    vec_r  = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);
    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[r_out] = otdata_r[output[0]];
    dest[1]     = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];

    /* clear the MMX state left by _mm_cvtps_pi32 */
    _mm_empty();
}
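/*
 * The RGBA variant below follows the same structure; the only differences are
 * a 4-byte source/destination stride and the alpha byte, which is copied
 * through unchanged rather than color-managed.
 */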
void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length,
                                           qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;
    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];
    /* input matrix values never change */
    const __m128 mat0  = _mm_load_ps(mat[0]);
    const __m128 mat1  = _mm_load_ps(mat[1]);
    const __m128 mat2  = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max   = _mm_load_ps(clampMaxValueX4);
    const __m128 min   = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);
    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
    unsigned char alpha;
    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;
    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;
    /* transform all but final pixel */

    for (i = 0; i < length; i++)
    {
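        /* The loop is software-pipelined: the gamma-table loads for the next
         * pixel are issued while the stores for the current pixel complete,
         * which is why one pixel's setup precedes the loop and the final
         * pixel is finished after it. */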
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);
        /* store alpha for this pixel; load alpha for next */
        dest[3] = alpha;
        alpha   = src[3];
        /* crunch, crunch, crunch */
        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r  = _mm_max_ps(min, vec_r);
        vec_r  = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);
        /* store calc'd output tables indices */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;
        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1]     = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 4;
    }
    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;
    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r  = _mm_max_ps(min, vec_r);
    vec_r  = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);
    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[r_out] = otdata_r[output[0]];
    dest[1]     = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];

    /* clear the MMX state left by _mm_cvtps_pi32 */
    _mm_empty();
}