babl: fix some annotation to make the function usable in bindings.
[babl.git] / extensions / sse-half.c
blob653d68ffd11414155ad75f8001aab50d0ff44105
1 /* babl - dynamically extendable universal pixel conversion library.
2 * Copyright (C) 2015 Daniel Sabo
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 3 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General
15 * Public License along with this library; if not, see
16 * <https://www.gnu.org/licenses/>.
19 #include "config.h"
21 #if defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64)
23 #include <immintrin.h>
25 #include <stdint.h>
26 #include <stdlib.h>
28 #include "babl.h"
29 #include "babl-cpuaccel.h"
30 #include "extensions/util.h"
32 static inline void
33 conv_yHalf_yF (const Babl *conversion,
34 const uint16_t *src,
35 float *dst,
36 long samples)
38 const uint64_t *s_vec;
39 __v4sf *d_vec;
41 long n = samples;
43 s_vec = (const uint64_t *)src;
44 d_vec = (__v4sf *)dst;
46 while (n >= 4)
48 __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
49 __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
50 _mm_storeu_ps((float *)d_vec++, out_val);
51 n -= 4;
54 src = (const uint16_t *)s_vec;
55 dst = (float *)d_vec;
57 while (n)
59 __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
60 __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
61 _mm_store_ss(dst++, out_val);
62 n -= 1;
66 static void
67 conv_yaHalf_yaF (const Babl *conversion,
68 const uint16_t *src,
69 float *dst,
70 long samples)
72 conv_yHalf_yF (conversion, src, dst, samples * 2);
75 static void
76 conv_rgbHalf_rgbF (const Babl *conversion,
77 const uint16_t *src,
78 float *dst,
79 long samples)
81 conv_yHalf_yF (conversion, src, dst, samples * 3);
84 static void
85 conv_rgbaHalf_rgbaF (const Babl *conversion,
86 const uint16_t *src,
87 float *dst,
88 long samples)
90 conv_yHalf_yF (conversion, src, dst, samples * 4);
93 static inline void
94 conv_yF_yHalf (const Babl *conversion,
95 const float *src,
96 uint16_t *dst,
97 long samples)
99 const __v4sf *s_vec;
100 uint64_t *d_vec;
102 long n = samples;
104 s_vec = (const __v4sf *)src;
105 d_vec = (uint64_t *)dst;
107 while (n >= 4)
109 __m128 in_val = _mm_loadu_ps((float *)s_vec++);
110 __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
111 _mm_storel_epi64((__m128i *)d_vec++, out_val);
112 n -= 4;
115 src = (const float *)s_vec;
116 dst = (uint16_t *)d_vec;
118 while (n)
120 __m128 in_val = _mm_load_ss(src++);
121 __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
122 *dst++ = _mm_extract_epi16(out_val, 0);
123 n -= 1;
127 static void
128 conv_yaF_yaHalf (const Babl *conversion,
129 const float *src,
130 uint16_t *dst,
131 long samples)
133 conv_yF_yHalf (conversion, src, dst, samples * 2);
136 #define conv_yAF_yAHalf conv_yaF_yaHalf
137 #define conv_yAHalf_yAF conv_yaHalf_yaF
139 static void
140 conv_rgbF_rgbHalf (const Babl *conversion,
141 const float *src,
142 uint16_t *dst,
143 long samples)
145 conv_yF_yHalf (conversion, src, dst, samples * 3);
148 static void
149 conv_rgbaF_rgbaHalf (const Babl *conversion,
150 const float *src,
151 uint16_t *dst,
152 long samples)
154 conv_yF_yHalf (conversion, src, dst, samples * 4);
157 #endif /* defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64) */
int init (void);

/* Extension entry point.
 *
 * When the build enables SSE4.1 + F16C on x86-64, this creates the float
 * and half formats for each supported model and, if the running CPU
 * reports both features, registers the half<->float converters for them.
 * In every case the function returns 0.
 */
int
init (void)
{
#if defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64)
  /* Four components: "RGBA" and "R'G'B'A". */
  const Babl *rgbaF_linear = babl_format_new (
    babl_model ("RGBA"), babl_type ("float"),
    babl_component ("R"), babl_component ("G"),
    babl_component ("B"), babl_component ("A"),
    NULL);
  const Babl *rgbaHalf_linear = babl_format_new (
    babl_model ("RGBA"), babl_type ("half"),
    babl_component ("R"), babl_component ("G"),
    babl_component ("B"), babl_component ("A"),
    NULL);
  const Babl *rgbaF_gamma = babl_format_new (
    babl_model ("R'G'B'A"), babl_type ("float"),
    babl_component ("R'"), babl_component ("G'"),
    babl_component ("B'"), babl_component ("A"),
    NULL);
  const Babl *rgbaHalf_gamma = babl_format_new (
    babl_model ("R'G'B'A"), babl_type ("half"),
    babl_component ("R'"), babl_component ("G'"),
    babl_component ("B'"), babl_component ("A"),
    NULL);

  /* Three components: "RGB" and "R'G'B'". */
  const Babl *rgbF_linear = babl_format_new (
    babl_model ("RGB"), babl_type ("float"),
    babl_component ("R"), babl_component ("G"), babl_component ("B"),
    NULL);
  const Babl *rgbHalf_linear = babl_format_new (
    babl_model ("RGB"), babl_type ("half"),
    babl_component ("R"), babl_component ("G"), babl_component ("B"),
    NULL);
  const Babl *rgbF_gamma = babl_format_new (
    babl_model ("R'G'B'"), babl_type ("float"),
    babl_component ("R'"), babl_component ("G'"), babl_component ("B'"),
    NULL);
  const Babl *rgbHalf_gamma = babl_format_new (
    babl_model ("R'G'B'"), babl_type ("half"),
    babl_component ("R'"), babl_component ("G'"), babl_component ("B'"),
    NULL);

  /* Two components: "YA" and "Y'A". */
  const Babl *yaF_linear = babl_format_new (
    babl_model ("YA"), babl_type ("float"),
    babl_component ("Y"), babl_component ("A"),
    NULL);
  const Babl *yaHalf_linear = babl_format_new (
    babl_model ("YA"), babl_type ("half"),
    babl_component ("Y"), babl_component ("A"),
    NULL);
  const Babl *yaF_gamma = babl_format_new (
    babl_model ("Y'A"), babl_type ("float"),
    babl_component ("Y'"), babl_component ("A"),
    NULL);
  const Babl *yaHalf_gamma = babl_format_new (
    babl_model ("Y'A"), babl_type ("half"),
    babl_component ("Y'"), babl_component ("A"),
    NULL);

  /* Two components: "YaA" and "Y'aA". */
  const Babl *yAF_linear = babl_format_new (
    babl_model ("YaA"), babl_type ("float"),
    babl_component ("Ya"), babl_component ("A"),
    NULL);
  const Babl *yAHalf_linear = babl_format_new (
    babl_model ("YaA"), babl_type ("half"),
    babl_component ("Ya"), babl_component ("A"),
    NULL);
  const Babl *yAF_gamma = babl_format_new (
    babl_model ("Y'aA"), babl_type ("float"),
    babl_component ("Y'a"), babl_component ("A"),
    NULL);
  const Babl *yAHalf_gamma = babl_format_new (
    babl_model ("Y'aA"), babl_type ("half"),
    babl_component ("Y'a"), babl_component ("A"),
    NULL);

  /* One component: "Y" and "Y'". */
  const Babl *yF_linear = babl_format_new (
    babl_model ("Y"), babl_type ("float"),
    babl_component ("Y"),
    NULL);
  const Babl *yHalf_linear = babl_format_new (
    babl_model ("Y"), babl_type ("half"),
    babl_component ("Y"),
    NULL);
  const Babl *yF_gamma = babl_format_new (
    babl_model ("Y'"), babl_type ("float"),
    babl_component ("Y'"),
    NULL);
  const Babl *yHalf_gamma = babl_format_new (
    babl_model ("Y'"), babl_type ("half"),
    babl_component ("Y'"),
    NULL);

  /* Register conv_<src>_<dst> for both the _linear and the _gamma format
   * pair of the given names.
   * NOTE(review): the "linear" string is passed for both pairs; it
   * presumably names the kind of conversion callback rather than the
   * encoding -- confirm against babl_conversion_new.
   */
#define CONV(src, dst) \
  do \
    { \
      babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
      babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
    } \
  while (0)

  /* Nothing to register unless the running CPU reports both features. */
  if (!(babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1) ||
      !(babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_F16C))
    {
      return 0;
    }

  /* half -> float */
  CONV(rgbaHalf, rgbaF);
  CONV(rgbHalf,  rgbF);
  CONV(yaHalf,   yaF);
  CONV(yAHalf,   yAF);
  CONV(yHalf,    yF);

  /* float -> half */
  CONV(rgbaF, rgbaHalf);
  CONV(rgbF,  rgbHalf);
  CONV(yaF,   yaHalf);
  CONV(yAF,   yAHalf);
  CONV(yF,    yHalf);

#endif /* defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64) */
  return 0;
}