extensions/sse-half.c

   1 /* babl - dynamically extendable universal pixel conversion library.
   2  * Copyright (C) 2015 Daniel Sabo
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 3 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General
  15  * Public License along with this library; if not, see
  16  * <https://www.gnu.org/licenses/>.
  17  */
  18
  19 #include "config.h"
  20
  21 #if defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64)
  22
  23 #include <immintrin.h>
  24
  25 #include <stdint.h>
  26 #include <stdlib.h>
  27
  28 #include "babl.h"
  29 #include "babl-cpuaccel.h"
  30 #include "extensions/util.h"
  31
  32 static inline void
  33 conv_yHalf_yF (const Babl     *conversion,
  34                const uint16_t *src,
  35                float          *dst,
  36                long            samples)
  37 {
  38   const uint64_t *s_vec;
  39   __v4sf         *d_vec;
  40
  41   long n = samples;
  42
  43   s_vec = (const uint64_t *)src;
  44   d_vec = (__v4sf *)dst;
  45
  46   while (n >= 4)
  47     {
  48       __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
  49       __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
  50       _mm_storeu_ps((float *)d_vec++, out_val);
  51       n -= 4;
  52     }
  53
  54   src = (const uint16_t *)s_vec;
  55   dst = (float *)d_vec;
  56
  57   while (n)
  58     {
  59       __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
  60       __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
  61       _mm_store_ss(dst++, out_val);
  62       n -= 1;
  63     }
  64 }
  65
  66 static void
  67 conv_yaHalf_yaF (const Babl     *conversion,
  68                  const uint16_t *src,
  69                  float          *dst,
  70                  long            samples)
  71 {
  72   conv_yHalf_yF (conversion, src, dst, samples * 2);
  73 }
  74
  75 static void
  76 conv_rgbHalf_rgbF (const Babl     *conversion,
  77                    const uint16_t *src,
  78                    float          *dst,
  79                    long            samples)
  80 {
  81   conv_yHalf_yF (conversion, src, dst, samples * 3);
  82 }
  83
  84 static void
  85 conv_rgbaHalf_rgbaF (const Babl     *conversion,
  86                      const uint16_t *src,
  87                      float          *dst,
  88                      long            samples)
  89 {
  90   conv_yHalf_yF (conversion, src, dst, samples * 4);
  91 }
  92
  93 static inline void
  94 conv_yF_yHalf (const Babl  *conversion,
  95                const float *src,
  96                uint16_t    *dst,
  97                long         samples)
  98 {
  99   const __v4sf *s_vec;
 100   uint64_t     *d_vec;
 101
 102   long n = samples;
 103
 104   s_vec = (const __v4sf *)src;
 105   d_vec = (uint64_t *)dst;
 106
 107   while (n >= 4)
 108     {
 109       __m128 in_val = _mm_loadu_ps((float *)s_vec++);
 110       __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 111       _mm_storel_epi64((__m128i *)d_vec++, out_val);
 112       n -= 4;
 113     }
 114
 115   src = (const float *)s_vec;
 116   dst = (uint16_t *)d_vec;
 117
 118   while (n)
 119     {
 120       __m128 in_val = _mm_load_ss(src++);
 121       __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 122       *dst++ = _mm_extract_epi16(out_val, 0);
 123       n -= 1;
 124     }
 125 }
 126
 127 static void
 128 conv_yaF_yaHalf (const Babl  *conversion,
 129                  const float *src,
 130                  uint16_t    *dst,
 131                  long         samples)
 132 {
 133   conv_yF_yHalf (conversion, src, dst, samples * 2);
 134 }
 135
 136 #define conv_yAF_yAHalf conv_yaF_yaHalf
 137 #define conv_yAHalf_yAF conv_yaHalf_yaF
 138
 139 static void
 140 conv_rgbF_rgbHalf (const Babl  *conversion,
 141                    const float *src,
 142                    uint16_t    *dst,
 143                    long         samples)
 144 {
 145   conv_yF_yHalf (conversion, src, dst, samples * 3);
 146 }
 147
 148 static void
 149 conv_rgbaF_rgbaHalf (const Babl  *conversion,
 150                      const float *src,
 151                      uint16_t    *dst,
 152                      long         samples)
 153 {
 154   conv_yF_yHalf (conversion, src, dst, samples * 4);
 155 }
 156
 157 #endif /* defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64) */
 158
 159 int init (void);
 160
 161 int
 162 init (void)
 163 {
 164 #if defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64)
 165   const Babl *rgbaF_linear = babl_format_new (
 166     babl_model ("RGBA"),
 167     babl_type ("float"),
 168     babl_component ("R"),
 169     babl_component ("G"),
 170     babl_component ("B"),
 171     babl_component ("A"),
 172     NULL);
 173   const Babl *rgbaHalf_linear = babl_format_new (
 174     babl_model ("RGBA"),
 175     babl_type ("half"),
 176     babl_component ("R"),
 177     babl_component ("G"),
 178     babl_component ("B"),
 179     babl_component ("A"),
 180     NULL);
 181   const Babl *rgbaF_gamma = babl_format_new (
 182     babl_model ("R'G'B'A"),
 183     babl_type ("float"),
 184     babl_component ("R'"),
 185     babl_component ("G'"),
 186     babl_component ("B'"),
 187     babl_component ("A"),
 188     NULL);
 189   const Babl *rgbaHalf_gamma = babl_format_new (
 190     babl_model ("R'G'B'A"),
 191     babl_type ("half"),
 192     babl_component ("R'"),
 193     babl_component ("G'"),
 194     babl_component ("B'"),
 195     babl_component ("A"),
 196     NULL);
 197   const Babl *rgbF_linear = babl_format_new (
 198     babl_model ("RGB"),
 199     babl_type ("float"),
 200     babl_component ("R"),
 201     babl_component ("G"),
 202     babl_component ("B"),
 203     NULL);
 204   const Babl *rgbHalf_linear = babl_format_new (
 205     babl_model ("RGB"),
 206     babl_type ("half"),
 207     babl_component ("R"),
 208     babl_component ("G"),
 209     babl_component ("B"),
 210     NULL);
 211   const Babl *rgbF_gamma = babl_format_new (
 212     babl_model ("R'G'B'"),
 213     babl_type ("float"),
 214     babl_component ("R'"),
 215     babl_component ("G'"),
 216     babl_component ("B'"),
 217     NULL);
 218   const Babl *rgbHalf_gamma = babl_format_new (
 219     babl_model ("R'G'B'"),
 220     babl_type ("half"),
 221     babl_component ("R'"),
 222     babl_component ("G'"),
 223     babl_component ("B'"),
 224     NULL);
 225   const Babl *yaF_linear = babl_format_new (
 226     babl_model ("YA"),
 227     babl_type ("float"),
 228     babl_component ("Y"),
 229     babl_component ("A"),
 230     NULL);
 231   const Babl *yaHalf_linear = babl_format_new (
 232     babl_model ("YA"),
 233     babl_type ("half"),
 234     babl_component ("Y"),
 235     babl_component ("A"),
 236     NULL);
 237   const Babl *yaF_gamma = babl_format_new (
 238     babl_model ("Y'A"),
 239     babl_type ("float"),
 240     babl_component ("Y'"),
 241     babl_component ("A"),
 242     NULL);
 243   const Babl *yaHalf_gamma = babl_format_new (
 244     babl_model ("Y'A"),
 245     babl_type ("half"),
 246     babl_component ("Y'"),
 247     babl_component ("A"),
 248     NULL);
 249   const Babl *yAF_linear = babl_format_new (
 250     babl_model ("YaA"),
 251     babl_type ("float"),
 252     babl_component ("Ya"),
 253     babl_component ("A"),
 254     NULL);
 255   const Babl *yAHalf_linear = babl_format_new (
 256     babl_model ("YaA"),
 257     babl_type ("half"),
 258     babl_component ("Ya"),
 259     babl_component ("A"),
 260     NULL);
 261   const Babl *yAF_gamma = babl_format_new (
 262     babl_model ("Y'aA"),
 263     babl_type ("float"),
 264     babl_component ("Y'a"),
 265     babl_component ("A"),
 266     NULL);
 267   const Babl *yAHalf_gamma = babl_format_new (
 268     babl_model ("Y'aA"),
 269     babl_type ("half"),
 270     babl_component ("Y'a"),
 271     babl_component ("A"),
 272     NULL);
 273   const Babl *yF_linear = babl_format_new (
 274     babl_model ("Y"),
 275     babl_type ("float"),
 276     babl_component ("Y"),
 277     NULL);
 278   const Babl *yHalf_linear = babl_format_new (
 279     babl_model ("Y"),
 280     babl_type ("half"),
 281     babl_component ("Y"),
 282     NULL);
 283   const Babl *yF_gamma = babl_format_new (
 284     babl_model ("Y'"),
 285     babl_type ("float"),
 286     babl_component ("Y'"),
 287     NULL);
 288   const Babl *yHalf_gamma = babl_format_new (
 289     babl_model ("Y'"),
 290     babl_type ("half"),
 291     babl_component ("Y'"),
 292     NULL);
 293
 294 #define CONV(src, dst) \
 295 { \
 296   babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
 297   babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
 298 }
 299
 300   if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1) &&
 301       (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_F16C))
 302     {
 303       CONV(rgbaHalf, rgbaF);
 304       CONV(rgbHalf,  rgbF);
 305       CONV(yaHalf,   yaF);
 306       CONV(yAHalf,   yAF);
 307       CONV(yHalf,    yF);
 308       CONV(rgbaF,    rgbaHalf);
 309       CONV(rgbF,     rgbHalf);
 310       CONV(yaF,      yaHalf);
 311       CONV(yAF,      yAHalf);
 312       CONV(yF,       yHalf);
 313     }
 314
 315 #endif /* defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64) */
 316   return 0;
 317 }
 318