// qcms
// Copyright (C) 2009 Mozilla Foundation
// Copyright (C) 2015 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <emmintrin.h>

#include "qcmsint.h"

/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
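/* Note: clamping to CLAMPMAXVAL before scaling by FLOATSCALE keeps the
 * converted result within [0, PRECACHE_OUTPUT_SIZE - 1], i.e. a valid index
 * into the output precache tables. */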

void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length,
                                          qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
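    /* (&input_back[16] points into the middle of the 32-byte scratch buffer;
     * masking with ~0xf rounds it down to a 16-byte boundary while still
     * leaving at least 16 usable bytes.) */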
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
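    /* output_format supplies the byte offsets of red and blue in each output
     * pixel, so the same code path can emit either RGB(A) or BGR(A) order;
     * green always lands at byte 1. */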

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
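        /* (_mm_cvtps_epi32 rounds to the nearest integer under the default
         * MXCSR rounding mode, and the aligned store is safe because `output`
         * was forced onto a 16-byte boundary above.) */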

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}

void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length,
                                           qcms_format_type output_format)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
    unsigned char alpha;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next */
        dest[3] = alpha;
        alpha = src[3];

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[r_out] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}
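
/* Tetrahedral interpolation in the device CLUT: each source pixel is mapped to
 * lattice (grid) coordinates; the fractional parts select one of the six
 * tetrahedra that partition the enclosing lattice cell (chosen by the ordering
 * of rx, ry, rz), and the output is interpolated from that tetrahedron's four
 * lattice points. */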
void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
                                              unsigned char *src,
                                              unsigned char *dest,
                                              size_t length,
                                              qcms_format_type output_format)
{
    const int r_out = output_format.r;
    const int b_out = output_format.b;

    size_t i;

    const int xy_len = 1;
    const int x_len = transform->grid_size;
    const int len = x_len * x_len;

    const __m128 __clut_stride = _mm_set_ps((float)(3 * xy_len), (float)(3 * x_len), (float)(3 * len), 0);
    const __m128 __grid_scaled = _mm_set1_ps((1.0f / 255.0f) * (transform->grid_size - 1));
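    /* __grid_scaled maps 8-bit input [0,255] onto lattice coordinates
     * [0, grid_size - 1]; __clut_stride holds the per-axis lattice strides in
     * floats (each lattice point occupies 3 floats in the CLUT, hence the
     * factor of 3), red varying slowest and blue fastest. */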

    const __m128 __255 = _mm_set1_ps(255.0f);
    const __m128 __one = _mm_set1_ps(1.0f);
    const __m128 __000 = _mm_setzero_ps();

    const float* r_table = transform->r_clut;
    const float* g_table = transform->g_clut;
    const float* b_table = transform->b_clut;

    int i3, i2, i1, i0;

    __m128 c3;
    __m128 c2;
    __m128 c1;
    __m128 c0;

    __m128 in;

    __m128 xyz_r;
    __m128 xyz_0;
    __m128 xyz_n;

    ALIGN float xyz_r_f[4];
    ALIGN int xyz_0_i[4];
    ALIGN int xyz_n_i[4];

    __m128i result;

#define TETRA_SRC_RGB(r, g, b) _mm_set_ps((float)b, (float)g, (float)r, 0.f)
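/* _mm_set_ps() takes its arguments high-lane-first, so the pixel lands in
 * lanes 1..3 as (r, g, b) with lane 0 unused; __clut_stride and the rx/ry/rz,
 * x0.., xn.. index macros below follow the same lane layout. */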

    for (i = 0; i < length; ++i) {
        // compute input point in cube lattice (grid) co-ordinates
        in = _mm_mul_ps(TETRA_SRC_RGB(src[0], src[1], src[2]), __grid_scaled);

        // floor: convert to int (truncate), convert back to float
        xyz_0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));

        // ceil: where in is greater than xyz_0 = floor(in), add 1
        xyz_n = _mm_add_ps(xyz_0, _mm_and_ps(_mm_cmpgt_ps(in, xyz_0), __one));

        // compute the input point relative to the sub-cube origin
        xyz_r = _mm_sub_ps(in, xyz_0);

#define rx (xyz_r_f[1])
#define ry (xyz_r_f[2])
#define rz (xyz_r_f[3])

        _mm_store_ps(xyz_r_f, xyz_r);

#define x0 (xyz_0_i[1])
#define y0 (xyz_0_i[2])
#define z0 (xyz_0_i[3])

        xyz_0 = _mm_mul_ps(xyz_0, __clut_stride);
        _mm_store_si128((__m128i*) xyz_0_i, _mm_cvtps_epi32(xyz_0));

#define xn (xyz_n_i[1])
#define yn (xyz_n_i[2])
#define zn (xyz_n_i[3])

        xyz_n = _mm_mul_ps(xyz_n, __clut_stride);
        _mm_store_si128((__m128i*) xyz_n_i, _mm_cvtps_epi32(xyz_n));

        dest[3] = src[3];
        src += 4;

#define SET_I0_AND_PREFETCH_CLUT() \
        _mm_prefetch((char*)&(r_table[i0 = x0 + y0 + z0]), _MM_HINT_T0)

#if !defined(_MSC_VER)
        SET_I0_AND_PREFETCH_CLUT();
#endif

#define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
        c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
        c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
        c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
        c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
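
        /* The branches below pick the tetrahedron of the lattice cell that
         * contains the input point (one of six, chosen by the ordering of
         * rx, ry, rz), fetch its four lattice points c0..c3, and then convert
         * them in place into the deltas used for the interpolation. */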
        if (rx >= ry) {

#if defined(_MSC_VER)
            SET_I0_AND_PREFETCH_CLUT();
#endif
            if (ry >= rz) {             // rx >= ry && ry >= rz

                i3 = yn + (i1 = xn);
                i1 += i0 - x0;
                i2 = i3 + z0;
                i3 += zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c3 = _mm_sub_ps(c3, c2);
                c2 = _mm_sub_ps(c2, c1);
                c1 = _mm_sub_ps(c1, c0);

            } else if (rx >= rz) {      // rx >= rz && rz >= ry

                i3 = zn + (i1 = xn);
                i1 += i0 - x0;
                i2 = i3 + yn;
                i3 += y0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c2 = _mm_sub_ps(c2, c3);
                c3 = _mm_sub_ps(c3, c1);
                c1 = _mm_sub_ps(c1, c0);

            } else {                    // rz > rx && rx >= ry

                i2 = xn + (i3 = zn);
                i3 += i0 - z0;
                i1 = i2 + y0;
                i2 += yn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c2 = _mm_sub_ps(c2, c1);
                c1 = _mm_sub_ps(c1, c3);
                c3 = _mm_sub_ps(c3, c0);
            }
        } else {

#if defined(_MSC_VER)
            SET_I0_AND_PREFETCH_CLUT();
#endif
            if (rx >= rz) {             // ry > rx && rx >= rz

                i3 = xn + (i2 = yn);
                i2 += i0 - y0;
                i1 = i3 + z0;
                i3 += zn;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c3 = _mm_sub_ps(c3, c1);
                c1 = _mm_sub_ps(c1, c2);
                c2 = _mm_sub_ps(c2, c0);

            } else if (ry >= rz) {      // ry >= rz && rz > rx

                i3 = zn + (i2 = yn);
                i2 += i0 - y0;
                i1 = i3 + xn;
                i3 += x0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c1 = _mm_sub_ps(c1, c3);
                c3 = _mm_sub_ps(c3, c2);
                c2 = _mm_sub_ps(c2, c0);

            } else {                    // rz > ry && ry > rx

                i2 = yn + (i3 = zn);
                i3 += i0 - z0;
                i1 = i2 + xn;
                i2 += x0;

                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);

                c1 = _mm_sub_ps(c1, c2);
                c2 = _mm_sub_ps(c2, c3);
                c3 = _mm_sub_ps(c3, c0);
            }
        }

        // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz

        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(1, 1, 1, 1));
        c1 = _mm_mul_ps(c1, in);
        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(2, 2, 2, 2));
        c2 = _mm_mul_ps(c2, in);
        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(3, 3, 3, 3));
        c3 = _mm_mul_ps(c3, in);

        in = _mm_add_ps(c3, c2);
        in = _mm_add_ps(in, c1);
        in = _mm_add_ps(in, c0);

        // clamp to [0.0..1.0] and scale by 255

        in = _mm_max_ps(in, __000);
        in = _mm_min_ps(in, __one);
        in = _mm_mul_ps(in, __255);

        result = _mm_cvtps_epi32(in); // convert to int (rounding)
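
        /* SSE2 has no 32-bit extract, so read the low 16 bits of each 32-bit
         * lane with _mm_extract_epi16 (even word indices); the values fit in
         * 8 bits after the clamp and scale above. */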
        dest[r_out] = (unsigned char) _mm_extract_epi16(result, 2);
        dest[1] = (unsigned char) _mm_extract_epi16(result, 4);
        dest[b_out] = (unsigned char) _mm_extract_epi16(result, 6);

        dest += 4;
    }
}