third_party/qcms/src/transform.c

   1 /* vim: set ts=8 sw=8 noexpandtab: */
   2 //  qcms
   3 //  Copyright (C) 2009 Mozilla Corporation
   4 //  Copyright (C) 1998-2007 Marti Maria
   5 //
   6 // Permission is hereby granted, free of charge, to any person obtaining
   7 // a copy of this software and associated documentation files (the "Software"),
   8 // to deal in the Software without restriction, including without limitation
   9 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10 // and/or sell copies of the Software, and to permit persons to whom the Software
  11 // is furnished to do so, subject to the following conditions:
  12 //
  13 // The above copyright notice and this permission notice shall be included in
  14 // all copies or substantial portions of the Software.
  15 //
  16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  18 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  20 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  22 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23
  24 #include <stdlib.h>
  25 #include <math.h>
  26 #include <assert.h>
  27 #include <string.h> //memcpy
  28 #include "qcmsint.h"
  29 #include "chain.h"
  30 #include "matrix.h"
  31 #include "transform_util.h"
  32
  33 /* for MSVC, GCC, Intel, and Sun compilers */
  34 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
  35 #define X86
  36 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
  37
  38 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
  39 // This is just an approximation, I am not handling all the non-linear
  40 // aspects of the RGB to XYZ process, and assumming that the gamma correction
  41 // has transitive property in the tranformation chain.
  42 //
  43 // the alghoritm:
  44 //
  45 //            - First I build the absolute conversion matrix using
  46 //              primaries in XYZ. This matrix is next inverted
  47 //            - Then I eval the source white point across this matrix
  48 //              obtaining the coeficients of the transformation
  49 //            - Then, I apply these coeficients to the original matrix
  50 static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
  51 {
  52         struct matrix primaries;
  53         struct matrix primaries_invert;
  54         struct matrix result;
  55         struct vector white_point;
  56         struct vector coefs;
  57
  58         double xn, yn;
  59         double xr, yr;
  60         double xg, yg;
  61         double xb, yb;
  62
  63         xn = white.x;
  64         yn = white.y;
  65
  66         if (yn == 0.0)
  67                 return matrix_invalid();
  68
  69         xr = primrs.red.x;
  70         yr = primrs.red.y;
  71         xg = primrs.green.x;
  72         yg = primrs.green.y;
  73         xb = primrs.blue.x;
  74         yb = primrs.blue.y;
  75
  76         primaries.m[0][0] = xr;
  77         primaries.m[0][1] = xg;
  78         primaries.m[0][2] = xb;
  79
  80         primaries.m[1][0] = yr;
  81         primaries.m[1][1] = yg;
  82         primaries.m[1][2] = yb;
  83
  84         primaries.m[2][0] = 1 - xr - yr;
  85         primaries.m[2][1] = 1 - xg - yg;
  86         primaries.m[2][2] = 1 - xb - yb;
  87         primaries.invalid = false;
  88
  89         white_point.v[0] = xn/yn;
  90         white_point.v[1] = 1.;
  91         white_point.v[2] = (1.0-xn-yn)/yn;
  92
  93         primaries_invert = matrix_invert(primaries);
  94
  95         coefs = matrix_eval(primaries_invert, white_point);
  96
  97         result.m[0][0] = coefs.v[0]*xr;
  98         result.m[0][1] = coefs.v[1]*xg;
  99         result.m[0][2] = coefs.v[2]*xb;
 100
 101         result.m[1][0] = coefs.v[0]*yr;
 102         result.m[1][1] = coefs.v[1]*yg;
 103         result.m[1][2] = coefs.v[2]*yb;
 104
 105         result.m[2][0] = coefs.v[0]*(1.-xr-yr);
 106         result.m[2][1] = coefs.v[1]*(1.-xg-yg);
 107         result.m[2][2] = coefs.v[2]*(1.-xb-yb);
 108         result.invalid = primaries_invert.invalid;
 109
 110         return result;
 111 }
 112
 113 struct CIE_XYZ {
 114         double X;
 115         double Y;
 116         double Z;
 117 };
 118
 119 /* CIE Illuminant D50 */
 120 static const struct CIE_XYZ D50_XYZ = {
 121         0.9642,
 122         1.0000,
 123         0.8249
 124 };
 125
 126 /* from lcms: xyY2XYZ()
 127  * corresponds to argyll: icmYxy2XYZ() */
 128 static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
 129 {
 130         struct CIE_XYZ dest;
 131         dest.X = (source.x / source.y) * source.Y;
 132         dest.Y = source.Y;
 133         dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
 134         return dest;
 135 }
 136
 137 /* from lcms: ComputeChromaticAdaption */
 138 // Compute chromatic adaption matrix using chad as cone matrix
 139 static struct matrix
 140 compute_chromatic_adaption(struct CIE_XYZ source_white_point,
 141                            struct CIE_XYZ dest_white_point,
 142                            struct matrix chad)
 143 {
 144         struct matrix chad_inv;
 145         struct vector cone_source_XYZ, cone_source_rgb;
 146         struct vector cone_dest_XYZ, cone_dest_rgb;
 147         struct matrix cone, tmp;
 148
 149         tmp = chad;
 150         chad_inv = matrix_invert(tmp);
 151
 152         cone_source_XYZ.v[0] = source_white_point.X;
 153         cone_source_XYZ.v[1] = source_white_point.Y;
 154         cone_source_XYZ.v[2] = source_white_point.Z;
 155
 156         cone_dest_XYZ.v[0] = dest_white_point.X;
 157         cone_dest_XYZ.v[1] = dest_white_point.Y;
 158         cone_dest_XYZ.v[2] = dest_white_point.Z;
 159
 160         cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
 161         cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
 162
 163         cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
 164         cone.m[0][1] = 0;
 165         cone.m[0][2] = 0;
 166         cone.m[1][0] = 0;
 167         cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
 168         cone.m[1][2] = 0;
 169         cone.m[2][0] = 0;
 170         cone.m[2][1] = 0;
 171         cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
 172         cone.invalid = false;
 173
 174         // Normalize
 175         return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
 176 }
 177
 178 /* from lcms: cmsAdaptionMatrix */
 179 // Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
 180 // Bradford is assumed
 181 static struct matrix
 182 adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
 183 {
 184 #if defined (_MSC_VER)
 185 #pragma warning(push)
 186 /* Disable double to float truncation warning 4305 */
 187 #pragma warning(disable:4305)
 188 #endif
 189         struct matrix lam_rigg = {{ // Bradford matrix
 190                                  {  0.8951,  0.2664, -0.1614 },
 191                                  { -0.7502,  1.7135,  0.0367 },
 192                                  {  0.0389, -0.0685,  1.0296 }
 193                                  }};
 194 #if defined (_MSC_VER)
 195 /* Restore warnings */
 196 #pragma warning(pop)
 197 #endif
 198         return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
 199 }
 200
 201 /* from lcms: cmsAdaptMatrixToD50 */
 202 static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
 203 {
 204         struct CIE_XYZ Dn;
 205         struct matrix Bradford;
 206
 207         if (source_white_pt.y == 0.0)
 208                 return matrix_invalid();
 209
 210         Dn = xyY2XYZ(source_white_pt);
 211
 212         Bradford = adaption_matrix(Dn, D50_XYZ);
 213         return matrix_multiply(Bradford, r);
 214 }
 215
 216 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
 217 {
 218         struct matrix colorants;
 219         colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
 220         colorants = adapt_matrix_to_D50(colorants, white_point);
 221
 222         if (colorants.invalid)
 223                 return false;
 224
 225         /* note: there's a transpose type of operation going on here */
 226         profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
 227         profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
 228         profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
 229
 230         profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
 231         profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
 232         profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
 233
 234         profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
 235         profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
 236         profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
 237
 238         return true;
 239 }
 240
 241 #if 0
 242 static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 243 {
 244         const int r_out = output_format.r;
 245         const int b_out = output_format.b;
 246
 247         int i;
 248         float (*mat)[4] = transform->matrix;
 249         for (i=0; i<length; i++) {
 250                 unsigned char device_r = *src++;
 251                 unsigned char device_g = *src++;
 252                 unsigned char device_b = *src++;
 253
 254                 float linear_r = transform->input_gamma_table_r[device_r];
 255                 float linear_g = transform->input_gamma_table_g[device_g];
 256                 float linear_b = transform->input_gamma_table_b[device_b];
 257
 258                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 259                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 260                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 261
 262                 float out_device_r = pow(out_linear_r, transform->out_gamma_r);
 263                 float out_device_g = pow(out_linear_g, transform->out_gamma_g);
 264                 float out_device_b = pow(out_linear_b, transform->out_gamma_b);
 265
 266                 dest[r_out] = clamp_u8(out_device_r*255);
 267                 dest[1]     = clamp_u8(out_device_g*255);
 268                 dest[b_out] = clamp_u8(out_device_b*255);
 269                 dest += 3;
 270         }
 271 }
 272 #endif
 273
 274 static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 275 {
 276         const int r_out = output_format.r;
 277         const int b_out = output_format.b;
 278
 279         unsigned int i;
 280         for (i = 0; i < length; i++) {
 281                 float out_device_r, out_device_g, out_device_b;
 282                 unsigned char device = *src++;
 283
 284                 float linear = transform->input_gamma_table_gray[device];
 285
 286                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 287                 out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 288                 out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 289
 290                 dest[r_out] = clamp_u8(out_device_r*255);
 291                 dest[1]     = clamp_u8(out_device_g*255);
 292                 dest[b_out] = clamp_u8(out_device_b*255);
 293                 dest += 3;
 294         }
 295 }
 296
 297 /* Alpha is not corrected.
 298    A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
 299    RGB Is?" Tech Memo 17 (December 14, 1998).
 300         See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
 301 */
 302
 303 static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 304 {
 305         const int r_out = output_format.r;
 306         const int b_out = output_format.b;
 307
 308         unsigned int i;
 309         for (i = 0; i < length; i++) {
 310                 float out_device_r, out_device_g, out_device_b;
 311                 unsigned char device = *src++;
 312                 unsigned char alpha = *src++;
 313
 314                 float linear = transform->input_gamma_table_gray[device];
 315
 316                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 317                 out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 318                 out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 319
 320                 dest[r_out] = clamp_u8(out_device_r*255);
 321                 dest[1]     = clamp_u8(out_device_g*255);
 322                 dest[b_out] = clamp_u8(out_device_b*255);
 323                 dest[3]     = alpha;
 324                 dest += 4;
 325         }
 326 }
 327
 328
 329 static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 330 {
 331         const int r_out = output_format.r;
 332         const int b_out = output_format.b;
 333
 334         unsigned int i;
 335         for (i = 0; i < length; i++) {
 336                 unsigned char device = *src++;
 337                 uint16_t gray;
 338
 339                 float linear = transform->input_gamma_table_gray[device];
 340
 341                 /* we could round here... */
 342                 gray = linear * PRECACHE_OUTPUT_MAX;
 343
 344                 dest[r_out] = transform->output_table_r->data[gray];
 345                 dest[1]     = transform->output_table_g->data[gray];
 346                 dest[b_out] = transform->output_table_b->data[gray];
 347                 dest += 3;
 348         }
 349 }
 350
 351
 352 static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 353 {
 354         const int r_out = output_format.r;
 355         const int b_out = output_format.b;
 356
 357         unsigned int i;
 358         for (i = 0; i < length; i++) {
 359                 unsigned char device = *src++;
 360                 unsigned char alpha = *src++;
 361                 uint16_t gray;
 362
 363                 float linear = transform->input_gamma_table_gray[device];
 364
 365                 /* we could round here... */
 366                 gray = linear * PRECACHE_OUTPUT_MAX;
 367
 368                 dest[r_out] = transform->output_table_r->data[gray];
 369                 dest[1]     = transform->output_table_g->data[gray];
 370                 dest[b_out] = transform->output_table_b->data[gray];
 371                 dest[3]     = alpha;
 372                 dest += 4;
 373         }
 374 }
 375
 376 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 377 {
 378         const int r_out = output_format.r;
 379         const int b_out = output_format.b;
 380
 381         unsigned int i;
 382         float (*mat)[4] = transform->matrix;
 383         for (i = 0; i < length; i++) {
 384                 unsigned char device_r = *src++;
 385                 unsigned char device_g = *src++;
 386                 unsigned char device_b = *src++;
 387                 uint16_t r, g, b;
 388
 389                 float linear_r = transform->input_gamma_table_r[device_r];
 390                 float linear_g = transform->input_gamma_table_g[device_g];
 391                 float linear_b = transform->input_gamma_table_b[device_b];
 392
 393                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 394                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 395                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 396
 397                 out_linear_r = clamp_float(out_linear_r);
 398                 out_linear_g = clamp_float(out_linear_g);
 399                 out_linear_b = clamp_float(out_linear_b);
 400
 401                 /* we could round here... */
 402                 r = out_linear_r * PRECACHE_OUTPUT_MAX;
 403                 g = out_linear_g * PRECACHE_OUTPUT_MAX;
 404                 b = out_linear_b * PRECACHE_OUTPUT_MAX;
 405
 406                 dest[r_out] = transform->output_table_r->data[r];
 407                 dest[1]     = transform->output_table_g->data[g];
 408                 dest[b_out] = transform->output_table_b->data[b];
 409                 dest += 3;
 410         }
 411 }
 412
 413 static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 414 {
 415         const int r_out = output_format.r;
 416         const int b_out = output_format.b;
 417
 418         unsigned int i;
 419         float (*mat)[4] = transform->matrix;
 420         for (i = 0; i < length; i++) {
 421                 unsigned char device_r = *src++;
 422                 unsigned char device_g = *src++;
 423                 unsigned char device_b = *src++;
 424                 unsigned char alpha = *src++;
 425                 uint16_t r, g, b;
 426
 427                 float linear_r = transform->input_gamma_table_r[device_r];
 428                 float linear_g = transform->input_gamma_table_g[device_g];
 429                 float linear_b = transform->input_gamma_table_b[device_b];
 430
 431                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 432                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 433                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 434
 435                 out_linear_r = clamp_float(out_linear_r);
 436                 out_linear_g = clamp_float(out_linear_g);
 437                 out_linear_b = clamp_float(out_linear_b);
 438
 439                 /* we could round here... */
 440                 r = out_linear_r * PRECACHE_OUTPUT_MAX;
 441                 g = out_linear_g * PRECACHE_OUTPUT_MAX;
 442                 b = out_linear_b * PRECACHE_OUTPUT_MAX;
 443
 444                 dest[r_out] = transform->output_table_r->data[r];
 445                 dest[1]     = transform->output_table_g->data[g];
 446                 dest[b_out] = transform->output_table_b->data[b];
 447                 dest[3]     = alpha;
 448                 dest += 4;
 449         }
 450 }
 451
 452 // Not used
 453 /*
 454 static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 455 {
 456         const int r_out = output_format.r;
 457         const int b_out = output_format.b;
 458
 459         unsigned int i;
 460         int xy_len = 1;
 461         int x_len = transform->grid_size;
 462         int len = x_len * x_len;
 463         float* r_table = transform->r_clut;
 464         float* g_table = transform->g_clut;
 465         float* b_table = transform->b_clut;
 466
 467         for (i = 0; i < length; i++) {
 468                 unsigned char in_r = *src++;
 469                 unsigned char in_g = *src++;
 470                 unsigned char in_b = *src++;
 471                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 472
 473                 int x = floor(linear_r * (transform->grid_size-1));
 474                 int y = floor(linear_g * (transform->grid_size-1));
 475                 int z = floor(linear_b * (transform->grid_size-1));
 476                 int x_n = ceil(linear_r * (transform->grid_size-1));
 477                 int y_n = ceil(linear_g * (transform->grid_size-1));
 478                 int z_n = ceil(linear_b * (transform->grid_size-1));
 479                 float x_d = linear_r * (transform->grid_size-1) - x;
 480                 float y_d = linear_g * (transform->grid_size-1) - y;
 481                 float z_d = linear_b * (transform->grid_size-1) - z;
 482
 483                 float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
 484                 float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
 485                 float r_y1 = lerp(r_x1, r_x2, y_d);
 486                 float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
 487                 float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
 488                 float r_y2 = lerp(r_x3, r_x4, y_d);
 489                 float clut_r = lerp(r_y1, r_y2, z_d);
 490
 491                 float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
 492                 float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
 493                 float g_y1 = lerp(g_x1, g_x2, y_d);
 494                 float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
 495                 float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
 496                 float g_y2 = lerp(g_x3, g_x4, y_d);
 497                 float clut_g = lerp(g_y1, g_y2, z_d);
 498
 499                 float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
 500                 float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
 501                 float b_y1 = lerp(b_x1, b_x2, y_d);
 502                 float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
 503                 float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
 504                 float b_y2 = lerp(b_x3, b_x4, y_d);
 505                 float clut_b = lerp(b_y1, b_y2, z_d);
 506
 507                 dest[r_out] = clamp_u8(clut_r*255.0f);
 508                 dest[1]     = clamp_u8(clut_g*255.0f);
 509                 dest[b_out] = clamp_u8(clut_b*255.0f);
 510                 dest += 3;
 511         }
 512 }
 513 */
 514
 515 // Using lcms' tetra interpolation algorithm.
 516 static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 517 {
 518         const int r_out = output_format.r;
 519         const int b_out = output_format.b;
 520
 521         unsigned int i;
 522         int xy_len = 1;
 523         int x_len = transform->grid_size;
 524         int len = x_len * x_len;
 525         float* r_table = transform->r_clut;
 526         float* g_table = transform->g_clut;
 527         float* b_table = transform->b_clut;
 528         float c0_r, c1_r, c2_r, c3_r;
 529         float c0_g, c1_g, c2_g, c3_g;
 530         float c0_b, c1_b, c2_b, c3_b;
 531         float clut_r, clut_g, clut_b;
 532         for (i = 0; i < length; i++) {
 533                 unsigned char in_r = *src++;
 534                 unsigned char in_g = *src++;
 535                 unsigned char in_b = *src++;
 536                 unsigned char in_a = *src++;
 537                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 538
 539                 int x = floor(linear_r * (transform->grid_size-1));
 540                 int y = floor(linear_g * (transform->grid_size-1));
 541                 int z = floor(linear_b * (transform->grid_size-1));
 542                 int x_n = ceil(linear_r * (transform->grid_size-1));
 543                 int y_n = ceil(linear_g * (transform->grid_size-1));
 544                 int z_n = ceil(linear_b * (transform->grid_size-1));
 545                 float rx = linear_r * (transform->grid_size-1) - x;
 546                 float ry = linear_g * (transform->grid_size-1) - y;
 547                 float rz = linear_b * (transform->grid_size-1) - z;
 548
 549                 c0_r = CLU(r_table, x, y, z);
 550                 c0_g = CLU(g_table, x, y, z);
 551                 c0_b = CLU(b_table, x, y, z);
 552
 553                 if( rx >= ry ) {
 554                         if (ry >= rz) { //rx >= ry && ry >= rz
 555                                 c1_r = CLU(r_table, x_n, y, z) - c0_r;
 556                                 c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
 557                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 558                                 c1_g = CLU(g_table, x_n, y, z) - c0_g;
 559                                 c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
 560                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 561                                 c1_b = CLU(b_table, x_n, y, z) - c0_b;
 562                                 c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
 563                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 564                         } else {
 565                                 if (rx >= rz) { //rx >= rz && rz >= ry
 566                                         c1_r = CLU(r_table, x_n, y, z) - c0_r;
 567                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 568                                         c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
 569                                         c1_g = CLU(g_table, x_n, y, z) - c0_g;
 570                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 571                                         c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
 572                                         c1_b = CLU(b_table, x_n, y, z) - c0_b;
 573                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 574                                         c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
 575                                 } else { //rz > rx && rx >= ry
 576                                         c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
 577                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 578                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 579                                         c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
 580                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 581                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 582                                         c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
 583                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 584                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 585                                 }
 586                         }
 587                 } else {
 588                         if (rx >= rz) { //ry > rx && rx >= rz
 589                                 c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
 590                                 c2_r = CLU(r_table, x, y_n, z) - c0_r;
 591                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 592                                 c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
 593                                 c2_g = CLU(g_table, x, y_n, z) - c0_g;
 594                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 595                                 c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
 596                                 c2_b = CLU(b_table, x, y_n, z) - c0_b;
 597                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 598                         } else {
 599                                 if (ry >= rz) { //ry >= rz && rz > rx
 600                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 601                                         c2_r = CLU(r_table, x, y_n, z) - c0_r;
 602                                         c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
 603                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 604                                         c2_g = CLU(g_table, x, y_n, z) - c0_g;
 605                                         c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
 606                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 607                                         c2_b = CLU(b_table, x, y_n, z) - c0_b;
 608                                         c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
 609                                 } else { //rz > ry && ry > rx
 610                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 611                                         c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
 612                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 613                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 614                                         c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
 615                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 616                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 617                                         c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
 618                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 619                                 }
 620                         }
 621                 }
 622
 623                 clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
 624                 clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
 625                 clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
 626
 627                 dest[r_out] = clamp_u8(clut_r*255.0f);
 628                 dest[1]     = clamp_u8(clut_g*255.0f);
 629                 dest[b_out] = clamp_u8(clut_b*255.0f);
 630                 dest[3]     = in_a;
 631                 dest += 4;
 632         }
 633 }
 634
 635 // Using lcms' tetra interpolation code.
 636 static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 637 {
 638         const int r_out = output_format.r;
 639         const int b_out = output_format.b;
 640
 641         unsigned int i;
 642         int xy_len = 1;
 643         int x_len = transform->grid_size;
 644         int len = x_len * x_len;
 645         float* r_table = transform->r_clut;
 646         float* g_table = transform->g_clut;
 647         float* b_table = transform->b_clut;
 648         float c0_r, c1_r, c2_r, c3_r;
 649         float c0_g, c1_g, c2_g, c3_g;
 650         float c0_b, c1_b, c2_b, c3_b;
 651         float clut_r, clut_g, clut_b;
 652         for (i = 0; i < length; i++) {
 653                 unsigned char in_r = *src++;
 654                 unsigned char in_g = *src++;
 655                 unsigned char in_b = *src++;
 656                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 657
 658                 int x = floor(linear_r * (transform->grid_size-1));
 659                 int y = floor(linear_g * (transform->grid_size-1));
 660                 int z = floor(linear_b * (transform->grid_size-1));
 661                 int x_n = ceil(linear_r * (transform->grid_size-1));
 662                 int y_n = ceil(linear_g * (transform->grid_size-1));
 663                 int z_n = ceil(linear_b * (transform->grid_size-1));
 664                 float rx = linear_r * (transform->grid_size-1) - x;
 665                 float ry = linear_g * (transform->grid_size-1) - y;
 666                 float rz = linear_b * (transform->grid_size-1) - z;
 667
 668                 c0_r = CLU(r_table, x, y, z);
 669                 c0_g = CLU(g_table, x, y, z);
 670                 c0_b = CLU(b_table, x, y, z);
 671
 672                 if( rx >= ry ) {
 673                         if (ry >= rz) { //rx >= ry && ry >= rz
 674                                 c1_r = CLU(r_table, x_n, y, z) - c0_r;
 675                                 c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
 676                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 677                                 c1_g = CLU(g_table, x_n, y, z) - c0_g;
 678                                 c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
 679                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 680                                 c1_b = CLU(b_table, x_n, y, z) - c0_b;
 681                                 c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
 682                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 683                         } else {
 684                                 if (rx >= rz) { //rx >= rz && rz >= ry
 685                                         c1_r = CLU(r_table, x_n, y, z) - c0_r;
 686                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 687                                         c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
 688                                         c1_g = CLU(g_table, x_n, y, z) - c0_g;
 689                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 690                                         c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
 691                                         c1_b = CLU(b_table, x_n, y, z) - c0_b;
 692                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 693                                         c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
 694                                 } else { //rz > rx && rx >= ry
 695                                         c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
 696                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 697                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 698                                         c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
 699                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 700                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 701                                         c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
 702                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 703                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 704                                 }
 705                         }
 706                 } else {
 707                         if (rx >= rz) { //ry > rx && rx >= rz
 708                                 c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
 709                                 c2_r = CLU(r_table, x, y_n, z) - c0_r;
 710                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 711                                 c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
 712                                 c2_g = CLU(g_table, x, y_n, z) - c0_g;
 713                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 714                                 c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
 715                                 c2_b = CLU(b_table, x, y_n, z) - c0_b;
 716                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 717                         } else {
 718                                 if (ry >= rz) { //ry >= rz && rz > rx
 719                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 720                                         c2_r = CLU(r_table, x, y_n, z) - c0_r;
 721                                         c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
 722                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 723                                         c2_g = CLU(g_table, x, y_n, z) - c0_g;
 724                                         c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
 725                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 726                                         c2_b = CLU(b_table, x, y_n, z) - c0_b;
 727                                         c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
 728                                 } else { //rz > ry && ry > rx
 729                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 730                                         c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
 731                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 732                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 733                                         c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
 734                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 735                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 736                                         c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
 737                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 738                                 }
 739                         }
 740                 }
 741
 742                 clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
 743                 clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
 744                 clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
 745
 746                 dest[r_out] = clamp_u8(clut_r*255.0f);
 747                 dest[1]     = clamp_u8(clut_g*255.0f);
 748                 dest[b_out] = clamp_u8(clut_b*255.0f);
 749                 dest += 3;
 750         }
 751 }
 752
 753 static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 754 {
 755         const int r_out = output_format.r;
 756         const int b_out = output_format.b;
 757
 758         unsigned int i;
 759         float (*mat)[4] = transform->matrix;
 760         for (i = 0; i < length; i++) {
 761                 unsigned char device_r = *src++;
 762                 unsigned char device_g = *src++;
 763                 unsigned char device_b = *src++;
 764                 float out_device_r, out_device_g, out_device_b;
 765
 766                 float linear_r = transform->input_gamma_table_r[device_r];
 767                 float linear_g = transform->input_gamma_table_g[device_g];
 768                 float linear_b = transform->input_gamma_table_b[device_b];
 769
 770                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 771                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 772                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 773
 774                 out_linear_r = clamp_float(out_linear_r);
 775                 out_linear_g = clamp_float(out_linear_g);
 776                 out_linear_b = clamp_float(out_linear_b);
 777
 778                 out_device_r = lut_interp_linear(out_linear_r,
 779                                 transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 780                 out_device_g = lut_interp_linear(out_linear_g,
 781                                 transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 782                 out_device_b = lut_interp_linear(out_linear_b,
 783                                 transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 784
 785                 dest[r_out] = clamp_u8(out_device_r*255);
 786                 dest[1]     = clamp_u8(out_device_g*255);
 787                 dest[b_out] = clamp_u8(out_device_b*255);
 788                 dest += 3;
 789         }
 790 }
 791
 792 static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 793 {
 794         const int r_out = output_format.r;
 795         const int b_out = output_format.b;
 796
 797         unsigned int i;
 798         float (*mat)[4] = transform->matrix;
 799         for (i = 0; i < length; i++) {
 800                 unsigned char device_r = *src++;
 801                 unsigned char device_g = *src++;
 802                 unsigned char device_b = *src++;
 803                 unsigned char alpha = *src++;
 804                 float out_device_r, out_device_g, out_device_b;
 805
 806                 float linear_r = transform->input_gamma_table_r[device_r];
 807                 float linear_g = transform->input_gamma_table_g[device_g];
 808                 float linear_b = transform->input_gamma_table_b[device_b];
 809
 810                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 811                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 812                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 813
 814                 out_linear_r = clamp_float(out_linear_r);
 815                 out_linear_g = clamp_float(out_linear_g);
 816                 out_linear_b = clamp_float(out_linear_b);
 817
 818                 out_device_r = lut_interp_linear(out_linear_r,
 819                                 transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 820                 out_device_g = lut_interp_linear(out_linear_g,
 821                                 transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 822                 out_device_b = lut_interp_linear(out_linear_b,
 823                                 transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 824
 825                 dest[r_out] = clamp_u8(out_device_r*255);
 826                 dest[1]     = clamp_u8(out_device_g*255);
 827                 dest[b_out] = clamp_u8(out_device_b*255);
 828                 dest[3]     = alpha;
 829                 dest += 4;
 830         }
 831 }
 832
 833 #if 0
 834 static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
 835 {
 836         const int r_out = output_format.r;
 837         const int b_out = output_format.b;
 838
 839         int i;
 840         float (*mat)[4] = transform->matrix;
 841         for (i = 0; i < length; i++) {
 842                 unsigned char device_r = *src++;
 843                 unsigned char device_g = *src++;
 844                 unsigned char device_b = *src++;
 845
 846                 float linear_r = transform->input_gamma_table_r[device_r];
 847                 float linear_g = transform->input_gamma_table_g[device_g];
 848                 float linear_b = transform->input_gamma_table_b[device_b];
 849
 850                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 851                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 852                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 853
 854                 dest[r_out] = clamp_u8(out_linear_r*255);
 855                 dest[1]     = clamp_u8(out_linear_g*255);
 856                 dest[b_out] = clamp_u8(out_linear_b*255);
 857                 dest += 3;
 858         }
 859 }
 860 #endif
 861
 862 /*
 863  * If users create and destroy objects on different threads, even if the same
 864  * objects aren't used on different threads at the same time, we can still run
 865  * in to trouble with refcounts if they aren't atomic.
 866  *
 867  * This can lead to us prematurely deleting the precache if threads get unlucky
 868  * and write the wrong value to the ref count.
 869  */
 870 static struct precache_output *precache_reference(struct precache_output *p)
 871 {
 872         qcms_atomic_increment(p->ref_count);
 873         return p;
 874 }
 875
 876 static struct precache_output *precache_create()
 877 {
 878         struct precache_output *p = malloc(sizeof(struct precache_output));
 879         if (p)
 880                 p->ref_count = 1;
 881         return p;
 882 }
 883
 884 void precache_release(struct precache_output *p)
 885 {
 886         if (qcms_atomic_decrement(p->ref_count) == 0) {
 887                 free(p);
 888         }
 889 }
 890
 891 #ifdef HAVE_POSIX_MEMALIGN
 892 static qcms_transform *transform_alloc(void)
 893 {
 894         qcms_transform *t;
 895         if (!posix_memalign(&t, 16, sizeof(*t))) {
 896                 return t;
 897         } else {
 898                 return NULL;
 899         }
 900 }
 901 static void transform_free(qcms_transform *t)
 902 {
 903         free(t);
 904 }
 905 #else
 906 static qcms_transform *transform_alloc(void)
 907 {
 908         /* transform needs to be aligned on a 16byte boundrary */
 909         char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
 910         /* make room for a pointer to the block returned by calloc */
 911         void *transform_start = original_block + sizeof(void*);
 912         /* align transform_start */
 913         qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
 914
 915         /* store a pointer to the block returned by calloc so that we can free it later */
 916         void **(original_block_ptr) = (void**)transform_aligned;
 917         if (!original_block)
 918                 return NULL;
 919         original_block_ptr--;
 920         *original_block_ptr = original_block;
 921
 922         return transform_aligned;
 923 }
 924 static void transform_free(qcms_transform *t)
 925 {
 926         /* get at the pointer to the unaligned block returned by calloc */
 927         void **p = (void**)t;
 928         p--;
 929         free(*p);
 930 }
 931 #endif
 932
 933 void qcms_transform_release(qcms_transform *t)
 934 {
 935         /* ensure we only free the gamma tables once even if there are
 936          * multiple references to the same data */
 937
 938         if (t->output_table_r)
 939                 precache_release(t->output_table_r);
 940         if (t->output_table_g)
 941                 precache_release(t->output_table_g);
 942         if (t->output_table_b)
 943                 precache_release(t->output_table_b);
 944
 945         free(t->input_gamma_table_r);
 946         if (t->input_gamma_table_g != t->input_gamma_table_r)
 947                 free(t->input_gamma_table_g);
 948         if (t->input_gamma_table_g != t->input_gamma_table_r &&
 949             t->input_gamma_table_g != t->input_gamma_table_b)
 950                 free(t->input_gamma_table_b);
 951
 952         free(t->input_gamma_table_gray);
 953
 954         free(t->output_gamma_lut_r);
 955         free(t->output_gamma_lut_g);
 956         free(t->output_gamma_lut_b);
 957
 958         transform_free(t);
 959 }
 960
 961 #ifdef X86
 962 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
 963 // mozilla/jpeg)
 964  // -------------------------------------------------------------------------
 965 #if defined(_M_IX86) && defined(_MSC_VER)
 966 #define HAS_CPUID
 967 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
 968    register - I'm not sure if that ever happens on windows, but cpuid isn't
 969    on the critical path so we just preserve the register to be safe and to be
 970    consistent with the non-windows version. */
 971 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
 972        uint32_t a_, b_, c_, d_;
 973        __asm {
 974               xchg   ebx, esi
 975               mov    eax, fxn
 976               cpuid
 977               mov    a_, eax
 978               mov    b_, ebx
 979               mov    c_, ecx
 980               mov    d_, edx
 981               xchg   ebx, esi
 982        }
 983        *a = a_;
 984        *b = b_;
 985        *c = c_;
 986        *d = d_;
 987 }
 988 #elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
 989 #define HAS_CPUID
 990 /* Get us a CPUID function. We can't use ebx because it's the PIC register on
 991    some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
 992 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
 993
 994         uint32_t a_, b_, c_, d_;
 995        __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
 996                              : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
 997            *a = a_;
 998            *b = b_;
 999            *c = c_;
1000            *d = d_;
1001 }
1002 #endif
1003
1004 // -------------------------Runtime SSEx Detection-----------------------------
1005
1006 /* MMX is always supported per
1007  *  Gecko v1.9.1 minimum CPU requirements */
1008 #define SSE1_EDX_MASK (1UL << 25)
1009 #define SSE2_EDX_MASK (1UL << 26)
1010 #define SSE3_ECX_MASK (1UL <<  0)
1011
1012 static int sse_version_available(void)
1013 {
1014 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
1015         /* we know at build time that 64-bit CPUs always have SSE2
1016          * this tells the compiler that non-SSE2 branches will never be
1017          * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
1018         return 2;
1019 #elif defined(HAS_CPUID)
1020         static int sse_version = -1;
1021         uint32_t a, b, c, d;
1022         uint32_t function = 0x00000001;
1023
1024         if (sse_version == -1) {
1025                 sse_version = 0;
1026                 cpuid(function, &a, &b, &c, &d);
1027                 if (c & SSE3_ECX_MASK)
1028                         sse_version = 3;
1029                 else if (d & SSE2_EDX_MASK)
1030                         sse_version = 2;
1031                 else if (d & SSE1_EDX_MASK)
1032                         sse_version = 1;
1033         }
1034
1035         return sse_version;
1036 #else
1037         return 0;
1038 #endif
1039 }
1040 #endif
1041
1042 static const struct matrix bradford_matrix = {{ { 0.8951f, 0.2664f,-0.1614f},
1043                                                 {-0.7502f, 1.7135f, 0.0367f},
1044                                                 { 0.0389f,-0.0685f, 1.0296f}},
1045                                                 false};
1046
1047 static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
1048                                                     { 0.4323053f, 0.5183603f, 0.0492912f},
1049                                                     {-0.0085287f, 0.0400428f, 0.9684867f}},
1050                                                     false};
1051
1052 // See ICCv4 E.3
1053 struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
1054         float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
1055                   (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
1056         float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
1057                   (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
1058         float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
1059                   (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
1060         struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
1061         return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
1062 }
1063
1064 void qcms_profile_precache_output_transform(qcms_profile *profile)
1065 {
1066         /* we only support precaching on rgb profiles */
1067         if (profile->color_space != RGB_SIGNATURE)
1068                 return;
1069
1070         if (qcms_supports_iccv4) {
1071                 /* don't precache since we will use the B2A LUT */
1072                 if (profile->B2A0)
1073                         return;
1074
1075                 /* don't precache since we will use the mBA LUT */
1076                 if (profile->mBA)
1077                         return;
1078         }
1079
1080         /* don't precache if we do not have the TRC curves */
1081         if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
1082                 return;
1083
1084         if (!profile->output_table_r) {
1085                 profile->output_table_r = precache_create();
1086                 if (profile->output_table_r &&
1087                                 !compute_precache(profile->redTRC, profile->output_table_r->data)) {
1088                         precache_release(profile->output_table_r);
1089                         profile->output_table_r = NULL;
1090                 }
1091         }
1092         if (!profile->output_table_g) {
1093                 profile->output_table_g = precache_create();
1094                 if (profile->output_table_g &&
1095                                 !compute_precache(profile->greenTRC, profile->output_table_g->data)) {
1096                         precache_release(profile->output_table_g);
1097                         profile->output_table_g = NULL;
1098                 }
1099         }
1100         if (!profile->output_table_b) {
1101                 profile->output_table_b = precache_create();
1102                 if (profile->output_table_b &&
1103                                 !compute_precache(profile->blueTRC, profile->output_table_b->data)) {
1104                         precache_release(profile->output_table_b);
1105                         profile->output_table_b = NULL;
1106                 }
1107         }
1108 }
1109
1110 /* Replace the current transformation with a LUT transformation using a given number of sample points */
1111 qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
1112                                                  int samples, qcms_data_type in_type)
1113 {
1114         /* The range between which 2 consecutive sample points can be used to interpolate */
1115         uint16_t x,y,z;
1116         uint32_t l;
1117         uint32_t lutSize = 3 * samples * samples * samples;
1118         float* src = NULL;
1119         float* dest = NULL;
1120         float* lut = NULL;
1121
1122         src = malloc(lutSize*sizeof(float));
1123         dest = malloc(lutSize*sizeof(float));
1124
1125         if (src && dest) {
1126                 /* Prepare a list of points we want to sample */
1127                 l = 0;
1128                 for (x = 0; x < samples; x++) {
1129                         for (y = 0; y < samples; y++) {
1130                                 for (z = 0; z < samples; z++) {
1131                                         src[l++] = x / (float)(samples-1);
1132                                         src[l++] = y / (float)(samples-1);
1133                                         src[l++] = z / (float)(samples-1);
1134                                 }
1135                         }
1136                 }
1137
1138                 lut = qcms_chain_transform(in, out, src, dest, lutSize);
1139                 if (lut) {
1140                         transform->r_clut = &lut[0];
1141                         transform->g_clut = &lut[1];
1142                         transform->b_clut = &lut[2];
1143                         transform->grid_size = samples;
1144                         if (in_type == QCMS_DATA_RGBA_8) {
1145                                 transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
1146                         } else {
1147                                 transform->transform_fn = qcms_transform_data_tetra_clut;
1148                         }
1149                 }
1150         }
1151
1152
1153         //XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
1154         if (src && lut != src) {
1155                 free(src);
1156         }
1157         if (dest && lut != dest) {
1158                 free(dest);
1159         }
1160
1161         if (lut == NULL) {
1162                 return NULL;
1163         }
1164         return transform;
1165 }
1166
1167 #define NO_MEM_TRANSFORM NULL
1168
1169 qcms_transform* qcms_transform_create(
1170                 qcms_profile *in, qcms_data_type in_type,
1171                 qcms_profile *out, qcms_data_type out_type,
1172                 qcms_intent intent)
1173 {
1174         bool precache = false;
1175
1176         qcms_transform *transform = transform_alloc();
1177         if (!transform) {
1178                 return NULL;
1179         }
1180         if (out_type != QCMS_DATA_RGB_8 &&
1181                 out_type != QCMS_DATA_RGBA_8) {
1182             assert(0 && "output type");
1183             transform_free(transform);
1184             return NULL;
1185         }
1186
1187         if (out->output_table_r &&
1188                         out->output_table_g &&
1189                         out->output_table_b) {
1190                 precache = true;
1191         }
1192
1193         if (qcms_supports_iccv4 && (in->A2B0 || out->B2A0 || in->mAB || out->mAB)) {
1194                 // Precache the transformation to a CLUT 33x33x33 in size.
1195                 // 33 is used by many profiles and works well in pratice.
1196                 // This evenly divides 256 into blocks of 8x8x8.
1197                 // TODO For transforming small data sets of about 200x200 or less
1198                 // precaching should be avoided.
1199                 qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
1200                 if (!result) {
1201                         assert(0 && "precacheLUT failed");
1202                         transform_free(transform);
1203                         return NULL;
1204                 }
1205                 return result;
1206         }
1207
1208         if (precache) {
1209                 transform->output_table_r = precache_reference(out->output_table_r);
1210                 transform->output_table_g = precache_reference(out->output_table_g);
1211                 transform->output_table_b = precache_reference(out->output_table_b);
1212         } else {
1213                 if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
1214                         qcms_transform_release(transform);
1215                         return NO_MEM_TRANSFORM;
1216                 }
1217                 build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
1218                 build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
1219                 build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
1220                 if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
1221                         qcms_transform_release(transform);
1222                         return NO_MEM_TRANSFORM;
1223                 }
1224         }
1225
1226         if (in->color_space == RGB_SIGNATURE) {
1227                 struct matrix in_matrix, out_matrix, result;
1228
1229                 if (in_type != QCMS_DATA_RGB_8 &&
1230                     in_type != QCMS_DATA_RGBA_8){
1231                         assert(0 && "input type");
1232                         transform_free(transform);
1233                         return NULL;
1234                 }
1235                 if (precache) {
1236 #if defined(SSE2_ENABLE) && defined(X86)
1237                     if (sse_version_available() >= 2) {
1238                             if (in_type == QCMS_DATA_RGB_8)
1239                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
1240                             else
1241                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
1242
1243 #if defined(SSE2_ENABLE) && !(defined(_MSC_VER) && defined(_M_AMD64))
1244                     /* Microsoft Compiler for x64 doesn't support MMX.
1245                      * SSE code uses MMX so that we disable on x64 */
1246                     } else
1247                     if (sse_version_available() >= 1) {
1248                             if (in_type == QCMS_DATA_RGB_8)
1249                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
1250                             else
1251                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
1252 #endif
1253                     } else
1254 #endif
1255                         {
1256                                 if (in_type == QCMS_DATA_RGB_8)
1257                                         transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
1258                                 else
1259                                         transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
1260                         }
1261                 } else {
1262                         if (in_type == QCMS_DATA_RGB_8)
1263                                 transform->transform_fn = qcms_transform_data_rgb_out_lut;
1264                         else
1265                                 transform->transform_fn = qcms_transform_data_rgba_out_lut;
1266                 }
1267
1268                 //XXX: avoid duplicating tables if we can
1269                 transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
1270                 transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
1271                 transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
1272                 if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
1273                         qcms_transform_release(transform);
1274                         return NO_MEM_TRANSFORM;
1275                 }
1276
1277
1278                 /* build combined colorant matrix */
1279                 in_matrix = build_colorant_matrix(in);
1280                 out_matrix = build_colorant_matrix(out);
1281                 out_matrix = matrix_invert(out_matrix);
1282                 if (out_matrix.invalid) {
1283                         qcms_transform_release(transform);
1284                         return NULL;
1285                 }
1286                 result = matrix_multiply(out_matrix, in_matrix);
1287
1288                 /* store the results in column major mode
1289                  * this makes doing the multiplication with sse easier */
1290                 transform->matrix[0][0] = result.m[0][0];
1291                 transform->matrix[1][0] = result.m[0][1];
1292                 transform->matrix[2][0] = result.m[0][2];
1293                 transform->matrix[0][1] = result.m[1][0];
1294                 transform->matrix[1][1] = result.m[1][1];
1295                 transform->matrix[2][1] = result.m[1][2];
1296                 transform->matrix[0][2] = result.m[2][0];
1297                 transform->matrix[1][2] = result.m[2][1];
1298                 transform->matrix[2][2] = result.m[2][2];
1299
1300         } else if (in->color_space == GRAY_SIGNATURE) {
1301                 if (in_type != QCMS_DATA_GRAY_8 &&
1302                                 in_type != QCMS_DATA_GRAYA_8){
1303                         assert(0 && "input type");
1304                         transform_free(transform);
1305                         return NULL;
1306                 }
1307
1308                 transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
1309                 if (!transform->input_gamma_table_gray) {
1310                         qcms_transform_release(transform);
1311                         return NO_MEM_TRANSFORM;
1312                 }
1313
1314                 if (precache) {
1315                         if (in_type == QCMS_DATA_GRAY_8) {
1316                                 transform->transform_fn = qcms_transform_data_gray_out_precache;
1317                         } else {
1318                                 transform->transform_fn = qcms_transform_data_graya_out_precache;
1319                         }
1320                 } else {
1321                         if (in_type == QCMS_DATA_GRAY_8) {
1322                                 transform->transform_fn = qcms_transform_data_gray_out_lut;
1323                         } else {
1324                                 transform->transform_fn = qcms_transform_data_graya_out_lut;
1325                         }
1326                 }
1327         } else {
1328                 assert(0 && "unexpected colorspace");
1329                 transform_free(transform);
1330                 return NULL;
1331         }
1332         return transform;
1333 }
1334
1335 /* __force_align_arg_pointer__ is an x86-only attribute, and gcc/clang warns on unused
1336  * attributes. Don't use this on ARM or AMD64. __has_attribute can detect the presence
1337  * of the attribute but is currently only supported by clang */
1338 #if defined(__has_attribute)
1339 #define HAS_FORCE_ALIGN_ARG_POINTER __has_attribute(__force_align_arg_pointer__)
1340 #elif defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) && !defined(__arm__) && !defined(__mips__)
1341 #define HAS_FORCE_ALIGN_ARG_POINTER 1
1342 #else
1343 #define HAS_FORCE_ALIGN_ARG_POINTER 0
1344 #endif
1345
1346 #if HAS_FORCE_ALIGN_ARG_POINTER
1347 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
1348 __attribute__((__force_align_arg_pointer__))
1349 #endif
1350 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
1351 {
1352         static const struct _qcms_format_type output_rgbx = { 0, 2 };
1353
1354         transform->transform_fn(transform, src, dest, length, output_rgbx);
1355 }
1356
1357 void qcms_transform_data_type(qcms_transform *transform, void *src, void *dest, size_t length, qcms_output_type type)
1358 {
1359         static const struct _qcms_format_type output_rgbx = { 0, 2 };
1360         static const struct _qcms_format_type output_bgrx = { 2, 0 };
1361
1362         transform->transform_fn(transform, src, dest, length, type == QCMS_OUTPUT_BGRX ? output_bgrx : output_rgbx);
1363 }
1364
1365 qcms_bool qcms_supports_iccv4;
1366 void qcms_enable_iccv4()
1367 {
1368         qcms_supports_iccv4 = true;
1369 }