/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 3

#define LOG_POLY_DEGREE 5

/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}

/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}

/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}

/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}

/** Return the scalar sum of the elements of a */
LLVMValueRef
lp_build_sum_vector(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, a, index, "");

   for (i = 1; i < type.length; i++) {
      index = lp_build_const_int32(bld->gallivm, i);
      if (type.floating)
         res = LLVMBuildFAdd(builder, res,
                             LLVMBuildExtractElement(builder,
                                                     a, index, ""),
                             "");
      else
         res = LLVMBuildAdd(builder, res,
                            LLVMBuildExtractElement(builder,
                                                    a, index, ""),
                            "");
   }

   return res;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}

/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(struct gallivm_state *gallivm,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for NaNs.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}

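
/*
 * Scalar reference sketch (not used by the builder) of the disabled
 * exponent trick above: multiply by 2^shift by adding shift to the biased
 * exponent field.  As the XXX comment warns, this is wrong for zero and
 * NaNs.  The helper name is ours.
 */
#include <stdint.h>

static INLINE float
mul_pow2_sketch(float x, unsigned shift)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   u.i += (uint32_t)shift << 23;   /* 23 = number of mantissa bits */
   return u.f;                     /* e.g. mul_pow2_sketch(3.0f, 2) == 12.0f */
}
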
/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}

/**
 * Linear interpolation -- without any checks.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}

/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   }
   else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}

LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}

/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->zero || b == bld->zero)
         return bld->zero;
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}

/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if(a == bld->zero)
         return b;
      if(b == bld->zero)
         return a;
   }

   return lp_build_max_simple(bld, a, b);
}

/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}

/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}

LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}

/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}

/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}

/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};

/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}

static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      assert(type.width*type.length == 128);

      intrinsic = "llvm.x86.sse2.cvtps2dq";

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}

/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}

/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}

/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}

/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}

/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   else if (util_cpu_caps.has_sse4_1 &&
            (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get the sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* half = copysign(0.5, a) */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");

      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }

   return res;
}

/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      res = a;

      if (type.sign) {
         /* Turn the truncation below into a floor for negative inputs */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }

      /* round to nearest (toward zero) */
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
   }

   return res;
}

/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "");
         sign = LLVMBuildNot(builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");

      /* round to nearest (toward zero) */
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
   }

   return res;
}

/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}

/**
 * Generate sqrt(a)
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}

/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo.  It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}

/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}

/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also:
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}

/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rsqrt_refine(bld, a, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

static inline LLVMValueRef
lp_build_const_v4si(struct gallivm_state *gallivm, unsigned long value)
{
   LLVMValueRef element = lp_build_const_int32(gallivm, value);
   LLVMValueRef elements[4] = { element, element, element, element };
   return LLVMConstVector(elements, 4);
}


static inline LLVMValueRef
lp_build_const_v4sf(struct gallivm_state *gallivm, float value)
{
   LLVMValueRef element = lp_build_const_float(gallivm, value);
   LLVMValueRef elements[4] = { element, element, element, element };
   return LLVMConstVector(elements, 4);
}

/**
 * Generate sin(a) using SSE2
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(bld->gallivm->context), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32TypeInContext(bld->gallivm->context), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(bld->gallivm, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_v4si(bld->gallivm, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(gallivm, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(bld->gallivm, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(bld->gallivm, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_v4si(bld->gallivm, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(bld->gallivm, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(bld->gallivm, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(bld->gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(bld->gallivm, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(gallivm, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(gallivm, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(gallivm, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(gallivm, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(gallivm, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(gallivm, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(gallivm, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(gallivm, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(gallivm, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(gallivm, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(gallivm, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial  (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_v4si(bld->gallivm, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");

   return y_result;
}

/**
 * Generate cos(a) using SSE2
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(bld->gallivm->context), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32TypeInContext(bld->gallivm->context), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(bld->gallivm, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(gallivm, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(bld->gallivm, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(bld->gallivm, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_v4si(bld->gallivm, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_v4si(bld->gallivm, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_v4si(bld->gallivm, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(bld->gallivm, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(bld->gallivm, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(bld->gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(bld->gallivm, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(gallivm, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(gallivm, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(gallivm, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(gallivm, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(gallivm, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(gallivm, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(gallivm, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(gallivm, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(gallivm, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(gallivm, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(gallivm, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial  (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");

   return y_result;
}

/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}

/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}

/**
 * Generate log(x)
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res = NULL;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /* Horner scheme, evaluated from the highest coefficient down */
   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (res)
         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
      else
         res = coeff;
   }

   return res;
}


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
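
/*
 * Sanity check (hand arithmetic, not from the original source): with the
 * EXP_POLY_DEGREE == 3 coefficients, evaluating the polynomial at x = 0.5
 * gives 0.999925 + 0.347917 + 0.056517 + 0.009753 ~= 1.41411, against the
 * exact 2**0.5 ~= 1.41421, i.e. an absolute error of roughly 1e-4.
 */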

void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart */
      fpart = LLVMBuildFSub(builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      ipart = LLVMBuildFPToSI(builder, ipart, int_vec_type, "");
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
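
/*
 * Worked example of the decomposition above (illustrative only): for
 * x = 3.5, ipart = 3 and fpart = 0.5.  The integer part is turned into the
 * float 2**3 = 8.0 by building its IEEE-754 bits directly
 * ((3 + 127) << 23), the polynomial approximates 2**0.5 ~= 1.41421, and so
 * res ~= 8.0 * 1.41421 = 11.3137 ~= 2**3.5.
 */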

LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}

/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
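
/*
 * Worked example (illustrative only): for x = 8.0 the IEEE-754 bits are
 * 0x41000000; shifting right by 23 and masking with 255 yields the biased
 * exponent 130, and subtracting (127 - bias) gives 3 + bias, i.e.
 * ifloor(log2(8.0)) plus the requested bias.
 */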

/**
 * Extract the mantissa of an IEEE-754 floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
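
/*
 * Worked example (illustrative only): x = 12.0 has bits 0x41400000.
 * Masking off the exponent leaves the mantissa bits 0x400000; OR-ing in the
 * bits of 1.0 (0x3f800000) gives 0x3fc00000, which reinterprets as 1.5 --
 * exactly 12.0 / 2**3.
 */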

/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
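
/*
 * Sanity check (hand arithmetic, not from the original source): since
 * log2(x)/(x - 1) -> 1/ln(2) as x -> 1, the coefficients should sum to
 * roughly 1.442695 at x = 1.  With LOG_POLY_DEGREE == 5 they sum to
 * ~1.44261, within the expected minimax fit error.
 */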

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
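
/*
 * Worked example of the decomposition above (illustrative only): x = 10.0
 * splits into exponent 3 and mantissa 1.25, so log2(10.0) is computed as
 * 3 + p(1.25) * (1.25 - 1) ~= 3 + 0.3219 = 3.3219, where p() is the minimax
 * fit of log2(m)/(m - 1) over [1, 2[.
 */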

LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}

/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
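
/*
 * Worked example (illustrative only): for x = 10.0, ipart = 3 - 1 = 2 and
 * fpart = 10.0 / 2**3 = 1.25, giving 3.25 against the exact log2(10.0) ~=
 * 3.3219.  For a power of two such as x = 8.0 the result is exactly
 * 2 + 1.0 = 3.0.
 */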

/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5) */
   ipart = lp_build_extract_exponent(bld, x, 0);