workbench/libs/mesa/src/gallium/auxiliary/tgsi/tgsi_sse2.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 #include "pipe/p_config.h"
  30
  31 #include "tgsi/tgsi_sse2.h"
  32
  33 #if defined(PIPE_ARCH_X86) && 0 /* See FIXME notes below */
  34
  35 #include "util/u_debug.h"
  36 #include "pipe/p_shader_tokens.h"
  37 #include "util/u_math.h"
  38 #include "util/u_memory.h"
  39 #if defined(PIPE_ARCH_SSE)
  40 #include "util/u_sse.h"
  41 #endif
  42 #include "tgsi/tgsi_info.h"
  43 #include "tgsi/tgsi_parse.h"
  44 #include "tgsi/tgsi_util.h"
  45 #include "tgsi/tgsi_dump.h"
  46 #include "tgsi/tgsi_exec.h"
  47
  48 #include "rtasm/rtasm_x86sse.h"
  49
  50 /* for 1/sqrt()
  51  *
  52  * This costs about 100fps (close to 10%) in gears:
  53  */
  54 #define HIGH_PRECISION 1
  55
  56 #define FAST_MATH 1
  57
  58
  59 #define FOR_EACH_CHANNEL( CHAN )\
  60    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
  61
  62 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
  63    ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
  64
  65 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
  66    if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
  67
  68 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
  69    FOR_EACH_CHANNEL( CHAN )\
  70       IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
  71
  72 #define CHAN_X 0
  73 #define CHAN_Y 1
  74 #define CHAN_Z 2
  75 #define CHAN_W 3
  76
  77 #define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
  78 #define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
  79
  80 #define TEMP_R0   TGSI_EXEC_TEMP_R0
  81 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
  82 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
  83 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
  84
  85
  86 /**
  87  * X86 utility functions.
  88  */
  89
  90 static struct x86_reg
  91 make_xmm(
  92    unsigned xmm )
  93 {
  94    return x86_make_reg(
  95       file_XMM,
  96       (enum x86_reg_name) xmm );
  97 }
  98
  99 /**
 100  * X86 register mapping helpers.
 101  */
 102
 103 static struct x86_reg
 104 get_const_base( void )
 105 {
 106    return x86_make_reg(
 107       file_REG32,
 108       reg_AX );
 109 }
 110
 111 static struct x86_reg
 112 get_machine_base( void )
 113 {
 114    return x86_make_reg(
 115       file_REG32,
 116       reg_CX );
 117 }
 118
 119 static struct x86_reg
 120 get_input_base( void )
 121 {
 122    /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */
 123    return x86_make_disp(
 124       get_machine_base(),
 125       Offset(struct tgsi_exec_machine, Inputs) );
 126 }
 127
 128 static struct x86_reg
 129 get_output_base( void )
 130 {
 131    /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */
 132    return x86_make_disp(
 133       get_machine_base(),
 134       Offset(struct tgsi_exec_machine, Outputs) );
 135 }
 136
 137 static struct x86_reg
 138 get_temp_base( void )
 139 {
 140    return x86_make_disp(
 141       get_machine_base(),
 142       Offset(struct tgsi_exec_machine, Temps) );
 143 }
 144
 145 static struct x86_reg
 146 get_coef_base( void )
 147 {
 148    return x86_make_reg(
 149       file_REG32,
 150       reg_BX );
 151 }
 152
 153 static struct x86_reg
 154 get_sampler_base( void )
 155 {
 156    return x86_make_reg(
 157       file_REG32,
 158       reg_DI );
 159 }
 160
 161 static struct x86_reg
 162 get_immediate_base( void )
 163 {
 164    return x86_make_reg(
 165       file_REG32,
 166       reg_DX );
 167 }
 168
 169 static struct x86_reg
 170 get_system_value_base( void )
 171 {
 172    return x86_make_disp(
 173       get_machine_base(),
 174       Offset(struct tgsi_exec_machine, SystemValue) );
 175 }
 176
 177
 178 /**
 179  * Data access helpers.
 180  */
 181
 182
 183 static struct x86_reg
 184 get_immediate(
 185    unsigned vec,
 186    unsigned chan )
 187 {
 188    return x86_make_disp(
 189       get_immediate_base(),
 190       (vec * 4 + chan) * 4 );
 191 }
 192
 193 static struct x86_reg
 194 get_const(
 195    unsigned vec,
 196    unsigned chan )
 197 {
 198    return x86_make_disp(
 199       get_const_base(),
 200       (vec * 4 + chan) * 4 );
 201 }
 202
 203 static struct x86_reg
 204 get_sampler_ptr(
 205    unsigned unit )
 206 {
 207    return x86_make_disp(
 208       get_sampler_base(),
 209       unit * sizeof( struct tgsi_sampler * ) );
 210 }
 211
 212 static struct x86_reg
 213 get_input(
 214    unsigned vec,
 215    unsigned chan )
 216 {
 217    return x86_make_disp(
 218       get_input_base(),
 219       (vec * 4 + chan) * 16 );
 220 }
 221
 222 static struct x86_reg
 223 get_output(
 224    unsigned vec,
 225    unsigned chan )
 226 {
 227    return x86_make_disp(
 228       get_output_base(),
 229       (vec * 4 + chan) * 16 );
 230 }
 231
 232 static struct x86_reg
 233 get_temp(
 234    unsigned vec,
 235    unsigned chan )
 236 {
 237    return x86_make_disp(
 238       get_temp_base(),
 239       (vec * 4 + chan) * 16 );
 240 }
 241
 242 static struct x86_reg
 243 get_system_value(
 244    unsigned vec,
 245    unsigned chan )
 246 {
 247    return x86_make_disp(
 248       get_system_value_base(), /* base */
 249       (vec * 4 + chan) * 4 );  /* byte offset from base */
 250 }
 251
 252 static struct x86_reg
 253 get_coef(
 254    unsigned vec,
 255    unsigned chan,
 256    unsigned member )
 257 {
 258    return x86_make_disp(
 259       get_coef_base(),
 260       ((vec * 3 + member) * 4 + chan) * 4 );
 261 }
 262
 263
 264 static void
 265 emit_ret(
 266    struct x86_function  *func )
 267 {
 268    x86_ret( func );
 269 }
 270
 271
 272 /**
 273  * Data fetch helpers.
 274  */
 275
 276 /**
 277  * Copy a shader constant to xmm register
 278  * \param xmm  the destination xmm register
 279  * \param vec  the src const buffer index
 280  * \param chan  src channel to fetch (X, Y, Z or W)
 281  */
 282 static void
 283 emit_const(
 284    struct x86_function *func,
 285    uint xmm,
 286    int vec,
 287    uint chan,
 288    uint indirect,
 289    uint indirectFile,
 290    int indirectIndex )
 291 {
 292    if (indirect) {
 293       /* 'vec' is the offset from the address register's value.
 294        * We're loading CONST[ADDR+vec] into an xmm register.
 295        */
 296       struct x86_reg r0 = get_immediate_base();
 297       struct x86_reg r1 = get_coef_base();
 298       uint i;
 299
 300       assert( indirectFile == TGSI_FILE_ADDRESS );
 301       assert( indirectIndex == 0 );
 302       assert( r0.mod == mod_REG );
 303       assert( r1.mod == mod_REG );
 304
 305       x86_push( func, r0 );
 306       x86_push( func, r1 );
 307
 308       /*
 309        * Loop over the four pixels or vertices in the quad.
 310        * Get the value of the address (offset) register for pixel/vertex[i],
 311        * add it to the src offset and index into the constant buffer.
 312        * Note that we're working on SOA data.
 313        * If any of the pixel/vertex execution channels are unused their
 314        * values will be garbage.  It's very important that we don't use
 315        * those garbage values as indexes into the constant buffer since
 316        * that'll cause segfaults.
 317        * The solution is to bitwise-AND the offset with the execution mask
 318        * register whose values are either 0 or ~0.
 319        * The caller must setup the execution mask register to indicate
 320        * which channels are valid/alive before running the shader.
 321        * The execution mask will also figure into loops and conditionals
 322        * someday.
 323        */
 324       for (i = 0; i < QUAD_SIZE; i++) {
 325          /* r1 = address register[i] */
 326          x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
 327          /* r0 = execution mask[i] */
 328          x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
 329          /* r1 = r1 & r0 */
 330          x86_and( func, r1, r0 );
 331          /* r0 = 'vec', the offset */
 332          x86_lea( func, r0, get_const( vec, chan ) );
 333
 334          /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
 335           */
 336          x86_add( func, r1, r1 );
 337          x86_add( func, r1, r1 );
 338          x86_add( func, r1, r1 );
 339          x86_add( func, r1, r1 );
 340
 341          x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
 342          x86_mov( func, r1, x86_deref( r0 ) );
 343          x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
 344       }
 345
 346       x86_pop( func, r1 );
 347       x86_pop( func, r0 );
 348
 349       sse_movaps(
 350          func,
 351          make_xmm( xmm ),
 352          get_temp( TEMP_R0, CHAN_X ) );
 353    }
 354    else {
 355       /* 'vec' is the index into the src register file, such as TEMP[vec] */
 356       assert( vec >= 0 );
 357
 358       sse_movss(
 359          func,
 360          make_xmm( xmm ),
 361          get_const( vec, chan ) );
 362       sse_shufps(
 363          func,
 364          make_xmm( xmm ),
 365          make_xmm( xmm ),
 366          SHUF( 0, 0, 0, 0 ) );
 367    }
 368 }
 369
 370 static void
 371 emit_immediate(
 372    struct x86_function *func,
 373    unsigned xmm,
 374    unsigned vec,
 375    unsigned chan )
 376 {
 377    sse_movss(
 378       func,
 379       make_xmm( xmm ),
 380       get_immediate( vec, chan ) );
 381    sse_shufps(
 382       func,
 383       make_xmm( xmm ),
 384       make_xmm( xmm ),
 385       SHUF( 0, 0, 0, 0 ) );
 386 }
 387
 388
 389 /**
 390  * Copy a shader input to xmm register
 391  * \param xmm  the destination xmm register
 392  * \param vec  the src input attrib
 393  * \param chan  src channel to fetch (X, Y, Z or W)
 394  */
 395 static void
 396 emit_inputf(
 397    struct x86_function *func,
 398    unsigned xmm,
 399    unsigned vec,
 400    unsigned chan )
 401 {
 402    sse_movups(
 403       func,
 404       make_xmm( xmm ),
 405       get_input( vec, chan ) );
 406 }
 407
 408 /**
 409  * Store an xmm register to a shader output
 410  * \param xmm  the source xmm register
 411  * \param vec  the dest output attrib
 412  * \param chan  src dest channel to store (X, Y, Z or W)
 413  */
 414 static void
 415 emit_output(
 416    struct x86_function *func,
 417    unsigned xmm,
 418    unsigned vec,
 419    unsigned chan )
 420 {
 421    sse_movups(
 422       func,
 423       get_output( vec, chan ),
 424       make_xmm( xmm ) );
 425 }
 426
 427 /**
 428  * Copy a shader temporary to xmm register
 429  * \param xmm  the destination xmm register
 430  * \param vec  the src temp register
 431  * \param chan  src channel to fetch (X, Y, Z or W)
 432  */
 433 static void
 434 emit_tempf(
 435    struct x86_function *func,
 436    unsigned xmm,
 437    unsigned vec,
 438    unsigned chan )
 439 {
 440    sse_movaps(
 441       func,
 442       make_xmm( xmm ),
 443       get_temp( vec, chan ) );
 444 }
 445
 446 /**
 447  * Copy a system value to xmm register
 448  * \param xmm  the destination xmm register
 449  * \param vec  the source system value register
 450  * \param chan  src channel to fetch (X, Y, Z or W)
 451  */
 452 static void
 453 emit_system_value(
 454    struct x86_function *func,
 455    unsigned xmm,
 456    unsigned vec,
 457    unsigned chan )
 458 {
 459    sse_movss(
 460       func,
 461       make_xmm( xmm ),
 462       get_system_value( vec, chan ) );
 463    sse_shufps(
 464       func,
 465       make_xmm( xmm ),
 466       make_xmm( xmm ),
 467       SHUF( 0, 0, 0, 0 ) );
 468 }
 469
 470 /**
 471  * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 472  * \param xmm  the destination xmm register
 473  * \param vec  the src input/attribute coefficient index
 474  * \param chan  src channel to fetch (X, Y, Z or W)
 475  * \param member  0=a0, 1=dadx, 2=dady
 476  */
 477 static void
 478 emit_coef(
 479    struct x86_function *func,
 480    unsigned xmm,
 481    unsigned vec,
 482    unsigned chan,
 483    unsigned member )
 484 {
 485    sse_movss(
 486       func,
 487       make_xmm( xmm ),
 488       get_coef( vec, chan, member ) );
 489    sse_shufps(
 490       func,
 491       make_xmm( xmm ),
 492       make_xmm( xmm ),
 493       SHUF( 0, 0, 0, 0 ) );
 494 }
 495
 496 /**
 497  * Data store helpers.
 498  */
 499
 500 static void
 501 emit_inputs(
 502    struct x86_function *func,
 503    unsigned xmm,
 504    unsigned vec,
 505    unsigned chan )
 506 {
 507    sse_movups(
 508       func,
 509       get_input( vec, chan ),
 510       make_xmm( xmm ) );
 511 }
 512
 513 static void
 514 emit_temps(
 515    struct x86_function *func,
 516    unsigned xmm,
 517    unsigned vec,
 518    unsigned chan )
 519 {
 520    sse_movaps(
 521       func,
 522       get_temp( vec, chan ),
 523       make_xmm( xmm ) );
 524 }
 525
 526 static void
 527 emit_addrs(
 528    struct x86_function *func,
 529    unsigned xmm,
 530    unsigned vec,
 531    unsigned chan )
 532 {
 533    assert( vec == 0 );
 534
 535    emit_temps(
 536       func,
 537       xmm,
 538       vec + TGSI_EXEC_TEMP_ADDR,
 539       chan );
 540 }
 541
 542 /**
 543  * Coefficient fetch helpers.
 544  */
 545
 546 static void
 547 emit_coef_a0(
 548    struct x86_function *func,
 549    unsigned xmm,
 550    unsigned vec,
 551    unsigned chan )
 552 {
 553    emit_coef(
 554       func,
 555       xmm,
 556       vec,
 557       chan,
 558       0 );
 559 }
 560
 561 static void
 562 emit_coef_dadx(
 563    struct x86_function *func,
 564    unsigned xmm,
 565    unsigned vec,
 566    unsigned chan )
 567 {
 568    emit_coef(
 569       func,
 570       xmm,
 571       vec,
 572       chan,
 573       1 );
 574 }
 575
 576 static void
 577 emit_coef_dady(
 578    struct x86_function *func,
 579    unsigned xmm,
 580    unsigned vec,
 581    unsigned chan )
 582 {
 583    emit_coef(
 584       func,
 585       xmm,
 586       vec,
 587       chan,
 588       2 );
 589 }
 590
 591 /**
 592  * Function call helpers.
 593  */
 594
 595 /**
 596  * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 597  * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 598  * that the stack pointer is 16 byte aligned, as expected.
 599  */
 600 static void
 601 emit_func_call(
 602    struct x86_function *func,
 603    unsigned xmm_save_mask,
 604    const struct x86_reg *arg,
 605    unsigned nr_args,
 606    void (PIPE_CDECL *code)() )
 607 {
 608    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
 609    unsigned i, n;
 610
 611    x86_push(
 612       func,
 613       x86_make_reg( file_REG32, reg_AX) );
 614    x86_push(
 615       func,
 616       x86_make_reg( file_REG32, reg_CX) );
 617    x86_push(
 618       func,
 619       x86_make_reg( file_REG32, reg_DX) );
 620
 621    /* Store XMM regs to the stack
 622     */
 623    for(i = 0, n = 0; i < 8; ++i)
 624       if(xmm_save_mask & (1 << i))
 625          ++n;
 626
 627    x86_sub_imm(
 628       func,
 629       x86_make_reg( file_REG32, reg_SP ),
 630       n*16);
 631
 632    for(i = 0, n = 0; i < 8; ++i)
 633       if(xmm_save_mask & (1 << i)) {
 634          sse_movups(
 635             func,
 636             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
 637             make_xmm( i ) );
 638          ++n;
 639       }
 640
 641    for (i = 0; i < nr_args; i++) {
 642       /* Load the address of the buffer we use for passing arguments and
 643        * receiving results:
 644        */
 645       x86_lea(
 646          func,
 647          ecx,
 648          arg[i] );
 649
 650       /* Push actual function arguments (currently just the pointer to
 651        * the buffer above), and call the function:
 652        */
 653       x86_push( func, ecx );
 654    }
 655
 656    x86_mov_reg_imm( func, ecx, (unsigned long) code );
 657    x86_call( func, ecx );
 658
 659    /* Pop the arguments (or just add an immediate to esp)
 660     */
 661    for (i = 0; i < nr_args; i++) {
 662       x86_pop(func, ecx );
 663    }
 664
 665    /* Pop the saved XMM regs:
 666     */
 667    for(i = 0, n = 0; i < 8; ++i)
 668       if(xmm_save_mask & (1 << i)) {
 669          sse_movups(
 670             func,
 671             make_xmm( i ),
 672             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
 673          ++n;
 674       }
 675
 676    x86_add_imm(
 677       func,
 678       x86_make_reg( file_REG32, reg_SP ),
 679       n*16);
 680
 681    /* Restore GP registers in a reverse order.
 682     */
 683    x86_pop(
 684       func,
 685       x86_make_reg( file_REG32, reg_DX) );
 686    x86_pop(
 687       func,
 688       x86_make_reg( file_REG32, reg_CX) );
 689    x86_pop(
 690       func,
 691       x86_make_reg( file_REG32, reg_AX) );
 692 }
 693
 694 static void
 695 emit_func_call_dst_src1(
 696    struct x86_function *func,
 697    unsigned xmm_save,
 698    unsigned xmm_dst,
 699    unsigned xmm_src0,
 700    void (PIPE_CDECL *code)() )
 701 {
 702    struct x86_reg store = get_temp( TEMP_R0, 0 );
 703    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 704
 705    /* Store our input parameters (in xmm regs) to the buffer we use
 706     * for passing arguments.  We will pass a pointer to this buffer as
 707     * the actual function argument.
 708     */
 709    sse_movaps(
 710       func,
 711       store,
 712       make_xmm( xmm_src0 ) );
 713
 714    emit_func_call( func,
 715                    xmm_mask,
 716                    &store,
 717                    1,
 718                    code );
 719
 720    sse_movaps(
 721       func,
 722       make_xmm( xmm_dst ),
 723       store );
 724 }
 725
 726
 727 static void
 728 emit_func_call_dst_src2(
 729    struct x86_function *func,
 730    unsigned xmm_save,
 731    unsigned xmm_dst,
 732    unsigned xmm_src0,
 733    unsigned xmm_src1,
 734    void (PIPE_CDECL *code)() )
 735 {
 736    struct x86_reg store = get_temp( TEMP_R0, 0 );
 737    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 738
 739    /* Store two inputs to parameter buffer.
 740     */
 741    sse_movaps(
 742       func,
 743       store,
 744       make_xmm( xmm_src0 ) );
 745
 746    sse_movaps(
 747       func,
 748       x86_make_disp( store, 4 * sizeof(float) ),
 749       make_xmm( xmm_src1 ) );
 750
 751
 752    /* Emit the call
 753     */
 754    emit_func_call( func,
 755                    xmm_mask,
 756                    &store,
 757                    1,
 758                    code );
 759
 760    /* Retrieve the results:
 761     */
 762    sse_movaps(
 763       func,
 764       make_xmm( xmm_dst ),
 765       store );
 766 }
 767
 768
 769
 770
 771
 772 #if defined(PIPE_ARCH_SSE)
 773
 774 /*
 775  * Fast SSE2 implementation of special math functions.
 776  */
 777
 778 #define POLY0(x, c0) _mm_set1_ps(c0)
 779 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
 780 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
 781 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
 782 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
 783 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
 784
 785 #define EXP_POLY_DEGREE 3
 786 #define LOG_POLY_DEGREE 5
 787
 788 /**
 789  * See http://www.devmaster.net/forums/showthread.php?p=43580
 790  */
 791 static INLINE __m128
 792 exp2f4(__m128 x)
 793 {
 794    __m128i ipart;
 795    __m128 fpart, expipart, expfpart;
 796
 797    x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
 798    x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
 799
 800    /* ipart = int(x - 0.5) */
 801    ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
 802
 803    /* fpart = x - ipart */
 804    fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
 805
 806    /* expipart = (float) (1 << ipart) */
 807    expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
 808
 809    /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
 810 #if EXP_POLY_DEGREE == 5
 811    expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
 812 #elif EXP_POLY_DEGREE == 4
 813    expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
 814 #elif EXP_POLY_DEGREE == 3
 815    expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
 816 #elif EXP_POLY_DEGREE == 2
 817    expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
 818 #else
 819 #error
 820 #endif
 821
 822    return _mm_mul_ps(expipart, expfpart);
 823 }
 824
 825
 826 /**
 827  * See http://www.devmaster.net/forums/showthread.php?p=43580
 828  */
 829 static INLINE __m128
 830 log2f4(__m128 x)
 831 {
 832    __m128i expmask = _mm_set1_epi32(0x7f800000);
 833    __m128i mantmask = _mm_set1_epi32(0x007fffff);
 834    __m128 one = _mm_set1_ps(1.0f);
 835
 836    __m128i i = _mm_castps_si128(x);
 837
 838    /* exp = (float) exponent(x) */
 839    __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
 840
 841    /* mant = (float) mantissa(x) */
 842    __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
 843
 844    __m128 logmant;
 845
 846    /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 847     * These coefficients can be generate with
 848     * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 849     */
 850 #if LOG_POLY_DEGREE == 6
 851    logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
 852 #elif LOG_POLY_DEGREE == 5
 853    logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
 854 #elif LOG_POLY_DEGREE == 4
 855    logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
 856 #elif LOG_POLY_DEGREE == 3
 857    logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
 858 #else
 859 #error
 860 #endif
 861
 862    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
 863    logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
 864
 865    return _mm_add_ps(logmant, exp);
 866 }
 867
 868
 869 static INLINE __m128
 870 powf4(__m128 x, __m128 y)
 871 {
 872    return exp2f4(_mm_mul_ps(log2f4(x), y));
 873 }
 874
 875 #endif /* PIPE_ARCH_SSE */
 876
 877
 878
 879 /**
 880  * Low-level instruction translators.
 881  */
 882
 883 static void
 884 emit_abs(
 885    struct x86_function *func,
 886    unsigned xmm )
 887 {
 888    sse_andps(
 889       func,
 890       make_xmm( xmm ),
 891       get_temp(
 892          TGSI_EXEC_TEMP_7FFFFFFF_I,
 893          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
 894 }
 895
 896 static void
 897 emit_add(
 898    struct x86_function *func,
 899    unsigned xmm_dst,
 900    unsigned xmm_src )
 901 {
 902    sse_addps(
 903       func,
 904       make_xmm( xmm_dst ),
 905       make_xmm( xmm_src ) );
 906 }
 907
 908 static void PIPE_CDECL
 909 cos4f(
 910    float *store )
 911 {
 912    store[0] = cosf( store[0] );
 913    store[1] = cosf( store[1] );
 914    store[2] = cosf( store[2] );
 915    store[3] = cosf( store[3] );
 916 }
 917
 918 static void
 919 emit_cos(
 920    struct x86_function *func,
 921    unsigned xmm_save,
 922    unsigned xmm_dst )
 923 {
 924    emit_func_call_dst_src1(
 925       func,
 926       xmm_save,
 927       xmm_dst,
 928       xmm_dst,
 929       cos4f );
 930 }
 931
 932 static void PIPE_CDECL
 933 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 934 __attribute__((force_align_arg_pointer))
 935 #endif
 936 ex24f(
 937    float *store )
 938 {
 939 #if defined(PIPE_ARCH_SSE)
 940    _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 941 #else
 942    store[0] = util_fast_exp2( store[0] );
 943    store[1] = util_fast_exp2( store[1] );
 944    store[2] = util_fast_exp2( store[2] );
 945    store[3] = util_fast_exp2( store[3] );
 946 #endif
 947 }
 948
 949 static void
 950 emit_ex2(
 951    struct x86_function *func,
 952    unsigned xmm_save,
 953    unsigned xmm_dst )
 954 {
 955    emit_func_call_dst_src1(
 956       func,
 957       xmm_save,
 958       xmm_dst,
 959       xmm_dst,
 960       ex24f );
 961 }
 962
 963 static void
 964 emit_f2it(
 965    struct x86_function *func,
 966    unsigned xmm )
 967 {
 968    sse2_cvttps2dq(
 969       func,
 970       make_xmm( xmm ),
 971       make_xmm( xmm ) );
 972 }
 973
 974 static void
 975 emit_i2f(
 976    struct x86_function *func,
 977    unsigned xmm )
 978 {
 979    sse2_cvtdq2ps(
 980       func,
 981       make_xmm( xmm ),
 982       make_xmm( xmm ) );
 983 }
 984
 985 static void PIPE_CDECL
 986 flr4f(
 987    float *store )
 988 {
 989    store[0] = floorf( store[0] );
 990    store[1] = floorf( store[1] );
 991    store[2] = floorf( store[2] );
 992    store[3] = floorf( store[3] );
 993 }
 994
 995 static void
 996 emit_flr(
 997    struct x86_function *func,
 998    unsigned xmm_save,
 999    unsigned xmm_dst )
1000 {
1001    emit_func_call_dst_src1(
1002       func,
1003       xmm_save,
1004       xmm_dst,
1005       xmm_dst,
1006       flr4f );
1007 }
1008
1009 static void PIPE_CDECL
1010 frc4f(
1011    float *store )
1012 {
1013    store[0] -= floorf( store[0] );
1014    store[1] -= floorf( store[1] );
1015    store[2] -= floorf( store[2] );
1016    store[3] -= floorf( store[3] );
1017 }
1018
1019 static void
1020 emit_frc(
1021    struct x86_function *func,
1022    unsigned xmm_save,
1023    unsigned xmm_dst )
1024 {
1025    emit_func_call_dst_src1(
1026       func,
1027       xmm_save,
1028       xmm_dst,
1029       xmm_dst,
1030       frc4f );
1031 }
1032
1033 static void PIPE_CDECL
1034 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1035 __attribute__((force_align_arg_pointer))
1036 #endif
1037 lg24f(
1038    float *store )
1039 {
1040 #if defined(PIPE_ARCH_SSE)
1041    _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
1042 #else
1043    store[0] = util_fast_log2( store[0] );
1044    store[1] = util_fast_log2( store[1] );
1045    store[2] = util_fast_log2( store[2] );
1046    store[3] = util_fast_log2( store[3] );
1047 #endif
1048 }
1049
1050 static void
1051 emit_lg2(
1052    struct x86_function *func,
1053    unsigned xmm_save,
1054    unsigned xmm_dst )
1055 {
1056    emit_func_call_dst_src1(
1057       func,
1058       xmm_save,
1059       xmm_dst,
1060       xmm_dst,
1061       lg24f );
1062 }
1063
1064 static void
1065 emit_MOV(
1066    struct x86_function *func,
1067    unsigned xmm_dst,
1068    unsigned xmm_src )
1069 {
1070    sse_movups(
1071       func,
1072       make_xmm( xmm_dst ),
1073       make_xmm( xmm_src ) );
1074 }
1075
1076 static void
1077 emit_mul (struct x86_function *func,
1078           unsigned xmm_dst,
1079           unsigned xmm_src)
1080 {
1081    sse_mulps(
1082       func,
1083       make_xmm( xmm_dst ),
1084       make_xmm( xmm_src ) );
1085 }
1086
1087 static void
1088 emit_neg(
1089    struct x86_function *func,
1090    unsigned xmm )
1091 {
1092    sse_xorps(
1093       func,
1094       make_xmm( xmm ),
1095       get_temp(
1096          TGSI_EXEC_TEMP_80000000_I,
1097          TGSI_EXEC_TEMP_80000000_C ) );
1098 }
1099
1100 static void PIPE_CDECL
1101 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1102 __attribute__((force_align_arg_pointer))
1103 #endif
1104 pow4f(
1105    float *store )
1106 {
1107 #if defined(PIPE_ARCH_SSE)
1108    _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1109 #else
1110    store[0] = util_fast_pow( store[0], store[4] );
1111    store[1] = util_fast_pow( store[1], store[5] );
1112    store[2] = util_fast_pow( store[2], store[6] );
1113    store[3] = util_fast_pow( store[3], store[7] );
1114 #endif
1115 }
1116
1117 static void
1118 emit_pow(
1119    struct x86_function *func,
1120    unsigned xmm_save,
1121    unsigned xmm_dst,
1122    unsigned xmm_src0,
1123    unsigned xmm_src1 )
1124 {
1125    emit_func_call_dst_src2(
1126       func,
1127       xmm_save,
1128       xmm_dst,
1129       xmm_src0,
1130       xmm_src1,
1131       pow4f );
1132 }
1133
1134 static void
1135 emit_rcp (
1136    struct x86_function *func,
1137    unsigned xmm_dst,
1138    unsigned xmm_src )
1139 {
1140    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1141     * good enough.  Need to either emit a proper divide or use the
1142     * iterative technique described below in emit_rsqrt().
1143     */
1144    sse2_rcpps(
1145       func,
1146       make_xmm( xmm_dst ),
1147       make_xmm( xmm_src ) );
1148 }
1149
1150 static void PIPE_CDECL
1151 rnd4f(
1152    float *store )
1153 {
1154    store[0] = floorf( store[0] + 0.5f );
1155    store[1] = floorf( store[1] + 0.5f );
1156    store[2] = floorf( store[2] + 0.5f );
1157    store[3] = floorf( store[3] + 0.5f );
1158 }
1159
1160 static void
1161 emit_rnd(
1162    struct x86_function *func,
1163    unsigned xmm_save,
1164    unsigned xmm_dst )
1165 {
1166    emit_func_call_dst_src1(
1167       func,
1168       xmm_save,
1169       xmm_dst,
1170       xmm_dst,
1171       rnd4f );
1172 }
1173
/**
 * Emit a packed reciprocal square root: xmm_dst = 1 / sqrt(xmm_src).
 *
 * With HIGH_PRECISION set, the hardware estimate is refined by one
 * Newton-Raphson step.  That path overwrites xmm_src and uses xmm2 and
 * xmm3 as scratch, so xmm_dst and xmm_src must differ from each other
 * and from registers 2 and 3 (asserted).
 *
 * \param func  function being assembled
 * \param xmm_dst  register receiving the result
 * \param xmm_src  register holding the operand
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      const struct x86_reg result = make_xmm(xmm_dst);
      const struct x86_reg operand = make_xmm(xmm_src);
      const struct x86_reg three = make_xmm(2);
      const struct x86_reg estimate = make_xmm(3);
      const struct x86_reg half_const =
         get_temp(TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C);
      const struct x86_reg three_const =
         get_temp(TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C);

      assert(xmm_dst != xmm_src);
      assert(xmm_dst != 2 && xmm_dst != 3);
      assert(xmm_src != 2 && xmm_src != 3);

      sse_movaps(func, result, half_const);    /* result   = 0.5           */
      sse_movaps(func, three, three_const);    /* three    = 3.0           */
      sse_rsqrtps(func, estimate, operand);    /* estimate = rsqrtps(a)    */
      sse_mulps(func, operand, estimate);      /* operand  = a * est       */
      sse_mulps(func, result, estimate);       /* result   = 0.5 * est     */
      sse_mulps(func, operand, estimate);      /* operand  = a * est * est */
      sse_subps(func, three, operand);         /* three    = 3 - a*est*est */
      sse_mulps(func, result, three);          /* result  *= three         */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(func, make_xmm(xmm_dst), make_xmm(xmm_src));
#endif
}
1219
1220 static void
1221 emit_setsign(
1222    struct x86_function *func,
1223    unsigned xmm )
1224 {
1225    sse_orps(
1226       func,
1227       make_xmm( xmm ),
1228       get_temp(
1229          TGSI_EXEC_TEMP_80000000_I,
1230          TGSI_EXEC_TEMP_80000000_C ) );
1231 }
1232
/**
 * Replace each of four packed floats by its sign, in place.
 *
 * store[i] becomes -1.0f when negative, 1.0f when positive and 0.0f
 * otherwise.
 */
static void PIPE_CDECL
sgn4f(
   float *store )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      const float value = store[lane];

      if (value < 0.0f) {
         store[lane] = -1.0f;
      }
      else if (value > 0.0f) {
         store[lane] = 1.0f;
      }
      else {
         store[lane] = 0.0f;
      }
   }
}
1242
1243 static void
1244 emit_sgn(
1245    struct x86_function *func,
1246    unsigned xmm_save,
1247    unsigned xmm_dst )
1248 {
1249    emit_func_call_dst_src1(
1250       func,
1251       xmm_save,
1252       xmm_dst,
1253       xmm_dst,
1254       sgn4f );
1255 }
1256
1257 static void PIPE_CDECL
1258 sin4f(
1259    float *store )
1260 {
1261    store[0] = sinf( store[0] );
1262    store[1] = sinf( store[1] );
1263    store[2] = sinf( store[2] );
1264    store[3] = sinf( store[3] );
1265 }
1266
1267 static void
1268 emit_sin (struct x86_function *func,
1269           unsigned xmm_save,
1270           unsigned xmm_dst)
1271 {
1272    emit_func_call_dst_src1(
1273       func,
1274       xmm_save,
1275       xmm_dst,
1276       xmm_dst,
1277       sin4f );
1278 }
1279
1280 static void
1281 emit_sub(
1282    struct x86_function *func,
1283    unsigned xmm_dst,
1284    unsigned xmm_src )
1285 {
1286    sse_subps(
1287       func,
1288       make_xmm( xmm_dst ),
1289       make_xmm( xmm_src ) );
1290 }
1291
1292 /**
1293  * Register fetch.
1294  */
1295 static void
1296 emit_fetch(
1297    struct x86_function *func,
1298    unsigned xmm,
1299    const struct tgsi_full_src_register *reg,
1300    const unsigned chan_index )
1301 {
1302    unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1303
1304    switch (swizzle) {
1305    case TGSI_SWIZZLE_X:
1306    case TGSI_SWIZZLE_Y:
1307    case TGSI_SWIZZLE_Z:
1308    case TGSI_SWIZZLE_W:
1309       switch (reg->Register.File) {
1310       case TGSI_FILE_CONSTANT:
1311          emit_const(
1312             func,
1313             xmm,
1314             reg->Register.Index,
1315             swizzle,
1316             reg->Register.Indirect,
1317             reg->Indirect.File,
1318             reg->Indirect.Index );
1319          break;
1320
1321       case TGSI_FILE_IMMEDIATE:
1322          emit_immediate(
1323             func,
1324             xmm,
1325             reg->Register.Index,
1326             swizzle );
1327          break;
1328
1329       case TGSI_FILE_SYSTEM_VALUE:
1330          emit_system_value(
1331             func,
1332             xmm,
1333             reg->Register.Index,
1334             swizzle );
1335          break;
1336
1337       case TGSI_FILE_INPUT:
1338          emit_inputf(
1339             func,
1340             xmm,
1341             reg->Register.Index,
1342             swizzle );
1343          break;
1344
1345       case TGSI_FILE_TEMPORARY:
1346          emit_tempf(
1347             func,
1348             xmm,
1349             reg->Register.Index,
1350             swizzle );
1351          break;
1352
1353       default:
1354          assert( 0 );
1355       }
1356       break;
1357
1358    default:
1359       assert( 0 );
1360    }
1361
1362    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1363    case TGSI_UTIL_SIGN_CLEAR:
1364       emit_abs( func, xmm );
1365       break;
1366
1367    case TGSI_UTIL_SIGN_SET:
1368       emit_setsign( func, xmm );
1369       break;
1370
1371    case TGSI_UTIL_SIGN_TOGGLE:
1372       emit_neg( func, xmm );
1373       break;
1374
1375    case TGSI_UTIL_SIGN_KEEP:
1376       break;
1377    }
1378 }
1379
/* Fetch channel CHAN of source operand INDEX of INST into register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_fetch( (FUNC), (XMM), &(INST).Src[(INDEX)], (CHAN) )
1382
1383 /**
1384  * Register store.
1385  */
1386 static void
1387 emit_store(
1388    struct x86_function *func,
1389    unsigned xmm,
1390    const struct tgsi_full_dst_register *reg,
1391    const struct tgsi_full_instruction *inst,
1392    unsigned chan_index )
1393 {
1394    switch( inst->Instruction.Saturate ) {
1395    case TGSI_SAT_NONE:
1396       break;
1397
1398    case TGSI_SAT_ZERO_ONE:
1399       sse_maxps(
1400          func,
1401          make_xmm( xmm ),
1402          get_temp(
1403             TGSI_EXEC_TEMP_00000000_I,
1404             TGSI_EXEC_TEMP_00000000_C ) );
1405
1406       sse_minps(
1407          func,
1408          make_xmm( xmm ),
1409          get_temp(
1410             TGSI_EXEC_TEMP_ONE_I,
1411             TGSI_EXEC_TEMP_ONE_C ) );
1412       break;
1413
1414    case TGSI_SAT_MINUS_PLUS_ONE:
1415       assert( 0 );
1416       break;
1417    }
1418
1419
1420    switch( reg->Register.File ) {
1421    case TGSI_FILE_OUTPUT:
1422       emit_output(
1423          func,
1424          xmm,
1425          reg->Register.Index,
1426          chan_index );
1427       break;
1428
1429    case TGSI_FILE_TEMPORARY:
1430       emit_temps(
1431          func,
1432          xmm,
1433          reg->Register.Index,
1434          chan_index );
1435       break;
1436
1437    case TGSI_FILE_ADDRESS:
1438       emit_addrs(
1439          func,
1440          xmm,
1441          reg->Register.Index,
1442          chan_index );
1443       break;
1444
1445    default:
1446       assert( 0 );
1447    }
1448 }
1449
/* Store register XMM to channel CHAN of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_store( (FUNC), (XMM), &(INST).Dst[(INDEX)], &(INST), (CHAN) )
1452
1453
1454 static void PIPE_CDECL
1455 fetch_texel( struct tgsi_sampler **sampler,
1456              float *store )
1457 {
1458 #if 0
1459    uint j;
1460
1461    debug_printf("%s sampler: %p (%p) store: %p\n",
1462                 __FUNCTION__,
1463                 sampler, *sampler,
1464                 store );
1465
1466    for (j = 0; j < 4; j++)
1467       debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
1468                    j,
1469                    store[0+j],
1470                    store[4+j],
1471                    store[8 + j],
1472                    store[12 + j]);
1473 #endif
1474
1475    {
1476       float rgba[NUM_CHANNELS][QUAD_SIZE];
1477       (*sampler)->get_samples(*sampler,
1478                               &store[0],  /* s */
1479                               &store[4],  /* t */
1480                               &store[8],  /* r */
1481                               &store[12], /* lodbias */
1482                               tgsi_sampler_lod_bias,
1483                               rgba);      /* results */
1484
1485       memcpy( store, rgba, 16 * sizeof(float));
1486    }
1487
1488 #if 0
1489    for (j = 0; j < 4; j++)
1490       debug_printf("sample %d result %f %f %f %f\n",
1491                    j,
1492                    store[0+j],
1493                    store[4+j],
1494                    store[8+j],
1495                    store[12+j]);
1496 #endif
1497 }
1498
1499 /**
1500  * High-level instruction translators.
1501  */
1502 static void
1503 emit_tex( struct x86_function *func,
1504           const struct tgsi_full_instruction *inst,
1505           boolean lodbias,
1506           boolean projected)
1507 {
1508    const uint unit = inst->Src[1].Register.Index;
1509    struct x86_reg args[2];
1510    unsigned count;
1511    unsigned i;
1512
1513    assert(inst->Instruction.Texture);
1514    switch (inst->Texture.Texture) {
1515    case TGSI_TEXTURE_1D:
1516       count = 1;
1517       break;
1518    case TGSI_TEXTURE_2D:
1519    case TGSI_TEXTURE_RECT:
1520    case TGSI_TEXTURE_1D_ARRAY:
1521       count = 2;
1522       break;
1523    case TGSI_TEXTURE_SHADOW1D:
1524    case TGSI_TEXTURE_SHADOW2D:
1525    case TGSI_TEXTURE_SHADOWRECT:
1526    case TGSI_TEXTURE_3D:
1527    case TGSI_TEXTURE_CUBE:
1528    case TGSI_TEXTURE_2D_ARRAY:
1529       count = 3;
1530       break;
1531    default:
1532       assert(0);
1533       return;
1534    }
1535
1536    if (lodbias) {
1537       FETCH( func, *inst, 3, 0, 3 );
1538    }
1539    else {
1540       emit_tempf(
1541          func,
1542          3,
1543          TGSI_EXEC_TEMP_00000000_I,
1544          TGSI_EXEC_TEMP_00000000_C );
1545
1546    }
1547
1548    /* store lodbias whether enabled or not -- fetch_texel currently
1549     * respects it always.
1550     */
1551    sse_movaps( func,
1552                get_temp( TEMP_R0, 3 ),
1553                make_xmm( 3 ) );
1554
1555    if (projected) {
1556       FETCH( func, *inst, 3, 0, 3 );
1557
1558       emit_rcp( func, 3, 3 );
1559    }
1560
1561    for (i = 0; i < count; i++) {
1562       FETCH( func, *inst, i, 0, i );
1563
1564       if (projected) {
1565          sse_mulps(
1566             func,
1567             make_xmm( i ),
1568             make_xmm( 3 ) );
1569       }
1570
1571       /* Store in the argument buffer:
1572        */
1573       sse_movaps(
1574          func,
1575          get_temp( TEMP_R0, i ),
1576          make_xmm( i ) );
1577    }
1578
1579    args[0] = get_temp( TEMP_R0, 0 );
1580    args[1] = get_sampler_ptr( unit );
1581
1582    emit_func_call( func,
1583                    0,
1584                    args,
1585                    Elements(args),
1586                    fetch_texel );
1587
1588    /* If all four channels are enabled, could use a pointer to
1589     * dst[0].x instead of TEMP_R0 for store?
1590     */
1591    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1592
1593       sse_movaps(
1594          func,
1595          make_xmm( 0 ),
1596          get_temp( TEMP_R0, i ) );
1597
1598       STORE( func, *inst, 0, 0, i );
1599    }
1600 }
1601
1602
1603 static void
1604 emit_kil(
1605    struct x86_function *func,
1606    const struct tgsi_full_src_register *reg )
1607 {
1608    unsigned uniquemask;
1609    unsigned unique_count = 0;
1610    unsigned chan_index;
1611    unsigned i;
1612
1613    /* This mask stores component bits that were already tested. Note that
1614     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1615     * tested.
1616     */
1617    uniquemask = 0;
1618
1619    FOR_EACH_CHANNEL( chan_index ) {
1620       unsigned swizzle;
1621
1622       /* unswizzle channel */
1623       swizzle = tgsi_util_get_full_src_register_swizzle(
1624          reg,
1625          chan_index );
1626
1627       /* check if the component has not been already tested */
1628       if( !(uniquemask & (1 << swizzle)) ) {
1629          uniquemask |= 1 << swizzle;
1630
1631          /* allocate register */
1632          emit_fetch(
1633             func,
1634             unique_count++,
1635             reg,
1636             chan_index );
1637       }
1638    }
1639
1640    x86_push(
1641       func,
1642       x86_make_reg( file_REG32, reg_AX ) );
1643    x86_push(
1644       func,
1645       x86_make_reg( file_REG32, reg_DX ) );
1646
1647    for (i = 0 ; i < unique_count; i++ ) {
1648       struct x86_reg dataXMM = make_xmm(i);
1649
1650       sse_cmpps(
1651          func,
1652          dataXMM,
1653          get_temp(
1654             TGSI_EXEC_TEMP_00000000_I,
1655             TGSI_EXEC_TEMP_00000000_C ),
1656          cc_LessThan );
1657
1658       if( i == 0 ) {
1659          sse_movmskps(
1660             func,
1661             x86_make_reg( file_REG32, reg_AX ),
1662             dataXMM );
1663       }
1664       else {
1665          sse_movmskps(
1666             func,
1667             x86_make_reg( file_REG32, reg_DX ),
1668             dataXMM );
1669          x86_or(
1670             func,
1671             x86_make_reg( file_REG32, reg_AX ),
1672             x86_make_reg( file_REG32, reg_DX ) );
1673       }
1674    }
1675
1676    x86_or(
1677       func,
1678       get_temp(
1679          TGSI_EXEC_TEMP_KILMASK_I,
1680          TGSI_EXEC_TEMP_KILMASK_C ),
1681       x86_make_reg( file_REG32, reg_AX ) );
1682
1683    x86_pop(
1684       func,
1685       x86_make_reg( file_REG32, reg_DX ) );
1686    x86_pop(
1687       func,
1688       x86_make_reg( file_REG32, reg_AX ) );
1689 }
1690
1691
/**
 * Placeholder for the KILP opcode: currently emits no code at all.
 */
static void
emit_kilp( struct x86_function *func )
{
   /* XXX todo / fix me */
   (void) func;
}
1698
1699
1700 static void
1701 emit_setcc(
1702    struct x86_function *func,
1703    struct tgsi_full_instruction *inst,
1704    enum sse_cc cc )
1705 {
1706    unsigned chan_index;
1707
1708    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1709       FETCH( func, *inst, 0, 0, chan_index );
1710       FETCH( func, *inst, 1, 1, chan_index );
1711       sse_cmpps(
1712          func,
1713          make_xmm( 0 ),
1714          make_xmm( 1 ),
1715          cc );
1716       sse_andps(
1717          func,
1718          make_xmm( 0 ),
1719          get_temp(
1720             TEMP_ONE_I,
1721             TEMP_ONE_C ) );
1722       STORE( func, *inst, 0, 0, chan_index );
1723    }
1724 }
1725
1726 static void
1727 emit_cmp(
1728    struct x86_function *func,
1729    struct tgsi_full_instruction *inst )
1730 {
1731    unsigned chan_index;
1732
1733    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1734       FETCH( func, *inst, 0, 0, chan_index );
1735       FETCH( func, *inst, 1, 1, chan_index );
1736       FETCH( func, *inst, 2, 2, chan_index );
1737       sse_cmpps(
1738          func,
1739          make_xmm( 0 ),
1740          get_temp(
1741             TGSI_EXEC_TEMP_00000000_I,
1742             TGSI_EXEC_TEMP_00000000_C ),
1743          cc_LessThan );
1744       sse_andps(
1745          func,
1746          make_xmm( 1 ),
1747          make_xmm( 0 ) );
1748       sse_andnps(
1749          func,
1750          make_xmm( 0 ),
1751          make_xmm( 2 ) );
1752       sse_orps(
1753          func,
1754          make_xmm( 0 ),
1755          make_xmm( 1 ) );
1756       STORE( func, *inst, 0, 0, chan_index );
1757    }
1758 }
1759
1760
1761 /**
1762  * Check if inst src/dest regs use indirect addressing into temporary,
1763  * input or output register files.
1764  */
1765 static boolean
1766 indirect_reg_reference(const struct tgsi_full_instruction *inst)
1767 {
1768    uint i;
1769    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1770       const struct tgsi_full_src_register *reg = &inst->Src[i];
1771       if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1772            reg->Register.File == TGSI_FILE_INPUT ||
1773            reg->Register.File == TGSI_FILE_OUTPUT) &&
1774           reg->Register.Indirect)
1775          return TRUE;
1776    }
1777    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1778       const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1779       if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1780            reg->Register.File == TGSI_FILE_INPUT ||
1781            reg->Register.File == TGSI_FILE_OUTPUT) &&
1782           reg->Register.Indirect)
1783          return TRUE;
1784    }
1785    return FALSE;
1786 }
1787
1788
1789 static int
1790 emit_instruction(
1791    struct x86_function *func,
1792    struct tgsi_full_instruction *inst )
1793 {
1794    unsigned chan_index;
1795
1796    /* we can't handle indirect addressing into temp register file yet */
1797    if (indirect_reg_reference(inst))
1798       return FALSE;
1799
1800    switch (inst->Instruction.Opcode) {
1801    case TGSI_OPCODE_ARL:
1802       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1803          FETCH( func, *inst, 0, 0, chan_index );
1804          emit_flr(func, 0, 0);
1805          emit_f2it( func, 0 );
1806          STORE( func, *inst, 0, 0, chan_index );
1807       }
1808       break;
1809
1810    case TGSI_OPCODE_MOV:
1811       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1812          FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1813       }
1814       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1815          STORE( func, *inst, 4 + chan_index, 0, chan_index );
1816       }
1817       break;
1818
1819    case TGSI_OPCODE_LIT:
1820       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1821           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1822          emit_tempf(
1823             func,
1824             0,
1825             TEMP_ONE_I,
1826             TEMP_ONE_C);
1827          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1828             STORE( func, *inst, 0, 0, CHAN_X );
1829          }
1830          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1831             STORE( func, *inst, 0, 0, CHAN_W );
1832          }
1833       }
1834       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1835           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1836          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1837             FETCH( func, *inst, 0, 0, CHAN_X );
1838             sse_maxps(
1839                func,
1840                make_xmm( 0 ),
1841                get_temp(
1842                   TGSI_EXEC_TEMP_00000000_I,
1843                   TGSI_EXEC_TEMP_00000000_C ) );
1844             STORE( func, *inst, 0, 0, CHAN_Y );
1845          }
1846          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1847             /* XMM[1] = SrcReg[0].yyyy */
1848             FETCH( func, *inst, 1, 0, CHAN_Y );
1849             /* XMM[1] = max(XMM[1], 0) */
1850             sse_maxps(
1851                func,
1852                make_xmm( 1 ),
1853                get_temp(
1854                   TGSI_EXEC_TEMP_00000000_I,
1855                   TGSI_EXEC_TEMP_00000000_C ) );
1856             /* XMM[2] = SrcReg[0].wwww */
1857             FETCH( func, *inst, 2, 0, CHAN_W );
1858             /* XMM[2] = min(XMM[2], 128.0) */
1859             sse_minps(
1860                func,
1861                make_xmm( 2 ),
1862                get_temp(
1863                   TGSI_EXEC_TEMP_128_I,
1864                   TGSI_EXEC_TEMP_128_C ) );
1865             /* XMM[2] = max(XMM[2], -128.0) */
1866             sse_maxps(
1867                func,
1868                make_xmm( 2 ),
1869                get_temp(
1870                   TGSI_EXEC_TEMP_MINUS_128_I,
1871                   TGSI_EXEC_TEMP_MINUS_128_C ) );
1872             emit_pow( func, 3, 1, 1, 2 );
1873             FETCH( func, *inst, 0, 0, CHAN_X );
1874             sse_xorps(
1875                func,
1876                make_xmm( 2 ),
1877                make_xmm( 2 ) );
1878             sse_cmpps(
1879                func,
1880                make_xmm( 2 ),
1881                make_xmm( 0 ),
1882                cc_LessThan );
1883             sse_andps(
1884                func,
1885                make_xmm( 2 ),
1886                make_xmm( 1 ) );
1887             STORE( func, *inst, 2, 0, CHAN_Z );
1888          }
1889       }
1890       break;
1891
1892    case TGSI_OPCODE_RCP:
1893       FETCH( func, *inst, 0, 0, CHAN_X );
1894       emit_rcp( func, 0, 0 );
1895       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1896          STORE( func, *inst, 0, 0, chan_index );
1897       }
1898       break;
1899
1900    case TGSI_OPCODE_RSQ:
1901       FETCH( func, *inst, 0, 0, CHAN_X );
1902       emit_abs( func, 0 );
1903       emit_rsqrt( func, 1, 0 );
1904       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1905          STORE( func, *inst, 1, 0, chan_index );
1906       }
1907       break;
1908
1909    case TGSI_OPCODE_EXP:
1910       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1911           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1912           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1913          FETCH( func, *inst, 0, 0, CHAN_X );
1914          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1915              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1916             emit_MOV( func, 1, 0 );
1917             emit_flr( func, 2, 1 );
1918             /* dst.x = ex2(floor(src.x)) */
1919             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1920                emit_MOV( func, 2, 1 );
1921                emit_ex2( func, 3, 2 );
1922                STORE( func, *inst, 2, 0, CHAN_X );
1923             }
1924             /* dst.y = src.x - floor(src.x) */
1925             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1926                emit_MOV( func, 2, 0 );
1927                emit_sub( func, 2, 1 );
1928                STORE( func, *inst, 2, 0, CHAN_Y );
1929             }
1930          }
1931          /* dst.z = ex2(src.x) */
1932          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1933             emit_ex2( func, 3, 0 );
1934             STORE( func, *inst, 0, 0, CHAN_Z );
1935          }
1936       }
1937       /* dst.w = 1.0 */
1938       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1939          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1940          STORE( func, *inst, 0, 0, CHAN_W );
1941       }
1942       break;
1943
1944    case TGSI_OPCODE_LOG:
1945       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1946           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1947           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1948          FETCH( func, *inst, 0, 0, CHAN_X );
1949          emit_abs( func, 0 );
1950          emit_MOV( func, 1, 0 );
1951          emit_lg2( func, 2, 1 );
1952          /* dst.z = lg2(abs(src.x)) */
1953          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1954             STORE( func, *inst, 1, 0, CHAN_Z );
1955          }
1956          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1957              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1958             emit_flr( func, 2, 1 );
1959             /* dst.x = floor(lg2(abs(src.x))) */
1960             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1961                STORE( func, *inst, 1, 0, CHAN_X );
1962             }
1963             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1964             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1965                emit_ex2( func, 2, 1 );
1966                emit_rcp( func, 1, 1 );
1967                emit_mul( func, 0, 1 );
1968                STORE( func, *inst, 0, 0, CHAN_Y );
1969             }
1970          }
1971       }
1972       /* dst.w = 1.0 */
1973       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1974          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1975          STORE( func, *inst, 0, 0, CHAN_W );
1976       }
1977       break;
1978
1979    case TGSI_OPCODE_MUL:
1980       /* do all fetches and adds, storing results in temp regs */
1981       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1982          int r = chan_index + 1;
1983          FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1984          FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1985          emit_mul( func, r, 0 );   /* xmm[r] = xmm[r] * xmm[0] */
1986       }
1987       /* do all stores of the temp regs */
1988       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1989          int r = chan_index + 1;
1990          STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1991       }
1992       break;
1993
1994    case TGSI_OPCODE_ADD:
1995       /* do all fetches and adds, storing results in temp regs */
1996       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1997          int r = chan_index + 1;
1998          FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1999          FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
2000          emit_add( func, r, 0 );   /* xmm[r] = xmm[r] + xmm[0] */
2001       }
2002       /* do all stores of the temp regs */
2003       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2004          int r = chan_index + 1;
2005          STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
2006       }
2007       break;
2008
2009    case TGSI_OPCODE_DP3:
2010       FETCH( func, *inst, 0, 0, CHAN_X );
2011       FETCH( func, *inst, 1, 1, CHAN_X );
2012       emit_mul( func, 0, 1 );
2013       FETCH( func, *inst, 1, 0, CHAN_Y );
2014       FETCH( func, *inst, 2, 1, CHAN_Y );
2015       emit_mul( func, 1, 2 );
2016       emit_add( func, 0, 1 );
2017       FETCH( func, *inst, 1, 0, CHAN_Z );
2018       FETCH( func, *inst, 2, 1, CHAN_Z );
2019       emit_mul( func, 1, 2 );
2020       emit_add( func, 0, 1 );
2021       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2022          STORE( func, *inst, 0, 0, chan_index );
2023       }
2024       break;
2025
2026    case TGSI_OPCODE_DP4:
2027       FETCH( func, *inst, 0, 0, CHAN_X );
2028       FETCH( func, *inst, 1, 1, CHAN_X );
2029       emit_mul( func, 0, 1 );
2030       FETCH( func, *inst, 1, 0, CHAN_Y );
2031       FETCH( func, *inst, 2, 1, CHAN_Y );
2032       emit_mul( func, 1, 2 );
2033       emit_add( func, 0, 1 );
2034       FETCH( func, *inst, 1, 0, CHAN_Z );
2035       FETCH( func, *inst, 2, 1, CHAN_Z );
2036       emit_mul(func, 1, 2 );
2037       emit_add(func, 0, 1 );
2038       FETCH( func, *inst, 1, 0, CHAN_W );
2039       FETCH( func, *inst, 2, 1, CHAN_W );
2040       emit_mul( func, 1, 2 );
2041       emit_add( func, 0, 1 );
2042       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043          STORE( func, *inst, 0, 0, chan_index );
2044       }
2045       break;
2046
2047    case TGSI_OPCODE_DST:
2048       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2049          emit_tempf(
2050             func,
2051             0,
2052             TEMP_ONE_I,
2053             TEMP_ONE_C );
2054          STORE( func, *inst, 0, 0, CHAN_X );
2055       }
2056       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2057          FETCH( func, *inst, 0, 0, CHAN_Y );
2058          FETCH( func, *inst, 1, 1, CHAN_Y );
2059          emit_mul( func, 0, 1 );
2060          STORE( func, *inst, 0, 0, CHAN_Y );
2061       }
2062       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2063          FETCH( func, *inst, 0, 0, CHAN_Z );
2064          STORE( func, *inst, 0, 0, CHAN_Z );
2065       }
2066       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2067          FETCH( func, *inst, 0, 1, CHAN_W );
2068          STORE( func, *inst, 0, 0, CHAN_W );
2069       }
2070       break;
2071
2072    case TGSI_OPCODE_MIN:
2073       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2074          FETCH( func, *inst, 0, 0, chan_index );
2075          FETCH( func, *inst, 1, 1, chan_index );
2076          sse_minps(
2077             func,
2078             make_xmm( 0 ),
2079             make_xmm( 1 ) );
2080          STORE( func, *inst, 0, 0, chan_index );
2081       }
2082       break;
2083
2084    case TGSI_OPCODE_MAX:
2085       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2086          FETCH( func, *inst, 0, 0, chan_index );
2087          FETCH( func, *inst, 1, 1, chan_index );
2088          sse_maxps(
2089             func,
2090             make_xmm( 0 ),
2091             make_xmm( 1 ) );
2092          STORE( func, *inst, 0, 0, chan_index );
2093       }
2094       break;
2095
2096    case TGSI_OPCODE_SLT:
2097       emit_setcc( func, inst, cc_LessThan );
2098       break;
2099
2100    case TGSI_OPCODE_SGE:
2101       emit_setcc( func, inst, cc_NotLessThan );
2102       break;
2103
2104    case TGSI_OPCODE_MAD:
2105       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2106          FETCH( func, *inst, 0, 0, chan_index );
2107          FETCH( func, *inst, 1, 1, chan_index );
2108          FETCH( func, *inst, 2, 2, chan_index );
2109          emit_mul( func, 0, 1 );
2110          emit_add( func, 0, 2 );
2111          STORE( func, *inst, 0, 0, chan_index );
2112       }
2113       break;
2114
2115    case TGSI_OPCODE_SUB:
2116       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117          FETCH( func, *inst, 0, 0, chan_index );
2118          FETCH( func, *inst, 1, 1, chan_index );
2119          emit_sub( func, 0, 1 );
2120          STORE( func, *inst, 0, 0, chan_index );
2121       }
2122       break;
2123
2124    case TGSI_OPCODE_LRP:
2125       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2126          FETCH( func, *inst, 0, 0, chan_index );
2127          FETCH( func, *inst, 1, 1, chan_index );
2128          FETCH( func, *inst, 2, 2, chan_index );
2129          emit_sub( func, 1, 2 );
2130          emit_mul( func, 0, 1 );
2131          emit_add( func, 0, 2 );
2132          STORE( func, *inst, 0, 0, chan_index );
2133       }
2134       break;
2135
2136    case TGSI_OPCODE_CND:
2137       return 0;
2138       break;
2139
2140    case TGSI_OPCODE_DP2A:
2141       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2142       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2143       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2144       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2145       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2146       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2147       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2148       FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
2149       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2150       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2151          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2152       }
2153       break;
2154
2155    case TGSI_OPCODE_FRC:
2156       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2157          FETCH( func, *inst, 0, 0, chan_index );
2158          emit_frc( func, 0, 0 );
2159          STORE( func, *inst, 0, 0, chan_index );
2160       }
2161       break;
2162
2163    case TGSI_OPCODE_CLAMP:
2164       return 0;
2165       break;
2166
2167    case TGSI_OPCODE_FLR:
2168       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2169          FETCH( func, *inst, 0, 0, chan_index );
2170          emit_flr( func, 0, 0 );
2171          STORE( func, *inst, 0, 0, chan_index );
2172       }
2173       break;
2174
2175    case TGSI_OPCODE_ROUND:
2176       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2177          FETCH( func, *inst, 0, 0, chan_index );
2178          emit_rnd( func, 0, 0 );
2179          STORE( func, *inst, 0, 0, chan_index );
2180       }
2181       break;
2182
2183    case TGSI_OPCODE_EX2:
2184       FETCH( func, *inst, 0, 0, CHAN_X );
2185       emit_ex2( func, 0, 0 );
2186       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2187          STORE( func, *inst, 0, 0, chan_index );
2188       }
2189       break;
2190
2191    case TGSI_OPCODE_LG2:
2192       FETCH( func, *inst, 0, 0, CHAN_X );
2193       emit_lg2( func, 0, 0 );
2194       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2195          STORE( func, *inst, 0, 0, chan_index );
2196       }
2197       break;
2198
2199    case TGSI_OPCODE_POW:
2200       FETCH( func, *inst, 0, 0, CHAN_X );
2201       FETCH( func, *inst, 1, 1, CHAN_X );
2202       emit_pow( func, 0, 0, 0, 1 );
2203       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2204          STORE( func, *inst, 0, 0, chan_index );
2205       }
2206       break;
2207
2208    case TGSI_OPCODE_XPD:
2209       /* Note: we do all stores after all operands have been fetched
2210        * to avoid src/dst register aliasing issues for an instruction
2211        * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2212        */
2213       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2214           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2215          FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
2216          FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
2217       }
2218       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2219           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2220          FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
2221          FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
2222       }
2223       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2224          emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
2225          emit_mul( func, 7, 1 );  /* xmm[7] = xmm[2] * xmm[1] */
2226          emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
2227          emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
2228          emit_sub( func, 7, 5 );  /* xmm[7] = xmm[2] - xmm[5] */
2229          /* store xmm[7] in dst.x below */
2230       }
2231       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2232           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2233          FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2234          FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2235       }
2236       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2237          emit_mul( func, 3, 2 );  /* xmm[3] = xmm[3] * xmm[2] */
2238          emit_mul( func, 1, 5 );  /* xmm[1] = xmm[1] * xmm[5] */
2239          emit_sub( func, 3, 1 );  /* xmm[3] = xmm[3] - xmm[1] */
2240          /* store xmm[3] in dst.y below */
2241       }
2242       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2243          emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
2244          emit_mul( func, 0, 2 );  /* xmm[0] = xmm[0] * xmm[2] */
2245          emit_sub( func, 5, 0 );  /* xmm[5] = xmm[5] - xmm[0] */
2246          STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2247       }
2248       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2249          STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2250       }
2251       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2252          STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2253       }
2254       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2255          emit_tempf(
2256             func,
2257             0,
2258             TEMP_ONE_I,
2259             TEMP_ONE_C );
2260          STORE( func, *inst, 0, 0, CHAN_W );
2261       }
2262       break;
2263
2264    case TGSI_OPCODE_ABS:
2265       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2266          FETCH( func, *inst, 0, 0, chan_index );
2267          emit_abs( func, 0) ;
2268
2269          STORE( func, *inst, 0, 0, chan_index );
2270       }
2271       break;
2272
2273    case TGSI_OPCODE_RCC:
2274       return 0;
2275       break;
2276
2277    case TGSI_OPCODE_DPH:
2278       FETCH( func, *inst, 0, 0, CHAN_X );
2279       FETCH( func, *inst, 1, 1, CHAN_X );
2280       emit_mul( func, 0, 1 );
2281       FETCH( func, *inst, 1, 0, CHAN_Y );
2282       FETCH( func, *inst, 2, 1, CHAN_Y );
2283       emit_mul( func, 1, 2 );
2284       emit_add( func, 0, 1 );
2285       FETCH( func, *inst, 1, 0, CHAN_Z );
2286       FETCH( func, *inst, 2, 1, CHAN_Z );
2287       emit_mul( func, 1, 2 );
2288       emit_add( func, 0, 1 );
2289       FETCH( func, *inst, 1, 1, CHAN_W );
2290       emit_add( func, 0, 1 );
2291       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2292          STORE( func, *inst, 0, 0, chan_index );
2293       }
2294       break;
2295
2296    case TGSI_OPCODE_COS:
2297       FETCH( func, *inst, 0, 0, CHAN_X );
2298       emit_cos( func, 0, 0 );
2299       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2300          STORE( func, *inst, 0, 0, chan_index );
2301       }
2302       break;
2303
2304    case TGSI_OPCODE_DDX:
2305       return 0;
2306       break;
2307
2308    case TGSI_OPCODE_DDY:
2309       return 0;
2310       break;
2311
2312    case TGSI_OPCODE_KILP:
2313       /* predicated kill */
2314       emit_kilp( func );
2315       return 0; /* XXX fix me */
2316       break;
2317
2318    case TGSI_OPCODE_KIL:
2319       /* conditional kill */
2320       emit_kil( func, &inst->Src[0] );
2321       break;
2322
2323    case TGSI_OPCODE_PK2H:
2324       return 0;
2325       break;
2326
2327    case TGSI_OPCODE_PK2US:
2328       return 0;
2329       break;
2330
2331    case TGSI_OPCODE_PK4B:
2332       return 0;
2333       break;
2334
2335    case TGSI_OPCODE_PK4UB:
2336       return 0;
2337       break;
2338
2339    case TGSI_OPCODE_RFL:
2340       return 0;
2341       break;
2342
2343    case TGSI_OPCODE_SEQ:
2344       emit_setcc( func, inst, cc_Equal );
2345       break;
2346
2347    case TGSI_OPCODE_SFL:
2348       return 0;
2349       break;
2350
2351    case TGSI_OPCODE_SGT:
2352       emit_setcc( func, inst, cc_NotLessThanEqual );
2353       break;
2354
2355    case TGSI_OPCODE_SIN:
2356       FETCH( func, *inst, 0, 0, CHAN_X );
2357       emit_sin( func, 0, 0 );
2358       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2359          STORE( func, *inst, 0, 0, chan_index );
2360       }
2361       break;
2362
2363    case TGSI_OPCODE_SLE:
2364       emit_setcc( func, inst, cc_LessThanEqual );
2365       break;
2366
2367    case TGSI_OPCODE_SNE:
2368       emit_setcc( func, inst, cc_NotEqual );
2369       break;
2370
2371    case TGSI_OPCODE_STR:
2372       return 0;
2373       break;
2374
2375    case TGSI_OPCODE_TEX:
2376       emit_tex( func, inst, FALSE, FALSE );
2377       break;
2378
2379    case TGSI_OPCODE_TXD:
2380       return 0;
2381       break;
2382
2383    case TGSI_OPCODE_UP2H:
2384       return 0;
2385       break;
2386
2387    case TGSI_OPCODE_UP2US:
2388       return 0;
2389       break;
2390
2391    case TGSI_OPCODE_UP4B:
2392       return 0;
2393       break;
2394
2395    case TGSI_OPCODE_UP4UB:
2396       return 0;
2397       break;
2398
2399    case TGSI_OPCODE_X2D:
2400       return 0;
2401       break;
2402
2403    case TGSI_OPCODE_ARA:
2404       return 0;
2405       break;
2406
2407    case TGSI_OPCODE_ARR:
2408       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2409          FETCH( func, *inst, 0, 0, chan_index );
2410          emit_rnd( func, 0, 0 );
2411          emit_f2it( func, 0 );
2412          STORE( func, *inst, 0, 0, chan_index );
2413       }
2414       break;
2415
2416    case TGSI_OPCODE_BRA:
2417       return 0;
2418       break;
2419
2420    case TGSI_OPCODE_CAL:
2421       return 0;
2422       break;
2423
2424    case TGSI_OPCODE_RET:
2425       emit_ret( func );
2426       break;
2427
2428    case TGSI_OPCODE_END:
2429       break;
2430
2431    case TGSI_OPCODE_SSG:
2432       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2433          FETCH( func, *inst, 0, 0, chan_index );
2434          emit_sgn( func, 0, 0 );
2435          STORE( func, *inst, 0, 0, chan_index );
2436       }
2437       break;
2438
2439    case TGSI_OPCODE_CMP:
2440       emit_cmp (func, inst);
2441       break;
2442
2443    case TGSI_OPCODE_SCS:
2444       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2445          FETCH( func, *inst, 0, 0, CHAN_X );
2446          emit_cos( func, 0, 0 );
2447          STORE( func, *inst, 0, 0, CHAN_X );
2448       }
2449       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2450          FETCH( func, *inst, 0, 0, CHAN_X );
2451          emit_sin( func, 0, 0 );
2452          STORE( func, *inst, 0, 0, CHAN_Y );
2453       }
2454       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2455          emit_tempf(
2456             func,
2457             0,
2458             TGSI_EXEC_TEMP_00000000_I,
2459             TGSI_EXEC_TEMP_00000000_C );
2460          STORE( func, *inst, 0, 0, CHAN_Z );
2461       }
2462       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2463          emit_tempf(
2464             func,
2465             0,
2466             TEMP_ONE_I,
2467             TEMP_ONE_C );
2468          STORE( func, *inst, 0, 0, CHAN_W );
2469       }
2470       break;
2471
2472    case TGSI_OPCODE_TXB:
2473       emit_tex( func, inst, TRUE, FALSE );
2474       break;
2475
2476    case TGSI_OPCODE_NRM:
2477       /* fall-through */
2478    case TGSI_OPCODE_NRM4:
2479       /* 3 or 4-component normalization */
2480       {
2481          uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2482
2483          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2484              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2485              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2486              (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2487
2488             /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2489
2490             /* xmm4 = src.x */
2491             /* xmm0 = src.x * src.x */
2492             FETCH(func, *inst, 0, 0, CHAN_X);
2493             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2494                emit_MOV(func, 4, 0);
2495             }
2496             emit_mul(func, 0, 0);
2497
2498             /* xmm5 = src.y */
2499             /* xmm0 = xmm0 + src.y * src.y */
2500             FETCH(func, *inst, 1, 0, CHAN_Y);
2501             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2502                emit_MOV(func, 5, 1);
2503             }
2504             emit_mul(func, 1, 1);
2505             emit_add(func, 0, 1);
2506
2507             /* xmm6 = src.z */
2508             /* xmm0 = xmm0 + src.z * src.z */
2509             FETCH(func, *inst, 1, 0, CHAN_Z);
2510             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2511                emit_MOV(func, 6, 1);
2512             }
2513             emit_mul(func, 1, 1);
2514             emit_add(func, 0, 1);
2515
2516             if (dims == 4) {
2517                /* xmm7 = src.w */
2518                /* xmm0 = xmm0 + src.w * src.w */
2519                FETCH(func, *inst, 1, 0, CHAN_W);
2520                if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2521                   emit_MOV(func, 7, 1);
2522                }
2523                emit_mul(func, 1, 1);
2524                emit_add(func, 0, 1);
2525             }
2526
2527             /* xmm1 = 1 / sqrt(xmm0) */
2528             emit_rsqrt(func, 1, 0);
2529
2530             /* dst.x = xmm1 * src.x */
2531             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2532                emit_mul(func, 4, 1);
2533                STORE(func, *inst, 4, 0, CHAN_X);
2534             }
2535
2536             /* dst.y = xmm1 * src.y */
2537             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2538                emit_mul(func, 5, 1);
2539                STORE(func, *inst, 5, 0, CHAN_Y);
2540             }
2541
2542             /* dst.z = xmm1 * src.z */
2543             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2544                emit_mul(func, 6, 1);
2545                STORE(func, *inst, 6, 0, CHAN_Z);
2546             }
2547
2548             /* dst.w = xmm1 * src.w */
2549             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2550                emit_mul(func, 7, 1);
2551                STORE(func, *inst, 7, 0, CHAN_W);
2552             }
2553          }
2554
2555          /* dst0.w = 1.0 */
2556          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2557             emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2558             STORE(func, *inst, 0, 0, CHAN_W);
2559          }
2560       }
2561       break;
2562
2563    case TGSI_OPCODE_DIV:
2564       return 0;
2565       break;
2566
2567    case TGSI_OPCODE_DP2:
2568       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2569       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2570       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2571       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2572       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2573       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2574       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2575       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2576          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2577       }
2578       break;
2579
2580    case TGSI_OPCODE_TXL:
2581       return 0;
2582       break;
2583
2584    case TGSI_OPCODE_TXP:
2585       emit_tex( func, inst, FALSE, TRUE );
2586       break;
2587
2588    case TGSI_OPCODE_BRK:
2589       return 0;
2590       break;
2591
2592    case TGSI_OPCODE_IF:
2593       return 0;
2594       break;
2595
2596    case TGSI_OPCODE_ELSE:
2597       return 0;
2598       break;
2599
2600    case TGSI_OPCODE_ENDIF:
2601       return 0;
2602       break;
2603
2604    case TGSI_OPCODE_PUSHA:
2605       return 0;
2606       break;
2607
2608    case TGSI_OPCODE_POPA:
2609       return 0;
2610       break;
2611
2612    case TGSI_OPCODE_CEIL:
2613       return 0;
2614       break;
2615
2616    case TGSI_OPCODE_I2F:
2617       return 0;
2618       break;
2619
2620    case TGSI_OPCODE_NOT:
2621       return 0;
2622       break;
2623
2624    case TGSI_OPCODE_TRUNC:
2625       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2626          FETCH( func, *inst, 0, 0, chan_index );
2627          emit_f2it( func, 0 );
2628          emit_i2f( func, 0 );
2629          STORE( func, *inst, 0, 0, chan_index );
2630       }
2631       break;
2632
2633    case TGSI_OPCODE_SHL:
2634       return 0;
2635       break;
2636
2637    case TGSI_OPCODE_ISHR:
2638       return 0;
2639       break;
2640
2641    case TGSI_OPCODE_AND:
2642       return 0;
2643       break;
2644
2645    case TGSI_OPCODE_OR:
2646       return 0;
2647       break;
2648
2649    case TGSI_OPCODE_MOD:
2650       return 0;
2651       break;
2652
2653    case TGSI_OPCODE_XOR:
2654       return 0;
2655       break;
2656
2657    case TGSI_OPCODE_SAD:
2658       return 0;
2659       break;
2660
2661    case TGSI_OPCODE_TXF:
2662       return 0;
2663       break;
2664
2665    case TGSI_OPCODE_TXQ:
2666       return 0;
2667       break;
2668
2669    case TGSI_OPCODE_CONT:
2670       return 0;
2671       break;
2672
2673    case TGSI_OPCODE_EMIT:
2674       return 0;
2675       break;
2676
2677    case TGSI_OPCODE_ENDPRIM:
2678       return 0;
2679       break;
2680
2681    default:
2682       return 0;
2683    }
2684
2685    return 1;
2686 }
2687
2688 static void
2689 emit_declaration(
2690    struct x86_function *func,
2691    struct tgsi_full_declaration *decl )
2692 {
2693    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2694       unsigned first, last, mask;
2695       unsigned i, j;
2696
2697       first = decl->Range.First;
2698       last = decl->Range.Last;
2699       mask = decl->Declaration.UsageMask;
2700
2701       for( i = first; i <= last; i++ ) {
2702          for( j = 0; j < NUM_CHANNELS; j++ ) {
2703             if( mask & (1 << j) ) {
2704                switch( decl->Declaration.Interpolate ) {
2705                case TGSI_INTERPOLATE_CONSTANT:
2706                   emit_coef_a0( func, 0, i, j );
2707                   emit_inputs( func, 0, i, j );
2708                   break;
2709
2710                case TGSI_INTERPOLATE_LINEAR:
2711                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2712                   emit_coef_dadx( func, 1, i, j );
2713                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2714                   emit_coef_dady( func, 3, i, j );
2715                   emit_mul( func, 0, 1 );    /* x * dadx */
2716                   emit_coef_a0( func, 4, i, j );
2717                   emit_mul( func, 2, 3 );    /* y * dady */
2718                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
2719                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2720                   emit_inputs( func, 0, i, j );
2721                   break;
2722
2723                case TGSI_INTERPOLATE_PERSPECTIVE:
2724                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2725                   emit_coef_dadx( func, 1, i, j );
2726                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2727                   emit_coef_dady( func, 3, i, j );
2728                   emit_mul( func, 0, 1 );    /* x * dadx */
2729                   emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2730                   emit_coef_a0( func, 5, i, j );
2731                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
2732                   emit_mul( func, 2, 3 );    /* y * dady */
2733                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
2734                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2735                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
2736                   emit_inputs( func, 0, i, j );
2737                   break;
2738
2739                default:
2740                   assert( 0 );
2741                   break;
2742                }
2743             }
2744          }
2745       }
2746    }
2747 }
2748
2749 static void aos_to_soa( struct x86_function *func,
2750                         uint arg_aos,
2751                         uint arg_machine,
2752                         uint arg_num,
2753                         uint arg_stride )
2754 {
2755    struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2756    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2757    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2758    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2759    int loop_top, loop_exit_fixup;
2760
2761    /* Save EBX */
2762    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2763
2764    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
2765    x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
2766    /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */
2767    x86_lea( func, soa_input,
2768             x86_make_disp( soa_input,
2769                            Offset(struct tgsi_exec_machine, Inputs) ) );
2770    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2771    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
2772
2773    /* while (num_inputs != 0) */
2774    loop_top = x86_get_label( func );
2775    x86_cmp_imm( func, num_inputs, 0 );
2776    loop_exit_fixup = x86_jcc_forward( func, cc_E );
2777
2778    {
2779       x86_push( func, aos_input );
2780       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2781       sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2782       x86_add( func, aos_input, stride );
2783       sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2784       sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2785       x86_add( func, aos_input, stride );
2786       sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2787       sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2788       x86_add( func, aos_input, stride );
2789       sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2790       sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2791       x86_pop( func, aos_input );
2792
2793       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2794       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2795       sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2796       sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2797       sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2798       sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2799
2800       sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2801       sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2802       sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2803       sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2804
2805       /* Advance to next input */
2806       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2807       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2808    }
2809    /* --num_inputs */
2810    x86_dec( func, num_inputs );
2811    x86_jmp( func, loop_top );
2812    x86_fixup_fwd_jump( func, loop_exit_fixup );
2813
2814    /* Restore EBX */
2815    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2816 }
2817
2818 static void soa_to_aos( struct x86_function *func,
2819                         uint arg_aos,
2820                         uint arg_machine,
2821                         uint arg_num,
2822                         uint arg_stride )
2823 {
2824    struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2825    struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2826    struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2827    struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2828    int inner_loop;
2829
2830    /* Save EBX */
2831    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2832
2833    x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2834    x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2835    /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */
2836    x86_lea( func, soa_output,
2837             x86_make_disp( soa_output,
2838                            Offset(struct tgsi_exec_machine, Outputs) ) );
2839    x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2840
2841    /* do */
2842    inner_loop = x86_get_label( func );
2843    {
2844       sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2845       sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2846       sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2847       sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2848
2849       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2850       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2851       sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2852       sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2853       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2854       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2855
2856       x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2857       x86_push( func, aos_output );
2858       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2859       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2860       x86_add( func, aos_output, temp );
2861       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2862       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2863       x86_add( func, aos_output, temp );
2864       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2865       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2866       x86_add( func, aos_output, temp );
2867       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2868       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2869       x86_pop( func, aos_output );
2870
2871       /* Advance to next output */
2872       x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2873       x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2874    }
2875    /* while --num_outputs */
2876    x86_dec( func, num_outputs );
2877    x86_jcc( func, cc_NE, inner_loop );
2878
2879    /* Restore EBX */
2880    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2881 }
2882
2883
2884 /**
2885  * Check if the instructions dst register is the same as any src
2886  * register and warn if there's a posible SOA dependency.
2887  */
2888 static boolean
2889 check_soa_dependencies(const struct tgsi_full_instruction *inst)
2890 {
2891    uint opcode = inst->Instruction.Opcode;
2892
2893    /* XXX: we only handle src/dst aliasing in a few opcodes currently.
2894     * Need to use an additional temporay to hold the result in the
2895     * cases where the code is too opaque to fix.
2896     */
2897
2898    switch (opcode) {
2899    case TGSI_OPCODE_ADD:
2900    case TGSI_OPCODE_MOV:
2901    case TGSI_OPCODE_MUL:
2902    case TGSI_OPCODE_RCP:
2903    case TGSI_OPCODE_RSQ:
2904    case TGSI_OPCODE_EXP:
2905    case TGSI_OPCODE_LOG:
2906    case TGSI_OPCODE_DP3:
2907    case TGSI_OPCODE_DP4:
2908    case TGSI_OPCODE_DP2A:
2909    case TGSI_OPCODE_EX2:
2910    case TGSI_OPCODE_LG2:
2911    case TGSI_OPCODE_POW:
2912    case TGSI_OPCODE_XPD:
2913    case TGSI_OPCODE_DPH:
2914    case TGSI_OPCODE_COS:
2915    case TGSI_OPCODE_SIN:
2916    case TGSI_OPCODE_TEX:
2917    case TGSI_OPCODE_TXB:
2918    case TGSI_OPCODE_TXP:
2919    case TGSI_OPCODE_NRM:
2920    case TGSI_OPCODE_NRM4:
2921    case TGSI_OPCODE_DP2:
2922       /* OK - these opcodes correctly handle SOA dependencies */
2923       return TRUE;
2924    default:
2925       if (!tgsi_check_soa_dependencies(inst))
2926          return TRUE;
2927
2928       debug_printf("Warning: src/dst aliasing in instruction"
2929                    " is not handled:\n");
2930       debug_printf("Warning: ");
2931       tgsi_dump_instruction(inst, 1);
2932
2933       return FALSE;
2934    }
2935 }
2936
2937
2938 /**
2939  * Translate a TGSI vertex/fragment shader to SSE2 code.
2940  * Slightly different things are done for vertex vs. fragment shaders.
2941  *
2942  * \param tokens  the TGSI input shader
2943  * \param func  the output SSE code/function
2944  * \param immediates  buffer to place immediates, later passed to SSE func
2945  * \param return  1 for success, 0 if translation failed
2946  */
2947 unsigned
2948 tgsi_emit_sse2(
2949    const struct tgsi_token *tokens,
2950    struct x86_function *func,
2951    float (*immediates)[4],
2952    boolean do_swizzles )
2953 {
2954    struct tgsi_parse_context parse;
2955    unsigned ok = 1;
2956    uint num_immediates = 0;
2957
2958    util_init_math();
2959
2960    func->csr = func->store;
2961
2962    tgsi_parse_init( &parse, tokens );
2963
2964    /* Can't just use EDI, EBX without save/restoring them:
2965     */
2966    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2967    x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2968
2969    /*
2970     * Different function args for vertex/fragment shaders:
2971     */
2972    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2973       if (do_swizzles)
2974          aos_to_soa( func,
2975                      4,         /* aos_input */
2976                      1,         /* machine */
2977                      5,         /* num_inputs */
2978                      6 );       /* input_stride */
2979    }
2980
2981    x86_mov(
2982       func,
2983       get_machine_base(),
2984       x86_fn_arg( func, 1 ) );
2985    x86_mov(
2986       func,
2987       get_const_base(),
2988       x86_fn_arg( func, 2 ) );
2989    x86_mov(
2990       func,
2991       get_immediate_base(),
2992       x86_fn_arg( func, 3 ) );
2993
2994    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2995       x86_mov(
2996          func,
2997          get_coef_base(),
2998          x86_fn_arg( func, 4 ) );
2999    }
3000
3001    x86_mov(
3002       func,
3003       get_sampler_base(),
3004       x86_make_disp( get_machine_base(),
3005                      Offset( struct tgsi_exec_machine, Samplers ) ) );
3006
3007    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
3008       tgsi_parse_token( &parse );
3009
3010       switch( parse.FullToken.Token.Type ) {
3011       case TGSI_TOKEN_TYPE_DECLARATION:
3012          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
3013             emit_declaration(
3014                func,
3015                &parse.FullToken.FullDeclaration );
3016          }
3017          break;
3018
3019       case TGSI_TOKEN_TYPE_INSTRUCTION:
3020          ok = emit_instruction(
3021             func,
3022             &parse.FullToken.FullInstruction );
3023
3024          if (!ok) {
3025             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
3026             uint proc = parse.FullHeader.Processor.Processor;
3027             debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
3028                          opcode,
3029                          tgsi_get_opcode_name(opcode),
3030                          tgsi_get_processor_name(proc));
3031          }
3032
3033          if (ok)
3034             ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
3035          break;
3036
3037       case TGSI_TOKEN_TYPE_IMMEDIATE:
3038          /* simply copy the immediate values into the next immediates[] slot */
3039          {
3040             const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
3041             uint i;
3042             assert(size <= 4);
3043             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
3044             for( i = 0; i < size; i++ ) {
3045                immediates[num_immediates][i] =
3046                   parse.FullToken.FullImmediate.u[i].Float;
3047             }
3048 #if 0
3049             debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
3050                    num_immediates,
3051                    immediates[num_immediates][0],
3052                    immediates[num_immediates][1],
3053                    immediates[num_immediates][2],
3054                    immediates[num_immediates][3]);
3055 #endif
3056             num_immediates++;
3057          }
3058          break;
3059       case TGSI_TOKEN_TYPE_PROPERTY:
3060          /* we just ignore them for now */
3061          break;
3062
3063       default:
3064          ok = 0;
3065          assert( 0 );
3066       }
3067    }
3068
3069    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
3070       if (do_swizzles)
3071          soa_to_aos( func,
3072                      7,         /* aos_output */
3073                      1,         /* machine */
3074                      8,         /* num_outputs */
3075                      9 );       /* output_stride */
3076    }
3077
3078    /* Can't just use EBX, EDI without save/restoring them:
3079     */
3080    x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
3081    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
3082
3083    emit_ret( func );
3084
3085    tgsi_parse_free( &parse );
3086
3087    return ok;
3088 }
3089
3090 #else /* !PIPE_ARCH_X86 */
3091
3092 unsigned
3093 tgsi_emit_sse2(
3094    const struct tgsi_token *tokens,
3095    struct x86_function *func,
3096    float (*immediates)[4],
3097    boolean do_swizzles )
3098 {
3099    return 0;
3100 }
3101
3102 #endif /* !PIPE_ARCH_X86 */