js/src/nanojit/Nativei386.cpp
1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 4 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is [Open Source Virtual Machine].
17 * The Initial Developer of the Original Code is
18 * Adobe System Incorporated.
19 * Portions created by the Initial Developer are Copyright (C) 2004-2007
20 * the Initial Developer. All Rights Reserved.
22 * Contributor(s):
23 * Adobe AS3 Team
24 * Mozilla TraceMonkey Team
25 * Asko Tontti <atontti@cc.hut.fi>
27 * Alternatively, the contents of this file may be used under the terms of
28 * either the GNU General Public License Version 2 or later (the "GPL"), or
29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
39 * ***** END LICENSE BLOCK ***** */
41 #ifdef _MAC
42 // for MakeDataExecutable
43 #include <CoreServices/CoreServices.h>
44 #endif
46 #if defined AVMPLUS_UNIX || defined AVMPLUS_MAC
47 #include <sys/mman.h>
48 #include <errno.h>
49 #include <stdlib.h>
50 #endif
51 #include "nanojit.h"
53 namespace nanojit
55 #ifdef FEATURE_NANOJIT
57 #ifdef NJ_VERBOSE
58 const char *regNames[] = {
59 #if defined NANOJIT_IA32
60 "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
61 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",
62 "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7"
63 #elif defined NANOJIT_AMD64
64 "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
65 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
66 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",
67 "xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
68 #endif
70 #endif
72 #if defined NANOJIT_IA32
73 const Register Assembler::argRegs[] = { ECX, EDX };
74 const Register Assembler::retRegs[] = { EAX, EDX };
75 const Register Assembler::savedRegs[] = { EBX, ESI, EDI };
76 #elif defined NANOJIT_AMD64
77 #if defined WIN64
78 const Register Assembler::argRegs[] = { R8, R9, RCX, RDX };
79 #else
80 const Register Assembler::argRegs[] = { RDI, RSI, RDX, RCX, R8, R9 };
81 #endif
82 const Register Assembler::retRegs[] = { RAX, RDX };
83 const Register Assembler::savedRegs[] = { R13, R14, R15 };
84 #endif
86 const static uint8_t max_abi_regs[] = {
87 2, /* ABI_FASTCALL */
88 1, /* ABI_THISCALL */
89 0, /* ABI_STDCALL */
90 0 /* ABI_CDECL */
94 void Assembler::nInit(AvmCore* core)
96 (void) core;
97 OSDep::getDate();
100 NIns* Assembler::genPrologue()
103 * Prologue
105 uint32_t stackNeeded = STACK_GRANULARITY * _activation.highwatermark;
107 uint32_t stackPushed =
108 STACK_GRANULARITY + // returnaddr
109 STACK_GRANULARITY; // ebp
111 if (!_thisfrag->lirbuf->explicitSavedRegs)
112 stackPushed += NumSavedRegs * STACK_GRANULARITY;
114 uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
115 uint32_t amt = aligned - stackPushed;
117 // Reserve stackNeeded bytes, padded
118 // to preserve NJ_ALIGN_STACK-byte alignment.
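// Worked example of the padding arithmetic above (all numbers are
// illustrative assumptions, not values taken from this build: suppose
// STACK_GRANULARITY == 4, NJ_ALIGN_STACK == 16, highwatermark == 10,
// and the saved regs are pushed):
//   stackNeeded = 4 * 10           = 40
//   stackPushed = 4 + 4 + 3*4      = 20   (return addr, ebp, 3 saved regs)
//   aligned     = alignUp(60, 16)  = 64
//   amt         = 64 - 20          = 44   -> SUBi(SP, 44)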
119 if (amt)
121 #if defined NANOJIT_IA32
122 SUBi(SP, amt);
123 #elif defined NANOJIT_AMD64
124 SUBQi(SP, amt);
125 #endif
128 verbose_only( outputAddr=true; asm_output("[frag entry]"); )
129 NIns *fragEntry = _nIns;
130 MR(FP, SP); // Establish our own FP.
131 PUSHr(FP); // Save caller's FP.
133 if (!_thisfrag->lirbuf->explicitSavedRegs)
134 for (int i = 0; i < NumSavedRegs; ++i)
135 PUSHr(savedRegs[i]);
137 // align the entry point
138 asm_align_code();
140 return fragEntry;
143 void Assembler::asm_align_code() {
144 static uint8_t nop[][9] = {
145 {0x90},
146 {0x66,0x90},
147 {0x0f,0x1f,0x00},
148 {0x0f,0x1f,0x40,0x00},
149 {0x0f,0x1f,0x44,0x00,0x00},
150 {0x66,0x0f,0x1f,0x44,0x00,0x00},
151 {0x0f,0x1f,0x80,0x00,0x00,0x00,0x00},
152 {0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00},
153 {0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00},
155 unsigned n;
156 while((n = uintptr_t(_nIns) & 15) != 0) {
157 if (n > 9)
158 n = 9;
159 underrunProtect(n);
160 _nIns -= n;
161 memcpy(_nIns, nop[n-1], n);
162 asm_output("nop%d", n);
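// Example pass through the loop above (assuming the current _nIns sits
// 13 bytes past a 16-byte boundary, i.e. uintptr_t(_nIns) & 15 == 13):
//   1st iteration: n = 13, capped to 9; emit the 9-byte nop; now &15 == 4
//   2nd iteration: n = 4;  emit the 4-byte nop; now &15 == 0, loop exits
// Because code is emitted backwards, the padding fills downwards until
// _nIns lands on the previous 16-byte boundary.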
166 void Assembler::nFragExit(LInsp guard)
168 SideExit *exit = guard->record()->exit;
169 bool trees = _frago->core()->config.tree_opt;
170 Fragment *frag = exit->target;
171 GuardRecord *lr = 0;
172 bool destKnown = (frag && frag->fragEntry);
173 if (destKnown && !trees)
175 // already exists, emit jump now. no patching required.
176 JMP(frag->fragEntry);
177 lr = 0;
179 else
181 // target doesn't exist yet. emit jump to epilogue, and set up to patch later.
182 lr = guard->record();
183 #if defined NANOJIT_AMD64
184 /* 8 bytes for address, 4 for imm32, 2 for jmp */
185 underrunProtect(14);
186 _nIns -= 8;
187 *(intptr_t *)_nIns = intptr_t(_epilogue);
188 lr->jmp = _nIns;
189 JMPm_nochk(0);
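/* Sketch of the bytes the emits above are expected to produce, lowest
 * address first (inferred from underrunProtect(14) and the FF 25 pattern
 * that nPatchBranch() looks for; an assumption, not taken from the
 * original comments):
 *   FF 25 00 00 00 00          ; jmp qword ptr [rip+0]
 *   <8-byte address of _epilogue>
 * The jump reads its target from the quadword that immediately follows
 * it, so patching the exit later only has to rewrite those 8 bytes.
 */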
190 #else
191 JMP_long(_epilogue);
192 lr->jmp = _nIns;
193 #endif
195 // first restore ESP from EBP, undoing SUBi(SP,amt) from genPrologue
196 MR(SP,FP);
198 // return value is GuardRecord*
199 #if defined NANOJIT_IA32
200 LDi(EAX, int(lr));
201 #elif defined NANOJIT_AMD64
202 LDQi(RAX, intptr_t(lr));
203 #endif
206 NIns *Assembler::genEpilogue()
208 RET();
210 if (!_thisfrag->lirbuf->explicitSavedRegs)
211 for (int i = NumSavedRegs - 1; i >= 0; --i)
212 POPr(savedRegs[i]);
214 POPr(FP); // Restore caller's FP.
215 MR(SP,FP); // pop the stack frame
216 return _nIns;
219 #if defined NANOJIT_IA32
220 void Assembler::asm_call(LInsp ins)
222 const CallInfo* call = ins->callInfo();
223 // fargs must be signed (int32_t), not unsigned
224 uint32_t iargs = call->count_iargs();
225 int32_t fargs = call->count_args() - iargs - call->isIndirect();
227 bool imt = call->isInterface();
228 if (imt)
229 iargs --;
231 uint32_t max_regs = max_abi_regs[call->_abi];
232 if (max_regs > iargs)
233 max_regs = iargs;
235 int32_t istack = iargs-max_regs; // first 2 4B args are in registers
236 int32_t extra = 0;
237 const int32_t pushsize = 4*istack + 8*fargs; // actual stack space used
239 #if _MSC_VER
240 // msc is slack, and MIR doesn't do anything extra, so let's use this
241 // call-site alignment to at least have code size parity with MIR.
242 uint32_t align = 4;//NJ_ALIGN_STACK;
243 #else
244 uint32_t align = NJ_ALIGN_STACK;
245 #endif
247 if (pushsize) {
248 // stack re-alignment
249 // only pop our adjustment amount since callee pops args in FASTCALL mode
250 extra = alignUp(pushsize, align) - pushsize;
251 if (call->_abi == ABI_CDECL) {
252 // with CDECL only, caller pops args
253 ADDi(SP, extra+pushsize);
254 } else if (extra > 0) {
255 ADDi(SP, extra);
259 bool indirect = false;
260 if (ins->isop(LIR_call) || ins->isop(LIR_fcall)) {
261 CALL(call);
263 else {
264 // indirect call. x86 Calling conventions don't use EAX as an
265 // argument, and do use EAX as a return value. We need a register
266 // for the address to call, so we use EAX since it will always be
267 // available
268 NanoAssert(ins->isop(LIR_calli) || ins->isop(LIR_fcalli));
269 CALLr(call, EAX);
270 indirect = true;
273 // make sure fpu stack is empty before call (restoreCallerSaved)
274 NanoAssert(_allocator.isFree(FST0));
275 // note: this code requires that ref arguments (ARGSIZE_Q)
276 // be one of the first two arguments
277 // pre-assign registers to the first N 4B args based on the calling convention
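// Illustration of the pre-assignment (a hypothetical signature, assuming
// ABI_FASTCALL so max_abi_regs[] above yields 2):
//   f(int a, int b, int c, double d)
//   a -> ECX, b -> EDX (argRegs[0..1]); c is pushed (4 bytes);
//   d always goes to the stack via asm_farg (8 bytes),
//   so pushsize = 4*1 + 8*1 = 12 above.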
278 uint32_t n = 0;
280 ArgSize sizes[2*MAXARGS];
281 uint32_t argc = call->get_sizes(sizes);
282 if (indirect) {
283 argc--;
284 asm_arg(ARGSIZE_LO, ins->arg(argc), EAX);
287 if (imt) {
288 // interface thunk calling convention: put iid in EDX
289 NanoAssert(call->_abi == ABI_CDECL);
290 argc--;
291 asm_arg(ARGSIZE_LO, ins->arg(argc), EDX);
294 for(uint32_t i=0; i < argc; i++)
296 uint32_t j = argc-i-1;
297 ArgSize sz = sizes[j];
298 Register r = UnknownReg;
299 if (n < max_regs && sz != ARGSIZE_F) {
300 r = argRegs[n++]; // tell asm_arg what reg to use
302 asm_arg(sz, ins->arg(j), r);
305 if (extra > 0)
306 SUBi(SP, extra);
309 #elif defined NANOJIT_AMD64
311 void Assembler::asm_call(LInsp ins)
313 Register fpu_reg = XMM0;
314 const CallInfo* call = ins->callInfo();
315 int n = 0;
317 CALL(call);
319 ArgSize sizes[10];
320 uint32_t argc = call->get_sizes(sizes);
322 for(uint32_t i=0; i < argc; i++)
324 uint32_t j = argc-i-1;
325 ArgSize sz = sizes[j];
326 Register r = UnknownReg;
327 if (sz != ARGSIZE_F) {
328 r = argRegs[n++]; // tell asm_arg what reg to use
329 } else {
330 r = fpu_reg;
331 fpu_reg = nextreg(fpu_reg);
333 findSpecificRegFor(ins->arg(j), r);
336 #endif
338 void Assembler::nMarkExecute(Page* page, int flags)
340 NanoAssert(sizeof(Page) == NJ_PAGE_SIZE);
341 #if defined WIN32 || defined WIN64
342 DWORD dwIgnore;
343 static const DWORD kProtFlags[4] =
345 PAGE_READONLY, // 0
346 PAGE_READWRITE, // PAGE_WRITE
347 PAGE_EXECUTE_READ, // PAGE_EXEC
348 PAGE_EXECUTE_READWRITE // PAGE_EXEC|PAGE_WRITE
350 DWORD prot = kProtFlags[flags & (PAGE_WRITE|PAGE_EXEC)];
351 BOOL res = VirtualProtect(page, NJ_PAGE_SIZE, prot, &dwIgnore);
352 if (!res)
354 // todo: we can't abort or assert here, we have to fail gracefully.
355 NanoAssertMsg(false, "FATAL ERROR: VirtualProtect() failed\n");
357 #elif defined AVMPLUS_UNIX || defined AVMPLUS_MAC
358 static const int kProtFlags[4] =
360 PROT_READ, // 0
361 PROT_READ|PROT_WRITE, // PAGE_WRITE
362 PROT_READ|PROT_EXEC, // PAGE_EXEC
363 PROT_READ|PROT_WRITE|PROT_EXEC // PAGE_EXEC|PAGE_WRITE
365 int prot = kProtFlags[flags & (PAGE_WRITE|PAGE_EXEC)];
366 intptr_t addr = (intptr_t)page;
367 addr &= ~((uintptr_t)NJ_PAGE_SIZE - 1);
368 NanoAssert(addr == (intptr_t)page);
369 #if defined SOLARIS
370 if (mprotect((char *)addr, NJ_PAGE_SIZE, prot) == -1)
371 #else
372 if (mprotect((void *)addr, NJ_PAGE_SIZE, prot) == -1)
373 #endif
375 // todo: we can't abort or assert here, we have to fail gracefully.
376 NanoAssertMsg(false, "FATAL ERROR: mprotect(PROT_EXEC) failed\n");
377 abort();
379 #else
380 (void)page;
381 #endif
384 Register Assembler::nRegisterAllocFromSet(int set)
386 Register r;
387 RegAlloc &regs = _allocator;
388 #ifdef WIN32
389 _asm
391 mov ecx, regs
392 bsf eax, set // i = first bit set
393 btr RegAlloc::free[ecx], eax // free &= ~rmask(i)
394 mov r, eax
396 #elif defined WIN64
397 unsigned long tr, fr;
398 _BitScanForward(&tr, set);
399 _bittestandreset(&fr, tr);
400 regs.free = fr;
401 r = tr;
402 #else
403 asm(
404 "bsf %1, %%eax\n\t"
405 "btr %%eax, %2\n\t"
406 "movl %%eax, %0\n\t"
407 : "=m"(r) : "m"(set), "m"(regs.free) : "%eax", "memory" );
408 #endif /* WIN32 */
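// Example of what the bsf/btr pair computes (an illustrative value, not
// from any real trace): if set == 0b1100, bsf finds the lowest set bit, 2;
// btr then clears bit 2 in regs.free, and Register(2) -- EDX on IA32,
// given the ordering of regNames[] above -- is returned.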
409 return r;
412 void Assembler::nRegisterResetAll(RegAlloc& a)
414 // add scratch registers to our free list for the allocator
415 a.clear();
416 a.used = 0;
417 a.free = SavedRegs | ScratchRegs;
418 #if defined NANOJIT_IA32
419 if (!config.sse2)
420 a.free &= ~XmmRegs;
421 #endif
422 debug_only( a.managed = a.free; )
425 NIns* Assembler::nPatchBranch(NIns* branch, NIns* targ)
427 #if defined NANOJIT_IA32
428 NIns* was = 0;
429 intptr_t offset = intptr_t(targ) - intptr_t(branch);
430 if (branch[0] == JMP32) {
431 was = branch + *(int32_t*)&branch[1] + 5;
432 *(int32_t*)&branch[1] = offset - 5;
433 } else if (branch[0] == JCC32) {
434 was = branch + *(int32_t*)&branch[2] + 6;
435 *(int32_t*)&branch[2] = offset - 6;
436 } else
437 NanoAssertMsg(0, "Unknown branch type in nPatchBranch");
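// The offset arithmetic above, spelled out with invented addresses:
// a 5-byte "jmp rel32" at branch == 0x1000 reaching targ == 0x1234 needs
// rel32 = targ - (branch + 5) = offset - 5 = 0x22f, and the previous
// target is recovered as branch + old_rel32 + 5; the jcc form is 6 bytes
// long with its rel32 at branch[2], hence the -6/+6 in the second case.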
438 #else
439 if (branch[0] == 0xFF && branch[1] == 0x25) {
440 NIns *mem;
441 mem = &branch[6] + *(int32_t *)&branch[2];
442 was = *(intptr_t*)mem;
443 *(intptr_t *)mem = intptr_t(targ);
444 } else {
445 NanoAssertMsg(0, "Unknown branch type in nPatchBranch");
447 #endif
448 return was;
451 RegisterMask Assembler::hint(LIns* i, RegisterMask allow)
453 uint32_t op = i->opcode();
454 int prefer = allow;
455 if (op == LIR_call || op == LIR_calli) {
456 prefer &= rmask(retRegs[0]);
458 else if (op == LIR_fcall || op == LIR_fcalli) {
459 prefer &= rmask(FST0);
461 else if (op == LIR_param) {
462 uint32_t max_regs = max_abi_regs[_thisfrag->lirbuf->abi];
463 if (i->imm8() < max_regs)
464 prefer &= rmask(Register(i->imm8()));
466 else if (op == LIR_callh || op == LIR_rsh && i->oprnd1()->opcode()==LIR_callh) {
467 prefer &= rmask(retRegs[1]);
469 else if (i->isCmp()) {
470 prefer &= AllowableFlagRegs;
472 else if (i->isconst()) {
473 prefer &= ScratchRegs;
475 return (_allocator.free & prefer) ? prefer : allow;
478 void Assembler::asm_qjoin(LIns *ins)
480 int d = findMemFor(ins);
481 AvmAssert(d);
482 LIns* lo = ins->oprnd1();
483 LIns* hi = ins->oprnd2();
485 Reservation *resv = getresv(ins);
486 Register rr = resv->reg;
488 if (rr != UnknownReg && (rmask(rr) & FpRegs))
489 evict(rr);
491 if (hi->isconst())
493 STi(FP, d+4, hi->constval());
495 else
497 Register r = findRegFor(hi, GpRegs);
498 ST(FP, d+4, r);
501 if (lo->isconst())
503 STi(FP, d, lo->constval());
505 else
507 // okay if r gets recycled.
508 Register r = findRegFor(lo, GpRegs);
509 ST(FP, d, r);
512 freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
515 void Assembler::asm_load(int d, Register r)
517 if (rmask(r) & FpRegs)
519 #if defined NANOJIT_IA32
520 if (rmask(r) & XmmRegs) {
521 #endif
522 SSE_LDQ(r, d, FP);
523 #if defined NANOJIT_IA32
524 } else {
525 FLDQ(d, FP);
527 #endif
529 #if defined NANOJIT_AMD64
530 else if (i->opcode() == LIR_param)
532 LDQ(r, d, FP);
534 #endif
535 else
537 LD(r, d, FP);
541 void Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
543 if (i->isop(LIR_alloc)) {
544 verbose_only( if (_verbose) { outputForEOL(" <= remat %s size %d", _thisfrag->lirbuf->names->formatRef(i), i->size()); } )
545 LEA(r, disp(resv), FP);
547 else if (i->isconst()) {
548 if (!resv->arIndex) {
549 reserveFree(i);
551 LDi(r, i->constval());
553 else {
554 int d = findMemFor(i);
555 verbose_only( if (_verbose) { outputForEOL(" <= restore %s", _thisfrag->lirbuf->names->formatRef(i)); } )
556 asm_load(d,r);
560 void Assembler::asm_store32(LIns *value, int dr, LIns *base)
562 if (value->isconst())
564 Register rb = getBaseReg(base, dr, GpRegs);
565 int c = value->constval();
566 STi(rb, dr, c);
568 else
570 // make sure the value is in a register
571 Reservation *rA, *rB;
572 Register ra, rb;
573 if (base->isop(LIR_alloc)) {
574 rb = FP;
575 dr += findMemFor(base);
576 ra = findRegFor(value, GpRegs);
577 } else if (base->isconst()) {
578 // absolute address
579 dr += base->constval();
580 ra = findRegFor(value, GpRegs);
581 rb = UnknownReg;
582 } else {
583 findRegFor2(GpRegs, value, rA, base, rB);
584 ra = rA->reg;
585 rb = rB->reg;
587 ST(rb, dr, ra);
591 void Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
593 (void)quad;
594 if (d)
596 // save to spill location
597 if (rmask(rr) & FpRegs)
599 #if defined NANOJIT_IA32
600 if (rmask(rr) & XmmRegs) {
601 #endif
602 SSE_STQ(d, FP, rr);
603 #if defined NANOJIT_IA32
604 } else {
605 FSTQ((pop?1:0), d, FP);
607 #endif
609 #if defined NANOJIT_AMD64
610 else if (quad)
612 STQ(FP, d, rr);
614 #endif
615 else
617 ST(FP, d, rr);
620 #if defined NANOJIT_IA32
621 else if (pop && (rmask(rr) & x87Regs))
623 // pop the fpu result since it isn't used
624 FSTP(FST0);
626 #endif
629 void Assembler::asm_load64(LInsp ins)
631 LIns* base = ins->oprnd1();
632 int db = ins->oprnd2()->constval();
633 Reservation *resv = getresv(ins);
634 Register rr = resv->reg;
636 if (rr != UnknownReg && rmask(rr) & XmmRegs)
638 freeRsrcOf(ins, false);
639 Register rb = getBaseReg(base, db, GpRegs);
640 SSE_LDQ(rr, db, rb);
642 #if defined NANOJIT_AMD64
643 else if (rr != UnknownReg && rmask(rr) & GpRegs)
645 freeRsrcOf(ins, false);
646 Register rb = findRegFor(base, GpRegs);
647 LDQ(rr, db, rb);
649 else
651 int d = disp(resv);
652 Register rb = findRegFor(base, GpRegs);
654 /* We need a temporary register we can move the destination into */
655 rr = registerAlloc(GpRegs);
657 STQ(FP, d, rr);
658 LDQ(rr, db, rb);
660 /* Mark as free */
661 _allocator.addFree(rr);
663 freeRsrcOf(ins, false);
665 #elif defined NANOJIT_IA32
666 else
668 int dr = disp(resv);
669 Register rb;
670 if (base->isop(LIR_alloc)) {
671 rb = FP;
672 db += findMemFor(base);
673 } else {
674 rb = findRegFor(base, GpRegs);
676 resv->reg = UnknownReg;
678 // don't use an fpu reg to simply load & store the value.
679 if (dr)
680 asm_mmq(FP, dr, rb, db);
682 freeRsrcOf(ins, false);
684 if (rr != UnknownReg)
686 NanoAssert(rmask(rr)&FpRegs);
687 _allocator.retire(rr);
688 FLDQ(db, rb);
691 #endif
694 void Assembler::asm_store64(LInsp value, int dr, LInsp base)
696 if (value->isconstq())
698 // if it's a constant 64-bit value, just store it now rather than
699 // generating a pointless store/load/store sequence
700 Register rb;
701 if (base->isop(LIR_alloc)) {
702 rb = FP;
703 dr += findMemFor(base);
704 } else {
705 rb = findRegFor(base, GpRegs);
707 const int32_t* p = (const int32_t*) (value-2);
708 STi(rb, dr+4, p[1]);
709 STi(rb, dr, p[0]);
710 return;
713 #if defined NANOJIT_IA32
714 if (value->isop(LIR_ldq) || value->isop(LIR_ldqc) || value->isop(LIR_qjoin))
716 // value is 64bit struct or int64_t, or maybe a double.
717 // it may be live in an FPU reg. Either way, don't
718 // put it in an FPU reg just to load & store it.
720 // a) if we know it's not a double, this is right.
721 // b) if we guarded that it's a double, this store could be on
722 // the side exit, copying a non-double.
723 // c) maybe it's a double just being stored. oh well.
725 if (config.sse2) {
726 Register rv = findRegFor(value, XmmRegs);
727 Register rb;
728 if (base->isop(LIR_alloc)) {
729 rb = FP;
730 dr += findMemFor(base);
731 } else {
732 rb = findRegFor(base, GpRegs);
734 SSE_STQ(dr, rb, rv);
735 return;
738 int da = findMemFor(value);
739 Register rb;
740 if (base->isop(LIR_alloc)) {
741 rb = FP;
742 dr += findMemFor(base);
743 } else {
744 rb = findRegFor(base, GpRegs);
746 asm_mmq(rb, dr, FP, da);
747 return;
750 Register rb;
751 if (base->isop(LIR_alloc)) {
752 rb = FP;
753 dr += findMemFor(base);
754 } else {
755 rb = findRegFor(base, GpRegs);
758 // if the value is already in a reg, use that; otherwise
759 // try to get it into XMM regs before FPU regs.
760 Reservation* rA = getresv(value);
761 Register rv;
762 int pop = !rA || rA->reg==UnknownReg;
763 if (pop) {
764 rv = findRegFor(value, config.sse2 ? XmmRegs : FpRegs);
765 } else {
766 rv = rA->reg;
769 if (rmask(rv) & XmmRegs) {
770 SSE_STQ(dr, rb, rv);
771 } else {
772 FSTQ(pop, dr, rb);
774 #elif defined NANOJIT_AMD64
775 /* If this is not a float operation, we can use GpRegs instead.
776 * We can do this in a few other cases but for now I'll keep it simple.
778 Register rb = findRegFor(base, GpRegs);
779 Reservation *rV = getresv(value);
781 if (rV != NULL && rV->reg != UnknownReg) {
782 if (rmask(rV->reg) & GpRegs) {
783 STQ(rb, dr, rV->reg);
784 } else {
785 SSE_STQ(dr, rb, rV->reg);
787 } else {
788 Register rv;
790 /* Try to catch some common patterns.
791 * Note: this is a necessity, since in between things like
792 * asm_fop() could see the reservation and try to use a non-SSE
793 * register for adding. Same for asm_qbinop in theory.
794 * There should probably be asserts to catch more cases.
796 if (value->isop(LIR_u2f)
797 || value->isop(LIR_i2f)
798 || (value->opcode() >= LIR_fneg && value->opcode() <= LIR_fmul)
799 || value->opcode() == LIR_fdiv
800 || value->opcode() == LIR_fcall) {
801 rv = findRegFor(value, XmmRegs);
802 SSE_STQ(dr, rb, rv);
803 } else {
804 rv = findRegFor(value, GpRegs);
805 STQ(rb, dr, rv);
808 #endif
812 * copy 64 bits: (rd+dd) <- (rs+ds)
814 void Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
816 // value is either a 64bit struct or maybe a float
817 // that isn't live in an FPU reg. Either way, don't
818 // put it in an FPU reg just to load & store it.
819 #if defined NANOJIT_IA32
820 if (config.sse2)
822 #endif
823 // use SSE to load+store 64bits
824 Register t = registerAlloc(XmmRegs);
825 _allocator.addFree(t);
826 SSE_STQ(dd, rd, t);
827 SSE_LDQ(t, ds, rs);
828 #if defined NANOJIT_IA32
830 else
832 // get a scratch reg
833 Register t = registerAlloc(GpRegs & ~(rmask(rd)|rmask(rs)));
834 _allocator.addFree(t);
835 ST(rd, dd+4, t);
836 LD(t, ds+4, rs);
837 ST(rd, dd, t);
838 LD(t, ds, rs);
840 #endif
843 NIns* Assembler::asm_branch(bool branchOnFalse, LInsp cond, NIns* targ, bool isfar)
845 NIns* at = 0;
846 LOpcode condop = cond->opcode();
847 NanoAssert(cond->isCond());
848 #ifndef NJ_SOFTFLOAT
849 if (condop >= LIR_feq && condop <= LIR_fge)
851 return asm_jmpcc(branchOnFalse, cond, targ);
853 #endif
854 // produce the branch
855 if (branchOnFalse)
857 if (condop == LIR_eq)
858 JNE(targ, isfar);
859 else if (condop == LIR_ov)
860 JNO(targ, isfar);
861 else if (condop == LIR_cs)
862 JNC(targ, isfar);
863 else if (condop == LIR_lt)
864 JNL(targ, isfar);
865 else if (condop == LIR_le)
866 JNLE(targ, isfar);
867 else if (condop == LIR_gt)
868 JNG(targ, isfar);
869 else if (condop == LIR_ge)
870 JNGE(targ, isfar);
871 else if (condop == LIR_ult)
872 JNB(targ, isfar);
873 else if (condop == LIR_ule)
874 JNBE(targ, isfar);
875 else if (condop == LIR_ugt)
876 JNA(targ, isfar);
877 else //if (condop == LIR_uge)
878 JNAE(targ, isfar);
880 else // op == LIR_xt
882 if (condop == LIR_eq)
883 JE(targ, isfar);
884 else if (condop == LIR_ov)
885 JO(targ, isfar);
886 else if (condop == LIR_cs)
887 JC(targ, isfar);
888 else if (condop == LIR_lt)
889 JL(targ, isfar);
890 else if (condop == LIR_le)
891 JLE(targ, isfar);
892 else if (condop == LIR_gt)
893 JG(targ, isfar);
894 else if (condop == LIR_ge)
895 JGE(targ, isfar);
896 else if (condop == LIR_ult)
897 JB(targ, isfar);
898 else if (condop == LIR_ule)
899 JBE(targ, isfar);
900 else if (condop == LIR_ugt)
901 JA(targ, isfar);
902 else //if (condop == LIR_uge)
903 JAE(targ, isfar);
905 at = _nIns;
906 asm_cmp(cond);
907 return at;
910 void Assembler::asm_cmp(LIns *cond)
912 LOpcode condop = cond->opcode();
914 // LIR_ov and LIR_cs recycle the flags set by arithmetic ops
915 if ((condop == LIR_ov) || (condop == LIR_cs))
916 return;
918 LInsp lhs = cond->oprnd1();
919 LInsp rhs = cond->oprnd2();
920 Reservation *rA, *rB;
922 NanoAssert((!lhs->isQuad() && !rhs->isQuad()) || (lhs->isQuad() && rhs->isQuad()));
924 // Not supported yet.
925 #if !defined NANOJIT_64BIT
926 NanoAssert(!lhs->isQuad() && !rhs->isQuad());
927 #endif
929 // ready to issue the compare
930 if (rhs->isconst())
932 int c = rhs->constval();
933 if (c == 0 && cond->isop(LIR_eq)) {
934 Register r = findRegFor(lhs, GpRegs);
935 if (rhs->isQuad()) {
936 #if defined NANOJIT_64BIT
937 TESTQ(r, r);
938 #endif
939 } else {
940 TEST(r,r);
942 // No 64-bit immediates so fall-back to below
944 else if (!rhs->isQuad()) {
945 Register r = getBaseReg(lhs, c, GpRegs);
946 CMPi(r, c);
949 else
951 findRegFor2(GpRegs, lhs, rA, rhs, rB);
952 Register ra = rA->reg;
953 Register rb = rB->reg;
954 if (rhs->isQuad()) {
955 #if defined NANOJIT_64BIT
956 CMPQ(ra, rb);
957 #endif
958 } else {
959 CMP(ra, rb);
964 void Assembler::asm_loop(LInsp ins, NInsList& loopJumps)
966 JMP_long(0);
967 loopJumps.add(_nIns);
969 // If the target we are looping to is in a different fragment, we have to restore
970 // SP since we will target fragEntry and not loopEntry.
971 if (ins->record()->exit->target != _thisfrag)
972 MR(SP,FP);
975 void Assembler::asm_fcond(LInsp ins)
977 // only want certain regs
978 Register r = prepResultReg(ins, AllowableFlagRegs);
979 asm_setcc(r, ins);
980 #ifdef NJ_ARM_VFP
981 SETE(r);
982 #else
983 // SETcc only sets low 8 bits, so extend
984 MOVZX8(r,r);
985 SETNP(r);
986 #endif
987 asm_fcmp(ins);
990 void Assembler::asm_cond(LInsp ins)
992 // only want certain regs
993 LOpcode op = ins->opcode();
994 Register r = prepResultReg(ins, AllowableFlagRegs);
995 // SETcc only sets low 8 bits, so extend
996 MOVZX8(r,r);
997 if (op == LIR_eq)
998 SETE(r);
999 else if (op == LIR_ov)
1000 SETO(r);
1001 else if (op == LIR_cs)
1002 SETC(r);
1003 else if (op == LIR_lt)
1004 SETL(r);
1005 else if (op == LIR_le)
1006 SETLE(r);
1007 else if (op == LIR_gt)
1008 SETG(r);
1009 else if (op == LIR_ge)
1010 SETGE(r);
1011 else if (op == LIR_ult)
1012 SETB(r);
1013 else if (op == LIR_ule)
1014 SETBE(r);
1015 else if (op == LIR_ugt)
1016 SETA(r);
1017 else // if (op == LIR_uge)
1018 SETAE(r);
1019 asm_cmp(ins);
1022 void Assembler::asm_arith(LInsp ins)
1024 LOpcode op = ins->opcode();
1025 LInsp lhs = ins->oprnd1();
1026 LInsp rhs = ins->oprnd2();
1028 Register rb = UnknownReg;
1029 RegisterMask allow = GpRegs;
1030 bool forceReg = (op == LIR_mul || !rhs->isconst());
1032 /* Even if lhs == rhs && forceReg, shift instructions require ECX on the rhs. */
1033 if ((lhs != rhs || (op == LIR_lsh || op == LIR_rsh || op == LIR_ush)) && forceReg)
1035 if ((rb = asm_binop_rhs_reg(ins)) == UnknownReg) {
1036 rb = findRegFor(rhs, allow);
1038 allow &= ~rmask(rb);
1040 else if ((op == LIR_add||op == LIR_addp) && lhs->isop(LIR_alloc) && rhs->isconst()) {
1041 // add alloc+const, use lea
1042 Register rr = prepResultReg(ins, allow);
1043 int d = findMemFor(lhs) + rhs->constval();
1044 LEA(rr, d, FP);
1047 Register rr = prepResultReg(ins, allow);
1048 Reservation* rA = getresv(lhs);
1049 Register ra;
1050 // if this is last use of lhs in reg, we can re-use result reg
1051 if (rA == 0 || (ra = rA->reg) == UnknownReg)
1052 ra = findSpecificRegFor(lhs, rr);
1053 // else, rA already has a register assigned.
1055 if (forceReg)
1057 if (lhs == rhs)
1058 rb = ra;
1060 if (op == LIR_add || op == LIR_addp)
1061 ADD(rr, rb);
1062 else if (op == LIR_sub)
1063 SUB(rr, rb);
1064 else if (op == LIR_mul)
1065 MUL(rr, rb);
1066 else if (op == LIR_and)
1067 AND(rr, rb);
1068 else if (op == LIR_or)
1069 OR(rr, rb);
1070 else if (op == LIR_xor)
1071 XOR(rr, rb);
1072 else if (op == LIR_lsh)
1073 SHL(rr, rb);
1074 else if (op == LIR_rsh)
1075 SAR(rr, rb);
1076 else if (op == LIR_ush)
1077 SHR(rr, rb);
1078 else
1079 NanoAssertMsg(0, "Unsupported");
1081 else
1083 int c = rhs->constval();
1084 if (op == LIR_add || op == LIR_addp) {
1085 #ifdef NANOJIT_IA32_TODO
1086 if (ra != rr) {
1087 // this doesn't set cc's, only use it when cc's not required.
1088 LEA(rr, c, ra);
1089 ra = rr; // suppress mov
1090 } else
1091 #endif
1093 ADDi(rr, c);
1095 } else if (op == LIR_sub) {
1096 #ifdef NANOJIT_IA32
1097 if (ra != rr) {
1098 LEA(rr, -c, ra);
1099 ra = rr;
1100 } else
1101 #endif
1103 SUBi(rr, c);
1105 } else if (op == LIR_and)
1106 ANDi(rr, c);
1107 else if (op == LIR_or)
1108 ORi(rr, c);
1109 else if (op == LIR_xor)
1110 XORi(rr, c);
1111 else if (op == LIR_lsh)
1112 SHLi(rr, c);
1113 else if (op == LIR_rsh)
1114 SARi(rr, c);
1115 else if (op == LIR_ush)
1116 SHRi(rr, c);
1117 else
1118 NanoAssertMsg(0, "Unsupported");
1121 if ( rr != ra )
1122 MR(rr,ra);
1125 void Assembler::asm_neg_not(LInsp ins)
1127 LOpcode op = ins->opcode();
1128 Register rr = prepResultReg(ins, GpRegs);
1130 LIns* lhs = ins->oprnd1();
1131 Reservation *rA = getresv(lhs);
1132 // if this is last use of lhs in reg, we can re-use result reg
1133 Register ra;
1134 if (rA == 0 || (ra=rA->reg) == UnknownReg)
1135 ra = findSpecificRegFor(lhs, rr);
1136 // else, rA already has a register assigned.
1138 if (op == LIR_not)
1139 NOT(rr);
1140 else
1141 NEG(rr);
1143 if ( rr != ra )
1144 MR(rr,ra);
1147 void Assembler::asm_ld(LInsp ins)
1149 LOpcode op = ins->opcode();
1150 LIns* base = ins->oprnd1();
1151 LIns* disp = ins->oprnd2();
1152 Register rr = prepResultReg(ins, GpRegs);
1153 int d = disp->constval();
1155 #ifdef NANOJIT_IA32
1156 /* Can't use this on AMD64, no 64-bit immediate addresses. */
1157 if (base->isconst()) {
1158 intptr_t addr = base->constval();
1159 addr += d;
1160 if (op == LIR_ldcb)
1161 LD8Zdm(rr, addr);
1162 else if (op == LIR_ldcs)
1163 LD16Zdm(rr, addr);
1164 else
1165 LDdm(rr, addr);
1166 return;
1169 /* :TODO: Use this on AMD64 as well. */
1170 /* Search for add(X,Y) */
1171 if (base->opcode() == LIR_piadd) {
1172 int scale = 0;
1173 LIns *lhs = base->oprnd1();
1174 LIns *rhs = base->oprnd2();
1176 /* See if we can bypass any SHLs, by searching for
1177 * add(X, shl(Y,Z)) -> mov r, [X+Y*Z]
1179 if (rhs->opcode() == LIR_pilsh && rhs->oprnd2()->isconst()) {
1180 scale = rhs->oprnd2()->constval();
1181 if (scale >= 1 && scale <= 3)
1182 rhs = rhs->oprnd1();
1183 else
1184 scale = 0;
1187 Register rleft;
1188 Reservation *rL = getresv(lhs);
1190 /* Does LHS have a register yet? If not, re-use the result reg.
1191 * :TODO: If LHS is const, we could eliminate a register use.
1193 if (rL == NULL || rL->reg == UnknownReg)
1194 rleft = findSpecificRegFor(lhs, rr);
1195 else
1196 rleft = rL->reg;
1198 Register rright = UnknownReg;
1199 Reservation *rR = getresv(rhs);
1201 /* Does RHS have a register yet? If not, try to re-use the result reg. */
1202 if (rr != rleft && (rR == NULL || rR->reg == UnknownReg))
1203 rright = findSpecificRegFor(rhs, rr);
1204 if (rright == UnknownReg)
1205 rright = findRegFor(rhs, GpRegs & ~(rmask(rleft)));
1207 if (op == LIR_ldcb)
1208 LD8Zsib(rr, d, rleft, rright, scale);
1209 else if (op == LIR_ldcs)
1210 LD16Zsib(rr, d, rleft, rright, scale);
1211 else
1212 LDsib(rr, d, rleft, rright, scale);
1214 return;
1216 #endif
1218 Register ra = getBaseReg(base, d, GpRegs);
1219 if (op == LIR_ldcb)
1220 LD8Z(rr, d, ra);
1221 else if (op == LIR_ldcs)
1222 LD16Z(rr, d, ra);
1223 else
1224 LD(rr, d, ra);
1227 void Assembler::asm_cmov(LInsp ins)
1229 LOpcode op = ins->opcode();
1230 LIns* condval = ins->oprnd1();
1231 NanoAssert(condval->isCmp());
1233 LIns* values = ins->oprnd2();
1235 NanoAssert(values->opcode() == LIR_2);
1236 LIns* iftrue = values->oprnd1();
1237 LIns* iffalse = values->oprnd2();
1239 NanoAssert(op == LIR_qcmov || (!iftrue->isQuad() && !iffalse->isQuad()));
1241 const Register rr = prepResultReg(ins, GpRegs);
1243 // this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
1244 // (This is true on Intel, is it true on all architectures?)
1245 const Register iffalsereg = findRegFor(iffalse, GpRegs & ~rmask(rr));
1246 if (op == LIR_cmov) {
1247 switch (condval->opcode())
1249 // note that these are all opposites...
1250 case LIR_eq: MRNE(rr, iffalsereg); break;
1251 case LIR_ov: MRNO(rr, iffalsereg); break;
1252 case LIR_cs: MRNC(rr, iffalsereg); break;
1253 case LIR_lt: MRGE(rr, iffalsereg); break;
1254 case LIR_le: MRG(rr, iffalsereg); break;
1255 case LIR_gt: MRLE(rr, iffalsereg); break;
1256 case LIR_ge: MRL(rr, iffalsereg); break;
1257 case LIR_ult: MRAE(rr, iffalsereg); break;
1258 case LIR_ule: MRA(rr, iffalsereg); break;
1259 case LIR_ugt: MRBE(rr, iffalsereg); break;
1260 case LIR_uge: MRB(rr, iffalsereg); break;
1261 debug_only( default: NanoAssert(0); break; )
1263 } else if (op == LIR_qcmov) {
1264 #if !defined NANOJIT_64BIT
1265 NanoAssert(0);
1266 #else
1267 switch (condval->opcode())
1269 // note that these are all opposites...
1270 case LIR_eq: MRQNE(rr, iffalsereg); break;
1271 case LIR_ov: MRQNO(rr, iffalsereg); break;
1272 case LIR_cs: MRQNC(rr, iffalsereg); break;
1273 case LIR_lt: MRQGE(rr, iffalsereg); break;
1274 case LIR_le: MRQG(rr, iffalsereg); break;
1275 case LIR_gt: MRQLE(rr, iffalsereg); break;
1276 case LIR_ge: MRQL(rr, iffalsereg); break;
1277 case LIR_ult: MRQAE(rr, iffalsereg); break;
1278 case LIR_ule: MRQA(rr, iffalsereg); break;
1279 case LIR_ugt: MRQBE(rr, iffalsereg); break;
1280 case LIR_uge: MRQB(rr, iffalsereg); break;
1281 debug_only( default: NanoAssert(0); break; )
1283 #endif
1285 /*const Register iftruereg =*/ findSpecificRegFor(iftrue, rr);
1286 asm_cmp(condval);
1289 void Assembler::asm_qhi(LInsp ins)
1291 Register rr = prepResultReg(ins, GpRegs);
1292 LIns *q = ins->oprnd1();
1293 int d = findMemFor(q);
1294 LD(rr, d+4, FP);
1297 void Assembler::asm_param(LInsp ins)
1299 uint32_t a = ins->imm8();
1300 uint32_t kind = ins->imm8b();
1301 if (kind == 0) {
1302 // ordinary param
1303 AbiKind abi = _thisfrag->lirbuf->abi;
1304 uint32_t abi_regcount = max_abi_regs[abi];
1305 if (a < abi_regcount) {
1306 // incoming arg in register
1307 prepResultReg(ins, rmask(argRegs[a]));
1308 } else {
1309 // incoming arg is on stack, and EBP points nearby (see genPrologue)
1310 Register r = prepResultReg(ins, GpRegs);
1311 int d = (a - abi_regcount) * sizeof(intptr_t) + 8;
1312 LD(r, d, FP);
1315 else {
1316 // saved param
1317 prepResultReg(ins, rmask(savedRegs[a]));
1321 void Assembler::asm_short(LInsp ins)
1323 Register rr = prepResultReg(ins, GpRegs);
1324 int32_t val = ins->imm16();
1325 if (val == 0)
1326 XOR(rr,rr);
1327 else
1328 LDi(rr, val);
1331 void Assembler::asm_int(LInsp ins)
1333 Register rr = prepResultReg(ins, GpRegs);
1334 int32_t val = ins->imm32();
1335 if (val == 0)
1336 XOR(rr,rr);
1337 else
1338 LDi(rr, val);
1341 void Assembler::asm_quad(LInsp ins)
1343 #if defined NANOJIT_IA32
1344 Reservation *rR = getresv(ins);
1345 Register rr = rR->reg;
1346 if (rr != UnknownReg)
1348 // @todo -- add special-cases for 0 and 1
1349 _allocator.retire(rr);
1350 rR->reg = UnknownReg;
1351 NanoAssert((rmask(rr) & FpRegs) != 0);
1353 const double d = ins->constvalf();
1354 const uint64_t q = ins->constvalq();
1355 if (rmask(rr) & XmmRegs) {
1356 if (q == 0.0) {
1357 // test (int64)0 since -0.0 == 0.0
1358 SSE_XORPDr(rr, rr);
1359 } else if (d == 1.0) {
1360 // 1.0 is extremely frequent and worth special-casing!
1361 static const double k_ONE = 1.0;
1362 LDSDm(rr, &k_ONE);
1363 } else {
1364 findMemFor(ins);
1365 const int d = disp(rR);
1366 SSE_LDQ(rr, d, FP);
1368 } else {
1369 if (q == 0.0) {
1370 // test (int64)0 since -0.0 == 0.0
1371 FLDZ();
1372 } else if (d == 1.0) {
1373 FLD1();
1374 } else {
1375 findMemFor(ins);
1376 int d = disp(rR);
1377 FLDQ(d,FP);
1382 // @todo, if we used xor, ldsd, fldz, etc above, we don't need mem here
1383 int d = disp(rR);
1384 freeRsrcOf(ins, false);
1385 if (d)
1387 const int32_t* p = (const int32_t*) (ins-2);
1388 STi(FP,d+4,p[1]);
1389 STi(FP,d,p[0]);
1391 #elif defined NANOJIT_AMD64
1392 Reservation *rR = getresv(ins);
1393 int64_t val = *(int64_t *)(ins - 2);
1395 if (rR->reg != UnknownReg)
1397 if (rmask(rR->reg) & GpRegs)
1399 LDQi(rR->reg, val);
1401 else if (rmask(rR->reg) & XmmRegs)
1403 if (ins->constvalf() == 0.0)
1405 SSE_XORPDr(rR->reg, rR->reg);
1407 else
1409 /* Get a short-lived register, not associated with instruction */
1410 Register rd = rR->reg;
1411 Register rs = registerAlloc(GpRegs);
1413 SSE_MOVD(rd, rs);
1414 LDQi(rs, val);
1416 _allocator.addFree(rs);
1420 else
1422 const int32_t* p = (const int32_t*) (ins-2);
1423 int dr = disp(rR);
1424 STi(FP, dr+4, p[1]);
1425 STi(FP, dr, p[0]);
1428 freeRsrcOf(ins, false);
1429 #endif
1432 void Assembler::asm_qlo(LInsp ins)
1434 LIns *q = ins->oprnd1();
1436 #if defined NANOJIT_IA32
1437 if (!config.sse2)
1439 Register rr = prepResultReg(ins, GpRegs);
1440 int d = findMemFor(q);
1441 LD(rr, d, FP);
1443 else
1444 #endif
1446 Reservation *resv = getresv(ins);
1447 Register rr = resv->reg;
1448 if (rr == UnknownReg) {
1449 // store quad in spill loc
1450 int d = disp(resv);
1451 freeRsrcOf(ins, false);
1452 Register qr = findRegFor(q, XmmRegs);
1453 SSE_MOVDm(d, FP, qr);
1454 } else {
1455 freeRsrcOf(ins, false);
1456 Register qr = findRegFor(q, XmmRegs);
1457 SSE_MOVD(rr,qr);
1462 void Assembler::asm_fneg(LInsp ins)
1464 #if defined NANOJIT_IA32
1465 if (config.sse2)
1467 #endif
1468 LIns *lhs = ins->oprnd1();
1470 Register rr = prepResultReg(ins, XmmRegs);
1471 Reservation *rA = getresv(lhs);
1472 Register ra;
1474 // if this is last use of lhs in reg, we can re-use result reg
1475 if (rA == 0 || (ra = rA->reg) == UnknownReg) {
1476 ra = findSpecificRegFor(lhs, rr);
1477 } else if ((rmask(ra) & XmmRegs) == 0) {
1478 /* We need this case on AMD64, because it's possible that
1479 * an earlier instruction has done a quadword load and reserved a
1480 * GPR. If so, ask for a new register.
1482 ra = findRegFor(lhs, XmmRegs);
1484 // else, rA already has a register assigned.
1486 #if defined __SUNPRO_CC
1487 // from Sun Studio C++ Readme: #pragma align inside namespace requires mangled names
1488 static uint32_t temp[] = {0, 0, 0, 0, 0, 0, 0};
1489 static uint32_t *negateMask = (uint32_t *)alignUp(temp, 16);
1490 negateMask[1] = 0x80000000;
1491 #else
1492 static const AVMPLUS_ALIGN16(uint32_t) negateMask[] = {0,0x80000000,0,0};
1493 #endif
1494 SSE_XORPD(rr, negateMask);
1496 if (rr != ra)
1497 SSE_MOVSD(rr, ra);
1498 #if defined NANOJIT_IA32
1500 else
1502 Register rr = prepResultReg(ins, FpRegs);
1504 LIns* lhs = ins->oprnd1();
1506 // lhs into reg, prefer same reg as result
1507 Reservation* rA = getresv(lhs);
1508 // if this is last use of lhs in reg, we can re-use result reg
1509 if (rA == 0 || rA->reg == UnknownReg)
1510 findSpecificRegFor(lhs, rr);
1511 // else, rA already has a different reg assigned
1513 NanoAssert(getresv(lhs)!=0 && getresv(lhs)->reg==FST0);
1514 // assume that the lhs is in ST(0) and rhs is on stack
1515 FCHS();
1517 // if we had more than one fpu reg, this is where
1518 // we would move ra into rr if rr != ra.
1520 #endif
1523 void Assembler::asm_arg(ArgSize sz, LInsp p, Register r)
1525 if (sz == ARGSIZE_Q)
1527 // ref arg - use lea
1528 if (r != UnknownReg)
1530 // arg in specific reg
1531 int da = findMemFor(p);
1532 LEA(r, da, FP);
1534 else
1536 NanoAssert(0); // not supported
1539 else if (sz == ARGSIZE_LO)
1541 if (r != UnknownReg) {
1542 // arg goes in specific register
1543 if (p->isconst()) {
1544 LDi(r, p->constval());
1545 } else {
1546 Reservation* rA = getresv(p);
1547 if (rA) {
1548 if (rA->reg == UnknownReg) {
1549 // load it into the arg reg
1550 int d = findMemFor(p);
1551 if (p->isop(LIR_alloc)) {
1552 LEA(r, d, FP);
1553 } else {
1554 LD(r, d, FP);
1556 } else {
1557 // it must be in a saved reg
1558 MR(r, rA->reg);
1561 else {
1562 // this is the last use, so fine to assign it
1563 // to the scratch reg, it's dead after this point.
1564 findSpecificRegFor(p, r);
1568 else {
1569 asm_pusharg(p);
1572 else
1574 NanoAssert(sz == ARGSIZE_F);
1575 asm_farg(p);
1579 void Assembler::asm_pusharg(LInsp p)
1581 // arg goes on stack
1582 Reservation* rA = getresv(p);
1583 if (rA == 0 && p->isconst())
1585 // small const we push directly
1586 PUSHi(p->constval());
1588 else if (rA == 0 || p->isop(LIR_alloc))
1590 Register ra = findRegFor(p, GpRegs);
1591 PUSHr(ra);
1593 else if (rA->reg == UnknownReg)
1595 PUSHm(disp(rA), FP);
1597 else
1599 PUSHr(rA->reg);
1603 void Assembler::asm_farg(LInsp p)
1605 #if defined NANOJIT_IA32
1606 NanoAssert(p->isQuad());
1607 Register r = findRegFor(p, FpRegs);
1608 if (rmask(r) & XmmRegs) {
1609 SSE_STQ(0, SP, r);
1610 } else {
1611 FSTPQ(0, SP);
1612 /* It's possible that the same LIns* with r=FST0 will appear in the argument list more
1613 * than once. In this case FST0 will not have been evicted and the multiple pop
1614 * actions will unbalance the FPU stack. A quick fix is to always evict FST0 manually.
1616 evict(FST0);
1618 SUBi(ESP,8);
1619 //PUSHr(ECX); // 2*pushr is smaller than sub
1620 //PUSHr(ECX);
1621 #endif
1624 void Assembler::asm_fop(LInsp ins)
1626 LOpcode op = ins->opcode();
1627 #if defined NANOJIT_IA32
1628 if (config.sse2)
1630 #endif
1631 LIns *lhs = ins->oprnd1();
1632 LIns *rhs = ins->oprnd2();
1634 RegisterMask allow = XmmRegs;
1635 Register rb = UnknownReg;
1636 if (lhs != rhs) {
1637 rb = findRegFor(rhs,allow);
1638 allow &= ~rmask(rb);
1641 Register rr = prepResultReg(ins, allow);
1642 Reservation *rA = getresv(lhs);
1643 Register ra;
1645 // if this is last use of lhs in reg, we can re-use result reg
1646 if (rA == 0 || (ra = rA->reg) == UnknownReg) {
1647 ra = findSpecificRegFor(lhs, rr);
1648 } else if ((rmask(ra) & XmmRegs) == 0) {
1649 /* We need this case on AMD64, because it's possible that
1650 * an earlier instruction has done a quadword load and reserved a
1651 * GPR. If so, ask for a new register.
1653 ra = findRegFor(lhs, XmmRegs);
1655 else {
1656 // rA already has a register assigned but maybe not from the allow set
1657 ra = findRegFor(lhs, allow);
1660 if (lhs == rhs)
1661 rb = ra;
1663 if (op == LIR_fadd)
1664 SSE_ADDSD(rr, rb);
1665 else if (op == LIR_fsub)
1666 SSE_SUBSD(rr, rb);
1667 else if (op == LIR_fmul)
1668 SSE_MULSD(rr, rb);
1669 else //if (op == LIR_fdiv)
1670 SSE_DIVSD(rr, rb);
1672 if (rr != ra)
1673 SSE_MOVSD(rr, ra);
1674 #if defined NANOJIT_IA32
1676 else
1678 // we swap lhs/rhs on purpose here, works out better
1679 // if you only have one fpu reg. use divr/subr.
1680 LIns* rhs = ins->oprnd1();
1681 LIns* lhs = ins->oprnd2();
1682 Register rr = prepResultReg(ins, rmask(FST0));
1684 // make sure rhs is in memory
1685 int db = findMemFor(rhs);
1687 // lhs into reg, prefer same reg as result
1688 Reservation* rA = getresv(lhs);
1689 // last use of lhs in reg, can reuse rr
1690 if (rA == 0 || rA->reg == UnknownReg)
1691 findSpecificRegFor(lhs, rr);
1692 // else, rA already has a different reg assigned
1694 NanoAssert(getresv(lhs)!=0 && getresv(lhs)->reg==FST0);
1695 // assume that the lhs is in ST(0) and rhs is on stack
1696 if (op == LIR_fadd)
1697 { FADD(db, FP); }
1698 else if (op == LIR_fsub)
1699 { FSUBR(db, FP); }
1700 else if (op == LIR_fmul)
1701 { FMUL(db, FP); }
1702 else if (op == LIR_fdiv)
1703 { FDIVR(db, FP); }
1705 #endif
1708 void Assembler::asm_i2f(LInsp ins)
1710 // where our result goes
1711 Register rr = prepResultReg(ins, FpRegs);
1712 #if defined NANOJIT_IA32
1713 if (rmask(rr) & XmmRegs)
1715 #endif
1716 // todo support int value in memory
1717 Register gr = findRegFor(ins->oprnd1(), GpRegs);
1718 SSE_CVTSI2SD(rr, gr);
1719 #if defined NANOJIT_IA32
1721 else
1723 int d = findMemFor(ins->oprnd1());
1724 FILD(d, FP);
1726 #endif
1729 Register Assembler::asm_prep_fcall(Reservation *rR, LInsp ins)
1731 #if defined NANOJIT_IA32
1732 if (rR) {
1733 Register rr;
1734 if ((rr=rR->reg) != UnknownReg && (rmask(rr) & XmmRegs))
1735 evict(rr);
1737 return prepResultReg(ins, rmask(FST0));
1738 #elif defined NANOJIT_AMD64
1739 evict(RAX);
1740 return prepResultReg(ins, rmask(XMM0));
1741 #endif
1744 void Assembler::asm_u2f(LInsp ins)
1746 // where our result goes
1747 Register rr = prepResultReg(ins, FpRegs);
1748 #if defined NANOJIT_IA32
1749 if (rmask(rr) & XmmRegs)
1751 #endif
1752 // don't call findRegFor, we want a reg we can stomp on for a very short time,
1753 // not a reg that will continue to be associated with the LIns
1754 Register gr = registerAlloc(GpRegs);
1756 // technique inspired by gcc disassembly
1757 // Edwin explains it:
1759 // gr is 0..2^32-1
1761 // sub gr,0x80000000
1763 // now gr is -2^31..2^31-1, i.e. the range of int, but not the same value
1764 // as before
1766 // cvtsi2sd rr,gr
1768 // rr is now a double with the int value range
1770 // addsd rr, 2147483648.0
1772 // adding back double(0x80000000) makes the range 0..2^32-1.
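// Concrete trace of those steps (input chosen only for illustration):
//   start:              gr = 0xFFFFFFFF      (4294967295 as unsigned)
//   sub gr,0x80000000:  gr = 0x7FFFFFFF      (2147483647 as signed)
//   cvtsi2sd rr,gr:     rr = 2147483647.0
//   addsd rr,2^31:      rr = 4294967295.0 == double(original input)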
1774 static const double k_NEGONE = 2147483648.0;
1775 #if defined NANOJIT_IA32
1776 SSE_ADDSDm(rr, &k_NEGONE);
1777 #elif defined NANOJIT_AMD64
1778 /* Squirrel the constant at the bottom of the page. */
1779 if (_dblNegPtr != NULL)
1781 underrunProtect(10);
1783 if (_dblNegPtr == NULL)
1785 underrunProtect(30);
1786 uint8_t *base, *begin;
1787 base = (uint8_t *)((intptr_t)_nIns & ~((intptr_t)NJ_PAGE_SIZE-1));
1788 base += sizeof(PageHeader) + _pageData;
1789 begin = base;
1790 /* Make sure we align */
1791 if ((uintptr_t)base & 0xF) {
1792 base = (NIns *)((uintptr_t)base & ~(0xF));
1793 base += 16;
1795 _pageData += (int32_t)(base - begin) + sizeof(double);
1796 _negOnePtr = (NIns *)base;
1797 *(double *)_negOnePtr = k_NEGONE;
1799 SSE_ADDSDm(rr, _negOnePtr);
1800 #endif
1802 SSE_CVTSI2SD(rr, gr);
1804 Reservation* resv = getresv(ins->oprnd1());
1805 Register xr;
1806 if (resv && (xr = resv->reg) != UnknownReg && (rmask(xr) & GpRegs))
1808 LEA(gr, 0x80000000, xr);
1810 else
1812 const int d = findMemFor(ins->oprnd1());
1813 SUBi(gr, 0x80000000);
1814 LD(gr, d, FP);
1817 // ok, we're done with it
1818 _allocator.addFree(gr);
1819 #if defined NANOJIT_IA32
1821 else
1823 const int disp = -8;
1824 const Register base = SP;
1825 Register gr = findRegFor(ins->oprnd1(), GpRegs);
1826 NanoAssert(rr == FST0);
1827 FILDQ(disp, base);
1828 STi(base, disp+4, 0); // high 32 bits = 0
1829 ST(base, disp, gr); // low 32 bits = unsigned value
1831 #endif
1834 void Assembler::asm_nongp_copy(Register r, Register s)
1836 if ((rmask(r) & XmmRegs) && (rmask(s) & XmmRegs)) {
1837 SSE_MOVSD(r, s);
1838 } else if ((rmask(r) & GpRegs) && (rmask(s) & XmmRegs)) {
1839 SSE_MOVD(r, s);
1840 } else {
1841 if (rmask(r) & XmmRegs) {
1842 // x87 -> xmm
1843 NanoAssertMsg(false, "Should not move data from GPR to XMM");
1844 } else {
1845 // xmm -> x87
1846 NanoAssertMsg(false, "Should not move data from GPR/XMM to x87 FPU");
1851 NIns * Assembler::asm_jmpcc(bool branchOnFalse, LIns *cond, NIns *targ)
1853 LOpcode c = cond->opcode();
1854 if (config.sse2 && c != LIR_feq) {
1855 LIns *lhs = cond->oprnd1();
1856 LIns *rhs = cond->oprnd2();
1857 if (c == LIR_flt) {
1858 LIns *t = lhs; lhs = rhs; rhs = t;
1859 c = LIR_fgt;
1861 else if (c == LIR_fle) {
1862 LIns *t = lhs; lhs = rhs; rhs = t;
1863 c = LIR_fge;
1866 if (c == LIR_fgt) {
1867 if (branchOnFalse) { JNA(targ, false); } else { JA(targ, false); }
1869 else { // if (c == LIR_fge)
1870 if (branchOnFalse) { JNAE(targ, false); } else { JAE(targ, false); }
1872 NIns *at = _nIns;
1873 Reservation *rA, *rB;
1874 findRegFor2(XmmRegs, lhs, rA, rhs, rB);
1875 SSE_UCOMISD(rA->reg, rB->reg);
1876 return at;
1879 if (branchOnFalse)
1880 JP(targ, false);
1881 else
1882 JNP(targ, false);
1883 NIns *at = _nIns;
1884 asm_fcmp(cond);
1885 return at;
1888 void Assembler::asm_setcc(Register r, LIns *cond)
1890 LOpcode c = cond->opcode();
1891 if (config.sse2 && c != LIR_feq) {
1892 MOVZX8(r,r);
1893 LIns *lhs = cond->oprnd1();
1894 LIns *rhs = cond->oprnd2();
1895 if (c == LIR_flt) {
1896 LIns *t = lhs; lhs = rhs; rhs = t;
1897 SETA(r);
1899 else if (c == LIR_fle) {
1900 LIns *t = lhs; lhs = rhs; rhs = t;
1901 SETAE(r);
1903 else if (c == LIR_fgt) {
1904 SETA(r);
1906 else { // if (c == LIR_fge)
1907 SETAE(r);
1909 Reservation *rA, *rB;
1910 findRegFor2(XmmRegs, lhs, rA, rhs, rB);
1911 SSE_UCOMISD(rA->reg, rB->reg);
1912 return;
1914 // SETcc only sets low 8 bits, so extend
1915 MOVZX8(r,r);
1916 SETNP(r);
1917 asm_fcmp(cond);
1920 void Assembler::asm_fcmp(LIns *cond)
1922 LOpcode condop = cond->opcode();
1923 NanoAssert(condop >= LIR_feq && condop <= LIR_fge);
1924 LIns* lhs = cond->oprnd1();
1925 LIns* rhs = cond->oprnd2();
1927 int mask;
1928 if (condop == LIR_feq)
1929 mask = 0x44;
1930 else if (condop == LIR_fle)
1931 mask = 0x41;
1932 else if (condop == LIR_flt)
1933 mask = 0x05;
1934 else if (condop == LIR_fge) {
1935 // swap, use le
1936 condop = LIR_fle;
1937 LIns* t = lhs; lhs = rhs; rhs = t;
1938 mask = 0x41;
1939 } else { // if (condop == LIR_fgt)
1940 // swap, use lt
1941 condop = LIR_flt;
1942 LIns* t = lhs; lhs = rhs; rhs = t;
1943 mask = 0x05;
1946 #if defined NANOJIT_IA32
1947 if (config.sse2)
1949 #endif
1950 // UNORDERED: ZF,PF,CF <- 111;
1951 // GREATER_THAN: ZF,PF,CF <- 000;
1952 // LESS_THAN: ZF,PF,CF <- 001;
1953 // EQUAL: ZF,PF,CF <- 100;
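// A sketch of how those masks turn into a boolean, assuming the standard
// x86 flag layout (ZF, PF, CF are bits 6, 2, 0 of the flags byte): LAHF
// (or PUSHFQ/POP RAX on AMD64) exposes that byte, and TEST against the
// mask sets PF to "even number of set bits in (flags & mask)".  Callers
// then use SETNP/JNP, so the condition reads as true exactly when
// flags & mask has odd parity.  For LIR_feq (mask 0x44 = ZF|PF):
// equal -> 0x40 (odd -> true); unordered -> 0x44, greater/less -> 0x00
// (even -> false).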
1955 if (condop == LIR_feq && lhs == rhs) {
1956 // nan check
1957 Register r = findRegFor(lhs, XmmRegs);
1958 SSE_UCOMISD(r, r);
1960 else {
1961 #if defined NANOJIT_IA32
1962 evict(EAX);
1963 TEST_AH(mask);
1964 LAHF();
1965 #elif defined NANOJIT_AMD64
1966 evict(RAX);
1967 TEST_AL(mask);
1968 POPr(RAX);
1969 PUSHFQ();
1970 #endif
1971 Reservation *rA, *rB;
1972 findRegFor2(XmmRegs, lhs, rA, rhs, rB);
1973 SSE_UCOMISD(rA->reg, rB->reg);
1975 #if defined NANOJIT_IA32
1977 else
1979 evict(EAX);
1980 TEST_AH(mask);
1981 FNSTSW_AX();
1982 NanoAssert(lhs->isQuad() && rhs->isQuad());
1983 Reservation *rA;
1984 if (lhs != rhs)
1986 // compare two different numbers
1987 int d = findMemFor(rhs);
1988 rA = getresv(lhs);
1989 int pop = !rA || rA->reg == UnknownReg;
1990 findSpecificRegFor(lhs, FST0);
1991 // lhs is in ST(0) and rhs is on stack
1992 FCOM(pop, d, FP);
1994 else
1996 // compare n to itself, this is a NaN test.
1997 rA = getresv(lhs);
1998 int pop = !rA || rA->reg == UnknownReg;
1999 findSpecificRegFor(lhs, FST0);
2000 // value in ST(0)
2001 if (pop)
2002 FCOMPP();
2003 else
2004 FCOMP();
2005 FLDr(FST0); // DUP
2008 #endif
2011 void Assembler::nativePageReset()
2013 #if defined NANOJIT_AMD64
2014 /* We store some stuff at the bottom of the page.
2015 * We reserve 8-bytes for long jumps just in case we need them.
2017 _pageData = 0;
2018 _dblNegPtr = NULL;
2019 _negOnePtr = NULL;
2020 #endif
2023 Register Assembler::asm_binop_rhs_reg(LInsp ins)
2025 LOpcode op = ins->opcode();
2026 LIns *rhs = ins->oprnd2();
2028 if (op == LIR_lsh || op == LIR_rsh || op == LIR_ush) {
2029 #if defined NANOJIT_IA32
2030 return findSpecificRegFor(rhs, ECX);
2031 #elif defined NANOJIT_AMD64
2032 return findSpecificRegFor(rhs, RCX);
2033 #endif
2036 return UnknownReg;
2039 #if defined NANOJIT_AMD64
2040 void Assembler::asm_qbinop(LIns *ins)
2042 LInsp lhs = ins->oprnd1();
2043 LInsp rhs = ins->oprnd2();
2044 LOpcode op = ins->opcode();
2046 Register rr = prepResultReg(ins, GpRegs);
2047 Reservation *rA = getresv(lhs);
2048 Register ra;
2050 if (rA == NULL || (ra = rA->reg) == UnknownReg) {
2051 ra = findSpecificRegFor(lhs, rr);
2054 if (rhs->isconst())
2056 int c = rhs->constval();
2058 if (op == LIR_qiadd)
2060 ADDQi(rr, c);
2061 } else if (op == LIR_qiand) {
2062 ANDQi(rr, c);
2063 } else if (op == LIR_qilsh) {
2064 SHLQi(rr, c);
2065 } else if (op == LIR_qior) {
2066 ORQi(rr, c);
2068 } else {
2069 Register rv;
2071 if (lhs == rhs) {
2072 rv = ra;
2073 } else {
2074 rv = findRegFor(rhs, GpRegs & ~(rmask(rr)));
2077 if (op == LIR_qiadd) {
2078 ADDQ(rr, rv);
2079 } else if (op == LIR_qiand) {
2080 ANDQ(rr, rv);
2081 } else if (op == LIR_qior) {
2082 ORQ(rr, rv);
2083 } else {
2084 NanoAssert(rhs->isconst());
2088 if (rr != ra) {
2089 MR(rr, ra);
2092 #endif
2094 void Assembler::nativePageSetup()
2096 if (!_nIns) _nIns = pageAlloc();
2097 if (!_nExitIns) _nExitIns = pageAlloc(true);
2100 // enough room for n bytes
2101 void Assembler::underrunProtect(int n)
2103 NanoAssertMsg(n<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
2104 NIns *eip = this->_nIns;
2105 Page *p = (Page*)pageTop(eip-1);
2106 NIns *top = (NIns*) &p->code[0];
2107 if (eip - n < top) {
2108 _nIns = pageAlloc(_inExit);
2109 JMP(eip);
2113 #endif /* FEATURE_NANOJIT */