Fixed some bugs.
[llvm/zpu.git] / lib / Target / X86 / Disassembler / X86DisassemblerDecoder.c
blob1fd66853f75e8514824a81f527e4dd9c8291b712
/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==*
 *
 *                     The LLVM Compiler Infrastructure
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *
 *===----------------------------------------------------------------------===*
 *
 * This file is part of the X86 Disassembler.
 * It contains the implementation of the instruction decoder.
 * Documentation for the disassembler can be found in X86Disassembler.h.
 *
 *===----------------------------------------------------------------------===*/
16 #include <stdarg.h> /* for va_*() */
17 #include <stdio.h> /* for vsnprintf() */
18 #include <stdlib.h> /* for exit() */
19 #include <string.h> /* for memset() */
21 #include "X86DisassemblerDecoder.h"
23 #include "X86GenDisassemblerTables.inc"
/* Boolean constants used throughout the decoder. */
#define TRUE  1
#define FALSE 0

/*
 * NOTE(review): the functions in this file spell their boolean type BOOL,
 * which is not defined here - confirm whether this lowercase typedef is still
 * needed.
 */
typedef int8_t bool;

/*
 * debug - Reports an internal-error message, together with the source location
 *   of the report, through x86DisassemblerDebug().  Expands to an empty
 *   statement when NDEBUG is defined.
 *
 * @param s - The message to report.
 */
#ifdef NDEBUG
#define debug(s) do { } while (0)
#else
#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
#endif
38 * contextForAttrs - Client for the instruction context table. Takes a set of
39 * attributes and returns the appropriate decode context.
41 * @param attrMask - Attributes, from the enumeration attributeBits.
42 * @return - The InstructionContext to use when looking up an
43 * an instruction with these attributes.
45 static InstructionContext contextForAttrs(uint8_t attrMask) {
46 return CONTEXTS_SYM[attrMask];
50 * modRMRequired - Reads the appropriate instruction table to determine whether
51 * the ModR/M byte is required to decode a particular instruction.
53 * @param type - The opcode type (i.e., how many bytes it has).
54 * @param insnContext - The context for the instruction, as returned by
55 * contextForAttrs.
56 * @param opcode - The last byte of the instruction's opcode, not counting
57 * ModR/M extensions and escapes.
58 * @return - TRUE if the ModR/M byte is required, FALSE otherwise.
60 static int modRMRequired(OpcodeType type,
61 InstructionContext insnContext,
62 uint8_t opcode) {
63 const struct ContextDecision* decision = 0;
65 switch (type) {
66 case ONEBYTE:
67 decision = &ONEBYTE_SYM;
68 break;
69 case TWOBYTE:
70 decision = &TWOBYTE_SYM;
71 break;
72 case THREEBYTE_38:
73 decision = &THREEBYTE38_SYM;
74 break;
75 case THREEBYTE_3A:
76 decision = &THREEBYTE3A_SYM;
77 break;
80 return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
81 modrm_type != MODRM_ONEENTRY;
83 return 0;
87 * decode - Reads the appropriate instruction table to obtain the unique ID of
88 * an instruction.
90 * @param type - See modRMRequired().
91 * @param insnContext - See modRMRequired().
92 * @param opcode - See modRMRequired().
93 * @param modRM - The ModR/M byte if required, or any value if not.
94 * @return - The UID of the instruction, or 0 on failure.
96 static InstrUID decode(OpcodeType type,
97 InstructionContext insnContext,
98 uint8_t opcode,
99 uint8_t modRM) {
100 const struct ModRMDecision* dec;
102 switch (type) {
103 default:
104 debug("Unknown opcode type");
105 return 0;
106 case ONEBYTE:
107 dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
108 break;
109 case TWOBYTE:
110 dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
111 break;
112 case THREEBYTE_38:
113 dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
114 break;
115 case THREEBYTE_3A:
116 dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
117 break;
120 switch (dec->modrm_type) {
121 default:
122 debug("Corrupt table! Unknown modrm_type");
123 return 0;
124 case MODRM_ONEENTRY:
125 return dec->instructionIDs[0];
126 case MODRM_SPLITRM:
127 if (modFromModRM(modRM) == 0x3)
128 return dec->instructionIDs[1];
129 else
130 return dec->instructionIDs[0];
131 case MODRM_FULL:
132 return dec->instructionIDs[modRM];
137 * specifierForUID - Given a UID, returns the name and operand specification for
138 * that instruction.
140 * @param uid - The unique ID for the instruction. This should be returned by
141 * decode(); specifierForUID will not check bounds.
142 * @return - A pointer to the specification for that instruction.
144 static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
145 return &INSTRUCTIONS_SYM[uid];
149 * consumeByte - Uses the reader function provided by the user to consume one
150 * byte from the instruction's memory and advance the cursor.
152 * @param insn - The instruction with the reader function to use. The cursor
153 * for this instruction is advanced.
154 * @param byte - A pointer to a pre-allocated memory buffer to be populated
155 * with the data read.
156 * @return - 0 if the read was successful; nonzero otherwise.
158 static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
159 int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
161 if (!ret)
162 ++(insn->readerCursor);
164 return ret;
168 * lookAtByte - Like consumeByte, but does not advance the cursor.
170 * @param insn - See consumeByte().
171 * @param byte - See consumeByte().
172 * @return - See consumeByte().
174 static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
175 return insn->reader(insn->readerArg, byte, insn->readerCursor);
178 static void unconsumeByte(struct InternalInstruction* insn) {
179 insn->readerCursor--;
182 #define CONSUME_FUNC(name, type) \
183 static int name(struct InternalInstruction* insn, type* ptr) { \
184 type combined = 0; \
185 unsigned offset; \
186 for (offset = 0; offset < sizeof(type); ++offset) { \
187 uint8_t byte; \
188 int ret = insn->reader(insn->readerArg, \
189 &byte, \
190 insn->readerCursor + offset); \
191 if (ret) \
192 return ret; \
193 combined = combined | ((type)byte << ((type)offset * 8)); \
195 *ptr = combined; \
196 insn->readerCursor += sizeof(type); \
197 return 0; \
201 * consume* - Use the reader function provided by the user to consume data
202 * values of various sizes from the instruction's memory and advance the
203 * cursor appropriately. These readers perform endian conversion.
205 * @param insn - See consumeByte().
206 * @param ptr - A pointer to a pre-allocated memory of appropriate size to
207 * be populated with the data read.
208 * @return - See consumeByte().
210 CONSUME_FUNC(consumeInt8, int8_t)
211 CONSUME_FUNC(consumeInt16, int16_t)
212 CONSUME_FUNC(consumeInt32, int32_t)
213 CONSUME_FUNC(consumeUInt16, uint16_t)
214 CONSUME_FUNC(consumeUInt32, uint32_t)
215 CONSUME_FUNC(consumeUInt64, uint64_t)
218 * dbgprintf - Uses the logging function provided by the user to log a single
219 * message, typically without a carriage-return.
221 * @param insn - The instruction containing the logging function.
222 * @param format - See printf().
223 * @param ... - See printf().
225 static void dbgprintf(struct InternalInstruction* insn,
226 const char* format,
227 ...) {
228 char buffer[256];
229 va_list ap;
231 if (!insn->dlog)
232 return;
234 va_start(ap, format);
235 (void)vsnprintf(buffer, sizeof(buffer), format, ap);
236 va_end(ap);
238 insn->dlog(insn->dlogArg, buffer);
240 return;
244 * setPrefixPresent - Marks that a particular prefix is present at a particular
245 * location.
247 * @param insn - The instruction to be marked as having the prefix.
248 * @param prefix - The prefix that is present.
249 * @param location - The location where the prefix is located (in the address
250 * space of the instruction's reader).
252 static void setPrefixPresent(struct InternalInstruction* insn,
253 uint8_t prefix,
254 uint64_t location)
256 insn->prefixPresent[prefix] = 1;
257 insn->prefixLocations[prefix] = location;
261 * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
262 * present at a given location.
264 * @param insn - The instruction to be queried.
265 * @param prefix - The prefix.
266 * @param location - The location to query.
267 * @return - Whether the prefix is at that location.
269 static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
270 uint8_t prefix,
271 uint64_t location)
273 if (insn->prefixPresent[prefix] == 1 &&
274 insn->prefixLocations[prefix] == location)
275 return TRUE;
276 else
277 return FALSE;
281 * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
282 * instruction as having them. Also sets the instruction's default operand,
283 * address, and other relevant data sizes to report operands correctly.
285 * @param insn - The instruction whose prefixes are to be read.
286 * @return - 0 if the instruction could be read until the end of the prefix
287 * bytes, and no prefixes conflicted; nonzero otherwise.
289 static int readPrefixes(struct InternalInstruction* insn) {
290 BOOL isPrefix = TRUE;
291 BOOL prefixGroups[4] = { FALSE };
292 uint64_t prefixLocation;
293 uint8_t byte;
295 BOOL hasAdSize = FALSE;
296 BOOL hasOpSize = FALSE;
298 dbgprintf(insn, "readPrefixes()");
300 while (isPrefix) {
301 prefixLocation = insn->readerCursor;
303 if (consumeByte(insn, &byte))
304 return -1;
306 switch (byte) {
307 case 0xf0: /* LOCK */
308 case 0xf2: /* REPNE/REPNZ */
309 case 0xf3: /* REP or REPE/REPZ */
310 if (prefixGroups[0])
311 dbgprintf(insn, "Redundant Group 1 prefix");
312 prefixGroups[0] = TRUE;
313 setPrefixPresent(insn, byte, prefixLocation);
314 break;
315 case 0x2e: /* CS segment override -OR- Branch not taken */
316 case 0x36: /* SS segment override -OR- Branch taken */
317 case 0x3e: /* DS segment override */
318 case 0x26: /* ES segment override */
319 case 0x64: /* FS segment override */
320 case 0x65: /* GS segment override */
321 switch (byte) {
322 case 0x2e:
323 insn->segmentOverride = SEG_OVERRIDE_CS;
324 break;
325 case 0x36:
326 insn->segmentOverride = SEG_OVERRIDE_SS;
327 break;
328 case 0x3e:
329 insn->segmentOverride = SEG_OVERRIDE_DS;
330 break;
331 case 0x26:
332 insn->segmentOverride = SEG_OVERRIDE_ES;
333 break;
334 case 0x64:
335 insn->segmentOverride = SEG_OVERRIDE_FS;
336 break;
337 case 0x65:
338 insn->segmentOverride = SEG_OVERRIDE_GS;
339 break;
340 default:
341 debug("Unhandled override");
342 return -1;
344 if (prefixGroups[1])
345 dbgprintf(insn, "Redundant Group 2 prefix");
346 prefixGroups[1] = TRUE;
347 setPrefixPresent(insn, byte, prefixLocation);
348 break;
349 case 0x66: /* Operand-size override */
350 if (prefixGroups[2])
351 dbgprintf(insn, "Redundant Group 3 prefix");
352 prefixGroups[2] = TRUE;
353 hasOpSize = TRUE;
354 setPrefixPresent(insn, byte, prefixLocation);
355 break;
356 case 0x67: /* Address-size override */
357 if (prefixGroups[3])
358 dbgprintf(insn, "Redundant Group 4 prefix");
359 prefixGroups[3] = TRUE;
360 hasAdSize = TRUE;
361 setPrefixPresent(insn, byte, prefixLocation);
362 break;
363 default: /* Not a prefix byte */
364 isPrefix = FALSE;
365 break;
368 if (isPrefix)
369 dbgprintf(insn, "Found prefix 0x%hhx", byte);
372 if (insn->mode == MODE_64BIT) {
373 if ((byte & 0xf0) == 0x40) {
374 uint8_t opcodeByte;
376 if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
377 dbgprintf(insn, "Redundant REX prefix");
378 return -1;
381 insn->rexPrefix = byte;
382 insn->necessaryPrefixLocation = insn->readerCursor - 2;
384 dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
385 } else {
386 unconsumeByte(insn);
387 insn->necessaryPrefixLocation = insn->readerCursor - 1;
389 } else {
390 unconsumeByte(insn);
393 if (insn->mode == MODE_16BIT) {
394 insn->registerSize = (hasOpSize ? 4 : 2);
395 insn->addressSize = (hasAdSize ? 4 : 2);
396 insn->displacementSize = (hasAdSize ? 4 : 2);
397 insn->immediateSize = (hasOpSize ? 4 : 2);
398 } else if (insn->mode == MODE_32BIT) {
399 insn->registerSize = (hasOpSize ? 2 : 4);
400 insn->addressSize = (hasAdSize ? 2 : 4);
401 insn->displacementSize = (hasAdSize ? 2 : 4);
402 insn->immediateSize = (hasOpSize ? 2 : 4);
403 } else if (insn->mode == MODE_64BIT) {
404 if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
405 insn->registerSize = 8;
406 insn->addressSize = (hasAdSize ? 4 : 8);
407 insn->displacementSize = 4;
408 insn->immediateSize = 4;
409 } else if (insn->rexPrefix) {
410 insn->registerSize = (hasOpSize ? 2 : 4);
411 insn->addressSize = (hasAdSize ? 4 : 8);
412 insn->displacementSize = (hasOpSize ? 2 : 4);
413 insn->immediateSize = (hasOpSize ? 2 : 4);
414 } else {
415 insn->registerSize = (hasOpSize ? 2 : 4);
416 insn->addressSize = (hasAdSize ? 4 : 8);
417 insn->displacementSize = (hasOpSize ? 2 : 4);
418 insn->immediateSize = (hasOpSize ? 2 : 4);
422 return 0;
426 * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
427 * extended or escape opcodes).
429 * @param insn - The instruction whose opcode is to be read.
430 * @return - 0 if the opcode could be read successfully; nonzero otherwise.
432 static int readOpcode(struct InternalInstruction* insn) {
433 /* Determine the length of the primary opcode */
435 uint8_t current;
437 dbgprintf(insn, "readOpcode()");
439 insn->opcodeType = ONEBYTE;
440 if (consumeByte(insn, &current))
441 return -1;
443 if (current == 0x0f) {
444 dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
446 insn->twoByteEscape = current;
448 if (consumeByte(insn, &current))
449 return -1;
451 if (current == 0x38) {
452 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
454 insn->threeByteEscape = current;
456 if (consumeByte(insn, &current))
457 return -1;
459 insn->opcodeType = THREEBYTE_38;
460 } else if (current == 0x3a) {
461 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
463 insn->threeByteEscape = current;
465 if (consumeByte(insn, &current))
466 return -1;
468 insn->opcodeType = THREEBYTE_3A;
469 } else {
470 dbgprintf(insn, "Didn't find a three-byte escape prefix");
472 insn->opcodeType = TWOBYTE;
477 * At this point we have consumed the full opcode.
478 * Anything we consume from here on must be unconsumed.
481 insn->opcode = current;
483 return 0;
486 static int readModRM(struct InternalInstruction* insn);
489 * getIDWithAttrMask - Determines the ID of an instruction, consuming
490 * the ModR/M byte as appropriate for extended and escape opcodes,
491 * and using a supplied attribute mask.
493 * @param instructionID - A pointer whose target is filled in with the ID of the
494 * instruction.
495 * @param insn - The instruction whose ID is to be determined.
496 * @param attrMask - The attribute mask to search.
497 * @return - 0 if the ModR/M could be read when needed or was not
498 * needed; nonzero otherwise.
500 static int getIDWithAttrMask(uint16_t* instructionID,
501 struct InternalInstruction* insn,
502 uint8_t attrMask) {
503 BOOL hasModRMExtension;
505 uint8_t instructionClass;
507 instructionClass = contextForAttrs(attrMask);
509 hasModRMExtension = modRMRequired(insn->opcodeType,
510 instructionClass,
511 insn->opcode);
513 if (hasModRMExtension) {
514 readModRM(insn);
516 *instructionID = decode(insn->opcodeType,
517 instructionClass,
518 insn->opcode,
519 insn->modRM);
520 } else {
521 *instructionID = decode(insn->opcodeType,
522 instructionClass,
523 insn->opcode,
527 return 0;
531 * is16BitEquivalent - Determines whether two instruction names refer to
532 * equivalent instructions but one is 16-bit whereas the other is not.
534 * @param orig - The instruction that is not 16-bit
535 * @param equiv - The instruction that is 16-bit
537 static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
538 off_t i;
540 for (i = 0;; i++) {
541 if (orig[i] == '\0' && equiv[i] == '\0')
542 return TRUE;
543 if (orig[i] == '\0' || equiv[i] == '\0')
544 return FALSE;
545 if (orig[i] != equiv[i]) {
546 if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
547 continue;
548 if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
549 continue;
550 if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
551 continue;
552 return FALSE;
558 * is64BitEquivalent - Determines whether two instruction names refer to
559 * equivalent instructions but one is 64-bit whereas the other is not.
561 * @param orig - The instruction that is not 64-bit
562 * @param equiv - The instruction that is 64-bit
564 static BOOL is64BitEquivalent(const char* orig, const char* equiv) {
565 off_t i;
567 for (i = 0;; i++) {
568 if (orig[i] == '\0' && equiv[i] == '\0')
569 return TRUE;
570 if (orig[i] == '\0' || equiv[i] == '\0')
571 return FALSE;
572 if (orig[i] != equiv[i]) {
573 if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q')
574 continue;
575 if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6')
576 continue;
577 if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4')
578 continue;
579 return FALSE;
586 * getID - Determines the ID of an instruction, consuming the ModR/M byte as
587 * appropriate for extended and escape opcodes. Determines the attributes and
588 * context for the instruction before doing so.
590 * @param insn - The instruction whose ID is to be determined.
591 * @return - 0 if the ModR/M could be read when needed or was not needed;
592 * nonzero otherwise.
594 static int getID(struct InternalInstruction* insn) {
595 uint8_t attrMask;
596 uint16_t instructionID;
598 dbgprintf(insn, "getID()");
600 attrMask = ATTR_NONE;
602 if (insn->mode == MODE_64BIT)
603 attrMask |= ATTR_64BIT;
605 if (insn->rexPrefix & 0x08)
606 attrMask |= ATTR_REXW;
608 if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
609 attrMask |= ATTR_OPSIZE;
610 else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
611 attrMask |= ATTR_XS;
612 else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
613 attrMask |= ATTR_XD;
615 if (getIDWithAttrMask(&instructionID, insn, attrMask))
616 return -1;
618 /* The following clauses compensate for limitations of the tables. */
620 if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) {
622 * Although for SSE instructions it is usually necessary to treat REX.W+F2
623 * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is
624 * an occasional instruction where F2 is incidental and REX.W is the more
625 * significant. If the decoded instruction is 32-bit and adding REX.W
626 * instead of F2 changes a 32 to a 64, we adopt the new encoding.
629 const struct InstructionSpecifier *spec;
630 uint16_t instructionIDWithREXw;
631 const struct InstructionSpecifier *specWithREXw;
633 spec = specifierForUID(instructionID);
635 if (getIDWithAttrMask(&instructionIDWithREXw,
636 insn,
637 attrMask & (~ATTR_XD))) {
639 * Decoding with REX.w would yield nothing; give up and return original
640 * decode.
643 insn->instructionID = instructionID;
644 insn->spec = spec;
645 return 0;
648 specWithREXw = specifierForUID(instructionIDWithREXw);
650 if (is64BitEquivalent(spec->name, specWithREXw->name)) {
651 insn->instructionID = instructionIDWithREXw;
652 insn->spec = specWithREXw;
653 } else {
654 insn->instructionID = instructionID;
655 insn->spec = spec;
657 return 0;
660 if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
662 * The instruction tables make no distinction between instructions that
663 * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
664 * particular spot (i.e., many MMX operations). In general we're
665 * conservative, but in the specific case where OpSize is present but not
666 * in the right place we check if there's a 16-bit operation.
669 const struct InstructionSpecifier *spec;
670 uint16_t instructionIDWithOpsize;
671 const struct InstructionSpecifier *specWithOpsize;
673 spec = specifierForUID(instructionID);
675 if (getIDWithAttrMask(&instructionIDWithOpsize,
676 insn,
677 attrMask | ATTR_OPSIZE)) {
679 * ModRM required with OpSize but not present; give up and return version
680 * without OpSize set
683 insn->instructionID = instructionID;
684 insn->spec = spec;
685 return 0;
688 specWithOpsize = specifierForUID(instructionIDWithOpsize);
690 if (is16BitEquvalent(spec->name, specWithOpsize->name)) {
691 insn->instructionID = instructionIDWithOpsize;
692 insn->spec = specWithOpsize;
693 } else {
694 insn->instructionID = instructionID;
695 insn->spec = spec;
697 return 0;
700 insn->instructionID = instructionID;
701 insn->spec = specifierForUID(insn->instructionID);
703 return 0;
707 * readSIB - Consumes the SIB byte to determine addressing information for an
708 * instruction.
710 * @param insn - The instruction whose SIB byte is to be read.
711 * @return - 0 if the SIB byte was successfully read; nonzero otherwise.
713 static int readSIB(struct InternalInstruction* insn) {
714 SIBIndex sibIndexBase = 0;
715 SIBBase sibBaseBase = 0;
716 uint8_t index, base;
718 dbgprintf(insn, "readSIB()");
720 if (insn->consumedSIB)
721 return 0;
723 insn->consumedSIB = TRUE;
725 switch (insn->addressSize) {
726 case 2:
727 dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
728 return -1;
729 break;
730 case 4:
731 sibIndexBase = SIB_INDEX_EAX;
732 sibBaseBase = SIB_BASE_EAX;
733 break;
734 case 8:
735 sibIndexBase = SIB_INDEX_RAX;
736 sibBaseBase = SIB_BASE_RAX;
737 break;
740 if (consumeByte(insn, &insn->sib))
741 return -1;
743 index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
745 switch (index) {
746 case 0x4:
747 insn->sibIndex = SIB_INDEX_NONE;
748 break;
749 default:
750 insn->sibIndex = (EABase)(sibIndexBase + index);
751 if (insn->sibIndex == SIB_INDEX_sib ||
752 insn->sibIndex == SIB_INDEX_sib64)
753 insn->sibIndex = SIB_INDEX_NONE;
754 break;
757 switch (scaleFromSIB(insn->sib)) {
758 case 0:
759 insn->sibScale = 1;
760 break;
761 case 1:
762 insn->sibScale = 2;
763 break;
764 case 2:
765 insn->sibScale = 4;
766 break;
767 case 3:
768 insn->sibScale = 8;
769 break;
772 base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
774 switch (base) {
775 case 0x5:
776 switch (modFromModRM(insn->modRM)) {
777 case 0x0:
778 insn->eaDisplacement = EA_DISP_32;
779 insn->sibBase = SIB_BASE_NONE;
780 break;
781 case 0x1:
782 insn->eaDisplacement = EA_DISP_8;
783 insn->sibBase = (insn->addressSize == 4 ?
784 SIB_BASE_EBP : SIB_BASE_RBP);
785 break;
786 case 0x2:
787 insn->eaDisplacement = EA_DISP_32;
788 insn->sibBase = (insn->addressSize == 4 ?
789 SIB_BASE_EBP : SIB_BASE_RBP);
790 break;
791 case 0x3:
792 debug("Cannot have Mod = 0b11 and a SIB byte");
793 return -1;
795 break;
796 default:
797 insn->sibBase = (EABase)(sibBaseBase + base);
798 break;
801 return 0;
805 * readDisplacement - Consumes the displacement of an instruction.
807 * @param insn - The instruction whose displacement is to be read.
808 * @return - 0 if the displacement byte was successfully read; nonzero
809 * otherwise.
811 static int readDisplacement(struct InternalInstruction* insn) {
812 int8_t d8;
813 int16_t d16;
814 int32_t d32;
816 dbgprintf(insn, "readDisplacement()");
818 if (insn->consumedDisplacement)
819 return 0;
821 insn->consumedDisplacement = TRUE;
823 switch (insn->eaDisplacement) {
824 case EA_DISP_NONE:
825 insn->consumedDisplacement = FALSE;
826 break;
827 case EA_DISP_8:
828 if (consumeInt8(insn, &d8))
829 return -1;
830 insn->displacement = d8;
831 break;
832 case EA_DISP_16:
833 if (consumeInt16(insn, &d16))
834 return -1;
835 insn->displacement = d16;
836 break;
837 case EA_DISP_32:
838 if (consumeInt32(insn, &d32))
839 return -1;
840 insn->displacement = d32;
841 break;
844 insn->consumedDisplacement = TRUE;
845 return 0;
849 * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
850 * displacement) for an instruction and interprets it.
852 * @param insn - The instruction whose addressing information is to be read.
853 * @return - 0 if the information was successfully read; nonzero otherwise.
855 static int readModRM(struct InternalInstruction* insn) {
856 uint8_t mod, rm, reg;
858 dbgprintf(insn, "readModRM()");
860 if (insn->consumedModRM)
861 return 0;
863 consumeByte(insn, &insn->modRM);
864 insn->consumedModRM = TRUE;
866 mod = modFromModRM(insn->modRM);
867 rm = rmFromModRM(insn->modRM);
868 reg = regFromModRM(insn->modRM);
871 * This goes by insn->registerSize to pick the correct register, which messes
872 * up if we're using (say) XMM or 8-bit register operands. That gets fixed in
873 * fixupReg().
875 switch (insn->registerSize) {
876 case 2:
877 insn->regBase = MODRM_REG_AX;
878 insn->eaRegBase = EA_REG_AX;
879 break;
880 case 4:
881 insn->regBase = MODRM_REG_EAX;
882 insn->eaRegBase = EA_REG_EAX;
883 break;
884 case 8:
885 insn->regBase = MODRM_REG_RAX;
886 insn->eaRegBase = EA_REG_RAX;
887 break;
890 reg |= rFromREX(insn->rexPrefix) << 3;
891 rm |= bFromREX(insn->rexPrefix) << 3;
893 insn->reg = (Reg)(insn->regBase + reg);
895 switch (insn->addressSize) {
896 case 2:
897 insn->eaBaseBase = EA_BASE_BX_SI;
899 switch (mod) {
900 case 0x0:
901 if (rm == 0x6) {
902 insn->eaBase = EA_BASE_NONE;
903 insn->eaDisplacement = EA_DISP_16;
904 if (readDisplacement(insn))
905 return -1;
906 } else {
907 insn->eaBase = (EABase)(insn->eaBaseBase + rm);
908 insn->eaDisplacement = EA_DISP_NONE;
910 break;
911 case 0x1:
912 insn->eaBase = (EABase)(insn->eaBaseBase + rm);
913 insn->eaDisplacement = EA_DISP_8;
914 if (readDisplacement(insn))
915 return -1;
916 break;
917 case 0x2:
918 insn->eaBase = (EABase)(insn->eaBaseBase + rm);
919 insn->eaDisplacement = EA_DISP_16;
920 if (readDisplacement(insn))
921 return -1;
922 break;
923 case 0x3:
924 insn->eaBase = (EABase)(insn->eaRegBase + rm);
925 if (readDisplacement(insn))
926 return -1;
927 break;
929 break;
930 case 4:
931 case 8:
932 insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
934 switch (mod) {
935 case 0x0:
936 insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
937 switch (rm) {
938 case 0x4:
939 case 0xc: /* in case REXW.b is set */
940 insn->eaBase = (insn->addressSize == 4 ?
941 EA_BASE_sib : EA_BASE_sib64);
942 readSIB(insn);
943 if (readDisplacement(insn))
944 return -1;
945 break;
946 case 0x5:
947 insn->eaBase = EA_BASE_NONE;
948 insn->eaDisplacement = EA_DISP_32;
949 if (readDisplacement(insn))
950 return -1;
951 break;
952 default:
953 insn->eaBase = (EABase)(insn->eaBaseBase + rm);
954 break;
956 break;
957 case 0x1:
958 case 0x2:
959 insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
960 switch (rm) {
961 case 0x4:
962 case 0xc: /* in case REXW.b is set */
963 insn->eaBase = EA_BASE_sib;
964 readSIB(insn);
965 if (readDisplacement(insn))
966 return -1;
967 break;
968 default:
969 insn->eaBase = (EABase)(insn->eaBaseBase + rm);
970 if (readDisplacement(insn))
971 return -1;
972 break;
974 break;
975 case 0x3:
976 insn->eaDisplacement = EA_DISP_NONE;
977 insn->eaBase = (EABase)(insn->eaRegBase + rm);
978 break;
980 break;
981 } /* switch (insn->addressSize) */
983 return 0;
986 #define GENERIC_FIXUP_FUNC(name, base, prefix) \
987 static uint8_t name(struct InternalInstruction *insn, \
988 OperandType type, \
989 uint8_t index, \
990 uint8_t *valid) { \
991 *valid = 1; \
992 switch (type) { \
993 default: \
994 debug("Unhandled register type"); \
995 *valid = 0; \
996 return 0; \
997 case TYPE_Rv: \
998 return base + index; \
999 case TYPE_R8: \
1000 if (insn->rexPrefix && \
1001 index >= 4 && index <= 7) { \
1002 return prefix##_SPL + (index - 4); \
1003 } else { \
1004 return prefix##_AL + index; \
1006 case TYPE_R16: \
1007 return prefix##_AX + index; \
1008 case TYPE_R32: \
1009 return prefix##_EAX + index; \
1010 case TYPE_R64: \
1011 return prefix##_RAX + index; \
1012 case TYPE_XMM128: \
1013 case TYPE_XMM64: \
1014 case TYPE_XMM32: \
1015 case TYPE_XMM: \
1016 return prefix##_XMM0 + index; \
1017 case TYPE_MM64: \
1018 case TYPE_MM32: \
1019 case TYPE_MM: \
1020 if (index > 7) \
1021 *valid = 0; \
1022 return prefix##_MM0 + index; \
1023 case TYPE_SEGMENTREG: \
1024 if (index > 5) \
1025 *valid = 0; \
1026 return prefix##_ES + index; \
1027 case TYPE_DEBUGREG: \
1028 if (index > 7) \
1029 *valid = 0; \
1030 return prefix##_DR0 + index; \
1031 case TYPE_CONTROLREG: \
1032 if (index > 8) \
1033 *valid = 0; \
1034 return prefix##_CR0 + index; \
1039 * fixup*Value - Consults an operand type to determine the meaning of the
1040 * reg or R/M field. If the operand is an XMM operand, for example, an
1041 * operand would be XMM0 instead of AX, which readModRM() would otherwise
1042 * misinterpret it as.
1044 * @param insn - The instruction containing the operand.
1045 * @param type - The operand type.
1046 * @param index - The existing value of the field as reported by readModRM().
1047 * @param valid - The address of a uint8_t. The target is set to 1 if the
1048 * field is valid for the register class; 0 if not.
1049 * @return - The proper value.
1051 GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
1052 GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
1055 * fixupReg - Consults an operand specifier to determine which of the
1056 * fixup*Value functions to use in correcting readModRM()'ss interpretation.
1058 * @param insn - See fixup*Value().
1059 * @param op - The operand specifier.
1060 * @return - 0 if fixup was successful; -1 if the register returned was
1061 * invalid for its class.
1063 static int fixupReg(struct InternalInstruction *insn,
1064 const struct OperandSpecifier *op) {
1065 uint8_t valid;
1067 dbgprintf(insn, "fixupReg()");
1069 switch ((OperandEncoding)op->encoding) {
1070 default:
1071 debug("Expected a REG or R/M encoding in fixupReg");
1072 return -1;
1073 case ENCODING_REG:
1074 insn->reg = (Reg)fixupRegValue(insn,
1075 (OperandType)op->type,
1076 insn->reg - insn->regBase,
1077 &valid);
1078 if (!valid)
1079 return -1;
1080 break;
1081 case ENCODING_RM:
1082 if (insn->eaBase >= insn->eaRegBase) {
1083 insn->eaBase = (EABase)fixupRMValue(insn,
1084 (OperandType)op->type,
1085 insn->eaBase - insn->eaRegBase,
1086 &valid);
1087 if (!valid)
1088 return -1;
1090 break;
1093 return 0;
1097 * readOpcodeModifier - Reads an operand from the opcode field of an
1098 * instruction. Handles AddRegFrm instructions.
1100 * @param insn - The instruction whose opcode field is to be read.
1101 * @param inModRM - Indicates that the opcode field is to be read from the
1102 * ModR/M extension; useful for escape opcodes
1103 * @return - 0 on success; nonzero otherwise.
1105 static int readOpcodeModifier(struct InternalInstruction* insn) {
1106 dbgprintf(insn, "readOpcodeModifier()");
1108 if (insn->consumedOpcodeModifier)
1109 return 0;
1111 insn->consumedOpcodeModifier = TRUE;
1113 switch (insn->spec->modifierType) {
1114 default:
1115 debug("Unknown modifier type.");
1116 return -1;
1117 case MODIFIER_NONE:
1118 debug("No modifier but an operand expects one.");
1119 return -1;
1120 case MODIFIER_OPCODE:
1121 insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
1122 return 0;
1123 case MODIFIER_MODRM:
1124 insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
1125 return 0;
1130 * readOpcodeRegister - Reads an operand from the opcode field of an
1131 * instruction and interprets it appropriately given the operand width.
1132 * Handles AddRegFrm instructions.
1134 * @param insn - See readOpcodeModifier().
1135 * @param size - The width (in bytes) of the register being specified.
1136 * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
1137 * RAX.
1138 * @return - 0 on success; nonzero otherwise.
1140 static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
1141 dbgprintf(insn, "readOpcodeRegister()");
1143 if (readOpcodeModifier(insn))
1144 return -1;
1146 if (size == 0)
1147 size = insn->registerSize;
1149 switch (size) {
1150 case 1:
1151 insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
1152 | insn->opcodeModifier));
1153 if (insn->rexPrefix &&
1154 insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
1155 insn->opcodeRegister < MODRM_REG_AL + 0x8) {
1156 insn->opcodeRegister = (Reg)(MODRM_REG_SPL
1157 + (insn->opcodeRegister - MODRM_REG_AL - 4));
1160 break;
1161 case 2:
1162 insn->opcodeRegister = (Reg)(MODRM_REG_AX
1163 + ((bFromREX(insn->rexPrefix) << 3)
1164 | insn->opcodeModifier));
1165 break;
1166 case 4:
1167 insn->opcodeRegister = (Reg)(MODRM_REG_EAX
1168 + ((bFromREX(insn->rexPrefix) << 3)
1169 | insn->opcodeModifier));
1170 break;
1171 case 8:
1172 insn->opcodeRegister = (Reg)(MODRM_REG_RAX
1173 + ((bFromREX(insn->rexPrefix) << 3)
1174 | insn->opcodeModifier));
1175 break;
1178 return 0;
1182 * readImmediate - Consumes an immediate operand from an instruction, given the
1183 * desired operand size.
1185 * @param insn - The instruction whose operand is to be read.
1186 * @param size - The width (in bytes) of the operand.
1187 * @return - 0 if the immediate was successfully consumed; nonzero
1188 * otherwise.
1190 static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
1191 uint8_t imm8;
1192 uint16_t imm16;
1193 uint32_t imm32;
1194 uint64_t imm64;
1196 dbgprintf(insn, "readImmediate()");
1198 if (insn->numImmediatesConsumed == 2) {
1199 debug("Already consumed two immediates");
1200 return -1;
1203 if (size == 0)
1204 size = insn->immediateSize;
1205 else
1206 insn->immediateSize = size;
1208 switch (size) {
1209 case 1:
1210 if (consumeByte(insn, &imm8))
1211 return -1;
1212 insn->immediates[insn->numImmediatesConsumed] = imm8;
1213 break;
1214 case 2:
1215 if (consumeUInt16(insn, &imm16))
1216 return -1;
1217 insn->immediates[insn->numImmediatesConsumed] = imm16;
1218 break;
1219 case 4:
1220 if (consumeUInt32(insn, &imm32))
1221 return -1;
1222 insn->immediates[insn->numImmediatesConsumed] = imm32;
1223 break;
1224 case 8:
1225 if (consumeUInt64(insn, &imm64))
1226 return -1;
1227 insn->immediates[insn->numImmediatesConsumed] = imm64;
1228 break;
1231 insn->numImmediatesConsumed++;
1233 return 0;
1237 * readOperands - Consults the specifier for an instruction and consumes all
1238 * operands for that instruction, interpreting them as it goes.
1240 * @param insn - The instruction whose operands are to be read and interpreted.
1241 * @return - 0 if all operands could be read; nonzero otherwise.
1243 static int readOperands(struct InternalInstruction* insn) {
1244 int index;
1246 dbgprintf(insn, "readOperands()");
1248 for (index = 0; index < X86_MAX_OPERANDS; ++index) {
1249 switch (insn->spec->operands[index].encoding) {
1250 case ENCODING_NONE:
1251 break;
1252 case ENCODING_REG:
1253 case ENCODING_RM:
1254 if (readModRM(insn))
1255 return -1;
1256 if (fixupReg(insn, &insn->spec->operands[index]))
1257 return -1;
1258 break;
1259 case ENCODING_CB:
1260 case ENCODING_CW:
1261 case ENCODING_CD:
1262 case ENCODING_CP:
1263 case ENCODING_CO:
1264 case ENCODING_CT:
1265 dbgprintf(insn, "We currently don't hande code-offset encodings");
1266 return -1;
1267 case ENCODING_IB:
1268 if (readImmediate(insn, 1))
1269 return -1;
1270 if (insn->spec->operands[index].type == TYPE_IMM3 &&
1271 insn->immediates[insn->numImmediatesConsumed - 1] > 7)
1272 return -1;
1273 break;
1274 case ENCODING_IW:
1275 if (readImmediate(insn, 2))
1276 return -1;
1277 break;
1278 case ENCODING_ID:
1279 if (readImmediate(insn, 4))
1280 return -1;
1281 break;
1282 case ENCODING_IO:
1283 if (readImmediate(insn, 8))
1284 return -1;
1285 break;
1286 case ENCODING_Iv:
1287 if (readImmediate(insn, insn->immediateSize))
1288 return -1;
1289 break;
1290 case ENCODING_Ia:
1291 if (readImmediate(insn, insn->addressSize))
1292 return -1;
1293 break;
1294 case ENCODING_RB:
1295 if (readOpcodeRegister(insn, 1))
1296 return -1;
1297 break;
1298 case ENCODING_RW:
1299 if (readOpcodeRegister(insn, 2))
1300 return -1;
1301 break;
1302 case ENCODING_RD:
1303 if (readOpcodeRegister(insn, 4))
1304 return -1;
1305 break;
1306 case ENCODING_RO:
1307 if (readOpcodeRegister(insn, 8))
1308 return -1;
1309 break;
1310 case ENCODING_Rv:
1311 if (readOpcodeRegister(insn, 0))
1312 return -1;
1313 break;
1314 case ENCODING_I:
1315 if (readOpcodeModifier(insn))
1316 return -1;
1317 case ENCODING_DUP:
1318 break;
1319 default:
1320 dbgprintf(insn, "Encountered an operand with an unknown encoding.");
1321 return -1;
1325 return 0;
1329 * decodeInstruction - Reads and interprets a full instruction provided by the
1330 * user.
1332 * @param insn - A pointer to the instruction to be populated. Must be
1333 * pre-allocated.
1334 * @param reader - The function to be used to read the instruction's bytes.
1335 * @param readerArg - A generic argument to be passed to the reader to store
1336 * any internal state.
1337 * @param logger - If non-NULL, the function to be used to write log messages
1338 * and warnings.
1339 * @param loggerArg - A generic argument to be passed to the logger to store
1340 * any internal state.
1341 * @param startLoc - The address (in the reader's address space) of the first
1342 * byte in the instruction.
1343 * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
1344 * decode the instruction in.
1345 * @return - 0 if the instruction's memory could be read; nonzero if
1346 * not.
1348 int decodeInstruction(struct InternalInstruction* insn,
1349 byteReader_t reader,
1350 void* readerArg,
1351 dlog_t logger,
1352 void* loggerArg,
1353 uint64_t startLoc,
1354 DisassemblerMode mode) {
1355 memset(insn, 0, sizeof(struct InternalInstruction));
1357 insn->reader = reader;
1358 insn->readerArg = readerArg;
1359 insn->dlog = logger;
1360 insn->dlogArg = loggerArg;
1361 insn->startLocation = startLoc;
1362 insn->readerCursor = startLoc;
1363 insn->mode = mode;
1364 insn->numImmediatesConsumed = 0;
1366 if (readPrefixes(insn) ||
1367 readOpcode(insn) ||
1368 getID(insn) ||
1369 insn->instructionID == 0 ||
1370 readOperands(insn))
1371 return -1;
1373 insn->length = insn->readerCursor - insn->startLocation;
1375 dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
1376 startLoc, insn->readerCursor, insn->length);
1378 if (insn->length > 15)
1379 dbgprintf(insn, "Instruction exceeds 15-byte limit");
1381 return 0;