engine: Add set_arch_id() and code_addr_mask var.
[ScratchABit.git] / scratchabit / engine.py
blob9f23bcffb3ada816157b1e1232aaf3526cea2950
1 # ScratchABit - interactive disassembler
3 # Copyright (c) 2015 Paul Sokolovsky
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 import sys
18 import binascii
19 import json
20 import bisect
21 import logging as log
23 from rangeset import RangeSet
25 import idaapi
# ScratchABit API and code

# Indexes of the fields of an area tuple, in the order used by
# AddressSpace.add_area(): (start, end, props, bytes, flags).
START, END, PROPS, BYTES, FLAGS = range(5)
37 from .defs import *
# Identifier of the current architecture; None until set_arch_id() is called.
arch_id = None
# Some architectures, e.g. ARM, use the lowest bit(s) of a code address to
# encode additional information (e.g. Thumb mode for ARM). To get the
# real instruction address from such an encoded address, we need to
# AND it with this value. The default of -1 (all bits set) leaves
# addresses unchanged.
code_addr_mask = -1
def str_area(area):
    """Return a short human-readable description of an area tuple.

    A falsy *area* (None, empty tuple) is rendered as "Area(None)".
    """
    if area:
        return "Area(0x%x-0x%x, %s)" % (area[START], area[END], area[PROPS])
    return "Area(None)"
def area_props(area):
    """Return the properties field of an area tuple."""
    props = area[PROPS]
    return props
class Function:
    """A function: a start address, an optional explicit end, and the
    set of address ranges found to belong to it."""

    def __init__(self, start, end=None):
        self.ranges = RangeSet()
        self.start = start
        # Explicit (beyond-the-end) address, or None if not known upfront
        self.end = end

    def add_insn(self, addr, sz):
        """Record an instruction of size sz at addr as part of the function."""
        insn_range = (addr, addr + sz)
        self.ranges.add(insn_range)

    def add_range(self, start, end):
        """Record the range (start, end) as part of the function."""
        self.ranges.add((start, end))

    def get_ranges(self):
        """Return the recorded ranges as a list."""
        return self.ranges.to_list()

    def get_end(self):
        """Return the explicit end if set, else the upper bound of the
        recorded ranges, else None."""
        if self.end is None:
            bounds = self.ranges.bounds()
            return bounds[1] if bounds else None
        return self.end

    def get_end_method(self):
        """Describe, as text, how the function end was determined."""
        if self.end is None:
            return "as detected"
        bounds = self.ranges.bounds()
        if bounds:
            addr = "0x%x" % (bounds[1] - 1)
        else:
            addr = "?"
        return "as set by loader (detected: %s)" % addr
class AddressSpace:
    """Model of the analysed address space.

    Holds a sorted list of memory areas (each a tuple of start, end,
    props, content bytes and per-byte flags), per-address properties
    (labels, comments, xrefs, argument properties, functions), named
    subareas and analysis issues, plus text persistence for all of that.
    """

    UNK = 0
    CODE = 0x01
    CODE_CONT = 0x02
    DATA = 0x04
    DATA_CONT = 0x08
    STR = 0x10  # Continuation is DATA_CONT
    FILL = 0x40  # Filler/alignment bytes
    FUNC = 0x80  # Can appear with CODE, meaning this instruction belongs to a function

    def __init__(self):
        self.area_list = []
        # List of subareas and binary search index for it
        self.subarea_list = []
        self.subarea_search = []
        # Map from referenced addresses to their properties. Among them:
        # "args":
        #   Properties of instruction's args; at the very least, this should
        #   differentiate between literal numeric values and addresses/offsets/pointers
        #   to other objects
        # "comm":
        #   Comment
        # "label"
        #   Label
        # "xref":
        #   Cross-reference records
        # "fun_s", "fun_e"
        #   Function start and beyond-end addresses, map to Function object
        self.addr_map = {}
        # Map from label to its address
        self.labels_rev = {}
        # Problem spots which automatic control/data flow couldn't resolve
        self.issues = {}
        # Cached last accessed area
        self.last_area = None
        # Cached function start addresses
        self.func_starts = None
        # Map from func_starts's indexes to function objects
        self.func_starts_arr = []
        # True during loading stage, False during UI interaction stage
        self.is_loading = False
        # Was area flags/content changed (and thus require saving)?
        self.changed = False

    # Memory Area API

    def add_area(self, start, end, props):
        """Create an area covering start..end (inclusive) and return its tuple."""
        log.debug("add_area(%x, %x, %s)", start, end, props)
        sz = end - start + 1
        content = bytearray(sz)
        flags = bytearray(sz)
        a = (start, end, props, content, flags)
        self.area_list.append(a)
        # Area list should be sorted. Assume it's short and just resort it
        # each time. Sort on addresses only, so that two areas with equal
        # bounds never lead to comparing the props dicts.
        self.area_list.sort(key=lambda area: (area[START], area[END]))
        return a

    def get_areas(self):
        """Return the sorted list of area tuples."""
        return self.area_list

    def area_no(self, area):
        """Return the index of area in the area list."""
        return self.area_list.index(area)

    def addr2area(self, addr):
        """Return (offset, area) for addr, or (None, None) if it is unmapped."""
        # Try the most recently hit area first
        if self.last_area:
            a = self.last_area
            if a[0] <= addr <= a[1]:
                return (addr - a[0], a)
        for a in self.area_list:
            if a[0] <= addr <= a[1]:
                self.last_area = a
                return (addr - a[0], a)
        return (None, None)

    def _require_area(self, addr):
        # Like addr2area(), but raise InvalidAddrException for unmapped addr
        off, area = self.addr2area(addr)
        if area is None:
            raise InvalidAddrException(addr)
        return off, area

    def min_addr(self):
        """Return the start address of the first area."""
        return self.area_list[0][START]

    def max_addr(self):
        """Return the end address of the last area."""
        return self.area_list[-1][END]

    # Return next address in the address space, or None
    def next_addr(self, addr):
        """Return the address following addr, skipping gaps between areas.

        Returns None past the last area; raises InvalidAddrException if
        addr itself is not inside any area.
        """
        offset, area = self._require_area(addr)
        if addr != area[END]:
            return addr + 1
        i = self.area_no(area) + 1
        if i == len(self.area_list):
            return None
        return self.area_list[i][START]

    def is_exec(self, addr):
        """Return True if addr lies in an area whose "access" prop contains "X"."""
        off, area = self.addr2area(addr)
        if not area:
            return False
        # Areas autocreated by set_label() carry no "access" property
        return "X" in area[PROPS].get("access", "")

    # Binary Data API

    def load_content(self, file, addr, sz=None):
        """Read area content from file into the area at addr.

        With a falsy sz, fills up to the end of the area. Raises
        InvalidAddrException if addr is unmapped.
        """
        off, area = self._require_area(addr)
        to = off + sz if sz else None
        file.readinto(memoryview(area[BYTES])[off:to])

    def is_valid_addr(self, addr):
        """Return True if addr belongs to some area."""
        off, area = self.addr2area(addr)
        return area is not None

    def get_byte(self, addr):
        """Return the content byte at addr; raises InvalidAddrException."""
        off, area = self._require_area(addr)
        return area[BYTES][off]

    def set_byte(self, addr, val):
        """Store the low 8 bits of val at addr; raises InvalidAddrException."""
        off, area = self._require_area(addr)
        self.changed = True
        area[BYTES][off] = val & 0xff

    def get_bytes(self, addr, sz):
        """Return up to sz content bytes starting at addr."""
        off, area = self._require_area(addr)
        return area[BYTES][off:off + sz]

    def get_data(self, addr, sz):
        """Return the sz-byte little-endian value at addr.

        For sz == 4, a "sym" address property, if present, is returned
        instead of the numeric value. Raises InvalidAddrException if addr
        is unmapped.
        """
        # TODO: address size
        if sz == 4:
            sym = self.get_addr_prop(addr, "sym")
            if sym is not None:
                return sym

        off, area = self._require_area(addr)
        content = area[BYTES]
        val = 0
        for i in range(sz):
            val = val | (content[off + i] << 8 * i)
        return val

    def set_data(self, addr, data, sz):
        """Store data as an sz-byte little-endian value at addr."""
        off, area = self._require_area(addr)
        self.changed = True
        content = area[BYTES]
        for i in range(sz):
            content[off + i] = data & 0xff
            data >>= 8

    # Convenience function for plugins
    def memcpy(self, dst, src, sz):
        """Copy sz bytes from src to dst, one byte at a time, low to high."""
        for i in range(sz):
            self.set_byte(dst + i, self.get_byte(src + i))

    # Binary Data Flags API

    def get_flags(self, addr, mask=0x7f):
        """Return the flags byte at addr ANDed with mask.

        The default mask drops the FUNC bit. Raises InvalidAddrException.
        """
        off, area = self._require_area(addr)
        return area[FLAGS][off] & mask

    def get_unit_size(self, addr):
        """Return the size of the code/data/filler unit starting at addr.

        Bytes with other flags count as units of size 1. Raises
        InvalidAddrException if addr is unmapped.
        """
        off, area = self._require_area(addr)
        flags = area[FLAGS]
        head = flags[off]
        if head & 0x7f == self.CODE:
            cont = self.CODE_CONT
        elif head in (self.DATA, self.STR):
            cont = self.DATA_CONT
        elif head == self.FILL:
            cont = self.FILL
        else:
            return 1

        sz = 1
        off += 1
        # Count continuation bytes, stopping at the end of the area
        while off < len(flags) and flags[off] == cont:
            off += 1
            sz += 1
        return sz

    # Taking an offset inside unit, return offset to the beginning of unit
    @classmethod
    def adjust_offset_reverse(cls, off, area):
        """Return the offset of the first byte of the unit containing off."""
        flags = area[FLAGS]
        if flags[off] == cls.FILL:
            # Walk back while the preceding byte is filler too. Looking at
            # off - 1 keeps the result right when the run starts at offset 1
            # and offset 0 holds something else.
            while off > 0 and flags[off - 1] == cls.FILL:
                off -= 1
            return off

        while off > 0 and flags[off] in (cls.CODE_CONT, cls.DATA_CONT):
            off -= 1
        return off

    def adjust_addr_reverse(self, addr):
        """Return the start address of the unit containing addr, or None
        if addr is unmapped."""
        off, area = self.addr2area(addr)
        if area is None:
            return None
        return self.adjust_offset_reverse(off, area) + area[START]

    def set_flags(self, addr, sz, head_fl, rest_fl=0):
        """Overwrite flags of sz bytes at addr: head_fl first, rest_fl after."""
        off, area = self._require_area(addr)
        self.changed = True
        flags = area[FLAGS]
        flags[off] = head_fl
        off += 1
        for i in range(sz - 1):
            flags[off + i] = rest_fl

    def make_undefined(self, addr, sz):
        """Reset flags of sz bytes at addr to UNK."""
        self.set_flags(addr, sz, self.UNK, self.UNK)

    def make_code(self, addr, sz, extra_flags=0):
        """OR code flags into sz bytes at addr (CODE|extra_flags, then CODE_CONT)."""
        off, area = self._require_area(addr)
        self.changed = True
        area_byte_flags = area[FLAGS]
        area_byte_flags[off] |= self.CODE | extra_flags
        for i in range(sz - 1):
            area_byte_flags[off + 1 + i] |= self.CODE_CONT

    # Mark instructions in given range as belonging to function
    def mark_func_bytes(self, addr, sz):
        """Set FUNC on every CODE head byte in the sz bytes at addr.

        Asserts that each byte in the range is exactly CODE or CODE_CONT.
        """
        off, area = self._require_area(addr)
        self.changed = True
        area_byte_flags = area[FLAGS]
        for i in range(sz):
            fl = area_byte_flags[off + i]
            assert fl in (self.CODE, self.CODE_CONT)
            if fl == self.CODE:
                area_byte_flags[off + i] |= self.FUNC

    def make_data(self, addr, sz):
        """OR data flags into sz bytes at addr (DATA, then DATA_CONT)."""
        off, area = self._require_area(addr)
        self.changed = True
        area_byte_flags = area[FLAGS]
        area_byte_flags[off] |= self.DATA
        for i in range(sz - 1):
            area_byte_flags[off + 1 + i] |= self.DATA_CONT

    def make_data_array(self, addr, sz, num_items, prefix=""):
        """Mark num_items consecutive data units of size sz, with a comment."""
        # Make a data array. First-class arrays are not supported so far,
        # so just mark data units sequentially
        self.append_comment(addr, "%sArray, num %s: %d" % (prefix, "bytes" if sz == 1 else "items", num_items))
        for i in range(num_items):
            self.make_data(addr, sz)
            addr += sz

    def make_filler(self, addr, sz):
        """Mark sz bytes at addr as filler."""
        self.set_flags(addr, sz, self.FILL, self.FILL)

    # Address properties API

    def set_addr_prop(self, addr, prop, val):
        """Set property prop of addr to val."""
        self.changed = True
        self.addr_map.setdefault(addr, {})[prop] = val

    def get_addr_prop(self, addr, prop, default=None):
        """Return property prop of addr, or default if absent."""
        return self.addr_map.get(addr, {}).get(prop, default)

    def get_addr_prop_dict(self, addr):
        """Return the property dict of addr (empty dict if none)."""
        return self.addr_map.get(addr, {})

    # Label API

    def get_default_label_prefix(self, ea):
        """Return "loc_", "dat_" or "unk_" depending on flags at ea."""
        fl = self.get_flags(ea)
        if fl == self.CODE:
            prefix = "loc_"
        elif fl & self.DATA:
            prefix = "dat_"
        else:
            prefix = "unk_"
        return prefix

    def get_default_label(self, ea):
        """Return the default (prefix + 8 hex digits) label text for ea."""
        prefix = self.get_default_label_prefix(ea)
        return "%s%08x" % (prefix, ea)

    def make_label(self, prefix, ea):
        """Give ea a textual label built from prefix, unless it has one."""
        l = self.get_addr_prop(ea, "label")
        if isinstance(l, str):
            # If it's real label, don't change it
            return
        if not prefix:
            prefix = self.get_default_label_prefix(ea)
        l = "%s%08x" % (prefix, ea)
        self.set_addr_prop(ea, "label", l)
        self.labels_rev[l] = ea

    # auto_label will change its prefix automatically based on
    # type of data it points.
    def make_auto_label(self, ea):
        """Give ea an automatic label (stored as the address itself)."""
        if self.get_addr_prop(ea, "label"):
            return
        self.set_addr_prop(ea, "label", ea)
        self.labels_rev[ea] = ea

    # Delete a label, only if it's auto
    def del_auto_label(self, ea):
        """Remove the label of ea if it is an automatic (non-string) one."""
        label = self.get_addr_prop(ea, "label")
        if not label or isinstance(label, str):
            return
        self.set_addr_prop(ea, "label", None)
        del self.labels_rev[ea]

    def get_label(self, ea):
        """Return the label text of ea, rendering automatic labels; None if none."""
        label = self.get_addr_prop(ea, "label")
        if isinstance(label, int):
            return "%s%08x" % (self.get_default_label_prefix(ea), label)
        return label

    def set_label(self, ea, label):
        """Set the label of ea, creating a one-byte area for it if needed.

        While loading, an existing textual label is kept and the new one
        is only recorded in a comment.
        """
        # Make sure the label can be actually visible - create an area for it if none
        off, area = self.addr2area(ea)
        if area is None:
            self.add_area(ea, ea, {"name": "autocreated to host %s label" % label})
        if self.is_loading:
            existing = self.get_addr_prop(ea, "label")
            if existing is not None and not isinstance(existing, int):
                log.warning("Duplicate label for %x: %s (existing: %s)", ea, label, existing)
                self.append_comment(ea, "Another label: " + label)
                return
        self.set_addr_prop(ea, "label", label)
        self.labels_rev[label] = ea

    def make_unique_label(self, ea, label):
        """Label ea with label, appending "__N" until unused; return the result."""
        existing = self.get_label(ea)
        if existing == label:
            return label
        cnt = 0
        while True:
            l = label
            if cnt > 0:
                l += "__%d" % cnt
            if l not in self.labels_rev:
                self.set_label(ea, l)
                if self.is_loading and cnt > 0:
                    self.append_comment(ea, "Original label: " + label)
                return l
            cnt += 1

    def get_label_list(self):
        """Return all label texts, sorted."""
        return sorted(x if isinstance(x, str) else self.get_default_label(x) for x in self.labels_rev)

    def resolve_label(self, label):
        """Return the address of label, or None if it cannot be resolved.

        Besides stored labels, accepts the rendered text of an automatic
        label ("<prefix>_<hex address>").
        """
        if label in self.labels_rev:
            return self.labels_rev[label]
        try:
            ea = int(label.split("_", 1)[1], 16)
        except (AttributeError, IndexError, ValueError):
            # Not a string, no "_" separator, or non-hex suffix
            return None
        if ea in self.labels_rev and self.get_default_label(ea) == label:
            return ea
        return None

    def label_exists(self, label):
        """Return True if label is a known label key."""
        return label in self.labels_rev

    # Comment API

    def get_comment(self, ea):
        """Return the comment at ea, or None."""
        comm = self.get_addr_prop(ea, "comm")
        return comm

    def set_comment(self, ea, comm):
        """Replace the comment at ea."""
        self.set_addr_prop(ea, "comm", comm)

    def append_comment(self, ea, comm):
        """Append comm to the comment at ea, on a new line if one exists."""
        existing = self.get_comment(ea)
        if existing is not None:
            comm = existing + "\n" + comm
        self.set_addr_prop(ea, "comm", comm)

    # (Pseudo)instruction Argument Properties API

    def set_arg_prop(self, ea, arg_no, prop, prop_val):
        """Set property prop of argument arg_no of the instruction at ea."""
        arg_props = self.get_addr_prop(ea, "args", {})
        if arg_no not in arg_props:
            arg_props[arg_no] = {}
        props = arg_props[arg_no]
        props[prop] = prop_val
        self.set_addr_prop(ea, "args", arg_props)

    def get_arg_prop(self, ea, arg_no, prop):
        """Return property prop of argument arg_no at ea, or None."""
        arg_props = self.get_addr_prop(ea, "args", {})
        return arg_props.get(arg_no, {}).get(prop)

    def get_arg_prop_dict(self, ea, arg_no):
        """Return the property dict of argument arg_no at ea (empty if none)."""
        arg_props = self.get_addr_prop(ea, "args", {})
        return arg_props.get(arg_no, {})

    def make_arg_offset(self, insn_addr, arg_no, ref_addr):
        """Convert an immediate argument to an offset (address) one."""
        # Convert an immediate argument to an offset one
        # insn_addr - address of (pseudo)instruction
        # arg_no - argument no. of instruction
        # ref_addr - value of the argument (i.e. address it refers to)
        old_subtype = self.get_arg_prop(insn_addr, arg_no, "subtype")
        if old_subtype and old_subtype != IMM_ADDR:
            # Preserve old numeric value subtype to unconvert back to it
            # if need.
            self.set_arg_prop(insn_addr, arg_no, "num_subtype", old_subtype)

        self.set_arg_prop(insn_addr, arg_no, "subtype", IMM_ADDR)

        if isinstance(ref_addr, str):
            # Symbolic address
            # TODO: this works only for "dd" virtual instruction
            self.set_addr_prop(insn_addr, "sym", ref_addr)
            return

        label = self.get_label(ref_addr)
        if not label:
            self.make_auto_label(ref_addr)
        self.add_xref(insn_addr, ref_addr, idaapi.dr_O)

    def unmake_arg_offset(self, insn_addr, arg_no, ref_addr):
        """Convert an offset argument back to a normal immediate value."""
        # Convert offset argument to normal immediate value
        old_subtype = self.get_arg_prop(insn_addr, arg_no, "num_subtype")
        self.set_arg_prop(insn_addr, arg_no, "subtype", old_subtype)
        self.del_xref(insn_addr, ref_addr, idaapi.dr_O)
        # If this was last xref, and label is automatic, kill it too
        if not self.get_xrefs(ref_addr):
            self.del_auto_label(ref_addr)

    def is_arg_offset(self, insn_addr, arg_no):
        """Return True if argument arg_no at insn_addr has the IMM_ADDR subtype."""
        old_subtype = self.get_arg_prop(insn_addr, arg_no, "subtype")
        return old_subtype == IMM_ADDR

    # Xref API

    def add_xref(self, from_ea, to_ea, type):
        """Record a cross-reference of the given type from from_ea to to_ea."""
        xrefs = self.get_addr_prop(to_ea, "xrefs", {})
        xrefs[from_ea] = type
        self.set_addr_prop(to_ea, "xrefs", xrefs)

    def del_xref(self, from_ea, to_ea, type):
        """Remove the cross-reference from from_ea to to_ea.

        Raises KeyError if there is no such cross-reference.
        """
        xrefs = self.get_addr_prop(to_ea, "xrefs", {})
        del xrefs[from_ea]
        self.set_addr_prop(to_ea, "xrefs", xrefs)

    def get_xrefs(self, ea):
        """Return the {from_ea: type} dict of xrefs to ea, or None."""
        xrefs = self.get_addr_prop(ea, "xrefs", None)
        return xrefs

    # Functions API

    def make_func(self, from_ea, to_ea_excl=None):
        """Return the Function starting at from_ea, creating it if needed."""
        f = self.get_addr_prop(from_ea, "fun_s")
        if f is not None:
            return f
        f = Function(from_ea, to_ea_excl)
        self.set_addr_prop(from_ea, "fun_s", f)

        if to_ea_excl is not None:
            self.set_addr_prop(to_ea_excl, "fun_e", f)
        # Reset cache
        self.func_starts = None
        return f

    def is_func(self, ea):
        """Return True if a function starts at ea."""
        return self.get_addr_prop(ea, "fun_s") is not None

    # If ea is start of function, return Function object
    def get_func_start(self, ea):
        """Return the Function starting at ea, or None."""
        return self.get_addr_prop(ea, "fun_s")

    # If ea is end of function, return Function object
    def get_func_end(self, ea):
        """Return the Function whose end is recorded at ea, or None."""
        return self.get_addr_prop(ea, "fun_e")

    def set_func_end(self, func, ea):
        """Record ea as the end address of func."""
        self.set_addr_prop(ea, "fun_e", func)

    # Look up function containing address
    def lookup_func(self, ea):
        """Return the Function whose [start, end) span contains ea, or None."""
        # TODO: cache func ranges, use binary search instead
        if self.func_starts is None:
            # (Re)build the sorted index of function start addresses
            self.func_starts = []
            self.func_starts_arr = []
            for start in sorted(self.addr_map):
                func = self.addr_map[start].get("fun_s")
                if func:
                    self.func_starts.append(start)
                    self.func_starts_arr.append(func)

        i = bisect.bisect_right(self.func_starts, ea)
        if i:
            func = self.func_starts_arr[i - 1]
            end = func.get_end()
            if end and func.start <= ea < end:
                return func
        return None

    # Get all functions
    def iter_funcs(self):
        """Yield (address, Function) for every function start."""
        for addr, props in self.addr_map.items():
            func = props.get("fun_s")
            if func:
                yield (addr, func)

    def get_func_list(self):
        """Return the labels of all functions, sorted."""
        return sorted(self.get_label(addr) for addr, f in self.iter_funcs())

    # Memory Subarea API

    def add_subarea(self, start, end, name):
        """Register a named subarea start..end (inclusive)."""
        log.debug("add_subarea(%x, %x, %s)", start, end, name)
        self.subarea_list.append((start, end, name))
        self.subarea_search.append(start)

    # Call this once all add_subarea() calls were made
    def finish_subareas(self):
        """Sort the subarea list and its search index."""
        self.subarea_list.sort()
        self.subarea_search.sort()

    # Look up subarea containing address
    def lookup_subarea(self, ea):
        """Return the (start, end, name) subarea containing ea, or None."""
        i = bisect.bisect_right(self.subarea_search, ea)
        if i:
            area = self.subarea_list[i - 1]
            if area[0] <= ea <= area[1]:
                return area
        return None

    # Issues API

    def add_issue(self, ea, descr):
        """Record an analysis issue at ea (replacing any previous one)."""
        self.issues[ea] = descr

    def get_issues(self):
        """Return the recorded issues as a list of (ea, descr) sorted by ea."""
        return [(ea, self.issues[ea]) for ea in sorted(self.issues)]

    # Persistence API

    def save_area(self, stream, area):
        """Write area bounds and hex-encoded flags (32 bytes per line)."""
        stream.write("%08x %08x\n" % (area[START], area[END]))
        flags = area[FLAGS]
        for i in range(0, len(flags), 32):
            chunk = flags[i:i + 32]
            stream.write(str(binascii.hexlify(chunk), 'utf-8') + "\n")
        stream.write("\n")

    def save_areas(self, stream):
        """Write flags of all areas to stream."""
        for a in self.area_list:
            self.save_area(stream, a)

    def save_addr_props(self, prefix):
        """Write address properties to one file per area.

        Files are named prefix + ".<area start as 8 hex digits>". A file
        is always written for the first area; later areas get one only
        when they have properties to store.
        """
        areas = self.area_list
        area_i = 0

        def open_area_stream(area):
            # Start a properties file for the area and write its header
            s = open(prefix + ".%08x" % area[START], "w")
            s.write("header:\n")
            s.write(" version: 1.0\n")
            return s

        stream = open_area_stream(areas[area_i])
        area_end = areas[area_i][END]
        try:
            for addr in sorted(self.addr_map):
                props = self.addr_map[addr]
                # If entry has just fun_e data, skip it. As fun_e is set
                # on an address past the last byte of func, this address
                # also may not belong to any section, so skipping it
                # to start with is helpful.
                if len(props) == 1 and "fun_e" in props:
                    continue

                if addr > area_end:
                    # Move on to the area hosting this address
                    stream.close()
                    area_i += 1
                    while addr > areas[area_i][END]:
                        area_i += 1
                    assert addr >= areas[area_i][START]
                    stream = open_area_stream(areas[area_i])
                    area_end = areas[area_i][END]

                self._save_props_of_addr(stream, addr, props)
        finally:
            # Make sure the last (or failing) file gets flushed and closed
            stream.close()

    def _save_props_of_addr(self, stream, addr, props):
        # Write the record for a single address to an open properties file
        stream.write("0x%08x:\n" % addr)
        fl = self.get_flags(addr)
        stream.write(" f: %s %02x\n" % (flag2char(fl), fl))
        label = props.get("label")
        arg_props = props.get("args")
        comm = props.get("comm")
        xrefs = props.get("xrefs")
        func = props.get("fun_s")

        if label is not None:
            if label == addr:
                # Automatic label, stored as an empty value
                stream.write(" l:\n")
            else:
                stream.write(" l: %s\n" % label)

        if arg_props is not None:
            arg_props_header = False
            for arg_no, data in sorted(arg_props.items()):
                data = {k: v for k, v in data.items() if v is not None}
                if data:
                    if not arg_props_header:
                        stream.write(" args:\n")
                        arg_props_header = True
                    stream.write(" %s: %r\n" % (arg_no, data))

        if comm is not None:
            stream.write(" cmnt: %r\n" % comm)

        if func is not None:
            if func.end is not None:
                stream.write(" fn_end: 0x%08x\n" % func.end)
            else:
                stream.write(" fn_end: '?'\n")
            ranges = ", ".join("[0x%08x,0x%08x]" % tuple(r) for r in func.get_ranges())
            stream.write(" fn_ranges: [" + ranges + "]\n")

        if xrefs:
            stream.write(" x:\n")
            for from_addr in sorted(xrefs.keys()):
                stream.write(" - 0x%08x: %s\n" % (from_addr, xrefs[from_addr]))

    def load_addr_props(self, stream):
        """Read address properties written by save_addr_props() from stream."""
        l = stream.readline()
        assert l == "header:\n"
        l = stream.readline()
        assert l == " version: 1.0\n"
        l = stream.readline()
        while l:
            assert l.endswith(":\n")
            addr = int(l[:-2], 0)
            props = self.addr_map.get(addr, {})
            l = stream.readline()
            while l and l[0] == " ":
                key, val = [x.strip() for x in l.split(":", 1)]
                # None means "current line consumed, read the next one";
                # multi-line keys below leave their lookahead line in l.
                l = None

                if key == "l":
                    if not val:
                        val = addr
                    props["label"] = val
                    self.labels_rev[val] = addr
                elif key == "cmnt":
                    props["comm"] = val[1:-1].replace("\\n", "\n")
                elif key == "fn_end":
                    if val == "'?'":
                        end = None
                    else:
                        end = int(val, 0)
                    props["fun_s"] = Function(addr, end)
                    # Handled by finish_func() below
                    #if end is not None:
                    #    self.addr_map[end] = {"fun_e": f}
                elif key == "fn_ranges":
                    # Always take the function of *this* address rather
                    # than whatever an earlier record left in a local.
                    f = props["fun_s"]
                    if val != "[]":
                        assert val.startswith("[[") and val.endswith("]]"), val
                        val = val[2:-2]
                        for r in val.split("], ["):
                            r = [int(x, 0) for x in r.split(",")]
                            f.add_range(*r)
                    # Now, call finish func to set func end address, either from
                    # fn_end or fn_ranges
                    finish_func(f)

                elif key == "args":
                    arg_props = {}
                    # NOTE(review): nested entries are recognized by the same
                    # leading-space prefix that property lines use; confirm
                    # the indentation widths against the on-disk format.
                    while True:
                        l = stream.readline()
                        if not l or not l.startswith(" "):
                            break
                        arg_no, data = [x.strip() for x in l.split(":", 1)]
                        assert data[0] == "{" and data[-1] == "}"
                        data = data[1:-1]
                        vals = {}
                        for pair in data.split(","):
                            seq = [x.strip() for x in pair.split(":", 1)]
                            for x in seq:
                                assert x[0] == "'" and x[-1] == "'", x
                            k, v = [x[1:-1] for x in seq]
                            vals[k] = v
                        arg_props[int(arg_no)] = vals
                    props["args"] = arg_props

                elif key == "x":
                    xrefs = {}
                    while True:
                        l = stream.readline()
                        if not l or not l.startswith(" - "):
                            break
                        key, val = [x.strip() for x in l[3:].split(":", 1)]
                        xrefs[int(key, 0)] = val
                    assert xrefs
                    props["xrefs"] = xrefs

                if l is None:
                    l = stream.readline()

            self.addr_map[addr] = props

    def load_area(self, stream, area):
        """Read flags for area from stream, as written by save_area()."""
        l = stream.readline()
        vals = [int(v, 16) for v in l.split()]
        assert area[START] == vals[0] and area[END] == vals[1]
        flags = area[FLAGS]
        i = 0
        while True:
            l = stream.readline().rstrip()
            if not l:
                break
            l = binascii.unhexlify(l)
            flags[i:i + len(l)] = l
            i += len(l)

    def load_areas(self, stream):
        """Read flags of all areas from stream."""
        for a in self.area_list:
            self.load_area(stream, a)

    # Hack for idaapi interfacing
    # TODO: should go to "Analysis" object
    def analisys_stack_push(self, ea, flow_flag=idaapi.fl_JN):
        """Queue ea for analysis on the module-level stack matching flow_flag."""
        global analisys_stack_branches, analisys_stack_calls
        global analisys_stack_returns, analysis_current_func
        # If we know something is func (e.g. from loader), jump
        # to it means tail-call.
        if flow_flag == idaapi.fl_RET_FROM_CALL:
            analisys_stack_returns.append((ea, analysis_current_func))
        elif flow_flag == idaapi.fl_CN or self.is_func(ea):
            analisys_stack_calls.append(ea)
        else:
            analisys_stack_branches.append(ea)
# The single address space instance used by this module's analysis and
# rendering code.
ADDRESS_SPACE = AddressSpace()
# Processor module object; None until set_processor() is called.
_processor = None
def set_processor(p):
    """Remember p as the processor module here and pass it on to idaapi."""
    global _processor
    idaapi.set_processor(p)
    _processor = p
def set_arch_id(id):
    """Set the current architecture id and the matching code address mask.

    For "arm_32_thumb" the lowest address bit is masked off (-2); for any
    other id the mask is reset to -1, so that a mask from an earlier call
    does not linger.
    """
    global arch_id, code_addr_mask
    arch_id = id
    code_addr_mask = -2 if arch_id == "arm_32_thumb" else -1
# Work lists consumed by analyze():
# addresses of functions still to be analyzed
analisys_stack_calls = []
# (return address, function being analyzed at the time of the call) pairs
analisys_stack_returns = []
# branch targets still to be followed
analisys_stack_branches = []
# Function object currently being analyzed, or None
analysis_current_func = None
def add_entrypoint(ea, as_func=True):
    """Queue ea as an analysis entry point.

    With as_func true, a function is created at ea and it is queued as a
    call target; otherwise it is queued as a plain branch target.
    """
    if not as_func:
        analisys_stack_branches.append(ea)
        return
    ADDRESS_SPACE.make_func(ea, None)
    analisys_stack_calls.append(ea)
def init_cmd(ea):
    """Reset the processor's current command object to address ea."""
    cmd = _processor.cmd
    cmd.ea = ea
    cmd.size = 0
    cmd.disasm = None
def finish_func(f):
    """Log the ranges of function f and record its end address, if known.

    Does nothing for a falsy f.
    """
    if not f:
        return
    log.info("Function %s (0x%x) ranges: %s" % (ADDRESS_SPACE.get_label(f.start), f.start, f.ranges.str(hex)))
    func_end = f.get_end()
    if func_end is None:
        return
    ADDRESS_SPACE.set_func_end(f, func_end)
def analyze(callback=lambda cnt:None):
    """Run code-flow analysis until all work lists are empty.

    Pending branches are processed first, then pending function calls,
    then pending call returns. Each successfully analyzed instruction is
    marked as code (with the FUNC flag when inside a function).
    callback(cnt) is invoked after every 1000 analyzed instructions; the
    loop also stops after 1000000 instructions.
    """
    global analysis_current_func
    cnt = 0
    limit = 1000000
    analysis_current_func = None
    while limit:
        if analisys_stack_branches:
            ea = analisys_stack_branches.pop()
            try:
                fl = ADDRESS_SPACE.get_flags(ea, 0xff)
            except InvalidAddrException:
                log.warning("Branch outside address space detected: 0x%x" % ea)
                continue

            if fl == ADDRESS_SPACE.CODE | ADDRESS_SPACE.FUNC:
                fun = ADDRESS_SPACE.get_func_start(ea)
                if fun:
                    log.warning("Jump to (or flow into) a function at 0x%x detected" % ea)

            if analysis_current_func:
                if fl == ADDRESS_SPACE.CODE | ADDRESS_SPACE.FUNC:
                    # Already analyzed as part of a function
                    continue
                if fl not in (ADDRESS_SPACE.CODE, ADDRESS_SPACE.UNK):
                    log.warning("Unexpected flags 0x%x at 0x%x while tracing code branch, skipping it", fl, ea)
                    ADDRESS_SPACE.add_issue(ea, "Jump/flow into non-code")
                    continue
            else:
                if fl != ADDRESS_SPACE.UNK:
                    # Anything already classified is not re-analyzed
                    if fl != ADDRESS_SPACE.CODE:
                        ADDRESS_SPACE.add_issue(ea, "Jump/flow into non-code")
                    continue
        elif analisys_stack_calls:
            finish_func(analysis_current_func)
            analysis_current_func = None
            ea = analisys_stack_calls.pop()
            fun = ADDRESS_SPACE.get_func_start(ea)
            # A call target may have no Function yet (make_func() below
            # creates it); only skip functions which already have ranges.
            if fun is not None and fun.get_ranges():
                continue
            log.info("Starting analysis of function 0x%x" % ea)
            analysis_current_func = ADDRESS_SPACE.make_func(ea)
        elif analisys_stack_returns:
            ea, analysis_current_func = analisys_stack_returns.pop()
            #log.debug("Restarting analysis of call return at 0x%x (fl=%x)", ea, ADDRESS_SPACE.get_flags(ea, 0xff))
            analisys_stack_branches.append(ea)
            continue
        else:
            finish_func(analysis_current_func)
            break

        init_cmd(ea)
        try:
            insn_sz = _processor.ana()
        except InvalidAddrException:
            # Ran out of memory area, just continue
            # with the rest of paths
            continue
        # print("size: %d" % insn_sz, _processor.cmd)
        if insn_sz:
            if not _processor.emu():
                assert False
            if analysis_current_func:
                analysis_current_func.add_insn(ea, insn_sz)
                ADDRESS_SPACE.make_code(ea, insn_sz, ADDRESS_SPACE.FUNC)
            else:
                ADDRESS_SPACE.make_code(ea, insn_sz)
            _processor.out()
            # print("%08x %s" % (_processor.cmd.ea, _processor.cmd.disasm))
            # print("---------")
            limit -= 1
            cnt += 1
            if cnt % 1000 == 0:
                callback(cnt)
    # if not analisys_stack:
    #     print("Analisys finished")
class Model:
    """Sequence of rendered disassembly lines with an address -> line index.

    While lines are added, also tracks the line numbers related to a
    target (address, sub-line no.) pair given at construction.
    """

    def __init__(self, target_addr=0, target_subno=0):
        self._lines = []
        self._cnt = 0
        self._subcnt = 0
        self._last_addr = -1
        self._addr2line = {}
        self.AS = None
        self.target_addr = target_addr
        self.target_subno = target_subno
        self.target_addr_lineno_0 = -1
        self.target_addr_lineno = -1
        self.target_addr_lineno_real = -1

    def lines(self):
        """Return the list of line objects added so far."""
        return self._lines

    def add_object(self, addr, line):
        """Append line, which belongs to address addr, to the model."""
        if self._last_addr != addr:
            # First line for a new address: restart sub-line numbering
            self._last_addr = addr
            self._subcnt = 0

        lineno = self._cnt
        subno = self._subcnt
        is_real = not line.virtual

        if addr == self.target_addr:
            if subno == 0:
                # Contains first line related to the given addr
                self.target_addr_lineno_0 = lineno
            if subno == self.target_subno:
                # Contains line no. target_subno related to the given addr
                self.target_addr_lineno = lineno
            if is_real:
                # Contains line where actual instr/data/unknown bytes are
                # rendered (vs labels/xrefs/etc.)
                self.target_addr_lineno_real = lineno

        self._lines.append(line)
        self._addr2line[(addr, subno)] = lineno
        line.subno = subno
        if is_real:
            # Line of "real" disasm object
            self._addr2line[(addr, -1)] = lineno

        self._cnt = lineno + 1
        self._subcnt = subno + 1

    def addr2line_no(self, addr, subno=-1):
        """Return the line number for (addr, subno), or None if unknown.

        The default subno of -1 selects the "real" line of the address.
        """
        key = (addr, subno)
        return self._addr2line.get(key)

    def undefine_unit(self, addr):
        """Mark the whole unit starting at addr as undefined."""
        unit_size = self.AS.get_unit_size(addr)
        self.AS.make_undefined(addr, unit_size)
def data_sz2mnem(sz):
    """Return the data mnemonic for a unit of sz bytes (1, 2 or 4),
    padded with idaapi.fillstr() to the default width.

    Raises KeyError for any other size.
    """
    mnemonics = {1: "db", 2: "dw", 4: "dd"}
    return idaapi.fillstr(mnemonics[sz], idaapi.DEFAULT_WIDTH)
class DisasmObj:
    """Base class of everything that occupies a line in the disassembly."""

    # Size of "leader fields" in disasm window - address, raw bytes, etc.
    # May be set by MVC controller
    LEADER_SIZE = 9

    # Default indent for a line
    indent = " " * idaapi.DEFAULT_INDENT

    # Default operand positions list is empty and set on class level
    # to save memory. To be overriden on object level.
    arg_pos = ()

    # If False, this object corresponds to real bytes in input binary stream
    # If True, doesn't correspond to bytes in memory: labels, etc.
    virtual = True

    # Textual comment to append
    comment = ""

    # Instance variable expected to be set on each instance:
    # ea =
    # size =
    # subno = # relative no. of several lines corresponding to the same ea

    def render(self):
        # Render object as a string, set it as .cache, and return it
        pass

    def get_operand_addr(self):
        # Get "the most addressful" operand
        # This for example will be called when Enter is pressed
        # not on a specific instruction operand, so this should
        # return value of the operand which contains an address
        # (or the "most suitable" of them if there're few).
        return None

    def __len__(self):
        # Each object should return real character len as display on the screen.
        # Should be fast - called on each cursor movement.
        try:
            text = self.cache
        except AttributeError:
            # Not rendered yet
            text = self.render()
        return self.LEADER_SIZE + len(self.indent) + len(text)

    def content_len(self):
        # Length of the rendered text alone, without leader and indent
        prefix_len = self.LEADER_SIZE + len(self.indent)
        return len(self) - prefix_len
class Instruction(idaapi.insn_t, DisasmObj):
    """Disassembly line for a machine instruction."""

    virtual = False

    def render(self):
        # Have the processor module produce the text for this instruction
        _processor.cmd = self
        _processor.out()
        text = self.disasm + self.comment
        self.cache = text
        return text

    def get_operand_addr(self):
        # Assumes RISC design where only one operand can be address
        mem = None
        imm = None
        for op in self._operands:
            if not (op.flags & idaapi.OF_SHOW):
                continue
            op_type = op.type
            if op_type == idaapi.o_near:
                # Jumps have priority
                return op
            if op_type == idaapi.o_mem:
                mem = op
            elif op_type == idaapi.o_imm:
                imm = op
        # Memory reference wins over immediate
        return mem if mem else imm
class Data(DisasmObj):
    """Disassembly line for a data unit of a given size and value."""

    virtual = False

    def __init__(self, ea, sz, val):
        self.ea = ea
        self.size = sz
        self.val = val

    def render(self):
        mnem = data_sz2mnem(self.size)
        if ADDRESS_SPACE.get_arg_prop(self.ea, 0, "subtype") == IMM_ADDR:
            # Value is an address: show a label (string values are
            # already symbolic names)
            target = self.val
            if not isinstance(target, str):
                target = ADDRESS_SPACE.get_label(target)
            text = "%s%s" % (mnem, target)
        else:
            text = "%s0x%x" % (mnem, self.val)
        text += self.comment
        self.cache = text
        return text

    def get_operand_addr(self):
        # Present the value as an immediate operand
        op = idaapi.op_t(0)
        op.value = self.val
        op.addr = self.val
        op.type = idaapi.o_imm
        return op
class String(DisasmObj):
    """Disassembly line for a string, rendered as a "db" with its repr."""

    virtual = False

    def __init__(self, ea, sz, val):
        self.ea = ea
        self.size = sz
        self.val = val

    def render(self):
        # Show NUL bytes as \0 instead of \x00
        quoted = repr(self.val).replace("\\x00", "\\0")
        text = "%s%s" % (data_sz2mnem(1), quoted)
        text += self.comment
        self.cache = text
        return text
class Fill(DisasmObj):
    """Disassembly line for a run of filler bytes; text is fixed at creation."""

    virtual = False

    def __init__(self, ea, sz):
        self.ea = ea
        self.size = sz
        mnem = idaapi.fillstr(".fill", idaapi.DEFAULT_WIDTH)
        self.cache = mnem + str(sz)

    def render(self):
        # Nothing to recompute: the text was built in the constructor
        return self.cache
class Unknown(DisasmObj):
    """Listing line for a single byte marked as unknown."""

    virtual = False
    size = 1

    def __init__(self, ea, val):
        self.ea, self.val = ea, val

    def render(self):
        """Build, cache and return the text of this byte.

        The byte is shown in hex; values in the range 0x20..0x7e also get
        the corresponding character appended as a trailing note.
        """
        byte = self.val
        char_note = (" ; '%s'" % chr(byte)) if 0x20 <= byte <= 0x7e else ""
        text = "%s0x%02x%s" % (idaapi.fillstr("unk", idaapi.DEFAULT_WIDTH), byte, char_note)
        text += self.comment
        self.cache = text
        return text
class Label(DisasmObj):
    """Listing line showing the label attached to an address."""

    indent = ""

    def __init__(self, ea):
        self.ea = ea

    def render(self):
        """Look up the label for this address, cache and return ``"<label>:"``."""
        text = "%s:" % ADDRESS_SPACE.get_label(self.ea)
        self.cache = text
        return text
class Xref(DisasmObj):
    """Listing line describing one cross-reference to an address."""

    indent = ""

    def __init__(self, ea, from_addr, type):
        self.ea, self.from_addr, self.type = ea, from_addr, type

    def render(self):
        """Build, cache and return the xref comment line.

        When the referencing address lies inside a known function, the
        function's label (plus a hex offset if the reference is not at the
        function start) is appended in parentheses.
        """
        origin = self.from_addr
        func = ADDRESS_SPACE.lookup_func(origin)
        if func:
            where = ADDRESS_SPACE.get_label(func.start)
            delta = origin - func.start
            if delta != 0:
                where += "+0x%x" % delta
            suffix = " (%s)" % where
        else:
            suffix = ""
        padding = " " * idaapi.DEFAULT_XREF_INDENT
        text = padding + "; xref: %s 0x%x" % (self.type, origin) + suffix
        self.cache = text
        return text

    def get_operand_addr(self):
        """Return a synthetic operand whose address is the referencing address."""
        operand = idaapi.op_t(0)
        operand.addr = self.from_addr
        return operand
class Literal(DisasmObj):
    """Listing line with fixed text supplied at construction time."""

    indent = ""

    def __init__(self, ea, str):
        # The text is stored directly as the cache; nothing is computed later.
        self.ea, self.cache = ea, str

    def render(self):
        """Return the text given to the constructor."""
        return self.cache
# Separate types to differentiate content
class AreaWrapper(Literal):
    """Literal line used for the "Start of ... area" / "End of ... area" banners."""
    pass
# Separate types to differentiate content
class FunctionWrapper(Literal):
    """Literal line used for the "Start of function" / "End of function" banners."""
    pass
def render():
    """Render the address space from its very beginning into a new model.

    Rendering starts at offset 0 of area 0 with a budget of 1000000 lines.
    """
    full_model = Model()
    render_partial(full_model, 0, 0, 1000000)
    return full_model
# How many bytes a single disasm object (i.e. a line) may occupy.
# render_partial_around() multiplies this by the number of context lines
# to decide how far back from the target address rendering should start.
MAX_UNIT_SIZE = 4
def render_partial_around(addr, subno, context_lines):
    """Render a window of the listing centred on *addr*.

    Rendering starts ``context_lines * MAX_UNIT_SIZE`` bytes before *addr*,
    moving into preceding areas when that falls before the start of the
    area containing *addr*, and is clamped to the beginning of the address
    space.

    Returns the populated Model, or None when *addr* is not inside any area.
    If the exact *subno* line of *addr* was not found, the model's target
    line falls back to the first line for that address.
    """
    log.debug("render_partial_around(%x, %d)", addr, subno)
    off, area = ADDRESS_SPACE.addr2area(addr)
    if area is None:
        return None

    off -= context_lines * MAX_UNIT_SIZE
    if off < 0:
        # Borrow the missing bytes from the areas preceding this one.
        for prev_no in range(ADDRESS_SPACE.area_no(area) - 1, -1, -1):
            area = ADDRESS_SPACE.area_list[prev_no]
            off += area[1] - area[0] + 1
            if off >= 0:
                break
        # Reached beginning of address space, just set as such
        off = max(off, 0)
    assert off >= 0

    log.debug("render_partial_around: off=0x%x, %s", off, str_area(area))
    off = ADDRESS_SPACE.adjust_offset_reverse(off, area)
    log.debug("render_partial_around adjusted: off=0x%x, %s", off, str_area(area))

    model = Model(addr, subno)
    start_area_no = ADDRESS_SPACE.area_list.index(area)
    render_partial(model, start_area_no, off, context_lines, addr)
    log.debug("render_partial_around model done, lines: %d", len(model.lines()))

    assert model.target_addr_lineno_0 >= 0
    if model.target_addr_lineno == -1:
        # If we couldn't find exact subno, use 0th subno of that addr
        # TODO: maybe should be last subno, because if we couldn't find
        # exact one, it was ~ last and removed, so current last is "closer"
        # to it.
        model.target_addr_lineno = model.target_addr_lineno_0
    return model
def render_from(model, addr, num_lines):
    """Render *num_lines* objects into *model*, starting at *addr*.

    Returns whatever render_partial() returns, or None when *addr* does not
    belong to any area.
    """
    off, area = ADDRESS_SPACE.addr2area(addr)
    if area is None:
        return None
    area_index = ADDRESS_SPACE.area_list.index(area)
    return render_partial(model, area_index, off, num_lines)
def render_partial(model, area_no, offset, num_lines, target_addr=-1):
    """Append listing objects to *model*, starting inside area *area_no*.

    Rendering begins *offset* bytes into the area with index *area_no* and
    carries on through the following areas. For each address the following
    are added, in order: a function-start banner, the xrefs (sorted by
    source address), the label, the main object (unknown byte, data,
    string, fill or instruction), continuation lines of a multi-line
    comment, and a function-end banner.

    One unit of *num_lines* is consumed per main object, not per emitted
    line. When *target_addr* is non-negative, objects located before it do
    not count against the budget.

    Returns the address following the last rendered object when the budget
    reaches zero, or None when the end of the address space is hit first.
    """
    model.AS = ADDRESS_SPACE
    first_area = True
    while area_no < len(ADDRESS_SPACE.area_list):
        a = ADDRESS_SPACE.area_list[area_no]
        area_no += 1
        # Only the first visited area is entered at *offset*.
        i = offset if first_area else 0
        first_area = False
        if i == 0:
            model.add_object(a[START], AreaWrapper(a[START], "; Start of 0x%x area (%s)" % (a[START], a[PROPS].get("name", "noname"))))
        raw = a[BYTES]
        flags = a[FLAGS]
        areasize = len(raw)
        while i < areasize:
            addr = a[START] + i
            # If we didn't yet reach target address, compensate for
            # the following decrement of num_lines. The logic is:
            # render all lines up to target_addr, and then num_lines past it.
            if target_addr >= 0 and addr < target_addr:
                num_lines += 1

            props = ADDRESS_SPACE.get_addr_prop_dict(addr)

            func = props.get("fun_s")
            if func:
                model.add_object(addr, FunctionWrapper(addr, "; Start of function '%s'" % ADDRESS_SPACE.get_label(func.start)))

            xrefs = props.get("xrefs")
            if xrefs:
                for from_addr in sorted(xrefs.keys()):
                    model.add_object(addr, Xref(addr, from_addr, xrefs[from_addr]))

            if props.get("label"):
                model.add_object(addr, Label(addr))

            # Bit 0x80 is masked off before classifying the byte.
            f = flags[i] & 0x7f
            if f == AddressSpace.UNK:
                sz = 1
                out = Unknown(addr, raw[i])
            elif f & AddressSpace.DATA:
                j = i + 1
                while j < areasize and flags[j] & AddressSpace.DATA_CONT:
                    j += 1
                sz = j - i
                assert sz <= 4
                out = Data(addr, sz, ADDRESS_SPACE.get_data(addr, sz))
            elif f == AddressSpace.STR:
                j = i + 1
                while j < areasize and flags[j] == AddressSpace.DATA_CONT:
                    j += 1
                sz = j - i
                text = "".join(chr(raw[k]) for k in range(i, j))
                out = String(addr, sz, text)
            elif f == AddressSpace.FILL:
                j = i + 1
                while j < areasize and flags[j] == AddressSpace.FILL:
                    j += 1
                sz = j - i
                out = Fill(addr, sz)
            elif f == AddressSpace.CODE:
                out = Instruction(addr)
                _processor.cmd = out
                sz = _processor.ana()
                _processor.out()
            else:
                out = Literal(addr, "; UNEXPECTED value: %02x flags: %02x" % (raw[i], f))
                sz = 1
                assert 0, "@%08x flags=%x" % (addr, f)
            i += sz

            comm = props.get("comm")
            if comm:
                comm_lines = comm.split("\n")
                # The indent is measured before the comment is attached, so
                # continuation lines align with the first comment line.
                comm_indent = " " * (out.content_len() + len(out.indent) + 2)
                out.comment = " ; " + comm_lines[0]

            model.add_object(addr, out)

            if comm:
                for extra_line in comm_lines[1:]:
                    comm_obj = Literal(addr, "; " + extra_line)
                    comm_obj.indent = comm_indent
                    model.add_object(addr, comm_obj)

            next_addr = addr + sz
            func_end = ADDRESS_SPACE.get_addr_prop_dict(next_addr).get("fun_e")
            if func_end:
                banner = "; End of function '%s' (%s)" % (
                    ADDRESS_SPACE.get_label(func_end.start), func_end.get_end_method()
                )
                model.add_object(addr, FunctionWrapper(addr, banner))

            num_lines -= 1
            if not num_lines:
                return next_addr

        model.add_object(a[END], AreaWrapper(a[END], "; End of 0x%x area (%s)" % (a[START], a[PROPS].get("name", "noname"))))
def flag2char(f):
    """Map a per-byte flag value to the one-character code used in the address map.

    Unrecognised values map to "X".
    """
    # Checked in order, so the first equal entry wins, as in an if/elif chain.
    legend = (
        (AddressSpace.UNK, "."),
        (AddressSpace.CODE, "C"),
        (AddressSpace.CODE | AddressSpace.FUNC, "F"),
        (AddressSpace.CODE_CONT, "c"),
        (AddressSpace.DATA, "D"),
        (AddressSpace.DATA_CONT, "d"),
        (AddressSpace.STR, "A"),
        (AddressSpace.FILL, "-"),
    )
    for flag, char in legend:
        if f == flag:
            return char
    return "X"
def print_address_map():
    """Write a one-character-per-byte flag map of every area to stdout.

    Each row holds up to 128 bytes and begins with the 8-digit hex address
    of its first byte.
    """
    for a in ADDRESS_SPACE.area_list:
        base = a[START]
        for i, flag in enumerate(a[FLAGS]):
            if i % 128 == 0:
                sys.stdout.write("\n")
                sys.stdout.write("%08x " % (base + i))
            sys.stdout.write(flag2char(flag))
        sys.stdout.write("\n")
# Module-level call: hand this module's ADDRESS_SPACE object to idaapi.
idaapi.set_address_space(ADDRESS_SPACE)