Skip to content

Commit

Permalink
Improved behavior for indirect name lookup (#89)
Browse files Browse the repository at this point in the history
* Improved behavior for indirect name lookup

* Make sure import entities have a name
  • Loading branch information
disinvite authored Mar 2, 2025
1 parent 07045cb commit c51e57b
Show file tree
Hide file tree
Showing 6 changed files with 437 additions and 128 deletions.
57 changes: 28 additions & 29 deletions reccmp/isledecomp/compare/asm/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
placeholder string."""

import re
import struct
from functools import cache
from typing import Callable
from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS
from .instgen import InstructGen, SectionType
from .replacement import AddrTestProtocol, NameReplacementProtocol
Expand All @@ -33,28 +31,22 @@ def from_hex(string: str) -> int | None:
return None


def bytes_to_dword(b: bytes) -> int | None:
if len(b) == 4:
return struct.unpack("<L", b)[0]

return None


class ParseAsm:
def __init__(
self,
addr_test: AddrTestProtocol | None = None,
name_lookup: NameReplacementProtocol | None = None,
bin_lookup: Callable[[int, int], bytes | None] | None = None,
) -> None:
self.addr_test = addr_test
self.name_lookup = name_lookup
self.bin_lookup = bin_lookup

self.replacements: dict[int, str] = {}
self.indirect_replacements: dict[int, str] = {}
self.number_placeholders = True

def reset(self):
self.replacements = {}
self.indirect_replacements = {}

def is_addr(self, value: int) -> bool:
"""Wrapper for user-provided address test"""
Expand All @@ -63,13 +55,22 @@ def is_addr(self, value: int) -> bool:

return False

def lookup(self, addr: int, exact: bool = False) -> str | None:
def lookup(
self, addr: int, exact: bool = False, indirect: bool = False
) -> str | None:
"""Wrapper for user-provided name lookup"""
if callable(self.name_lookup):
return self.name_lookup(addr, exact=exact)
return self.name_lookup(addr, exact=exact, indirect=indirect)

return None

def _next_placeholder(self) -> str:
"""The placeholder number corresponds to the number of addresses we have
already replaced. This is so the number will be consistent across the diff
if we can replace some symbols with actual names in recomp but not orig."""
number = len(self.replacements) + len(self.indirect_replacements) + 1
return f"<OFFSET{number}>" if self.number_placeholders else "<OFFSET>"

def replace(self, addr: int, exact: bool = False) -> str:
"""Provide a replacement name for the given address."""
if addr in self.replacements:
Expand All @@ -79,14 +80,22 @@ def replace(self, addr: int, exact: bool = False) -> str:
self.replacements[addr] = name
return name

# The placeholder number corresponds to the number of addresses we have
# already replaced. This is so the number will be consistent across the diff
# if we can replace some symbols with actual names in recomp but not orig.
idx = len(self.replacements) + 1
placeholder = f"<OFFSET{idx}>" if self.number_placeholders else "<OFFSET>"
placeholder = self._next_placeholder()
self.replacements[addr] = placeholder
return placeholder

def indirect_replace(self, addr: int) -> str:
if addr in self.indirect_replacements:
return self.indirect_replacements[addr]

if (name := self.lookup(addr, exact=True, indirect=True)) is not None:
self.indirect_replacements[addr] = name
return name

placeholder = self._next_placeholder()
self.indirect_replacements[addr] = placeholder
return placeholder

def hex_replace_always(self, match: re.Match) -> str:
"""If a pointer value was matched, always insert a placeholder"""
value = int(match.group(1), 16)
Expand Down Expand Up @@ -119,17 +128,7 @@ def hex_replace_indirect(self, match: re.Match) -> str:
If we cannot identify the indirect address, fall back to a lookup
on the original pointer value so we might display something useful."""
value = int(match.group(1), 16)
indirect_value = None

if callable(self.bin_lookup):
indirect_value = self.bin_lookup(value, 4)

if indirect_value is not None:
indirect_addr = bytes_to_dword(indirect_value)
if indirect_addr is not None and self.lookup(indirect_addr) is not None:
return "->" + self.replace(indirect_addr)

return self.replace(value)
return self.indirect_replace(value)

def sanitize(self, inst: DisasmLiteInst) -> tuple[str, str]:
# For jumps or calls, if the entire op_str is a hex number, the value
Expand Down
86 changes: 75 additions & 11 deletions reccmp/isledecomp/compare/asm/replacement.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,92 @@ def __call__(self, addr: int, /) -> bool:


class NameReplacementProtocol(Protocol):
def __call__(self, addr: int, exact: bool = False) -> str | None:
def __call__(
self, addr: int, exact: bool = False, indirect: bool = False
) -> str | None:
...


def create_name_lookup(
db_getter: Callable[[int, bool], ReccmpEntity | None], addr_attribute: str
db_getter: Callable[[int, bool], ReccmpEntity | None],
bin_read: Callable[[int], int | None],
addr_attribute: str,
) -> NameReplacementProtocol:
"""Function generator for name replacement"""

@cache
def lookup(addr: int, exact: bool = False) -> str | None:
m = db_getter(addr, exact)
if m is None:
def follow_indirect(pointer: int) -> ReccmpEntity | None:
"""Read the pointer address and open the entity (if it exists) at the indirect location."""
addr = bin_read(pointer)
if addr is not None:
return db_getter(addr, True)

return None

def get_name(entity: ReccmpEntity, offset: int = 0) -> str | None:
"""The offset is the difference between the input search address and the entity's
starting address. Decide whether to return the base name (match_name) or
a string wtih the base name plus the offset.
Returns None if there is no suitable name."""
if offset == 0:
return entity.match_name()

# We will not return an offset name if this is not a variable
# or if the offset is outside the range of the entity.
if entity.entity_type != EntityType.DATA or offset >= entity.size:
return None

if getattr(m, addr_attribute) == addr:
return m.match_name()
return entity.offset_name(offset)

def indirect_lookup(addr: int) -> str | None:
"""Same as regular lookup but aware of the fact that the address is a pointer.
Indirect implies exact search, so we drop both parameters from the lookup entry point.
"""
entity = db_getter(addr, True)
if entity is not None:
# If the indirect call points at a variable initialized to a function,
# prefer the variable name as this is more useful.
if entity.entity_type == EntityType.DATA:
return entity.match_name()

if entity.entity_type == EntityType.IMPORT:
import_name = entity.get("import_name")
if import_name is not None:
return "->" + import_name + " (FUNCTION)"

return entity.match_name()

# No suitable entity at the base address. Read the pointer and see what we get.
entity = follow_indirect(addr)

if entity is None:
return None

# Exact match only for indirect.
# The 'addr' variable still points at the indirect addr.
name = get_name(entity, offset=0)
if name is not None:
return "->" + name

return None

@cache
def lookup(addr: int, exact: bool = False, indirect: bool = False) -> str | None:
"""Returns the name that represents the entity at the given addresss.
If there is no suitable name, return None and let the caller choose one (i.e. placeholder).
* exact: If the addr is an offset of an entity (e.g. struct/array) we may return
a name like 'variable+8'. If exact is True, return a name only if the entity's addr
matches the addr parameter.
* indirect: If True, the given addr is a pointer so we have the option to read the address
from the binary to find the name."""
if indirect:
return indirect_lookup(addr)

entity = db_getter(addr, exact)

offset = addr - getattr(m, addr_attribute)
if m.entity_type != EntityType.DATA or offset >= m.size:
if entity is None:
return None

return m.offset_name(offset)
offset = addr - getattr(entity, addr_attribute)
return get_name(entity, offset)

return lookup
96 changes: 47 additions & 49 deletions reccmp/isledecomp/compare/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
import uuid
from dataclasses import dataclass
from typing import Callable, Iterable, Iterator
from reccmp.isledecomp.formats.exceptions import InvalidVirtualAddressError
from reccmp.isledecomp.formats.exceptions import (
InvalidVirtualAddressError,
InvalidVirtualReadError,
)
from reccmp.isledecomp.formats.pe import PEImage
from reccmp.isledecomp.cvdump.demangler import (
demangle_string_const,
Expand Down Expand Up @@ -75,13 +78,14 @@ def lookup(addr: int) -> bool:
return lookup


def create_bin_lookup(bin_file: PEImage) -> Callable[[int, int], bytes | None]:
"""Function generator for reading from the bin file"""
def create_bin_lookup(bin_file: PEImage) -> Callable[[int], int | None]:
"""Function generator to read a pointer from the bin file"""

def lookup(addr: int, size: int) -> bytes | None:
def lookup(addr: int) -> int | None:
try:
return bin_file.read(addr, size)
except InvalidVirtualAddressError:
(ptr,) = struct.unpack("<L", bin_file.read(addr, 4))
return ptr
except (struct.error, InvalidVirtualAddressError, InvalidVirtualReadError):
return None

return lookup
Expand Down Expand Up @@ -143,13 +147,17 @@ def __init__(

self.orig_sanitize = ParseAsm(
addr_test=create_reloc_lookup(self.orig_bin),
name_lookup=create_name_lookup(self._db.get_by_orig, "orig_addr"),
bin_lookup=create_bin_lookup(self.orig_bin),
name_lookup=create_name_lookup(
self._db.get_by_orig, create_bin_lookup(self.orig_bin), "orig_addr"
),
)
self.recomp_sanitize = ParseAsm(
addr_test=create_reloc_lookup(self.recomp_bin),
name_lookup=create_name_lookup(self._db.get_by_recomp, "recomp_addr"),
bin_lookup=create_bin_lookup(self.recomp_bin),
name_lookup=create_name_lookup(
self._db.get_by_recomp,
create_bin_lookup(self.recomp_bin),
"recomp_addr",
),
)

def _load_cvdump(self):
Expand Down Expand Up @@ -521,48 +529,38 @@ def _match_imports(self):
recomp_byname = {
(dll.upper(), name): addr for (dll, name, addr) in self.recomp_bin.imports
}
# Combine these two dictionaries. We don't care about imports from recomp
# not found in orig because:
# 1. They shouldn't be there
# 2. They are already identified via cvdump
orig_to_recomp = {
addr: recomp_byname.get(pair, None) for addr, pair in orig_byaddr.items()
}

# Now: we have the IAT offset in each matched up, so we need to make
# the connection between the thunk functions.
# We already have the symbol name we need from the PDB.
for orig, recomp in orig_to_recomp.items():
if orig is None or recomp is None:
continue

# Match the __imp__ symbol
self._db.set_pair(orig, recomp, EntityType.POINTER)
with self._db.batch() as batch:
for dll, name, addr in self.orig_bin.imports:
import_name = f"{dll.upper()}:{name}"
batch.set_orig(
addr,
name=f"__imp__{name}",
import_name=import_name,
size=4,
type=EntityType.IMPORT,
)

# Read the relative address from .idata
try:
(recomp_rva,) = struct.unpack("<L", self.recomp_bin.read(recomp, 4))
(orig_rva,) = struct.unpack("<L", self.orig_bin.read(orig, 4))
except ValueError:
# Bail out if there's a problem with struct.unpack
continue
for dll, name, addr in self.recomp_bin.imports:
# TODO: recomp imports should already have a name from the PDB
# but set it anyway to avoid problems later.
import_name = f"{dll.upper()}:{name}"
batch.set_recomp(
addr,
name=f"__imp__{name}",
import_name=import_name,
size=4,
type=EntityType.IMPORT,
)

# Strictly speaking, this is a hack to support asm sanitize.
# When calling an import, we will recognize that the address for the
# CALL instruction is a pointer to the actual address, but this is
# not only not the address of a function, it is not an address at all.
# To make the asm display work correctly (i.e. to match what you see
# in ghidra) create a function match on the RVA. This is not a valid
# virtual address because it is before the imagebase, but it will
# do what we need it to do in the sanitize function.

(dll_name, func_name) = orig_byaddr[orig]
fullname = dll_name + ":" + func_name
self._db.set_recomp_symbol(
recomp_rva, type=EntityType.FUNCTION, name=fullname, size=4
)
self._db.set_pair(orig_rva, recomp_rva, EntityType.FUNCTION)
self._db.skip_compare(orig_rva)
# Combine these two dictionaries. We don't care about imports from recomp
# not found in orig because:
# 1. They shouldn't be there
# 2. They are already identified via cvdump
for orig_addr, pair in orig_byaddr.items():
recomp_addr = recomp_byname.get(pair, None)
if recomp_addr is not None:
batch.match(orig_addr, recomp_addr)

def _match_thunks(self):
"""Thunks are (by nature) matched by indirection. If a thunk from orig
Expand Down
1 change: 1 addition & 0 deletions reccmp/isledecomp/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ class EntityType(IntEnum):
STRING = 4
VTABLE = 5
FLOAT = 6
IMPORT = 7
Loading

0 comments on commit c51e57b

Please sign in to comment.