diff --git a/reccmp/isledecomp/formats/exceptions.py b/reccmp/isledecomp/formats/exceptions.py index b1d4b924..86f349ca 100644 --- a/reccmp/isledecomp/formats/exceptions.py +++ b/reccmp/isledecomp/formats/exceptions.py @@ -5,3 +5,9 @@ class SectionNotFoundError(KeyError): class InvalidVirtualAddressError(IndexError): """The given virtual address is too high or low to point to something in the binary file.""" + + +class InvalidVirtualReadError(IndexError): + """Reading the given number of bytes from the given virtual address + would cause us to read past the end of the section or past the end + of the virtual address space.""" diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py index 530c7f94..fed08444 100644 --- a/reccmp/isledecomp/formats/pe.py +++ b/reccmp/isledecomp/formats/pe.py @@ -5,7 +5,6 @@ - Debug information: https://www.debuginfo.com/examples/src/DebugDir.cpp """ -import bisect import dataclasses from enum import IntEnum, IntFlag from functools import cached_property @@ -13,7 +12,11 @@ import struct from typing import Iterator, Optional -from .exceptions import InvalidVirtualAddressError, SectionNotFoundError +from .exceptions import ( + InvalidVirtualAddressError, + InvalidVirtualReadError, + SectionNotFoundError, +) from .image import Image from .mz import ImageDosHeader @@ -467,7 +470,6 @@ class PEImage(Image): # FIXME: do these belong to PEImage? Shouldn't the loade apply these to the data? _relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False) _relocations: set[int] = dataclasses.field(default_factory=set, repr=False) - _section_vaddr: list[int] = dataclasses.field(default_factory=list, repr=False) # find_str: bool = dataclasses.field(default=False, repr=False) imports: list[tuple[int, str]] = dataclasses.field(default_factory=list, repr=False) exports: list[tuple[str, str, int]] = dataclasses.field( @@ -524,9 +526,6 @@ def load(self): f"reccmp only supports i386 binaries: {self.header.machine}." ) - # bisect does not support key on the GitHub CI version of python - self._section_vaddr = [section.virtual_address for section in self.sections] - self._populate_relocations() self._populate_imports() self._populate_thunks() @@ -941,16 +940,27 @@ def get_abs_addr(self, section: int, offset: int) -> int: into an absolute vaddr.""" return self.get_section_offset_by_index(section) + offset - def get_relative_addr(self, addr: int) -> tuple[int, int]: - """Convert an absolute address back into a (section, offset) pair.""" - i = bisect.bisect_right(self._section_vaddr, addr) - 1 - i = max(0, i) + @cached_property + def vaddr_ranges(self) -> list[tuple[int, int]]: + """Return the start and end virtual address of each section in the file.""" + return list( + ( + self.imagebase + section.virtual_address, + self.imagebase + + section.virtual_address + + max(section.size_of_raw_data, section.virtual_size), + ) + for section in self.section_headers + ) - section = self.sections[i] - if section.contains_vaddr(addr): - return i + 1, addr - section.virtual_address + def get_relative_addr(self, addr: int) -> tuple[int, int]: + """Convert an absolute address back into a (section_id, offset) pair. + n.b. section_id is 1-based to match PDB output.""" + for i, (start, end) in enumerate(self.vaddr_ranges): + if start <= addr < end: + return i + 1, addr - start - raise InvalidVirtualAddressError(f"{self.filepath} : 0x{addr:08x} {section=}") + raise InvalidVirtualAddressError(f"{self.filepath} : 0x{addr:x}") def is_valid_section(self, section_id: int) -> bool: """The PDB will refer to sections that are not listed in the headers @@ -962,36 +972,64 @@ def is_valid_section(self, section_id: int) -> bool: return False def is_valid_vaddr(self, vaddr: int) -> bool: - """Does this virtual address point to anything in the exe?""" - try: - (_, __) = self.get_relative_addr(vaddr) - except InvalidVirtualAddressError: - return False + """Is this virtual address part of the image when loaded?""" + # Use max here just in case the section headers are not ordered by v.addr + (_, last_vaddr) = max(self.vaddr_ranges, key=lambda s: s[1]) + return self.imagebase <= vaddr < last_vaddr - return True + @cached_property + def uninitialized_ranges(self) -> list[tuple[int, int]]: + """Return a start and end range of each region in the file that holds uninitialized data. + This can be an entire section (.bss) or the gap between the end of the physical data + and the virtual size. These ranges do not correspond to section ids.""" + output = [] + for section in self.section_headers: + if ( + section.characteristics + & PESectionFlags.IMAGE_SCN_CNT_UNINITIALIZED_DATA + ): + output.append( + ( + self.imagebase + section.virtual_address, + self.imagebase + section.virtual_address + section.virtual_size, + ) + ) + elif section.virtual_size > section.size_of_raw_data: + # Should also cover the case where size_of_raw_data = 0. + output.append( + ( + self.imagebase + + section.virtual_address + + section.size_of_raw_data, + self.imagebase + section.virtual_address + section.virtual_size, + ) + ) - def read_string(self, offset: int, chunk_size: int = 1000) -> Optional[bytes]: - """Read until we find a zero byte.""" - b = self.read(offset, chunk_size) - if b is None: - return None + return output - try: - return b[: b.index(b"\x00")] - except ValueError: - # No terminator found, just return what we have - return b + def addr_is_uninitialized(self, vaddr: int) -> bool: + return any(start <= vaddr < end for start, end in self.uninitialized_ranges) - def read(self, vaddr: int, size: int) -> Optional[bytes]: - """Read (at most) the given number of bytes at the given virtual address. - If we return None, the given address points to uninitialized data.""" + def read_string(self, vaddr: int, chunk_size: int = 1000) -> bytes: + """Read up to chunk_size or until we find a zero byte.""" (section_id, offset) = self.get_relative_addr(vaddr) section = self.sections[section_id - 1] + view = section.view[offset : offset + chunk_size] + # Don't call read() here because we might not get the entire chunk size. + # Use whatever we can get if we are at the end of the section. + return view.tobytes().partition(b"\x00")[0] - if section.addr_is_uninitialized(vaddr): - return None + def read(self, vaddr: int, size: int) -> bytes: + (section_id, offset) = self.get_relative_addr(vaddr) + section = self.sections[section_id - 1] + + # If we try to read off the end of the section + if size < 0 or (offset + size) > section.extent: + raise InvalidVirtualReadError( + f"{self.filepath} : Cannot read {size} bytes from 0x{vaddr:x}" + ) - # Clamp the read within the extent of the current section. - # Reading off the end will most likely misrepresent the virtual addressing. - _size = min(size, section.size_of_raw_data - offset) - return bytes(section.view[offset : offset + _size]) + # Pad with zero bytes if reading uninitialized data. + # Assumes the section memoryview is cropped to the initialized bytes + view = section.view[offset : offset + size] + return bytes(view) + b"\x00" * (size - len(view)) diff --git a/reccmp/tools/datacmp.py b/reccmp/tools/datacmp.py index cef5058f..2524ee21 100755 --- a/reccmp/tools/datacmp.py +++ b/reccmp/tools/datacmp.py @@ -142,6 +142,7 @@ def create_comparison_item( def do_the_comparison(target: RecCmpBuiltTarget) -> Iterable[ComparisonItem]: + # pylint: disable=too-many-locals """Run through each variable in our compare DB, then do the comparison according to the variable's type. Emit the result.""" origfile = detect_image(filepath=target.original_path) @@ -193,48 +194,47 @@ def do_the_comparison(target: RecCmpBuiltTarget) -> Iterable[ComparisonItem]: orig_raw = origfile.read(var.orig_addr, data_size) recomp_raw = recompfile.read(var.recomp_addr, data_size) - # The IMAGE_SECTION_HEADER defines the SizeOfRawData and VirtualSize for the section. - # If VirtualSize > SizeOfRawData, the section is comprised of the initialized data - # corresponding to bytes in the file, and the rest is padded with zeroes when - # Windows loads the image. - # The linker might place variables initialized to zero on the threshold between - # physical data and the virtual (uninitialized) data. - # If this happens (i.e. we get an incomplete read) we just do the same padding - # to prepare for the comparison. - if orig_raw is not None and len(orig_raw) < data_size: - orig_raw = orig_raw.ljust(data_size, b"\x00") - - if recomp_raw is not None and len(recomp_raw) < data_size: - recomp_raw = recomp_raw.ljust(data_size, b"\x00") - - # If one or both variables are entirely uninitialized - if orig_raw is None or recomp_raw is None: - # If both variables are uninitialized, we consider them equal. - match = orig_raw is None and recomp_raw is None - - # We can match a variable initialized to all zeroes with - # an uninitialized variable, but this may or may not actually - # be correct, so we flag it for the user. - uninit_force_match = not match and ( - (orig_raw is None and all(b == 0 for b in recomp_raw)) - or (recomp_raw is None and all(b == 0 for b in orig_raw)) + orig_is_null = all(b == 0 for b in orig_raw) + recomp_is_null = all(b == 0 for b in recomp_raw) + + # If all bytes are zero on either read, it's possible that the variable + # is uninitialized on one or both sides. Special handling for that situation: + if orig_is_null or recomp_is_null: + # Check the last address of the variable in each file to see if any of it is + # in the uninitialized area of the section. + orig_in_bss = origfile.addr_is_uninitialized(var.orig_addr + data_size - 1) + recomp_in_bss = recompfile.addr_is_uninitialized( + var.recomp_addr + data_size - 1 ) - orig_value = "(uninitialized)" if orig_raw is None else "(initialized)" - recomp_value = "(uninitialized)" if recomp_raw is None else "(initialized)" - yield create_comparison_item( - var, - compared=[ - ComparedOffset( - offset=0, - name=None, - match=match, - values=(orig_value, recomp_value), - ) - ], - raw_only=uninit_force_match, - ) - continue + if orig_in_bss or recomp_in_bss: + # We record a match if both items are null and: + # 1. Both values are entirely initialized to zero + # 2. All or part of both values are in the uninitialized area + match = ( + orig_is_null and recomp_is_null and (orig_in_bss == recomp_in_bss) + ) + + # However... you may not have full control over where the variable sits in the + # section, so we will only warn (and not log a diff) if the variable is + # initialized in one file but not the other. + uninit_force_match = orig_is_null and recomp_is_null + + orig_value = "(uninitialized)" if orig_in_bss else "(initialized)" + recomp_value = "(uninitialized)" if recomp_in_bss else "(initialized)" + yield create_comparison_item( + var, + compared=[ + ComparedOffset( + offset=0, + name=None, + match=match, + values=(orig_value, recomp_value), + ) + ], + raw_only=uninit_force_match, + ) + continue if not is_type_aware: # If there is no specific type information available diff --git a/tests/test_islebin.py b/tests/test_islebin.py index 86868681..7e643cf4 100644 --- a/tests/test_islebin.py +++ b/tests/test_islebin.py @@ -9,6 +9,7 @@ from reccmp.isledecomp.formats.exceptions import ( SectionNotFoundError, InvalidVirtualAddressError, + InvalidVirtualReadError, ) @@ -65,12 +66,47 @@ def test_unusual_reads(binfile: PEImage): with pytest.raises(InvalidVirtualAddressError): binfile.read(0xFFFFFFFF, 1) - # Uninitialized part of .data - assert binfile.read(0x1010A600, 4) is None + # Initialized bytes for .data end at 0x10102c00. Read from the uninitialized section. + # Older versions of reccmp would return None for uninitialized data. + # We now return zeroes to emulate the behavior of the real program. + assert binfile.read(0x1010A600, 4) == b"\x00\x00\x00\x00" - # Past the end of virtual size in .text + # Read the last 16 initialized bytes of .data + assert len(binfile.read(0x10102BF0, 16)) == 16 + + # Keep reading into the uninitialized section without an exception. + assert len(binfile.read(0x10102BF0, 32)) == 32 + + # Read 8 initialized and 8 uninitialized bytes. Should pad with zeroes. + assert binfile.read(0x10102BF8, 16) == (b"\x00" * 16) + + # Unlike .data, physical size for .text is larger than virtual size. + # This means the padding bytes are stored in the file. + # Read the unused but initialized bytes. assert binfile.read(0x100D3A70, 4) == b"\x00\x00\x00\x00" + # .text ends at 0x100d3c00. Even though .rdata does not begin until 0x100d4000, + # we still should not read past the end of virtual data. + with pytest.raises(InvalidVirtualReadError): + binfile.read(0x100D3BFF, 10) + + # Read past the final virtual address in .reloc + with pytest.raises(InvalidVirtualReadError): + binfile.read(0x10120DF0, 32) + + # Reading zero bytes is okay + assert binfile.read(0x100DB588, 0) == b"" + + # Cannot read with negative size + with pytest.raises(InvalidVirtualReadError): + binfile.read(0x100DB588, -1) + + # There are fewer than 1000 bytes in .reloc at the location of this string. + # Chunk size is chosen arbitrarily for string reads. The reason is that we need + # to read until we hit a zero byte, so we don't use the memoryview directly. + # This should not fail. + assert binfile.read_string(0x1010BFFC, 1000) == b"d3drm.dll" + STRING_ADDRESSES = ( (0x100DB588, b"November"),