Allow reads on uninitialized data (#43)

isledecomp · Dec 29, 2024 · 573b819 · 573b819
1 parent 4d457a0
commit 573b819
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 82 deletions.
diff --git a/reccmp/isledecomp/formats/exceptions.py b/reccmp/isledecomp/formats/exceptions.py
@@ -5,3 +5,9 @@ class SectionNotFoundError(KeyError):
 class InvalidVirtualAddressError(IndexError):
     """The given virtual address is too high or low
     to point to something in the binary file."""
+
+
+class InvalidVirtualReadError(IndexError):
+    """Reading the given number of bytes from the given virtual address
+    would cause us to read past the end of the section or past the end
+    of the virtual address space."""
diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py
@@ -5,15 +5,18 @@
 - Debug information: https://www.debuginfo.com/examples/src/DebugDir.cpp
 """
 
-import bisect
 import dataclasses
 from enum import IntEnum, IntFlag
 from functools import cached_property
 from pathlib import Path
 import struct
 from typing import Iterator, Optional
 
-from .exceptions import InvalidVirtualAddressError, SectionNotFoundError
+from .exceptions import (
+    InvalidVirtualAddressError,
+    InvalidVirtualReadError,
+    SectionNotFoundError,
+)
 from .image import Image
 from .mz import ImageDosHeader
 
@@ -467,7 +470,6 @@ class PEImage(Image):
     # FIXME: do these belong to PEImage? Shouldn't the loade apply these to the data?
     _relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False)
     _relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
-    _section_vaddr: list[int] = dataclasses.field(default_factory=list, repr=False)
     # find_str: bool = dataclasses.field(default=False, repr=False)
     imports: list[tuple[int, str]] = dataclasses.field(default_factory=list, repr=False)
     exports: list[tuple[str, str, int]] = dataclasses.field(
@@ -524,9 +526,6 @@ def load(self):
                 f"reccmp only supports i386 binaries: {self.header.machine}."
             )
 
-        # bisect does not support key on the GitHub CI version of python
-        self._section_vaddr = [section.virtual_address for section in self.sections]
-
         self._populate_relocations()
         self._populate_imports()
         self._populate_thunks()
@@ -941,16 +940,27 @@ def get_abs_addr(self, section: int, offset: int) -> int:
         into an absolute vaddr."""
         return self.get_section_offset_by_index(section) + offset
 
-    def get_relative_addr(self, addr: int) -> tuple[int, int]:
-        """Convert an absolute address back into a (section, offset) pair."""
-        i = bisect.bisect_right(self._section_vaddr, addr) - 1
-        i = max(0, i)
+    @cached_property
+    def vaddr_ranges(self) -> list[tuple[int, int]]:
+        """Return the start and end virtual address of each section in the file."""
+        return list(
+            (
+                self.imagebase + section.virtual_address,
+                self.imagebase
+                + section.virtual_address
+                + max(section.size_of_raw_data, section.virtual_size),
+            )
+            for section in self.section_headers
+        )
 
-        section = self.sections[i]
-        if section.contains_vaddr(addr):
-            return i + 1, addr - section.virtual_address
+    def get_relative_addr(self, addr: int) -> tuple[int, int]:
+        """Convert an absolute address back into a (section_id, offset) pair.
+        n.b. section_id is 1-based to match PDB output."""
+        for i, (start, end) in enumerate(self.vaddr_ranges):
+            if start <= addr < end:
+                return i + 1, addr - start
 
-        raise InvalidVirtualAddressError(f"{self.filepath} : 0x{addr:08x} {section=}")
+        raise InvalidVirtualAddressError(f"{self.filepath} : 0x{addr:x}")
 
     def is_valid_section(self, section_id: int) -> bool:
         """The PDB will refer to sections that are not listed in the headers
@@ -962,36 +972,64 @@ def is_valid_section(self, section_id: int) -> bool:
             return False
 
     def is_valid_vaddr(self, vaddr: int) -> bool:
-        """Does this virtual address point to anything in the exe?"""
-        try:
-            (_, __) = self.get_relative_addr(vaddr)
-        except InvalidVirtualAddressError:
-            return False
+        """Is this virtual address part of the image when loaded?"""
+        # Use max here just in case the section headers are not ordered by v.addr
+        (_, last_vaddr) = max(self.vaddr_ranges, key=lambda s: s[1])
+        return self.imagebase <= vaddr < last_vaddr
 
-        return True
+    @cached_property
+    def uninitialized_ranges(self) -> list[tuple[int, int]]:
+        """Return a start and end range of each region in the file that holds uninitialized data.
+        This can be an entire section (.bss) or the gap between the end of the physical data
+        and the virtual size. These ranges do not correspond to section ids."""
+        output = []
+        for section in self.section_headers:
+            if (
+                section.characteristics
+                & PESectionFlags.IMAGE_SCN_CNT_UNINITIALIZED_DATA
+            ):
+                output.append(
+                    (
+                        self.imagebase + section.virtual_address,
+                        self.imagebase + section.virtual_address + section.virtual_size,
+                    )
+                )
+            elif section.virtual_size > section.size_of_raw_data:
+                # Should also cover the case where size_of_raw_data = 0.
+                output.append(
+                    (
+                        self.imagebase
+                        + section.virtual_address
+                        + section.size_of_raw_data,
+                        self.imagebase + section.virtual_address + section.virtual_size,
+                    )
+                )
 
-    def read_string(self, offset: int, chunk_size: int = 1000) -> Optional[bytes]:
-        """Read until we find a zero byte."""
-        b = self.read(offset, chunk_size)
-        if b is None:
-            return None
+        return output
 
-        try:
-            return b[: b.index(b"\x00")]
-        except ValueError:
-            # No terminator found, just return what we have
-            return b
+    def addr_is_uninitialized(self, vaddr: int) -> bool:
+        return any(start <= vaddr < end for start, end in self.uninitialized_ranges)
 
-    def read(self, vaddr: int, size: int) -> Optional[bytes]:
-        """Read (at most) the given number of bytes at the given virtual address.
-        If we return None, the given address points to uninitialized data."""
+    def read_string(self, vaddr: int, chunk_size: int = 1000) -> bytes:
+        """Read up to chunk_size or until we find a zero byte."""
         (section_id, offset) = self.get_relative_addr(vaddr)
         section = self.sections[section_id - 1]
+        view = section.view[offset : offset + chunk_size]
+        # Don't call read() here because we might not get the entire chunk size.
+        # Use whatever we can get if we are at the end of the section.
+        return view.tobytes().partition(b"\x00")[0]
 
-        if section.addr_is_uninitialized(vaddr):
-            return None
+    def read(self, vaddr: int, size: int) -> bytes:
+        (section_id, offset) = self.get_relative_addr(vaddr)
+        section = self.sections[section_id - 1]
+
+        # If we try to read off the end of the section
+        if size < 0 or (offset + size) > section.extent:
+            raise InvalidVirtualReadError(
+                f"{self.filepath} : Cannot read {size} bytes from 0x{vaddr:x}"
+            )
 
-        # Clamp the read within the extent of the current section.
-        # Reading off the end will most likely misrepresent the virtual addressing.
-        _size = min(size, section.size_of_raw_data - offset)
-        return bytes(section.view[offset : offset + _size])
+        # Pad with zero bytes if reading uninitialized data.
+        # Assumes the section memoryview is cropped to the initialized bytes
+        view = section.view[offset : offset + size]
+        return bytes(view) + b"\x00" * (size - len(view))
diff --git a/reccmp/tools/datacmp.py b/reccmp/tools/datacmp.py
@@ -142,6 +142,7 @@ def create_comparison_item(
 
 
 def do_the_comparison(target: RecCmpBuiltTarget) -> Iterable[ComparisonItem]:
+    # pylint: disable=too-many-locals
     """Run through each variable in our compare DB, then do the comparison
     according to the variable's type. Emit the result."""
     origfile = detect_image(filepath=target.original_path)
@@ -193,48 +194,47 @@ def do_the_comparison(target: RecCmpBuiltTarget) -> Iterable[ComparisonItem]:
         orig_raw = origfile.read(var.orig_addr, data_size)
         recomp_raw = recompfile.read(var.recomp_addr, data_size)
 
-        # The IMAGE_SECTION_HEADER defines the SizeOfRawData and VirtualSize for the section.
-        # If VirtualSize > SizeOfRawData, the section is comprised of the initialized data
-        # corresponding to bytes in the file, and the rest is padded with zeroes when
-        # Windows loads the image.
-        # The linker might place variables initialized to zero on the threshold between
-        # physical data and the virtual (uninitialized) data.
-        # If this happens (i.e. we get an incomplete read) we just do the same padding
-        # to prepare for the comparison.
-        if orig_raw is not None and len(orig_raw) < data_size:
-            orig_raw = orig_raw.ljust(data_size, b"\x00")
-
-        if recomp_raw is not None and len(recomp_raw) < data_size:
-            recomp_raw = recomp_raw.ljust(data_size, b"\x00")
-
-        # If one or both variables are entirely uninitialized
-        if orig_raw is None or recomp_raw is None:
-            # If both variables are uninitialized, we consider them equal.
-            match = orig_raw is None and recomp_raw is None
-
-            # We can match a variable initialized to all zeroes with
-            # an uninitialized variable, but this may or may not actually
-            # be correct, so we flag it for the user.
-            uninit_force_match = not match and (
-                (orig_raw is None and all(b == 0 for b in recomp_raw))
-                or (recomp_raw is None and all(b == 0 for b in orig_raw))
+        orig_is_null = all(b == 0 for b in orig_raw)
+        recomp_is_null = all(b == 0 for b in recomp_raw)
+
+        # If all bytes are zero on either read, it's possible that the variable
+        # is uninitialized on one or both sides. Special handling for that situation:
+        if orig_is_null or recomp_is_null:
+            # Check the last address of the variable in each file to see if any of it is
+            # in the uninitialized area of the section.
+            orig_in_bss = origfile.addr_is_uninitialized(var.orig_addr + data_size - 1)
+            recomp_in_bss = recompfile.addr_is_uninitialized(
+                var.recomp_addr + data_size - 1
             )
 
-            orig_value = "(uninitialized)" if orig_raw is None else "(initialized)"
-            recomp_value = "(uninitialized)" if recomp_raw is None else "(initialized)"
-            yield create_comparison_item(
-                var,
-                compared=[
-                    ComparedOffset(
-                        offset=0,
-                        name=None,
-                        match=match,
-                        values=(orig_value, recomp_value),
-                    )
-                ],
-                raw_only=uninit_force_match,
-            )
-            continue
+            if orig_in_bss or recomp_in_bss:
+                # We record a match if both items are null and:
+                # 1. Both values are entirely initialized to zero
+                # 2. All or part of both values are in the uninitialized area
+                match = (
+                    orig_is_null and recomp_is_null and (orig_in_bss == recomp_in_bss)
+                )
+
+                # However... you may not have full control over where the variable sits in the
+                # section, so we will only warn (and not log a diff) if the variable is
+                # initialized in one file but not the other.
+                uninit_force_match = orig_is_null and recomp_is_null
+
+                orig_value = "(uninitialized)" if orig_in_bss else "(initialized)"
+                recomp_value = "(uninitialized)" if recomp_in_bss else "(initialized)"
+                yield create_comparison_item(
+                    var,
+                    compared=[
+                        ComparedOffset(
+                            offset=0,
+                            name=None,
+                            match=match,
+                            values=(orig_value, recomp_value),
+                        )
+                    ],
+                    raw_only=uninit_force_match,
+                )
+                continue
 
         if not is_type_aware:
             # If there is no specific type information available

diff --git a/tests/test_islebin.py b/tests/test_islebin.py
@@ -9,6 +9,7 @@
 from reccmp.isledecomp.formats.exceptions import (
     SectionNotFoundError,
     InvalidVirtualAddressError,
+    InvalidVirtualReadError,
 )
 
 
@@ -65,12 +66,47 @@ def test_unusual_reads(binfile: PEImage):
     with pytest.raises(InvalidVirtualAddressError):
         binfile.read(0xFFFFFFFF, 1)
 
-    # Uninitialized part of .data
-    assert binfile.read(0x1010A600, 4) is None
+    # Initialized bytes for .data end at 0x10102c00. Read from the uninitialized section.
+    # Older versions of reccmp would return None for uninitialized data.
+    # We now return zeroes to emulate the behavior of the real program.
+    assert binfile.read(0x1010A600, 4) == b"\x00\x00\x00\x00"
 
-    # Past the end of virtual size in .text
+    # Read the last 16 initialized bytes of .data
+    assert len(binfile.read(0x10102BF0, 16)) == 16
+
+    # Keep reading into the uninitialized section without an exception.
+    assert len(binfile.read(0x10102BF0, 32)) == 32
+
+    # Read 8 initialized and 8 uninitialized bytes. Should pad with zeroes.
+    assert binfile.read(0x10102BF8, 16) == (b"\x00" * 16)
+
+    # Unlike .data, physical size for .text is larger than virtual size.
+    # This means the padding bytes are stored in the file.
+    # Read the unused but initialized bytes.
     assert binfile.read(0x100D3A70, 4) == b"\x00\x00\x00\x00"
 
+    # .text ends at 0x100d3c00. Even though .rdata does not begin until 0x100d4000,
+    # we still should not read past the end of virtual data.
+    with pytest.raises(InvalidVirtualReadError):
+        binfile.read(0x100D3BFF, 10)
+
+    # Read past the final virtual address in .reloc
+    with pytest.raises(InvalidVirtualReadError):
+        binfile.read(0x10120DF0, 32)
+
+    # Reading zero bytes is okay
+    assert binfile.read(0x100DB588, 0) == b""
+
+    # Cannot read with negative size
+    with pytest.raises(InvalidVirtualReadError):
+        binfile.read(0x100DB588, -1)
+
+    # There are fewer than 1000 bytes in .reloc at the location of this string.
+    # Chunk size is chosen arbitrarily for string reads. The reason is that we need
+    # to read until we hit a zero byte, so we don't use the memoryview directly.
+    # This should not fail.
+    assert binfile.read_string(0x1010BFFC, 1000) == b"d3drm.dll"
+
 
 STRING_ADDRESSES = (
     (0x100DB588, b"November"),