Skip to content

Commit

Permalink
Allow reads on uninitialized data (#43)
Browse files Browse the repository at this point in the history
  • Loading branch information
disinvite authored Dec 29, 2024
1 parent 4d457a0 commit 573b819
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 82 deletions.
6 changes: 6 additions & 0 deletions reccmp/isledecomp/formats/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ class SectionNotFoundError(KeyError):
class InvalidVirtualAddressError(IndexError):
"""The given virtual address is too high or low
to point to something in the binary file."""


class InvalidVirtualReadError(IndexError):
"""Reading the given number of bytes from the given virtual address
would cause us to read past the end of the section or past the end
of the virtual address space."""
116 changes: 77 additions & 39 deletions reccmp/isledecomp/formats/pe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@
- Debug information: https://www.debuginfo.com/examples/src/DebugDir.cpp
"""

import bisect
import dataclasses
from enum import IntEnum, IntFlag
from functools import cached_property
from pathlib import Path
import struct
from typing import Iterator, Optional

from .exceptions import InvalidVirtualAddressError, SectionNotFoundError
from .exceptions import (
InvalidVirtualAddressError,
InvalidVirtualReadError,
SectionNotFoundError,
)
from .image import Image
from .mz import ImageDosHeader

Expand Down Expand Up @@ -467,7 +470,6 @@ class PEImage(Image):
# FIXME: do these belong to PEImage? Shouldn't the loade apply these to the data?
_relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False)
_relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
_section_vaddr: list[int] = dataclasses.field(default_factory=list, repr=False)
# find_str: bool = dataclasses.field(default=False, repr=False)
imports: list[tuple[int, str]] = dataclasses.field(default_factory=list, repr=False)
exports: list[tuple[str, str, int]] = dataclasses.field(
Expand Down Expand Up @@ -524,9 +526,6 @@ def load(self):
f"reccmp only supports i386 binaries: {self.header.machine}."
)

# bisect does not support key on the GitHub CI version of python
self._section_vaddr = [section.virtual_address for section in self.sections]

self._populate_relocations()
self._populate_imports()
self._populate_thunks()
Expand Down Expand Up @@ -941,16 +940,27 @@ def get_abs_addr(self, section: int, offset: int) -> int:
into an absolute vaddr."""
return self.get_section_offset_by_index(section) + offset

def get_relative_addr(self, addr: int) -> tuple[int, int]:
"""Convert an absolute address back into a (section, offset) pair."""
i = bisect.bisect_right(self._section_vaddr, addr) - 1
i = max(0, i)
@cached_property
def vaddr_ranges(self) -> list[tuple[int, int]]:
"""Return the start and end virtual address of each section in the file."""
return list(
(
self.imagebase + section.virtual_address,
self.imagebase
+ section.virtual_address
+ max(section.size_of_raw_data, section.virtual_size),
)
for section in self.section_headers
)

section = self.sections[i]
if section.contains_vaddr(addr):
return i + 1, addr - section.virtual_address
def get_relative_addr(self, addr: int) -> tuple[int, int]:
"""Convert an absolute address back into a (section_id, offset) pair.
n.b. section_id is 1-based to match PDB output."""
for i, (start, end) in enumerate(self.vaddr_ranges):
if start <= addr < end:
return i + 1, addr - start

raise InvalidVirtualAddressError(f"{self.filepath} : 0x{addr:08x} {section=}")
raise InvalidVirtualAddressError(f"{self.filepath} : 0x{addr:x}")

def is_valid_section(self, section_id: int) -> bool:
"""The PDB will refer to sections that are not listed in the headers
Expand All @@ -962,36 +972,64 @@ def is_valid_section(self, section_id: int) -> bool:
return False

def is_valid_vaddr(self, vaddr: int) -> bool:
"""Does this virtual address point to anything in the exe?"""
try:
(_, __) = self.get_relative_addr(vaddr)
except InvalidVirtualAddressError:
return False
"""Is this virtual address part of the image when loaded?"""
# Use max here just in case the section headers are not ordered by v.addr
(_, last_vaddr) = max(self.vaddr_ranges, key=lambda s: s[1])
return self.imagebase <= vaddr < last_vaddr

return True
@cached_property
def uninitialized_ranges(self) -> list[tuple[int, int]]:
"""Return a start and end range of each region in the file that holds uninitialized data.
This can be an entire section (.bss) or the gap between the end of the physical data
and the virtual size. These ranges do not correspond to section ids."""
output = []
for section in self.section_headers:
if (
section.characteristics
& PESectionFlags.IMAGE_SCN_CNT_UNINITIALIZED_DATA
):
output.append(
(
self.imagebase + section.virtual_address,
self.imagebase + section.virtual_address + section.virtual_size,
)
)
elif section.virtual_size > section.size_of_raw_data:
# Should also cover the case where size_of_raw_data = 0.
output.append(
(
self.imagebase
+ section.virtual_address
+ section.size_of_raw_data,
self.imagebase + section.virtual_address + section.virtual_size,
)
)

def read_string(self, offset: int, chunk_size: int = 1000) -> Optional[bytes]:
"""Read until we find a zero byte."""
b = self.read(offset, chunk_size)
if b is None:
return None
return output

try:
return b[: b.index(b"\x00")]
except ValueError:
# No terminator found, just return what we have
return b
def addr_is_uninitialized(self, vaddr: int) -> bool:
return any(start <= vaddr < end for start, end in self.uninitialized_ranges)

def read(self, vaddr: int, size: int) -> Optional[bytes]:
"""Read (at most) the given number of bytes at the given virtual address.
If we return None, the given address points to uninitialized data."""
def read_string(self, vaddr: int, chunk_size: int = 1000) -> bytes:
"""Read up to chunk_size or until we find a zero byte."""
(section_id, offset) = self.get_relative_addr(vaddr)
section = self.sections[section_id - 1]
view = section.view[offset : offset + chunk_size]
# Don't call read() here because we might not get the entire chunk size.
# Use whatever we can get if we are at the end of the section.
return view.tobytes().partition(b"\x00")[0]

if section.addr_is_uninitialized(vaddr):
return None
def read(self, vaddr: int, size: int) -> bytes:
(section_id, offset) = self.get_relative_addr(vaddr)
section = self.sections[section_id - 1]

# If we try to read off the end of the section
if size < 0 or (offset + size) > section.extent:
raise InvalidVirtualReadError(
f"{self.filepath} : Cannot read {size} bytes from 0x{vaddr:x}"
)

# Clamp the read within the extent of the current section.
# Reading off the end will most likely misrepresent the virtual addressing.
_size = min(size, section.size_of_raw_data - offset)
return bytes(section.view[offset : offset + _size])
# Pad with zero bytes if reading uninitialized data.
# Assumes the section memoryview is cropped to the initialized bytes
view = section.view[offset : offset + size]
return bytes(view) + b"\x00" * (size - len(view))
80 changes: 40 additions & 40 deletions reccmp/tools/datacmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def create_comparison_item(


def do_the_comparison(target: RecCmpBuiltTarget) -> Iterable[ComparisonItem]:
# pylint: disable=too-many-locals
"""Run through each variable in our compare DB, then do the comparison
according to the variable's type. Emit the result."""
origfile = detect_image(filepath=target.original_path)
Expand Down Expand Up @@ -193,48 +194,47 @@ def do_the_comparison(target: RecCmpBuiltTarget) -> Iterable[ComparisonItem]:
orig_raw = origfile.read(var.orig_addr, data_size)
recomp_raw = recompfile.read(var.recomp_addr, data_size)

# The IMAGE_SECTION_HEADER defines the SizeOfRawData and VirtualSize for the section.
# If VirtualSize > SizeOfRawData, the section is comprised of the initialized data
# corresponding to bytes in the file, and the rest is padded with zeroes when
# Windows loads the image.
# The linker might place variables initialized to zero on the threshold between
# physical data and the virtual (uninitialized) data.
# If this happens (i.e. we get an incomplete read) we just do the same padding
# to prepare for the comparison.
if orig_raw is not None and len(orig_raw) < data_size:
orig_raw = orig_raw.ljust(data_size, b"\x00")

if recomp_raw is not None and len(recomp_raw) < data_size:
recomp_raw = recomp_raw.ljust(data_size, b"\x00")

# If one or both variables are entirely uninitialized
if orig_raw is None or recomp_raw is None:
# If both variables are uninitialized, we consider them equal.
match = orig_raw is None and recomp_raw is None

# We can match a variable initialized to all zeroes with
# an uninitialized variable, but this may or may not actually
# be correct, so we flag it for the user.
uninit_force_match = not match and (
(orig_raw is None and all(b == 0 for b in recomp_raw))
or (recomp_raw is None and all(b == 0 for b in orig_raw))
orig_is_null = all(b == 0 for b in orig_raw)
recomp_is_null = all(b == 0 for b in recomp_raw)

# If all bytes are zero on either read, it's possible that the variable
# is uninitialized on one or both sides. Special handling for that situation:
if orig_is_null or recomp_is_null:
# Check the last address of the variable in each file to see if any of it is
# in the uninitialized area of the section.
orig_in_bss = origfile.addr_is_uninitialized(var.orig_addr + data_size - 1)
recomp_in_bss = recompfile.addr_is_uninitialized(
var.recomp_addr + data_size - 1
)

orig_value = "(uninitialized)" if orig_raw is None else "(initialized)"
recomp_value = "(uninitialized)" if recomp_raw is None else "(initialized)"
yield create_comparison_item(
var,
compared=[
ComparedOffset(
offset=0,
name=None,
match=match,
values=(orig_value, recomp_value),
)
],
raw_only=uninit_force_match,
)
continue
if orig_in_bss or recomp_in_bss:
# We record a match if both items are null and:
# 1. Both values are entirely initialized to zero
# 2. All or part of both values are in the uninitialized area
match = (
orig_is_null and recomp_is_null and (orig_in_bss == recomp_in_bss)
)

# However... you may not have full control over where the variable sits in the
# section, so we will only warn (and not log a diff) if the variable is
# initialized in one file but not the other.
uninit_force_match = orig_is_null and recomp_is_null

orig_value = "(uninitialized)" if orig_in_bss else "(initialized)"
recomp_value = "(uninitialized)" if recomp_in_bss else "(initialized)"
yield create_comparison_item(
var,
compared=[
ComparedOffset(
offset=0,
name=None,
match=match,
values=(orig_value, recomp_value),
)
],
raw_only=uninit_force_match,
)
continue

if not is_type_aware:
# If there is no specific type information available
Expand Down
42 changes: 39 additions & 3 deletions tests/test_islebin.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from reccmp.isledecomp.formats.exceptions import (
SectionNotFoundError,
InvalidVirtualAddressError,
InvalidVirtualReadError,
)


Expand Down Expand Up @@ -65,12 +66,47 @@ def test_unusual_reads(binfile: PEImage):
with pytest.raises(InvalidVirtualAddressError):
binfile.read(0xFFFFFFFF, 1)

# Uninitialized part of .data
assert binfile.read(0x1010A600, 4) is None
# Initialized bytes for .data end at 0x10102c00. Read from the uninitialized section.
# Older versions of reccmp would return None for uninitialized data.
# We now return zeroes to emulate the behavior of the real program.
assert binfile.read(0x1010A600, 4) == b"\x00\x00\x00\x00"

# Past the end of virtual size in .text
# Read the last 16 initialized bytes of .data
assert len(binfile.read(0x10102BF0, 16)) == 16

# Keep reading into the uninitialized section without an exception.
assert len(binfile.read(0x10102BF0, 32)) == 32

# Read 8 initialized and 8 uninitialized bytes. Should pad with zeroes.
assert binfile.read(0x10102BF8, 16) == (b"\x00" * 16)

# Unlike .data, physical size for .text is larger than virtual size.
# This means the padding bytes are stored in the file.
# Read the unused but initialized bytes.
assert binfile.read(0x100D3A70, 4) == b"\x00\x00\x00\x00"

# .text ends at 0x100d3c00. Even though .rdata does not begin until 0x100d4000,
# we still should not read past the end of virtual data.
with pytest.raises(InvalidVirtualReadError):
binfile.read(0x100D3BFF, 10)

# Read past the final virtual address in .reloc
with pytest.raises(InvalidVirtualReadError):
binfile.read(0x10120DF0, 32)

# Reading zero bytes is okay
assert binfile.read(0x100DB588, 0) == b""

# Cannot read with negative size
with pytest.raises(InvalidVirtualReadError):
binfile.read(0x100DB588, -1)

# There are fewer than 1000 bytes in .reloc at the location of this string.
# Chunk size is chosen arbitrarily for string reads. The reason is that we need
# to read until we hit a zero byte, so we don't use the memoryview directly.
# This should not fail.
assert binfile.read_string(0x1010BFFC, 1000) == b"d3drm.dll"


STRING_ADDRESSES = (
(0x100DB588, b"November"),
Expand Down

0 comments on commit 573b819

Please sign in to comment.