Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON report aggregate tool #91

Merged
merged 5 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ All scripts will become available to use in your terminal with the `reccmp-` pre

* [`decomplint`](/reccmp/tools/decomplint.py): Checks the decompilation annotations (see above)
* e.g. `reccmp-decomplint --module LEGO1 LEGO1`
* [`denoise`](/reccmp/tools/denoise.py): Combines JSON reports into a single aggregate file.
* Create aggregate report: `reccmp-denoise --samples ./sample0.json ./sample1.json ./sample2.json --output ./combined.json`
* Diff two saved reports: `reccmp-denoise --diff ./before.json ./after.json`
* Diff against the aggregate: `reccmp-denoise --samples ./sample0.json ./sample1.json ./sample2.json --diff ./before.json`
* [`reccmp`](/reccmp/tools/asmcmp.py): Compares an original binary with a recompiled binary, provided a PDB file. For example:
* Display the diff for a single function: `reccmp-reccmp --target LEGO1 --verbose 0x100ae1a0`
* Generate an HTML report: `reccmp-reccmp --target LEGO1 --html output.html`
Expand Down
5 changes: 4 additions & 1 deletion reccmp/assets/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,10 @@
margin-bottom: 0;
}
</style>
<script>var data = {{{data}}};</script>
<script>
var report = {{{report}}};
var data = report["data"];
</script>
<script>{{{reccmp_js}}}</script>
</script>
</head>
Expand Down
3 changes: 1 addition & 2 deletions reccmp/isledecomp/compare/diff.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from difflib import SequenceMatcher
from typing import TypedDict
from typing_extensions import NotRequired
from typing_extensions import NotRequired, TypedDict

CombinedDiffInput = list[tuple[str, str]]

Expand Down
185 changes: 185 additions & 0 deletions reccmp/isledecomp/compare/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
from datetime import datetime
from dataclasses import dataclass, replace
from typing import Literal, Iterable, Iterator
from pydantic import BaseModel, ValidationError
from pydantic_core import from_json
from .diff import CombinedDiffOutput


class ReccmpReportDeserializeError(Exception):
    """Raised when the input could not be read as a serialized reccmp report."""


@dataclass
class ReccmpComparedEntity:
    """Comparison result for a single entity of a report."""

    # Orig addr of the entity. Reports index their entities by this value.
    orig_addr: str
    name: str
    # Match score. A value of exactly 1.0 is ranked as a 100% match
    # when samples are combined.
    accuracy: float
    # Recomp addr of the entity. Set to None in combined reports because
    # it will most likely vary between samples.
    recomp_addr: str | None = None
    # When combining, an effective match ranks below a 100% match but
    # above any other accuracy score.
    is_effective_match: bool = False
    # When combining, stubs rank lower than any accuracy score.
    is_stub: bool = False
    # Diff data. Only serialized on request: the HTML file needs it,
    # but it is omitted from the JSON report.
    diff: CombinedDiffOutput | None = None


class ReccmpStatusReport:
    """Comparison results for one original binary, indexed by orig addr."""

    # Name of the original binary the report was produced from.
    # Stored so that reports derived from different files are not compared.
    # TODO: in the future, we may want to use the hash instead
    filename: str

    # When the report file was created.
    timestamp: datetime

    # Compared entities, with the orig addr as the key.
    entities: dict[str, ReccmpComparedEntity]

    def __init__(self, filename: str, timestamp: datetime | None = None) -> None:
        """Create an empty report for `filename`.
        Without an explicit `timestamp`, the current time is used,
        truncated to whole seconds."""
        self.filename = filename
        self.entities = {}
        self.timestamp = (
            datetime.now().replace(microsecond=0) if timestamp is None else timestamp
        )


def _get_entity_for_addr(
    samples: Iterable[ReccmpStatusReport], addr: str
) -> Iterator[ReccmpComparedEntity]:
    """Yield the entity stored under `addr` from every report that has one.
    Reports without that address are skipped; the order of `samples` is kept."""
    yield from (
        report.entities[addr] for report in samples if addr in report.entities
    )


def _accuracy_sort_key(entity: ReccmpComparedEntity) -> float:
    """Rank an entity sample so the best result sorts highest.
    From lowest to highest: stub (-1.0), plain accuracy score,
    effective match (1.0), 100% match that is not merely effective (1000.0)."""
    if entity.is_stub:
        return -1.0

    # An effective match always ranks as 1.0, whatever its accuracy says.
    if entity.is_effective_match:
        return 1.0

    # A true 100% match must outrank effective matches, hence the large value.
    return 1000.0 if entity.accuracy == 1.0 else entity.accuracy


def combine_reports(samples: list[ReccmpStatusReport]) -> ReccmpStatusReport:
    """Combines the sample reports into a single report.
    The current strategy is to use the entity with the highest
    accuracy score from any report. If several samples rank equally,
    the entity from the earliest sample in the list is used.

    The returned report holds copies of the chosen entities with
    recomp_addr cleared; the entities inside `samples` are not modified.

    `samples` must be non-empty and every sample must refer to the
    same filename."""
    assert len(samples) > 0

    # TODO: hack
    assert all(samples[0].filename == s.filename for s in samples)

    output = ReccmpStatusReport(filename=samples[0].filename)

    # Every orig addr used in any of the reports, in sorted order.
    all_orig_addrs = sorted({addr for sample in samples for addr in sample.entities})

    for addr in all_orig_addrs:
        # Our aggregate accuracy score is the highest from any report.
        # Each addr came from some sample, so there is always a candidate.
        best = max(_get_entity_for_addr(samples, addr), key=_accuracy_sort_key)

        # Recomp addr will most likely vary between samples, so clear it.
        # Work on a copy so the caller's sample keeps its own recomp_addr.
        output.entities[addr] = replace(best, recomp_addr=None)

    return output


#### JSON schemas and conversion functions ####


@dataclass
class JSONEntityVersion1:
    # One entity record in the version 1 JSON report.
    # Field names are the keys used in the serialized file.

    # Orig addr of the entity (the key of ReccmpStatusReport.entities).
    address: str
    name: str
    # Accuracy score of the entity.
    matching: float
    # Optional fields
    # Recomp addr of the entity, if any.
    recomp: str | None = None
    stub: bool = False
    # True for an effective match.
    effective: bool = False
    # Diff data; only populated when serializing with diff_included.
    diff: CombinedDiffOutput | None = None


class JSONReportVersion1(BaseModel):
    # Top-level schema of a version 1 report file.

    # Filename of the original binary (ReccmpStatusReport.filename).
    file: str
    # Format version marker; only the literal value 1 validates.
    format: Literal[1]
    # Report timestamp, written with datetime.timestamp() and read back
    # with datetime.fromtimestamp().
    timestamp: float
    # One record per compared entity.
    data: list[JSONEntityVersion1]


def _serialize_version_1(
    report: ReccmpStatusReport, diff_included: bool = False
) -> JSONReportVersion1:
    """Convert the report into the version 1 JSON model.
    Diff data is attached only when `diff_included` is set: the HTML file
    needs it, but it is omitted from the JSON report."""

    def to_json_entity(
        orig_addr: str, entity: ReccmpComparedEntity
    ) -> JSONEntityVersion1:
        diff = entity.diff if diff_included else None
        return JSONEntityVersion1(
            # The dict key is preferred over the redundant value in the entity.
            address=orig_addr,
            name=entity.name,
            matching=entity.accuracy,
            recomp=entity.recomp_addr,
            stub=entity.is_stub,
            effective=entity.is_effective_match,
            diff=diff,
        )

    records = [
        to_json_entity(orig_addr, entity)
        for orig_addr, entity in report.entities.items()
    ]

    return JSONReportVersion1(
        file=report.filename,
        format=1,
        timestamp=report.timestamp.timestamp(),
        data=records,
    )


def _deserialize_version_1(obj: JSONReportVersion1) -> ReccmpStatusReport:
    """Build a ReccmpStatusReport from a validated version 1 JSON model.
    Entities are keyed by their `address` value. Diff data is carried over
    when the input has it, so a report serialized with diff_included
    survives a round trip."""
    report = ReccmpStatusReport(
        filename=obj.file, timestamp=datetime.fromtimestamp(obj.timestamp)
    )

    for e in obj.data:
        report.entities[e.address] = ReccmpComparedEntity(
            orig_addr=e.address,
            name=e.name,
            accuracy=e.matching,
            recomp_addr=e.recomp,
            is_stub=e.stub,
            is_effective_match=e.effective,
            # None for ordinary JSON reports, which omit the diff.
            diff=e.diff,
        )

    return report


def deserialize_reccmp_report(json_str: str) -> ReccmpStatusReport:
    """Parse a JSON string into a ReccmpStatusReport.

    Raises ReccmpReportDeserializeError if the string is not valid JSON
    or does not match the report schema."""
    try:
        obj = JSONReportVersion1.model_validate(from_json(json_str))
        return _deserialize_version_1(obj)
    except (ValidationError, ValueError) as ex:
        # from_json reports malformed JSON as a plain ValueError, while
        # model_validate reports schema mismatches as ValidationError.
        # Both mean the input is not a report file.
        raise ReccmpReportDeserializeError from ex


def serialize_reccmp_report(
    report: ReccmpStatusReport, diff_included: bool = False
) -> str:
    """Render the report as a JSON string that can be written to a file.
    The report's timestamp is overwritten with the current time (whole
    seconds) before conversion. Fields left at their default value are
    not written."""
    report.timestamp = datetime.now().replace(microsecond=0)
    json_model = _serialize_version_1(report, diff_included=diff_included)
    return json_model.model_dump_json(exclude_defaults=True)
Loading