Skip to content

Commit

Permalink
JSON report aggregate tool (#91)
Browse files Browse the repository at this point in the history
* JSON report aggregate tool

* Error message if reports are from different sources

* Expand argparse error messages

* Rename to reccmp-aggregate
  • Loading branch information
disinvite authored Mar 4, 2025
1 parent c51e57b commit 4350c24
Show file tree
Hide file tree
Showing 9 changed files with 560 additions and 106 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ The next steps differ based on what kind of project you have.
All scripts will become available to use in your terminal with the `reccmp-` prefix. Note that these scripts need to be executed in the directory where `reccmp-build.yml` is located.
* [`aggregate`](/reccmp/tools/aggregate.py): Combines JSON reports into a single file.
* Aggregate using highest accuracy score: `reccmp-aggregate --samples ./sample0.json ./sample1.json ./sample2.json --output ./combined.json`
* Diff two saved reports: `reccmp-aggregate --diff ./before.json ./after.json`
* Diff against the aggregate: `reccmp-aggregate --samples ./sample0.json ./sample1.json ./sample2.json --diff ./before.json`
* [`decomplint`](/reccmp/tools/decomplint.py): Checks the decompilation annotations (see above)
* e.g. `reccmp-decomplint --module LEGO1 LEGO1`
* [`reccmp`](/reccmp/tools/asmcmp.py): Compares an original binary with a recompiled binary, provided a PDB file. For example:
Expand Down
5 changes: 4 additions & 1 deletion reccmp/assets/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,10 @@
margin-bottom: 0;
}
</style>
<script>var data = {{{data}}};</script>
<script>
var report = {{{report}}};
var data = report["data"];
</script>
<script>{{{reccmp_js}}}</script>
</script>
</head>
Expand Down
3 changes: 1 addition & 2 deletions reccmp/isledecomp/compare/diff.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from difflib import SequenceMatcher
from typing import TypedDict
from typing_extensions import NotRequired
from typing_extensions import NotRequired, TypedDict

CombinedDiffInput = list[tuple[str, str]]

Expand Down
189 changes: 189 additions & 0 deletions reccmp/isledecomp/compare/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
from datetime import datetime
from dataclasses import dataclass, replace
from typing import Literal, Iterable, Iterator
from pydantic import BaseModel, ValidationError
from pydantic_core import from_json
from .diff import CombinedDiffOutput


class ReccmpReportDeserializeError(Exception):
    """The given file is not a serialized reccmp report file.

    Raised by the deserialization code when the input does not validate
    against the JSON report schema."""


class ReccmpReportSameSourceError(Exception):
    """Tried to aggregate reports derived from different source files.

    Raised when the reports being combined do not all carry the same
    original-binary filename."""


@dataclass
class ReccmpComparedEntity:
    # One compared entity as stored in a status report.

    # Address in the original binary, kept as a string. Reports use this
    # same value as the dictionary key for the entity.
    orig_addr: str
    # Display name of the entity.
    name: str
    # Accuracy score. A value of 1.0 is treated as a 100% match when
    # ranking samples.
    accuracy: float
    # Address in the recompiled binary, if known. Cleared when reports are
    # aggregated because it is expected to vary between samples.
    recomp_addr: str | None = None
    # Flags that take part in ranking samples of the same entity.
    is_effective_match: bool = False
    is_stub: bool = False
    # Optional diff payload. Only serialized when explicitly requested.
    diff: CombinedDiffOutput | None = None


class ReccmpStatusReport:
    """In-memory report: a set of compared entities for one original binary."""

    # The filename of the original binary.
    # This is here to avoid comparing reports derived from different files.
    # TODO: in the future, we may want to use the hash instead
    filename: str

    # Creation date of the report file.
    timestamp: datetime

    # Compared entities, keyed by orig addr.
    entities: dict[str, ReccmpComparedEntity]

    def __init__(self, filename: str, timestamp: datetime | None = None) -> None:
        """Create an empty report for `filename`.

        If `timestamp` is not provided, the current time (truncated to whole
        seconds) is used instead."""
        self.filename = filename
        self.entities = {}
        self.timestamp = (
            datetime.now().replace(microsecond=0) if timestamp is None else timestamp
        )


def _get_entity_for_addr(
    samples: Iterable[ReccmpStatusReport], addr: str
) -> Iterator[ReccmpComparedEntity]:
    """Yield the entity stored under `addr` from every report that has one.

    Reports that do not contain the address are skipped. Entities are
    produced in the same order as `samples`."""
    yield from (
        report.entities[addr] for report in samples if addr in report.entities
    )


def _accuracy_sort_key(entity: ReccmpComparedEntity) -> float:
    """Return the ranking value used to pick the best sample of an entity.

    From best to worst:
      * 1000.0 -- 100% accuracy that is not merely an effective match
      * 1.0    -- effective match (whatever its accuracy value)
      * the raw accuracy score
      * -1.0   -- stub, which ranks below every accuracy score"""
    # Stubs lose to everything, so handle them first.
    if entity.is_stub:
        return -1.0

    # Effective matches share one rank, above any partial accuracy.
    if entity.is_effective_match:
        return 1.0

    # A true 100% match outranks an effective match.
    if entity.accuracy == 1.0:
        return 1000.0

    return entity.accuracy


def combine_reports(samples: list[ReccmpStatusReport]) -> ReccmpStatusReport:
    """Combines the sample reports into a single report.

    The current strategy is to use the entity with the highest
    accuracy score from any report. If several samples rank equally for an
    address, the one from the earliest report in `samples` is used.

    The input reports and their entities are left unmodified: the output
    holds copies of the selected entities with `recomp_addr` cleared.

    `samples` must contain at least one report.

    Raises ReccmpReportSameSourceError if the reports do not all have the
    same `filename`."""
    assert len(samples) > 0

    source_filename = samples[0].filename
    if any(sample.filename != source_filename for sample in samples):
        raise ReccmpReportSameSourceError

    output = ReccmpStatusReport(filename=source_filename)

    # Combine every orig addr used in any of the reports.
    orig_addr_set = {addr for sample in samples for addr in sample.entities}

    for addr in sorted(orig_addr_set):
        # Our aggregate accuracy score is the highest from any report.
        # max() returns the first of equally ranked candidates, which matches
        # a stable descending sort followed by taking the first element.
        best = max(_get_entity_for_addr(samples, addr), key=_accuracy_sort_key)

        # Recomp addr will most likely vary between samples, so clear it.
        # Store a copy so the caller's sample report is not modified.
        output.entities[addr] = replace(best, recomp_addr=None)

    return output


#### JSON schemas and conversion functions ####


@dataclass
class JSONEntityVersion1:
    # Version 1 JSON representation of a single compared entity.
    # Field names here are the keys used in the JSON file.

    # Orig addr of the entity.
    address: str
    name: str
    # Accuracy score.
    matching: float
    # Optional fields
    # Recomp addr, if any.
    recomp: str | None = None
    stub: bool = False
    effective: bool = False
    # Diff data; only populated when the serializer is asked to include it.
    diff: CombinedDiffOutput | None = None


class JSONReportVersion1(BaseModel):
    # Version 1 JSON representation of a whole report.

    # Filename of the original binary the report was derived from.
    file: str
    # Schema version marker; only the value 1 validates.
    format: Literal[1]
    # Report timestamp as a number (written from datetime.timestamp()).
    timestamp: float
    # One entry per compared entity.
    data: list[JSONEntityVersion1]


def _serialize_version_1(
    report: ReccmpStatusReport, diff_included: bool = False
) -> JSONReportVersion1:
    """Convert a report into its version 1 JSON schema object.

    The HTML file needs the diff data, but it is omitted from the JSON report,
    so diffs are only copied over when `diff_included` is set."""
    data = []
    for addr, entity in report.entities.items():
        diff = entity.diff if diff_included else None
        data.append(
            JSONEntityVersion1(
                # Prefer the dict key over the redundant value in the entity.
                address=addr,
                name=entity.name,
                matching=entity.accuracy,
                recomp=entity.recomp_addr,
                stub=entity.is_stub,
                effective=entity.is_effective_match,
                diff=diff,
            )
        )

    return JSONReportVersion1(
        file=report.filename,
        format=1,
        timestamp=report.timestamp.timestamp(),
        data=data,
    )


def _deserialize_version_1(obj: JSONReportVersion1) -> ReccmpStatusReport:
    """Build a report from a validated version 1 JSON schema object.

    Entities are keyed by their `address`. If the same address appears more
    than once, the last occurrence wins."""
    created = datetime.fromtimestamp(obj.timestamp)
    report = ReccmpStatusReport(filename=obj.file, timestamp=created)

    # NOTE(review): the `diff` field is not carried over to the entity --
    # presumably intentional since JSON reports omit it by default; confirm.
    report.entities.update(
        (
            item.address,
            ReccmpComparedEntity(
                orig_addr=item.address,
                name=item.name,
                accuracy=item.matching,
                recomp_addr=item.recomp,
                is_stub=item.stub,
                is_effective_match=item.effective,
            ),
        )
        for item in obj.data
    )

    return report


def deserialize_reccmp_report(json_str: str) -> ReccmpStatusReport:
    """Parse the text of a JSON report file into a report object.

    Raises ReccmpReportDeserializeError if the text is not valid JSON, does
    not match the report schema, or holds a timestamp that cannot be
    converted to a datetime. The underlying exception is chained as the
    cause."""
    try:
        obj = JSONReportVersion1.model_validate(from_json(json_str))
        return _deserialize_version_1(obj)
    except (ValidationError, ValueError, OverflowError, OSError) as ex:
        # ValidationError: parsed JSON does not match the schema.
        # ValueError: from_json() could not parse the text as JSON.
        # OverflowError/OSError: datetime.fromtimestamp() rejected the
        # timestamp as out of range for this platform.
        raise ReccmpReportDeserializeError from ex


def serialize_reccmp_report(
    report: ReccmpStatusReport, diff_included: bool = False
) -> str:
    """Create a JSON string for the report so it can be written to a file.

    Side effect: `report.timestamp` is overwritten with the current time
    (truncated to whole seconds) before serializing.

    Fields that still hold their default value are left out of the output."""
    report.timestamp = datetime.now().replace(microsecond=0)

    schema_obj = _serialize_version_1(report, diff_included=diff_included)
    json_text = schema_obj.model_dump_json(exclude_defaults=True)
    return json_text
Loading

0 comments on commit 4350c24

Please sign in to comment.