Skip to content

Commit f40a520

Browse files
lobsterkatieandrewshie-sentry
authored andcommitted
fix(seer grouping): Store events with unusable Seer matches in Seer (#89318)
When we send an event to Seer during ingest, if Seer doesn't find a match, it stores the event's data in its database, so that future events can match with it. Conversely, if Seer does find a match, it doesn't store the new event's data, under the presumption that any future events which would match it would also match the event it matched.* Now that we're sending events with hybrid fingerprints to Seer, though, that logic breaks down, since an event being a stacktrace distance match doesn't guarantee that it's an overall match, because either or both could have a hybrid fingerprint, and the non-default parts of those potential hybrid fingerprints might or might not match. In other words, we can no longer assume that the event already in Seer and the new event are equivalent in terms of matching to future events. If neither the current event nor the existing event in Seer has a hybrid fingerprint, or if they both do _and_ the non-default parts of their fingerprints match, the assumption holds. But if neither of those is the case, future events could match one but not the other, meaning both need to be represented in the Seer database. This PR thus adds a second request to Seer to store the new event in the case where none of the results returned by Seer are an overall match. In order to force the event to be stored, the request sets the desired number of matches to 0, so that Seer sees the event as one with no matches. It also turns off re-ranking, in order to prevent Seer from pulling an initial list of matches to whittle down, since we know we're not going to use any of them anyway. *In terms of stacktrace distance, this is not perfectly theoretically true - something just barely close enough to the new event to count might be just slightly too far from the existing event - but in practice those corner cases are rare enough that we don't worry about them.
1 parent bcb51c6 commit f40a520

File tree

3 files changed

+102
-1
lines changed

3 files changed

+102
-1
lines changed

src/sentry/grouping/ingest/seer.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,30 @@ def get_seer_similar_issues(
323323
seer_match_status = "match_found"
324324
break
325325

326+
# If Seer sent back matches, that means it didn't store the incoming event's data in its
327+
# database. But if we then weren't able to use any of the matches Seer sent back, we do actually
328+
# want a Seer record to be created, so that future events with this fingerprint have something
329+
# with which to match.
330+
if seer_match_status == "no_matches_usable":
331+
request_data = {
332+
**request_data,
333+
"referrer": "ingest_follow_up",
334+
# By asking Seer to find zero matches, we can trick it into thinking there aren't
335+
# any, thereby forcing it to create the record
336+
"k": 0,
337+
# Turn off re-ranking to speed up the process of finding nothing
338+
"use_reranking": False,
339+
}
340+
341+
# TODO: Temporary log to prove things are working as they should. This should come in a pair
342+
# with the `get_similarity_data_from_seer.ingest_follow_up` log in `similar_issues.py`,
343+
# which should show that no matches are returned.
344+
logger.info("get_seer_similar_issues.follow_up_seer_request", extra={"hash": event_hash})
345+
346+
# We only want this for the side effect, and we know it'll return no matches, so we don't
347+
# bother to capture the return value.
348+
get_similarity_data_from_seer(request_data)
349+
326350
is_hybrid_fingerprint_case = (
327351
event_has_hybrid_fingerprint
328352
# This means we had to reject at least one match because it was a hybrid even though the

src/sentry/seer/similarity/similar_issues.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,17 @@ def get_similarity_data_from_seer(
134134
)
135135
return []
136136

137+
# TODO: Temporary log to prove things are working as they should. This should come in a pair
138+
# with the `get_seer_similar_issues.follow_up_seer_request` log in `seer.py`.
139+
if referrer == "ingest_follow_up":
140+
logger.info(
141+
"get_similarity_data_from_seer.ingest_follow_up",
142+
extra={
143+
"hash": request_hash,
144+
"response_data": response_data, # Should always be an empty list
145+
},
146+
)
147+
137148
if not response_data:
138149
metrics.incr(
139150
"seer.similar_issues_request",

tests/sentry/grouping/seer_similarity/test_get_seer_similar_issues.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from typing import Any
2-
from unittest.mock import MagicMock, patch
2+
from unittest.mock import MagicMock, call, patch
33

44
from sentry import options
55
from sentry.eventstore.models import Event
@@ -112,6 +112,72 @@ def test_sends_expected_data_to_seer(self, mock_get_similarity_data: MagicMock)
112112
{"hybrid_fingerprint": False},
113113
)
114114

115+
@patch("sentry.grouping.ingest.seer.metrics.incr")
116+
def test_sends_second_seer_request_when_seer_matches_are_unusable(
117+
self, mock_incr: MagicMock
118+
) -> None:
119+
120+
existing_event = save_new_event(
121+
{"message": "Dogs are great!", "fingerprint": ["{{ default }}", "maisey"]},
122+
self.project,
123+
)
124+
assert existing_event.group_id
125+
126+
new_event, new_variants, new_grouphash, new_stacktrace_string = create_new_event(
127+
self.project
128+
)
129+
130+
seer_result_data = [
131+
SeerSimilarIssueData(
132+
parent_hash=existing_event.get_primary_hash(),
133+
parent_group_id=existing_event.group_id,
134+
stacktrace_distance=0.01,
135+
should_group=True,
136+
)
137+
]
138+
139+
with patch(
140+
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
141+
return_value=seer_result_data,
142+
) as mock_get_similarity_data:
143+
get_seer_similar_issues(new_event, new_grouphash, new_variants)
144+
145+
assert_metrics_call(
146+
mock_incr, "get_seer_similar_issues", "no_matches_usable", {"is_hybrid": True}
147+
)
148+
149+
base_request_params = {
150+
"event_id": new_event.event_id,
151+
"hash": new_event.get_primary_hash(),
152+
"project_id": self.project.id,
153+
"stacktrace": new_stacktrace_string,
154+
"exception_type": "FailedToFetchError",
155+
}
156+
157+
assert mock_get_similarity_data.call_count == 2
158+
assert mock_get_similarity_data.mock_calls == [
159+
# Initial call to Seer
160+
call(
161+
{
162+
**base_request_params,
163+
"k": options.get("seer.similarity.ingest.num_matches_to_request"),
164+
"referrer": "ingest",
165+
"use_reranking": True,
166+
},
167+
{"hybrid_fingerprint": False},
168+
),
169+
# Second call to store the event's data since the match that came back from Seer
170+
# wasn't usable
171+
call(
172+
{
173+
**base_request_params,
174+
"k": 0,
175+
"referrer": "ingest_follow_up",
176+
"use_reranking": False,
177+
}
178+
),
179+
]
180+
115181

116182
class ParentGroupFoundTest(TestCase):
117183
@patch("sentry.grouping.ingest.seer.metrics.distribution")

0 commit comments

Comments
 (0)