Skip to content

Commit 848387b

Browse files
feat(grouping): Improve uniq_id token identification in parameterization logic (#67672)
Also add support for hostname replacements and corresponding tests.

Co-authored-by: getsantry[bot] <66042841+getsantry[bot]@users.noreply.github.com>
1 parent 645e8ae commit 848387b

File tree

7 files changed

+56
-16
lines changed

7 files changed

+56
-16
lines changed

src/sentry/grouping/strategies/message.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@
3131
(?P<url>
3232
\b(wss?|https?|ftp)://[^\s/$.?#].[^\s]*
3333
) |
34+
(?P<hostname> # Top 100 TLDs. The complete list is 1000s long.
35+
\b
36+
([a-zA-Z0-9\-]{1,63}\.)+?
37+
(
38+
(COM|NET|ORG|JP|DE|UK|FR|BR|IT|RU|ES|ME|GOV|PL|CA|AU|CN|CO|IN|NL|EDU|INFO|EU|CH|ID|AT|KR|CZ|MX|BE|TV|SE|TR|TW|AL|UA|IR|VN|CL|SK|LY|CC|TO|NO|FI|US|PT|DK|AR|HU|TK|GR|IL|NEWS|RO|MY|BIZ|IE|ZA|NZ|SG|EE|TH|IO|XYZ|PE|BG|HK|RS|LT|LINK|PH|CLUB|SI|SITE|MOBI|BY|CAT|WIKI|LA|GA|XXX|CF|HR|NG|JOBS|ONLINE|KZ|UG|GQ|AE|IS|LV|PRO|FM|TIPS|MS|SA|APP)|
39+
(com|net|org|jp|de|uk|fr|br|it|ru|es|me|gov|pl|ca|au|cn|co|in|nl|edu|info|eu|ch|id|at|kr|cz|mx|be|tv|se|tr|tw|al|ua|ir|vn|cl|sk|ly|cc|to|no|fi|us|pt|dk|ar|hu|tk|gr|il|news|ro|my|biz|ie|za|nz|sg|ee|th|io|xyz|pe|bg|hk|rs|lt|link|ph|club|si|site|mobi|by|cat|wiki|la|ga|xxx|cf|hr|ng|jobs|online|kz|ug|gq|ae|is|lv|pro|fm|tips|ms|sa|app)
40+
)
41+
\b
42+
) |
3443
(?P<ip>
3544
(
3645
([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|
@@ -173,11 +182,12 @@ def num_tokens_from_string(token_str: str) -> int:
173182
4 # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
174183
)
175184
UNIQ_ID_TOKEN_LENGTH_RATIO_DEFAULT = 0.5
176-
UNIQ_ID_TOKEN_LENGTH_LONG = 8
185+
UNIQ_ID_TOKEN_LENGTH_LONG = 10
177186
UNIQ_ID_TOKEN_LENGTH_RATIO_LONG = 0.4
178187

179188

180189
def is_probably_uniq_id(token_str: str) -> bool:
190+
token_str = token_str.strip("\"'[]{}():;")
181191
if len(token_str) < UNIQ_ID_TOKEN_LENGTH_MINIMUM:
182192
return False
183193
if token_str[0] == "<" and token_str[-1] == ">": # Don't replace already-parameterized tokens

tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/csp_style_src_elem.pysnap

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
---
2-
created: '2020-02-26T19:54:41.012465Z'
3-
creator: sentry
42
source: tests/sentry/grouping/test_variants.py
53
---
64
default:
@@ -14,4 +12,4 @@ default:
1412
uri*
1513
"use.fontawesome.com"
1614
message (csp takes precedence)
17-
"Blocked 'style' from 'use.fontawesome.com'"
15+
"Blocked 'style' from '<hostname>'"

tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/newstyle@2019_04_17/csp_style_src_elem.pysnap

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
---
2-
created: '2020-02-26T19:54:33.710918Z'
3-
creator: sentry
42
source: tests/sentry/grouping/test_variants.py
53
---
64
default:
@@ -14,4 +12,4 @@ default:
1412
uri*
1513
"use.fontawesome.com"
1614
message (csp takes precedence)
17-
"Blocked 'style' from 'use.fontawesome.com'"
15+
"Blocked 'style' from '<hostname>'"

tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/newstyle@2019_05_08/csp_style_src_elem.pysnap

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
---
2-
created: '2020-02-26T19:54:37.543865Z'
3-
creator: sentry
42
source: tests/sentry/grouping/test_variants.py
53
---
64
default:
@@ -14,4 +12,4 @@ default:
1412
uri*
1513
"use.fontawesome.com"
1614
message (csp takes precedence)
17-
"Blocked 'style' from 'use.fontawesome.com'"
15+
"Blocked 'style' from '<hostname>'"

tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/newstyle@2019_10_29/csp_style_src_elem.pysnap

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
---
2-
created: '2020-02-26T19:54:41.012465Z'
3-
creator: sentry
42
source: tests/sentry/grouping/test_variants.py
53
---
64
default:
@@ -14,4 +12,4 @@ default:
1412
uri*
1513
"use.fontawesome.com"
1614
message (csp takes precedence)
17-
"Blocked 'style' from 'use.fontawesome.com'"
15+
"Blocked 'style' from '<hostname>'"

tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/newstyle@2023_01_11/csp_style_src_elem.pysnap

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
---
2-
created: '2023-01-11T11:41:28.890755Z'
3-
creator: sentry
42
source: tests/sentry/grouping/test_variants.py
53
---
64
default:
@@ -14,4 +12,4 @@ default:
1412
uri*
1513
"use.fontawesome.com"
1614
message (csp takes precedence)
17-
"Blocked 'style' from 'use.fontawesome.com'"
15+
"Blocked 'style' from '<hostname>'"

tests/sentry/grouping/test_normalize_message.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,16 @@
110110
"""blah connection failed after 12345ms 1.899s 3s""",
111111
"""blah connection failed after <duration> <duration> <duration>""",
112112
),
113+
(
114+
"Hostname - 2 levels",
115+
"""Blocked 'connect' from 'gggggggdasdwefwewqqqfefwef.com'""",
116+
"""Blocked 'connect' from '<hostname>'""",
117+
),
118+
(
119+
"Hostname - 3 levels",
120+
"""Blocked 'font' from 'www.time.co'""",
121+
"""Blocked 'font' from '<hostname>'""",
122+
),
113123
(
114124
"Uniq ID - sql savepoint",
115125
'''SQL: RELEASE SAVEPOINT "s140177518376768_x2"''',
@@ -145,6 +155,26 @@
145155
"""I am the test words 1password python3 abc123 123abc""",
146156
"""I am the test words 1password python3 abc123 123abc""",
147157
),
158+
(
159+
"Uniq ID - react element",
160+
"""Permission denied to access property "__reactFiber$b6c78e70asw" """,
161+
"""Permission denied to access property <uniq_id> """,
162+
),
163+
(
164+
"Uniq ID - no change variable name",
165+
"""TypeError: Cannot read property 'startRTM' of undefined""",
166+
"""TypeError: Cannot read property 'startRTM' of undefined""",
167+
),
168+
(
169+
"Uniq ID - json ignored properly",
170+
"""[401,""]""",
171+
"""[<int>,""]""",
172+
),
173+
(
174+
"Uniq ID - no change",
175+
"""Blocked 'script' from 'wasm-eval:'""",
176+
"""Blocked 'script' from 'wasm-eval:'""",
177+
),
148178
],
149179
)
150180
def test_normalize_message(name, input, expected):
@@ -168,6 +198,16 @@ def test_normalize_message(name, input, expected):
168198
"""blah <url> had a problem""",
169199
),
170200
("URL - IP w/ port", """blah 0.0.0.0:10 had a problem""", """blah <ip> had a problem"""),
201+
(
202+
"Int - parens",
203+
"""Tb.Worker {"msg" => "(#239323) Received ...""",
204+
"""Tb.Worker {"msg" => "(#<int>) Received ...""",
205+
),
206+
(
207+
"Uniq ID - Snuba query",
208+
"""Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776026)) AS `_snuba_tags_raw[9223372036854776026]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), 9223372036854775936)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[9223372036854776026]`, 'tolerable') AND equals(_snuba_metric_id, 9223372036854775936)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, 9223372036854775936))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), 1383997) AND in((project_id AS _snuba_project_id), [6726638]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('2024-03-18T23:22:00', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776069)) AS `_snuba_tags_raw[9223372036854776069]`), '2d896d92') AND in(_s...}""",
209+
"""Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), <int>)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[<int>]`, 'tolerable') AND equals(_snuba_metric_id, <int>)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, <int>))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), <int>) AND in((project_id AS _snuba_project_id), [<int>]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('<date>', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), '<uniq_id>') AND in(_s...}""",
210+
),
171211
],
172212
)
173213
def test_fail_to_normalize_message(name, input, expected):

0 commit comments

Comments
 (0)