
Commit ca029db

feat(comparative workflows): Add more tests for Comparison Workflows project (#91730)
This PR adds a few regression tests for the Comparative Workflows project. These tests check multiple scenarios related to the logic of the ML algorithm.
1 parent f24eb49 commit ca029db

File tree

1 file changed: +154 −0 lines changed

tests/sentry/seer/workflows/test_compare.py

Lines changed: 154 additions & 0 deletions
@@ -57,3 +57,157 @@ def test_keyed_rrf_score():
    assert math.isclose(scores[0][1], 0.01639, rel_tol=1e-3)
    assert scores[1][0] == "other"
    assert math.isclose(scores[1][1], 0.01612, abs_tol=1e-3)


def test_synthetic_baseline_kl():
    """
    This test checks the KL divergence for the synthetically generated baseline described in the tech spec.
    There are 4 attributes: country, device, error_code, browser.
    All the attributes are uniformly distributed in the baseline. Two changes are introduced in the outliers:
    a new value "unknown" is added to the browser attribute, making it the attribute with the most divergent distribution,
    and the values for country are skewed, making "fr" the most frequent one.
    The distributions of error_code and device are kept the same.
    """

    baseline = [
        ("country", "fr", 249871.0),
        ("country", "us", 249662.0),
        ("country", "uk", 249366.0),
        ("country", "de", 249166.0),
        ("device", "tablet", 333215.0),
        ("device", "mobile", 332586.0),
        ("device", "desktop", 332264.0),
        ("error_code", "404", 333386.0),
        ("error_code", "400", 332364.0),
        ("error_code", "504", 332315.0),
        ("browser", "edge", 249804.0),
        ("browser", "firefox", 249578.0),
        ("browser", "chrome", 249489.0),
        ("browser", "safari", 249194.0),
    ]
    outliers = [
        ("country", "fr", 987.0),
        ("country", "uk", 381.0),
        ("country", "us", 367.0),
        ("country", "de", 200.0),
        ("device", "desktop", 695.0),
        ("device", "mobile", 638.0),
        ("device", "tablet", 602.0),
        ("error_code", "404", 691.0),
        ("error_code", "504", 632.0),
        ("error_code", "400", 612.0),
        ("browser", "unknown", 1351.0),
        ("browser", "safari", 197.0),
        ("browser", "edge", 195.0),
        ("browser", "firefox", 192.0),
    ]

    scores = keyed_kl_score(
        baseline,
        outliers,
        total_baseline=998065,  # hardcoded
        total_outliers=1935,  # hardcoded
    )
    attributes = [s[0] for s in scores]
    assert attributes == ["browser", "country", "device", "error_code"]
    kl_scores = [s[1] for s in scores]
    assert math.isclose(kl_scores[0], 3.9547, rel_tol=1e-3)
    assert math.isclose(kl_scores[1], 0.17, abs_tol=1e-3)


def test_zero_kl():
    """
    This test checks that if the distribution of the outliers is more or less the same as the baseline, the KL divergence should be close to 0.
    """
    baseline = [
        ("browser", "edge", 249804.0),
        ("browser", "firefox", 249578.0),
        ("browser", "chrome", 249489.0),
        ("browser", "safari", 249194.0),
    ]
    outliers = [
        ("browser", "chrome", 194.0),
        ("browser", "safari", 197.0),
        ("browser", "edge", 195.0),
        ("browser", "firefox", 192.0),
    ]

    scores = keyed_kl_score(
        baseline,
        outliers,
        total_baseline=int(sum(i[2] for i in baseline)),
        total_outliers=int(sum(i[2] for i in outliers)),
    )
    assert math.isclose(scores[0][1], 0.0, abs_tol=1e-4)


def test_entropy_only():
    """
    This test ranks the attributes by entropy only.
    'country' has zero entropy, so it will be ranked the highest. 'browser' has the highest entropy, so it will be ranked the lowest.
    """
    # 1000 rows in the baseline
    baseline = [
        ("device", "desktop", 1000.0),
        ("country", "fr", 1000.0),
        ("browser", "edge", 1000.0),
    ]
    # 100 rows in the outliers
    outlier = [
        ("country", "fr", 100.0),  # zero entropy
        ("device", "tablet", 33.0),
        ("device", "mobile", 33.0),
        ("device", "desktop", 34.0),
        ("browser", "edge", 20.0),
        ("browser", "firefox", 20.0),
        ("browser", "chrome", 20.0),
        ("browser", "safari", 20.0),
        ("browser", "brave", 20.0),
    ]

    scores = keyed_rrf_score(
        baseline,
        outlier,
        total_baseline=1000,
        total_outliers=100,
        entropy_alpha=1.0,
        kl_alpha=0.0,
    )
    attributes = [s[0] for s in scores]
    assert attributes == ["country", "device", "browser"]


def test_small_support():
    """
    This test checks that the logic for adding unseen values to the distribution works.
    This logic is used to prevent small-support attributes from being ranked higher than large-support attributes.
    Attribute 'device' has the lowest support (it is present in only 11 rows out of 1000) but a large difference
    between the baseline and outlier distributions, so without the unseen value 'device' would be ranked the highest.
    But since we add the unseen value, 'device' will be ranked the lowest.
    """
    # 1000 rows in the baseline
    baseline = [
        ("device", "desktop", 10.0),
        ("device", "mobile", 1.0),
        ("browser", "edge", 350.0),
        ("browser", "chrome", 650.0),
        ("country", "fr", 300.0),
        ("country", "us", 700.0),
    ]
    # 100 rows in the outlier
    outlier = [
        ("device", "desktop", 1.0),
        ("device", "mobile", 10.0),
        ("browser", "edge", 65.0),
        ("browser", "chrome", 35.0),
        ("country", "fr", 70.0),
        ("country", "us", 30.0),
    ]

    scores = keyed_kl_score(
        baseline,
        outlier,
        total_baseline=1000,
        total_outliers=100,
    )
    attributes = [s[0] for s in scores]
    assert attributes == ["country", "browser", "device"]
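
For readers trying to follow the scoring logic these tests exercise, the snippet below is a minimal sketch of a keyed KL score with an unseen-value bucket, written from the test expectations alone. The function name keyed_kl_score_sketch, the EPSILON smoothing floor, the "__unseen__" bucket, and the direction of the divergence (baseline versus outliers) are all assumptions; the actual keyed_kl_score and keyed_rrf_score under test may differ in those details.

import math
from collections import defaultdict

EPSILON = 1e-9  # assumed smoothing floor for values missing from one side


def keyed_kl_score_sketch(baseline, outliers, total_baseline, total_outliers):
    """Rank attributes by KL(baseline distribution || outlier distribution).

    Sketch only: the real implementation may smooth, normalize, or direct the
    divergence differently.
    """

    def to_distributions(rows, total):
        # Turn (attribute, value, count) rows into per-attribute probability maps.
        dists = defaultdict(dict)
        for attribute, value, count in rows:
            dists[attribute][value] = count / total
        for dist in dists.values():
            # Fold the rows this attribute does not cover into an "unseen" bucket,
            # which keeps low-support attributes (see test_small_support) from
            # dominating the ranking.
            dist["__unseen__"] = max(0.0, 1.0 - sum(dist.values()))
        return dists

    baseline_dists = to_distributions(baseline, total_baseline)
    outlier_dists = to_distributions(outliers, total_outliers)

    scores = []
    for attribute, baseline_dist in baseline_dists.items():
        outlier_dist = outlier_dists.get(attribute, {})
        kl = 0.0
        for value, p in baseline_dist.items():
            if p <= 0.0:
                continue  # zero baseline mass contributes nothing to the sum
            q = max(outlier_dist.get(value, 0.0), EPSILON)
            kl += p * math.log(p / q)
        scores.append((attribute, kl))

    # Highest divergence first, matching the ordering the tests assert on.
    return sorted(scores, key=lambda score: score[1], reverse=True)

Run against the fixtures above, this sketch should reproduce the attribute orderings the tests assert (browser first in test_synthetic_baseline_kl, country first in test_small_support), though not the exact constants such as 3.9547, which depend on the real smoothing and normalization.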
