from __future__ import annotations

-import contextlib
import logging
from collections.abc import Sequence
from datetime import datetime, timezone

+from snuba_sdk import Column, Condition, Entity, Function, Granularity, Limit, Offset, Op, Query
+
from sentry.api.event_search import QueryToken, parse_search_query
from sentry.models.organization import Organization
from sentry.replays.lib.kafka import initialize_replays_publisher
-from sentry.replays.post_process import generate_normalized_output
-from sentry.replays.query import query_replays_collection_paginated, replay_url_parser_config
-from sentry.replays.tasks import archive_replay, delete_replay_recording_async
+from sentry.replays.query import replay_url_parser_config
+from sentry.replays.tasks import archive_replay, delete_replays_script_async
+from sentry.replays.usecases.query import execute_query, handle_search_filters
+from sentry.replays.usecases.query.configs.scalar import scalar_search_config


logger = logging.getLogger()
@@ -33,21 +35,15 @@ def delete_replays(
    has_more = True
    while has_more:
-        response = query_replays_collection_paginated(
-            project_ids=[project_id],
+        replays, has_more = _get_rows_matching_deletion_pattern(
+            project_id=project_id,
            start=start_utc,
            end=end_utc,
-            fields=["id"],
            limit=batch_size,
-            environment=environment,
            offset=offset,
            search_filters=search_filters,
-            sort="started_at",
-            organization=Organization.objects.filter(project__id=project_id).get(),
-            preferred_source="scalar",
+            environment=environment,
        )
-        replays = list(generate_normalized_output(response.response))
-        has_more = response.has_more

        # Exit early if no replays were found.
        if not replays:
@@ -58,16 +54,16 @@ def delete_replays(
        if dry_run:
            print(f"Replays to be deleted (dry run): {len(replays)}")  # NOQA
        else:
-            delete_replay_ids(project_id, replay_ids=[r["id"] for r in replays])
+            delete_replay_ids(project_id, replays)


def translate_cli_tags_param_to_snuba_tag_param(tags: list[str]) -> Sequence[QueryToken]:
    return parse_search_query(" AND ".join(tags), config=replay_url_parser_config)


-def delete_replay_ids(project_id: int, replay_ids: list[str]) -> None:
+def delete_replay_ids(project_id: int, rows: list[tuple[int, str, int]]) -> None:
    """Delete a set of replay-ids for a specific project."""
-    logger.info("Archiving %d replays.", len(replay_ids))
+    logger.info("Archiving %d replays.", len(rows))

    # Bulk produce archived replay rows to the ingest-replay-events topic before flushing.
    #
@@ -79,30 +75,75 @@ def delete_replay_ids(project_id: int, replay_ids: list[str]) -> None:
    #
    # This also gives us reasonable assurances that if the script ran to completion the customer
    # will not be able to access their deleted data even if the actual deletion takes place some
-    # time later.
-    with _bulk_produce_then_flush() as publisher:
-        for replay_id in replay_ids:
-            archive_replay(publisher, project_id, replay_id)
+    # time later
+    publisher = initialize_replays_publisher(is_async=True)
+    for _, replay_id, _ in rows:
+        archive_replay(publisher, project_id, replay_id)
+    publisher.flush()

-    logger.info("Scheduling %d replays for deletion.", len(replay_ids))
+    logger.info("Scheduling %d replays for deletion.", len(rows))

    # Asynchronously delete RRWeb recording data.
    #
    # Because this operation could involve millions of requests to the blob storage provider we
    # schedule the tasks to run on a cluster of workers. This allows us to parallelize the work
    # and complete the task as quickly as possible.
-    for replay_id in replay_ids:
-        delete_replay_recording_async.delay(project_id, replay_id)
+    for retention_days, replay_id, max_segment_id in rows:
+        delete_replays_script_async.delay(retention_days, project_id, replay_id, max_segment_id)

-    logger.info("%d replays were successfully deleted.", len(replay_ids))
+    logger.info("%d replays were successfully deleted.", len(rows))
    logger.info(
        "The customer will no longer have access to the replays passed to this function. Deletion "
        "of RRWeb data will complete asynchronously."
    )


-@contextlib.contextmanager
-def _bulk_produce_then_flush():
-    publisher = initialize_replays_publisher(is_async=True)
-    yield publisher
-    publisher.flush()
+def _get_rows_matching_deletion_pattern(
+    project_id: int,
+    limit: int,
+    offset: int,
+    end: datetime,
+    start: datetime,
+    search_filters: Sequence[QueryToken],
+    environment: list[str],
+) -> tuple[list[tuple[int, str, int]], bool]:
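+    """Return (retention_days, replay_id, max_segment_id) rows matching the deletion filters."""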
+    where = handle_search_filters(scalar_search_config, search_filters)
+
+    if environment:
+        where.append(Condition(Column("environment"), Op.IN, environment))
+
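+    # One aggregated row per replay_id: its retention_days and the highest segment_id seen.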
+    query = Query(
+        match=Entity("replays"),
+        select=[
+            Function("any", parameters=[Column("retention_days")], alias="retention_days"),
+            Column("replay_id"),
+            Function("max", parameters=[Column("segment_id")], alias="max_segment_id"),
+        ],
+        where=[
+            Condition(Column("project_id"), Op.EQ, project_id),
+            Condition(Column("timestamp"), Op.LT, end),
+            Condition(Column("timestamp"), Op.GTE, start),
+            *where,
+        ],
+        groupby=[Column("replay_id")],
+        granularity=Granularity(3600),
+        limit=Limit(limit),
+        offset=Offset(offset),
+    )
+
+    response = execute_query(
+        query,
+        {"tenant_id": Organization.objects.filter(project__id=project_id).get().id},
+        "replays.scripts.delete_replays",
+    )
+
+    data = response.get("data", [])
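+    # A completely full page suggests another query may return more rows.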
+    has_more = len(data) == limit
+
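+    # Normalize replay_ids to their dashless form before returning them.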
+    return (
+        [
+            (item["retention_days"], item["replay_id"].replace("-", ""), item["max_segment_id"])
+            for item in data
+        ],
+        has_more,
+    )