Commit ad8c5f5

ref(span-buffer/flusher): More metrics and more lenient backpressure. (#92195)
VIEPF-30
1 parent f1297a7 commit ad8c5f5

4 files changed: +47, -34 lines changed

src/sentry/options/defaults.py

Lines changed: 5 additions & 0 deletions

@@ -2663,6 +2663,11 @@
     default=False,
     flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
 )
+register(
+    "standalone-spans.buffer.flusher.backpressure_seconds",
+    default=10,
+    flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
+)
 register(
     "indexed-spans.agg-span-waterfall.enable",
     default=False,
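
A minimal sketch of how the flusher can read the new option at runtime, using the same options.get call that appears in the flusher diff below; the default of 10 seconds acts as a grace period before backpressure kicks in:

from sentry import options

# Grace period (seconds) the flusher must stay saturated before the consumer
# starts rejecting messages; defaults to 10 per the registration above.
backpressure_seconds = options.get("standalone-spans.buffer.flusher.backpressure_seconds")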

src/sentry/spans/buffer.py

Lines changed: 9 additions & 4 deletions

@@ -344,15 +344,17 @@ def flush_segments(self, now: int, max_segments: int = 0) -> dict[SegmentKey, Fl

         result = p.execute()

-        segment_keys: list[tuple[QueueKey, SegmentKey]] = []
+        segment_keys: list[tuple[int, QueueKey, SegmentKey]] = []

         with metrics.timer("spans.buffer.flush_segments.load_segment_data"):
             with self.client.pipeline(transaction=False) as p:
                 # ZRANGEBYSCORE output
-                for queue_key, segment_span_ids in zip(queue_keys, result):
+                for shard, queue_key, segment_span_ids in zip(
+                    self.assigned_shards, queue_keys, result
+                ):
                     # process return value of zrevrangebyscore
                     for segment_key in segment_span_ids:
-                        segment_keys.append((queue_key, segment_key))
+                        segment_keys.append((shard, queue_key, segment_key))
                         p.smembers(segment_key)

                 segments = p.execute()
@@ -361,7 +363,7 @@ def flush_segments(self, now: int, max_segments: int = 0) -> dict[SegmentKey, Fl

         num_has_root_spans = 0

-        for (queue_key, segment_key), segment in zip(segment_keys, segments):
+        for (shard, queue_key, segment_key), segment in zip(segment_keys, segments):
             segment_span_id = _segment_key_to_span_id(segment_key).decode("ascii")

             output_spans = []
@@ -396,6 +398,9 @@ def flush_segments(self, now: int, max_segments: int = 0) -> dict[SegmentKey, Fl

                 output_spans.append(OutputSpan(payload=val))

+            metrics.incr(
+                "spans.buffer.flush_segments.num_segments_per_shard", tags={"shard_i": shard}
+            )
             return_segments[segment_key] = FlushedSegment(queue_key=queue_key, spans=output_spans)
             num_has_root_spans += int(has_root_span)
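
The zip over self.assigned_shards presumably relies on queue_keys being built by iterating the shards in the same order, so each flushed segment can be attributed to the shard it came from and counted in the new spans.buffer.flush_segments.num_segments_per_shard metric. A small illustration of the positional pairing, with hypothetical shard indices, key names, and results:

# Hypothetical values showing how shards, queue keys, and per-queue
# ZRANGEBYSCORE results line up positionally in the loop above.
assigned_shards = [0, 1, 2]
queue_keys = [f"span-buf:q:{shard}" for shard in assigned_shards]  # assumed key format
results = [[b"seg-a"], [], [b"seg-b", b"seg-c"]]

for shard, queue_key, segment_span_ids in zip(assigned_shards, queue_keys, results):
    for segment_key in segment_span_ids:
        # each segment now carries its shard index, which becomes the
        # "shard_i" tag on num_segments_per_shard
        print(shard, queue_key, segment_key)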

src/sentry/spans/consumers/process/flusher.py

Lines changed: 32 additions & 29 deletions

@@ -4,13 +4,14 @@
 import time
 from collections.abc import Callable

-import rapidjson
+import orjson
 import sentry_sdk
 from arroyo import Topic as ArroyoTopic
 from arroyo.backends.kafka import KafkaPayload, KafkaProducer, build_kafka_configuration
 from arroyo.processing.strategies.abstract import MessageRejected, ProcessingStrategy
 from arroyo.types import FilteredPayload, Message

+from sentry import options
 from sentry.conf.types.kafka_definition import Topic
 from sentry.spans.buffer import SpansBuffer
 from sentry.utils import metrics
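
Swapping rapidjson for orjson is what lets the later hunk drop the explicit .encode("utf8"): orjson.dumps returns bytes, while rapidjson.dumps returns a str. A tiny sketch of the difference, with a hypothetical payload:

import orjson

payload = {"spans": [{"span_id": "abc"}]}  # hypothetical payload

value = orjson.dumps(payload)  # already bytes, ready to wrap in KafkaPayload
assert isinstance(value, bytes)
# rapidjson.dumps(payload) would return a str and need .encode("utf8")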
@@ -51,7 +52,7 @@ def __init__(
         self.stopped = multiprocessing.Value("i", 0)
         self.redis_was_full = False
         self.current_drift = multiprocessing.Value("i", 0)
-        self.should_backpressure = multiprocessing.Value("i", 0)
+        self.backpressure_since = multiprocessing.Value("i", 0)
         self.produce_to_pipe = produce_to_pipe

         self._create_process()
@@ -73,7 +74,7 @@ def _create_process(self):
             initializer,
             self.stopped,
             self.current_drift,
-            self.should_backpressure,
+            self.backpressure_since,
             self.buffer,
             self.max_flush_segments,
             self.produce_to_pipe,
@@ -89,7 +90,7 @@ def main(
         initializer: Callable | None,
         stopped,
         current_drift,
-        should_backpressure,
+        backpressure_since,
         buffer: SpansBuffer,
         max_flush_segments: int,
         produce_to_pipe: Callable[[KafkaPayload], None] | None,
@@ -121,34 +122,36 @@ def produce(payload: KafkaPayload) -> None:
                 now = int(time.time()) + current_drift.value
                 flushed_segments = buffer.flush_segments(max_segments=max_flush_segments, now=now)

-                should_backpressure.value = len(flushed_segments) >= max_flush_segments * len(
-                    buffer.assigned_shards
-                )
+                if len(flushed_segments) >= max_flush_segments * len(buffer.assigned_shards):
+                    if backpressure_since.value == 0:
+                        backpressure_since.value = int(time.time())
+                else:
+                    backpressure_since.value = 0

                 if not flushed_segments:
                     time.sleep(1)
                     continue

-                for _, flushed_segment in flushed_segments.items():
-                    if not flushed_segment.spans:
-                        # This is a bug, most likely the input topic is not
-                        # partitioned by trace_id so multiple consumers are writing
-                        # over each other. The consequence is duplicated segments,
-                        # worst-case.
-                        metrics.incr("sentry.spans.buffer.empty_segments")
-                        continue
+                with metrics.timer("spans.buffer.flusher.produce"):
+                    for _, flushed_segment in flushed_segments.items():
+                        if not flushed_segment.spans:
+                            # This is a bug, most likely the input topic is not
+                            # partitioned by trace_id so multiple consumers are writing
+                            # over each other. The consequence is duplicated segments,
+                            # worst-case.
+                            metrics.incr("sentry.spans.buffer.empty_segments")
+                            continue

-                    spans = [span.payload for span in flushed_segment.spans]
+                        spans = [span.payload for span in flushed_segment.spans]

-                    kafka_payload = KafkaPayload(
-                        None, rapidjson.dumps({"spans": spans}).encode("utf8"), []
-                    )
+                        kafka_payload = KafkaPayload(None, orjson.dumps({"spans": spans}), [])

-                    metrics.timing("spans.buffer.segment_size_bytes", len(kafka_payload.value))
-                    produce(kafka_payload)
+                        metrics.timing("spans.buffer.segment_size_bytes", len(kafka_payload.value))
+                        produce(kafka_payload)

-                for future in producer_futures:
-                    future.result()
+                with metrics.timer("spans.buffer.flusher.wait_produce"):
+                    for future in producer_futures:
+                        future.result()

                 producer_futures.clear()
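
The flusher process now records when it first became saturated (flushing at least max_flush_segments per assigned shard) instead of a plain boolean, and clears the timestamp as soon as it catches up. A hypothetical standalone mirror of that state transition:

import time

def update_backpressure_since(backpressure_since: int, num_flushed: int, capacity: int) -> int:
    # Mirrors the loop above: remember the first moment the flusher hit its
    # per-cycle capacity, reset to 0 once it catches up. The real code mutates
    # a shared multiprocessing.Value instead of returning a new value.
    if num_flushed >= capacity:
        return backpressure_since or int(time.time())
    return 0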

@@ -187,12 +190,12 @@ def submit(self, message: Message[FilteredPayload | int]) -> None:
         # efforts, it is still always going to be less durable than Kafka.
         # Minimizing our Redis memory usage also makes COGS easier to reason
         # about.
-        #
-        # should_backpressure is true if there are many segments to flush, but
-        # the flusher can't get all of them out.
-        if self.should_backpressure.value:
-            metrics.incr("sentry.spans.buffer.flusher.backpressure")
-            raise MessageRejected()
+        if self.backpressure_since.value > 0:
+            if int(time.time()) - self.backpressure_since.value > options.get(
+                "standalone-spans.buffer.flusher.backpressure_seconds"
+            ):
+                metrics.incr("sentry.spans.buffer.flusher.backpressure")
+                raise MessageRejected()

         # We set the drift. The backpressure based on redis memory comes after.
         # If Redis is full for a long time, the drift will grow into a large
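
On the consumer side, submit() now only rejects messages once the flusher has been saturated continuously for longer than the configured grace period, rather than on every saturated flush cycle. A minimal hypothetical sketch of that check:

import time

def should_reject(backpressure_since: int, grace_seconds: int, now: int | None = None) -> bool:
    # Mirrors submit() above: backpressure_since is 0 while the flusher keeps
    # up, otherwise the timestamp at which it first fell behind.
    if backpressure_since <= 0:
        return False
    if now is None:
        now = int(time.time())
    return now - backpressure_since > grace_seconds

# With the default of 10 seconds: saturated for 5s -> keep accepting;
# saturated for 15s -> reject (MessageRejected in the real consumer).
assert should_reject(100, 10, now=105) is False
assert should_reject(100, 10, now=115) is True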

tests/sentry/spans/consumers/process/test_flusher.py

Lines changed: 1 addition & 1 deletion

@@ -80,4 +80,4 @@ def append(msg):

     assert messages

-    assert flusher.should_backpressure.value
+    assert flusher.backpressure_since.value
