Serialize Monitoring API calls #260

Open · wants to merge 1 commit into master
106 changes: 44 additions & 62 deletions stackdriver/client.go
@@ -20,7 +20,6 @@ import (
 	"net"
 	"net/url"
 	"strconv"
-	"sync"
 	"time"
 
 	"go.opencensus.io/plugin/ocgrpc"
@@ -178,73 +177,56 @@ func (c *Client) Store(req *monitoring.CreateTimeSeriesRequest) error {
 
 	service := monitoring.NewMetricServiceClient(conn)
 
-	errors := make(chan error, len(tss)/MaxTimeseriesesPerRequest+1)
-	var wg sync.WaitGroup
-	for i := 0; i < len(tss); i += MaxTimeseriesesPerRequest {
-		end := i + MaxTimeseriesesPerRequest
-		if end > len(tss) {
-			end = len(tss)
-		}
-		wg.Add(1)
-		go func(begin int, end int) {
-			defer wg.Done()
-			req_copy := &monitoring.CreateTimeSeriesRequest{
-				Name:       c.projectID,
-				TimeSeries: req.TimeSeries[begin:end],
-			}
-			_, err := service.CreateTimeSeries(ctx, req_copy)
-			if err == nil {
-				// The response is empty if all points were successfully written.
-				stats.RecordWithTags(ctx,
-					[]tag.Mutator{tag.Upsert(StatusTag, "0")},
-					PointCount.M(int64(end-begin)))
-			} else {
-				level.Debug(c.logger).Log(
-					"msg", "Partial failure calling CreateTimeSeries",
-					"err", err)
-				status, ok := status.FromError(err)
-				if !ok {
-					level.Warn(c.logger).Log("msg", "Unexpected error message type from Monitoring API", "err", err)
-					errors <- err
-					return
-				}
-				for _, details := range status.Details() {
-					if summary, ok := details.(*monitoring.CreateTimeSeriesSummary); ok {
-						level.Debug(c.logger).Log("summary", summary)
-						stats.RecordWithTags(ctx,
-							[]tag.Mutator{tag.Upsert(StatusTag, "0")},
-							PointCount.M(int64(summary.SuccessPointCount)))
-						for _, e := range summary.Errors {
-							stats.RecordWithTags(ctx,
-								[]tag.Mutator{tag.Upsert(StatusTag, fmt.Sprint(uint32(e.Status.Code)))},
-								PointCount.M(int64(e.PointCount)))
-						}
-					}
-				}
-				switch status.Code() {
-				// codes.DeadlineExceeded:
-				// It is safe to retry
-				// google.monitoring.v3.MetricService.CreateTimeSeries
-				// requests with backoff because QueueManager
-				// enforces in-order writes on a time series, which
-				// is a requirement for Stackdriver monitoring.
-				//
-				// codes.Unavailable:
-				// The condition is most likely transient. The request can
-				// be retried with backoff.
-				case codes.DeadlineExceeded, codes.Unavailable:
-					errors <- recoverableError{err}
-				default:
-					errors <- err
-				}
-			}
-		}(i, end)
-	}
-	wg.Wait()
-	close(errors)
-	if err, ok := <-errors; ok {
-		return err
-	}
+	req_copy := &monitoring.CreateTimeSeriesRequest{
+		Name:       c.projectID,
+		TimeSeries: req.TimeSeries,
+	}
+	_, err = service.CreateTimeSeries(ctx, req_copy)
+	if err == nil {
+		// The response is empty if all points were successfully written.
+		stats.RecordWithTags(ctx,
+			[]tag.Mutator{tag.Upsert(StatusTag, "0")},
+			PointCount.M(int64(len(tss))))
+	} else {
+		level.Debug(c.logger).Log(
+			"msg", "Partial failure calling CreateTimeSeries",
+			"err", err)
+		status, ok := status.FromError(err)
+		if !ok {
+			level.Warn(c.logger).Log("msg", "Unexpected error message type from Monitoring API", "err", err)
+			return err
+		}
+		for _, details := range status.Details() {
+			if summary, ok := details.(*monitoring.CreateTimeSeriesSummary); ok {
+				level.Debug(c.logger).Log("summary", summary)
+				stats.RecordWithTags(ctx,
+					[]tag.Mutator{tag.Upsert(StatusTag, "0")},
+					PointCount.M(int64(summary.SuccessPointCount)))
+				for _, e := range summary.Errors {
+					stats.RecordWithTags(ctx,
+						[]tag.Mutator{tag.Upsert(StatusTag, fmt.Sprint(uint32(e.Status.Code)))},
+						PointCount.M(int64(e.PointCount)))
+				}
+			}
+		}
+		switch status.Code() {
+		// codes.DeadlineExceeded:
+		// It is safe to retry
+		// google.monitoring.v3.MetricService.CreateTimeSeries
+		// requests with backoff because QueueManager
+		// enforces in-order writes on a time series, which
+		// is a requirement for Stackdriver monitoring.
+		//
+		// codes.Unavailable:
+		// The condition is most likely transient. The request can
+		// be retried with backoff.
+		case codes.DeadlineExceeded, codes.Unavailable:
+			return recoverableError{err}
+		default:
+			return err
+		}
+	}
 
 	return nil
 }

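Review note: the serialized path now surfaces failures directly instead of funneling them through a buffered channel. The sketch below restates the error-classification logic from the end of Store in isolation; it assumes recoverableError is a thin wrapper defined elsewhere in the package, and classifyStoreError is a hypothetical name used only for illustration.

package stackdriver

import (
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// recoverableError stands in for the package's retryable-error wrapper
// (assumed shape; the real type is defined elsewhere in this package).
type recoverableError struct {
	error
}

// classifyStoreError (hypothetical helper) mirrors the switch at the end of
// Store: transient gRPC codes are wrapped so the queue manager retries them
// with backoff, and everything else is returned unchanged.
func classifyStoreError(err error) error {
	st, ok := status.FromError(err)
	if !ok {
		// Not a gRPC status error; treat it as fatal.
		return err
	}
	switch st.Code() {
	case codes.DeadlineExceeded, codes.Unavailable:
		// Safe to retry with backoff: the queue manager enforces in-order
		// writes per time series, which Stackdriver Monitoring requires.
		return recoverableError{err}
	default:
		return err
	}
}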
8 changes: 7 additions & 1 deletion stackdriver/queue_manager.go
@@ -509,7 +509,13 @@ func (s *shardCollection) runShard(i int) {
 
 func (s *shardCollection) sendSamples(client StorageClient, samples []*monitoring_pb.TimeSeries) {
 	begin := time.Now()
-	s.sendSamplesWithBackoff(client, samples)
+	for i := 0; i < len(samples); i += MaxTimeseriesesPerRequest {
+		end := i + MaxTimeseriesesPerRequest
+		if end > len(samples) {
+			end = len(samples)
+		}
+		s.sendSamplesWithBackoff(client, samples[i:end])
+	}
 
 	// These counters are used to calculate the dynamic sharding, and as such
 	// should be maintained irrespective of success or failure.
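The batching that used to live in client.go now happens here, one request at a time. As a standalone illustration of the pattern (the helper name and the 200-series limit are assumptions for the sketch, not taken from this diff), serial chunking of a slice looks like this:

package stackdriver

// maxPerRequest caps how many time series go into one CreateTimeSeries call.
// 200 is an assumed value for the sketch; the PR uses the package constant
// MaxTimeseriesesPerRequest.
const maxPerRequest = 200

// forEachChunk walks a slice of length n in fixed-size windows and hands each
// window to send in order, so per-series write ordering is preserved and only
// one Monitoring API call is in flight at a time.
func forEachChunk(n int, send func(begin, end int)) {
	for i := 0; i < n; i += maxPerRequest {
		end := i + maxPerRequest
		if end > n {
			end = n
		}
		send(i, end)
	}
}

Serializing trades intra-shard parallelism for strict ordering and simpler error propagation; overall throughput presumably still scales through the dynamic sharding that the counters above feed.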