Skip to content

Commit

Permalink
Implement OTLP retry mechanism (#4616)
Browse files Browse the repository at this point in the history
  • Loading branch information
alanwest authored Aug 30, 2023
1 parent bb1253e commit 1da0297
Show file tree
Hide file tree
Showing 4 changed files with 547 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
// <copyright file="OtlpRetry.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System.Diagnostics;
using System.Net;
using System.Net.Http.Headers;
using Google.Rpc;
using Grpc.Core;
using Status = Google.Rpc.Status;

namespace OpenTelemetry.Exporter.OpenTelemetryProtocol.Implementation.ExportClient;

/// <summary>
/// Implementation of the OTLP retry policy used by both OTLP/gRPC and OTLP/HTTP.
///
/// OTLP/gRPC
/// https://github.com/open-telemetry/opentelemetry-proto/blob/main/docs/specification.md#failures
///
/// OTLP/HTTP
/// https://github.com/open-telemetry/opentelemetry-proto/blob/main/docs/specification.md#failures-1
///
/// The specification requires retries use an exponential backoff strategy,
/// but does not provide specifics for the implementation. As such, this
/// implementation is inspired by the retry strategy provided by
/// Grpc.Net.Client which implements the gRPC retry specification.
///
/// Grpc.Net.Client retry implementation
/// https://github.com/grpc/grpc-dotnet/blob/83d12ea1cb628156c990243bc98699829b88738b/src/Grpc.Net.Client/Internal/Retry/RetryCall.cs#L94
///
/// gRPC retry specification
/// https://github.com/grpc/proposal/blob/master/A6-client-retries.md
///
/// The gRPC retry specification outlines configurable parameters used in its
/// exponential backoff strategy: initial backoff, max backoff, backoff
/// multiplier, and max retry attempts. The OTLP specification does not declare
/// a similar set of parameters, so this implementation uses fixed settings.
/// Furthermore, since the OTLP spec does not specify a max number of attempts,
/// this implementation will retry until the deadline is reached.
///
/// The throttling mechanism for OTLP differs from the throttling mechanism
/// described in the gRPC retry specification. See:
/// https://github.com/open-telemetry/opentelemetry-proto/blob/main/docs/specification.md#otlpgrpc-throttling.
/// </summary>
internal static class OtlpRetry
{
public const string GrpcStatusDetailsHeader = "grpc-status-details-bin";
public const int InitialBackoffMilliseconds = 1000;
private const int MaxBackoffMilliseconds = 5000;
private const double BackoffMultiplier = 1.5;

#if !NET6_0_OR_GREATER
private static readonly Random Random = new Random();
#endif

public static bool TryGetHttpRetryResult(HttpStatusCode statusCode, DateTime? deadline, HttpResponseHeaders responseHeaders, int retryDelayMilliseconds, out RetryResult retryResult)
{
return OtlpRetry.TryGetRetryResult(statusCode, IsHttpStatusCodeRetryable, deadline, responseHeaders, TryGetHttpRetryDelay, retryDelayMilliseconds, out retryResult);
}

public static bool TryGetGrpcRetryResult(StatusCode statusCode, DateTime? deadline, Metadata trailers, int retryDelayMilliseconds, out RetryResult retryResult)
{
return OtlpRetry.TryGetRetryResult(statusCode, IsGrpcStatusCodeRetryable, deadline, trailers, TryGetGrpcRetryDelay, retryDelayMilliseconds, out retryResult);
}

private static bool TryGetRetryResult<TStatusCode, TCarrier>(TStatusCode statusCode, Func<TStatusCode, bool, bool> isRetryable, DateTime? deadline, TCarrier carrier, Func<TStatusCode, TCarrier, TimeSpan?> throttleGetter, int nextRetryDelayMilliseconds, out RetryResult retryResult)
{
retryResult = default;

// TODO: Consider introducing a fixed max number of retries (e.g. max 5 retries).
// The spec does not specify a max number of retries, but it may be bad to not cap the number of attempts.
// Without a max number of retry attempts, retries would continue until the deadline.
// Maybe this is ok? However, it may lead to an unexpected behavioral change. For example:
// 1) When using a batch processor, a longer delay due to repeated
// retries up to the deadline may lead to a higher chance that the queue will be exhausted.
// 2) When using the simple processor, a longer delay due to repeated
// retries up to the deadline will lead to a prolonged blocking call.
// if (attemptCount >= MaxAttempts)
// {
// return false
// }

if (IsDeadlineExceeded(deadline))
{
return false;
}

var throttleDelay = throttleGetter(statusCode, carrier);
var retryable = isRetryable(statusCode, throttleDelay.HasValue);
if (!retryable)
{
return false;
}

var delayDuration = throttleDelay.HasValue
? throttleDelay.Value
: TimeSpan.FromMilliseconds(GetRandomNumber(0, nextRetryDelayMilliseconds));

if (deadline.HasValue && IsDeadlineExceeded(deadline + delayDuration))
{
return false;
}

if (throttleDelay.HasValue)
{
try
{
// TODO: Consider making nextRetryDelayMilliseconds a double to avoid the need for convert/overflow handling
nextRetryDelayMilliseconds = Convert.ToInt32(throttleDelay.Value.TotalMilliseconds);
}
catch (OverflowException)
{
nextRetryDelayMilliseconds = MaxBackoffMilliseconds;
}
}

nextRetryDelayMilliseconds = CalculateNextRetryDelay(nextRetryDelayMilliseconds);
retryResult = new RetryResult(throttleDelay.HasValue, delayDuration, nextRetryDelayMilliseconds);
return true;
}

private static bool IsDeadlineExceeded(DateTime? deadline)
{
// This implementation is internal, and it is guaranteed that deadline is UTC.
return deadline.HasValue && deadline <= DateTime.UtcNow;
}

private static int CalculateNextRetryDelay(int nextRetryDelayMilliseconds)
{
var nextMilliseconds = nextRetryDelayMilliseconds * BackoffMultiplier;
nextMilliseconds = Math.Min(nextMilliseconds, MaxBackoffMilliseconds);
return Convert.ToInt32(nextMilliseconds);
}

private static TimeSpan? TryGetGrpcRetryDelay(StatusCode statusCode, Metadata trailers)
{
Debug.Assert(trailers != null, "trailers was null");

if (statusCode != StatusCode.ResourceExhausted && statusCode != StatusCode.Unavailable)
{
return null;
}

var statusDetails = trailers.Get(GrpcStatusDetailsHeader);
if (statusDetails != null && statusDetails.IsBinary)
{
var status = Status.Parser.ParseFrom(statusDetails.ValueBytes);
foreach (var item in status.Details)
{
var success = item.TryUnpack<RetryInfo>(out var retryInfo);
if (success)
{
return retryInfo.RetryDelay.ToTimeSpan();
}
}
}

return null;
}

private static TimeSpan? TryGetHttpRetryDelay(HttpStatusCode statusCode, HttpResponseHeaders headers)
{
Debug.Assert(headers != null, "headers was null");

#if NETSTANDARD2_1_OR_GREATER || NET6_0_OR_GREATER
return statusCode == HttpStatusCode.TooManyRequests || statusCode == HttpStatusCode.ServiceUnavailable
#else
return statusCode == (HttpStatusCode)429 || statusCode == HttpStatusCode.ServiceUnavailable
#endif
? headers.RetryAfter?.Delta
: null;
}

private static bool IsGrpcStatusCodeRetryable(StatusCode statusCode, bool hasRetryDelay)
{
switch (statusCode)
{
case StatusCode.Cancelled:
case StatusCode.DeadlineExceeded:
case StatusCode.Aborted:
case StatusCode.OutOfRange:
case StatusCode.Unavailable:
case StatusCode.DataLoss:
return true;
case StatusCode.ResourceExhausted:
return hasRetryDelay;
default:
return false;
}
}

#pragma warning disable SA1313 // Parameter should begin with lower-case letter
private static bool IsHttpStatusCodeRetryable(HttpStatusCode statusCode, bool _)
#pragma warning restore SA1313 // Parameter should begin with lower-case letter
{
switch (statusCode)
{
#if NETSTANDARD2_1_OR_GREATER || NET6_0_OR_GREATER
case HttpStatusCode.TooManyRequests:
#else
case (HttpStatusCode)429:
#endif
case HttpStatusCode.BadGateway:
case HttpStatusCode.ServiceUnavailable:
case HttpStatusCode.GatewayTimeout:
return true;
default:
return false;
}
}

private static int GetRandomNumber(int min, int max)
{
#if NET6_0_OR_GREATER
return Random.Shared.Next(min, max);
#else
// TODO: Implement this better to minimize lock contention.
// Consider pulling in Random.Shared implementation.
lock (Random)
{
return Random.Next(min, max);
}
#endif
}

public readonly struct RetryResult
{
public readonly bool Throttled;
public readonly TimeSpan RetryDelay;
public readonly int NextRetryDelayMilliseconds;

public RetryResult(bool throttled, TimeSpan retryDelay, int nextRetryDelayMilliseconds)
{
this.Throttled = throttled;
this.RetryDelay = retryDelay;
this.NextRetryDelayMilliseconds = nextRetryDelayMilliseconds;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.rpc;

import "google/protobuf/duration.proto";

// Describes when the clients can retry a failed request. Clients could ignore
// the recommendation here or retry when this information is missing from error
// responses.
//
// It's always recommended that clients should use exponential backoff when
// retrying.
//
// Clients should wait until `retry_delay` amount of time has passed since
// receiving the error response before retrying. If retrying requests also
// fail, clients should use an exponential backoff scheme to gradually increase
// the delay between retries based on `retry_delay`, until either a maximum
// number of retries have been reached or a maximum retry delay cap has been
// reached.
message RetryInfo {
// Clients should wait at least this long between retrying the same request.
google.protobuf.Duration retry_delay = 1;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.rpc;

import "google/protobuf/any.proto";

// The `Status` type defines a logical error model that is suitable for
// different programming environments, including REST APIs and RPC APIs. It is
// used by [gRPC](https://github.com/grpc). Each `Status` message contains
// three pieces of data: error code, error message, and error details.
//
// You can find out more about this error model and how to work with it in the
// [API Design Guide](https://cloud.google.com/apis/design/errors).
message Status {
// The status code, which should be an enum value of
// [google.rpc.Code][google.rpc.Code].
int32 code = 1;

// A developer-facing error message, which should be in English. Any
// user-facing error message should be localized and sent in the
// [google.rpc.Status.details][google.rpc.Status.details] field, or localized
// by the client.
string message = 2;

// A list of messages that carry the error details. There is a common set of
// message types for APIs to use.
repeated google.protobuf.Any details = 3;
}
Loading

0 comments on commit 1da0297

Please sign in to comment.