Skip to content

Commit 3b68c6d

Browse files
Sushisourcecretz
andauthoredMay 14, 2024··
Add request failure code label to metrics (#1472)
Co-authored-by: Chad Retz <chad.retz@gmail.com>
1 parent c69831e commit 3b68c6d

File tree

8 files changed

+94
-27
lines changed

8 files changed

+94
-27
lines changed
 

‎internal/client.go

+5-8
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,9 @@ type (
471471
// worker options, the ones here wrap the ones in worker options. The same
472472
// interceptor should not be set here and in worker options.
473473
Interceptors []ClientInterceptor
474+
475+
// If set true, error code labels will not be included on request failure metrics.
476+
DisableErrorCodeMetricTags bool
474477
}
475478

476479
// HeadersProvider returns a map of gRPC headers that should be used on every request.
@@ -813,14 +816,8 @@ func newDialParameters(options *ClientOptions, excludeInternalFromRetry *atomic.
813816
return dialParameters{
814817
UserConnectionOptions: options.ConnectionOptions,
815818
HostPort: options.HostPort,
816-
RequiredInterceptors: requiredInterceptors(
817-
options.MetricsHandler,
818-
options.HeadersProvider,
819-
options.TrafficController,
820-
excludeInternalFromRetry,
821-
options.Credentials,
822-
),
823-
DefaultServiceConfig: defaultServiceConfig,
819+
RequiredInterceptors: requiredInterceptors(options, excludeInternalFromRetry),
820+
DefaultServiceConfig: defaultServiceConfig,
824821
}
825822
}
826823

‎internal/common/metrics/constants.go

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ const (
9595
OperationTagName = "operation"
9696
CauseTagName = "cause"
9797
WorkflowTaskFailureReason = "failure_reason"
98+
RequestFailureCode = "status_code"
9899
)
99100

100101
// Metric tag values

‎internal/common/metrics/grpc.go

+7-3
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ type HandlerContextKey struct{}
4242
type LongPollContextKey struct{}
4343

4444
// NewGRPCInterceptor creates a new gRPC unary interceptor to record metrics.
45-
func NewGRPCInterceptor(defaultHandler Handler, suffix string) grpc.UnaryClientInterceptor {
45+
func NewGRPCInterceptor(defaultHandler Handler, suffix string, disableRequestFailCodes bool) grpc.UnaryClientInterceptor {
4646
return func(
4747
ctx context.Context,
4848
method string,
@@ -78,7 +78,7 @@ func NewGRPCInterceptor(defaultHandler Handler, suffix string) grpc.UnaryClientI
7878
start := time.Now()
7979
recordRequestStart(handler, longPoll, suffix)
8080
err := invoker(ctx, method, req, reply, cc, opts...)
81-
recordRequestEnd(handler, longPoll, suffix, start, err)
81+
recordRequestEnd(handler, longPoll, suffix, start, err, disableRequestFailCodes)
8282
return err
8383
}
8484
}
@@ -93,7 +93,7 @@ func recordRequestStart(handler Handler, longPoll bool, suffix string) {
9393
handler.Counter(metric).Inc(1)
9494
}
9595

96-
func recordRequestEnd(handler Handler, longPoll bool, suffix string, start time.Time, err error) {
96+
func recordRequestEnd(handler Handler, longPoll bool, suffix string, start time.Time, err error, disableRequestFailCodes bool) {
9797
// Record latency
9898
timerMetric := TemporalRequestLatency
9999
if longPoll {
@@ -109,6 +109,10 @@ func recordRequestEnd(handler Handler, longPoll bool, suffix string, start time.
109109
failureMetric = TemporalLongRequestFailure
110110
}
111111
failureMetric += suffix
112+
errStatus, _ := status.FromError(err)
113+
if !disableRequestFailCodes {
114+
handler = handler.WithTags(RequestFailureCodeTags(errStatus.Code()))
115+
}
112116
handler.Counter(failureMetric).Inc(1)
113117

114118
// If it's a resource exhausted, extract cause if present and increment

‎internal/common/metrics/grpc_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ func TestGRPCInterceptor(t *testing.T) {
5252
handler := metrics.NewCapturingHandler()
5353
cc, err := grpc.Dial(l.Addr().String(),
5454
grpc.WithTransportCredentials(insecure.NewCredentials()),
55-
grpc.WithUnaryInterceptor(metrics.NewGRPCInterceptor(handler, "_my_suffix")))
55+
grpc.WithUnaryInterceptor(metrics.NewGRPCInterceptor(handler, "_my_suffix", true)))
5656
require.NoError(t, err)
5757
defer func() { _ = cc.Close() }()
5858
client := grpc_health_v1.NewHealthClient(cc)

‎internal/common/metrics/tags.go

+56
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222

2323
package metrics
2424

25+
import (
26+
"strconv"
27+
28+
"google.golang.org/grpc/codes"
29+
)
30+
2531
// RootTags returns a set of base tags for all metrics.
2632
func RootTags(namespace string) map[string]string {
2733
return map[string]string{
@@ -94,3 +100,53 @@ func WorkflowTaskFailedTags(reason string) map[string]string {
94100
WorkflowTaskFailureReason: reason,
95101
}
96102
}
103+
104+
// RequestFailureCodeTags returns a set of tags for a request failure.
105+
func RequestFailureCodeTags(statusCode codes.Code) map[string]string {
106+
asStr := canonicalString(statusCode)
107+
return map[string]string{
108+
RequestFailureCode: asStr,
109+
}
110+
}
111+
112+
// Annoyingly gRPC defines this, but does not expose it publicly.
113+
func canonicalString(c codes.Code) string {
114+
switch c {
115+
case codes.OK:
116+
return "OK"
117+
case codes.Canceled:
118+
return "CANCELLED"
119+
case codes.Unknown:
120+
return "UNKNOWN"
121+
case codes.InvalidArgument:
122+
return "INVALID_ARGUMENT"
123+
case codes.DeadlineExceeded:
124+
return "DEADLINE_EXCEEDED"
125+
case codes.NotFound:
126+
return "NOT_FOUND"
127+
case codes.AlreadyExists:
128+
return "ALREADY_EXISTS"
129+
case codes.PermissionDenied:
130+
return "PERMISSION_DENIED"
131+
case codes.ResourceExhausted:
132+
return "RESOURCE_EXHAUSTED"
133+
case codes.FailedPrecondition:
134+
return "FAILED_PRECONDITION"
135+
case codes.Aborted:
136+
return "ABORTED"
137+
case codes.OutOfRange:
138+
return "OUT_OF_RANGE"
139+
case codes.Unimplemented:
140+
return "UNIMPLEMENTED"
141+
case codes.Internal:
142+
return "INTERNAL"
143+
case codes.Unavailable:
144+
return "UNAVAILABLE"
145+
case codes.DataLoss:
146+
return "DATA_LOSS"
147+
case codes.Unauthenticated:
148+
return "UNAUTHENTICATED"
149+
default:
150+
return "CODE(" + strconv.FormatInt(int64(c), 10) + ")"
151+
}
152+
}

‎internal/grpc_dialer.go

+9-12
Original file line numberDiff line numberDiff line change
@@ -145,34 +145,31 @@ func dial(params dialParameters) (*grpc.ClientConn, error) {
145145
}
146146

147147
func requiredInterceptors(
148-
metricsHandler metrics.Handler,
149-
headersProvider HeadersProvider,
150-
controller TrafficController,
148+
clientOptions *ClientOptions,
151149
excludeInternalFromRetry *atomic.Bool,
152-
credentials Credentials,
153150
) []grpc.UnaryClientInterceptor {
154151
interceptors := []grpc.UnaryClientInterceptor{
155152
errorInterceptor,
156153
// Report aggregated metrics for the call, this is done outside of the retry loop.
157-
metrics.NewGRPCInterceptor(metricsHandler, ""),
154+
metrics.NewGRPCInterceptor(clientOptions.MetricsHandler, "", clientOptions.DisableErrorCodeMetricTags),
158155
// By default the grpc retry interceptor *is disabled*, preventing accidental use of retries.
159156
// We add call options for retry configuration based on the values present in the context.
160157
retry.NewRetryOptionsInterceptor(excludeInternalFromRetry),
161158
// Performs retries *IF* retry options are set for the call.
162159
grpc_retry.UnaryClientInterceptor(),
163160
// Report metrics for every call made to the server.
164-
metrics.NewGRPCInterceptor(metricsHandler, attemptSuffix),
161+
metrics.NewGRPCInterceptor(clientOptions.MetricsHandler, attemptSuffix, clientOptions.DisableErrorCodeMetricTags),
165162
}
166-
if headersProvider != nil {
167-
interceptors = append(interceptors, headersProviderInterceptor(headersProvider))
163+
if clientOptions.HeadersProvider != nil {
164+
interceptors = append(interceptors, headersProviderInterceptor(clientOptions.HeadersProvider))
168165
}
169-
if controller != nil {
170-
interceptors = append(interceptors, trafficControllerInterceptor(controller))
166+
if clientOptions.TrafficController != nil {
167+
interceptors = append(interceptors, trafficControllerInterceptor(clientOptions.TrafficController))
171168
}
172169
// Add credentials interceptor. This is intentionally added after headers
173170
// provider to overwrite anything set there.
174-
if credentials != nil {
175-
if interceptor := credentials.gRPCInterceptor(); interceptor != nil {
171+
if clientOptions.Credentials != nil {
172+
if interceptor := clientOptions.Credentials.gRPCInterceptor(); interceptor != nil {
176173
interceptors = append(interceptors, interceptor)
177174
}
178175
}

‎internal/grpc_dialer_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -128,13 +128,13 @@ func TestHeadersProvider_Error(t *testing.T) {
128128
}
129129

130130
func TestHeadersProvider_NotIncludedWhenNil(t *testing.T) {
131-
interceptors := requiredInterceptors(nil, nil, nil, nil, nil)
131+
interceptors := requiredInterceptors(&ClientOptions{}, nil)
132132
require.Equal(t, 5, len(interceptors))
133133
}
134134

135135
func TestHeadersProvider_IncludedWithHeadersProvider(t *testing.T) {
136-
interceptors := requiredInterceptors(nil,
137-
authHeadersProvider{token: "test-auth-token"}, nil, nil, nil)
136+
opts := &ClientOptions{HeadersProvider: authHeadersProvider{token: "test-auth-token"}}
137+
interceptors := requiredInterceptors(opts, nil)
138138
require.Equal(t, 6, len(interceptors))
139139
}
140140

‎test/integration_test.go

+12
Original file line numberDiff line numberDiff line change
@@ -4828,6 +4828,18 @@ func (ts *IntegrationTestSuite) TestNondeterministicUpdateRegistertion() {
48284828
ts.EqualValues(expected, ts.activities.invoked())
48294829
}
48304830

4831+
func (ts *IntegrationTestSuite) TestRequestFailureMetric() {
4832+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
4833+
defer cancel()
4834+
4835+
// Unset namespace field will cause an invalid argument error
4836+
_, _ = ts.client.WorkflowService().DescribeNamespace(ctx, &workflowservice.DescribeNamespaceRequest{})
4837+
4838+
ts.assertMetricCount(metrics.TemporalRequestFailure, 1,
4839+
metrics.OperationTagName, "DescribeNamespace",
4840+
metrics.RequestFailureCode, "INVALID_ARGUMENT")
4841+
}
4842+
48314843
// executeWorkflow executes a given workflow and waits for the result
48324844
func (ts *IntegrationTestSuite) executeWorkflow(
48334845
wfID string, wfFunc interface{}, retValPtr interface{}, args ...interface{},

0 commit comments

Comments
 (0)
Please sign in to comment.