From c65b7ed24f1a960be000b49e2db51c3615c4e8ea Mon Sep 17 00:00:00 2001 From: "Jonathan A. Sternberg" Date: Tue, 30 Jan 2024 13:23:30 -0600 Subject: [PATCH] otel: include service instance id attribute to resource and move to metricutil package Add the service instance id to the resource attributes to prevent downstream OTEL processors and exporters from thinking that the CLI invocations are a single process that keeps restarting. The unique id can be removed through downstream aggregation to prevent cardinality issues, but we need some way to tell OTEL that it shouldn't reset the counters. Move the check for the experimental flag to its own package and then use that invocation to prevent creating exporters so metrics are disabled completely. This makes it so we don't have to check for the experimental flag in every place we add metrics until we decide to make metrics stable in general. This also moves the OTEL initialization to a `util/metricutil` package to be more consistent with the existing util naming and to differentiate it from the upstream `metric` name. Using both `metrics` and `metric` as import names was confusing since `metric` was an upstream dependency and `metrics` was a local utility. `metricutil` matches with the existing utilities and makes clear that it isn't a spelling mistake. The record version metric has been removed since we weren't planning on keeping that metric anyway and most of the information is now included in the instrumentation library name and version. That function is included as a utility in the `otel/sdk/metric` package to retrieve the appropriate meter from the meter provider. Signed-off-by: Jonathan A. Sternberg --- commands/bake.go | 9 -- commands/build.go | 39 +----- go.mod | 2 +- util/confutil/exp.go | 15 +++ .../metrics.go => metricutil/metric.go} | 126 +++++++++++------- util/{metrics => metricutil}/otlp.go | 10 +- util/metricutil/resource.go | 53 ++++++++ 7 files changed, 151 insertions(+), 103 deletions(-) create mode 100644 util/confutil/exp.go rename util/{metrics/metrics.go => metricutil/metric.go} (60%) rename util/{metrics => metricutil}/otlp.go (86%) create mode 100644 util/metricutil/resource.go diff --git a/commands/bake.go b/commands/bake.go index e17a8ebc..565a72b5 100644 --- a/commands/bake.go +++ b/commands/bake.go @@ -19,7 +19,6 @@ import ( "github.com/docker/buildx/util/confutil" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/dockerutil" - "github.com/docker/buildx/util/metrics" "github.com/docker/buildx/util/progress" "github.com/docker/buildx/util/tracing" "github.com/docker/cli/cli/command" @@ -43,14 +42,6 @@ type bakeOptions struct { } func runBake(ctx context.Context, dockerCli command.Cli, targets []string, in bakeOptions, cFlags commonFlags) (err error) { - mp, report, err := metrics.MeterProvider(dockerCli) - if err != nil { - return err - } - defer report() - - recordVersionInfo(mp, "bake") - ctx, end, err := tracing.TraceCurrentCommand(ctx, "bake") if err != nil { return err diff --git a/commands/build.go b/commands/build.go index e142befd..843c6a07 100644 --- a/commands/build.go +++ b/commands/build.go @@ -30,10 +30,9 @@ import ( "github.com/docker/buildx/util/cobrautil" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/ioset" - "github.com/docker/buildx/util/metrics" + "github.com/docker/buildx/util/metricutil" "github.com/docker/buildx/util/progress" "github.com/docker/buildx/util/tracing" - "github.com/docker/buildx/version" "github.com/docker/cli-docs-tool/annotation" "github.com/docker/cli/cli" "github.com/docker/cli/cli/command" @@ -53,9 +52,6 @@ import ( "github.com/sirupsen/logrus" "github.com/spf13/cobra" "github.com/spf13/pflag" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" "google.golang.org/grpc/codes" ) @@ -216,13 +212,11 @@ func (o *buildOptions) toDisplayMode() (progressui.DisplayMode, error) { } func runBuild(ctx context.Context, dockerCli command.Cli, options buildOptions) (err error) { - mp, report, err := metrics.MeterProvider(dockerCli) + mp, err := metricutil.NewMeterProvider(ctx, dockerCli) if err != nil { return err } - defer report() - - recordVersionInfo(mp, "build") + defer mp.Report(context.Background()) ctx, end, err := tracing.TraceCurrentCommand(ctx, "build") if err != nil { @@ -936,30 +930,3 @@ func maybeJSONArray(v string) []string { } return []string{v} } - -func recordVersionInfo(mp metric.MeterProvider, command string) { - // Still in the process of testing/stabilizing these counters. - if !isExperimental() { - return - } - - meter := mp.Meter("github.com/docker/buildx", - metric.WithInstrumentationVersion(version.Version), - ) - - counter, err := meter.Int64Counter("docker.cli.count", - metric.WithDescription("Number of invocations of the docker buildx command."), - ) - if err != nil { - otel.Handle(err) - } - - counter.Add(context.Background(), 1, - metric.WithAttributes( - attribute.String("command", command), - attribute.String("package", version.Package), - attribute.String("version", version.Version), - attribute.String("revision", version.Revision), - ), - ) -} diff --git a/go.mod b/go.mod index 19049f90..98f14fed 100644 --- a/go.mod +++ b/go.mod @@ -42,6 +42,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.42.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.42.0 go.opentelemetry.io/otel/metric v1.19.0 + go.opentelemetry.io/otel/sdk v1.19.0 go.opentelemetry.io/otel/sdk/metric v1.19.0 go.opentelemetry.io/otel/trace v1.19.0 golang.org/x/mod v0.13.0 @@ -147,7 +148,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 // indirect go.opentelemetry.io/otel/exporters/prometheus v0.42.0 // indirect - go.opentelemetry.io/otel/sdk v1.19.0 // indirect go.opentelemetry.io/proto/otlp v1.0.0 // indirect golang.org/x/crypto v0.17.0 // indirect golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect diff --git a/util/confutil/exp.go b/util/confutil/exp.go new file mode 100644 index 00000000..03c6b97a --- /dev/null +++ b/util/confutil/exp.go @@ -0,0 +1,15 @@ +package confutil + +import ( + "os" + "strconv" +) + +// IsExperimental checks if the experimental flag has been configured. +func IsExperimental() bool { + if v, ok := os.LookupEnv("BUILDX_EXPERIMENTAL"); ok { + vv, _ := strconv.ParseBool(v) + return vv + } + return false +} diff --git a/util/metrics/metrics.go b/util/metricutil/metric.go similarity index 60% rename from util/metrics/metrics.go rename to util/metricutil/metric.go index 36e1cc62..ee8cbaa2 100644 --- a/util/metrics/metrics.go +++ b/util/metricutil/metric.go @@ -1,4 +1,4 @@ -package metrics +package metricutil import ( "context" @@ -7,8 +7,9 @@ import ( "path" "time" + "github.com/docker/buildx/util/confutil" + "github.com/docker/buildx/version" "github.com/docker/cli/cli/command" - "github.com/moby/buildkit/util/tracing/detect" "github.com/pkg/errors" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/metric" @@ -20,75 +21,86 @@ import ( const ( otelConfigFieldName = "otel" - shutdownTimeout = 2 * time.Second + reportTimeout = 2 * time.Second ) -// ReportFunc is invoked to signal the metrics should be sent to the -// desired endpoint. It should be invoked on application shutdown. -type ReportFunc func() +// MeterProvider holds a MeterProvider for metric generation and the configured +// exporters for reporting metrics from the CLI. +type MeterProvider struct { + metric.MeterProvider + reader *sdkmetric.ManualReader + exporters []sdkmetric.Exporter +} -// MeterProvider returns a MeterProvider suitable for CLI usage. -// The primary difference between this metric reader and a more typical -// usage is that metric reporting only happens once when ReportFunc -// is invoked. -func MeterProvider(cli command.Cli) (metric.MeterProvider, ReportFunc, error) { +// NewMeterProvider configures a MeterProvider from the CLI context. +func NewMeterProvider(ctx context.Context, cli command.Cli) (*MeterProvider, error) { var exps []sdkmetric.Exporter - if exp, err := dockerOtelExporter(cli); err != nil { - return nil, nil, err - } else if exp != nil { - exps = append(exps, exp) - } + // Only metric exporters if the experimental flag is set. + if confutil.IsExperimental() { + if exp, err := dockerOtelExporter(cli); err != nil { + return nil, err + } else if exp != nil { + exps = append(exps, exp) + } - if exp, err := detectOtlpExporter(context.Background()); err != nil { - return nil, nil, err - } else if exp != nil { - exps = append(exps, exp) + if exp, err := detectOtlpExporter(ctx); err != nil { + return nil, err + } else if exp != nil { + exps = append(exps, exp) + } } if len(exps) == 0 { // No exporters are configured so use a noop provider. - return noop.NewMeterProvider(), func() {}, nil + return &MeterProvider{ + MeterProvider: noop.NewMeterProvider(), + }, nil } - // Use delta temporality because, since this is a CLI program, we can never - // know the cumulative value. reader := sdkmetric.NewManualReader( sdkmetric.WithTemporalitySelector(deltaTemporality), ) mp := sdkmetric.NewMeterProvider( - sdkmetric.WithResource(detect.Resource()), + sdkmetric.WithResource(Resource()), sdkmetric.WithReader(reader), ) - return mp, reportFunc(reader, exps), nil + return &MeterProvider{ + MeterProvider: mp, + reader: reader, + exporters: exps, + }, nil } -// reportFunc returns a ReportFunc for collecting ResourceMetrics and then -// exporting them to the configured Exporter. -func reportFunc(reader sdkmetric.Reader, exps []sdkmetric.Exporter) ReportFunc { - return func() { - ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) - defer cancel() - - var rm metricdata.ResourceMetrics - if err := reader.Collect(ctx, &rm); err != nil { - // Error when collecting metrics. Do not send any. - return - } - - var eg errgroup.Group - for _, exp := range exps { - exp := exp - eg.Go(func() error { - _ = exp.Export(ctx, &rm) - _ = exp.Shutdown(ctx) - return nil - }) - } - - // Can't report an error because we don't allow it to. - _ = eg.Wait() +// Report exports metrics to the configured exporter. This should be done before the CLI +// exits. +func (m *MeterProvider) Report(ctx context.Context) { + if m.reader == nil { + // Not configured. + return } + + ctx, cancel := context.WithTimeout(ctx, reportTimeout) + defer cancel() + + var rm metricdata.ResourceMetrics + if err := m.reader.Collect(ctx, &rm); err != nil { + // Error when collecting metrics. Do not send any. + return + } + + var eg errgroup.Group + for _, exp := range m.exporters { + exp := exp + eg.Go(func() error { + _ = exp.Export(ctx, &rm) + _ = exp.Shutdown(ctx) + return nil + }) + } + + // Can't report an error because we don't allow it to. + _ = eg.Wait() } // dockerOtelExporter reads the CLI metadata to determine an OTLP exporter @@ -184,6 +196,20 @@ func otelExporterOtlpEndpoint(cli command.Cli) (string, error) { } // deltaTemporality sets the Temporality of every instrument to delta. +// +// This isn't really needed since we create a unique resource on each invocation, +// but it can help with cardinality concerns for downstream processors since they can +// perform aggregation for a time interval and then discard the data once that time +// period has passed. Cumulative temporality would imply to the downstream processor +// that they might receive a successive point and they may unnecessarily keep state +// they really shouldn't. func deltaTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + +// Meter returns a Meter from the MetricProvider that indicates the measurement +// comes from buildx with the appropriate version. +func Meter(mp metric.MeterProvider) metric.Meter { + return mp.Meter(version.Package, + metric.WithInstrumentationVersion(version.Version)) +} diff --git a/util/metrics/otlp.go b/util/metricutil/otlp.go similarity index 86% rename from util/metrics/otlp.go rename to util/metricutil/otlp.go index b121ac3c..c07d244e 100644 --- a/util/metrics/otlp.go +++ b/util/metricutil/otlp.go @@ -1,4 +1,4 @@ -package metrics +package metricutil import ( "context" @@ -35,13 +35,9 @@ func detectOtlpExporter(ctx context.Context) (sdkmetric.Exporter, error) { switch proto { case "grpc": - return otlpmetricgrpc.New(ctx, - otlpmetricgrpc.WithTemporalitySelector(deltaTemporality), - ) + return otlpmetricgrpc.New(ctx) case "http/protobuf": - return otlpmetrichttp.New(ctx, - otlpmetrichttp.WithTemporalitySelector(deltaTemporality), - ) + return otlpmetrichttp.New(ctx) // case "http/json": // unsupported by library default: return nil, errors.Errorf("unsupported otlp protocol %v", proto) diff --git a/util/metricutil/resource.go b/util/metricutil/resource.go new file mode 100644 index 00000000..65e95683 --- /dev/null +++ b/util/metricutil/resource.go @@ -0,0 +1,53 @@ +package metricutil + +import ( + "context" + "os" + "path/filepath" + "sync" + + "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.21.0" +) + +var ( + res *resource.Resource + resOnce sync.Once +) + +// Resource retrieves the OTEL resource for the buildx CLI. +func Resource() *resource.Resource { + resOnce.Do(func() { + var err error + res, err = resource.New(context.Background(), + resource.WithDetectors(serviceNameDetector{}), + resource.WithAttributes( + // Use a unique instance id so OTEL knows that each invocation + // of the CLI is its own instance. Without this, downstream + // OTEL processors may think the same process is restarting + // continuously and reset the metric counters. + semconv.ServiceInstanceID(uuid.New().String()), + ), + resource.WithFromEnv(), + resource.WithTelemetrySDK(), + ) + if err != nil { + otel.Handle(err) + } + }) + return res +} + +type serviceNameDetector struct{} + +func (serviceNameDetector) Detect(ctx context.Context) (*resource.Resource, error) { + return resource.StringDetector( + semconv.SchemaURL, + semconv.ServiceNameKey, + func() (string, error) { + return filepath.Base(os.Args[0]), nil + }, + ).Detect(ctx) +}