feat(prometheus): robust label validation and registration handling

This commit is contained in:
Marc Schäfer
2026-05-04 00:12:24 +02:00
parent 73d4d4d37c
commit 191b4fa26a
2 changed files with 222 additions and 39 deletions

View File

@@ -7,6 +7,7 @@ package prometheus
import (
"context"
"log"
"net/http"
"github.com/prometheus/client_golang/prometheus"
@@ -30,9 +31,10 @@ type Config struct {
// in the backend-specific instrument types that implement the observability
// instrument interfaces.
type Backend struct {
cfg Config
registry *prometheus.Registry
handler http.Handler
cfg Config
registry *prometheus.Registry
handler http.Handler
droppedSamplesCounter prometheus.Counter
}
// New creates and initialises a Prometheus backend.
@@ -48,6 +50,11 @@ func New(cfg Config) (*Backend, error) {
}
registry := prometheus.NewRegistry()
droppedSamplesCounter := prometheus.NewCounter(prometheus.CounterOpts{
Name: "gerbil_dropped_metric_samples_total",
Help: "Total number of metric samples dropped due to invalid labels or unsupported label sets",
})
registry.MustRegister(droppedSamplesCounter)
// Include Go and process metrics by default.
includeGo := cfg.IncludeGoMetrics == nil || *cfg.IncludeGoMetrics
@@ -62,7 +69,7 @@ func New(cfg Config) (*Backend, error) {
EnableOpenMetrics: false,
})
return &Backend{cfg: cfg, registry: registry, handler: handler}, nil
return &Backend{cfg: cfg, registry: registry, handler: handler, droppedSamplesCounter: droppedSamplesCounter}, nil
}
// HTTPHandler returns the Prometheus /metrics HTTP handler.
@@ -78,60 +85,107 @@ func (b *Backend) Shutdown(_ context.Context) error {
}
// NewCounter creates a Prometheus CounterVec registered on the backend's registry.
func (b *Backend) NewCounter(name, desc string, labelNames ...string) *Counter {
func (b *Backend) NewCounter(name, desc string, labelNames ...string) (*Counter, error) {
vec := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &Counter{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.CounterVec)
if !ok {
return nil, err
}
return &Counter{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Counter{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewUpDownCounter creates a Prometheus GaugeVec (Prometheus gauges are
// bidirectional) registered on the backend's registry.
func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) *UpDownCounter {
func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) (*UpDownCounter, error) {
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &UpDownCounter{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.GaugeVec)
if !ok {
return nil, err
}
return &UpDownCounter{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &UpDownCounter{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewInt64Gauge creates a Prometheus GaugeVec registered on the backend's registry.
func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) *Int64Gauge {
func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) (*Int64Gauge, error) {
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &Int64Gauge{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.GaugeVec)
if !ok {
return nil, err
}
return &Int64Gauge{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Int64Gauge{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewFloat64Gauge creates a Prometheus GaugeVec registered on the backend's registry.
func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) *Float64Gauge {
func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) (*Float64Gauge, error) {
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &Float64Gauge{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.GaugeVec)
if !ok {
return nil, err
}
return &Float64Gauge{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Float64Gauge{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewHistogram creates a Prometheus HistogramVec registered on the backend's registry.
func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) *Histogram {
func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (*Histogram, error) {
vec := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: name,
Help: desc,
Buckets: buckets,
}, labelNames)
b.registry.MustRegister(vec)
return &Histogram{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.HistogramVec)
if !ok {
return nil, err
}
return &Histogram{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Histogram{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// Counter is a native Prometheus counter instrument.
type Counter struct {
vec *prometheus.CounterVec
vec *prometheus.CounterVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Add increments the counter by value for the given labels.
@@ -139,47 +193,118 @@ type Counter struct {
// value must be non-negative. Negative values are ignored.
func (c *Counter) Add(_ context.Context, value int64, labels map[string]string) {
if value < 0 {
log.Printf("WARN: counter add called with negative value=%d labels=%v expected_labels=%v", value, labels, c.labelNames)
return
}
c.vec.With(prometheus.Labels(labels)).Add(float64(value))
normalized, ok := normalizeLabels(c.labelNames, labels, c.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("counter", c.labelNames, labels)
c.vec.With(normalized).Add(float64(value))
}
// UpDownCounter is a native Prometheus gauge used as a bidirectional counter.
type UpDownCounter struct {
vec *prometheus.GaugeVec
vec *prometheus.GaugeVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Add adjusts the gauge by value for the given labels.
func (u *UpDownCounter) Add(_ context.Context, value int64, labels map[string]string) {
u.vec.With(prometheus.Labels(labels)).Add(float64(value))
normalized, ok := normalizeLabels(u.labelNames, labels, u.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("updown", u.labelNames, labels)
u.vec.With(normalized).Add(float64(value))
}
// Int64Gauge is a native Prometheus gauge recording integer snapshot values.
type Int64Gauge struct {
vec *prometheus.GaugeVec
vec *prometheus.GaugeVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Record sets the gauge to value for the given labels.
func (g *Int64Gauge) Record(_ context.Context, value int64, labels map[string]string) {
g.vec.With(prometheus.Labels(labels)).Set(float64(value))
normalized, ok := normalizeLabels(g.labelNames, labels, g.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("int64-gauge", g.labelNames, labels)
g.vec.With(normalized).Set(float64(value))
}
// Float64Gauge is a native Prometheus gauge recording float snapshot values.
type Float64Gauge struct {
vec *prometheus.GaugeVec
vec *prometheus.GaugeVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Record sets the gauge to value for the given labels.
func (g *Float64Gauge) Record(_ context.Context, value float64, labels map[string]string) {
g.vec.With(prometheus.Labels(labels)).Set(value)
normalized, ok := normalizeLabels(g.labelNames, labels, g.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("float64-gauge", g.labelNames, labels)
g.vec.With(normalized).Set(value)
}
// Histogram is a native Prometheus histogram instrument.
type Histogram struct {
vec *prometheus.HistogramVec
vec *prometheus.HistogramVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Record observes value for the given labels.
func (h *Histogram) Record(_ context.Context, value float64, labels map[string]string) {
h.vec.With(prometheus.Labels(labels)).Observe(value)
normalized, ok := normalizeLabels(h.labelNames, labels, h.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("histogram", h.labelNames, labels)
h.vec.With(normalized).Observe(value)
}
func normalizeLabels(labelNames []string, labels map[string]string, droppedSamplesCounter prometheus.Counter) (prometheus.Labels, bool) {
if len(labelNames) == 0 {
if len(labels) > 0 {
if droppedSamplesCounter != nil {
droppedSamplesCounter.Inc()
}
log.Printf("WARN: dropping metric sample due to unexpected labels: got=%v expected=none", labels)
return nil, false
}
return nil, true
}
normalized := make(prometheus.Labels, len(labelNames))
for _, name := range labelNames {
normalized[name] = ""
}
for k, v := range labels {
if _, ok := normalized[k]; !ok {
if droppedSamplesCounter != nil {
droppedSamplesCounter.Inc()
}
log.Printf("WARN: dropping metric sample due to unexpected label key %q (expected=%v)", k, labelNames)
return nil, false
}
normalized[k] = v
}
return normalized, true
}
func guardMetricPanic(kind string, expected []string, labels map[string]string) {
if recovered := recover(); recovered != nil {
log.Printf("WARN: dropped %s metric sample due to label panic: expected=%v got=%v err=%v", kind, expected, labels, recovered)
}
}

View File

@@ -36,7 +36,10 @@ func TestPrometheusBackendShutdown(t *testing.T) {
func TestPrometheusBackendCounter(t *testing.T) {
b := newTestBackend(t)
c := b.NewCounter("test_counter_total", "A test counter", "result")
c, err := b.NewCounter("test_counter_total", "A test counter", "result")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
c.Add(context.Background(), 3, map[string]string{"result": "ok"})
body := scrapeMetrics(t, b)
@@ -45,7 +48,10 @@ func TestPrometheusBackendCounter(t *testing.T) {
func TestPrometheusBackendUpDownCounter(t *testing.T) {
b := newTestBackend(t)
u := b.NewUpDownCounter("test_gauge_total", "A test up-down counter", "state")
u, err := b.NewUpDownCounter("test_gauge_total", "A test up-down counter", "state")
if err != nil {
t.Fatalf("NewUpDownCounter returned error: %v", err)
}
u.Add(context.Background(), 5, map[string]string{"state": "active"})
u.Add(context.Background(), -2, map[string]string{"state": "active"})
@@ -55,7 +61,10 @@ func TestPrometheusBackendUpDownCounter(t *testing.T) {
func TestPrometheusBackendInt64Gauge(t *testing.T) {
b := newTestBackend(t)
g := b.NewInt64Gauge("test_int_gauge", "An integer gauge", "ifname")
g, err := b.NewInt64Gauge("test_int_gauge", "An integer gauge", "ifname")
if err != nil {
t.Fatalf("NewInt64Gauge returned error: %v", err)
}
g.Record(context.Background(), 42, map[string]string{"ifname": "wg0"})
body := scrapeMetrics(t, b)
@@ -64,7 +73,10 @@ func TestPrometheusBackendInt64Gauge(t *testing.T) {
func TestPrometheusBackendFloat64Gauge(t *testing.T) {
b := newTestBackend(t)
g := b.NewFloat64Gauge("test_float_gauge", "A float gauge", "cert")
g, err := b.NewFloat64Gauge("test_float_gauge", "A float gauge", "cert")
if err != nil {
t.Fatalf("NewFloat64Gauge returned error: %v", err)
}
g.Record(context.Background(), 7.5, map[string]string{"cert": "example.com"})
body := scrapeMetrics(t, b)
@@ -74,7 +86,10 @@ func TestPrometheusBackendFloat64Gauge(t *testing.T) {
func TestPrometheusBackendHistogram(t *testing.T) {
b := newTestBackend(t)
buckets := []float64{0.1, 0.5, 1.0, 5.0}
h := b.NewHistogram("test_duration_seconds", "A test histogram", buckets, "method")
h, err := b.NewHistogram("test_duration_seconds", "A test histogram", buckets, "method")
if err != nil {
t.Fatalf("NewHistogram returned error: %v", err)
}
h.Record(context.Background(), 0.3, map[string]string{"method": "GET"})
body := scrapeMetrics(t, b)
@@ -85,7 +100,10 @@ func TestPrometheusBackendHistogram(t *testing.T) {
func TestPrometheusBackendMultipleLabels(t *testing.T) {
b := newTestBackend(t)
c := b.NewCounter("multi_label_total", "Multi-label counter", "method", "route", "status_code")
c, err := b.NewCounter("multi_label_total", "Multi-label counter", "method", "route", "status_code")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
c.Add(context.Background(), 1, map[string]string{
"method": "POST",
"route": "/api/peers",
@@ -122,23 +140,29 @@ func TestPrometheusBackendNoGoMetrics(t *testing.T) {
func TestPrometheusBackendNilLabels(t *testing.T) {
// Adding with nil labels should not panic (treated as empty map).
b := newTestBackend(t)
c := b.NewCounter("nil_labels_total", "counter with no labels")
c, err := b.NewCounter("nil_labels_total", "counter with no labels")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
// nil labels with no label names declared should be safe
c.Add(context.Background(), 1, nil)
}
func TestPrometheusBackendConcurrentAdd(t *testing.T) {
b := newTestBackend(t)
c := b.NewCounter("concurrent_total", "concurrent counter", "worker")
c, err := b.NewCounter("concurrent_total", "concurrent counter", "worker")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
done := make(chan struct{})
for i := 0; i < 10; i++ {
go func(_ int) {
go func() {
for j := 0; j < 100; j++ {
c.Add(context.Background(), 1, map[string]string{"worker": "w"})
}
done <- struct{}{}
}(i)
}()
}
for i := 0; i < 10; i++ {
<-done
@@ -148,6 +172,40 @@ func TestPrometheusBackendConcurrentAdd(t *testing.T) {
assertMetricPresent(t, body, `concurrent_total{worker="w"} 1000`)
}
func TestPrometheusBackendAlreadyRegisteredCounter(t *testing.T) {
b := newTestBackend(t)
c1, err := b.NewCounter("dupe_counter_total", "duplicate counter", "result")
if err != nil {
t.Fatalf("first NewCounter returned error: %v", err)
}
c2, err := b.NewCounter("dupe_counter_total", "duplicate counter", "result")
if err != nil {
t.Fatalf("second NewCounter returned error: %v", err)
}
c1.Add(context.Background(), 1, map[string]string{"result": "ok"})
c2.Add(context.Background(), 2, map[string]string{"result": "ok"})
body := scrapeMetrics(t, b)
assertMetricPresent(t, body, `dupe_counter_total{result="ok"} 3`)
}
func TestPrometheusBackendInvalidLabelsNoPanic(t *testing.T) {
b := newTestBackend(t)
c, err := b.NewCounter("invalid_labels_total", "invalid labels test", "result")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
// Extra label key should be dropped and must not panic.
c.Add(context.Background(), 5, map[string]string{"result": "ok", "unexpected": "x"})
body := scrapeMetrics(t, b)
if strings.Contains(body, `invalid_labels_total{result="ok"}`) {
t.Error("invalid label sample should have been dropped")
}
}
// --- helpers ---
func scrapeMetrics(t *testing.T, b *obsprom.Backend) string {