Skip to content

Commit 14f9955

Browse files
authored
Fix sample stats for step invariant and subquery (#506)
1 parent 2414c3d commit 14f9955

29 files changed

+409
-184
lines changed

engine/engine.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import (
1010
"sort"
1111
"time"
1212

13+
"github.com/thanos-io/promql-engine/execution/telemetry"
14+
1315
"github.com/efficientgo/core/errors"
1416
"github.com/go-kit/log"
1517
"github.com/go-kit/log/level"
@@ -305,6 +307,9 @@ func (e *Engine) MakeInstantQuery(ctx context.Context, q storage.Queryable, opts
305307
t: InstantQuery,
306308
resultSort: resultSort,
307309
scanners: scanners,
310+
start: ts,
311+
end: ts,
312+
step: 0,
308313
}, nil
309314
}
310315

@@ -352,6 +357,9 @@ func (e *Engine) MakeInstantQueryFromPlan(ctx context.Context, q storage.Queryab
352357
// TODO(fpetkovski): Infer the sort order from the plan, ideally without copying the newResultSort function.
353358
resultSort: noSortResultSort{},
354359
scanners: scnrs,
360+
start: ts,
361+
end: ts,
362+
step: 0,
355363
}, nil
356364
}
357365

@@ -404,6 +412,9 @@ func (e *Engine) MakeRangeQuery(ctx context.Context, q storage.Queryable, opts *
404412
warns: warns,
405413
t: RangeQuery,
406414
scanners: scnrs,
415+
start: start,
416+
end: end,
417+
step: step,
407418
}, nil
408419
}
409420

@@ -446,6 +457,9 @@ func (e *Engine) MakeRangeQueryFromPlan(ctx context.Context, q storage.Queryable
446457
warns: warns,
447458
t: RangeQuery,
448459
scanners: scnrs,
460+
start: start,
461+
end: end,
462+
step: step,
449463
}, nil
450464
}
451465

@@ -522,7 +536,7 @@ func (q *Query) Explain() *ExplainOutputNode {
522536
}
523537

524538
func (q *Query) Analyze() *AnalyzeOutputNode {
525-
if observableRoot, ok := q.exec.(model.ObservableVectorOperator); ok {
539+
if observableRoot, ok := q.exec.(telemetry.ObservableVectorOperator); ok {
526540
return analyzeQuery(observableRoot)
527541
}
528542
return nil
@@ -534,6 +548,9 @@ type compatibilityQuery struct {
534548
plan logicalplan.Plan
535549
ts time.Time // Empty for range queries.
536550
warns annotations.Annotations
551+
start time.Time
552+
end time.Time
553+
step time.Duration
537554

538555
t QueryType
539556
resultSort resultSorter
@@ -707,6 +724,10 @@ func (q *compatibilityQuery) Stats() *stats.Statistics {
707724

708725
analysis := q.Analyze()
709726
samples := stats.NewQuerySamples(enablePerStepStats)
727+
if enablePerStepStats {
728+
samples.InitStepTracking(q.start.UnixMilli(), q.end.UnixMilli(), telemetry.StepTrackingInterval(q.step))
729+
}
730+
710731
if analysis != nil {
711732
samples.PeakSamples = int(analysis.PeakSamples())
712733
samples.TotalSamples = analysis.TotalSamples()

engine/engine_test.go

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4814,6 +4814,47 @@ func TestQueryStats(t *testing.T) {
48144814
end: time.Unix(1800, 0),
48154815
step: time.Second * 30,
48164816
},
4817+
{
4818+
name: "step invariant with samples",
4819+
load: `load 5m
4820+
http_requests_total{pod="nginx-1"} 1+1x5
4821+
http_requests_total{pod="nginx-2"} 1+2x5`,
4822+
query: `sum without (__name__) (http_requests_total @ end())`,
4823+
start: time.Unix(1, 0),
4824+
end: time.Unix(600, 0),
4825+
step: time.Second * 34,
4826+
},
4827+
{
4828+
name: "step invariant without samples",
4829+
load: `load 30s
4830+
http_requests_total{pod="nginx-1"} 1.00+1.00x15
4831+
http_requests_total{pod="nginx-2"} 1+2.00x21`,
4832+
query: `pi()`,
4833+
start: time.UnixMilli(0),
4834+
end: time.UnixMilli(120000),
4835+
step: time.Second * 30,
4836+
},
4837+
{
4838+
name: "fuzz subquery without enough samples",
4839+
load: `load 30s
4840+
http_requests_total{pod="nginx-1"} 1.00+1.00x15
4841+
http_requests_total{pod="nginx-2"} 1+2.00x21`,
4842+
query: `rate({__name__="http_requests_total"} offset -6s[1h:1m] offset 1m29s)`,
4843+
start: time.UnixMilli(0),
4844+
end: time.UnixMilli(120000),
4845+
step: time.Second * 30,
4846+
},
4847+
// TODO (harry671003): This is a known case which needs to be fixed upstream.
4848+
//{
4849+
// name: "fuzz aggregation with scalar param",
4850+
// load: `load 30s
4851+
// http_requests_total{pod="nginx-1"} -77.00+1.00x15
4852+
// http_requests_total{pod="nginx-2"} 1+0.67x21`,
4853+
// query: `quantile without (pod) (scalar({__name__="http_requests_total"} offset 2m58s), {__name__="http_requests_total"})`,
4854+
// start: time.UnixMilli(0),
4855+
// end: time.UnixMilli(221000),
4856+
// step: time.Second * 30,
4857+
//},
48174858
}
48184859

48194860
for _, tc := range cases {
@@ -4825,6 +4866,8 @@ func TestQueryStats(t *testing.T) {
48254866
Timeout: 300 * time.Second,
48264867
MaxSamples: math.MaxInt64,
48274868
EnablePerStepStats: true,
4869+
EnableAtModifier: true,
4870+
EnableNegativeOffset: true,
48284871
NoStepSubqueryIntervalFn: func(rangeMillis int64) int64 { return 30 * time.Second.Milliseconds() },
48294872
}
48304873
qOpts := promql.NewPrometheusQueryOpts(true, 5*time.Minute)
@@ -4851,8 +4894,9 @@ func TestQueryStats(t *testing.T) {
48514894
stats.NewQueryStats(newStats)
48524895

48534896
testutil.WithGoCmp(comparer).Equals(t, oldResult, newResult)
4854-
testutil.Equals(t, oldStats.Samples.TotalSamples, newStats.Samples.TotalSamples)
4855-
testutil.Equals(t, oldStats.Samples.TotalSamplesPerStep, newStats.Samples.TotalSamplesPerStep)
4897+
if oldResult.Err == nil {
4898+
testutil.WithGoCmp(samplesComparer).Equals(t, oldStats.Samples, newStats.Samples)
4899+
}
48564900

48574901
// Range query
48584902
oldQ, err = oldEngine.NewRangeQuery(ctx, storage, qOpts, tc.query, tc.start, tc.end, tc.step)
@@ -4868,8 +4912,9 @@ func TestQueryStats(t *testing.T) {
48684912
stats.NewQueryStats(newStats)
48694913

48704914
testutil.WithGoCmp(comparer).Equals(t, oldResult, newResult)
4871-
testutil.Equals(t, oldStats.Samples.TotalSamples, newStats.Samples.TotalSamples)
4872-
testutil.Equals(t, oldStats.Samples.TotalSamplesPerStep, newStats.Samples.TotalSamplesPerStep)
4915+
if oldResult.Err == nil {
4916+
testutil.WithGoCmp(samplesComparer).Equals(t, oldStats.Samples, newStats.Samples)
4917+
}
48734918
})
48744919
}
48754920
}
@@ -5727,6 +5772,24 @@ var (
57275772
}
57285773
return false
57295774
})
5775+
5776+
samplesComparer = cmp.Comparer(func(x, y *stats.QuerySamples) bool {
5777+
if x == nil && y == nil {
5778+
return true
5779+
}
5780+
if x.TotalSamples != y.TotalSamples {
5781+
return false
5782+
}
5783+
5784+
if !cmp.Equal(x.TotalSamplesPerStep, y.TotalSamplesPerStep) {
5785+
return false
5786+
}
5787+
5788+
if !cmp.Equal(x.TotalSamplesPerStepMap(), y.TotalSamplesPerStepMap()) {
5789+
return false
5790+
}
5791+
return true
5792+
})
57305793
)
57315794

57325795
func queryExplanation(q promql.Query) string {

engine/enginefuzz_test.go

Lines changed: 88 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/prometheus/prometheus/promql/parser"
2121
"github.com/prometheus/prometheus/promql/promqltest"
2222
"github.com/prometheus/prometheus/storage"
23+
"github.com/prometheus/prometheus/util/stats"
2324
"github.com/prometheus/prometheus/util/teststorage"
2425
"github.com/stretchr/testify/require"
2526

@@ -32,9 +33,31 @@ import (
3233
const testRuns = 100
3334

3435
type testCase struct {
35-
query string
36-
loads []string
37-
oldRes, newRes *promql.Result
36+
query string
37+
loads []string
38+
oldRes, newRes *promql.Result
39+
oldStats, newStats *stats.Statistics
40+
start, end time.Time
41+
interval time.Duration
42+
validateSamples bool
43+
}
44+
45+
// shouldValidateSamples checks if the samples can be compared for the expr.
46+
// For certain known cases, prometheus engine and thanos engine returns different samples.
47+
func shouldValidateSamples(expr parser.Expr) bool {
48+
valid := true
49+
50+
parser.Inspect(expr, func(node parser.Node, path []parser.Node) error {
51+
switch n := node.(type) {
52+
case *parser.Call:
53+
if n.Func.Name == "scalar" {
54+
valid = false
55+
return errors.New("error")
56+
}
57+
}
58+
return nil
59+
})
60+
return valid
3861
}
3962

4063
func FuzzEnginePromQLSmithRangeQuery(f *testing.F) {
@@ -61,7 +84,9 @@ func FuzzEnginePromQLSmithRangeQuery(f *testing.F) {
6184
MaxSamples: 1e10,
6285
EnableNegativeOffset: true,
6386
EnableAtModifier: true,
87+
EnablePerStepStats: true,
6488
}
89+
qOpts := promql.NewPrometheusQueryOpts(true, 0)
6590

6691
storage := promqltest.LoadedStorage(t, load)
6792
defer storage.Close()
@@ -80,22 +105,22 @@ func FuzzEnginePromQLSmithRangeQuery(f *testing.F) {
80105
}
81106
ps := promqlsmith.New(rnd, seriesSet, psOpts...)
82107

83-
newEngine := engine.New(engine.Opts{EngineOpts: opts, DisableFallback: true})
108+
newEngine := engine.New(engine.Opts{EngineOpts: opts, DisableFallback: true, EnableAnalysis: true})
84109
oldEngine := promql.NewEngine(opts)
85110

86111
var (
87-
q1 promql.Query
88-
query string
112+
q1 promql.Query
113+
query string
114+
validateSamples bool
89115
)
90116
cases := make([]*testCase, testRuns)
91117
for i := 0; i < testRuns; i++ {
92-
// Since we disabled fallback, keep trying until we find a query
93-
// that can be natively executed by the engine.
94-
// Parsing experimental function, like mad_over_time, will lead to a parser.ParseErrors, so we also ignore those.
95118
for {
96119
expr := ps.WalkRangeQuery()
120+
validateSamples = shouldValidateSamples(expr)
121+
97122
query = expr.Pretty(0)
98-
q1, err = newEngine.NewRangeQuery(context.Background(), storage, nil, query, start, end, interval)
123+
q1, err = newEngine.NewRangeQuery(context.Background(), storage, qOpts, query, start, end, interval)
99124
if errors.Is(err, parse.ErrNotSupportedExpr) || errors.Is(err, parse.ErrNotImplemented) || errors.As(err, &parser.ParseErrors{}) {
100125
continue
101126
} else {
@@ -105,17 +130,27 @@ func FuzzEnginePromQLSmithRangeQuery(f *testing.F) {
105130

106131
testutil.Ok(t, err)
107132
newResult := q1.Exec(context.Background())
133+
newStats := q1.Stats()
134+
stats.NewQueryStats(newStats)
108135

109-
q2, err := oldEngine.NewRangeQuery(context.Background(), storage, nil, query, start, end, interval)
136+
q2, err := oldEngine.NewRangeQuery(context.Background(), storage, qOpts, query, start, end, interval)
110137
testutil.Ok(t, err)
111138

112139
oldResult := q2.Exec(context.Background())
140+
oldStats := q2.Stats()
141+
stats.NewQueryStats(oldStats)
113142

114143
cases[i] = &testCase{
115-
query: query,
116-
newRes: newResult,
117-
oldRes: oldResult,
118-
loads: []string{load},
144+
query: query,
145+
newRes: newResult,
146+
newStats: newStats,
147+
oldRes: oldResult,
148+
oldStats: oldStats,
149+
loads: []string{load},
150+
start: start,
151+
end: end,
152+
interval: interval,
153+
validateSamples: validateSamples,
119154
}
120155
}
121156
validateTestCases(t, cases)
@@ -141,7 +176,9 @@ func FuzzEnginePromQLSmithInstantQuery(f *testing.F) {
141176
MaxSamples: 1e10,
142177
EnableNegativeOffset: true,
143178
EnableAtModifier: true,
179+
EnablePerStepStats: true,
144180
}
181+
qOpts := promql.NewPrometheusQueryOpts(true, 0)
145182

146183
storage := promqltest.LoadedStorage(t, load)
147184
defer storage.Close()
@@ -151,6 +188,7 @@ func FuzzEnginePromQLSmithInstantQuery(f *testing.F) {
151188
EngineOpts: opts,
152189
DisableFallback: true,
153190
LogicalOptimizers: logicalplan.AllOptimizers,
191+
EnableAnalysis: true,
154192
})
155193
oldEngine := promql.NewEngine(opts)
156194

@@ -176,8 +214,11 @@ func FuzzEnginePromQLSmithInstantQuery(f *testing.F) {
176214
// Parsing experimental function, like mad_over_time, will lead to a parser.ParseErrors, so we also ignore those.
177215
for {
178216
expr := ps.WalkInstantQuery()
217+
if !shouldValidateSamples(expr) {
218+
continue
219+
}
179220
query = expr.Pretty(0)
180-
q1, err = newEngine.NewInstantQuery(context.Background(), storage, nil, query, queryTime)
221+
q1, err = newEngine.NewInstantQuery(context.Background(), storage, qOpts, query, queryTime)
181222
if errors.Is(err, parse.ErrNotSupportedExpr) || errors.Is(err, parse.ErrNotImplemented) || errors.As(err, &parser.ParseErrors{}) {
182223
continue
183224
} else {
@@ -187,17 +228,26 @@ func FuzzEnginePromQLSmithInstantQuery(f *testing.F) {
187228

188229
testutil.Ok(t, err)
189230
newResult := q1.Exec(context.Background())
231+
newStats := q1.Stats()
232+
stats.NewQueryStats(newStats)
190233

191-
q2, err := oldEngine.NewInstantQuery(context.Background(), storage, nil, query, queryTime)
234+
q2, err := oldEngine.NewInstantQuery(context.Background(), storage, qOpts, query, queryTime)
192235
testutil.Ok(t, err)
193236

194237
oldResult := q2.Exec(context.Background())
238+
oldStats := q2.Stats()
239+
stats.NewQueryStats(oldStats)
195240

196241
cases[i] = &testCase{
197-
query: query,
198-
newRes: newResult,
199-
oldRes: oldResult,
200-
loads: []string{load},
242+
query: query,
243+
newRes: newResult,
244+
newStats: newStats,
245+
oldRes: oldResult,
246+
oldStats: oldStats,
247+
loads: []string{load},
248+
start: queryTime,
249+
end: queryTime,
250+
validateSamples: true,
201251
}
202252
}
203253
validateTestCases(t, cases)
@@ -444,14 +494,27 @@ func getSeries(ctx context.Context, q storage.Queryable) ([]labels.Labels, error
444494

445495
func validateTestCases(t *testing.T, cases []*testCase) {
446496
failures := 0
497+
logQuery := func(c *testCase) {
498+
for _, load := range c.loads {
499+
t.Logf(load)
500+
}
501+
t.Logf("query: %s, start: %d, end: %d, interval: %v", c.query, c.start.UnixMilli(), c.end.UnixMilli(), c.interval)
502+
}
447503
for i, c := range cases {
448504
if !cmp.Equal(c.oldRes, c.newRes, comparer) {
449-
for _, load := range c.loads {
450-
t.Logf(load)
451-
}
452-
t.Logf(c.query)
505+
logQuery(c)
453506

454507
t.Logf("case %d error mismatch.\nnew result: %s\nold result: %s\n", i, c.newRes.String(), c.oldRes.String())
508+
//failures++
509+
continue
510+
}
511+
if !c.validateSamples || c.oldRes.Err != nil {
512+
// Skip sample comparison
513+
continue
514+
}
515+
if !cmp.Equal(c.oldStats.Samples, c.newStats.Samples, samplesComparer) {
516+
logQuery(c)
517+
t.Logf("case: %d, samples mismatch. total samples: old: %v, new: %v. samples per step: old: %v, new: %v", i, c.oldStats.Samples.TotalSamples, c.newStats.Samples.TotalSamples, c.oldStats.Samples.TotalSamplesPerStep, c.newStats.Samples.TotalSamplesPerStep)
455518
failures++
456519
}
457520
}

0 commit comments

Comments
 (0)