Skip to content

Commit

Permalink
Remove scoring from within the evaluation cause the eval should not c…
Browse files Browse the repository at this point in the history
…are how things are scored
  • Loading branch information
bauersimon committed Oct 25, 2024
1 parent 5ca853c commit e0c1566
Show file tree
Hide file tree
Showing 13 changed files with 129 additions and 300 deletions.
9 changes: 3 additions & 6 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"golang.org/x/exp/maps"

"github.com/symflower/eval-dev-quality/evaluate"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/evaluate/report"
evaltask "github.com/symflower/eval-dev-quality/evaluate/task"
"github.com/symflower/eval-dev-quality/language"
Expand Down Expand Up @@ -539,11 +538,9 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
}

assessmentsPerModel := assessments.CollapseByModel()
_ = assessmentsPerModel.WalkByScore(func(model string, assessment metrics.Assessments, score uint64) (err error) {
command.logger.Printf("Evaluation score for %q: %s", model, assessment)

return nil
})
for _, modelID := range maps.Keys(assessmentsPerModel) {
command.logger.Printf("Evaluation score for %q: %s", modelID, assessmentsPerModel[modelID])
}

return nil
}
Expand Down
128 changes: 64 additions & 64 deletions cmd/eval-dev-quality/cmd/evaluate_test.go

Large diffs are not rendered by default.

52 changes: 16 additions & 36 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,50 +15,45 @@ var (
allAssessmentKeys []AssessmentKey
// AllAssessmentKeysStrings holds all registered assessment keys as strings.
AllAssessmentKeysStrings []string

// multiplierPerAssessment holds the multipliers awarded for a specific assessment.
multiplierPerAssessment = map[AssessmentKey]uint64{}
)

// RegisterAssessmentKey registers a new assessment key.
// If the multiplier for this assessment type is zero, it is ignored for the score computation.
func RegisterAssessmentKey(key string, multiplier uint64) AssessmentKey {
func RegisterAssessmentKey(key string) AssessmentKey {
assessment := AssessmentKey(key)
i := sort.SearchStrings(AllAssessmentKeysStrings, key)

allAssessmentKeys = slices.Insert(allAssessmentKeys, i, assessment)
AllAssessmentKeysStrings = slices.Insert(AllAssessmentKeysStrings, i, key)
multiplierPerAssessment[assessment] = multiplier

return assessment
}

var (
// AssessmentKeyFilesExecuted holds the successfully executed files.
AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed", 1)
AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed")
// AssessmentKeyFilesExecutedMaximumReachable holds the maximum theoretically reachable executed files.
AssessmentKeyFilesExecutedMaximumReachable = RegisterAssessmentKey("files-executed-maximum-reachable", 0)
AssessmentKeyFilesExecutedMaximumReachable = RegisterAssessmentKey("files-executed-maximum-reachable")
// AssessmentKeyProcessingTime holds the time in milliseconds that it took to complete the task.
AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time", 0)
AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time")

// AssessmentKeyCoverage counts execution coverage objects.
AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)
AssessmentKeyCoverage = RegisterAssessmentKey("coverage")

// AssessmentKeyTestsPassing holds the percentage of passing tests.
AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing", 10)
AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing")

// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count")
// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
AssessmentKeyGenerateTestsForFileCharacterCount = RegisterAssessmentKey("generate-tests-for-file-character-count", 0)
AssessmentKeyGenerateTestsForFileCharacterCount = RegisterAssessmentKey("generate-tests-for-file-character-count")

// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error", 1)
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error")
// AssessmentKeyResponseWithCode indicates that a model responded with code.
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code", 1)
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code")
// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
// TODO Infer if a model produced "too much" code. https://github.com/symflower/eval-dev-quality/issues/44
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess", 1)
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")
)

// Assessments holds a collection of numerical assessment metrics.
Expand Down Expand Up @@ -104,29 +99,14 @@ func Merge(a Assessments, b Assessments) (c Assessments) {
return c
}

// Score sums up the values of all assessments that take part in the scoring.
// An assessment takes part when a non-zero multiplier is registered for its key, in which case its stored value is added as-is.
// An empty collection scores zero.
func (a Assessments) Score() (score uint64) {
	for key, value := range a {
		// Keys without a multiplier are purely informational and must not influence the score.
		if multiplierPerAssessment[key] == 0 {
			continue
		}

		score += value
	}

	return score
}

// Award yields the score points defined for the given key.
// Award yields a score point.
func (a Assessments) Award(key AssessmentKey) {
a[key] += multiplierPerAssessment[key]
a[key]++
}

// AwardPoints yields multiple score points defined for the given key.
func (a Assessments) AwardPoints(key AssessmentKey, count uint64) {
a[key] += multiplierPerAssessment[key] * count
// AwardMultiple yields multiple score points.
func (a Assessments) AwardMultiple(key AssessmentKey, count uint64) {
a[key] += count
}

// String returns a string representation of the metrics.
Expand Down
50 changes: 6 additions & 44 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,10 @@ func TestAssessmentString(t *testing.T) {
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
AssessmentKeyTestsPassing: 70,
AssessmentKeyTestsPassing: 7,
},

ExpectedString: "coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=70",
ExpectedString: "coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=7",
})
}

Expand Down Expand Up @@ -237,44 +237,6 @@ func TestAssessmentsEqual(t *testing.T) {
})
}

// TestAssessmentsScore checks that "Score" returns the expected value for the given assessment collections.
func TestAssessmentsScore(t *testing.T) {
// testCase describes one collection of assessments and the score it must produce.
type testCase struct {
Name string

Assessments Assessments

ExpectedScore uint64
}

// validate runs a single test case as a sub-test named after the case.
validate := func(t *testing.T, tc *testCase) {
t.Run(tc.Name, func(t *testing.T) {
actualScore := tc.Assessments.Score()

assert.Equal(t, tc.ExpectedScore, actualScore)
})
}

// A freshly created collection has nothing to score.
validate(t, &testCase{
Name: "Empty Assessment",

Assessments: NewAssessments(),

ExpectedScore: uint64(0),
})

// Expects 5 + 4: the processing time is present but does not count towards the score.
validate(t, &testCase{
Name: "Values Assessment",

Assessments: Assessments{
AssessmentKeyFilesExecuted: 5,
AssessmentKeyCoverage: 4,
AssessmentKeyProcessingTime: 200,
},

ExpectedScore: uint64(9),
})
}

func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
type testCase struct {
Name string
Expand Down Expand Up @@ -309,21 +271,21 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
SymflowerFixAssessments: Assessments{
AssessmentKeyFilesExecuted: 1,
AssessmentKeyProcessingTime: uint64(100),
AssessmentKeyCoverage: 10,
AssessmentKeyCoverage: 1,
AssessmentKeyResponseNoError: 1,
AssessmentKeyTestsPassing: 100,
AssessmentKeyTestsPassing: 10,
},

ExpectedAssessments: Assessments{
AssessmentKeyFilesExecuted: 1,
AssessmentKeyProcessingTime: uint64(300),
AssessmentKeyCoverage: 10,
AssessmentKeyCoverage: 1,
AssessmentKeyResponseCharacterCount: 100,
AssessmentKeyGenerateTestsForFileCharacterCount: 50,
AssessmentKeyResponseNoError: 0,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyResponseNoExcess: 1,
AssessmentKeyTestsPassing: 100,
AssessmentKeyTestsPassing: 10,
},
})
}
22 changes: 0 additions & 22 deletions evaluate/report/collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,6 @@ import (
// AssessmentPerModel holds a collection of assessments per model id.
type AssessmentPerModel map[string]metrics.Assessments

// WalkByScore calls the given function once per model, visiting the models in ascending order of their score.
// Models with an equal score are visited in lexicographical order of their ID.
// The walk stops at the first error returned by the function, and that error is returned.
func (a AssessmentPerModel) WalkByScore(function func(model string, assessment metrics.Assessments, score uint64) error) (err error) {
	modelIDs := maps.Keys(a)

	// Compute every score exactly once so the comparator below only does lookups.
	scorePerModel := make(map[string]uint64, len(modelIDs))
	for _, modelID := range modelIDs {
		scorePerModel[modelID] = a[modelID].Score()
	}

	// Model IDs are unique map keys, so ordering by score and then by ID is a total order and the result is deterministic.
	sort.Slice(modelIDs, func(i, j int) bool {
		left, right := modelIDs[i], modelIDs[j]
		if scorePerModel[left] == scorePerModel[right] {
			return left < right
		}

		return scorePerModel[left] < scorePerModel[right]
	})

	for _, modelID := range modelIDs {
		err = function(modelID, a[modelID], scorePerModel[modelID])
		if err != nil {
			return err
		}
	}

	return nil
}

// AssessmentStore holds a collection of assessments per model per language and per repository.
type AssessmentStore struct {
store map[model.Model]map[language.Language]map[string]map[task.Identifier]metrics.Assessments
Expand Down
88 changes: 0 additions & 88 deletions evaluate/report/collection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/symflower/eval-dev-quality/evaluate/metrics"
metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing"
Expand Down Expand Up @@ -179,93 +178,6 @@ func TestAssessmentPerModelPerLanguagePerRepositoryWalk(t *testing.T) {
}
}

// TestWalkByScore checks the order in which "WalkByScore" visits models as well as the assessments and scores it hands to the callback.
func TestWalkByScore(t *testing.T) {
// testCase describes the assessments to walk and the expected visiting order.
// "ExpectedModelOrder" and "ExpectedScoreOrder" are parallel slices: the score at index i belongs to the model at index i.
type testCase struct {
Name string

AssessmentPerModel AssessmentPerModel

ExpectedModelOrder []string
ExpectedScoreOrder []uint64
}

// validate runs a single test case as a sub-test named after the case.
validate := func(t *testing.T, tc *testCase) {
t.Run(tc.Name, func(t *testing.T) {
// Guard against a malformed test case before running the walk.
require.Equal(t, len(tc.ExpectedModelOrder), len(tc.ExpectedScoreOrder), "expected order needs equal lengths")

// Record everything the callback receives, in call order.
actualModelOrder := make([]string, 0, len(tc.ExpectedModelOrder))
actualAssessmentOrder := make([]metrics.Assessments, 0, len(tc.ExpectedModelOrder))
actualScoreOrder := make([]uint64, 0, len(tc.ExpectedScoreOrder))
assert.NoError(t, tc.AssessmentPerModel.WalkByScore(func(model string, assessment metrics.Assessments, score uint64) (err error) {
actualModelOrder = append(actualModelOrder, model)
actualAssessmentOrder = append(actualAssessmentOrder, assessment)
actualScoreOrder = append(actualScoreOrder, score)

return nil
}))

assert.Equal(t, tc.ExpectedModelOrder, actualModelOrder)
assert.Equal(t, tc.ExpectedScoreOrder, actualScoreOrder)
// Each visited model must come with its own assessments from the input.
for i, model := range tc.ExpectedModelOrder {
assert.Equal(t, tc.AssessmentPerModel[model], actualAssessmentOrder[i])
}
})
}

// Walking an empty collection must not invoke the callback at all.
validate(t, &testCase{
Name: "No Assessment",

AssessmentPerModel: AssessmentPerModel{},

ExpectedModelOrder: []string{},
ExpectedScoreOrder: []uint64{},
})

validate(t, &testCase{
Name: "Single Assessment",

AssessmentPerModel: AssessmentPerModel{
"modelA": metrics.Assessments{
metrics.AssessmentKeyFilesExecuted: 1,
},
},

ExpectedModelOrder: []string{
"modelA",
},
ExpectedScoreOrder: []uint64{
1,
},
})

// Models are expected in ascending score order.
validate(t, &testCase{
Name: "Multiple Assessments",

AssessmentPerModel: AssessmentPerModel{
"modelA": metrics.Assessments{
metrics.AssessmentKeyFilesExecuted: 1,
},
"modelB": metrics.Assessments{
metrics.AssessmentKeyFilesExecuted: 2,
},
"modelC": metrics.Assessments{
metrics.AssessmentKeyFilesExecuted: 3,
},
},

ExpectedModelOrder: []string{
"modelA",
"modelB",
"modelC",
},
ExpectedScoreOrder: []uint64{
1,
2,
3,
},
})
}

func TestAssessmentCollapseByModel(t *testing.T) {
type testCase struct {
Name string
Expand Down
2 changes: 1 addition & 1 deletion evaluate/task/code-repair.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func (t *CodeRepair) Run(ctx evaltask.Context) (repositoryAssessment map[evaltas
testsPassing := testResult.TestsPass
taskLogger.Printf("Executes tests with %d tests passing", testsPassing)
modelAssessment.Award(metrics.AssessmentKeyFilesExecuted)
modelAssessment.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))
modelAssessment.AwardMultiple(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))
}

repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{
Expand Down
12 changes: 6 additions & 6 deletions evaluate/task/code-repair_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func TestCodeRepairRun(t *testing.T) {
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTestsPassing: 40,
metrics.AssessmentKeyTestsPassing: 4,
},
},
ValidateLog: func(t *testing.T, data string) {
Expand Down Expand Up @@ -140,7 +140,7 @@ func TestCodeRepairRun(t *testing.T) {
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 2,
metrics.AssessmentKeyTestsPassing: 80,
metrics.AssessmentKeyTestsPassing: 8,
},
},
ValidateLog: func(t *testing.T, data string) {
Expand Down Expand Up @@ -195,7 +195,7 @@ func TestCodeRepairRun(t *testing.T) {
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTestsPassing: 30,
metrics.AssessmentKeyTestsPassing: 3,
},
},
ValidateLog: func(t *testing.T, data string) {
Expand Down Expand Up @@ -261,7 +261,7 @@ func TestCodeRepairRun(t *testing.T) {
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyResponseNoError: 2,
metrics.AssessmentKeyTestsPassing: 60,
metrics.AssessmentKeyTestsPassing: 6,
},
},
ValidateLog: func(t *testing.T, data string) {
Expand Down Expand Up @@ -310,7 +310,7 @@ func TestCodeRepairRun(t *testing.T) {
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTestsPassing: 30,
metrics.AssessmentKeyTestsPassing: 3,
},
},
ValidateLog: func(t *testing.T, data string) {
Expand Down Expand Up @@ -369,7 +369,7 @@ func TestCodeRepairRun(t *testing.T) {
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 2,
metrics.AssessmentKeyTestsPassing: 60,
metrics.AssessmentKeyTestsPassing: 6,
},
},
ValidateLog: func(t *testing.T, data string) {
Expand Down
Loading

0 comments on commit e0c1566

Please sign in to comment.