Skip to content

Commit 3a6b83d

Browse files
ykadowak and deepsource-autofix[bot]
authored and committed
Add verification for index correction e2e and add clusterrole cronjobs for operator to deploy index correction (#2205)
* implement the initial framework * add corrector configuration * add corrector logic * add build make command for index correction binary * add Dockerfile for index correction * add Docker image for index job correction * add timer * fix tag align * tmp * fix log * temporarily implement two versions of correct function * set eg limit from config * add stream list concurrency config * implement index id caching * add config to use cache or not * style: Format code with prettier and gofumpt * refactor availableAddrs * add kvs range duration * add leftAgentAddrs for performance * Revert "add kvs range duration" This reverts commit 5b647be. * refactor * fix without cache bug * enable observability * refactor * SIGTERM after complete * add metrics server * add pcache * remove comment * [TEMP] use pcache * [TMP] use pcache * fix empty shard returns error * fix to use local map * [TMP] add prestop for pcache * [TEMP] add pcache config * style: Format code with prettier and gofumpt * [TEMP] add pcache log * fix map alloc size * [TMP] Add bbolt cache * update bbolt * fix bbolt bug * add bbolt test * [TEMP] use bbolt as persistent cache * style: Format code with prettier and gofumpt * add SetBatch to bbolt * use batch to write map to disk * style: Format code with prettier and gofumpt * delete the map elements on finalize * manually call GC after the map shrink * add limit to SetBatch goroutine number * stop unnecessary GC * increase eg limit to the MaxBatchSize * use ch to set batch bbolt * fix servers shutdown properly * use internal/kvs/bbolt * refactor * always use bbolt cache for correction * update sample.yaml for correction * style: format code with Prettier and Gofumpt This commit fixes the style issues introduced in 319ec8b according to the output from Prettier and Gofumpt. 
Details: #2152 * use go std slices pkg * refactor * add comment * remove valdsync * use vald errgroup * refactor * Define ErrNoAvailableAgentToInsert * update comment in English * Apply new actions yaml format * Disable godox * style: format code with Prettier and Gofumpt This commit fixes the style issues introduced in c860ddc according to the output from Prettier and Gofumpt. Details: #2194 * remove comment * Apply format * Add type check for type assertion * use const to specify filemode * Add bbolt concurrency as config * fix var style * Suppress linter * fix comment * add test template * Refactor parameters for index correction * Refactor config * Add corrector test * style: format code with Prettier and Gofumpt This commit fixes the style issues introduced in 004bf81 according to the output from Prettier and Gofumpt. Details: #2194 * Add timestamp check * Apply format * fix schema type * Fix DeepSource errors * Fix misspell * Add type check * Remove unused config * Fix DeepSource error * Add required go:build e2e tag * Remove memo * Refactor comment * Add index job correction helm templates * Add more fields * Add index correction job E2E test * Add e2e action for job * [REVERT THIS] Temporally change version * Fix name and command * Apply format * update crd * Revert "[REVERT THIS] Temporally change version" This reverts commit 1801a63. 
* Remove unused pkg * Remove experimental file * remove old workflow * Fix cron job name to new one * Update sample.yaml * fix build path * Fix corrector name * add e2e-jobs to slack notification * Update crds * Add StreamListObject to LB * Add E2E for StreamListObject * Update error handling * Fix StreamListObject e2e verification * Add StreamListObject to LB * Add E2E for StreamListObject * Update error handling * Fix StreamListObject e2e verification * Update index correction e2e to verify correction result with StreamListObject * Make it possible to deploy index correction cronjob from operator * Update operator manifests * Make schedule field empty so that a user has to specify manually * add default schedule of index correction --------- Co-authored-by: deepsource-autofix[bot] <62050782+deepsource-autofix[bot]@users.noreply.github.com>
1 parent c3e3281 commit 3a6b83d

File tree

9 files changed

+1050
-11
lines changed

9 files changed

+1050
-11
lines changed

.github/helm/values/values-lb.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,4 @@ manager:
7474
# suspend because you do not want corrector to start automatically in CI
7575
# instead run it manually
7676
suspend: true
77+
schedule: "1 2 3 4 5"

.github/valdrelease/valdrelease.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,7 @@ spec:
7676
auto_index_duration_limit: 2m
7777
auto_index_check_duration: 30s
7878
auto_index_length: 1000
79+
corrector:
80+
enabled: true
81+
suspend: true
82+
schedule: "1 2 3 4 5"

charts/vald-helm-operator/templates/clusterrole.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,4 +175,17 @@ rules:
175175
- get
176176
- patch
177177
- update
178+
- apiGroups:
179+
- batch
180+
resources:
181+
- cronjobs
182+
verbs:
183+
- create
184+
- delete
185+
- get
186+
- list
187+
- patch
188+
- update
189+
- watch
190+
178191
{{- end }}

charts/vald/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2682,7 +2682,7 @@ manager:
26822682
enabled: false
26832683
# @schema {"name": "manager.index.corrector.schedule", "type": "string"}
26842684
# manager.index.corrector.schedule -- CronJob schedule setting for index correction
2685-
schedule: "5 * * * *"
2685+
schedule: "6 3 * * *"
26862686
# @schema {"name": "manager.index.corrector.suspend", "type": "boolean"}
26872687
# manager.index.corrector.suspend -- CronJob suspend setting for index correction
26882688
suspend: false

k8s/operator/helm/clusterrole.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,15 @@ rules:
175175
- get
176176
- patch
177177
- update
178+
- apiGroups:
179+
- batch
180+
resources:
181+
- cronjobs
182+
verbs:
183+
- create
184+
- delete
185+
- get
186+
- list
187+
- patch
188+
- update
189+
- watch

k8s/operator/helm/crds/valdrelease.yaml

Lines changed: 965 additions & 0 deletions
Large diffs are not rendered by default.

tests/e2e/crud/crud_test.go

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"flag"
2525
"fmt"
2626
"os"
27+
"os/exec"
2728
"testing"
2829
"time"
2930

@@ -42,13 +43,14 @@ var (
4243
port int
4344
ds *hdf5.Dataset
4445

45-
insertNum int
46-
searchNum int
47-
searchByIDNum int
48-
getObjectNum int
49-
updateNum int
50-
upsertNum int
51-
removeNum int
46+
insertNum int
47+
correctionInsertNum int
48+
searchNum int
49+
searchByIDNum int
50+
getObjectNum int
51+
updateNum int
52+
upsertNum int
53+
removeNum int
5254

5355
insertFrom int
5456
searchFrom int
@@ -73,6 +75,7 @@ func init() {
7375
flag.IntVar(&port, "port", 8081, "gRPC port")
7476

7577
flag.IntVar(&insertNum, "insert-num", 10000, "number of id-vector pairs used for insert")
78+
flag.IntVar(&correctionInsertNum, "correction-insert-num", 3000, "number of id-vector pairs used for insert")
7679
flag.IntVar(&searchNum, "search-num", 10000, "number of id-vector pairs used for search")
7780
flag.IntVar(&searchByIDNum, "search-by-id-num", 100, "number of id-vector pairs used for search-by-id")
7881
flag.IntVar(&getObjectNum, "get-object-num", 100, "number of id-vector pairs used for get-object")
@@ -758,7 +761,9 @@ func TestE2EIndexJobCorrection(t *testing.T) {
758761
t.Fatalf("an error occurred: %s", err)
759762
}
760763

761-
train := ds.Train[insertFrom : insertFrom+insertNum]
764+
// prepare train data
765+
train := ds.Train[insertFrom : insertFrom+correctionInsertNum]
766+
762767
err = op.Insert(t, ctx, operation.Dataset{
763768
Train: train,
764769
})
@@ -768,12 +773,49 @@ func TestE2EIndexJobCorrection(t *testing.T) {
768773

769774
sleep(t, waitAfterInsertDuration)
770775

776+
t.Log("Test case 1: just execute index correction and check if replica number is correct after correction")
771777
exe := operation.NewCronJobExecutor("vald-index-correction")
772778
err = exe.CreateAndWait(t, ctx, "correction-test")
773779
if err != nil {
774780
t.Fatalf("an error occurred: %s", err)
775781
}
776782

783+
// check if replica number is correct
784+
err = op.StreamListObject(t, ctx, operation.Dataset{
785+
Train: train,
786+
})
787+
if err != nil {
788+
t.Fatalf("an error occurred: %s", err)
789+
}
790+
791+
t.Log("Test case 2: execute index correction after one agent removed")
792+
t.Log("removing vald-agent-ngt-0...")
793+
cmd := exec.CommandContext(ctx, "sh", "-c", "kubectl delete pod vald-agent-ngt-0 && kubectl wait --for=condition=Ready pod/vald-agent-ngt-0")
794+
out, err := cmd.Output()
795+
if err != nil {
796+
if exitErr, ok := err.(*exec.ExitError); ok {
797+
t.Fatalf("%s, %s, %v", string(out), string(exitErr.Stderr), err)
798+
} else {
799+
t.Fatalf("unexpected error on creating job: %v", err)
800+
}
801+
}
802+
t.Log(string(out))
803+
804+
// correct the deleted index
805+
err = exe.CreateAndWait(t, ctx, "correction-test")
806+
if err != nil {
807+
t.Fatalf("an error occurred: %s", err)
808+
}
809+
810+
// check if replica number is correct
811+
err = op.StreamListObject(t, ctx, operation.Dataset{
812+
Train: train,
813+
})
814+
if err != nil {
815+
t.Fatalf("an error occurred: %s", err)
816+
}
817+
818+
t.Log("Tear down. Removing all vectors...")
777819
err = op.Remove(t, ctx, operation.Dataset{
778820
Train: train,
779821
})

tests/e2e/operation/job.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ func deleteJob(t *testing.T, jobName string) error {
5656
func waitJob(t *testing.T, ctx context.Context, jobName string) error {
5757
t.Helper()
5858
t.Log("waiting for the correction job to complete or fail")
59-
waitCompleteCmd := fmt.Sprintf("kubectl wait --timeout=-1s job/%s --for=condition=complete", jobName)
60-
waitFailedCmd := fmt.Sprintf("kubectl wait --timeout=-1s job/%s --for=condition=failed", jobName)
59+
waitCompleteCmd := fmt.Sprintf("kubectl wait --timeout=10m job/%s --for=condition=complete", jobName)
60+
waitFailedCmd := fmt.Sprintf("kubectl wait --timeout=10m job/%s --for=condition=failed", jobName)
6161

6262
ctx, cancel := context.WithCancel(ctx)
6363
defer cancel()

tests/e2e/operation/stream.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,5 +1224,7 @@ exit_loop:
12241224
return fmt.Errorf("the number of vectors returned is different at index id %v: got %v, want %v", k, v, replica)
12251225
}
12261226
}
1227+
1228+
t.Log("StreamListObject operation finished successfully and all vectors are returned with correct replica number")
12271229
return nil
12281230
}

0 commit comments

Comments
 (0)