-
Notifications
You must be signed in to change notification settings - Fork 146
Description
Describe the bug
A clear and concise description of what the bug is.
The crash happens only when an IVF index is built first and Go's BruteForceIndex.BuildIndex is called afterwards. Most likely the IVF index build is what causes the bug.
--- FAIL: TestIvfAndBruteForceForIssue (17.91s)
gpu_test.go:343:
Error Trace: /home/eric/github/matrixone/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go:343
/home/eric/miniconda3/envs/go/go/src/runtime/asm_amd64.s:1693
Error: Received unexpected error:
CUDA error encountered at: file=/tmp/conda-bld-output/bld/rattler-build_libcuvs/host_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol/include/raft/linalg/detail/coalesced_reduction-inl.cuh line=260: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
Obtained 9 stack frames
#1 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x4e902d) [0x72081aae902d]
#2 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x381f74) [0x72081a981f74]
#3 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0xc18f33) [0x72081b218f33]
#4 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::index<float, float> cuvs::neighbors::detail::build<float, float, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2>, std::experimental::layout_right>(raft::resources const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >, cuvs::distance::DistanceType, float) +0x288 [0x72081b232a08]
#5 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::build(raft::resources const&, cuvs::neighbors::brute_force::index_params const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >) +0x28 [0x72081b21bab8]
#6 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so(+0x5fec6) [0x720836416ec6]
#7 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so: cuvsBruteForceBuild +0x43 [0x7208364173b3]
#8 in /tmp/go-build145927094/b001/device.test() [0x1ff2cc7]
#9 in /tmp/go-build145927094/b001/device.test() [0x493504]
Test: TestIvfAndBruteForceForIssue
FAIL
exit status 1
FAIL github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/device 20.143s
Steps/Code to reproduce bug
Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve the issue more quickly.
Please use the cuvs repo https://github.com/cpegeric/cuvs/tree/golang
https://github.com/rapidsai/cuvs/pull/1600/files
package device
import (
"fmt"
"math/rand/v2"
"sync"
"testing"
"os"
"github.com/stretchr/testify/require"
cuvs "github.com/rapidsai/cuvs/go"
"github.com/rapidsai/cuvs/go/brute_force"
"github.com/rapidsai/cuvs/go/ivf_flat"
)
func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.Distance, maxIterations int) ([][]float32, error) {
resource, err := cuvs.NewResource(nil)
if err != nil {
return nil, err
}
defer resource.Close()
indexParams, err := ivf_flat.CreateIndexParams()
if err != nil {
return nil, err
}
defer indexParams.Close()
indexParams.SetNLists(uint32(clusterCnt))
indexParams.SetMetric(distanceType)
indexParams.SetKMeansNIters(uint32(maxIterations))
indexParams.SetKMeansTrainsetFraction(1) // train all sample
dataset, err := cuvs.NewTensor(vecs)
if err != nil {
return nil, err
}
defer dataset.Close()
index, _ := ivf_flat.CreateIndex(indexParams, &dataset)
defer index.Close()
if _, err := dataset.ToDevice(&resource); err != nil {
return nil, err
}
centers, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(clusterCnt), int64(dim)})
if err != nil {
return nil, err
}
if err := ivf_flat.BuildIndex(resource, indexParams, &dataset, index); err != nil {
return nil, err
}
if err := resource.Sync(); err != nil {
return nil, err
}
if err := ivf_flat.GetCenters(index, ¢ers); err != nil {
return nil, err
}
if _, err := centers.ToHost(&resource); err != nil {
return nil, err
}
if err := resource.Sync(); err != nil {
return nil, err
}
result, err := centers.Slice()
if err != nil {
return nil, err
}
return result, nil
}
// Search builds a brute-force index over datasetvec, queries it with
// queriesvec asking for limit neighbours per query, and flattens the results.
//
// Progress messages are written to stderr at each stage.
//
// Returns:
//   - retkeys: a []int64 (boxed in any) holding the neighbour ids row by row,
//     limit entries per query.
//   - retdistances: the matching distances widened to float64, same layout.
//   - err: the first error reported by a cuVS call; both result values are nil
//     in that case.
func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.Distance) (retkeys any, retdistances []float64, err error) {
	// logf writes one formatted progress line to stderr.
	logf := func(format string, args ...any) {
		fmt.Fprintf(os.Stderr, format, args...)
	}

	logf("probe set %d\n", len(queriesvec))
	logf("brute force index search start\n")

	resource, err := cuvs.NewResource(nil)
	if err != nil {
		return nil, nil, err
	}
	defer resource.Close()

	dataset, err := cuvs.NewTensor(datasetvec)
	if err != nil {
		return nil, nil, err
	}
	defer dataset.Close()

	index, err := brute_force.CreateIndex()
	if err != nil {
		return nil, nil, err
	}
	defer index.Close()

	queries, err := cuvs.NewTensor(queriesvec)
	if err != nil {
		return nil, nil, err
	}
	defer queries.Close()

	// Both output tensors share the shape (number of queries, limit).
	outShape := []int64{int64(len(queriesvec)), int64(limit)}

	neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, outShape)
	if err != nil {
		return nil, nil, err
	}
	defer neighbors.Close()

	distances, err := cuvs.NewTensorOnDevice[float32](&resource, outShape)
	if err != nil {
		return nil, nil, err
	}
	defer distances.Close()

	// Move the dataset to the device and build the index.
	if _, err = dataset.ToDevice(&resource); err != nil {
		return nil, nil, err
	}
	if err = resource.Sync(); err != nil {
		return nil, nil, err
	}
	if err = brute_force.BuildIndex(resource, &dataset, distanceType, 2.0, index); err != nil {
		logf("BruteForceIndex: build index failed %v\n", err)
		logf("BruteForceIndex: build index failed centers %v\n", datasetvec)
		return nil, nil, err
	}
	if err = resource.Sync(); err != nil {
		return nil, nil, err
	}
	logf("built brute force index\n")

	// Move the queries to the device and run the search.
	if _, err = queries.ToDevice(&resource); err != nil {
		return nil, nil, err
	}
	logf("brute force index search Runing....\n")
	if err = brute_force.SearchIndex(resource, *index, &queries, &neighbors, &distances); err != nil {
		return nil, nil, err
	}
	logf("brute force index search finished Runing....\n")

	// Bring both result tensors back to the host.
	if _, err = neighbors.ToHost(&resource); err != nil {
		return nil, nil, err
	}
	logf("brute force index search neighbour to host done....\n")
	if _, err = distances.ToHost(&resource); err != nil {
		return nil, nil, err
	}
	logf("brute force index search distances to host done....\n")
	if err = resource.Sync(); err != nil {
		return nil, nil, err
	}
	logf("brute force index search return result....\n")

	idRows, err := neighbors.Slice()
	if err != nil {
		return nil, nil, err
	}
	distRows, err := distances.Slice()
	if err != nil {
		return nil, nil, err
	}

	// Flatten row-major: entry (r, c) lands at r*limit + c.
	width := int(limit)

	flatDists := make([]float64, len(distRows)*width)
	for r, row := range distRows {
		base := r * width
		for c, d := range row {
			flatDists[base+c] = float64(d)
		}
	}

	flatKeys := make([]int64, len(idRows)*width)
	for r, row := range idRows {
		base := r * width
		for c, id := range row {
			flatKeys[base+c] = int64(id)
		}
	}

	logf("brute force index search RETURN NOW....\n")
	return flatKeys, flatDists, nil
}
// TestIvfAndBruteForceForIssue is the reproducer for the reported failure.
// It computes IVF-Flat centers from random vectors with getCenters, then runs
// Search against those centers repeatedly from several goroutines at once.
func TestIvfAndBruteForceForIssue(t *testing.T) {
	const (
		workers    = 4    // concurrent goroutines calling Search
		iterations = 1000 // Search calls per goroutine
	)

	dimension := uint(128)
	limit := uint(1)
	dsize := 100000
	nlist := 128

	// Random training set; the first 8192 rows double as the query set.
	vecs := make([][]float32, dsize)
	for i := range vecs {
		vecs[i] = make([]float32, dimension)
		for j := range vecs[i] {
			vecs[i][j] = rand.Float32()
		}
	}
	queries := vecs[:8192]

	centers, err := getCenters(vecs, int(dimension), nlist, cuvs.DistanceL2, 10)
	require.NoError(t, err)

	var wg sync.WaitGroup
	for n := 0; n < workers; n++ {
		wg.Add(1)
		go func(worker int) {
			defer wg.Done()
			for i := 0; i < iterations; i++ {
				_, _, err := Search(centers, queries, limit, cuvs.DistanceL2)
				if err != nil {
					// require.NoError would call t.FailNow, which is only
					// valid on the goroutine running the test. Record the
					// failure with Errorf and stop this worker instead.
					t.Errorf("worker %d, iteration %d: Search returned unexpected error: %v", worker, i, err)
					return
				}
			}
		}(n)
	}
	wg.Wait()
}
The result from the test
--- FAIL: TestIvfAndBruteForceForIssue (17.91s)
gpu_test.go:343:
Error Trace: /home/eric/github/matrixone/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go:343
/home/eric/miniconda3/envs/go/go/src/runtime/asm_amd64.s:1693
Error: Received unexpected error:
CUDA error encountered at: file=/tmp/conda-bld-output/bld/rattler-build_libcuvs/host_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol/include/raft/linalg/detail/coalesced_reduction-inl.cuh line=260: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
Obtained 9 stack frames
#1 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x4e902d) [0x72081aae902d]
#2 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x381f74) [0x72081a981f74]
#3 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0xc18f33) [0x72081b218f33]
#4 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::index<float, float> cuvs::neighbors::detail::build<float, float, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2>, std::experimental::layout_right>(raft::resources const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >, cuvs::distance::DistanceType, float) +0x288 [0x72081b232a08]
#5 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::build(raft::resources const&, cuvs::neighbors::brute_force::index_params const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >) +0x28 [0x72081b21bab8]
#6 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so(+0x5fec6) [0x720836416ec6]
#7 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so: cuvsBruteForceBuild +0x43 [0x7208364173b3]
#8 in /tmp/go-build145927094/b001/device.test() [0x1ff2cc7]
#9 in /tmp/go-build145927094/b001/device.test() [0x493504]
Test: TestIvfAndBruteForceForIssue
FAIL
exit status 1
FAIL github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/device 20.143s
Expected behavior
A clear and concise description of what you expected to happen.
Environment details (please complete the following information):
- Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
- Method of RAFT install: [conda, Docker, or from source]
- If method of install is [Docker], provide
`docker pull` & `docker run` commands used
- If method of install is [Docker], provide
GPU: RTX 5070
Installed cuvs 130 with conda on Linux (WSL)
Additional context
Add any other context about the problem here.
Metadata
Metadata
Assignees
Labels
Type
Projects
Status