Skip to content

[BUG] brute force index crash after ivf index build with golang #1622

@cpegeric

Description

@cpegeric

Describe the bug
A clear and concise description of what the bug is.

The crash happens only when an IVF index is built first and the golang BruteForceIndex.BuildIndex is called afterwards. Most likely the IVF index build is what causes the bug.


--- FAIL: TestIvfAndBruteForceForIssue (17.91s)
    gpu_test.go:343:
                Error Trace:    /home/eric/github/matrixone/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go:343
                                                        /home/eric/miniconda3/envs/go/go/src/runtime/asm_amd64.s:1693
                Error:          Received unexpected error:
                                CUDA error encountered at: file=/tmp/conda-bld-output/bld/rattler-build_libcuvs/host_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol/include/raft/linalg/detail/coalesced_reduction-inl.cuh line=260: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
                                Obtained 9 stack frames
                                #1 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x4e902d) [0x72081aae902d]
                                #2 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x381f74) [0x72081a981f74]
                                #3 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0xc18f33) [0x72081b218f33]
                                #4 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::index<float, float> cuvs::neighbors::detail::build<float, float, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2>, std::experimental::layout_right>(raft::resources const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >, cuvs::distance::DistanceType, float) +0x288 [0x72081b232a08]
                                #5 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::build(raft::resources const&, cuvs::neighbors::brute_force::index_params const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >) +0x28 [0x72081b21bab8]
                                #6 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so(+0x5fec6) [0x720836416ec6]
                                #7 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so: cuvsBruteForceBuild +0x43 [0x7208364173b3]
                                #8 in /tmp/go-build145927094/b001/device.test() [0x1ff2cc7]
                                #9 in /tmp/go-build145927094/b001/device.test() [0x493504]
                Test:           TestIvfAndBruteForceForIssue
FAIL
exit status 1
FAIL    github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/device 20.143s

Steps/Code to reproduce bug
Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve the issue more quickly.

Please use the cuvs repo https://github.com/cpegeric/cuvs/tree/golang
https://github.com/rapidsai/cuvs/pull/1600/files

package device

import (
        "fmt"
        "math/rand/v2"
        "sync"
        "testing"
        "os"

        "github.com/stretchr/testify/require"

        cuvs "github.com/rapidsai/cuvs/go"
        "github.com/rapidsai/cuvs/go/brute_force"
        "github.com/rapidsai/cuvs/go/ivf_flat"
)

// getCenters builds an IVF-Flat index over vecs and returns the cluster
// centers read back from that index.
//
// vecs is the training set. clusterCnt is used both as the number of IVF
// lists and as the row count of the centers tensor; dim is its column count.
// distanceType is the metric set on the index parameters and maxIterations
// is the k-means iteration count. The returned rows are plain Go slices that
// stay valid after every cuVS object created here has been closed.
func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.Distance, maxIterations int) ([][]float32, error) {
	resource, err := cuvs.NewResource(nil)
	if err != nil {
		return nil, err
	}
	defer resource.Close()

	indexParams, err := ivf_flat.CreateIndexParams()
	if err != nil {
		return nil, err
	}
	defer indexParams.Close()

	indexParams.SetNLists(uint32(clusterCnt))
	indexParams.SetMetric(distanceType)
	indexParams.SetKMeansNIters(uint32(maxIterations))
	indexParams.SetKMeansTrainsetFraction(1) // train on every sample

	dataset, err := cuvs.NewTensor(vecs)
	if err != nil {
		return nil, err
	}
	defer dataset.Close()

	// The creation error used to be discarded, which let a failed call fall
	// through to index.Close() and BuildIndex with an unusable index.
	index, err := ivf_flat.CreateIndex(indexParams, &dataset)
	if err != nil {
		return nil, err
	}
	defer index.Close()

	if _, err := dataset.ToDevice(&resource); err != nil {
		return nil, err
	}

	centers, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(clusterCnt), int64(dim)})
	if err != nil {
		return nil, err
	}
	// Release the centers tensor like every other tensor in this file;
	// previously it was never closed.
	defer centers.Close()

	if err := ivf_flat.BuildIndex(resource, indexParams, &dataset, index); err != nil {
		return nil, err
	}

	if err := resource.Sync(); err != nil {
		return nil, err
	}

	if err := ivf_flat.GetCenters(index, &centers); err != nil {
		return nil, err
	}

	if _, err := centers.ToHost(&resource); err != nil {
		return nil, err
	}

	if err := resource.Sync(); err != nil {
		return nil, err
	}

	rows, err := centers.Slice()
	if err != nil {
		return nil, err
	}

	// Copy the rows out before the deferred Close runs so the result cannot
	// alias memory owned by the tensor (defensive: Slice may already copy).
	result := make([][]float32, len(rows))
	for i, row := range rows {
		result[i] = make([]float32, len(row))
		copy(result[i], row)
	}

	return result, nil
}

// Search builds a brute-force index over datasetvec on the GPU, searches it
// with queriesvec, and returns the results flattened row-major with limit
// entries per query.
//
// retkeys holds a []int64 of neighbor keys and retdistances the matching
// distances widened to float64. On any failure both results are nil and err
// is the first error encountered. Progress messages are written to stderr.
func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.Distance) (retkeys any, retdistances []float64, err error) {
	// trace writes one progress line to stderr.
	trace := func(format string, args ...any) {
		fmt.Fprintf(os.Stderr, format, args...)
	}

	trace("probe set %d\n", len(queriesvec))
	trace("brute force index search start\n")

	resource, err := cuvs.NewResource(nil)
	if err != nil {
		return nil, nil, err
	}
	defer resource.Close()

	dataset, err := cuvs.NewTensor(datasetvec)
	if err != nil {
		return nil, nil, err
	}
	defer dataset.Close()

	index, err := brute_force.CreateIndex()
	if err != nil {
		return nil, nil, err
	}
	defer index.Close()

	queries, err := cuvs.NewTensor(queriesvec)
	if err != nil {
		return nil, nil, err
	}
	defer queries.Close()

	// Both output tensors have one row per query and limit columns.
	outShape := []int64{int64(len(queriesvec)), int64(limit)}

	neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, outShape)
	if err != nil {
		return nil, nil, err
	}
	defer neighbors.Close()

	distances, err := cuvs.NewTensorOnDevice[float32](&resource, outShape)
	if err != nil {
		return nil, nil, err
	}
	defer distances.Close()

	if _, err = dataset.ToDevice(&resource); err != nil {
		return nil, nil, err
	}
	if err = resource.Sync(); err != nil {
		return nil, nil, err
	}

	if err = brute_force.BuildIndex(resource, &dataset, distanceType, 2.0, index); err != nil {
		// Dump the input as well so a failing build can be reproduced.
		trace("BruteForceIndex: build index failed %v\n", err)
		trace("BruteForceIndex: build index failed centers %v\n", datasetvec)
		return nil, nil, err
	}
	if err = resource.Sync(); err != nil {
		return nil, nil, err
	}
	trace("built brute force index\n")

	if _, err = queries.ToDevice(&resource); err != nil {
		return nil, nil, err
	}

	trace("brute force index search Runing....\n")
	if err = brute_force.SearchIndex(resource, *index, &queries, &neighbors, &distances); err != nil {
		return nil, nil, err
	}
	trace("brute force index search finished Runing....\n")

	if _, err = neighbors.ToHost(&resource); err != nil {
		return nil, nil, err
	}
	trace("brute force index search neighbour to host done....\n")

	if _, err = distances.ToHost(&resource); err != nil {
		return nil, nil, err
	}
	trace("brute force index search distances to host done....\n")

	if err = resource.Sync(); err != nil {
		return nil, nil, err
	}

	trace("brute force index search return result....\n")
	neighborRows, err := neighbors.Slice()
	if err != nil {
		return nil, nil, err
	}
	distanceRows, err := distances.Slice()
	if err != nil {
		return nil, nil, err
	}

	// Flatten both 2-D results; entry (row, col) lands at row*limit + col.
	width := int(limit)

	retdistances = make([]float64, len(distanceRows)*width)
	for row, values := range distanceRows {
		base := row * width
		for col, d := range values {
			retdistances[base+col] = float64(d)
		}
	}

	flatKeys := make([]int64, len(neighborRows)*width)
	for row, values := range neighborRows {
		base := row * width
		for col, k := range values {
			flatKeys[base+col] = int64(k)
		}
	}

	trace("brute force index search RETURN NOW....\n")
	return flatKeys, retdistances, nil
}

// TestIvfAndBruteForceForIssue reproduces the reported failure: it builds an
// IVF-Flat index once to obtain cluster centers, then repeatedly builds and
// searches a brute-force index over those centers from several goroutines.
//
// Worker goroutines only record their first error; the assertions run on the
// test goroutine after all workers finish, because require.NoError ends in
// t.FailNow, which must not be called from goroutines the test spawns.
func TestIvfAndBruteForceForIssue(t *testing.T) {
	const (
		workers    = 4
		iterations = 1000
	)

	dimension := uint(128)
	limit := uint(1)
	/*
		ncpu := uint(1)
		elemsz := uint(4) // float32
	*/

	dsize := 100000
	nlist := 128
	vecs := make([][]float32, dsize)
	for i := range vecs {
		vecs[i] = make([]float32, dimension)
		for j := range vecs[i] {
			vecs[i][j] = rand.Float32()
		}
	}
	queries := vecs[:8192]

	centers, err := getCenters(vecs, int(dimension), nlist, cuvs.DistanceL2, 10)
	require.NoError(t, err)

	// One slot per worker, so no extra locking is needed: each goroutine
	// writes only its own element and the reads happen after wg.Wait().
	errs := make([]error, workers)

	var wg sync.WaitGroup
	for n := 0; n < workers; n++ {
		wg.Add(1)
		go func(worker int) {
			defer wg.Done()
			for i := 0; i < iterations; i++ {
				_, _, err := Search(centers, queries, limit, cuvs.DistanceL2)
				if err != nil {
					// Stop this worker at its first failure, as the
					// original require.NoError did.
					errs[worker] = fmt.Errorf("worker %d, iteration %d: %w", worker, i, err)
					return
				}

				/*
					keys_i64, ok := keys.([]int64)
					require.Equal(t, ok, true)

					for j, key := range keys_i64 {
						require.Equal(t, key, int64(j))
						require.Equal(t, distances[j], float64(0))
					}
				*/
				// fmt.Printf("keys %v, dist %v\n", keys, distances)
			}
		}(n)
	}

	wg.Wait()

	for _, err := range errs {
		require.NoError(t, err)
	}
}

The result from the test

--- FAIL: TestIvfAndBruteForceForIssue (17.91s)
    gpu_test.go:343:
                Error Trace:    /home/eric/github/matrixone/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go:343
                                                        /home/eric/miniconda3/envs/go/go/src/runtime/asm_amd64.s:1693
                Error:          Received unexpected error:
                                CUDA error encountered at: file=/tmp/conda-bld-output/bld/rattler-build_libcuvs/host_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol/include/raft/linalg/detail/coalesced_reduction-inl.cuh line=260: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
                                Obtained 9 stack frames
                                #1 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x4e902d) [0x72081aae902d]
                                #2 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0x381f74) [0x72081a981f74]
                                #3 in /home/eric/miniconda3/envs/go/lib/libcuvs.so(+0xc18f33) [0x72081b218f33]
                                #4 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::index<float, float> cuvs::neighbors::detail::build<float, float, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2>, std::experimental::layout_right>(raft::resources const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >, cuvs::distance::DistanceType, float) +0x288 [0x72081b232a08]
                                #5 in /home/eric/miniconda3/envs/go/lib/libcuvs.so: cuvs::neighbors::brute_force::build(raft::resources const&, cuvs::neighbors::brute_force::index_params const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)2> >) +0x28 [0x72081b21bab8]
                                #6 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so(+0x5fec6) [0x720836416ec6]
                                #7 in /home/eric/miniconda3/envs/go/lib/libcuvs_c.so: cuvsBruteForceBuild +0x43 [0x7208364173b3]
                                #8 in /tmp/go-build145927094/b001/device.test() [0x1ff2cc7]
                                #9 in /tmp/go-build145927094/b001/device.test() [0x493504]
                Test:           TestIvfAndBruteForceForIssue
FAIL
exit status 1
FAIL    github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/device 20.143s

Expected behavior
A clear and concise description of what you expected to happen.

Environment details (please complete the following information):

  • Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
  • Method of RAFT install: [conda, Docker, or from source]
    • If method of install is [Docker], provide docker pull & docker run commands used

GPU: RTX 5070
Installed cuvs 130 with conda on Linux (WSL)

Additional context
Add any other context about the problem here.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    Status

    Todo

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions