Skip to content

Commit 321ad6d

Browse files
packed reader partial columns read
Signed-off-by: shaoting-huang <[email protected]>
1 parent f51fd09 commit 321ad6d

File tree

14 files changed

+194
-57
lines changed

14 files changed

+194
-57
lines changed

.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ on:
1313
jobs:
1414
test:
1515
name: Test
16-
runs-on: ubuntu-latest
16+
runs-on: ubuntu-22.04
1717

1818
steps:
1919
- name: Checkout code

.github/workflows/cpp-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ on:
1111
- '!go/**'
1212
jobs:
1313
unittest:
14-
runs-on: ubuntu-latest
14+
runs-on: ubuntu-22.04
1515
steps:
1616
- uses: actions/checkout@v3
1717

cpp/include/milvus-storage/common/macro.h

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -25,16 +25,18 @@ namespace milvus_storage {
2525
#undef RETURN_NOT_OK
2626
#define RETURN_NOT_OK(status) \
2727
do { \
28-
if (!(status).ok()) { \
29-
return status; \
28+
auto _s = (status); \
29+
if (!_s.ok()) { \
30+
return _s; \
3031
} \
3132
} while (false)
3233

33-
#define RETURN_ARROW_NOT_OK(status) \
34-
do { \
35-
if (!(status).ok()) { \
36-
return Status::ArrowError((status).ToString()); \
37-
} \
34+
#define RETURN_ARROW_NOT_OK(status) \
35+
do { \
36+
auto _s = (status); \
37+
if (!_s.ok()) { \
38+
return Status::ArrowError((_s).ToString()); \
39+
} \
3840
} while (false)
3941

4042
#define RETURN_ARROW_NOT_OK_WITH_PREFIX(msg, staus) \

cpp/include/milvus-storage/packed/reader.h

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ class PackedRecordBatchReader : public arrow::RecordBatchReader {
4343
PackedRecordBatchReader(arrow::fs::FileSystem& fs,
4444
const std::string& file_path,
4545
const std::shared_ptr<arrow::Schema> schema,
46-
const std::set<int>& needed_columns,
46+
std::set<int>& needed_columns,
4747
const int64_t buffer_size = DEFAULT_READ_BUFFER_SIZE);
4848

4949
std::shared_ptr<arrow::Schema> schema() const override;
@@ -53,6 +53,12 @@ class PackedRecordBatchReader : public arrow::RecordBatchReader {
5353
arrow::Status Close() override;
5454

5555
private:
56+
void initialize(arrow::fs::FileSystem& fs,
57+
const std::string& file_path,
58+
const std::shared_ptr<arrow::Schema> schema,
59+
std::set<int>& needed_columns,
60+
const int64_t buffer_size);
61+
5662
Status initializeColumnOffsets(arrow::fs::FileSystem& fs, const std::set<int>& needed_columns, size_t num_fields);
5763
// Advance buffer to fill the expected buffer size
5864
arrow::Status advanceBuffer();

cpp/include/milvus-storage/packed/reader_c.h

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,15 +24,37 @@ typedef void* CPackedReader;
2424
typedef void* CArrowArray;
2525
typedef void* CArrowSchema;
2626

27-
int Open(const char* path, struct ArrowSchema* schema, const int64_t buffer_size, struct ArrowArrayStream* out);
28-
27+
/**
28+
* @brief Open a packed reader to read needed columns in the specified path.
29+
*
30+
* @param path The root path of the packed files to read.
31+
* @param schema The original schema of data.
32+
* @param buffer_size The max buffer size of the packed reader.
33+
* @param needed_columns Array of num_needed_columns column indices to read. If it is empty, all columns will be read.
34+
* @param c_packed_reader The output pointer of the packed reader.
35+
*/
2936
int NewPackedReader(const char* path,
3037
struct ArrowSchema* schema,
3138
const int64_t buffer_size,
39+
int* needed_columns,
40+
int num_needed_columns,
3241
CPackedReader* c_packed_reader);
3342

43+
/**
44+
* @brief Read the next record batch from the packed reader.
45+
* By default, the maximum return batch is 1024 rows.
46+
*
47+
* @param c_packed_reader The packed reader to read.
48+
* @param out_array The output pointer of the arrow array.
49+
* @param out_schema The output pointer of the arrow schema.
50+
*/
3451
int ReadNext(CPackedReader c_packed_reader, CArrowArray* out_array, CArrowSchema* out_schema);
3552

53+
/**
54+
* @brief Close the packed reader and release the resources.
55+
*
56+
* @param c_packed_reader The packed reader to close.
57+
*/
3658
int CloseReader(CPackedReader c_packed_reader);
3759

3860
#ifdef __cplusplus

cpp/src/packed/column_group.cpp

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,6 @@
1717
#include <arrow/table.h>
1818
#include "common/status.h"
1919

20-
using namespace std;
21-
2220
namespace milvus_storage {
2321

2422
ColumnGroup::ColumnGroup(GroupId group_id, const std::vector<int>& origin_column_indices)

cpp/src/packed/reader.cpp

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ namespace milvus_storage {
3131
PackedRecordBatchReader::PackedRecordBatchReader(arrow::fs::FileSystem& fs,
3232
const std::string& file_path,
3333
const std::shared_ptr<arrow::Schema> schema,
34-
const std::set<int>& needed_columns,
34+
std::set<int>& needed_columns,
3535
const int64_t buffer_size)
3636
: file_path_(file_path),
3737
schema_(schema),
@@ -40,6 +40,19 @@ PackedRecordBatchReader::PackedRecordBatchReader(arrow::fs::FileSystem& fs,
4040
row_limit_(0),
4141
absolute_row_position_(0),
4242
read_count_(0) {
43+
initialize(fs, file_path_, schema_, needed_columns, buffer_size);
44+
}
45+
46+
void PackedRecordBatchReader::initialize(arrow::fs::FileSystem& fs,
47+
const std::string& file_path,
48+
const std::shared_ptr<arrow::Schema> schema,
49+
std::set<int>& needed_columns,
50+
const int64_t buffer_size) {
51+
if (needed_columns.empty()) {
52+
for (int i = 0; i < schema->num_fields(); i++) {
53+
needed_columns.insert(i);
54+
}
55+
}
4356
auto status = initializeColumnOffsets(fs, needed_columns, schema->num_fields());
4457
if (!status.ok()) {
4558
throw std::runtime_error(status.ToString());

cpp/src/packed/reader_c.cpp

Lines changed: 8 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -23,35 +23,11 @@
2323
#include <arrow/status.h>
2424
#include <memory>
2525

26-
int Open(const char* path, struct ArrowSchema* schema, const int64_t buffer_size, struct ArrowArrayStream* out) {
27-
auto truePath = std::string(path);
28-
auto factory = std::make_shared<milvus_storage::FileSystemFactory>();
29-
auto conf = milvus_storage::StorageConfig();
30-
conf.uri = "file:///tmp/";
31-
auto r = factory->BuildFileSystem(conf, &truePath);
32-
if (!r.ok()) {
33-
LOG_STORAGE_ERROR_ << "Error building filesystem: " << path;
34-
return -2;
35-
}
36-
auto trueFs = r.value();
37-
auto trueSchema = arrow::ImportSchema(schema).ValueOrDie();
38-
std::set<int> needed_columns;
39-
for (int i = 0; i < trueSchema->num_fields(); i++) {
40-
needed_columns.emplace(i);
41-
}
42-
auto reader =
43-
std::make_shared<milvus_storage::PackedRecordBatchReader>(*trueFs, path, trueSchema, needed_columns, buffer_size);
44-
auto status = ExportRecordBatchReader(reader, out);
45-
if (!status.ok()) {
46-
LOG_STORAGE_ERROR_ << "Error exporting record batch reader" << status.ToString();
47-
return static_cast<int>(status.code());
48-
}
49-
return 0;
50-
}
51-
5226
int NewPackedReader(const char* path,
5327
struct ArrowSchema* schema,
5428
const int64_t buffer_size,
29+
int* needed_columns,
30+
int num_needed_columns,
5531
CPackedReader* c_packed_reader) {
5632
try {
5733
auto truePath = std::string(path);
@@ -60,12 +36,13 @@ int NewPackedReader(const char* path,
6036
conf.uri = "file:///tmp/";
6137
auto trueFs = factory->BuildFileSystem(conf, &truePath).value();
6238
auto trueSchema = arrow::ImportSchema(schema).ValueOrDie();
63-
std::set<int> needed_columns;
64-
for (int i = 0; i < trueSchema->num_fields(); i++) {
65-
needed_columns.emplace(i);
39+
std::set<int> trueNeededColumns;
40+
for (int i = 0; i < num_needed_columns; i++) {
41+
trueNeededColumns.insert(needed_columns[i]);
6642
}
67-
auto reader = std::make_unique<milvus_storage::PackedRecordBatchReader>(*trueFs, path, trueSchema, needed_columns,
68-
buffer_size);
43+
44+
auto reader = std::make_unique<milvus_storage::PackedRecordBatchReader>(*trueFs, path, trueSchema,
45+
trueNeededColumns, buffer_size);
6946
*c_packed_reader = reader.release();
7047
return 0;
7148
} catch (std::exception& e) {

cpp/src/packed/writer.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,13 +97,13 @@ Status PackedRecordBatchWriter::writeWithSplitIndex(const std::shared_ptr<arrow:
9797
// Flush column groups until there's enough room for the new column groups
9898
// to ensure that memory usage stays strictly below the limit
9999
while (current_memory_usage_ + next_batch_size >= memory_limit_ && !max_heap_.empty()) {
100-
LOG_STORAGE_DEBUG_ << "Current memory usage: " << current_memory_usage_
100+
LOG_STORAGE_DEBUG_ << "Current memory usage: " << current_memory_usage_ / 1024 / 1024 << " MB, "
101101
<< ", flushing column group: " << max_heap_.top().first;
102102
auto max_group = max_heap_.top();
103+
max_heap_.pop();
103104
current_memory_usage_ -= max_group.second;
104105

105106
ColumnGroupWriter* writer = group_writers_[max_group.first].get();
106-
max_heap_.pop();
107107
RETURN_NOT_OK(writer->Flush());
108108
}
109109

cpp/test/packed/packed_integration_test.cpp

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -27,8 +27,6 @@ TEST_F(PackedIntegrationTest, TestOneFile) {
2727
}
2828
EXPECT_TRUE(writer.Close().ok());
2929

30-
std::vector<std::string> paths = {file_path_ + "/0"};
31-
3230
std::set<int> needed_columns = {0, 1, 2};
3331

3432
PackedRecordBatchReader pr(*fs_, file_path_, schema_, needed_columns, reader_memory_);
@@ -47,8 +45,6 @@ TEST_F(PackedIntegrationTest, TestSplitColumnGroup) {
4745
}
4846
EXPECT_TRUE(writer.Close().ok());
4947

50-
std::vector<std::string> paths = {file_path_ + "/0", file_path_ + "/1"};
51-
5248
std::set<int> needed_columns = {0, 1, 2};
5349

5450
PackedRecordBatchReader pr(*fs_, file_path_, schema_, needed_columns, reader_memory_);

go/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ test:
2222
LD_LIBRARY_PATH=$(MILVUS_STORAGE_LD_DIR):$$LD_LIBRARY_PATH \
2323
CGO_CFLAGS="$(CPPFLAGS)" \
2424
CGO_LDFLAGS="$(LDFLAGS) -lmilvus-storage" \
25-
go test -count=1 -timeout 30s ./...
25+
go test -count=1 -timeout 30s ./... -gcflags "all=-N -l" -o gdb/
2626

2727
proto:
2828
mkdir -p proto/manifest_proto

go/packed/packed_option.go

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
// Copyright 2023 Zilliz
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package packed
16+
17+
type packedReaderOption struct {
18+
needColumns []int
19+
}
20+
21+
func (opt *packedReaderOption) WithNeededColumns(columns []int) {
22+
opt.needColumns = columns
23+
}
24+
25+
func NewPackedReaderOption() *packedReaderOption {
26+
return &packedReaderOption{}
27+
}

go/packed/packed_reader.go

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ import (
3131
"github.com/apache/arrow/go/v12/arrow/cdata"
3232
)
3333

34-
func newPackedReader(path string, schema *arrow.Schema, bufferSize int) (*PackedReader, error) {
34+
func newPackedReader(path string, schema *arrow.Schema, bufferSize int, opt *packedReaderOption) (*PackedReader, error) {
3535
var cas cdata.CArrowSchema
3636
cdata.ExportArrowSchema(schema, &cas)
3737
cSchema := (*C.struct_ArrowSchema)(unsafe.Pointer(&cas))
@@ -42,7 +42,12 @@ func newPackedReader(path string, schema *arrow.Schema, bufferSize int) (*Packed
4242
cBufferSize := C.int64_t(bufferSize)
4343

4444
var cPackedReader C.CPackedReader
45-
status := C.NewPackedReader(cPath, cSchema, cBufferSize, &cPackedReader)
45+
cNeedColumns := (*C.int)(C.malloc(C.size_t(len(opt.needColumns)) * C.size_t(unsafe.Sizeof(C.int(0)))))
46+
for i, col := range opt.needColumns {
47+
(*[1<<31 - 1]C.int)(unsafe.Pointer(cNeedColumns))[i] = C.int(col)
48+
}
49+
cNumNeededColumns := C.int(len(opt.needColumns))
50+
status := C.NewPackedReader(cPath, cSchema, cBufferSize, cNeedColumns, cNumNeededColumns, &cPackedReader)
4651
if status != 0 {
4752
return nil, errors.New(fmt.Sprintf("failed to new packed reader: %s, status: %d", path, status))
4853
}

0 commit comments

Comments
 (0)