@@ -445,16 +445,22 @@ class BlockParsingOperator {
445
445
num_rows_seen_ += parser->total_num_rows ();
446
446
}
447
447
448
- RETURN_NOT_OK (block.consume_bytes (parsed_size));
448
+ if (block.consume_bytes ) {
449
+ RETURN_NOT_OK (block.consume_bytes (parsed_size));
450
+ }
449
451
return ParsedBlock{std::move (parser), block.block_index ,
450
452
static_cast <int64_t >(parsed_size) + block.bytes_skipped };
451
453
}
452
454
455
+ int64_t num_rows_seen () const { return num_rows_seen_; }
456
+
457
+ int num_csv_cols () const { return num_csv_cols_; }
458
+
453
459
private:
454
460
io::IOContext io_context_;
455
- ParseOptions parse_options_;
456
- int num_csv_cols_;
457
- bool count_rows_;
461
+ const ParseOptions parse_options_;
462
+ const int num_csv_cols_;
463
+ const bool count_rows_;
458
464
int64_t num_rows_seen_;
459
465
};
460
466
@@ -570,7 +576,6 @@ class ReaderMixin {
570
576
parse_options_(parse_options),
571
577
convert_options_(convert_options),
572
578
count_rows_(count_rows),
573
- num_rows_seen_(count_rows_ ? 1 : -1 ),
574
579
input_(std::move(input)) {}
575
580
576
581
protected:
@@ -581,6 +586,7 @@ class ReaderMixin {
581
586
const uint8_t * data = buf->data ();
582
587
const auto data_end = data + buf->size ();
583
588
DCHECK_GT (data_end - data, 0 );
589
+ int64_t num_rows_seen = 1 ;
584
590
585
591
if (read_options_.skip_rows ) {
586
592
// Skip initial rows (potentially invalid CSV data)
@@ -593,14 +599,14 @@ class ReaderMixin {
593
599
" either file is too short or header is larger than block size" );
594
600
}
595
601
if (count_rows_) {
596
- num_rows_seen_ += num_skipped_rows;
602
+ num_rows_seen += num_skipped_rows;
597
603
}
598
604
}
599
605
600
606
if (read_options_.column_names .empty ()) {
601
607
// Parse one row (either to read column names or to know the number of columns)
602
- BlockParser parser (io_context_.pool (), parse_options_, num_csv_cols_ ,
603
- num_rows_seen_, 1 );
608
+ BlockParser parser (io_context_.pool (), parse_options_, /* num_cols= */ - 1 ,
609
+ /* first_row= */ num_rows_seen, /* max_num_rows= */ 1 );
604
610
uint32_t parsed_size = 0 ;
605
611
RETURN_NOT_OK (parser.Parse (
606
612
std::string_view (reinterpret_cast <const char *>(data), data_end - data),
@@ -627,7 +633,7 @@ class ReaderMixin {
627
633
// Skip parsed header row
628
634
data += parsed_size;
629
635
if (count_rows_) {
630
- ++num_rows_seen_ ;
636
+ ++num_rows_seen ;
631
637
}
632
638
}
633
639
} else {
@@ -636,14 +642,17 @@ class ReaderMixin {
636
642
637
643
if (count_rows_) {
638
644
// increase rows seen to skip past rows which will be skipped
639
- num_rows_seen_ += read_options_.skip_rows_after_names ;
645
+ num_rows_seen += read_options_.skip_rows_after_names ;
640
646
}
641
647
642
648
auto bytes_consumed = data - buf->data ();
643
649
*rest = SliceBuffer (buf, bytes_consumed);
644
650
645
- num_csv_cols_ = static_cast <int32_t >(column_names_.size ());
646
- DCHECK_GT (num_csv_cols_, 0 );
651
+ int32_t num_csv_cols = static_cast <int32_t >(column_names_.size ());
652
+ DCHECK_GT (num_csv_cols, 0 );
653
+ // Since we know the number of columns, we can instantiate the BlockParsingOperator
654
+ parsing_operator_.emplace (io_context_, parse_options_, num_csv_cols,
655
+ count_rows_ ? num_rows_seen : -1 );
647
656
648
657
RETURN_NOT_OK (MakeConversionSchema ());
649
658
return bytes_consumed;
@@ -691,7 +700,7 @@ class ReaderMixin {
691
700
692
701
if (convert_options_.include_columns .empty ()) {
693
702
// Include all columns in CSV file order
694
- for (int32_t col_index = 0 ; col_index < num_csv_cols_ ; ++col_index) {
703
+ for (int32_t col_index = 0 ; col_index < num_csv_cols () ; ++col_index) {
695
704
append_csv_column (column_names_[col_index], col_index);
696
705
}
697
706
} else {
@@ -719,66 +728,25 @@ class ReaderMixin {
719
728
return Status::OK ();
720
729
}
721
730
722
- struct ParseResult {
723
- std::shared_ptr<BlockParser> parser;
724
- int64_t parsed_bytes;
725
- };
726
-
727
- Result<ParseResult> Parse (const std::shared_ptr<Buffer>& partial,
728
- const std::shared_ptr<Buffer>& completion,
729
- const std::shared_ptr<Buffer>& block, int64_t block_index,
730
- bool is_final) {
731
- static constexpr int32_t max_num_rows = std::numeric_limits<int32_t >::max ();
732
- auto parser = std::make_shared<BlockParser>(
733
- io_context_.pool (), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows);
734
-
735
- std::shared_ptr<Buffer> straddling;
736
- std::vector<std::string_view> views;
737
- if (partial->size () != 0 || completion->size () != 0 ) {
738
- if (partial->size () == 0 ) {
739
- straddling = completion;
740
- } else if (completion->size () == 0 ) {
741
- straddling = partial;
742
- } else {
743
- ARROW_ASSIGN_OR_RAISE (
744
- straddling, ConcatenateBuffers ({partial, completion}, io_context_.pool ()));
745
- }
746
- views = {std::string_view (*straddling), std::string_view (*block)};
747
- } else {
748
- views = {std::string_view (*block)};
749
- }
750
- uint32_t parsed_size;
751
- if (is_final) {
752
- RETURN_NOT_OK (parser->ParseFinal (views, &parsed_size));
753
- } else {
754
- RETURN_NOT_OK (parser->Parse (views, &parsed_size));
755
- }
756
- // See BlockParsingOperator for explanation.
757
- const int64_t bytes_before_buffer = partial->size () + completion->size ();
758
- if (static_cast <int64_t >(parsed_size) < bytes_before_buffer) {
759
- return Status::Invalid (
760
- " CSV parser got out of sync with chunker. This can mean the data file "
761
- " contains cell values spanning multiple lines; please consider enabling "
762
- " the option 'newlines_in_values'." );
763
- }
731
+ Result<ParsedBlock> Parse (const CSVBlock& block) {
732
+ DCHECK (parsing_operator_.has_value ());
733
+ return (*parsing_operator_)(block);
734
+ }
764
735
765
- if (count_rows_) {
766
- num_rows_seen_ += parser->total_num_rows ();
767
- }
768
- return ParseResult{std::move (parser), static_cast <int64_t >(parsed_size)};
736
+ int num_csv_cols () const {
737
+ DCHECK (parsing_operator_.has_value ());
738
+ return parsing_operator_->num_csv_cols ();
769
739
}
770
740
771
741
io::IOContext io_context_;
772
- ReadOptions read_options_;
773
- ParseOptions parse_options_;
774
- ConvertOptions convert_options_;
775
-
776
- // Number of columns in the CSV file
777
- int32_t num_csv_cols_ = -1 ;
778
- // Whether num_rows_seen_ tracks the number of rows seen in the CSV being parsed
779
- bool count_rows_;
780
- // Number of rows seen in the csv. Not used if count_rows is false
781
- int64_t num_rows_seen_;
742
+ const ReadOptions read_options_;
743
+ const ParseOptions parse_options_;
744
+ const ConvertOptions convert_options_;
745
+ // Whether to track the number of rows seen in the CSV being parsed
746
+ const bool count_rows_;
747
+
748
+ std::optional<BlockParsingOperator> parsing_operator_;
749
+
782
750
// Column names in the CSV file
783
751
std::vector<std::string> column_names_;
784
752
ConversionSchema conversion_schema_;
@@ -822,14 +790,10 @@ class BaseTableReader : public ReaderMixin, public csv::TableReader {
822
790
return Status::OK ();
823
791
}
824
792
825
- Result<int64_t > ParseAndInsert (const std::shared_ptr<Buffer>& partial,
826
- const std::shared_ptr<Buffer>& completion,
827
- const std::shared_ptr<Buffer>& block,
828
- int64_t block_index, bool is_final) {
829
- ARROW_ASSIGN_OR_RAISE (auto result,
830
- Parse (partial, completion, block, block_index, is_final));
831
- RETURN_NOT_OK (ProcessData (result.parser , block_index));
832
- return result.parsed_bytes ;
793
+ Status ParseAndInsert (const CSVBlock& block) {
794
+ ARROW_ASSIGN_OR_RAISE (auto result, Parse (block));
795
+ RETURN_NOT_OK (ProcessData (result.parser , result.block_index ));
796
+ return Status::OK ();
833
797
}
834
798
835
799
// Trigger conversion of parsed block data
@@ -921,17 +885,14 @@ class StreamingReaderImpl : public ReaderMixin,
921
885
ProcessHeader (first_buffer, &after_header));
922
886
bytes_decoded_->fetch_add (header_bytes_consumed);
923
887
924
- auto parser_op =
925
- BlockParsingOperator (io_context_, parse_options_, num_csv_cols_, num_rows_seen_);
926
888
ARROW_ASSIGN_OR_RAISE (
927
889
auto decoder_op,
928
890
BlockDecodingOperator::Make (io_context_, convert_options_, conversion_schema_));
929
891
930
892
auto block_gen = SerialBlockReader::MakeAsyncIterator (
931
893
std::move (buffer_generator), MakeChunker (parse_options_), std::move (after_header),
932
894
read_options_.skip_rows_after_names );
933
- auto parsed_block_gen =
934
- MakeMappedGenerator (std::move (block_gen), std::move (parser_op));
895
+ auto parsed_block_gen = MakeMappedGenerator (std::move (block_gen), *parsing_operator_);
935
896
auto rb_gen = MakeMappedGenerator (std::move (parsed_block_gen), std::move (decoder_op));
936
897
937
898
auto self = shared_from_this ();
@@ -1035,11 +996,7 @@ class SerialTableReader : public BaseTableReader {
1035
996
// EOF
1036
997
break ;
1037
998
}
1038
- ARROW_ASSIGN_OR_RAISE (
1039
- int64_t parsed_bytes,
1040
- ParseAndInsert (maybe_block.partial , maybe_block.completion , maybe_block.buffer ,
1041
- maybe_block.block_index , maybe_block.is_final ));
1042
- RETURN_NOT_OK (maybe_block.consume_bytes (parsed_bytes));
999
+ RETURN_NOT_OK (ParseAndInsert (maybe_block));
1043
1000
}
1044
1001
// Finish conversion, create schema and table
1045
1002
RETURN_NOT_OK (task_group_->Finish ());
@@ -1110,13 +1067,8 @@ class AsyncThreadedTableReader
1110
1067
DCHECK (!maybe_block.consume_bytes );
1111
1068
1112
1069
// Launch parse task
1113
- self->task_group_ ->Append ([self, maybe_block] {
1114
- return self
1115
- ->ParseAndInsert (maybe_block.partial , maybe_block.completion ,
1116
- maybe_block.buffer , maybe_block.block_index ,
1117
- maybe_block.is_final )
1118
- .status ();
1119
- });
1070
+ self->task_group_ ->Append (
1071
+ [self, maybe_block] { return self->ParseAndInsert (maybe_block); });
1120
1072
return Status::OK ();
1121
1073
};
1122
1074
@@ -1239,12 +1191,8 @@ class CSVRowCounter : public ReaderMixin,
1239
1191
// IterationEnd.
1240
1192
std::function<Result<std::optional<int64_t >>(const CSVBlock&)> count_cb =
1241
1193
[self](const CSVBlock& maybe_block) -> Result<std::optional<int64_t >> {
1242
- ARROW_ASSIGN_OR_RAISE (
1243
- auto parser,
1244
- self->Parse (maybe_block.partial , maybe_block.completion , maybe_block.buffer ,
1245
- maybe_block.block_index , maybe_block.is_final ));
1246
- RETURN_NOT_OK (maybe_block.consume_bytes (parser.parsed_bytes ));
1247
- int32_t total_row_count = parser.parser ->total_num_rows ();
1194
+ ARROW_ASSIGN_OR_RAISE (auto parsed_block, self->Parse (maybe_block));
1195
+ int32_t total_row_count = parsed_block.parser ->total_num_rows ();
1248
1196
self->row_count_ += total_row_count;
1249
1197
return total_row_count;
1250
1198
};
0 commit comments