Skip to content

Commit be8c086

Browse files
committed
Merge branch 'gpimm/bump_write_buffer_size' into 'master'
Add writer options for direct io. See merge request minknow/pod5-file-format!352
2 parents 44281d5 + 5abd929 commit be8c086

File tree

7 files changed

+54
-21
lines changed

7 files changed

+54
-21
lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,18 @@ All notable changes, updates, and fixes to pod5 will be documented here
77
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
88
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
99

10-
## [0.3.14]
10+
## [0.3.15]
1111

1212
## Added
1313

1414
- Added new end reasons "api_request" and "device_data_error" to support the new read end reasons that future MinKNOW versions will generate.
15+
- Allow the chunk size used for direct IO writes to be specified directly in the writer options.
16+
17+
## [0.3.14]
18+
19+
## Added
20+
21+
- gcc8 builds
1522

1623
## [0.3.13]
1724

c++/pod5_format/file_writer.cpp

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,19 @@
2929

3030
namespace {
3131
/// Open a file using the specified path and return it
32-
arrow::Result<std::shared_ptr<arrow::io::OutputStream>>
33-
open_file_output_stream(std::string const & path, bool append, bool use_directio = true)
32+
arrow::Result<std::shared_ptr<arrow::io::OutputStream>> open_file_output_stream(
33+
std::string const & path,
34+
bool append,
35+
bool use_directio = true,
36+
bool use_sync_io = false)
3437
{
3538
#ifdef __linux__
3639
auto flags = use_directio ? O_RDWR | O_DIRECT : O_RDWR;
3740

3841
flags |= (append == true ? O_APPEND : O_CREAT);
42+
if (use_sync_io) {
43+
flags |= O_SYNC;
44+
}
3945

4046
int fd = open(path.c_str(), flags, 0644);
4147

@@ -55,11 +61,13 @@ open_file_output_stream(std::string const & path, bool append, bool use_directio
5561
std::shared_ptr<arrow::io::OutputStream> make_async_stream(
5662
std::shared_ptr<arrow::io::OutputStream> const & io_stream,
5763
std::shared_ptr<pod5::ThreadPool> thread_pool,
58-
bool use_directio = true)
64+
bool use_directio,
65+
std::size_t directio_chunk_size)
5966
{
6067
#ifdef __linux__
6168
if (use_directio) {
62-
return std::make_shared<pod5::AsyncOutputStreamDirectIO>(io_stream, thread_pool);
69+
return std::make_shared<pod5::AsyncOutputStreamDirectIO>(
70+
io_stream, thread_pool, directio_chunk_size);
6371
} else {
6472
return std::make_shared<pod5::AsyncOutputStream>(io_stream, thread_pool);
6573
}
@@ -79,6 +87,8 @@ FileWriterOptions::FileWriterOptions()
7987
, m_read_table_batch_size(DEFAULT_READ_TABLE_BATCH_SIZE)
8088
, m_run_info_table_batch_size(DEFAULT_RUN_INFO_TABLE_BATCH_SIZE)
8189
, m_use_directio{DEFAULT_USE_DIRECTIO}
90+
, m_directio_chunk_size(DEFAULT_DIRECTIO_CHUNK_SIZE)
91+
, m_use_sync_io(DEFAULT_USE_SYNC_IO)
8292
{
8393
}
8494

@@ -552,13 +562,14 @@ pod5::Result<std::unique_ptr<FileWriter>> create_file_writer(
552562
auto run_info_tmp_path = make_run_info_tmp_path(arrow_path, file_identifier);
553563

554564
bool const use_directio = options.use_directio();
565+
bool const use_sync_io = options.use_sync_io();
555566

556567
// Prepare the temporary reads file:
557568
ARROW_ASSIGN_OR_RAISE(
558569
auto read_table_file_stream,
559-
::open_file_output_stream(reads_tmp_path, false, use_directio));
560-
auto read_table_file_async =
561-
::make_async_stream(read_table_file_stream, thread_pool, use_directio);
570+
::open_file_output_stream(reads_tmp_path, false, use_directio, use_sync_io));
571+
auto read_table_file_async = ::make_async_stream(
572+
read_table_file_stream, thread_pool, use_directio, options.directio_chunk_size());
562573
ARROW_ASSIGN_OR_RAISE(
563574
auto read_table_tmp_writer,
564575
make_read_table_writer(
@@ -573,9 +584,9 @@ pod5::Result<std::unique_ptr<FileWriter>> create_file_writer(
573584
// Prepare the temporary run_info file:
574585
ARROW_ASSIGN_OR_RAISE(
575586
auto run_info_table_file_stream,
576-
::open_file_output_stream(run_info_tmp_path, false, use_directio));
577-
auto run_info_table_file_async =
578-
::make_async_stream(run_info_table_file_stream, thread_pool, use_directio);
587+
::open_file_output_stream(run_info_tmp_path, false, use_directio, use_sync_io));
588+
auto run_info_table_file_async = ::make_async_stream(
589+
run_info_table_file_stream, thread_pool, use_directio, options.directio_chunk_size());
579590

580591
ARROW_ASSIGN_OR_RAISE(
581592
auto run_info_table_tmp_writer,
@@ -587,8 +598,10 @@ pod5::Result<std::unique_ptr<FileWriter>> create_file_writer(
587598

588599
// Prepare the main file - and set up the signal table to write here:
589600
ARROW_ASSIGN_OR_RAISE(
590-
auto signal_table_file_stream, ::open_file_output_stream(path, false, use_directio));
591-
auto signal_file = ::make_async_stream(signal_table_file_stream, thread_pool, use_directio);
601+
auto signal_table_file_stream,
602+
::open_file_output_stream(path, false, use_directio, use_sync_io));
603+
auto signal_file = ::make_async_stream(
604+
signal_table_file_stream, thread_pool, use_directio, options.directio_chunk_size());
592605

593606
// Write the initial header to the combined file:
594607
ARROW_RETURN_NOT_OK(combined_file_utils::write_combined_header(signal_file, section_marker));

c++/pod5_format/file_writer.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class POD5_FORMAT_EXPORT FileWriterOptions {
2626
static constexpr std::uint32_t DEFAULT_RUN_INFO_TABLE_BATCH_SIZE = 1;
2727
static constexpr SignalType DEFAULT_SIGNAL_TYPE = SignalType::VbzSignal;
2828
static constexpr bool DEFAULT_USE_DIRECTIO = false;
29+
static constexpr bool DEFAULT_USE_SYNC_IO = false;
30+
static constexpr std::size_t DEFAULT_DIRECTIO_CHUNK_SIZE = 2 * 1024 * 1024;
2931

3032
FileWriterOptions();
3133

@@ -73,6 +75,14 @@ class POD5_FORMAT_EXPORT FileWriterOptions {
7375

7476
/// Whether direct IO is requested for the writer's output files.
/// create_file_writer reads this to choose between opening files with O_DIRECT and
/// wrapping them in AsyncOutputStreamDirectIO, or using the regular AsyncOutputStream
/// (the distinction is only made in the Linux code path).
bool use_directio() const { return m_use_directio; }
7577

78+
/// Set the chunk size handed to the direct IO output stream, which uses it to size
/// its aligned write buffer. Only consulted when direct IO is enabled.
/// No validation is performed here.
/// NOTE(review): presumably a byte count that should be a non-zero multiple of the
/// stream's 4096-byte alignment - confirm against AsyncOutputStreamDirectIO.
void set_directio_chunk_size(std::size_t chunk_size) { m_directio_chunk_size = chunk_size; }
79+
80+
/// Chunk size passed to the direct IO output stream when direct IO is enabled.
/// Initialised to DEFAULT_DIRECTIO_CHUNK_SIZE by the constructor.
std::size_t directio_chunk_size() const { return m_directio_chunk_size; }
81+
82+
/// Enable or disable synchronous IO for the writer's output files.
/// When enabled, the files are opened with O_SYNC added to the open flags
/// (applied in the Linux code path of open_file_output_stream).
void set_use_sync_io(bool use_sync_io) { m_use_sync_io = use_sync_io; }
83+
84+
/// Whether the writer's output files should be opened with O_SYNC.
/// Initialised to DEFAULT_USE_SYNC_IO by the constructor.
bool use_sync_io() const { return m_use_sync_io; }
85+
7686
private:
7787
std::shared_ptr<ThreadPool> m_writer_thread_pool;
7888
std::uint32_t m_max_signal_chunk_size;
@@ -82,6 +92,8 @@ class POD5_FORMAT_EXPORT FileWriterOptions {
8292
std::size_t m_read_table_batch_size;
8393
std::size_t m_run_info_table_batch_size;
8494
bool m_use_directio;
95+
std::size_t m_directio_chunk_size;
96+
bool m_use_sync_io;
8597
};
8698

8799
class FileWriterImpl;

c++/pod5_format/internal/async_output_stream.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ namespace {
2222
constexpr size_t alignment = 4096; // buffer alignment (for block devices)
2323
constexpr size_t megabyte = 256 * alignment; // 1MB
2424
constexpr size_t fallocate_chunk = 50 * megabyte; // 50MB
25-
constexpr size_t write_buffer_size = megabyte; // Arbitrary limit. Seems a good trade-off
26-
// between memory usage and disk activities
2725

2826
} // namespace
2927

@@ -182,10 +180,12 @@ class AsyncOutputStreamDirectIO : public AsyncOutputStream {
182180
public:
183181
AsyncOutputStreamDirectIO(
184182
std::shared_ptr<OutputStream> const & main_stream,
185-
std::shared_ptr<ThreadPool> const & thread_pool)
183+
std::shared_ptr<ThreadPool> const & thread_pool,
184+
std::size_t write_chunk_size)
186185
: AsyncOutputStream(main_stream, thread_pool)
186+
, m_write_chunk_size(write_chunk_size)
187187
, m_fallocate_offset{0}
188-
, m_buffer(write_buffer_size, alignment)
188+
, m_buffer(m_write_chunk_size, alignment)
189189
, m_flushed_buffer_copy(alignment, 0)
190190
, m_buffer_offset{0}
191191
, m_num_blocks_written{0}
@@ -240,7 +240,7 @@ class AsyncOutputStreamDirectIO : public AsyncOutputStream {
240240
ARROW_RETURN_NOT_OK(write_cache());
241241

242242
// adjust accounting
243-
m_num_blocks_written += (write_buffer_size / alignment);
243+
m_num_blocks_written += (m_write_chunk_size / alignment);
244244
}
245245
}
246246
return arrow::Status::OK();
@@ -421,6 +421,7 @@ class AsyncOutputStreamDirectIO : public AsyncOutputStream {
421421
return arrow::Status::OK();
422422
}
423423

424+
std::size_t const m_write_chunk_size;
424425
std::size_t m_fallocate_offset;
425426
AlignedBuffer m_buffer;
426427
// copy of buffer (unaligned) which has been flushed to the output stream already,

docs/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@ sphinx-rtd-theme
22
sphinx==v5.3.0
33
myst-parser
44
# Paths are relative to project root for ReadTheDocs and docs/Makefile
5-
pod5==0.3.14
5+
pod5==0.3.15

python/pod5/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ classifiers=[
2222
]
2323

2424
dependencies = [
25-
"lib_pod5 == 0.3.14",
25+
"lib_pod5 == 0.3.15",
2626
"iso8601",
2727
'importlib-metadata; python_version<"3.8"',
2828
"more_itertools",

python/pod5/src/pod5/tools/pod5_convert_to_fast5.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"signal_positive": 5,
2626
"signal_negative": 6,
2727
"api_request": 7,
28-
"signal_negative": 6,
28+
"device_data_error": 8,
2929
}
3030

3131
# Fast5 types

0 commit comments

Comments
 (0)