Skip to content

Commit 525d2ec

Browse files
committed
Use read lock after truncate checkpoint to prevent races
The previous implementation attempted to hold checkpoint locks during file copy by adding a keep_lock param to the checkpoint function. However, checkpoint can have multiple early exit paths like empty WAL where no lock is acquired. Also, it clutters the checkpoint function. This implementation executes TRUNCATE checkpoint and then acquires read lock on WAL before copying database file. This ensures new data is written to WAL and new Checkpoint can not be taken before this read lock is released. Signed-off-by: Prateek Singh Rathore <[email protected]>
1 parent 510cb7b commit 525d2ec

File tree

2 files changed

+32
-52
lines changed

2 files changed

+32
-52
lines changed

core/lib.rs

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2025,11 +2025,13 @@ impl Connection {
20252025

20262026
/// Create a point-in-time snapshot of the database by copying the database file.
20272027
///
2028-
/// It checkpoints and keeps the lock after checkpointing to avoid race window between
2029-
/// checkpoint completion and file copy operation where concurrent writers could modify
2030-
/// the database.
2028+
/// This function:
2029+
/// 1. Performs a TRUNCATE checkpoint to flush all WAL data to the database file
2030+
/// 2. Acquires a read lock to prevent other checkpoints during copy
2031+
/// 3. Copies the database file (new writes go to WAL, not DB file)
2032+
/// 4. Releases the read lock
20312033
#[cfg(feature = "fs")]
2032-
pub fn _snapshot_copy_file(self: &Arc<Self>, output_path: &str) -> Result<()> {
2034+
fn _snapshot_copy_file(self: &Arc<Self>, output_path: &str) -> Result<()> {
20332035
use crate::util::MEMORY_PATH;
20342036
use std::path::Path;
20352037

@@ -2064,23 +2066,36 @@ impl Connection {
20642066
}
20652067

20662068
let pager = self.pager.load();
2067-
let result = (|| -> Result<()> {
2068-
// Checkpoint and keep the lock
2069-
let _res = pager.blocking_checkpoint_keep_lock(
2070-
CheckpointMode::Truncate {
2071-
upper_bound_inclusive: None,
2072-
},
2073-
self.get_sync_mode(),
2074-
)?;
2069+
let Some(wal) = pager.wal.as_ref() else {
2070+
return Err(LimboError::InternalError(
2071+
"Cannot snapshot database without WAL".to_string(),
2072+
));
2073+
};
20752074

2075+
// TRUNCATE checkpoint - flushes all WAL data to DB file and empties WAL
2076+
let _ = pager.blocking_checkpoint(
2077+
CheckpointMode::Truncate {
2078+
upper_bound_inclusive: None,
2079+
},
2080+
self.get_sync_mode(),
2081+
)?;
2082+
2083+
// Acquire read lock - prevents other checkpoints during copy
2084+
// After TRUNCATE, DB file is complete. New writes go to WAL only, not DB file.
2085+
pager.begin_read_tx()?;
2086+
2087+
// Copy the database file
2088+
let result = (|| -> Result<()> {
20762089
let source_path = Path::new(&self.db.path);
20772090
std::fs::copy(source_path, path).map_err(|e| {
20782091
LimboError::InternalError(format!("Failed to copy database file: {e}"))
20792092
})?;
2080-
2081-
Ok(()) // lock is dropped here when _res is dropped here
2093+
Ok(())
20822094
})();
20832095

2096+
// Release read lock
2097+
wal.end_read_tx();
2098+
20842099
result
20852100
}
20862101

core/storage/pager.rs

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2856,40 +2856,17 @@ impl Pager {
28562856
}
28572857
}
28582858

2859-
#[instrument(skip_all, level = Level::DEBUG)]
2860-
/// checkpoint_keep_lock checkpoints the WAL to the database file (if needed) and keeps the lock
2861-
pub fn checkpoint_keep_lock(
2862-
&self,
2863-
mode: CheckpointMode,
2864-
sync_mode: crate::SyncMode,
2865-
clear_page_cache: bool,
2866-
) -> Result<IOResult<CheckpointResult>> {
2867-
self.checkpoint_inner(mode, sync_mode, clear_page_cache, true)
2868-
}
2869-
2870-
#[instrument(skip_all, level = Level::DEBUG)]
2871-
/// checkpoint checkpoints the WAL to the database file (if needed) and drops the lock
2872-
pub fn checkpoint(
2873-
&self,
2874-
mode: CheckpointMode,
2875-
sync_mode: crate::SyncMode,
2876-
clear_page_cache: bool,
2877-
) -> Result<IOResult<CheckpointResult>> {
2878-
self.checkpoint_inner(mode, sync_mode, clear_page_cache, false)
2879-
}
2880-
28812859
#[instrument(skip_all, level = Level::DEBUG, name = "pager_checkpoint",)]
28822860
/// Checkpoint the WAL to the database file (if needed).
28832861
/// Args:
28842862
/// - mode: The checkpoint mode to use (PASSIVE, FULL, RESTART, TRUNCATE)
28852863
/// - sync_mode: The fsync mode to use (OFF, NORMAL, FULL)
28862864
/// - clear_page_cache: Whether to clear the page cache after checkpointing
2887-
fn checkpoint_inner(
2865+
pub fn checkpoint(
28882866
&self,
28892867
mode: CheckpointMode,
28902868
sync_mode: crate::SyncMode,
28912869
clear_page_cache: bool,
2892-
keep_lock: bool,
28932870
) -> Result<IOResult<CheckpointResult>> {
28942871
let Some(wal) = self.wal.as_ref() else {
28952872
return Err(LimboError::InternalError(
@@ -3017,10 +2994,8 @@ impl Pager {
30172994
let mut state = self.checkpoint_state.write();
30182995
let mut res = state.result.take().expect("result should be set");
30192996

3020-
// Release checkpoint guard if lock is not to be kept
3021-
if !keep_lock {
3022-
res.release_guard();
3023-
}
2997+
// Release checkpoint guard
2998+
res.release_guard();
30242999

30253000
// Clear page cache only if requested (explicit checkpoints do this, auto-checkpoint does not)
30263001
if clear_page_cache {
@@ -3117,16 +3092,6 @@ impl Pager {
31173092
self.io.block(|| self.checkpoint(mode, sync_mode, true))
31183093
}
31193094

3120-
#[instrument(skip_all, level = Level::DEBUG)]
3121-
pub fn blocking_checkpoint_keep_lock(
3122-
&self,
3123-
mode: CheckpointMode,
3124-
sync_mode: crate::SyncMode,
3125-
) -> Result<CheckpointResult> {
3126-
self.io
3127-
.block(|| self.checkpoint_keep_lock(mode, sync_mode, true))
3128-
}
3129-
31303095
pub fn freepage_list(&self) -> u32 {
31313096
self.io
31323097
.block(|| HeaderRefMut::from_pager(self))

0 commit comments

Comments
 (0)