Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate DelimFile headers #17

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ flate2 = "^1"
# For auto-serialization of structs to csv/tsv
csv = "^1"
serde = { version = "^1.0.123", features = ["derive"] }
serde-aux = "^4"
Copy link
Member Author

@theJasonFan theJasonFan Apr 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Used for serde / field name introspection. fqtk uses the same crate.


[dev-dependencies]
tempfile = "3.2.0"
Expand Down
126 changes: 125 additions & 1 deletion src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
//! Ok(())
//! }
//! ```
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::marker::PhantomData;
Expand Down Expand Up @@ -171,8 +172,17 @@ impl<D: DeserializeOwned> DelimFileReader<D> {
.has_headers(true)
.quoting(quote)
.from_reader(reader);
assert!(csv_reader.has_headers(), "Expected input file to have a header row");
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed this assert. It wasn't actually checking for a header line but instead just checks that the reader is configured to read a header.


// NB: csv_reader.has_header() does not actually check for existence of a header, but only
// checks that the reader is configured to read a header.

// Empty files are valid but will have empty headers.
// So validate the header only if one is found for a non-empty file.
let header = csv_reader.headers().map_err(FgError::ConversionError)?.to_owned();
if !header.is_empty() {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty files return empty headers, so we validate headers for only non-empty TSVs.

Self::validate_header(&header, delimiter)?
}

let record_iter = csv_reader.into_deserialize();
Ok(Self { record_iter, header })
}
Expand All @@ -186,6 +196,24 @@ impl<D: DeserializeOwned> DelimFileReader<D> {
pub fn read(&mut self) -> Option<Result<D>> {
self.record_iter.next().map(|result| result.map_err(FgError::ConversionError))
}

fn validate_header(header: &StringRecord, delimiter: u8) -> Result<()> {
let delim = String::from_utf8(vec![delimiter]).unwrap();
let found_header_parts: HashSet<&str> = header.iter().collect();
let expected_header_parts = serde_aux::prelude::serde_introspect::<D>();

// Expected header fields must be a _subset_ of found header fields
let ok = expected_header_parts.iter().all(|field| found_header_parts.contains(field));
if !ok {
let found_header_parts: Vec<&str> = header.iter().collect();
return Err(FgError::DelimFileHeaderError {
expected: expected_header_parts.join(&delim),
found: found_header_parts.join(&delim),
});
}

Ok(())
}
}

impl<D: DeserializeOwned> Iterator for DelimFileReader<D> {
Expand Down Expand Up @@ -342,6 +370,7 @@ impl DelimFile {

#[cfg(test)]
mod tests {
use super::*;
use crate::io::{DelimFile, Io};
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
Expand All @@ -355,6 +384,25 @@ mod tests {
o: Option<f64>,
}

// Trickier record types in which fields are skipped in de/serialization
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct RecWithSkipDe {
s: String,
i: usize,
b: bool,
#[serde(skip_deserializing)]
o: Option<f64>,
}

#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct RecWithSkipSe {
s: String,
i: usize,
b: bool,
#[serde(skip_serializing)]
o: Option<f64>,
}

#[test]
fn test_reading_and_writing_lines_to_file() {
let lines = vec!["foo", "bar,splat,whee", "baz\twhoopsie"];
Expand Down Expand Up @@ -431,4 +479,80 @@ mod tests {
assert_eq!(from_csv, recs);
assert_eq!(from_tsv, recs);
}

#[test]
fn test_skip_empty_lines() {
// Check to see that csv readers skip empty lines
let lines = vec!["", "", "s,i,b,o", "", "hello,123,true,123.4"];
let tempdir = TempDir::new().unwrap();

let csv = tempdir.path().join("bad_header.csv");
let io = Io::default();
io.write_lines(&csv, lines).unwrap();

let df = DelimFile::default();
let result: Result<Vec<Rec>> = df.read_csv(&csv);
let from_csv = result.unwrap();
assert_eq!(from_csv[0], Rec { s: "hello".to_owned(), i: 123, b: true, o: Some(123.4) })
}

#[test]
fn test_header_error() {
let lines = vec!["s,i,b,o", "hello,123,true,123.4"];
let tempdir = TempDir::new().unwrap();
let csv = tempdir.path().join("bad_header.csv");
let io = Io::default();
io.write_lines(&csv, lines).unwrap();

let df = DelimFile::default();
let result: Result<Vec<RecWithSkipDe>> = df.read_tsv(&csv);
let err = result.unwrap_err();

// All fields should be serialized, deserialization expects to skip "o"
if let FgError::DelimFileHeaderError { expected, found } = err {
assert_eq!(expected, "s\ti\tb");
assert_eq!(found, "s,i,b,o");
} else {
panic!()
}

let lines = vec!["s,i,b", "hello,123,true"];
let tempdir = TempDir::new().unwrap();
let csv = tempdir.path().join("bad_header.csv");
let io = Io::default();
io.write_lines(&csv, lines).unwrap();

let df = DelimFile::default();
let result: Result<Vec<RecWithSkipSe>> = df.read_tsv(&csv);
let err = result.unwrap_err();

// All fields but "o" should be serialized, deserialization should expect all fields
if let FgError::DelimFileHeaderError { expected, found } = err {
assert_eq!(expected, "s\ti\tb\to");
assert_eq!(found, "s,i,b");
} else {
panic!()
}
}

#[test]
fn test_header_missing() {
theJasonFan marked this conversation as resolved.
Show resolved Hide resolved
let lines = vec!["", "hello,123,true,123.4"];
let tempdir = TempDir::new().unwrap();
let csv = tempdir.path().join("bad_header.csv");
let io = Io::default();
io.write_lines(&csv, &lines).unwrap();

let df = DelimFile::default();
let result: Result<Vec<Rec>> = df.read_csv(&csv);
let err = result.unwrap_err();

if let FgError::DelimFileHeaderError { expected, found } = err {
assert_eq!(expected, "s,i,b,o");
// NB: empty lines are skipped
assert_eq!(found, lines[1].to_owned());
} else {
panic!()
}
}
}
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pub enum FgError {

#[error("Error parsing/formatting delimited data.")]
ConversionError(#[from] csv::Error),

#[error("Error parsing delimited data file header.")]
DelimFileHeaderError { expected: String, found: String },
}

/// Result type that should be used everywhere
Expand Down
Loading