-
Notifications
You must be signed in to change notification settings - Fork 136
New command: augur curate apply-date-bounds
#1828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: victorlin/support-iso-time-intervals
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,231 @@ | ||||||
| """ | ||||||
| Impose lower and/or upper bounds on dates in a column. | ||||||
| Updated values are are formatted as ISO 8601 intervals. | ||||||
| """ | ||||||
| import argparse | ||||||
| import datetime | ||||||
| from textwrap import dedent | ||||||
| from treetime.utils import datestring_from_numeric | ||||||
| from typing import Any, Dict, Iterable, Tuple, Optional, Union | ||||||
|
|
||||||
| from augur.dates import date_to_numeric, get_numerical_date_from_value | ||||||
| from augur.errors import AugurError | ||||||
| from augur.io.print import print_err, indented_list | ||||||
| from augur.types import DataErrorMethod | ||||||
| from augur.utils import first_line | ||||||
|
|
||||||
|
|
||||||
| TODAY = 'today' | ||||||
|
|
||||||
|
|
||||||
| RecordType = Dict[str, Any] | ||||||
| # maybe Dict[str, str]? | ||||||
|
|
||||||
|
|
||||||
| def register_parser(parent_subparsers): | ||||||
| parser = parent_subparsers.add_parser("apply-date-bounds", | ||||||
| parents=[parent_subparsers.shared_parser], | ||||||
| help=first_line(__doc__)) | ||||||
|
|
||||||
| required = parser.add_argument_group(title="REQUIRED") | ||||||
| required.add_argument("--date-field", metavar="NAME", | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be explicitly marked as
Suggested change
|
||||||
| help=dedent("""\ | ||||||
| Name of an existing date field to apply bounds to. Values will be | ||||||
| formatted as an interval using bounds provided by --lower-bound | ||||||
| and/or --upper-bound.""")) | ||||||
|
|
||||||
| optional = parser.add_argument_group(title="OPTIONAL") | ||||||
| optional.add_argument("--lower-bound", metavar="NAME | DATE", | ||||||
| help=dedent("""\ | ||||||
| Name of an existing date field or date to use as the lower bound for | ||||||
| --date-field (i.e. minimum).""")) | ||||||
| optional.add_argument("--upper-bound", metavar=f"NAME | DATE | {TODAY!r}", | ||||||
| help=dedent(f"""\ | ||||||
| Name of an existing date field or date to use as the upper bound for | ||||||
| --date-field (i.e. maximum). Use {TODAY!r} to set the current date as | ||||||
| the upper bound.""")) | ||||||
| optional.add_argument("--failure-reporting", | ||||||
| type=DataErrorMethod.argtype, | ||||||
| choices=list(DataErrorMethod), | ||||||
| default=DataErrorMethod.ERROR_FIRST, | ||||||
| help="How should failed date formatting be reported.") | ||||||
|
|
||||||
| return parser | ||||||
|
|
||||||
|
|
||||||
| def run(args: argparse.Namespace, records: Iterable[RecordType]): | ||||||
| validate_arguments(args) | ||||||
|
|
||||||
| failures = [] | ||||||
|
|
||||||
| for index, input_record in enumerate(records): | ||||||
| record = input_record.copy() | ||||||
| try: | ||||||
| record[args.date_field] = Record(record, index).get_bounded_date(args) | ||||||
| except DataError as error: | ||||||
| if args.failure_reporting is DataErrorMethod.SILENT: | ||||||
| continue | ||||||
| if args.failure_reporting is DataErrorMethod.ERROR_FIRST: | ||||||
| raise error | ||||||
| if args.failure_reporting is DataErrorMethod.WARN: | ||||||
| print_err(f"WARNING: {error}") | ||||||
| failures.append(error) | ||||||
| continue | ||||||
| if args.failure_reporting is DataErrorMethod.ERROR_ALL: | ||||||
| failures.append(error) | ||||||
| continue | ||||||
| else: | ||||||
| raise ValueError(f"Encountered unhandled failure reporting method: {args.failure_reporting!r}") | ||||||
| yield record | ||||||
|
|
||||||
| if args.failure_reporting is not DataErrorMethod.SILENT and failures: | ||||||
| if args.failure_reporting is DataErrorMethod.ERROR_ALL: | ||||||
| raise AugurError(dedent(f"""\ | ||||||
| Unable to apply bounds. All errors: | ||||||
| {indented_list(map(str, failures), " ")}""")) | ||||||
|
|
||||||
|
|
||||||
| def validate_arguments(args: argparse.Namespace): | ||||||
| if not args.lower_bound and not args.upper_bound: | ||||||
| raise AugurError("At least one of --lower-bound and --upper-bound is required.") | ||||||
|
|
||||||
|
|
||||||
| class Record: | ||||||
| """ | ||||||
| Helper class to wrap a record, its id, and arguments for ease of error handling and small functions. | ||||||
| """ | ||||||
|
|
||||||
| def __init__(self, data: RecordType, record_id: int) -> None: | ||||||
| self.data = data | ||||||
| self.id = record_id | ||||||
|
|
||||||
| def get_bounded_date(self, args: argparse.Namespace) -> str: | ||||||
| """ | ||||||
| Returns a date string representing the date with bounds applied. bounded interval for the given record. | ||||||
| """ | ||||||
| start, end = self.convert_date_to_range(args.date_field) | ||||||
|
|
||||||
| lower_bound, upper_bound = self.get_bounds(args.lower_bound, args.upper_bound) | ||||||
|
|
||||||
| # If any ends are unbounded, return the original date | ||||||
| if (start == float("-inf") and lower_bound is None) or \ | ||||||
| (end == float( "inf") and upper_bound is None): | ||||||
| return self.data[args.date_field] | ||||||
|
|
||||||
| # Error if start or end are out of bounds. | ||||||
| if lower_bound and start < lower_bound and end < lower_bound: | ||||||
| self.raise_data_error( | ||||||
| f"{args.date_field!r}={self.data[args.date_field]!r} " | ||||||
| f"is earlier than the lower bound of " | ||||||
| f"{args.lower_bound!r}={self.data[args.lower_bound]!r}" | ||||||
| ) | ||||||
| if upper_bound and start > upper_bound and end > upper_bound: | ||||||
| self.raise_data_error( | ||||||
| f"{args.date_field!r}={self.data[args.date_field]!r} " | ||||||
| f"is later than the upper bound of " | ||||||
| f"{args.upper_bound!r}={self.data[args.upper_bound]!r}" | ||||||
| ) | ||||||
|
Comment on lines
+116
to
+128
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
|
|
||||||
| # If the target date overlaps with the bounds, apply the bounds. | ||||||
| # The start should be no earlier than the lower bound | ||||||
| # and the end should be no later than the upper bound. | ||||||
| if lower_bound: | ||||||
| start = max(start, lower_bound) | ||||||
| if upper_bound: | ||||||
| end = min(end, upper_bound) | ||||||
|
|
||||||
| # ISO 8601 interval in <start>/<end> format | ||||||
| return f"{datestring_from_numeric(start)}/{datestring_from_numeric(end)}" | ||||||
|
Comment on lines
+138
to
+139
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think ISO 8601 explicitly supports or prohibits negative dates, but should they be allowed here? |
||||||
|
|
||||||
| def convert_date_to_range(self, date_field: str) -> Tuple[float, float]: | ||||||
| original_date = self.data.get(date_field) | ||||||
|
|
||||||
| if original_date is None: | ||||||
| self.raise_data_error( | ||||||
| f"Missing date field {date_field!r}." | ||||||
| ) | ||||||
|
Comment on lines
+142
to
+147
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, this reminded me of the issue with |
||||||
|
|
||||||
| date = get_numerical_date_from_value(original_date, fmt="%Y-%m-%d") | ||||||
|
|
||||||
| if date == None: | ||||||
| self.raise_data_error( | ||||||
| f"Unable to parse value from {date_field!r} as a date: {original_date!r}. " | ||||||
| "Consider formatting values with augur curate format-dates before applying bounds." | ||||||
| ) | ||||||
|
|
||||||
| start, end = float('-inf'), float('inf') | ||||||
|
|
||||||
| if isinstance(date, tuple): | ||||||
| start, end = date | ||||||
| elif isinstance(date, float): | ||||||
| start = date | ||||||
| end = date | ||||||
| elif isinstance(date, int): | ||||||
| start = float(date) | ||||||
| end = float(date) | ||||||
|
|
||||||
| return start, end | ||||||
|
|
||||||
| def get_bounds(self, lower_bound_field_or_value: Optional[str], upper_bound_field_or_value: Optional[str]) -> Tuple[Optional[float], Optional[float]]: | ||||||
| """ | ||||||
| Returns a tuple representing lower and upper bounds. | ||||||
| """ | ||||||
| lower_bound: Union[float, Tuple[float, float], None] = None | ||||||
| upper_bound: Union[float, Tuple[float, float], None] = None | ||||||
|
|
||||||
| if lower_bound_field_or_value is not None: | ||||||
| value = self.data.get(lower_bound_field_or_value) | ||||||
| if value is not None: | ||||||
| # Input is a field name | ||||||
| lower_bound = get_numerical_date_from_value(value, fmt="%Y-%m-%d") | ||||||
| if lower_bound is None: | ||||||
| self.raise_data_error( | ||||||
| f"Unable to parse value from {lower_bound_field_or_value!r} as a date: {value!r}. " | ||||||
| "Consider formatting values with augur curate format-dates before applying bounds." | ||||||
| ) | ||||||
| else: | ||||||
| # Try parsing as a date | ||||||
| lower_bound = get_numerical_date_from_value(lower_bound_field_or_value, fmt="%Y-%m-%d") | ||||||
| if lower_bound is None: | ||||||
| raise AugurError(f"Expected --lower-bound to be a field name or date, but got {lower_bound_field_or_value!r}.") | ||||||
|
|
||||||
| if upper_bound_field_or_value == TODAY: | ||||||
| if TODAY in self.data: | ||||||
| raise AugurError(f"{TODAY!r} is ambiguous as it is both an alias to the current date and a field name.") | ||||||
| upper_bound = date_to_numeric(datetime.date.today()) | ||||||
| elif upper_bound_field_or_value is not None: | ||||||
| value = self.data.get(upper_bound_field_or_value) | ||||||
| if value is not None: | ||||||
| # Input is a field name | ||||||
| upper_bound = get_numerical_date_from_value(value, fmt="%Y-%m-%d") | ||||||
| if upper_bound is None: | ||||||
| self.raise_data_error( | ||||||
| f"Unable to parse value from {upper_bound_field_or_value!r} as a date: {value!r}. " | ||||||
| "Consider formatting values with augur curate format-dates before applying bounds." | ||||||
| ) | ||||||
| else: | ||||||
| # Try parsing as a date | ||||||
| upper_bound = get_numerical_date_from_value(upper_bound_field_or_value, fmt="%Y-%m-%d") | ||||||
| if upper_bound is None: | ||||||
| raise AugurError(f"Expected --upper-bound to be a field name or date, but got {upper_bound_field_or_value!r}.") | ||||||
|
|
||||||
| # Resolve ranges to single values | ||||||
| if isinstance(lower_bound, tuple): | ||||||
| lower_bound = lower_bound[0] | ||||||
| if isinstance(upper_bound, tuple): | ||||||
| upper_bound = upper_bound[1] | ||||||
|
|
||||||
| return lower_bound, upper_bound | ||||||
|
|
||||||
| def raise_data_error(self, message: str) -> None: | ||||||
| raise DataError(self.id, message) | ||||||
|
|
||||||
|
|
||||||
| class DataError(AugurError): | ||||||
| def __init__(self, record_id: int, message: str): | ||||||
| self.record_id = record_id | ||||||
| self.message = message | ||||||
|
|
||||||
| def __str__(self): | ||||||
| return f"[record {self.record_id}] {self.message}" | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -154,6 +154,11 @@ def is_date_ambiguous(date, ambiguous_by): | |
| Those should be further validated by date conversion functions. | ||
| """ | ||
|
|
||
| RE_AUGUR_UNKNOWN_DATE = re.compile(r'^XXXX-XX-XX$') | ||
| """ | ||
| Matches an Augur-style unknown date. | ||
| """ | ||
|
|
||
| RE_AUGUR_AMBIGUOUS_DATE = re.compile(r'.*XX.*') | ||
| """ | ||
| Matches an Augur-style ambiguous date with 'XX' used to mask unknown parts of the date. | ||
|
|
@@ -177,13 +182,20 @@ def get_numerical_date_from_value(value, fmt, min_max_year=None) -> Union[float, | |
| except: | ||
| pass | ||
|
|
||
| # 2. Check if value is an ambiguous date in the specified format (fmt). | ||
| # 2. Check if value is an unknown date. | ||
| # This is checked before ambiguous dates since it is a subset of that with | ||
| # special handling. | ||
|
|
||
| if RE_AUGUR_UNKNOWN_DATE.match(value): | ||
| return (float("-inf"), float("inf")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm guessing this is not handled in treetime which is leading to the errors in a couple of the pathogen-repo-ci tests? |
||
|
|
||
| # 3. Check if value is an ambiguous date in the specified format (fmt). | ||
|
|
||
| if RE_AUGUR_AMBIGUOUS_DATE.match(value): | ||
| start, end = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year) | ||
| return (date_to_numeric(start), date_to_numeric(end)) | ||
|
|
||
| # 3. Check formats that are always supported. | ||
| # 4. Check formats that are always supported. | ||
|
|
||
| if RE_NUMERIC_DATE.match(value): | ||
| return float(value) | ||
|
|
@@ -216,7 +228,7 @@ def get_numerical_date_from_value(value, fmt, min_max_year=None) -> Union[float, | |
|
|
||
| return (date_to_numeric(start), date_to_numeric(end)) | ||
|
|
||
| # 4. Return none (silent error) if the date does not match any of the checked formats. | ||
| # 5. Return none (silent error) if the date does not match any of the checked formats. | ||
|
|
||
| return None | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| augur.curate.apply\_date\_bounds module | ||
| ============================================= | ||
|
|
||
| .. automodule:: augur.curate.apply_date_bounds | ||
| :members: | ||
| :show-inheritance: | ||
| :undoc-members: |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| Setup | ||
|
|
||
| $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
|
||
| Create NDJSON file for testing. | ||
|
|
||
| $ cat >records.ndjson <<~~ | ||
| > {"record": 1, "date": "2021", "collectionDate": "2020-01-23"} | ||
| > {"record": 2, "date": "2022", "collectionDate": "2020-01-23"} | ||
| > ~~ | ||
|
|
||
| The default behavior of data error handling is to stop on the first error. | ||
|
|
||
| $ cat records.ndjson \ | ||
| > | ${AUGUR} curate apply-date-bounds \ | ||
| > --date-field date \ | ||
| > --upper-bound collectionDate 1> /dev/null | ||
| ERROR: [record 0] 'date'='2021' is later than the upper bound of 'collectionDate'='2020-01-23' | ||
| [2] | ||
|
|
||
| Data errors can be batch reported all at once. | ||
|
|
||
| $ cat records.ndjson \ | ||
| > | ${AUGUR} curate apply-date-bounds \ | ||
| > --failure-reporting "error_all" \ | ||
| > --date-field date \ | ||
| > --upper-bound collectionDate 1> /dev/null | ||
| ERROR: Unable to apply bounds. All errors: | ||
| [record 0] 'date'='2021' is later than the upper bound of 'collectionDate'='2020-01-23' | ||
| [record 1] 'date'='2022' is later than the upper bound of 'collectionDate'='2020-01-23' | ||
| [2] | ||
|
|
||
| Data errors can emit warnings instead of a failure. | ||
|
|
||
| $ cat records.ndjson \ | ||
| > | ${AUGUR} curate apply-date-bounds \ | ||
| > --failure-reporting "warn" \ | ||
| > --date-field date \ | ||
| > --upper-bound collectionDate | ||
| WARNING: [record 0] 'date'='2021' is later than the upper bound of 'collectionDate'='2020-01-23' | ||
| WARNING: [record 1] 'date'='2022' is later than the upper bound of 'collectionDate'='2020-01-23' | ||
|
|
||
| Errors regarding the bounds themselves are not considered data errors and will stop on | ||
| the first error regardless of --failure-reporting. | ||
|
|
||
| $ cat records.ndjson \ | ||
| > | ${AUGUR} curate apply-date-bounds \ | ||
| > --failure-reporting "silent" \ | ||
| > --date-field date \ | ||
| > --upper-bound collectionDate2 | ||
| ERROR: Expected --upper-bound to be a field name or date, but got 'collectionDate2'. | ||
| [2] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
non-blocking
Just wanted to walk through a slightly more complicated use-case to get people's thoughts. Definitely not saying we need to support in this with
apply-date-bounds.Say we are curating SARS-CoV-2 data with unknown collection dates, but we know the clades and the sequence release date. I want to set the lower bound of the collection date to be the first detection date of the clade. I think the steps to do this now is
clade_detection_date.tsvwith the columnscladeandfirst_detection_datemetadata.tsvwithclade_detection_date.tsvWould it be overcomplicated to pass the
clade_detection_date.tsvdirectly to theapply-date-boundscommand?