From 23a3248b9f4e805eb9f1c6ab7c898ab1a5ad9502 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 14 Nov 2023 16:49:24 -0800 Subject: [PATCH 1/5] tdb/elife_upload: Refactor assay_date parsing Pull out assay_date parsing from filename into a separate method `parse_assay_date_from_filename`. This is done in preparation to update the assay date parsing for VIDRL flat files. --- tdb/elife_upload.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/tdb/elife_upload.py b/tdb/elife_upload.py index 9c98d89..a0e74c8 100644 --- a/tdb/elife_upload.py +++ b/tdb/elife_upload.py @@ -49,18 +49,7 @@ def format_measurements(self, measurements, **kwargs): self.format_subtype(meas) self.format_assay_type(meas) self.format_date(meas) - tmp = kwargs['fstem'].split('-')[0] - if len(tmp) > 8: - tmp = tmp[:(8-len(tmp))] - elif len(tmp) < 8: - meas['assay_date'] = "XXXX-XX-XX" - else: - if tmp[0:2] == '20': - meas['assay_date'] = "{}-{}-{}".format(tmp[0:4],tmp[4:6],tmp[6:8]) - else: - meas['assay_date'] = "XXXX-XX-XX" - if 'assay_date' not in meas.keys() or meas['assay_date'] is None: - meas['assay_date'] = "XXXX-XX-XX" + self.parse_assay_date_from_filename(meas, kwargs['fstem']) self.format_passage(meas, 'serum_passage', 'serum_passage_category') self.format_passage(meas, 'virus_passage', 'virus_passage_category') self.format_ref(meas) @@ -78,6 +67,28 @@ def format_measurements(self, measurements, **kwargs): self.disambiguate_sources(measurements) return measurements + + def parse_assay_date_from_filename(self, meas, fstem): + """ + Parse assay date from the *fstem*. + *fstem* is expected to be formatted as `YYYYMMDD*` + + If unable to parse date from *fstem*, then assay date is masked as `XXXX-XX-XX`. 
+ """ + tmp = fstem.split('-')[0] + if len(tmp) > 8: + tmp = tmp[:(8-len(tmp))] + elif len(tmp) < 8: + meas['assay_date'] = "XXXX-XX-XX" + else: + if tmp[0:2] == '20': + meas['assay_date'] = "{}-{}-{}".format(tmp[0:4],tmp[4:6],tmp[6:8]) + else: + meas['assay_date'] = "XXXX-XX-XX" + if 'assay_date' not in meas.keys() or meas['assay_date'] is None: + meas['assay_date'] = "XXXX-XX-XX" + + def disambiguate_sources(self, measurements): ''' Add counter to sources so that create_index still creates unique identifiers for each From 37f9f3b2a5cd6482413fae191ddbeb87e684fd93 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 14 Nov 2023 16:55:45 -0800 Subject: [PATCH 2/5] tdb/elife_upload: Only parse assay_date once The assay date is the same for all measurement records from the same fstem, so only parse the assay date once. --- tdb/elife_upload.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tdb/elife_upload.py b/tdb/elife_upload.py index a0e74c8..dd858b0 100644 --- a/tdb/elife_upload.py +++ b/tdb/elife_upload.py @@ -40,6 +40,7 @@ def format_measurements(self, measurements, **kwargs): self.HI_ref_name_abbrev =self.define_strain_fixes(self.HI_ref_name_abbrev_fname) self.define_location_label_fixes("source-data/flu_fix_location_label.tsv") self.define_countries("source-data/geo_synonyms.tsv") + fstem_assay_date = self.parse_assay_date_from_filename(kwargs['fstem']) for meas in measurements: meas['virus_strain'], meas['original_virus_strain'] = self.fix_name(self.HI_fix_name(meas['virus_strain'], serum=False)) meas['serum_strain'], meas['original_serum_strain'] = self.fix_name(self.HI_fix_name(meas['serum_strain'], serum=True)) @@ -49,7 +50,7 @@ def format_measurements(self, measurements, **kwargs): self.format_subtype(meas) self.format_assay_type(meas) self.format_date(meas) - self.parse_assay_date_from_filename(meas, kwargs['fstem']) + meas['assay_date'] = fstem_assay_date self.format_passage(meas, 'serum_passage', 
'serum_passage_category') self.format_passage(meas, 'virus_passage', 'virus_passage_category') self.format_ref(meas) @@ -68,25 +69,26 @@ def format_measurements(self, measurements, **kwargs): return measurements - def parse_assay_date_from_filename(self, meas, fstem): + def parse_assay_date_from_filename(self, fstem): """ Parse assay date from the *fstem*. *fstem* is expected to be formatted as `YYYYMMDD*` - If unable to parse date from *fstem*, then assay date is masked as `XXXX-XX-XX`. + If unable to parse date from *fstem*, then return masked assay date as `XXXX-XX-XX`. """ + assay_date = "XXXX-XX-XX" tmp = fstem.split('-')[0] if len(tmp) > 8: tmp = tmp[:(8-len(tmp))] elif len(tmp) < 8: - meas['assay_date'] = "XXXX-XX-XX" + assay_date = "XXXX-XX-XX" else: if tmp[0:2] == '20': - meas['assay_date'] = "{}-{}-{}".format(tmp[0:4],tmp[4:6],tmp[6:8]) + assay_date = "{}-{}-{}".format(tmp[0:4],tmp[4:6],tmp[6:8]) else: - meas['assay_date'] = "XXXX-XX-XX" - if 'assay_date' not in meas.keys() or meas['assay_date'] is None: - meas['assay_date'] = "XXXX-XX-XX" + assay_date = "XXXX-XX-XX" + + return assay_date def disambiguate_sources(self, measurements): From 124508a2cc4551336b96487a59424abb31469c2a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 14 Nov 2023 17:23:03 -0800 Subject: [PATCH 3/5] tdb/elife_upload: Update `parse_assay_date_from_filename` Use regex to find all matches for the expected date format 'YYYYMMDD'. Then use the datetime module to validate the date string and check that the date is earlier than the date we are parsing the file. I made the decision to use the latest date if there are multiple matches with the expectation that we would be parsing files not long after the assay date. 
--- tdb/elife_upload.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tdb/elife_upload.py b/tdb/elife_upload.py index dd858b0..176d63e 100644 --- a/tdb/elife_upload.py +++ b/tdb/elife_upload.py @@ -75,18 +75,30 @@ def parse_assay_date_from_filename(self, fstem): *fstem* is expected to be formatted as `YYYYMMDD*` If unable to parse date from *fstem*, then return masked assay date as `XXXX-XX-XX`. + If there are multiple valid dates, then return the latest date. """ assay_date = "XXXX-XX-XX" - tmp = fstem.split('-')[0] - if len(tmp) > 8: - tmp = tmp[:(8-len(tmp))] - elif len(tmp) < 8: - assay_date = "XXXX-XX-XX" + valid_dates = set() + for potential_date in re.findall(r"\d{8}", fstem): + # Check if the dates are valid + try: + date = datetime.datetime.strptime(potential_date, '%Y%m%d') + except ValueError: + continue + + # Date is only a valid assay date if it's earlier than the current datetime! + if date < datetime.datetime.now(): + valid_dates.add(date) + + if len(valid_dates) == 0: + print(f"Failed to parse assay date from filename {fstem!r}") + elif len(valid_dates) == 1: + assay_date = datetime.datetime.strftime(valid_dates.pop(), '%Y-%m-%d') else: - if tmp[0:2] == '20': - assay_date = "{}-{}-{}".format(tmp[0:4],tmp[4:6],tmp[6:8]) - else: - assay_date = "XXXX-XX-XX" + sorted_dates = list(map(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d'), sorted(valid_dates))) + assay_date = sorted_dates[-1] + print(f"Found multiple potential assay dates in filename {fstem!r}: {sorted_dates}.", + f"Using the last valid date as the assay date: {assay_date!r}") return assay_date From e5108542e1a7f3f33f16be62486f33e600854e5f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 14 Nov 2023 17:29:10 -0800 Subject: [PATCH 4/5] tdb/elife_upload: Only use fstem assay date if record does not have one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only fill in with the assay date parsed 
from the fstem when the measurement does not have an assay date. This is done in preparation for parsing the flat VIDRL files which will include assay date as a column.¹ This ensures that the fstem date is only a backup to the date that is set within `format_date`.² ¹ https://bedfordlab.slack.com/archives/C03KWDET9/p1699914235686809 ² https://github.com/nextstrain/fauna/blob/8088646ce0ba438310cdc9f919080950d0767c46/tdb/upload.py#L328 --- tdb/elife_upload.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tdb/elife_upload.py b/tdb/elife_upload.py index 176d63e..35e31a3 100644 --- a/tdb/elife_upload.py +++ b/tdb/elife_upload.py @@ -50,7 +50,8 @@ def format_measurements(self, measurements, **kwargs): self.format_subtype(meas) self.format_assay_type(meas) self.format_date(meas) - meas['assay_date'] = fstem_assay_date + if meas.get('assay_date') is None: + meas['assay_date'] = fstem_assay_date self.format_passage(meas, 'serum_passage', 'serum_passage_category') self.format_passage(meas, 'virus_passage', 'virus_passage_category') self.format_ref(meas) From c3cbfed433066f111a5933c6d78edb48db0d49c0 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 28 Nov 2023 15:58:03 -0800 Subject: [PATCH 5/5] tdb/elife_upload: Raise Exception if filename contains multiple dates It's unclear if we'll ever run into the case where there are multiple assay dates in the filename, but if we do, raise an exception to alert the user to manually fix the filename before upload. 
--- tdb/elife_upload.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tdb/elife_upload.py b/tdb/elife_upload.py index 35e31a3..255c01c 100644 --- a/tdb/elife_upload.py +++ b/tdb/elife_upload.py @@ -96,10 +96,9 @@ def parse_assay_date_from_filename(self, fstem): elif len(valid_dates) == 1: assay_date = datetime.datetime.strftime(valid_dates.pop(), '%Y-%m-%d') else: - sorted_dates = list(map(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d'), sorted(valid_dates))) - assay_date = sorted_dates[-1] - print(f"Found multiple potential assay dates in filename {fstem!r}: {sorted_dates}.", - f"Using the last valid date as the assay date: {assay_date!r}") + sorted_dates = [datetime.datetime.strftime(valid_date, '%Y-%m-%d') for valid_date in sorted(valid_dates)] + raise Exception(f"Found multiple potential assay dates in filename {fstem!r}: {sorted_dates}. " + + "Filename should only contain one assay date in format YYYYMMDD.") return assay_date