Skip to content

Commit ea04605

Browse files
author
Kyle Hernandez
committed
feat(reader): refactor classes for generalization
1 parent fb4516e commit ea04605

File tree

6 files changed

+50 additions, -31 deletions

gdc_fastq_splitter/fastq/base.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ def __str__(self):
2020

2121
class FastqRecord:
2222
"""The base class for a Fastq record"""
23-
def __init__(self, seqid, sequence, qid, qual, seqid_cls=SequenceIdentifier):
24-
self.seqid = seqid_cls.from_string(seqid)
23+
seqid_cls=SequenceIdentifier
24+
def __init__(self, seqid, sequence, qid, qual):
25+
self.seqid = self.seqid_cls.from_string(seqid)
2526
self.sequence = sequence
2627
self.qid = qid
2728
self.qual = qual

gdc_fastq_splitter/fastq/illumina.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,10 @@ def is_valid(seqid):
7272
return True
7373

7474
class IlluminaFastqRecord(base.FastqRecord):
75-
def __init__(self, seqid, sequence, qid, qual, seqid_cls=IlluminaSequenceIdentifier):
76-
super().__init__(seqid, sequence, qid, qual, seqid_cls=seqid_cls)
75+
seqid_cls = IlluminaSequenceIdentifier
76+
77+
def __init__(self, seqid, sequence, qid, qual):
78+
super().__init__(seqid, sequence, qid, qual)
7779

7880
@property
7981
def read_key(self):
@@ -95,9 +97,15 @@ def flowcell(self):
9597
def lane(self):
9698
return self.seqid.lane
9799

100+
@classmethod
101+
def is_valid_seqid(cls, seqid):
102+
return cls.seqid_cls.is_valid(seqid)
103+
98104
class IlluminaNoBarcodeFastqRecord(base.FastqRecord):
99-
def __init__(self, seqid, sequence, qid, qual, seqid_cls=IlluminaSequenceIdentifierNoBarcode):
100-
super().__init__(seqid, sequence, qid, qual, seqid_cls=seqid_cls)
105+
seqid_cls = IlluminaSequenceIdentifierNoBarcode
106+
107+
def __init__(self, seqid, sequence, qid, qual):
108+
super().__init__(seqid, sequence, qid, qual)
101109

102110
@property
103111
def read_key(self):
@@ -115,15 +123,19 @@ def flowcell(self):
115123
def lane(self):
116124
return self.seqid.lane
117125

126+
@classmethod
127+
def is_valid_seqid(cls, seqid):
128+
return cls.seqid_cls.is_valid(seqid)
129+
118130
def infer_fastq_type(fil):
119131
"""
120132
Infer the type of fastq based on the first line.
121133
"""
122134
def predicate(obj):
123135
"""A predicate to get all classes that are subclasses of
124-
base.SequenceIdentifier"""
125-
return inspect.isclass(obj) and issubclass(obj, base.SequenceIdentifier) \
126-
and hasattr(obj, 'is_valid')
136+
base.FastqRecord"""
137+
return inspect.isclass(obj) and issubclass(obj, base.FastqRecord) \
138+
and hasattr(obj, 'is_valid_seqid')
127139

128140
fh = gzip.open(fil, 'rt') if fil.endswith('.gz') else \
129141
open(fil, 'rt')
@@ -132,10 +144,11 @@ def predicate(obj):
132144

133145
try:
134146
line = fh.readline().rstrip('\r\n')
147+
#mod = sys.modules["gdc_fastq_splitter.fastq.illumina"]
135148
mod = sys.modules["gdc_fastq_splitter.fastq.illumina"]
136149
# Get all available seqidentifier types
137150
for m in inspect.getmembers(mod, predicate):
138-
if m[1].is_valid(line):
151+
if m[1].is_valid_seqid(line):
139152
cls_mod = m
140153
break
141154
finally:

gdc_fastq_splitter/fastq/report.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66

77
class BaseReport:
88
"""Base report for a fastq file"""
9-
def __init__(self, filename, flowcell_barcode=None, lane_number=None):
10-
self._filename = filename
11-
self.filename = os.path.basename(filename)
9+
def __init__(self, report_filename, fastq_filename, flowcell_barcode=None, lane_number=None):
10+
self._report_filename = report_filename
11+
self.report_filename = os.path.basename(report_filename)
12+
self.fastq_filename = os.path.basename(fastq_filename)
1213
self.flowcell_barcode = flowcell_barcode
1314
self.lane_number = lane_number
1415
self.record_counts = 0
@@ -18,27 +19,27 @@ def __iadd__(self, record):
1819
return self
1920

2021
def __str__(self):
21-
return '<flowcell>({0.flowcell_barcode}),<lane>({0.lane_number})'.format(self)
22+
return json.dumps(self.to_dict(), indent=2, sort_keys=True)
2223

2324
def to_dict(self):
2425
return {
2526
'metadata': {
26-
'filename': self.filename,
27+
'fastq_filename': self.fastq_filename,
2728
'flowcell_barcode': self.flowcell_barcode,
2829
'lane_number': self.lane_number,
2930
'record_count': self.record_counts
3031
}
3132
}
3233

3334
def write_to_json(self):
34-
with open(self._filename, 'wt') as o:
35+
with open(self._report_filename, 'wt') as o:
3536
json.dump(self.to_dict(), o, indent=2, sort_keys=True)
3637

3738

38-
class BarcodeFastqReport(BaseReport):
39+
class ReportWithBarcodes(BaseReport):
3940
"""Report that contains barcode frequencies."""
40-
def __init__(self, filename, flowcell_barcode=None, lane_number=None):
41-
super().__init__(filename, flowcell_barcode=flowcell_barcode, lane_number=lane_number)
41+
def __init__(self, report_filename, fastq_filename, flowcell_barcode=None, lane_number=None):
42+
super().__init__(report_filename, fastq_filename, flowcell_barcode=flowcell_barcode, lane_number=lane_number)
4243
self.barcode_frequency = Counter()
4344

4445
@property
@@ -57,13 +58,10 @@ def __iadd__(self, record):
5758
self._add_barcode(record.index)
5859
return self
5960

60-
def __str__(self):
61-
return '<flowcell>({0.flowcell_barcode}),<barcode>({0.multiplex_barcode}),<lane>({0.lane_number})'.format(self)
62-
6361
def to_dict(self):
6462
return {
6563
'metadata': {
66-
'filename': self.filename,
64+
'filename': self.fastq_filename,
6765
'flowcell_barcode': self.flowcell_barcode,
6866
'multiplex_barcode': self.most_common_barcode,
6967
'lane_number': self.lane_number,

gdc_fastq_splitter/fastq/writer.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Module containing writer classes for writing Fastq files"""
22
import gzip
33
import io
4-
from gdc_fastq_splitter.fastq.report import BaseReport
4+
from gdc_fastq_splitter.fastq.report import BaseReport, ReportWithBarcodes
55

66
class FastqWriter:
77
"""Base Fastq writer class"""
@@ -28,11 +28,12 @@ def __init__(self, fname, reporter, **kwargs):
2828
self.reporter = reporter
2929

3030
@classmethod
31-
def from_record_and_prefix(cls, prefix, record, report_cls=BaseReport):
32-
fbase = '{0}_{1}_R{2}'.format(prefix, record.read_key, record.read_pair)
31+
def from_record_and_prefix(cls, record, prefix):
32+
report_cls = ReportWithBarcodes if hasattr(record, 'index') else BaseReport
33+
fbase = '{0}{1}_R{2}'.format(prefix, record.read_key, record.read_pair)
3334
fname = '{0}.fq.gz'.format(fbase)
3435
rname = '{0}.report.json'.format(fbase)
35-
return cls(fname, report_cls(rname, flowcell_barcode=record.flowcell, lane_number=record.lane))
36+
return cls(fname, report_cls(rname, fname, flowcell_barcode=record.flowcell, lane_number=record.lane))
3637

3738
def __iadd__(self, record):
3839
self.reporter += record
@@ -42,4 +43,3 @@ def __iadd__(self, record):
4243
def close(self):
4344
super().close()
4445
self.reporter.write_to_json()
45-

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
license = "Apache 2.0",
1616
packages = [
1717
"gdc_fastq_splitter",
18+
"gdc_fastq_splitter.fastq",
1819
],
1920
classifiers = [
2021
"Development Status :: 3 - Alpha",

tests/test_illumina.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
import os
33
import logging
44

5-
from gdc_fastq_splitter.fastq.illumina import IlluminaSequenceIdentifierNoBarcode, IlluminaSequenceIdentifier, IlluminaFastqRecord, infer_fastq_type
5+
from gdc_fastq_splitter.fastq.illumina import (
6+
IlluminaFastqRecord, IlluminaSequenceIdentifierNoBarcode,
7+
IlluminaSequenceIdentifier, IlluminaNoBarcodeFastqRecord,
8+
infer_fastq_type)
69

710
class TestIlluminaSequenceIdentifier(unittest.TestCase):
811
"""Test modern illumina sequence identifiers"""
@@ -58,18 +61,21 @@ def test_valid_record(self):
5861
class TestInferFastqType(unittest.TestCase):
5962
"""Test the infer_fastq_type functionality"""
6063
def test_modern(self):
64+
"""Testing IlluminaFastqRecord"""
6165
fil = os.path.join(os.path.dirname(__file__), 'etc/fake_IlluminaSequenceIdentifier.fastq')
6266
m = infer_fastq_type(fil)
63-
expected = ('IlluminaSequenceIdentifier', IlluminaSequenceIdentifier)
67+
expected = ('IlluminaFastqRecord', IlluminaFastqRecord)
6468
self.assertEqual(expected, m)
6569

6670
def test_nobarcode(self):
71+
"""Testing IlluminaNoBarcodeFastqRecord"""
6772
fil = os.path.join(os.path.dirname(__file__), 'etc/fake_IlluminaSequenceIdentifierNoBarcode.fastq')
6873
m = infer_fastq_type(fil)
69-
expected = ('IlluminaSequenceIdentifierNoBarcode', IlluminaSequenceIdentifierNoBarcode)
74+
expected = ('IlluminaNoBarcodeFastqRecord', IlluminaNoBarcodeFastqRecord)
7075
self.assertEqual(expected, m)
7176

7277
def test_unknown(self):
78+
"""Testing raise exception"""
7379
fil = os.path.join(os.path.dirname(__file__), 'etc/fake_Unknown.fastq')
7480

7581
with self.assertRaises(Exception):

0 commit comments

Comments
 (0)