Skip to content

Commit 66afdd1

Browse files
committed
Modified MWTabFile
Closes #6. Added a validate method, a from_dict method, and changed some data members to properties.
1 parent 6c9b242 commit 66afdd1

File tree

5 files changed

+180
-27
lines changed

5 files changed

+180
-27
lines changed

src/mwtab/mwschema.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,13 @@
1515

1616
from schema import Schema, Optional, Or, And
1717

18-
from . import mwtab
18+
19+
class _duplicate_key_list(list):
    """Marker subclass of :class:`list` used purely for type checking.

    Behaves exactly like a plain list. Values that were collected from
    duplicate keys while parsing are wrapped in this type so that they can
    be told apart from ordinary values with ``isinstance``.

    Construction is inherited unchanged from :class:`list`.
    """
23+
24+
1925

2026
if sys.version_info.major == 2:
2127
str = unicode
@@ -122,7 +128,7 @@
122128
"Factors": dict,
123129
Optional("Additional sample data"): {
124130
Optional("RAW_FILE_NAME"): str,
125-
Optional(str): Or(str, mwtab._duplicate_key_list)
131+
Optional(str): Or(str, _duplicate_key_list)
126132
}
127133
}
128134
]

src/mwtab/mwtab.py

+105-18
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,16 @@
2828
import json_duplicate_keys as jdks
2929

3030
from .tokenizer import tokenizer
31+
from .validator import validate_file
32+
from .mwschema import section_schema_mapping, _duplicate_key_list
3133

3234

3335
# The stuff before the MWTabFile class is all to do with being able to handle duplicate keys from a JSON file.
3436
# Python's parser can't do it and you have to do some workarounds for it.
35-
class _duplicate_key_list(list):
36-
"""Class identical to list that can be used for type checking. Used to handle dealing with parsing duplicate keys in JSON."""
37-
def __init__(self, *args, **kwargs):
38-
super(_duplicate_key_list, self).__init__(*args, **kwargs)
37+
# class _duplicate_key_list(list):
38+
# """Class identical to list that can be used for type checking. Used to handle dealing with parsing duplicate keys in JSON."""
39+
# def __init__(self, *args, **kwargs):
40+
# super(_duplicate_key_list, self).__init__(*args, **kwargs)
3941

4042

4143
# From https://stackoverflow.com/questions/14902299/json-loads-allows-duplicate-keys-in-a-dictionary-overwriting-the-first-value
@@ -67,6 +69,52 @@ def _match_process(matchobj):
6769
return '"Additional sample data": {' + temp_string + '}'
6870

6971

72+
# Descriptor to handle the convenience properties for MWTabFile.
73+
class MWTabProperty:
    """Data descriptor backing the ``study_id``, ``analysis_id`` and ``header`` attributes.

    Until a value is explicitly assigned, reading the attribute derives it on
    the fly from the ``"METABOLOMICS WORKBENCH"`` entry of the owning mapping,
    so the attribute always reflects the current contents of the object.
    Assigning a value stores an override in the instance ``__dict__`` and
    marks it with a ``_<name>_was_set`` flag; deleting the attribute removes
    the override and restores the derived behaviour.
    """

    def __set_name__(self, owner, name):
        """Remember the attribute name this descriptor was bound to.

        :param owner: Class the descriptor was defined on.
        :param str name: Name of the attribute on ``owner``.
        """
        self._name = name

    def __get__(self, obj, type=None):
        """Return the explicit override if present, otherwise a derived value.

        :param obj: Instance the attribute is read from, or ``None`` for class access.
        :param type: Owner class (unused).
        :return: The descriptor itself for class-level access; the override if one
                 was set; otherwise the value derived from ``obj["METABOLOMICS WORKBENCH"]``,
                 or ``None`` when it cannot be derived.
        """
        # Class-level access (``MWTabFile.study_id``, introspection tools) passes
        # ``obj=None``; hand back the descriptor instead of failing on ``None.__dict__``.
        if obj is None:
            return self

        if obj.__dict__.get("_" + self._name + "_was_set"):
            return obj.__dict__[self._name]

        if self._name == "study_id" or self._name == "analysis_id":
            # Best effort by design: a missing or malformed section yields None.
            try:
                return obj["METABOLOMICS WORKBENCH"].get(self._name.upper())
            except Exception:
                return None

        if self._name == "header":
            # Best effort by design: a missing section or non-string value yields None.
            try:
                section = obj["METABOLOMICS WORKBENCH"]
                return " ".join(
                    ["#METABOLOMICS WORKBENCH"]
                    + [key + ":" + value for key, value in section.items()
                       if key not in ("VERSION", "CREATED_ON")]
                )
            except Exception:
                return None

        # Any other attribute name has no derived value.
        return None

    def __set__(self, obj, value):
        """Store ``value`` as an explicit override on ``obj``.

        :param obj: Instance the attribute is assigned on.
        :param value: Value subsequent reads should return.
        """
        obj.__dict__[self._name] = value
        obj.__dict__["_" + self._name + "_was_set"] = True

    def __delete__(self, obj):
        """Remove the explicit override so reads fall back to the derived value.

        :param obj: Instance the attribute is deleted from.
        :raises KeyError: If no explicit value is currently stored.
        """
        del obj.__dict__[self._name]
        # Clear the flag as well; otherwise the next read would look up the
        # value that was just removed and raise KeyError.
        obj.__dict__["_" + self._name + "_was_set"] = False
116+
117+
70118
class MWTabFile(OrderedDict):
71119
"""MWTabFile class that stores data from a single ``mwTab`` formatted file in
72120
the form of :py:class:`collections.OrderedDict`.
@@ -89,6 +137,10 @@ class MWTabFile(OrderedDict):
89137
"NMR_BINNED_DATA": "",
90138
"METABOLITES": ""
91139
}
140+
141+
study_id = MWTabProperty()
142+
analysis_id = MWTabProperty()
143+
header = MWTabProperty()
92144

93145
def __init__(self, source, *args, **kwds):
94146
"""File initializer.
@@ -97,10 +149,41 @@ def __init__(self, source, *args, **kwds):
97149
"""
98150
super(MWTabFile, self).__init__(*args, **kwds)
99151
self.source = source
100-
self.study_id = ""
101-
self.analysis_id = ""
102-
self.header = ""
103-
152+
self._study_id = None
153+
self._study_id_was_set = False
154+
self._analysis_id = None
155+
self._analysis_id_was_set = False
156+
self._header = None
157+
self._header_was_set = False
158+
159+
def validate(self, section_schema_mapping=section_schema_mapping, verbose=True, metabolites=True):
    """Run the package validator on this instance.

    Thin convenience wrapper: all arguments are forwarded unchanged to
    ``validate_file`` with this object as the file to validate, and that
    function's result is returned as-is.

    :param dict section_schema_mapping: Dictionary that provides mapping between section name and schema definition.
    :param bool verbose: whether to be verbose or not.
    :param bool metabolites: whether to validate metabolites section.
    :return: Validated file and errors if verbose is False.
    :rtype: :py:class:`collections.OrderedDict`, _io.StringIO
    """
    result = validate_file(
        self,
        section_schema_mapping=section_schema_mapping,
        verbose=verbose,
        metabolites=metabolites,
    )
    return result
174+
175+
@classmethod
176+
def from_dict(cls, input_dict):
    """Build a new instance populated with the contents of ``input_dict``.

    The instance's ``source`` is a generated label that embeds ``id(input_dict)``,
    since there is no file or URL the data came from.

    :param dict input_dict: Dictionary to create the new instance from.
    :return: New instance of MWTabFile
    :rtype: :class:`~mwtab.mwtab.MWTabFile`
    """
    source_label = "Internal dictionary. ID: " + str(id(input_dict))
    instance = cls(source_label)
    instance.update(input_dict)
    return instance
186+
104187
def read(self, filehandle):
105188
"""Read data into a :class:`~mwtab.mwtab.MWTabFile` instance.
106189
@@ -127,16 +210,20 @@ def read(self, filehandle):
127210
else:
128211
raise TypeError("Unknown file format")
129212

130-
try:
131-
self.study_id = self["METABOLOMICS WORKBENCH"].get("STUDY_ID")
132-
self.analysis_id = self["METABOLOMICS WORKBENCH"].get("ANALYSIS_ID")
133-
# self.header = self["METABOLOMICS WORKBENCH"].get("HEADER")
134-
self.header = " ".join(
135-
["#METABOLOMICS WORKBENCH"]
136-
+ [item[0] + ":" + item[1] for item in self["METABOLOMICS WORKBENCH"].items() if item[0] not in ["VERSION", "CREATED_ON"]]
137-
)
138-
except KeyError as e:
139-
raise KeyError("File missing header information \"METABOLOMICS WORKBENCH\"", e)
213+
# try:
214+
# # Call managed property getters to set initial value.
215+
# self.study_id
216+
# self.analysis_id
217+
# self.header
218+
# # self.study_id = self["METABOLOMICS WORKBENCH"].get("STUDY_ID")
219+
# # self.analysis_id = self["METABOLOMICS WORKBENCH"].get("ANALYSIS_ID")
220+
# # # self.header = self["METABOLOMICS WORKBENCH"].get("HEADER")
221+
# # self.header = " ".join(
222+
# # ["#METABOLOMICS WORKBENCH"]
223+
# # + [item[0] + ":" + item[1] for item in self["METABOLOMICS WORKBENCH"].items() if item[0] not in ["VERSION", "CREATED_ON"]]
224+
# # )
225+
# except KeyError as e:
226+
# raise KeyError("File missing header information \"METABOLOMICS WORKBENCH\"", e)
140227

141228
filehandle.close()
142229

src/mwtab/tokenizer.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from __future__ import print_function, division, unicode_literals
1919
from collections import deque, namedtuple, OrderedDict
2020

21-
from . import mwtab
21+
from .mwschema import _duplicate_key_list
2222

2323

2424
KeyValue = namedtuple("KeyValue", ["key", "value"])
@@ -73,8 +73,8 @@ def tokenizer(text):
7373
key = key.strip()
7474
value = value.strip()
7575
if key in additional_data:
76-
if not isinstance(additional_data[key], mwtab._duplicate_key_list):
77-
additional_data[key] = mwtab._duplicate_key_list([additional_data[key], value])
76+
if not isinstance(additional_data[key], _duplicate_key_list):
77+
additional_data[key] = _duplicate_key_list([additional_data[key], value])
7878
else:
7979
additional_data[key].append(value)
8080
else:

src/mwtab/validator.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import sys
2020
import traceback
2121

22-
from .mwschema import section_schema_mapping, base_schema
22+
from .mwschema import section_schema_mapping, base_schema, _duplicate_key_list
2323

2424
import mwtab
2525

@@ -125,7 +125,7 @@ def validate_subject_samples_factors(mwtabfile):
125125
)
126126

127127
duplicate_keys = [key for key, value in subject_sample_factor["Additional sample data"].items()
128-
if isinstance(value, mwtab.mwtab._duplicate_key_list)]
128+
if isinstance(value, _duplicate_key_list)]
129129
if duplicate_keys:
130130
subject_samples_factors_errors.append("SUBJECT_SAMPLE_FACTORS: Entry #" + str(index + 1) +
131131
" has the following duplicate keys:\n\t" +
@@ -293,8 +293,8 @@ def validate_file(mwtabfile, section_schema_mapping=section_schema_mapping, verb
293293
:param dict section_schema_mapping: Dictionary that provides mapping between section name and schema definition.
294294
:param bool verbose: whether to be verbose or not.
295295
:param bool metabolites: whether to validate metabolites section.
296-
:return: Validated file.
297-
:rtype: :py:class:`collections.OrderedDict`
296+
:return: Validated file and errors if verbose is False.
297+
:rtype: :py:class:`collections.OrderedDict`, _io.StringIO
298298
"""
299299
# setup
300300
if not verbose:

tests/test_mwtabfile.py

+60
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,63 @@ def test_read_in_duplicate_keys_tab():
143143
assert isinstance(new_mwtabfile["SUBJECT_SAMPLE_FACTORS"][0]["Additional sample data"]['key_1'], mwtab.mwtab._duplicate_key_list)
144144

145145

146+
def test_validate():
    """Check that ``MWTabFile.validate`` reports the duplicate keys in the example file."""
    example_path = "tests/example_data/other_mwtab_files/ST000122_AN000204_duplicate_keys.txt"

    mwtabfile = mwtab.mwtab.MWTabFile(example_path)
    with open(example_path, "r", encoding="utf-8") as handle:
        mwtabfile.read(handle)

    # With verbose=False the errors are returned as the second item instead of printed.
    validation_result = mwtabfile.validate(verbose=False)
    _, errors = validation_result

    assert "duplicate keys" in errors
157+
158+
159+
def test_from_dict():
    """Check that ``MWTabFile.from_dict`` builds a usable instance from a parsed JSON dictionary."""
    json_path = "tests/example_data/other_mwtab_files/incorrect_section_order.json"

    with open(json_path, "r", encoding="utf-8") as handle:
        parsed = loads(handle.read())

    created = mwtab.mwtab.MWTabFile.from_dict(parsed)

    # study_id is derived from the dictionary contents, so it proves the data was copied in.
    assert created.study_id == "ST000000"
168+
169+
170+
def test_properties():
    """Check the derived, missing-section, restored and overridden states of the convenience properties."""
    example_path = "tests/example_data/other_mwtab_files/ST000122_AN000204_duplicate_keys.txt"
    expected_header = "#METABOLOMICS WORKBENCH STUDY_ID:ST000122 ANALYSIS_ID:AN000204 PROJECT_ID:PR000109"

    mwtabfile = mwtab.mwtab.MWTabFile(example_path)
    with open(example_path, "r", encoding="utf-8") as handle:
        mwtabfile.read(handle)

    # Values are derived from the header section after reading.
    assert mwtabfile.study_id == "ST000122"
    assert mwtabfile.analysis_id == "AN000204"
    assert mwtabfile.header == expected_header

    # Without the header section every property falls back to None.
    saved_section = mwtabfile["METABOLOMICS WORKBENCH"]
    del mwtabfile["METABOLOMICS WORKBENCH"]

    assert mwtabfile.study_id is None
    assert mwtabfile.analysis_id is None
    assert mwtabfile.header is None

    # Restoring the section restores the derived values.
    mwtabfile["METABOLOMICS WORKBENCH"] = saved_section

    assert mwtabfile.study_id == "ST000122"
    assert mwtabfile.analysis_id == "AN000204"
    assert mwtabfile.header == expected_header

    # Explicit assignment overrides the derived values.
    mwtabfile.study_id = "asdf"
    mwtabfile.analysis_id = "qwer"
    mwtabfile.header = "zxcv"

    assert mwtabfile.study_id == "asdf"
    assert mwtabfile.analysis_id == "qwer"
    assert mwtabfile.header == "zxcv"
202+
203+
204+
205+

0 commit comments

Comments
 (0)