
Commit ab0c155

Merge pull request #6555 from PrimozGodec/io-update-origin
[ENH] IO - Change origin attribute when not found on system
2 parents: 419b188 + 72ccefe

File tree: 8 files changed (+361, −91 lines)

Orange/data/io.py

Lines changed: 5 additions & 9 deletions

@@ -24,7 +24,7 @@
 import xlsxwriter
 import openpyxl
 
-from Orange.data import _io, Table, Domain, ContinuousVariable
+from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
 from Orange.data import Compression, open_compressed, detect_encoding, \
     isnastr, guess_data_type, sanitize_variable
 from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
@@ -164,21 +164,15 @@ def read(self):
                         skipinitialspace=True,
                     )
                     data = self.data_table(reader)
-
-                    # TODO: Name can be set unconditionally when/if
-                    # self.filename will always be a string with the file name.
-                    # Currently, some tests pass StringIO instead of
-                    # the file name to a reader.
-                    if isinstance(self.filename, str):
-                        data.name = path.splitext(
-                            path.split(self.filename)[-1])[0]
+                    data.name = path.splitext(path.split(self.filename)[-1])[0]
                     if error and isinstance(error, UnicodeDecodeError):
                         pos, endpos = error.args[2], error.args[3]
                         warning = ('Skipped invalid byte(s) in position '
                                    '{}{}').format(pos,
                                    ('-' + str(endpos)) if (endpos - pos) > 1 else '')
                         warnings.warn(warning)
                     self.set_table_metadata(self.filename, data)
+                    update_origin(data, self.filename)
                     return data
                 except Exception as e:
                     error = e
@@ -215,6 +209,7 @@ def read(self):
             if not isinstance(table, Table):
                 raise TypeError("file does not contain a data table")
             else:
+                update_origin(table, self.filename)
                 return table
 
     @classmethod
@@ -264,6 +259,7 @@ def read(self):
         try:
             cells = self.get_cells()
             table = self.data_table(cells)
+            update_origin(table, self.filename)
             table.name = path.splitext(path.split(self.filename)[-1])[0]
             if self.sheet and len(self.sheets) > 1:
                 table.name = '-'.join((table.name, self.sheet))
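
With this hook in place, the CSV, pickle, and Excel readers repair a stale "origin" attribute transparently while reading. A minimal sketch of the observable effect (the file name and column are hypothetical; only the Table constructor and the variable's attributes dict are assumed from Orange's public API):

    from Orange.data import Table

    # Dataset whose meta column stores image file names and carries an "origin"
    # attribute that was written on another machine.
    data = Table("/home/user/project/images.csv")
    for meta in data.domain.metas:
        if "origin" in meta.attributes:
            # After read(), the origin points to a directory that exists next to
            # images.csv (or in its parent), when such a directory can be found.
            print(meta.name, meta.attributes["origin"])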

Orange/data/io_util.py

Lines changed: 79 additions & 3 deletions

@@ -1,17 +1,27 @@
+import os.path
 import subprocess
 from collections import defaultdict
+from typing import Tuple, Optional
 
 import numpy as np
+import pandas as pd
 from chardet.universaldetector import UniversalDetector
 
 from Orange.data import (
     is_discrete_values, MISSING_VALUES, Variable,
-    DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
+    DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable, Table,
 )
 from Orange.misc.collections import natural_sorted
 
-__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
-           "guess_data_type", "sanitize_variable"]
+__all__ = [
+    "Compression",
+    "open_compressed",
+    "detect_encoding",
+    "isnastr",
+    "guess_data_type",
+    "sanitize_variable",
+    "update_origin",
+]
 
 
 class Compression:
@@ -207,3 +217,69 @@ def mapvalues(arr):
         values = [_var.parse(i) for i in orig_values]
 
     return values, var
+
+
+def _extract_new_origin(attr: Variable, table: Table, lookup_dirs: Tuple[str]) -> Optional[str]:
+    # origin exists
+    if os.path.exists(attr.attributes["origin"]):
+        return attr.attributes["origin"]
+
+    # the last directory of origin is found in one of the lookup dirs
+    dir_ = os.path.basename(os.path.normpath(attr.attributes["origin"]))
+    for ld in lookup_dirs:
+        new_dir = os.path.join(ld, dir_)
+        if os.path.isdir(new_dir):
+            return new_dir
+
+    # all paths from the column exist in one of the lookup dirs
+    for ld in lookup_dirs:
+        if all(
+            os.path.exists(os.path.join(ld, attr.str_val(v)))
+            for v in table.get_column(attr)
+            if v and not pd.isna(v)
+        ):
+            return ld
+
+    return None
+
+
+def update_origin(table: Table, file_path: str):
+    """
+    When a dataset whose column holds file paths is moved to another computer,
+    the stored absolute paths may no longer be correct. This function updates
+    the path for all columns with an "origin" attribute.
+
+    The process consists of two steps: first, we identify the directories to
+    search for the files; second, we check whether the paths exist there.
+
+    Lookup directories:
+    1. The directory that contains the file from file_path
+    2. The parent directory of 1. This covers the case when the user places the
+       dataset file in the directory with the files (for example, a workflow in
+       a directory with images)
+
+    Possible situations for the file search:
+    1. The last directory of origin (basedir) is in one of the lookup directories
+    2. Origin doesn't exist in any lookup directory, but the paths in a column
+       can be found in one of the lookup directories. This usually happens when
+       the paths in a column are complex (e.g. a/b/c/d/file.txt).
+
+    Note: this function updates the existing table in place.
+
+    Parameters
+    ----------
+    table
+        Orange Table to be updated if an origin attribute exists on any column
+    file_path
+        Path of the loaded dataset, used as a reference. Only paths inside the
+        dataset's directory or its parent directory are considered as the new
+        origin.
+    """
+    file_dir = os.path.dirname(file_path)
+    parent_dir = os.path.dirname(file_dir)
+    # if file_dir is already the root, file_dir == parent_dir
+    lookup_dirs = tuple({file_dir: 0, parent_dir: 0})
+    for attr in table.domain.metas:
+        if "origin" in attr.attributes and (attr.is_string or attr.is_discrete):
+            new_orig = _extract_new_origin(attr, table, lookup_dirs)
+            if new_orig:
+                attr.attributes["origin"] = new_orig
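
The new helper can also be exercised directly, as the tests below do. A minimal sketch, assuming a string meta column of file names whose stored origin no longer exists (all paths here are hypothetical):

    import numpy as np
    from Orange.data import Domain, StringVariable, Table
    from Orange.data.io_util import update_origin

    var = StringVariable("Files")
    var.attributes["origin"] = "/old/machine/images"   # stale absolute path
    table = Table.from_list(
        Domain([], metas=[var]),
        np.array(["a.png", "b.png"]).reshape((-1, 1)),
    )

    # Assume the loaded dataset lives in /home/user/project, which contains an
    # "images" subdirectory: the basename of the old origin is found in a lookup
    # directory, so the attribute is rewritten in place.
    update_origin(table, "/home/user/project/data.csv")
    print(table.domain.metas[0].attributes["origin"])   # -> /home/user/project/images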

Orange/data/tests/test_io_util.py

Lines changed: 123 additions & 1 deletion

@@ -1,6 +1,18 @@
+import os.path
 import unittest
+from tempfile import TemporaryDirectory
 
-from Orange.data import ContinuousVariable, guess_data_type
+import numpy as np
+
+from Orange.data import (
+    ContinuousVariable,
+    guess_data_type,
+    Table,
+    Domain,
+    StringVariable,
+    DiscreteVariable,
+)
+from Orange.data.io_util import update_origin
 
 
 class TestIoUtil(unittest.TestCase):
@@ -10,5 +22,115 @@ def test_guess_continuous_w_nans(self):
             ContinuousVariable)
 
 
+class TestUpdateOrigin(unittest.TestCase):
+    FILE_NAMES = ["file1.txt", "file2.txt", "file3.txt"]
+
+    def setUp(self) -> None:
+        self.alt_dir = TemporaryDirectory()  # pylint: disable=consider-using-with
+
+        self.var_string = var = StringVariable("Files")
+        files = self.FILE_NAMES + [var.Unknown]
+        self.table_string = Table.from_list(
+            Domain([], metas=[var]), np.array(files).reshape((-1, 1))
+        )
+        self.var_discrete = var = DiscreteVariable("Files", values=self.FILE_NAMES)
+        files = self.FILE_NAMES + [var.Unknown]
+        self.table_discrete = Table.from_list(
+            Domain([], metas=[var]), np.array(files).reshape((-1, 1))
+        )
+
+    def tearDown(self) -> None:
+        self.alt_dir.cleanup()
+
+    def __create_files(self):
+        for f in self.FILE_NAMES:
+            f = os.path.join(self.alt_dir.name, f)
+            with open(f, "w", encoding="utf8"):
+                pass
+            self.assertTrue(os.path.exists(f))
+
+    def test_origin_not_changed(self):
+        """
+        Origin exists; keep it unchanged, even though the dataset's path also
+        includes the files from the column.
+        """
+        with TemporaryDirectory() as dir_name:
+            self.var_string.attributes["origin"] = dir_name
+            update_origin(self.table_string, self.alt_dir.name)
+            self.assertEqual(
+                self.table_string.domain[self.var_string].attributes["origin"], dir_name
+            )
+
+    def test_origin_subdir(self):
+        """
+        Origin is wrong, but the last directory of origin exists next to the
+        dataset file.
+        """
+        images_dir = os.path.join(self.alt_dir.name, "subdir")
+        os.mkdir(images_dir)
+
+        self.var_string.attributes["origin"] = "/a/b/subdir"
+        update_origin(self.table_string, os.path.join(self.alt_dir.name, "data.csv"))
+        self.assertEqual(
+            self.table_string.domain[self.var_string].attributes["origin"], images_dir
+        )
+
+    def test_origin_parents_subdir(self):
+        """
+        Origin is wrong, but the last directory of origin exists in the dataset
+        file's parent directory.
+        """
+        # make the dir where the dataset is placed
+        images_dir = os.path.join(self.alt_dir.name, "subdir")
+        os.mkdir(images_dir)
+
+        self.var_string.attributes["origin"] = "/a/b/subdir"
+        update_origin(self.table_string, os.path.join(images_dir, "data.csv"))
+        self.assertEqual(
+            self.table_string.domain[self.var_string].attributes["origin"], images_dir
+        )
+
+    def test_column_paths_subdir(self):
+        """
+        Origin dir does not exist, but the paths from the column exist in the
+        dataset's directory.
+        """
+        self.__create_files()
+
+        self.var_string.attributes["origin"] = "/a/b/non-exiting-dir"
+        update_origin(self.table_string, os.path.join(self.alt_dir.name, "data.csv"))
+        self.assertEqual(
+            self.table_string.domain[self.var_string].attributes["origin"],
+            self.alt_dir.name,
+        )
+
+        self.var_discrete.attributes["origin"] = "/a/b/non-exiting-dir"
+        update_origin(self.table_discrete, os.path.join(self.alt_dir.name, "data.csv"))
+        self.assertEqual(
+            self.table_discrete.domain[self.var_discrete].attributes["origin"],
+            self.alt_dir.name,
+        )
+
+    def test_column_paths_parents_subdir(self):
+        """
+        Origin dir does not exist, but the paths from the column exist in the
+        dataset's parent directory.
+        """
+        # make the dir where the dataset is placed
+        dataset_dir = os.path.join(self.alt_dir.name, "subdir")
+        self.__create_files()
+
+        self.var_string.attributes["origin"] = "/a/b/non-exiting-dir"
+        update_origin(self.table_string, os.path.join(dataset_dir, "data.csv"))
+        self.assertEqual(
+            self.table_string.domain[self.var_string].attributes["origin"],
+            self.alt_dir.name,
+        )
+
+        self.var_discrete.attributes["origin"] = "/a/b/non-exiting-dir"
+        update_origin(self.table_discrete, os.path.join(dataset_dir, "data.csv"))
+        self.assertEqual(
+            self.table_discrete.domain[self.var_discrete].attributes["origin"],
+            self.alt_dir.name,
+        )
+
+
 if __name__ == '__main__':
     unittest.main()

Orange/data/tests/test_variable.py

Lines changed: 29 additions & 20 deletions

@@ -1,6 +1,7 @@
 # Test methods with long descriptive names can omit docstrings
 # pylint: disable=missing-docstring
 # pylint: disable=protected-access
+import csv
 import os
 import sys
 import math
@@ -10,7 +11,7 @@
 import warnings
 from datetime import datetime, timezone
 
-from io import StringIO
+from tempfile import NamedTemporaryFile, TemporaryDirectory
 
 import numpy as np
 import pandas as pd
@@ -714,27 +715,35 @@ def test_no_date_no_time(self):
         self.assertEqual(TimeVariable('relative time').repr_val(1.6), '1.6')
 
     def test_readwrite_timevariable(self):
-        output_csv = StringIO()
-        input_csv = StringIO("""\
-Date,Feature
-time,continuous
-,
-1920-12-12,1.0
-1920-12-13,3.0
-1920-12-14,5.5
-""")
-        for stream in (output_csv, input_csv):
-            stream.close = lambda: None  # HACK: Prevent closing of streams
-
-        table = CSVReader(input_csv).read()
-        self.assertIsInstance(table.domain['Date'], TimeVariable)
-        self.assertEqual(table[0, 'Date'], '1920-12-12')
+        content = [
+            ("Date", "Feature"),
+            ("time", "continuous"),
+            ("", ""),
+            ("1920-12-12", 1.0),
+            ("1920-12-13", 3.0),
+            ("1920-12-14", 5.5),
+        ]
+        with NamedTemporaryFile(
+            mode="w", delete=False, newline="", encoding="utf-8"
+        ) as input_csv:
+            csv.writer(input_csv, delimiter=",").writerows(content)
+
+        table = CSVReader(input_csv.name).read()
+        self.assertIsInstance(table.domain["Date"], TimeVariable)
+        self.assertEqual(table[0, "Date"], "1920-12-12")
         # Dates before 1970 are negative
-        self.assertTrue(all(inst['Date'] < 0 for inst in table))
+        self.assertTrue(all(inst["Date"] < 0 for inst in table))
 
-        CSVReader.write_file(output_csv, table)
-        self.assertEqual(input_csv.getvalue().splitlines(),
-                         output_csv.getvalue().splitlines())
+        with NamedTemporaryFile(mode="w", delete=False) as output_csv:
+            pass
+        CSVReader.write_file(output_csv.name, table)
+
+        with open(input_csv.name, encoding="utf-8") as in_f:
+            with open(output_csv.name, encoding="utf-8") as out_f:
+                self.assertEqual(in_f.read(), out_f.read())
+
+        os.unlink(input_csv.name)
+        os.unlink(output_csv.name)
 
     def test_repr_value(self):
         # https://github.com/biolab/orange3/pull/1760
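
A note on the temporary-file pattern in the rewritten test: delete=False is presumably used because, on Windows, a file created with NamedTemporaryFile cannot be reopened by name while the original handle is still open; the test therefore closes the handle first, lets CSVReader open the file by path, and deletes it manually. A short sketch of that pattern (the rationale is an assumption, not stated in the commit):

    import os
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile(mode="w", delete=False, encoding="utf-8") as f:
        f.write("Date,Feature\n")   # write the fixture, then let the handle close

    # The file stays on disk and can now be reopened by name (e.g. by a reader).
    os.unlink(f.name)               # manual cleanup replaces automatic deletion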
