Skip to content

Commit bad40b7

Browse files
committed
Formatted data
1 parent 3262ca1 commit bad40b7

File tree

1 file changed

+101
-73
lines changed

1 file changed

+101
-73
lines changed

pyspi/data.py

Lines changed: 101 additions & 73 deletions
Original file line number · Diff line number · Diff line change
@@ -2,7 +2,6 @@
22
33
Code is adapted from Patricia Wollstadt's IDTxL (https://github.com/pwollstadt/IDTxl)
44
"""
5-
from multiprocessing.sharedctypes import Value
65
import numpy as np
76
import pandas as pd
87
from pyspi import utils
@@ -14,7 +13,8 @@
1413

1514
VERBOSE = False
1615

17-
class Data():
16+
17+
class Data:
1818
"""Store data for dependency analysis.
1919
2020
Data takes a 2-dimensional array representing realisations of random
@@ -27,7 +27,7 @@ class Data():
2727
>>> data = Data()
2828
>>>
2929
>>> # Load a prefilled financial dataset
30-
>>> data_forex = Data().load_dataset(forex)
30+
>>> data_forex = Data().load_dataset(forex)
3131
>>>
3232
>>> # Create data objects with data of various sizes
3333
>>> d = np.arange(3000).reshape((3, 1000)) # 3 procs.,
@@ -39,7 +39,7 @@ class Data():
3939
4040
Args:
4141
data (array_like, optional):
42-
2-dimensional array with raw data, defaults to None.
42+
2-dimensional array with raw data, defaults to None.
4343
dim_order (str, optional):
4444
Order of dimensions, accepts two combinations of the characters 'p', and 's' for processes and observations, defaults to 'ps'.
4545
normalise (bool, optional):
@@ -55,81 +55,100 @@ class Data():
5555
5656
"""
5757

58-
def __init__(
    self,
    data=None,
    dim_order="ps",
    normalise=True,
    name=None,
    procnames=None,
    n_processes=None,
    n_observations=None,
):
    """Initialise the data store.

    Args:
        data (array_like, optional):
            2-dimensional array with raw data, defaults to None.
        dim_order (str, optional):
            Order of dimensions; combinations of 'p' (processes) and
            's' (observations), defaults to 'ps'.
        normalise (bool, optional):
            Whether to z-score and detrend the data on load, defaults to True.
        name (str, optional):
            Name of the dataset, forwarded to `set_data`.
        procnames (list, optional):
            One name per process; length must equal the number of processes
            inferred from `data`.
        n_processes (int, optional):
            Keep only the first `n_processes` processes.
        n_observations (int, optional):
            Keep only the first `n_observations` observations.
    """
    self.normalise = normalise
    if data is not None:
        dat = self.convert_to_numpy(data)
        self.set_data(
            dat,
            dim_order=dim_order,
            name=name,
            n_processes=n_processes,
            n_observations=n_observations,
        )

    if procnames is not None:
        assert len(procnames) == self.n_processes
        # Fix: store the validated names. Previously they were checked and
        # then discarded, so the `procnames` property could never return them.
        self._procnames = list(procnames)
6681

6782
@property
def name(self):
    """Name of the data object, or 'N/A' when no name has been set."""
    return getattr(self, "_name", "N/A")

@name.setter
def name(self, n):
    """Set the name of the data object.

    Raises:
        TypeError: if `n` is not a string.
    """
    if not isinstance(n, str):
        raise TypeError(f"Name should be a string, received {type(n)}.")
    self._name = n
8396

8497
@property
def procnames(self):
    """List of process names; defaults to ['proc-0', 'proc-1', ...]."""
    try:
        return self._procnames
    except AttributeError:
        # No explicit names were ever stored; generate placeholders.
        return [f"proc-{i}" for i in range(self.n_processes)]
92104

93-
def to_numpy(self, realisation=None, squeeze=False):
    """Return the underlying numpy array.

    Args:
        realisation (int, optional): if given, select that realisation along
            the trailing axis; otherwise return all realisations.
        squeeze (bool, optional): if True, drop singleton axes from the result.
    """
    dat = self._data if realisation is None else self._data[:, :, realisation]
    return np.squeeze(dat) if squeeze else dat
104116

105117
@staticmethod
def convert_to_numpy(data):
    """Convert a supported data container to a numpy array.

    Accepts a numpy array (returned as-is), a pandas DataFrame, or a
    filename string with extension .npy, .txt, .csv, or .ts.

    Raises:
        TypeError: if the file extension or the data type is not recognised.
    """
    if isinstance(data, np.ndarray):
        npdat = data
    elif isinstance(data, pd.DataFrame):
        npdat = data.to_numpy()
    elif isinstance(data, str):
        ext = os.path.splitext(data)[1]
        if ext == ".npy":
            npdat = np.load(data)
        elif ext == ".txt":
            npdat = np.genfromtxt(data)
        elif ext == ".csv":
            # Fix: the delimiter must be passed by keyword; positionally the
            # second argument of np.genfromtxt is `dtype`, so "," was being
            # treated as a dtype and the call failed on any real CSV file.
            npdat = np.genfromtxt(data, delimiter=",")
        elif ext == ".ts":
            # Relies on sktime helpers imported at module level.
            tsdat, tsclasses = load_from_tsfile_to_dataframe(data)
            npdat = from_nested_to_3d_numpy(tsdat)
        else:
            raise TypeError(f"Unknown filename extension: {ext}")
    else:
        raise TypeError(f"Unknown data type: {type(data)}")

    return npdat
131142

132-
def set_data(self,data,dim_order='ps',name=None,n_processes=None,n_observations=None,verbose=False):
143+
def set_data(
144+
self,
145+
data,
146+
dim_order="ps",
147+
name=None,
148+
n_processes=None,
149+
n_observations=None,
150+
verbose=False,
151+
):
133152
"""Overwrite data in an existing instance.
134153
135154
Args:
@@ -141,31 +160,33 @@ def set_data(self,data,dim_order='ps',name=None,n_processes=None,n_observations=
141160
must have the same length as number of dimensions in data
142161
"""
143162
if len(dim_order) > 3:
144-
raise RuntimeError('dim_order can not have more than two '
145-
'entries')
163+
raise RuntimeError("dim_order can not have more than two " "entries")
146164
if len(dim_order) != data.ndim:
147-
raise RuntimeError('Data array dimension ({0}) and length of '
148-
'dim_order ({1}) are not equal.'.format(
149-
data.ndim, len(dim_order)))
165+
raise RuntimeError(
166+
"Data array dimension ({0}) and length of "
167+
"dim_order ({1}) are not equal.".format(data.ndim, len(dim_order))
168+
)
150169

151170
# Bring data into the order processes x observations in a pandas dataframe.
152171
data = self._reorder_data(data, dim_order)
153172

154173
if n_processes is not None:
155174
data = data[:n_processes]
156175
if n_observations is not None:
157-
data = data[:,:n_observations]
176+
data = data[:, :n_observations]
158177

159178
if self.normalise:
160-
data = zscore(data,axis=1,nan_policy='omit',ddof=1)
179+
data = zscore(data, axis=1, nan_policy="omit", ddof=1)
161180
try:
162-
data = detrend(data,axis=1)
181+
data = detrend(data, axis=1)
163182
except ValueError as err:
164-
print(f'Could not detrend data: {err}')
183+
print(f"Could not detrend data: {err}")
165184

166185
nans = np.isnan(data)
167186
if nans.any():
168-
raise ValueError(f'Dataset {name} contains non-numerics (NaNs) in processes: {np.unique(np.where(nans)[0])}.')
187+
raise ValueError(
188+
f"Dataset {name} contains non-numerics (NaNs) in processes: {np.unique(np.where(nans)[0])}."
189+
)
169190

170191
self._data = data
171192
self.data_type = type(data[0, 00, 0])
@@ -176,57 +197,63 @@ def set_data(self,data,dim_order='ps',name=None,n_processes=None,n_observations=
176197
self._name = name
177198

178199
if verbose:
179-
print(f'Dataset "{name}" now has properties: {self.n_processes} processes, {self.n_observations} observations, {self.n_replications} '
180-
'replications')
200+
print(
201+
f'Dataset "{name}" now has properties: {self.n_processes} processes, {self.n_observations} observations, {self.n_replications} '
202+
"replications"
203+
)
181204

182-
def add_process(self, proc, verbose=False):
    """Append a univariate process to the dataset.

    Args:
        proc (ndarray):
            Univariate process to add; after squeezing it must be a 1D array
            with the same number of observations as the existing data.
        verbose (bool, optional):
            Forwarded to `set_data` when this is the first process added.

    Raises:
        TypeError: if `proc` does not squeeze to a 1D numpy array.
    """
    proc = np.squeeze(proc)
    if not isinstance(proc, np.ndarray) or proc.ndim != 1:
        raise TypeError("Process must be a 1D numpy array")

    if hasattr(self, "_data"):
        # Fix: previously an IndexError here was caught and re-raised as a
        # bare IndexError(), discarding the original message; let any error
        # from the append propagate unchanged.
        self._data = np.append(
            self._data, np.reshape(proc, (1, self.n_observations, 1)), axis=0
        )
    else:
        self.set_data(proc, dim_order="s", verbose=verbose)

    self._reset_data_size()
202227

203228
def remove_process(self, procs):
    """Remove one or more processes from the dataset.

    Args:
        procs (int or array_like):
            Index or indices of the processes to delete. Out-of-bounds
            indices print a warning and leave the data unchanged.
    """
    try:
        self._data = np.delete(self._data, procs, axis=0)
    except IndexError:
        # Fix: the message previously read `self.data.n_processes` — an
        # attribute that does not exist (the class stores `_data`) — so the
        # handler itself crashed with AttributeError.
        print(
            f"Process {procs} is out of bounds of multivariate"
            f" time-series data with size {self._data.shape[0]}"
        )

    self._reset_data_size()
211238

212239
def _reorder_data(self, data, dim_order):
    """Reorder data dimensions to processes x observations x realisations."""
    # Append a singleton axis for each of 'p', 's', 'r' absent from dim_order.
    for axis_label in "psr":
        if axis_label not in dim_order:
            data = np.expand_dims(data, data.ndim)
            dim_order += axis_label

    # Swap axes until processes lead and observations come second.
    if dim_order[0] != "p":
        p_idx = dim_order.index("p")
        data = data.swapaxes(0, p_idx)
        dim_order = utils.swap_chars(dim_order, 0, p_idx)
    if dim_order[1] != "s":
        data = data.swapaxes(1, dim_order.index("s"))

    return data
231258

232259
def _reset_data_size(self):
@@ -235,14 +262,15 @@ def _reset_data_size(self):
235262
self.n_observations = self._data.shape[1]
236263
self.n_replications = self._data.shape[2]
237264

265+
238266
def load_dataset(name):
    """Load one of the datasets shipped with pyspi.

    Args:
        name (str): dataset identifier, either 'forex' or 'cml'.

    Returns:
        Data: a new Data object wrapping the stored array.

    Raises:
        NameError: if `name` is not a known dataset.
    """
    known = {
        "forex": ("forex.npy", "sp"),
        "cml": ("cml.npy", "sp"),
    }
    if name not in known:
        raise NameError(f"Unknown dataset: {name}.")
    filename, dim_order = known[name]
    basedir = os.path.join(os.path.dirname(__file__), "data")
    return Data(data=os.path.join(basedir, filename), dim_order=dim_order)

0 commit comments

Comments
 (0)