
Code is adapted from Patricia Wollstadt's IDTxL (https://github.com/pwollstadt/IDTxl)
"""
import os

import numpy as np
import pandas as pd
from scipy.signal import detrend
from scipy.stats import zscore

# NOTE: sktime is only needed to load .ts files; these import paths are an
# assumption (sktime ~0.8) and may differ in other sktime versions.
from sktime.datasets import load_from_tsfile_to_dataframe
from sktime.datatypes._panel._convert import from_nested_to_3d_numpy

from pyspi import utils

VERBOSE = False


class Data:
    """Store data for dependency analysis.

    Data takes a 2-dimensional array representing realisations of random
    variables in dimensions: processes and observations.

    Examples:
        >>> data = Data()
        >>>
        >>> # Load a prefilled financial dataset
        >>> data_forex = load_dataset("forex")
        >>>
        >>> # Create data objects with data of various sizes
        >>> d = np.arange(3000).reshape((3, 1000))  # 3 procs.,
        >>> data_2 = Data(d)                        # 1000 observations

    Args:
        data (array_like, optional):
            2-dimensional array with raw data, defaults to None.
        dim_order (str, optional):
            Order of dimensions, accepts two combinations of the characters
            'p' and 's' for processes and observations, defaults to 'ps'.
        normalise (bool, optional):
            If True, z-score and detrend each process, defaults to True.
        name (str, optional):
            Name of the data object, defaults to None.
        procnames (list, optional):
            Names of the individual processes, defaults to None.
        n_processes (int, optional):
            Truncate to the first n_processes processes, defaults to None.
        n_observations (int, optional):
            Truncate to the first n_observations observations, defaults to None.
    """

    def __init__(
        self,
        data=None,
        dim_order="ps",
        normalise=True,
        name=None,
        procnames=None,
        n_processes=None,
        n_observations=None,
    ):
        self.normalise = normalise
        if data is not None:
            dat = self.convert_to_numpy(data)
            self.set_data(
                dat,
                dim_order=dim_order,
                name=name,
                n_processes=n_processes,
                n_observations=n_observations,
            )

        if procnames is not None:
            assert len(procnames) == self.n_processes

    @property
    def name(self):
        """Name of the data object."""
        if hasattr(self, "_name"):
            return self._name
        else:
            return "N/A"

    @name.setter
    def name(self, n):
        """Set the name of the data object."""
        if not isinstance(n, str):
            raise TypeError(f"Name should be a string, received {type(n)}.")
        self._name = n

    @property
    def procnames(self):
        """List of process names."""
        if hasattr(self, "_procnames"):
            return self._procnames
        else:
            return [f"proc-{i}" for i in range(self.n_processes)]

    def to_numpy(self, realisation=None, squeeze=False):
        """Return the numpy array."""
        if realisation is not None:
            dat = self._data[:, :, realisation]
        else:
            dat = self._data

        if squeeze:
            return np.squeeze(dat)
        else:
            return dat

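    # Usage sketch: with a single replication, to_numpy(squeeze=True) drops the
    # trailing singleton axis and returns a (processes, observations) array.
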
    @staticmethod
    def convert_to_numpy(data):
        """Convert other data instances to the default numpy format."""
        if isinstance(data, np.ndarray):
            npdat = data
        elif isinstance(data, pd.DataFrame):
            npdat = data.to_numpy()
        elif isinstance(data, str):
            ext = os.path.splitext(data)[1]
            if ext == ".npy":
                npdat = np.load(data)
            elif ext == ".txt":
                npdat = np.genfromtxt(data)
            elif ext == ".csv":
                npdat = np.genfromtxt(data, delimiter=",")
            elif ext == ".ts":
                tsdat, tsclasses = load_from_tsfile_to_dataframe(data)
                npdat = from_nested_to_3d_numpy(tsdat)
            else:
                raise TypeError(f"Unknown filename extension: {ext}")
        else:
            raise TypeError(f"Unknown data type: {type(data)}")

        return npdat

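    # Usage sketch (filenames hypothetical): convert_to_numpy accepts an
    # ndarray, a pandas DataFrame, or a path to a .npy/.txt/.csv/.ts file:
    #
    #     Data.convert_to_numpy(np.zeros((3, 100)))  # ndarray passes through
    #     Data.convert_to_numpy("mydata.csv")        # parsed with np.genfromtxt
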
    def set_data(
        self,
        data,
        dim_order="ps",
        name=None,
        n_processes=None,
        n_observations=None,
        verbose=False,
    ):
        """Overwrite data in an existing instance.

        Args:
            data (ndarray):
                2-dimensional array of realisations
            dim_order (str, optional):
                order of dimensions, accepts combinations of the characters
                'p' and 's' for processes and observations;
                must have the same length as number of dimensions in data
        """
        if len(dim_order) > 3:
            raise RuntimeError("dim_order can not have more than three entries")
        if len(dim_order) != data.ndim:
            raise RuntimeError(
                "Data array dimension ({0}) and length of "
                "dim_order ({1}) are not equal.".format(data.ndim, len(dim_order))
            )

        # Bring the array into the order processes x observations x replications.
        data = self._reorder_data(data, dim_order)

        if n_processes is not None:
            data = data[:n_processes]
        if n_observations is not None:
            data = data[:, :n_observations]

        if self.normalise:
            data = zscore(data, axis=1, nan_policy="omit", ddof=1)
            try:
                data = detrend(data, axis=1)
            except ValueError as err:
                print(f"Could not detrend data: {err}")

        nans = np.isnan(data)
        if nans.any():
            raise ValueError(
                f"Dataset {name} contains non-numerics (NaNs) in processes: "
                f"{np.unique(np.where(nans)[0])}."
            )

        self._data = data
        self.data_type = type(data[0, 0, 0])

        self._reset_data_size()

        if name is not None:
            self._name = name

        if verbose:
            print(
                f'Dataset "{name}" now has properties: {self.n_processes} '
                f"processes, {self.n_observations} observations, "
                f"{self.n_replications} replications"
            )

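    # Usage sketch: set_data(np.random.randn(500, 3), dim_order="sp") reorders
    # the array to processes x observations, z-scores and detrends each process
    # (when normalise=True), and stores the result with shape (3, 500, 1).
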
    def add_process(self, proc, verbose=False):
        """Append a univariate process to the dataset.

        Args:
            proc (ndarray):
                Univariate process to add; must have the same number of
                observations as the existing data.
        """
        proc = np.squeeze(proc)
        if not isinstance(proc, np.ndarray) or proc.ndim != 1:
            raise TypeError("Process must be a 1D numpy array")

        if hasattr(self, "_data"):
            try:
                self._data = np.append(
                    self._data, np.reshape(proc, (1, self.n_observations, 1)), axis=0
                )
            except ValueError:
                # np.reshape raises ValueError on a length mismatch.
                raise ValueError(
                    f"Process has {proc.size} observations; "
                    f"expected {self.n_observations}."
                )
        else:
            self.set_data(proc, dim_order="s", verbose=verbose)

        self._reset_data_size()

    def remove_process(self, procs):
        """Remove one or more processes from the dataset."""
        try:
            self._data = np.delete(self._data, procs, axis=0)
        except IndexError:
            print(
                f"Process {procs} is out of bounds of multivariate"
                f" time-series data with size {self.n_processes}"
            )

        self._reset_data_size()

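    # Usage sketch: grow and shrink a dataset in place:
    #
    #     d = Data(np.random.randn(3, 100))
    #     d.add_process(np.random.randn(100))  # now 4 processes
    #     d.remove_process(3)                  # back to 3 processes
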
    def _reorder_data(self, data, dim_order):
        """Reorder data dimensions to processes x observations x realisations."""
        # Add singleton dimensions for any axes missing from dim_order.
        missing_dims = "psr"
        for dim in dim_order:
            missing_dims = missing_dims.replace(dim, "")
        for dim in missing_dims:
            data = np.expand_dims(data, data.ndim)
            dim_order += dim

        # Reorder array dimensions if necessary.
        if dim_order[0] != "p":
            ind_p = dim_order.index("p")
            data = data.swapaxes(0, ind_p)
            dim_order = utils.swap_chars(dim_order, 0, ind_p)
        if dim_order[1] != "s":
            data = data.swapaxes(1, dim_order.index("s"))

        return data
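
    # For example, a (1000, 3) array with dim_order="sp" first gains a
    # singleton replication axis, (1000, 3, 1), then axes 0 and 1 are swapped
    # to the canonical (3, 1000, 1) layout.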

    def _reset_data_size(self):
        """Set the data-size properties from the stored array."""
        self.n_processes = self._data.shape[0]
        self.n_observations = self._data.shape[1]
        self.n_replications = self._data.shape[2]


def load_dataset(name):
    """Load one of the datasets shipped with the package."""
    basedir = os.path.join(os.path.dirname(__file__), "data")
    if name == "forex":
        filename = "forex.npy"
        dim_order = "sp"
    elif name == "cml":
        filename = "cml.npy"
        dim_order = "sp"
    else:
        raise NameError(f"Unknown dataset: {name}.")
    return Data(data=os.path.join(basedir, filename), dim_order=dim_order)
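

if __name__ == "__main__":
    # Minimal smoke test, a sketch assuming only numpy is available (the
    # bundled "forex"/"cml" datasets are not required here).
    rng = np.random.default_rng(seed=0)
    d = Data(rng.standard_normal((3, 500)), dim_order="ps", name="demo")
    print(d.name, d.n_processes, d.n_observations, d.n_replications)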