1
1
from __future__ import annotations
2
2
3
- from scipy .io .arff import arffread
3
+ import scipy .io .arff
4
+ from scipy .io .arff ._arffread import read_header
4
5
5
6
from river import base
6
7
7
8
from . import utils
8
9
9
10
10
11
def iter_arff (
11
- filepath_or_buffer , target : str | None = None , compression = "infer"
12
+ filepath_or_buffer , target : str | list [ str ] | None = None , compression = "infer" , sparse = False
12
13
) -> base .typing .Stream :
13
14
"""Iterates over rows from an ARFF file.
14
15
@@ -18,11 +19,96 @@ def iter_arff(
18
19
Either a string indicating the location of a file, or a buffer object that has a
19
20
`read` method.
20
21
target
21
- Name of the target field.
22
+ Name(s) of the target field. If `None`, then the target field is ignored. If a list of
23
+ names is passed, then the target is yielded as a dictionary instead of a single value.
22
24
compression
23
25
For on-the-fly decompression of on-disk data. If this is set to 'infer' and
24
26
`filepath_or_buffer` is a path, then the decompression method is inferred for the
25
27
following extensions: '.gz', '.zip'.
28
+ sparse
29
+ Whether the data rows use the sparse ARFF format, i.e. `{index value, ...}`, instead of comma-separated values.
30
+
31
+ Examples
32
+ --------
33
+
34
+ >>> cars = '''
35
+ ... @relation CarData
36
+ ... @attribute make {Toyota, Honda, Ford, Chevrolet}
37
+ ... @attribute model string
38
+ ... @attribute year numeric
39
+ ... @attribute price numeric
40
+ ... @attribute mpg numeric
41
+ ... @data
42
+ ... Toyota, Corolla, 2018, 15000, 30.5
43
+ ... Honda, Civic, 2019, 16000, 32.2
44
+ ... Ford, Mustang, 2020, 25000, 25.0
45
+ ... Chevrolet, Malibu, 2017, 18000, 28.9
46
+ ... Toyota, Camry, 2019, 22000, 29.8
47
+ ... '''
48
+ >>> with open('cars.arff', mode='w') as f:
49
+ ... _ = f.write(cars)
50
+
51
+ >>> from river import stream
52
+
53
+ >>> for x, y in stream.iter_arff('cars.arff', target='price'):
54
+ ... print(x, y)
55
+ {'make': 'Toyota', 'model': ' Corolla', 'year': 2018.0, 'mpg': 30.5} 15000.0
56
+ {'make': 'Honda', 'model': ' Civic', 'year': 2019.0, 'mpg': 32.2} 16000.0
57
+ {'make': 'Ford', 'model': ' Mustang', 'year': 2020.0, 'mpg': 25.0} 25000.0
58
+ {'make': 'Chevrolet', 'model': ' Malibu', 'year': 2017.0, 'mpg': 28.9} 18000.0
59
+ {'make': 'Toyota', 'model': ' Camry', 'year': 2019.0, 'mpg': 29.8} 22000.0
60
+
61
+ Finally, let's delete the example file.
62
+
63
+ >>> import os; os.remove('cars.arff')
64
+
65
+ ARFF files support sparse data. Let's create a sparse ARFF file.
66
+
67
+ >>> sparse = '''
68
+ ... % traindata
69
+ ... @RELATION "traindata: -C 6"
70
+ ... @ATTRIBUTE y0 {0, 1}
71
+ ... @ATTRIBUTE y1 {0, 1}
72
+ ... @ATTRIBUTE y2 {0, 1}
73
+ ... @ATTRIBUTE y3 {0, 1}
74
+ ... @ATTRIBUTE y4 {0, 1}
75
+ ... @ATTRIBUTE y5 {0, 1}
76
+ ... @ATTRIBUTE X0 NUMERIC
77
+ ... @ATTRIBUTE X1 NUMERIC
78
+ ... @ATTRIBUTE X2 NUMERIC
79
+ ... @DATA
80
+ ... { 3 1,6 0.863382,8 0.820094 }
81
+ ... { 2 1,6 0.659761 }
82
+ ... { 0 1,3 1,6 0.437881,8 0.818882 }
83
+ ... { 2 1,6 0.676477,7 0.724635,8 0.755123 }
84
+ ... '''
85
+
86
+ >>> with open('sparse.arff', mode='w') as f:
87
+ ... _ = f.write(sparse)
88
+
89
+ In addition, we'll specify that there are several target fields.
90
+
91
+ >>> arff_stream = stream.iter_arff(
92
+ ... 'sparse.arff',
93
+ ... target=['y0', 'y1', 'y2', 'y3', 'y4', 'y5'],
94
+ ... sparse=True
95
+ ... )
96
+
97
+ >>> for x, y in arff_stream:
98
+ ... print(x)
99
+ ... print(y)
100
+ {'X0': '0.863382', 'X2': '0.820094'}
101
+ {'y0': 0, 'y1': 0, 'y2': 0, 'y3': '1', 'y4': 0, 'y5': 0}
102
+ {'X0': '0.659761'}
103
+ {'y0': 0, 'y1': 0, 'y2': '1', 'y3': 0, 'y4': 0, 'y5': 0}
104
+ {'X0': '0.437881', 'X2': '0.818882'}
105
+ {'y0': '1', 'y1': 0, 'y2': 0, 'y3': '1', 'y4': 0, 'y5': 0}
106
+ {'X0': '0.676477', 'X1': '0.724635', 'X2': '0.755123'}
107
+ {'y0': 0, 'y1': 0, 'y2': '1', 'y3': 0, 'y4': 0, 'y5': 0}
108
+
109
+ References
110
+ ----------
111
+ [^1]: [ARFF format description from Weka](https://waikato.github.io/weka-wiki/formats_and_processing/arff_stable/)
26
112
27
113
"""
28
114
@@ -32,26 +118,38 @@ def iter_arff(
32
118
buffer = utils .open_filepath (buffer , compression )
33
119
34
120
try :
35
- rel , attrs = arffread . read_header (buffer )
121
+ rel , attrs = read_header (buffer )
36
122
except ValueError as e :
37
123
msg = f"Error while parsing header, error was: { e } "
38
- raise arffread .ParseArffError (msg )
124
+ raise scipy . io . arff .ParseArffError (msg )
39
125
40
126
names = [attr .name for attr in attrs ]
41
- types = [float if isinstance (attr , arffread .NumericAttribute ) else None for attr in attrs ]
127
+ # HACK: it's a bit hacky to rely on class name to determine what casting to apply
128
+ casts = [float if attr .__class__ .__name__ == "NumericAttribute" else None for attr in attrs ]
42
129
43
130
for r in buffer :
44
131
if len (r ) == 0 :
45
132
continue
46
- x = {
47
- name : typ (val ) if typ else val
48
- for name , typ , val in zip (names , types , r .rstrip ().split ("," ))
49
- }
50
- try :
51
- y = x .pop (target ) if target else None
52
- except KeyError as e :
53
- print (r )
54
- raise e
133
+
134
+ # Read row
135
+ if sparse :
136
+ x = {}
137
+ for s in r .rstrip ()[1 :- 1 ].strip ().split ("," ):
138
+ name_index , val = s .split (" " , 1 )
139
+ x [names [int (name_index )]] = val
140
+ else :
141
+ x = {
142
+ name : cast (val ) if cast else val
143
+ for name , cast , val in zip (names , casts , r .rstrip ().split ("," ))
144
+ }
145
+
146
+ # Handle target
147
+ y = None
148
+ if target is not None :
149
+ if isinstance (target , list ):
150
+ y = {name : x .pop (name , 0 ) for name in target }
151
+ else :
152
+ y = x .pop (target ) if target else None
55
153
56
154
yield x , y
57
155
0 commit comments