Skip to content

Commit b877000

Browse files
authored
Update iter_arff.py (#1288)
1 parent 48b0a11 commit b877000

File tree

3 files changed

+122
-19
lines changed

3 files changed

+122
-19
lines changed

docs/releases/unreleased.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ Calling `learn_one` in a pipeline will now update each part of the pipeline in t
2727

2828
- Added `preprocessing.OrdinalEncoder` to map string features to integers.
2929

30+
## stream
31+
32+
- `stream.iter_arff` now supports sparse data.
33+
- `stream.iter_arff` now supports multi-output targets.
34+
3035
## utils
3136

3237
- Added `utils.random.exponential` to retrieve random samples following an exponential distribution.

river/linear_model/bayesian_lin_reg.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class BayesianLinearRegression(base.Regressor):
4141
>>> metric = metrics.MAE()
4242
4343
>>> evaluate.progressive_val_score(dataset, model, metric)
44-
MAE: 0.586432
44+
MAE: 0.586...
4545
4646
>>> x, _ = next(iter(dataset))
4747
>>> model.predict_one(x)
@@ -73,7 +73,7 @@ class BayesianLinearRegression(base.Regressor):
7373
... )
7474
>>> metric = metrics.MAE()
7575
>>> evaluate.progressive_val_score(dataset, model, metric)
76-
MAE: 1.284016
76+
MAE: 1.284...
7777
7878
And here's how it performs with some smoothing:
7979
@@ -84,7 +84,7 @@ class BayesianLinearRegression(base.Regressor):
8484
... )
8585
>>> metric = metrics.MAE()
8686
>>> evaluate.progressive_val_score(dataset, model, metric)
87-
MAE: 0.15906
87+
MAE: 0.159...
8888
8989
Smoothing allows the model to gradually "forget" the past, and focus on the more recent data.
9090
@@ -99,7 +99,7 @@ class BayesianLinearRegression(base.Regressor):
9999
... )
100100
>>> metric = metrics.MAE()
101101
>>> evaluate.progressive_val_score(dataset, model, metric)
102-
MAE: 0.242248
102+
MAE: 0.242...
103103
104104
References
105105
----------

river/stream/iter_arff.py

Lines changed: 113 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from __future__ import annotations
22

3-
from scipy.io.arff import arffread
3+
import scipy.io.arff
4+
from scipy.io.arff._arffread import read_header
45

56
from river import base
67

78
from . import utils
89

910

1011
def iter_arff(
11-
filepath_or_buffer, target: str | None = None, compression="infer"
12+
filepath_or_buffer, target: str | list[str] | None = None, compression="infer", sparse=False
1213
) -> base.typing.Stream:
1314
"""Iterates over rows from an ARFF file.
1415
@@ -18,11 +19,96 @@ def iter_arff(
1819
Either a string indicating the location of a file, or a buffer object that has a
1920
`read` method.
2021
target
21-
Name of the target field.
22+
Name(s) of the target field. If `None`, then the target field is ignored. If a list of
23+
names is passed, then a dictionary is returned instead of a single value.
2224
compression
2325
For on-the-fly decompression of on-disk data. If this is set to 'infer' and
2426
`filepath_or_buffer` is a path, then the decompression method is inferred for the
2527
following extensions: '.gz', '.zip'.
28+
sparse
29+
Whether the rows are stored in the sparse ARFF format, i.e. `{index value, ...}` entries.
30+
31+
Examples
32+
--------
33+
34+
>>> cars = '''
35+
... @relation CarData
36+
... @attribute make {Toyota, Honda, Ford, Chevrolet}
37+
... @attribute model string
38+
... @attribute year numeric
39+
... @attribute price numeric
40+
... @attribute mpg numeric
41+
... @data
42+
... Toyota, Corolla, 2018, 15000, 30.5
43+
... Honda, Civic, 2019, 16000, 32.2
44+
... Ford, Mustang, 2020, 25000, 25.0
45+
... Chevrolet, Malibu, 2017, 18000, 28.9
46+
... Toyota, Camry, 2019, 22000, 29.8
47+
... '''
48+
>>> with open('cars.arff', mode='w') as f:
49+
... _ = f.write(cars)
50+
51+
>>> from river import stream
52+
53+
>>> for x, y in stream.iter_arff('cars.arff', target='price'):
54+
... print(x, y)
55+
{'make': 'Toyota', 'model': ' Corolla', 'year': 2018.0, 'mpg': 30.5} 15000.0
56+
{'make': 'Honda', 'model': ' Civic', 'year': 2019.0, 'mpg': 32.2} 16000.0
57+
{'make': 'Ford', 'model': ' Mustang', 'year': 2020.0, 'mpg': 25.0} 25000.0
58+
{'make': 'Chevrolet', 'model': ' Malibu', 'year': 2017.0, 'mpg': 28.9} 18000.0
59+
{'make': 'Toyota', 'model': ' Camry', 'year': 2019.0, 'mpg': 29.8} 22000.0
60+
61+
Let's delete the example file before moving on.
62+
63+
>>> import os; os.remove('cars.arff')
64+
65+
ARFF files support sparse data. Let's create a sparse ARFF file.
66+
67+
>>> sparse = '''
68+
... % traindata
69+
... @RELATION "traindata: -C 6"
70+
... @ATTRIBUTE y0 {0, 1}
71+
... @ATTRIBUTE y1 {0, 1}
72+
... @ATTRIBUTE y2 {0, 1}
73+
... @ATTRIBUTE y3 {0, 1}
74+
... @ATTRIBUTE y4 {0, 1}
75+
... @ATTRIBUTE y5 {0, 1}
76+
... @ATTRIBUTE X0 NUMERIC
77+
... @ATTRIBUTE X1 NUMERIC
78+
... @ATTRIBUTE X2 NUMERIC
79+
... @DATA
80+
... { 3 1,6 0.863382,8 0.820094 }
81+
... { 2 1,6 0.659761 }
82+
... { 0 1,3 1,6 0.437881,8 0.818882 }
83+
... { 2 1,6 0.676477,7 0.724635,8 0.755123 }
84+
... '''
85+
86+
>>> with open('sparse.arff', mode='w') as f:
87+
... _ = f.write(sparse)
88+
89+
In addition, we'll specify that there are several target fields.
90+
91+
>>> arff_stream = stream.iter_arff(
92+
... 'sparse.arff',
93+
... target=['y0', 'y1', 'y2', 'y3', 'y4', 'y5'],
94+
... sparse=True
95+
... )
96+
97+
>>> for x, y in arff_stream:
98+
... print(x)
99+
... print(y)
100+
{'X0': '0.863382', 'X2': '0.820094'}
101+
{'y0': 0, 'y1': 0, 'y2': 0, 'y3': '1', 'y4': 0, 'y5': 0}
102+
{'X0': '0.659761'}
103+
{'y0': 0, 'y1': 0, 'y2': '1', 'y3': 0, 'y4': 0, 'y5': 0}
104+
{'X0': '0.437881', 'X2': '0.818882'}
105+
{'y0': '1', 'y1': 0, 'y2': 0, 'y3': '1', 'y4': 0, 'y5': 0}
106+
{'X0': '0.676477', 'X1': '0.724635', 'X2': '0.755123'}
107+
{'y0': 0, 'y1': 0, 'y2': '1', 'y3': 0, 'y4': 0, 'y5': 0}
108+
109+
References
110+
----------
111+
[^1]: [ARFF format description from Weka](https://waikato.github.io/weka-wiki/formats_and_processing/arff_stable/)
26112
27113
"""
28114

@@ -32,26 +118,38 @@ def iter_arff(
32118
buffer = utils.open_filepath(buffer, compression)
33119

34120
try:
35-
rel, attrs = arffread.read_header(buffer)
121+
rel, attrs = read_header(buffer)
36122
except ValueError as e:
37123
msg = f"Error while parsing header, error was: {e}"
38-
raise arffread.ParseArffError(msg)
124+
raise scipy.io.arff.ParseArffError(msg)
39125

40126
names = [attr.name for attr in attrs]
41-
types = [float if isinstance(attr, arffread.NumericAttribute) else None for attr in attrs]
127+
# HACK: it's a bit hacky to rely on class name to determine what casting to apply
128+
casts = [float if attr.__class__.__name__ == "NumericAttribute" else None for attr in attrs]
42129

43130
for r in buffer:
44131
if len(r) == 0:
45132
continue
46-
x = {
47-
name: typ(val) if typ else val
48-
for name, typ, val in zip(names, types, r.rstrip().split(","))
49-
}
50-
try:
51-
y = x.pop(target) if target else None
52-
except KeyError as e:
53-
print(r)
54-
raise e
133+
134+
# Read row
135+
if sparse:
136+
x = {}
137+
for s in r.rstrip()[1:-1].strip().split(","):
138+
name_index, val = s.split(" ", 1)
139+
x[names[int(name_index)]] = val
140+
else:
141+
x = {
142+
name: cast(val) if cast else val
143+
for name, cast, val in zip(names, casts, r.rstrip().split(","))
144+
}
145+
146+
# Handle target
147+
y = None
148+
if target is not None:
149+
if isinstance(target, list):
150+
y = {name: x.pop(name, 0) for name in target}
151+
else:
152+
y = x.pop(target) if target else None
55153

56154
yield x, y
57155

0 commit comments

Comments
 (0)