diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
index ee21e93b88..05eae8bdb1 100644
--- a/docs/releases/unreleased.md
+++ b/docs/releases/unreleased.md
@@ -27,6 +27,11 @@ Calling `learn_one` in a pipeline will now update each part of the pipeline in t
 
 - Added `preprocessing.OrdinalEncoder`, to map string features to integers.
 
+## stream
+
+- `stream.iter_arff` now supports sparse data.
+- `stream.iter_arff` now supports multi-output targets.
+
 ## utils
 
 - Added `utils.random.exponential` to retrieve random samples following an exponential distribution.
diff --git a/river/linear_model/bayesian_lin_reg.py b/river/linear_model/bayesian_lin_reg.py
index fb87bcc029..4cf4bb2f31 100644
--- a/river/linear_model/bayesian_lin_reg.py
+++ b/river/linear_model/bayesian_lin_reg.py
@@ -41,7 +41,7 @@ class BayesianLinearRegression(base.Regressor):
 
     >>> metric = metrics.MAE()
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    MAE: 0.586432
+    MAE: 0.586...
 
     >>> x, _ = next(iter(dataset))
     >>> model.predict_one(x)
@@ -73,7 +73,7 @@ class BayesianLinearRegression(base.Regressor):
     ... )
     >>> metric = metrics.MAE()
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    MAE: 1.284016
+    MAE: 1.284...
 
     And here's how it performs with some smoothing:
 
@@ -84,7 +84,7 @@ class BayesianLinearRegression(base.Regressor):
     ... )
     >>> metric = metrics.MAE()
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    MAE: 0.15906
+    MAE: 0.159...
 
     Smoothing allows the model to gradually "forget" the past, and focus on the more recent data.
 
@@ -99,7 +99,7 @@ class BayesianLinearRegression(base.Regressor):
     ... )
     >>> metric = metrics.MAE()
    >>> evaluate.progressive_val_score(dataset, model, metric)
-    MAE: 0.242248
+    MAE: 0.242...
 
     References
     ----------
diff --git a/river/stream/iter_arff.py b/river/stream/iter_arff.py
index 1903445ecc..830246c16d 100644
--- a/river/stream/iter_arff.py
+++ b/river/stream/iter_arff.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from scipy.io.arff import arffread
+import scipy.io.arff
+from scipy.io.arff._arffread import read_header
 
 from river import base
 
@@ -8,7 +9,7 @@
 
 
 def iter_arff(
-    filepath_or_buffer, target: str | None = None, compression="infer"
+    filepath_or_buffer, target: str | list[str] | None = None, compression="infer", sparse=False
 ) -> base.typing.Stream:
     """Iterates over rows from an ARFF file.
 
@@ -18,11 +19,101 @@
         Either a string indicating the location of a file, or a buffer object that has a `read`
         method.
     target
-        Name of the target field.
+        Name(s) of the target field. If `None`, then the target field is ignored. If a list of
+        names is passed, then a dictionary is returned instead of a single value.
     compression
         For on-the-fly decompression of on-disk data. If this is set to 'infer' and
         `filepath_or_buffer` is a path, then the decompression method is inferred for the
         following extensions: '.gz', '.zip'.
+    sparse
+        Whether the data is sparse or not. If so, each row is expected to list only its non-zero
+        entries, as space-separated `index value` pairs within curly braces.
+
+    Examples
+    --------
+
+    >>> cars = '''
+    ... @relation CarData
+    ... @attribute make {Toyota, Honda, Ford, Chevrolet}
+    ... @attribute model string
+    ... @attribute year numeric
+    ... @attribute price numeric
+    ... @attribute mpg numeric
+    ... @data
+    ... Toyota, Corolla, 2018, 15000, 30.5
+    ... Honda, Civic, 2019, 16000, 32.2
+    ... Ford, Mustang, 2020, 25000, 25.0
+    ... Chevrolet, Malibu, 2017, 18000, 28.9
+    ... Toyota, Camry, 2019, 22000, 29.8
+    ... '''
+    >>> with open('cars.arff', mode='w') as f:
+    ...     _ = f.write(cars)
+
+    >>> from river import stream
+
+    >>> for x, y in stream.iter_arff('cars.arff', target='price'):
+    ...     print(x, y)
+    {'make': 'Toyota', 'model': ' Corolla', 'year': 2018.0, 'mpg': 30.5} 15000.0
+    {'make': 'Honda', 'model': ' Civic', 'year': 2019.0, 'mpg': 32.2} 16000.0
+    {'make': 'Ford', 'model': ' Mustang', 'year': 2020.0, 'mpg': 25.0} 25000.0
+    {'make': 'Chevrolet', 'model': ' Malibu', 'year': 2017.0, 'mpg': 28.9} 18000.0
+    {'make': 'Toyota', 'model': ' Camry', 'year': 2019.0, 'mpg': 29.8} 22000.0
+
+    Let's delete the example file before moving on.
+
+    >>> import os; os.remove('cars.arff')
+
+    ARFF files support sparse data. Let's create a sparse ARFF file.
+
+    >>> sparse = '''
+    ... % traindata
+    ... @RELATION "traindata: -C 6"
+    ... @ATTRIBUTE y0 {0, 1}
+    ... @ATTRIBUTE y1 {0, 1}
+    ... @ATTRIBUTE y2 {0, 1}
+    ... @ATTRIBUTE y3 {0, 1}
+    ... @ATTRIBUTE y4 {0, 1}
+    ... @ATTRIBUTE y5 {0, 1}
+    ... @ATTRIBUTE X0 NUMERIC
+    ... @ATTRIBUTE X1 NUMERIC
+    ... @ATTRIBUTE X2 NUMERIC
+    ... @DATA
+    ... { 3 1,6 0.863382,8 0.820094 }
+    ... { 2 1,6 0.659761 }
+    ... { 0 1,3 1,6 0.437881,8 0.818882 }
+    ... { 2 1,6 0.676477,7 0.724635,8 0.755123 }
+    ... '''
+
+    >>> with open('sparse.arff', mode='w') as f:
+    ...     _ = f.write(sparse)
+
+    In addition, we'll specify that there are several target fields.
+
+    >>> arff_stream = stream.iter_arff(
+    ...     'sparse.arff',
+    ...     target=['y0', 'y1', 'y2', 'y3', 'y4', 'y5'],
+    ...     sparse=True
+    ... )
+
+    >>> for x, y in arff_stream:
+    ...     print(x)
+    ...     print(y)
+    {'X0': '0.863382', 'X2': '0.820094'}
+    {'y0': 0, 'y1': 0, 'y2': 0, 'y3': '1', 'y4': 0, 'y5': 0}
+    {'X0': '0.659761'}
+    {'y0': 0, 'y1': 0, 'y2': '1', 'y3': 0, 'y4': 0, 'y5': 0}
+    {'X0': '0.437881', 'X2': '0.818882'}
+    {'y0': '1', 'y1': 0, 'y2': 0, 'y3': '1', 'y4': 0, 'y5': 0}
+    {'X0': '0.676477', 'X1': '0.724635', 'X2': '0.755123'}
+    {'y0': 0, 'y1': 0, 'y2': '1', 'y3': 0, 'y4': 0, 'y5': 0}
+
+    Let's delete this example file as well.
+
+    >>> os.remove('sparse.arff')
+
+    References
+    ----------
+    [^1]: [ARFF format description from Weka](https://waikato.github.io/weka-wiki/formats_and_processing/arff_stable/)
 
     """
@@ -32,26 +123,38 @@
 
     buffer = utils.open_filepath(buffer, compression)
 
     try:
-        rel, attrs = arffread.read_header(buffer)
+        rel, attrs = read_header(buffer)
     except ValueError as e:
         msg = f"Error while parsing header, error was: {e}"
-        raise arffread.ParseArffError(msg)
+        raise scipy.io.arff.ParseArffError(msg)
 
     names = [attr.name for attr in attrs]
-    types = [float if isinstance(attr, arffread.NumericAttribute) else None for attr in attrs]
+    # HACK: it's a bit hacky to rely on the class name to determine which cast to apply
+    casts = [float if attr.__class__.__name__ == "NumericAttribute" else None for attr in attrs]
 
     for r in buffer:
         if len(r) == 0:
             continue
-        x = {
-            name: typ(val) if typ else val
-            for name, typ, val in zip(names, types, r.rstrip().split(","))
-        }
-        try:
-            y = x.pop(target) if target else None
-        except KeyError as e:
-            print(r)
-            raise e
+
+        # Read row
+        if sparse:
+            x = {}
+            for s in r.rstrip()[1:-1].strip().split(","):
+                name_index, val = s.split(" ", 1)
+                x[names[int(name_index)]] = val
+        else:
+            x = {
+                name: cast(val) if cast else val
+                for name, cast, val in zip(names, casts, r.rstrip().split(","))
+            }
+
+        # Handle target
+        y = None
+        if target is not None:
+            if isinstance(target, list):
+                y = {name: x.pop(name, 0) for name in target}
+            else:
+                y = x.pop(target)
 
         yield x, y
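
A usage note on the sparse path: as the doctest above shows, values read from sparse rows are yielded as strings, because the numeric casts are only applied on the dense path, and absent multi-output targets are filled in as the integer `0` via `x.pop(name, 0)`. Below is a minimal sketch of how a caller could normalize both on the way out. The wrapper `iter_sparse_arff_cast` is hypothetical (not part of this change), and it assumes a `sparse.arff` file written as in the example above.

```python
from river import stream

def iter_sparse_arff_cast(path, targets):
    """Hypothetical wrapper: cast the string values yielded by the sparse path.

    Features become floats; targets become ints (absent sparse targets are
    already filled in as 0 by `iter_arff`).
    """
    for x, y in stream.iter_arff(path, target=targets, sparse=True):
        yield {k: float(v) for k, v in x.items()}, {k: int(v) for k, v in y.items()}

for x, y in iter_sparse_arff_cast('sparse.arff', ['y0', 'y1', 'y2', 'y3', 'y4', 'y5']):
    print(x, y)
```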