Discussion on "normalizing first" V.S. "masking first"

### 1. System Info

def preprocess_pems_traffic(
    rate,
    n_steps,
    pattern: str = "point",
    **kwargs,
) -> dict:
    """Load the PeMS traffic dataset, split it by month, scale it, and window it.

    The rows are split chronologically by calendar month into train, validation
    and test sets. A ``StandardScaler`` is fitted on the train features only and
    then applied to all three splits, after which every split is passed through
    ``sliding_window``. When ``rate`` is positive, ``create_missingness`` is
    applied to each (already scaled and windowed) split and the unmasked arrays
    are kept under the ``*_ori`` keys.

    Parameters
    ----------
    rate:
        The missing rate. Must satisfy ``0 <= rate < 1``.

    n_steps:
        The number of time steps in the generated data samples.
        Also the window size of the sliding window.

    pattern:
        The missing pattern to apply to the dataset.
        Must be one of ['point', 'subseq', 'block'].

    **kwargs:
        Extra keyword arguments forwarded unchanged to ``create_missingness``.

    Returns
    -------
    processed_dataset :
        A dictionary containing the processed PeMS traffic, with the keys
        ``n_steps``, ``n_features``, ``scaler``, ``train_X``, ``val_X`` and
        ``test_X``. When ``rate > 0`` it additionally holds ``train_X_ori``,
        ``val_X_ori`` and ``test_X_ori``.
    """

    assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"
    assert n_steps > 0, f"sample_n_steps must be larger than 0, but got {n_steps}"

    # read the raw data
    raw = tsdb.load("pems_traffic")
    df = raw["X"]

    # every column except the timestamp is treated as a feature
    feature_names = df.columns.tolist()
    feature_names.remove("date")
    df["date"] = pd.to_datetime(df["date"])

    # calendar month of every row, computed once and reused for all splits
    row_month = df["date"].dt.to_period("M")
    unique_months = row_month.unique()

    # chronological split: first 15 months -> train, next 4 -> val, the rest -> test
    # (the original author notes the remaining 6 months include 2018-07 with
    # only 2 days, so the test set is roughly 5 months)
    selected_as_train = unique_months[:15]
    selected_as_val = unique_months[15:19]
    selected_as_test = unique_months[19:]
    logger.info(f"months selected as train set are {selected_as_train}")
    logger.info(f"months selected as val set are {selected_as_val}")
    logger.info(f"months selected as test set are {selected_as_test}")

    train_features = df[row_month.isin(selected_as_train)].loc[:, feature_names]
    val_features = df[row_month.isin(selected_as_val)].loc[:, feature_names]
    test_features = df[row_month.isin(selected_as_test)].loc[:, feature_names]

    # the scaler statistics come from the train split only
    scaler = StandardScaler()
    train_X = sliding_window(scaler.fit_transform(train_features), n_steps)
    val_X = sliding_window(scaler.transform(val_features), n_steps)
    test_X = sliding_window(scaler.transform(test_features), n_steps)

    # assemble the final processed data into a dictionary
    processed_dataset = {
        # general info
        "n_steps": n_steps,
        "n_features": train_X.shape[-1],
        "scaler": scaler,
        # one entry per split
        "train_X": train_X,
        "val_X": val_X,
        "test_X": test_X,
    }

    if rate > 0:
        # hold out ground truth in the original data for evaluation
        train_X_ori, val_X_ori, test_X_ori = train_X, val_X, test_X

        # mask the train set the same way as the validation and test sets below;
        # the call order train -> val -> test is kept on purpose
        train_X = create_missingness(train_X, rate, pattern, **kwargs)
        val_X = create_missingness(val_X, rate, pattern, **kwargs)
        test_X = create_missingness(test_X, rate, pattern, **kwargs)

        processed_dataset.update(
            {
                "train_X": train_X,
                "train_X_ori": train_X_ori,
                "val_X": val_X,
                "val_X_ori": val_X_ori,
                "test_X": test_X,
                "test_X_ori": test_X_ori,
            }
        )
    else:
        logger.warning("rate is 0, no missing values are artificially added.")

    print_final_dataset_info(train_X, val_X, test_X)
    return processed_dataset
You should mask before normalizing, but if you do this, the distribution will shift. I think a possible alternative is to use a portion of the complete data for distribution estimation, and to use the distribution estimated from this portion to approximate the distribution of all the data.

### 2. Information

- [ ] The official example scripts
- [ ] My own created scripts

### 3. Reproduction

xxx

### 4. Expected behavior

xxx

### 5. Your contribution

xxx

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Discussion on "normalizing first" V.S. "masking first" #755

1. System Info

2. Information

3. Reproduction

4. Expected behavior

5. Your contribution

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Uh oh!

Discussion on "normalizing first" V.S. "masking first" #755

Description

1. System Info

2. Information

3. Reproduction

4. Expected behavior

5. Your contribution

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions