-
-
Notifications
You must be signed in to change notification settings - Fork 361
Description
Location of the documentation
https://pandera.readthedocs.io/en/latest/dataframe_models.html#required-columns
https://pandera.readthedocs.io/en/latest/dataframe_schemas.html#adding-missing-columns
https://pandera.readthedocs.io/en/latest/dataframe_schemas.html#required-columns
Documentation problem
Only required columns are added when using `add_missing_columns`, which is not clear from the documentation. Moreover, the `add_missing_columns` option is not mentioned at all in the DataFrame Model documentation.
Suggested fix for documentation
Merge the documentation of required columns with the documentation of adding missing columns, perhaps under a title like "Optional Columns", to help users understand the difference between requiring a column in the validated DataFrame and requiring it in the input data.
import pandas as pd
import pandera.pandas as pa
# Schema with three kinds of column: one with no default and not optional,
# one declared optional (required=False), and one that carries a default.
schema = pa.DataFrameSchema(
    columns={
        "needed_in_input": pa.Column(int),
        "maybe": pa.Column(int, required=False),
        "needed_in_output": pa.Column(int, default=1),
    },
    add_missing_columns=True,
    coerce=True,
)

# The input frame only provides the column that has no default.
df = pd.DataFrame(data={"needed_in_input": [1, 2, 3]})
schema.validate(df)
from typing import Optional
import pandas as pd
import pandera.pandas as pa
from pandera.typing import Series
class Schema(pa.DataFrameModel):
    # No default and not Optional.
    needed_in_input: int
    # Optional annotation; presumably the model-API counterpart of
    # `required=False` on a Column - confirm against the pandera docs.
    maybe: Optional[int]
    # Carries a default value of 1.
    needed_in_output: int = pa.Field(default=1)

    class Config:
        # Schema-wide options, nested inside the model class.
        add_missing_columns = True
        coerce = True
# The input frame only provides the column that has no default.
df = pd.DataFrame(data={"needed_in_input": [1, 2, 3]})
Schema.validate(df)
Additionally, both are (more or less) mutually exclusive, as the `add_missing_columns` option applies to the schema itself, and nullable required values would just be replaced, as can be seen in this test:
from contextlib import nullcontext as does_not_raise
from typing import ContextManager
import numpy as np
import pandas as pd
import pandera.errors
import pandera.pandas as pa
import pytest
# Shared source frame for the parametrized cases: one column containing a
# NaN entry and one all-integer column.
df = pd.DataFrame(
    {
        "nullable_needed_in_input": [1, 2, 3, np.nan],
        "maybe": [1, 2, 3, 5],
    }
)
@pytest.mark.parametrize(
    (
        "add_missing_columns",
        "df",
        "ctx",
    ),
    # "Valid data": the frame contains only the nullable column that is not
    # marked required=False.
    [
        pytest.param(
            _add_missing_columns,
            df[["nullable_needed_in_input"]],
            does_not_raise(),
            id="Valid data",
        )
        for _add_missing_columns in [True, False]
    ]
    # "Invalid data": the frame contains only the required=False column.
    # NOTE(review): with add_missing_columns=True pandera may fill in the
    # missing nullable column instead of raising - confirm that SchemaError
    # is really the expected outcome for that case.
    + [
        pytest.param(
            _add_missing_columns,
            df[["maybe"]],
            pytest.raises(pa.errors.SchemaError),
            id="Invalid data",
        )
        for _add_missing_columns in [True, False]
    ],
)
def test_required_vs_add_missing(
    add_missing_columns: bool, df: pd.DataFrame, ctx: ContextManager
):
    """Validate ``df`` with ``add_missing_columns`` switched on and off.

    Args:
        add_missing_columns: Value passed to the schema option of the same name.
        df: Frame to validate; a single-column slice of the module-level frame.
        ctx: Context manager stating the expected outcome, either
            ``does_not_raise()`` or ``pytest.raises(...)``.
    """
    with ctx:
        pa.DataFrameSchema(
            columns={
                "nullable_needed_in_input": pa.Column(float, nullable=True),
                "maybe": pa.Column(int, required=False),
            },
            add_missing_columns=add_missing_columns,
            coerce=True,
        ).validate(df)