Skip to content

Add residuals_ attribute to pg.anova output #260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ a. The :py:func:`pingouin.pairwise_ttests` has been renamed to :py:func:`pingoui

a. Allow :py:func:`pingouin.bayesfactor_binom` to take Beta alternative model. `PR 252 <https://github.com/raphaelvallat/pingouin/pull/252>`_.
b. Allow keyword arguments for logistic regression in :py:func:`pingouin.mediation_analysis`. `PR 245 <https://github.com/raphaelvallat/pingouin/pull/245>`_.
c. :py:func:`pingouin.anova` now returns the residuals. `PR 260 <https://github.com/raphaelvallat/pingouin/pull/260`_.

v0.5.1 (February 2022)
----------------------
Expand Down
70 changes: 58 additions & 12 deletions pingouin/parametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,8 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto',

# Calculate sums of squares
ss_with = ((grp_with.mean() - grandmean)**2 * grp_with.count()).sum()
ss_resall = grp_with.apply(lambda x: (x - x.mean())**2).sum()
resid = grp_with.apply(lambda x: (x - x.mean()))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have any documentation for this definition of the residuals? If so, can you add as a comment

ss_resall = (resid**2).sum()
# sstotal = sstime + ss_resall = sstime + (sssubj + sserror)
# ss_total = ((data[dv] - grandmean)**2).sum()
# We can further divide the residuals into a within and between component:
Expand Down Expand Up @@ -611,7 +612,11 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto',

aov = aov.reindex(columns=col_order)
aov.dropna(how='all', axis=1, inplace=True)
return _postprocess_dataframe(aov)
aov = _postprocess_dataframe(aov)

aov.residuals_ = 0
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could use pandas.DataFrame.attrs, e.g:

df.attrs = dict(residuals=resid)

although I have never tried it so I don't know if it works well

aov.residuals_ = resid
return aov


def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"):
Expand Down Expand Up @@ -734,6 +739,9 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"):
p_b_corr = f(df_b_c, df_bs_c).sf(f_b)
p_ab_corr = f(df_ab_c, df_abs_c).sf(f_ab)

# Residuals
resid = data.groupby([a, b], observed=True)[dv].transform(lambda x: x - x.mean())

# Create dataframe
aov = pd.DataFrame({
'Source': [a, b, a + ' * ' + b],
Expand All @@ -746,7 +754,11 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"):
'p-GG-corr': [p_a_corr, p_b_corr, p_ab_corr],
effsize: [ef_a, ef_b, ef_ab],
'eps': [eps_a, eps_b, eps_ab]})
return _postprocess_dataframe(aov)
aov = _postprocess_dataframe(aov)

aov.residuals_ = 0
aov.residuals_ = resid
return aov


@pf.register_dataframe_method
Expand Down Expand Up @@ -793,6 +805,12 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False,
* ``'p-unc'``: uncorrected p-values
* ``'np2'``: Partial eta-square effect sizes

In adition, the output dataframe comes with hidden attribute
for the residuals that can be accessed as follows:

>>> aov = pg.anova() # doctest: +SKIP
>>> aov.residuals_ # doctest: +SKIP

See Also
--------
rm_anova : One-way and two-way repeated measures ANOVA
Expand Down Expand Up @@ -938,9 +956,9 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False,
ssbetween = ((grp.mean() - data[dv].mean())**2 * grp.count()).sum()
# Within effect (= error between)
# = (grp.var(ddof=0) * grp.count()).sum()
sserror = grp.apply(lambda x: (x - x.mean())**2).sum()
error = grp.apply(lambda x: x - x.mean())
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, do you have any documentation / reference implementation for this method of calculating the residuals?

sserror = (error**2).sum()
# In 1-way ANOVA, sstotal = ssbetween + sserror
# sstotal = ssbetween + sserror

# Calculate DOF, MS, F and p-values
ddof1 = n_groups - 1
Expand Down Expand Up @@ -978,7 +996,10 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False,
})

aov.dropna(how='all', axis=1, inplace=True)
return _postprocess_dataframe(aov)
aov = _postprocess_dataframe(aov)
aov.residuals_ = 0 # Trick to avoid Pandas warning
aov.residuals_ = error # Residuals is a hidden attribute
return aov


def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'):
Expand Down Expand Up @@ -1010,7 +1031,8 @@ def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'):
ss_fac1 = aov_fac1.at[0, 'SS']
ss_fac2 = aov_fac2.at[0, 'SS']
ss_tot = ((data[dv] - data[dv].mean())**2).sum()
ss_resid = np.sum(grp_both.apply(lambda x: (x - x.mean())**2))
resid = grp_both.apply(lambda x: (x - x.mean()))
ss_resid = (resid**2).sum()
ss_inter = ss_tot - (ss_resid + ss_fac1 + ss_fac2)
# Degrees of freedom
df_fac1 = aov_fac1.at[0, 'DF']
Expand Down Expand Up @@ -1062,7 +1084,11 @@ def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'):
})

aov.dropna(how='all', axis=1, inplace=True)
return _postprocess_dataframe(aov)
aov = _postprocess_dataframe(aov)

aov.residuals_ = 0
aov.residuals_ = resid
return aov


def anovan(data=None, dv=None, between=None, ss_type=2, effsize='np2'):
Expand Down Expand Up @@ -1142,6 +1168,10 @@ def format_source(x):
# Add formula to dataframe
aov = _postprocess_dataframe(aov)
aov.formula_ = formula

# Add residuals to dataframe
aov.residuals_ = 0
aov.residuals_ = lm.resid
return aov


Expand Down Expand Up @@ -1276,7 +1306,8 @@ def welch_anova(data=None, dv=None, between=None):
adj_grandmean = (weights * grp.mean()).sum() / weights.sum()

# Sums of squares (regular and adjusted)
ss_res = grp.apply(lambda x: (x - x.mean())**2).sum()
resid = grp.apply(lambda x: x - x.mean())
ss_res = (resid**2).sum()
ss_bet = ((grp.mean() - data[dv].mean())**2 * grp.count()).sum()
ss_betadj = np.sum(weights * np.square(grp.mean() - adj_grandmean))
ms_betadj = ss_betadj / ddof1
Expand All @@ -1296,7 +1327,12 @@ def welch_anova(data=None, dv=None, between=None):
'p-unc': pval,
'np2': np2
}, index=[0])
return _postprocess_dataframe(aov)

aov = _postprocess_dataframe(aov)
aov.residuals_ = 0
aov.residuals_ = resid

return aov


@pf.register_dataframe_method
Expand Down Expand Up @@ -1436,7 +1472,8 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None,
# Extract residuals and interactions
grp = data.groupby([between, within], observed=True)[dv]
# ssresall = residuals within + residuals between
ss_resall = grp.apply(lambda x: (x - x.mean())**2).sum()
resid = grp.apply(lambda x: x - x.mean())
ss_resall = (resid**2).sum()
# Interaction
ss_inter = ss_total - (ss_resall + ss_with + ss_betw)
ss_reswith = aov_with.at[1, 'SS'] - ss_inter
Expand Down Expand Up @@ -1504,7 +1541,12 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None,
'sphericity', 'W-spher', 'p-spher']
aov = aov.reindex(columns=col_order)
aov.dropna(how='all', axis=1, inplace=True)
return _postprocess_dataframe(aov)
aov = _postprocess_dataframe(aov)

aov.residuals_ = 0
aov.residuals_ = resid

return aov


@pf.register_dataframe_method
Expand Down Expand Up @@ -1635,4 +1677,8 @@ def ancova(data=None, dv=None, between=None, covar=None, effsize="np2"):
# Add bw as an attribute (for rm_corr function)
aov = _postprocess_dataframe(aov)
aov.bw_ = model.params.iloc[-1]

# Add residuals attribute
aov.residuals_ = 0
aov.residuals_ = model.resid
return aov
58 changes: 58 additions & 0 deletions pingouin/tests/test_parametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,16 @@ def test_anova(self):
aov = anova(dv='Pain threshold', between='Hair color', data=df_pain,
effsize="n2", detailed=True).round(3)
assert aov.at[0, 'n2'] == .576
# Compare residuals with R
# Use different dataset to produce residuals more than 3 decimal places
# aov(Cholesterol ~ Risk, DF)$residuals
resid = read_dataset('anova3').anova(
dv='Cholesterol', between=['Risk']
).residuals_
array_equal(
resid[0:5].round(3),
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that you are always testing only the first 5 values of the residuals. We need to test all values, i.e. array_equal on all the residuals

[-0.177, 1.816, -0.444, -1.469, 0.693]
)

# Unbalanced and with missing values
df_pain.loc[[17, 18], 'Pain threshold'] = np.nan
Expand Down Expand Up @@ -218,6 +228,16 @@ def test_anova(self):
aov2 = anova(dv="Yield", between=["Blend", "Crop"],
data=df_aov2, effsize="n2").round(4)
array_equal(aov2.loc[[0, 1, 2], 'n2'], [0.0001, 0.1843, 0.1589])
# Compare residuals with R
# Use different dataset to produce residuals more than 3 decimal places
# aov(Cholesterol ~ Risk * Drug, DF)$residuals
resid = read_dataset('anova3').anova(
dv='Cholesterol', between=['Risk', 'Drug']
).residuals_
array_equal(
resid[0:5].round(3),
[-0.022, 1.972, -0.288, -1.313, 0.849]
)

# Two-way ANOVA with unbalanced design
df_aov2 = read_dataset('anova2_unbalanced')
Expand Down Expand Up @@ -260,6 +280,15 @@ def test_anova(self):
0.057, 0.044, np.nan])
array_equal(aov3_ss1.loc[:, 'p-unc'], [0.123, 0.001, 0.619, 0.711,
0.229, 0.245, 0.343, np.nan])
# Compare residuals with R
# ANOVA = aov(Cholesterol~Sex*Risk*Drug, DF)$residuals
resid = df_aov3.anova(dv="Cholesterol",
between=["Sex", "Risk", "Drug"]).residuals_
array_equal(
resid[0:5].round(3),
[-0.261, 1.732, -0.527, -1.553, 0.610]
)

# Unbalanced
df_aov3 = read_dataset('anova3_unbalanced')
aov3_ss1 = anova(dv="Cholesterol", between=['Sex', 'Risk', 'Drug'],
Expand Down Expand Up @@ -334,6 +363,14 @@ def test_rm_anova(self):
assert aov.at[0, 'F'] == 3.913
assert aov.at[0, 'p-unc'] == .023
assert aov.at[0, 'np2'] == .062
# Compare residuals with R package rstatix
# ANOVA = rstatix::anova_test(DF, dv='Scores', within='Time', wid='Subject')
# attr(ANOVA, 'args')$model$residuals
resid = df.rm_anova(dv='Scores', within='Time', subject='Subject').residuals_
array_equal(
resid[0:5].round(3),
[0.501, -1.161, 1.462, -0.283, -0.691]
)

# Same but with categorical columns
aov = rm_anova(dv='Scores', within='Time', subject='Subject',
Expand Down Expand Up @@ -375,6 +412,15 @@ def test_rm_anova2(self):
array_equal(aov.loc[:, 'F'], [33.852, 26.959, 12.632])
array_equal(aov.loc[:, 'np2'], [0.790, 0.750, 0.584])
array_equal(aov.loc[:, 'eps'], [1., 0.969, 0.727])
# Compare residuals with R package rstatix
# ANOVA = rstatix::anova_test(DF, dv='Performance', within=c('Time', 'Metric'), wid='Subject')
# attr(ANOVA, 'args')$model$residuals
resid = data.rm_anova(dv='Performance', within=['Time', 'Metric'],
subject='Subject').residuals_
array_equal(
resid[0:5],
[1, -3, -1, 7, -6]
)

# With categorical
data_cat = data.copy()
Expand Down Expand Up @@ -450,6 +496,18 @@ def test_mixed_anova(self):
between='Group', data=df, effsize="ng2").round(3)
array_equal(aov.loc[:, 'ng2'], [0.031, 0.042, 0.029])

# Compare residuals with R package rstatix
# ANOVA <- rstatix::anova_test(DF, dv='Scores', between='Group',
# within='Time', wid='Subject')
# attr(ANOVA, 'args')$model$residuals
resid = df.mixed_anova(
dv='Scores', within='Time', between='Group', subject='Subject'
).residuals_
array_equal(
resid[0:5].round(3),
[0.463, -1.199, 1.425, -0.321, -0.729]
)

# With missing values
df_nan2 = df_nan.copy()
df_nan2.iloc[158, 0] = np.nan
Expand Down