-
-
Notifications
You must be signed in to change notification settings - Fork 155
Add residuals_ attribute to pg.anova output #260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
fdda464
cd566bb
e21366b
4c38d70
dbe190f
dccc1a9
7089127
7b71c7f
99f6e47
6c1b086
d9ab352
1d5a064
50a6bbe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -532,7 +532,8 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto', | |
|
||
# Calculate sums of squares | ||
ss_with = ((grp_with.mean() - grandmean)**2 * grp_with.count()).sum() | ||
ss_resall = grp_with.apply(lambda x: (x - x.mean())**2).sum() | ||
resid = grp_with.apply(lambda x: (x - x.mean())) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have any documentation for this definition of the residuals? If so, can you add as a comment |
||
ss_resall = (resid**2).sum() | ||
# sstotal = sstime + ss_resall = sstime + (sssubj + sserror) | ||
# ss_total = ((data[dv] - grandmean)**2).sum() | ||
# We can further divide the residuals into a within and between component: | ||
|
@@ -611,7 +612,11 @@ def rm_anova(data=None, dv=None, within=None, subject=None, correction='auto', | |
|
||
aov = aov.reindex(columns=col_order) | ||
aov.dropna(how='all', axis=1, inplace=True) | ||
return _postprocess_dataframe(aov) | ||
aov = _postprocess_dataframe(aov) | ||
|
||
aov.residuals_ = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alternatively, we could use pandas.DataFrame.attrs, e.g: df.attrs = dict(residuals=resid) although I have never tried it so I don't know if it works well |
||
aov.residuals_ = resid | ||
return aov | ||
|
||
|
||
def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"): | ||
|
@@ -734,6 +739,9 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"): | |
p_b_corr = f(df_b_c, df_bs_c).sf(f_b) | ||
p_ab_corr = f(df_ab_c, df_abs_c).sf(f_ab) | ||
|
||
# Residuals | ||
resid = data.groupby([a, b], observed=True)[dv].transform(lambda x: x - x.mean()) | ||
|
||
# Create dataframe | ||
aov = pd.DataFrame({ | ||
'Source': [a, b, a + ' * ' + b], | ||
|
@@ -746,7 +754,11 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="np2"): | |
'p-GG-corr': [p_a_corr, p_b_corr, p_ab_corr], | ||
effsize: [ef_a, ef_b, ef_ab], | ||
'eps': [eps_a, eps_b, eps_ab]}) | ||
return _postprocess_dataframe(aov) | ||
aov = _postprocess_dataframe(aov) | ||
|
||
aov.residuals_ = 0 | ||
aov.residuals_ = resid | ||
return aov | ||
|
||
|
||
@pf.register_dataframe_method | ||
|
@@ -793,6 +805,12 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False, | |
* ``'p-unc'``: uncorrected p-values | ||
* ``'np2'``: Partial eta-square effect sizes | ||
|
||
In adition, the output dataframe comes with hidden attribute | ||
for the residuals that can be accessed as follows: | ||
|
||
>>> aov = pg.anova() # doctest: +SKIP | ||
>>> aov.residuals_ # doctest: +SKIP | ||
|
||
See Also | ||
-------- | ||
rm_anova : One-way and two-way repeated measures ANOVA | ||
|
@@ -938,9 +956,9 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False, | |
ssbetween = ((grp.mean() - data[dv].mean())**2 * grp.count()).sum() | ||
# Within effect (= error between) | ||
# = (grp.var(ddof=0) * grp.count()).sum() | ||
sserror = grp.apply(lambda x: (x - x.mean())**2).sum() | ||
error = grp.apply(lambda x: x - x.mean()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, do you have any documentation / reference implementation for this method of calculating the residuals? |
||
sserror = (error**2).sum() | ||
# In 1-way ANOVA, sstotal = ssbetween + sserror | ||
# sstotal = ssbetween + sserror | ||
|
||
# Calculate DOF, MS, F and p-values | ||
ddof1 = n_groups - 1 | ||
|
@@ -978,7 +996,10 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False, | |
}) | ||
|
||
aov.dropna(how='all', axis=1, inplace=True) | ||
return _postprocess_dataframe(aov) | ||
aov = _postprocess_dataframe(aov) | ||
aov.residuals_ = 0 # Trick to avoid Pandas warning | ||
aov.residuals_ = error # Residuals is a hidden attribute | ||
return aov | ||
|
||
|
||
def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'): | ||
|
@@ -1010,7 +1031,8 @@ def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'): | |
ss_fac1 = aov_fac1.at[0, 'SS'] | ||
ss_fac2 = aov_fac2.at[0, 'SS'] | ||
ss_tot = ((data[dv] - data[dv].mean())**2).sum() | ||
ss_resid = np.sum(grp_both.apply(lambda x: (x - x.mean())**2)) | ||
resid = grp_both.apply(lambda x: (x - x.mean())) | ||
ss_resid = (resid**2).sum() | ||
ss_inter = ss_tot - (ss_resid + ss_fac1 + ss_fac2) | ||
# Degrees of freedom | ||
df_fac1 = aov_fac1.at[0, 'DF'] | ||
|
@@ -1062,7 +1084,11 @@ def anova2(data=None, dv=None, between=None, ss_type=2, effsize='np2'): | |
}) | ||
|
||
aov.dropna(how='all', axis=1, inplace=True) | ||
return _postprocess_dataframe(aov) | ||
aov = _postprocess_dataframe(aov) | ||
|
||
aov.residuals_ = 0 | ||
aov.residuals_ = resid | ||
return aov | ||
|
||
|
||
def anovan(data=None, dv=None, between=None, ss_type=2, effsize='np2'): | ||
|
@@ -1142,6 +1168,10 @@ def format_source(x): | |
# Add formula to dataframe | ||
aov = _postprocess_dataframe(aov) | ||
aov.formula_ = formula | ||
|
||
# Add residuals to dataframe | ||
aov.residuals_ = 0 | ||
aov.residuals_ = lm.resid | ||
return aov | ||
|
||
|
||
|
@@ -1276,7 +1306,8 @@ def welch_anova(data=None, dv=None, between=None): | |
adj_grandmean = (weights * grp.mean()).sum() / weights.sum() | ||
|
||
# Sums of squares (regular and adjusted) | ||
ss_res = grp.apply(lambda x: (x - x.mean())**2).sum() | ||
resid = grp.apply(lambda x: x - x.mean()) | ||
ss_res = (resid**2).sum() | ||
ss_bet = ((grp.mean() - data[dv].mean())**2 * grp.count()).sum() | ||
ss_betadj = np.sum(weights * np.square(grp.mean() - adj_grandmean)) | ||
ms_betadj = ss_betadj / ddof1 | ||
|
@@ -1296,7 +1327,12 @@ def welch_anova(data=None, dv=None, between=None): | |
'p-unc': pval, | ||
'np2': np2 | ||
}, index=[0]) | ||
return _postprocess_dataframe(aov) | ||
|
||
aov = _postprocess_dataframe(aov) | ||
aov.residuals_ = 0 | ||
aov.residuals_ = resid | ||
|
||
return aov | ||
|
||
|
||
@pf.register_dataframe_method | ||
|
@@ -1436,7 +1472,8 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, | |
# Extract residuals and interactions | ||
grp = data.groupby([between, within], observed=True)[dv] | ||
# ssresall = residuals within + residuals between | ||
ss_resall = grp.apply(lambda x: (x - x.mean())**2).sum() | ||
resid = grp.apply(lambda x: x - x.mean()) | ||
ss_resall = (resid**2).sum() | ||
# Interaction | ||
ss_inter = ss_total - (ss_resall + ss_with + ss_betw) | ||
ss_reswith = aov_with.at[1, 'SS'] - ss_inter | ||
|
@@ -1504,7 +1541,12 @@ def mixed_anova(data=None, dv=None, within=None, subject=None, between=None, | |
'sphericity', 'W-spher', 'p-spher'] | ||
aov = aov.reindex(columns=col_order) | ||
aov.dropna(how='all', axis=1, inplace=True) | ||
return _postprocess_dataframe(aov) | ||
aov = _postprocess_dataframe(aov) | ||
|
||
aov.residuals_ = 0 | ||
aov.residuals_ = resid | ||
|
||
return aov | ||
|
||
|
||
@pf.register_dataframe_method | ||
|
@@ -1635,4 +1677,8 @@ def ancova(data=None, dv=None, between=None, covar=None, effsize="np2"): | |
# Add bw as an attribute (for rm_corr function) | ||
aov = _postprocess_dataframe(aov) | ||
aov.bw_ = model.params.iloc[-1] | ||
|
||
# Add residuals attribute | ||
aov.residuals_ = 0 | ||
aov.residuals_ = model.resid | ||
return aov |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -181,6 +181,16 @@ def test_anova(self): | |
aov = anova(dv='Pain threshold', between='Hair color', data=df_pain, | ||
effsize="n2", detailed=True).round(3) | ||
assert aov.at[0, 'n2'] == .576 | ||
# Compare residuals with R | ||
# Use different dataset to produce residuals more than 3 decimal places | ||
# aov(Cholesterol ~ Risk, DF)$residuals | ||
resid = read_dataset('anova3').anova( | ||
dv='Cholesterol', between=['Risk'] | ||
).residuals_ | ||
array_equal( | ||
resid[0:5].round(3), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I noticed that you are always testing only the first 5 values of the residuals. We need to test all values, i.e. array_equal on all the residuals |
||
[-0.177, 1.816, -0.444, -1.469, 0.693] | ||
) | ||
|
||
# Unbalanced and with missing values | ||
df_pain.loc[[17, 18], 'Pain threshold'] = np.nan | ||
|
@@ -218,6 +228,16 @@ def test_anova(self): | |
aov2 = anova(dv="Yield", between=["Blend", "Crop"], | ||
data=df_aov2, effsize="n2").round(4) | ||
array_equal(aov2.loc[[0, 1, 2], 'n2'], [0.0001, 0.1843, 0.1589]) | ||
# Compare residuals with R | ||
# Use different dataset to produce residuals more than 3 decimal places | ||
# aov(Cholesterol ~ Risk * Drug, DF)$residuals | ||
resid = read_dataset('anova3').anova( | ||
dv='Cholesterol', between=['Risk', 'Drug'] | ||
).residuals_ | ||
array_equal( | ||
resid[0:5].round(3), | ||
[-0.022, 1.972, -0.288, -1.313, 0.849] | ||
) | ||
|
||
# Two-way ANOVA with unbalanced design | ||
df_aov2 = read_dataset('anova2_unbalanced') | ||
|
@@ -260,6 +280,15 @@ def test_anova(self): | |
0.057, 0.044, np.nan]) | ||
array_equal(aov3_ss1.loc[:, 'p-unc'], [0.123, 0.001, 0.619, 0.711, | ||
0.229, 0.245, 0.343, np.nan]) | ||
# Compare residuals with R | ||
# ANOVA = aov(Cholesterol~Sex*Risk*Drug, DF)$residuals | ||
resid = df_aov3.anova(dv="Cholesterol", | ||
between=["Sex", "Risk", "Drug"]).residuals_ | ||
array_equal( | ||
resid[0:5].round(3), | ||
[-0.261, 1.732, -0.527, -1.553, 0.610] | ||
) | ||
|
||
# Unbalanced | ||
df_aov3 = read_dataset('anova3_unbalanced') | ||
aov3_ss1 = anova(dv="Cholesterol", between=['Sex', 'Risk', 'Drug'], | ||
|
@@ -334,6 +363,14 @@ def test_rm_anova(self): | |
assert aov.at[0, 'F'] == 3.913 | ||
assert aov.at[0, 'p-unc'] == .023 | ||
assert aov.at[0, 'np2'] == .062 | ||
# Compare residuals with R package rstatix | ||
# ANOVA = rstatix::anova_test(DF, dv='Scores', within='Time', wid='Subject') | ||
# attr(ANOVA, 'args')$model$residuals | ||
resid = df.rm_anova(dv='Scores', within='Time', subject='Subject').residuals_ | ||
array_equal( | ||
resid[0:5].round(3), | ||
[0.501, -1.161, 1.462, -0.283, -0.691] | ||
) | ||
|
||
# Same but with categorical columns | ||
aov = rm_anova(dv='Scores', within='Time', subject='Subject', | ||
|
@@ -375,6 +412,15 @@ def test_rm_anova2(self): | |
array_equal(aov.loc[:, 'F'], [33.852, 26.959, 12.632]) | ||
array_equal(aov.loc[:, 'np2'], [0.790, 0.750, 0.584]) | ||
array_equal(aov.loc[:, 'eps'], [1., 0.969, 0.727]) | ||
# Compare residuals with R package rstatix | ||
# ANOVA = rstatix::anova_test(DF, dv='Performance', within=c('Time', 'Metric'), wid='Subject') | ||
# attr(ANOVA, 'args')$model$residuals | ||
resid = data.rm_anova(dv='Performance', within=['Time', 'Metric'], | ||
subject='Subject').residuals_ | ||
array_equal( | ||
resid[0:5], | ||
[1, -3, -1, 7, -6] | ||
) | ||
|
||
# With categorical | ||
data_cat = data.copy() | ||
|
@@ -450,6 +496,18 @@ def test_mixed_anova(self): | |
between='Group', data=df, effsize="ng2").round(3) | ||
array_equal(aov.loc[:, 'ng2'], [0.031, 0.042, 0.029]) | ||
|
||
# Compare residuals with R package rstatix | ||
# ANOVA <- rstatix::anova_test(DF, dv='Scores', between='Group', | ||
# within='Time', wid='Subject') | ||
# attr(ANOVA, 'args')$model$residuals | ||
resid = df.mixed_anova( | ||
dv='Scores', within='Time', between='Group', subject='Subject' | ||
).residuals_ | ||
array_equal( | ||
resid[0:5].round(3), | ||
[0.463, -1.199, 1.425, -0.321, -0.729] | ||
) | ||
|
||
# With missing values | ||
df_nan2 = df_nan.copy() | ||
df_nan2.iloc[158, 0] = np.nan | ||
|
Uh oh!
There was an error while loading. Please reload this page.