Skip to content

Commit e89a1eb

Browse files
committed
update hyp test
1 parent d2edb6a commit e89a1eb

File tree

163 files changed

+1146
-342
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

163 files changed

+1146
-342
lines changed

07-statistical_inference.R

+108-65
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
## ---- include=FALSE------------------------------------------------------
2+
knitr::opts_chunk$set(echo = TRUE)
3+
options(digits = 7)
14

2-
3-
## ------------------------------------------------------------------------
5+
## ---- message = FALSE, warning=FALSE-------------------------------------
46
library(tidyverse)
57
library(ggplot2)
68
library(latex2exp)
@@ -24,42 +26,19 @@ ggplot(data.frame(hours)) +
2426
annotate("text", x = mean(hours) + 28, y = 1100, label = "Mean + 2 * SD" )+
2527
annotate("text", x = mean(hours) -28, y = 1100, label = "Mean - 2 * SD" )
2628

27-
28-
## ------------------------------------------------------------------------
29+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 4, fig.width = 6----
2930
student_sample <- sample(1:25000, size = 100, replace = FALSE)
30-
m1 <- mean(hours[student_sample])
31-
m1
32-
33-
## ------------------------------------------------------------------------
34-
set.seed(12345)
35-
samples <- 20000
36-
means <- matrix(NA, nrow = samples)
37-
for (i in 1:samples){
38-
student_sample <- sample(1:25000, size = 100, replace = FALSE)
39-
means[i,] <- mean(hours[student_sample])
40-
}
41-
meansdf <- data.frame('true' = mean(hours), 'sample' = mean(means))
42-
meansdf <- gather(meansdf)
43-
ggplot(data.frame(means)) +
44-
geom_histogram(aes(x = means), bins = 30, fill='white', color='black') +
45-
theme_bw() +
46-
geom_vline(data = meansdf, aes(xintercept = value, color = key, linetype = key), size=1) +
47-
scale_color_discrete(labels = c("Mean of sample means", "Population mean")) +
48-
scale_linetype_discrete(labels = c("Mean of sample means", "Population mean")) +
49-
theme(legend.title = element_blank(),legend.position = "bottom") +
50-
ggtitle('Distribution of sample means')
51-
52-
## ------------------------------------------------------------------------
53-
head(means, 5)
54-
min(means)
55-
max(means)
56-
mean(means)
31+
sample_1 <- hours[student_sample]
32+
ggplot(data.frame(sample_1)) +
33+
geom_histogram(aes(x = sample_1), bins = 30, fill='white', color='black') +
34+
theme_bw() + xlab("Hours") +
35+
geom_vline(aes(xintercept = mean(sample_1)), size=1) +
36+
ggtitle(TeX(sprintf("Distribution of listening times ($\\bar{x}$ = %.2f)",round(mean(sample_1),2))))
5737

58-
## ------------------------------------------------------------------------
38+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
39+
#student_sample <- sample(1:25000, size = 100, replace = FALSE)
40+
#means <- hours[student_sample]
5941
library(cowplot)
60-
library(gridExtra)
61-
library(grid)
62-
library(latex2exp)
6342
set.seed(8830)
6443
student_sample <- sample(1:25000, size = 100, replace = FALSE)
6544
means1 <- hours[student_sample]
@@ -102,8 +81,61 @@ title <- ggdraw() + draw_label('Distribution of listening times in four differen
10281
p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
10382
print(p)
10483

105-
## ------------------------------------------------------------------------
84+
85+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 4, fig.width = 6----
10686
set.seed(12345)
87+
samples <- 20000
88+
means <- matrix(NA, nrow = samples)
89+
for (i in 1:samples){
90+
student_sample <- sample(1:25000, size = 100, replace = FALSE)
91+
means[i,] <- mean(hours[student_sample])
92+
}
93+
94+
meansdf <- data.frame('true' = mean(hours), 'sample' = mean(means))
95+
meansdf <- gather(meansdf)
96+
ggplot(data.frame(means)) +
97+
geom_histogram(aes(x = means), bins = 30, fill='white', color='black') +
98+
theme_bw() +
99+
geom_vline(data = meansdf, aes(xintercept = value, color = key, linetype = key), size=1) +
100+
scale_color_discrete(labels = c("Mean of sample means", "Population mean")) +
101+
scale_linetype_discrete(labels = c("Mean of sample means", "Population mean")) +
102+
theme(legend.title = element_blank(),
103+
legend.position = "bottom") +
104+
labs(title = "Histogram of listening times",
105+
subtitle = TeX(sprintf("Population mean ($\\mu$) = %.2f; population standard deviation ($\\sigma$) = %.2f",round(mean(hours),2),round(sd(hours),2))),
106+
y = 'Number of students',
107+
x = 'Hours')
108+
109+
## ----message=FALSE, warning=FALSE, eval=TRUE, echo=FALSE, fig.align="center", fig.cap="Relationship between the sample size and the standard error"----
110+
set.seed(321)
111+
hours <- rnorm(25000, 50, 10)
112+
113+
R <- 1000
114+
sems <- numeric()
115+
replication <- numeric()
116+
117+
for (r in 10:R) {
118+
y_sample <- sample(hours, r)
119+
sem <- sd(hours)/sqrt(length(y_sample))
120+
sems <- rbind(sems, sem)
121+
replication <- rbind(replication, r)
122+
}
123+
124+
df <- as.data.frame(cbind(replication, sems))
125+
ggplot(data=df, aes(y = sems, x = replication)) +
126+
geom_line() +
127+
ylab("Standard error of the mean") +
128+
xlab("Sample size") +
129+
ggtitle('Relationship between sample size and standard error') +
130+
theme_bw()
131+
132+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
133+
library(cowplot)
134+
library(gridExtra)
135+
library(grid)
136+
library(latex2exp)
137+
set.seed(12345)
138+
107139
sample_size = 10
108140
samples <- 20000
109141
means <- matrix(NA, nrow = samples)
@@ -174,13 +206,20 @@ plot4 <- ggplot(data.frame(means)) +
174206

175207
p <- plot_grid(plot1, plot2, plot3, plot4, ncol = 2,
176208
labels = c("A", "B","C","D"))
177-
title <- ggdraw() + draw_label('Distribution of sample means', fontface='bold')
209+
title <- ggdraw() + draw_label('Relationship between sample size and standard error', fontface='bold')
178210
p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
179211
print(p)
212+
# now add the title
213+
#title <- ggdraw() + draw_label("", fontface='bold')
214+
#plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
180215

181216

182-
## ------------------------------------------------------------------------
217+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 3, fig.width = 8----
218+
library(cowplot)
219+
library(gridExtra)
220+
library(grid)
183221
set.seed(12345)
222+
184223
hours1 <- rnorm(25000, 50, 1)
185224
sample_size = 100
186225
samples <- 20000
@@ -218,13 +257,17 @@ plot2 <- ggplot(data.frame(means)) +
218257
scale_linetype_discrete(labels = c("Mean of sample means", "Population mean")) +
219258
theme(legend.position = "none") + ggtitle(TeX(sprintf("n = 100; $\\sigma = 10$; $\\sigma_{\\bar x}$ = %.2f",round(sd(hours2)/sqrt(sample_size),2))))
220259

221-
p <- plot_grid(plot1, plot2, ncol = 1,
260+
p <- plot_grid(plot1, plot2, ncol = 2,
222261
labels = c("A", "B"))
223-
title <- ggdraw() + draw_label('Distribution of sample means', fontface='bold')
262+
title <- ggdraw() + draw_label('Relationship between population SD and standard error', fontface='bold')
224263
p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
225264
print(p)
265+
# now add the title
266+
#title <- ggdraw() + draw_label("", fontface='bold')
267+
#plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
268+
226269

227-
## ------------------------------------------------------------------------
270+
## ----message=FALSE, warning=FALSE, echo=TRUE, eval=TRUE, fig.align="center", fig.height = 4, fig.width = 6----
228271
set.seed(321)
229272
hours <- rgamma(25000, shape = 2, scale = 10)
230273
ggplot(data.frame(hours)) +
@@ -235,14 +278,19 @@ ggplot(data.frame(hours)) +
235278
y = 'Number of students',
236279
x = 'Hours')
237280

281+
282+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
283+
#student_sample <- sample(1:25000, size = 100, replace = FALSE)
284+
#means <- hours[student_sample]
285+
238286
set.seed(8830)
239287
student_sample <- sample(1:25000, size = 100, replace = FALSE)
240288
means1 <- hours[student_sample]
241289
plot1 <- ggplot(data.frame(means1)) +
242290
geom_histogram(aes(x = means1), bins = 30, fill='white', color='black') +
243291
theme_bw() + xlab("Hours") +
244292
geom_vline(aes(xintercept = mean(means1)), size=1) +
245-
ggtitle(TeX(sprintf("$\\bar{x}_1$ = %.2f",round(mean(means1),2))))
293+
ggtitle(TeX(sprintf("$\\bar{x}$ = %.2f",round(mean(means1),2))))
246294

247295
set.seed(6789)
248296
student_sample <- sample(1:25000, size = 100, replace = FALSE)
@@ -251,7 +299,7 @@ plot2 <- ggplot(data.frame(means1)) +
251299
geom_histogram(aes(x = means1), bins = 30, fill='white', color='black') +
252300
theme_bw() + xlab("Hours") +
253301
geom_vline(aes(xintercept = mean(means1)), size=1) +
254-
ggtitle(TeX(sprintf("$\\bar{x}_2$ = %.2f",round(mean(means1),2))))
302+
ggtitle(TeX(sprintf("$\\bar{x}$ = %.2f",round(mean(means1),2))))
255303

256304
set.seed(3904)
257305
student_sample <- sample(1:25000, size = 100, replace = FALSE)
@@ -260,7 +308,7 @@ plot3 <- ggplot(data.frame(means1)) +
260308
geom_histogram(aes(x = means1), bins = 30, fill='white', color='black') +
261309
theme_bw() + xlab("Hours") +
262310
geom_vline(aes(xintercept = mean(means1)), size=1) +
263-
ggtitle(TeX(sprintf("$\\bar{x}_3$ = %.2f",round(mean(means1),2))))
311+
ggtitle(TeX(sprintf("$\\bar{x}$ = %.2f",round(mean(means1),2))))
264312

265313
set.seed(3333)
266314
student_sample <- sample(1:25000, size = 100, replace = FALSE)
@@ -269,7 +317,7 @@ plot4 <- ggplot(data.frame(means1)) +
269317
geom_histogram(aes(x = means1), bins = 30, fill='white', color='black') +
270318
theme_bw() + xlab("Hours") +
271319
geom_vline(aes(xintercept = mean(means1)), size=1) +
272-
ggtitle(TeX(sprintf("$\\bar{x}_4$ = %.2f",round(mean(means1),2))))
320+
ggtitle(TeX(sprintf("$\\bar{x}$ = %.2f",round(mean(means1),2))))
273321

274322
p <- plot_grid(plot1, plot2, plot3, plot4, ncol = 2,
275323
labels = c("A", "B","C","D"))
@@ -278,9 +326,14 @@ p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values con
278326
print(p)
279327

280328

281-
## ------------------------------------------------------------------------
329+
## ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
330+
library(cowplot)
331+
library(gridExtra)
332+
library(grid)
282333
set.seed(321)
334+
283335
hours <- rgamma(25000, shape = 2, scale = 10)
336+
284337
samples <- 10
285338
means <- matrix(NA, nrow = samples)
286339
for (i in 1:samples){
@@ -344,7 +397,7 @@ p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values con
344397
print(p)
345398

346399

347-
## ------------------------------------------------------------------------
400+
## ---- fig.height = 4, fig.width=6----------------------------------------
348401
set.seed(321)
349402
hours <- rgamma(25000, shape = 2, scale = 10)
350403

@@ -357,33 +410,21 @@ plot2 <- ggplot(data.frame(hours_s)) +
357410
geom_histogram(aes(x = hours_s), bins = 30, fill='white', color='black') +
358411
theme_bw() + xlab("Hours") +
359412
geom_vline(aes(xintercept = mean(hours_s)), size=1) +
360-
ggtitle(TeX(sprintf("$n$ = %d; $\\bar{x}$ = %.2f; $s$ = %.2f",sample_size,round(mean(hours_s),2),round(sd(hours_s),2))))
413+
ggtitle(TeX(sprintf("Random sample; $n$ = %d; $\\bar{x}$ = %.2f; $s$ = %.2f",sample_size,round(mean(hours_s),2),round(sd(hours_s),2))))
361414
plot2
362415

363-
364-
## ------------------------------------------------------------------------
416+
## ---- fig.height = 4, fig.width=6----------------------------------------
365417
qnorm(0.975)
366-
qnorm(0.025)
367418

368-
## ------------------------------------------------------------------------
419+
## ---- fig.height = 4, fig.width=6----------------------------------------
369420
sample_mean <- mean(hours_s)
370421
se <- sd(hours_s)/sqrt(sample_size)
371-
ci_upper <- sample_mean + qnorm(0.975)*se
372422
ci_lower <- sample_mean - qnorm(0.975)*se
373-
ci_upper
423+
ci_upper <- sample_mean + qnorm(0.975)*se
374424
ci_lower
425+
ci_upper
375426

376-
plot2 <- ggplot(data.frame(hours_s)) +
377-
geom_histogram(aes(x = hours_s), bins = 30, fill='white', color='black') +
378-
theme_bw() + xlab("Hours") +
379-
geom_vline(aes(xintercept = sample_mean), size=1) +
380-
geom_vline(aes(xintercept = ci_upper), size=1, color="red") +
381-
geom_vline(aes(xintercept = ci_lower), size=1, color="red") +
382-
ggtitle(TeX(sprintf("$n$ = %d; $\\bar{x}$ = %.2f; $s$ = %.2f",sample_size,round(mean(hours_s),2),round(sd(hours_s),2))))
383-
plot2
384-
385-
386-
## ------------------------------------------------------------------------
427+
## ---- fig.height = 15, fig.width=10--------------------------------------
387428
set.seed(12)
388429
samples <- 100
389430
hours <- rgamma(25000, shape = 2, scale = 10)
@@ -404,3 +445,5 @@ ggplot2::ggplot(means_sd, aes(y = y)) +
404445
scale_color_manual(values = c("red", "black")) +
405446
guides(color=guide_legend(title="True mean in CI")) +
406447
theme_bw()
448+
449+

0 commit comments

Comments
 (0)