1
+ # # ---- include=FALSE------------------------------------------------------
2
+ knitr :: opts_chunk $ set(echo = TRUE )
3
+ options(digits = 7 )
1
4
2
-
3
- # # ------------------------------------------------------------------------
5
+ # # ---- message = FALSE, warning=FALSE-------------------------------------
4
6
library(tidyverse )
5
7
library(ggplot2 )
6
8
library(latex2exp )
@@ -24,42 +26,19 @@ ggplot(data.frame(hours)) +
24
26
annotate(" text" , x = mean(hours ) + 28 , y = 1100 , label = " Mean + 2 * SD" )+
25
27
annotate(" text" , x = mean(hours ) - 28 , y = 1100 , label = " Mean - 2 * SD" )
26
28
27
-
28
- # # ------------------------------------------------------------------------
29
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 4, fig.width = 6----
29
30
student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
30
- m1 <- mean(hours [student_sample ])
31
- m1
32
-
33
- # # ------------------------------------------------------------------------
34
- set.seed(12345 )
35
- samples <- 20000
36
- means <- matrix (NA , nrow = samples )
37
- for (i in 1 : samples ){
38
- student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
39
- means [i ,] <- mean(hours [student_sample ])
40
- }
41
- meansdf <- data.frame (' true' = mean(hours ), ' sample' = mean(means ))
42
- meansdf <- gather(meansdf )
43
- ggplot(data.frame (means )) +
44
- geom_histogram(aes(x = means ), bins = 30 , fill = ' white' , color = ' black' ) +
45
- theme_bw() +
46
- geom_vline(data = meansdf , aes(xintercept = value , color = key , linetype = key ), size = 1 ) +
47
- scale_color_discrete(labels = c(" Mean of sample means" , " Population mean" )) +
48
- scale_linetype_discrete(labels = c(" Mean of sample means" , " Population mean" )) +
49
- theme(legend.title = element_blank(),legend.position = " bottom" ) +
50
- ggtitle(' Distribution of sample means' )
51
-
52
- # # ------------------------------------------------------------------------
53
- head(means , 5 )
54
- min(means )
55
- max(means )
56
- mean(means )
31
+ sample_1 <- hours [student_sample ]
32
+ ggplot(data.frame (sample_1 )) +
33
+ geom_histogram(aes(x = sample_1 ), bins = 30 , fill = ' white' , color = ' black' ) +
34
+ theme_bw() + xlab(" Hours" ) +
35
+ geom_vline(aes(xintercept = mean(sample_1 )), size = 1 ) +
36
+ ggtitle(TeX(sprintf(" Distribution of listening times ($\\ bar{x}$ = %.2f)" ,round(mean(sample_1 ),2 ))))
57
37
58
- # # ------------------------------------------------------------------------
38
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
39
+ # student_sample <- sample(1:25000, size = 100, replace = FALSE)
40
+ # means <- hours[student_sample]
59
41
library(cowplot )
60
- library(gridExtra )
61
- library(grid )
62
- library(latex2exp )
63
42
set.seed(8830 )
64
43
student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
65
44
means1 <- hours [student_sample ]
@@ -102,8 +81,61 @@ title <- ggdraw() + draw_label('Distribution of listening times in four differen
102
81
p <- plot_grid(title , p , ncol = 1 , rel_heights = c(0.1 , 1 )) # rel_heights values control title margins
103
82
print(p )
104
83
105
- # # ------------------------------------------------------------------------
84
+
85
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 4, fig.width = 6----
106
86
set.seed(12345 )
87
+ samples <- 20000
88
+ means <- matrix (NA , nrow = samples )
89
+ for (i in 1 : samples ){
90
+ student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
91
+ means [i ,] <- mean(hours [student_sample ])
92
+ }
93
+
94
+ meansdf <- data.frame (' true' = mean(hours ), ' sample' = mean(means ))
95
+ meansdf <- gather(meansdf )
96
+ ggplot(data.frame (means )) +
97
+ geom_histogram(aes(x = means ), bins = 30 , fill = ' white' , color = ' black' ) +
98
+ theme_bw() +
99
+ geom_vline(data = meansdf , aes(xintercept = value , color = key , linetype = key ), size = 1 ) +
100
+ scale_color_discrete(labels = c(" Mean of sample means" , " Population mean" )) +
101
+ scale_linetype_discrete(labels = c(" Mean of sample means" , " Population mean" )) +
102
+ theme(legend.title = element_blank(),
103
+ legend.position = " bottom" ) +
104
+ labs(title = " Histogram of listening times" ,
105
+ subtitle = TeX(sprintf(" Population mean ($\\ mu$) = %.2f; population standard deviation ($\\ sigma$) = %.2f" ,round(mean(hours ),2 ),round(sd(hours ),2 ))),
106
+ y = ' Number of students' ,
107
+ x = ' Hours' )
108
+
109
+ # # ----message=FALSE, warning=FALSE, eval=TRUE, echo=FALSE, fig.align="center", fig.cap="Relationship between the sample size and the standard error"----
110
+ set.seed(321 )
111
+ hours <- rnorm(25000 , 50 , 10 )
112
+
113
+ R <- 1000
114
+ sems <- numeric ()
115
+ replication <- numeric ()
116
+
117
+ for (r in 10 : R ) {
118
+ y_sample <- sample(hours , r )
119
+ sem <- sd(hours )/ sqrt(length(y_sample ))
120
+ sems <- rbind(sems , sem )
121
+ replication <- rbind(replication , r )
122
+ }
123
+
124
+ df <- as.data.frame(cbind(replication , sems ))
125
+ ggplot(data = df , aes(y = sems , x = replication )) +
126
+ geom_line() +
127
+ ylab(" Standard error of the mean" ) +
128
+ xlab(" Sample size" ) +
129
+ ggtitle(' Relationship between sample size and standard error' ) +
130
+ theme_bw()
131
+
132
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
133
+ library(cowplot )
134
+ library(gridExtra )
135
+ library(grid )
136
+ library(latex2exp )
137
+ set.seed(12345 )
138
+
107
139
sample_size = 10
108
140
samples <- 20000
109
141
means <- matrix (NA , nrow = samples )
@@ -174,13 +206,20 @@ plot4 <- ggplot(data.frame(means)) +
174
206
175
207
p <- plot_grid(plot1 , plot2 , plot3 , plot4 , ncol = 2 ,
176
208
labels = c(" A" , " B" ," C" ," D" ))
177
- title <- ggdraw() + draw_label(' Distribution of sample means ' , fontface = ' bold' )
209
+ title <- ggdraw() + draw_label(' Relationship between sample size and standard error ' , fontface = ' bold' )
178
210
p <- plot_grid(title , p , ncol = 1 , rel_heights = c(0.1 , 1 )) # rel_heights values control title margins
179
211
print(p )
212
+ # now add the title
213
+ # title <- ggdraw() + draw_label("", fontface='bold')
214
+ # plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
180
215
181
216
182
- # # ------------------------------------------------------------------------
217
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 3, fig.width = 8----
218
+ library(cowplot )
219
+ library(gridExtra )
220
+ library(grid )
183
221
set.seed(12345 )
222
+
184
223
hours1 <- rnorm(25000 , 50 , 1 )
185
224
sample_size = 100
186
225
samples <- 20000
@@ -218,13 +257,17 @@ plot2 <- ggplot(data.frame(means)) +
218
257
scale_linetype_discrete(labels = c(" Mean of sample means" , " Population mean" )) +
219
258
theme(legend.position = " none" ) + ggtitle(TeX(sprintf(" n = 100; $\\ sigma = 10$; $\\ sigma_{\\ bar x}$ = %.2f" ,round(sd(hours2 )/ sqrt(sample_size ),2 ))))
220
259
221
- p <- plot_grid(plot1 , plot2 , ncol = 1 ,
260
+ p <- plot_grid(plot1 , plot2 , ncol = 2 ,
222
261
labels = c(" A" , " B" ))
223
- title <- ggdraw() + draw_label(' Distribution of sample means ' , fontface = ' bold' )
262
+ title <- ggdraw() + draw_label(' Relationship between population SD and standard error ' , fontface = ' bold' )
224
263
p <- plot_grid(title , p , ncol = 1 , rel_heights = c(0.1 , 1 )) # rel_heights values control title margins
225
264
print(p )
265
+ # now add the title
266
+ # title <- ggdraw() + draw_label("", fontface='bold')
267
+ # plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values control title margins
268
+
226
269
227
- # # -------------------------------------------------------------------- ----
270
+ # # ----message=FALSE, warning=FALSE, echo=TRUE, eval=TRUE, fig.align="center", fig.height = 4, fig.width = 6 ----
228
271
set.seed(321 )
229
272
hours <- rgamma(25000 , shape = 2 , scale = 10 )
230
273
ggplot(data.frame (hours )) +
@@ -235,14 +278,19 @@ ggplot(data.frame(hours)) +
235
278
y = ' Number of students' ,
236
279
x = ' Hours' )
237
280
281
+
282
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
283
+ # student_sample <- sample(1:25000, size = 100, replace = FALSE)
284
+ # means <- hours[student_sample]
285
+
238
286
set.seed(8830 )
239
287
student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
240
288
means1 <- hours [student_sample ]
241
289
plot1 <- ggplot(data.frame (means1 )) +
242
290
geom_histogram(aes(x = means1 ), bins = 30 , fill = ' white' , color = ' black' ) +
243
291
theme_bw() + xlab(" Hours" ) +
244
292
geom_vline(aes(xintercept = mean(means1 )), size = 1 ) +
245
- ggtitle(TeX(sprintf(" $\\ bar{x}_1 $ = %.2f" ,round(mean(means1 ),2 ))))
293
+ ggtitle(TeX(sprintf(" $\\ bar{x}$ = %.2f" ,round(mean(means1 ),2 ))))
246
294
247
295
set.seed(6789 )
248
296
student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
@@ -251,7 +299,7 @@ plot2 <- ggplot(data.frame(means1)) +
251
299
geom_histogram(aes(x = means1 ), bins = 30 , fill = ' white' , color = ' black' ) +
252
300
theme_bw() + xlab(" Hours" ) +
253
301
geom_vline(aes(xintercept = mean(means1 )), size = 1 ) +
254
- ggtitle(TeX(sprintf(" $\\ bar{x}_2 $ = %.2f" ,round(mean(means1 ),2 ))))
302
+ ggtitle(TeX(sprintf(" $\\ bar{x}$ = %.2f" ,round(mean(means1 ),2 ))))
255
303
256
304
set.seed(3904 )
257
305
student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
@@ -260,7 +308,7 @@ plot3 <- ggplot(data.frame(means1)) +
260
308
geom_histogram(aes(x = means1 ), bins = 30 , fill = ' white' , color = ' black' ) +
261
309
theme_bw() + xlab(" Hours" ) +
262
310
geom_vline(aes(xintercept = mean(means1 )), size = 1 ) +
263
- ggtitle(TeX(sprintf(" $\\ bar{x}_3 $ = %.2f" ,round(mean(means1 ),2 ))))
311
+ ggtitle(TeX(sprintf(" $\\ bar{x}$ = %.2f" ,round(mean(means1 ),2 ))))
264
312
265
313
set.seed(3333 )
266
314
student_sample <- sample(1 : 25000 , size = 100 , replace = FALSE )
@@ -269,7 +317,7 @@ plot4 <- ggplot(data.frame(means1)) +
269
317
geom_histogram(aes(x = means1 ), bins = 30 , fill = ' white' , color = ' black' ) +
270
318
theme_bw() + xlab(" Hours" ) +
271
319
geom_vline(aes(xintercept = mean(means1 )), size = 1 ) +
272
- ggtitle(TeX(sprintf(" $\\ bar{x}_4 $ = %.2f" ,round(mean(means1 ),2 ))))
320
+ ggtitle(TeX(sprintf(" $\\ bar{x}$ = %.2f" ,round(mean(means1 ),2 ))))
273
321
274
322
p <- plot_grid(plot1 , plot2 , plot3 , plot4 , ncol = 2 ,
275
323
labels = c(" A" , " B" ," C" ," D" ))
@@ -278,9 +326,14 @@ p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values con
278
326
print(p )
279
327
280
328
281
- # # ------------------------------------------------------------------------
329
+ # # ----message=FALSE, warning=FALSE, echo=FALSE, eval=TRUE, fig.align="center", fig.height = 6, fig.width = 8----
330
+ library(cowplot )
331
+ library(gridExtra )
332
+ library(grid )
282
333
set.seed(321 )
334
+
283
335
hours <- rgamma(25000 , shape = 2 , scale = 10 )
336
+
284
337
samples <- 10
285
338
means <- matrix (NA , nrow = samples )
286
339
for (i in 1 : samples ){
@@ -344,7 +397,7 @@ p <- plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1)) # rel_heights values con
344
397
print(p )
345
398
346
399
347
- # # -------------------------------- ----------------------------------------
400
+ # # ---- fig.height = 4, fig.width=6 ----------------------------------------
348
401
set.seed(321 )
349
402
hours <- rgamma(25000 , shape = 2 , scale = 10 )
350
403
@@ -357,33 +410,21 @@ plot2 <- ggplot(data.frame(hours_s)) +
357
410
geom_histogram(aes(x = hours_s ), bins = 30 , fill = ' white' , color = ' black' ) +
358
411
theme_bw() + xlab(" Hours" ) +
359
412
geom_vline(aes(xintercept = mean(hours_s )), size = 1 ) +
360
- ggtitle(TeX(sprintf(" $n$ = %d; $\\ bar{x}$ = %.2f; $s$ = %.2f" ,sample_size ,round(mean(hours_s ),2 ),round(sd(hours_s ),2 ))))
413
+ ggtitle(TeX(sprintf(" Random sample; $n$ = %d; $\\ bar{x}$ = %.2f; $s$ = %.2f" ,sample_size ,round(mean(hours_s ),2 ),round(sd(hours_s ),2 ))))
361
414
plot2
362
415
363
-
364
- # # ------------------------------------------------------------------------
416
+ # # ---- fig.height = 4, fig.width=6----------------------------------------
365
417
qnorm(0.975 )
366
- qnorm(0.025 )
367
418
368
- # # -------------------------------- ----------------------------------------
419
+ # # ---- fig.height = 4, fig.width=6 ----------------------------------------
369
420
sample_mean <- mean(hours_s )
370
421
se <- sd(hours_s )/ sqrt(sample_size )
371
- ci_upper <- sample_mean + qnorm(0.975 )* se
372
422
ci_lower <- sample_mean - qnorm(0.975 )* se
373
- ci_upper
423
+ ci_upper <- sample_mean + qnorm( 0.975 ) * se
374
424
ci_lower
425
+ ci_upper
375
426
376
- plot2 <- ggplot(data.frame (hours_s )) +
377
- geom_histogram(aes(x = hours_s ), bins = 30 , fill = ' white' , color = ' black' ) +
378
- theme_bw() + xlab(" Hours" ) +
379
- geom_vline(aes(xintercept = sample_mean ), size = 1 ) +
380
- geom_vline(aes(xintercept = ci_upper ), size = 1 , color = " red" ) +
381
- geom_vline(aes(xintercept = ci_lower ), size = 1 , color = " red" ) +
382
- ggtitle(TeX(sprintf(" $n$ = %d; $\\ bar{x}$ = %.2f; $s$ = %.2f" ,sample_size ,round(mean(hours_s ),2 ),round(sd(hours_s ),2 ))))
383
- plot2
384
-
385
-
386
- # # ------------------------------------------------------------------------
427
+ # # ---- fig.height = 15, fig.width=10--------------------------------------
387
428
set.seed(12 )
388
429
samples <- 100
389
430
hours <- rgamma(25000 , shape = 2 , scale = 10 )
@@ -404,3 +445,5 @@ ggplot2::ggplot(means_sd, aes(y = y)) +
404
445
scale_color_manual(values = c(" red" , " black" )) +
405
446
guides(color = guide_legend(title = " True mean in CI" )) +
406
447
theme_bw()
448
+
449
+
0 commit comments