%\documentclass[mathserif]{beamer}
\documentclass[handout]{beamer}
%\usetheme{Goettingen}
\usetheme{Warsaw}
%\usetheme{Singapore}
%\usetheme{Frankfurt}
%\usetheme{Copenhagen}
%\usetheme{Szeged}
%\usetheme{Montpellier}
%\usetheme{CambridgeUS}
%\usecolortheme{}
%\setbeamercovered{transparent}
\usepackage[english, activeacute]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{dsfont}
\usepackage{graphics}
\usepackage{cases}
\usepackage{graphicx}
\usepackage{pgf}
\usepackage{epsfig}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amstext}
\usepackage[ruled,vlined,lined]{algorithm2e}
\usepackage{amsmath}
\usepackage{epic}
\usepackage{epsfig}
\usepackage{fontenc}
\usepackage{framed,color}
\usepackage{palatino, url, multicol}
\usepackage{listings}
%\algsetup{indent=2em}
\newcommand{\factorial}{\ensuremath{\mbox{\sc Factorial}}}
\newcommand{\BIGOP}[1]{\mathop{\mathchoice%
{\raise-0.22em\hbox{\huge $#1$}}%
{\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}}
\newcommand{\bigtimes}{\BIGOP{\times}}
\vspace{-0.5cm}
\title{Linear Regression}
\vspace{-0.5cm}
\author[Felipe Bravo Márquez]{\footnotesize
%\author{\footnotesize
\textcolor[rgb]{0.00,0.00,1.00}{Felipe José Bravo Márquez}}
\date{ \today }
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Introduction}
\scriptsize{
\begin{itemize}
\item A regression model is used to model the relationship of a numerical dependent variable $\mathbf{y}$ with $m$ independent variables $\mathbf{x}_1, \mathbf{x}_2, \dots, \mathbf{x}_m$ \cite{wasserman2013all}.
\item The dependent variable $\mathbf{y}$ is also called \textbf{target}, \textbf{outcome}, or \textbf{response} variable.
\item The independent variables $\mathbf{x}_1, \mathbf{x}_2, \dots, \mathbf{x}_m$ are also called \textbf{covariates}, \textbf{attributes}, \textbf{features}, or \textbf{predictor variables}.
\item Roughly speaking, we want to know the expected value of $\mathbf{y}$ given the values of the independent variables:
\begin{displaymath}
\mathbb{E}(y|x_1,x_2,\dots,x_m)
\end{displaymath}
\item We use these models when we believe that the response variable $\mathbf{y}$ can be modeled by other independent variables.
\item To perform this type of analysis we need a dataset consisting of $n$ observations that include both the response variable and each of the attributes.
\item We refer to the process of \textbf{fitting} a regression function as the process in which we infer a hypothesis function $h$ from the data that allows us to \textbf{predict} unknown values for $\mathbf{y}$ using the values of the attributes.
\end{itemize}
}
\end{frame}
\begin{frame}{Introduction (2)}
\scriptsize{
\begin{itemize}
\item This process of fitting a function from data is referred to in the areas of data mining and machine learning as \textbf{training}.
\item In those disciplines, functions are said to \textbf{learn} from data.
\item Since we need observations where the value of $\mathbf{y}$ (the output) is known to learn the function, such techniques are referred to as \textbf{supervised learning}.
\item When $\mathbf{y}$ is a categorical variable we have a \textbf{classification} problem.
\end{itemize}
\begin{figure}[h!]
\centering
\includegraphics[scale=0.4]{pics/learning.pdf}
\end{figure}
}
\end{frame}
\begin{frame}{Simple Linear Regression}
\scriptsize{
\begin{itemize}
\item In simple linear regression, we have a single independent variable $\mathbf{x}$ to model the dependent variable $\mathbf{y}$.
\item The following linear relationship between the variables is assumed:
\begin{displaymath}
y_i=\beta_{0}+\beta_{1}x_i +\epsilon_i \quad \forall i
\end{displaymath}
\item The parameter $\beta_{0}$ represents the intercept of the line (the value of $y$ when $x$ is zero).
\item The parameter $\beta_{1}$ is the slope and represents the change in $\mathbf{y}$ for a one-unit change in $\mathbf{x}$. The greater the magnitude of this parameter, the stronger the linear relationship between the variables.
\item The $\epsilon_{i}$ values correspond to the errors or \textbf{residuals} associated with the model.
\item We have to find a linear function or straight line $h_{\beta}$ that allows us to obtain an estimate $\hat{y}$ of $y$ for any value of $x$ with the minimum expected error.
\begin{displaymath}
h(x)=\beta_{0}+\beta_{1}x
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{Simple Linear Regression}
\scriptsize{
\begin{figure}[h!]
\centering
\includegraphics[scale=0.21]{pics/linear_model.png}
\caption{Source: \url{https://www.jmp.com}}
\end{figure}
}
\end{frame}
\begin{frame}{Least Squares}
\scriptsize{
\begin{itemize}
\item The ordinary least squares method allows us to estimate $\hat{\beta}_{0}$ and $\hat{\beta}_{1}$ by minimizing the sum of squared errors (SSE) of the observed data.
\item Suppose we have $n$ observations of $\mathbf{y}$ and $\mathbf{x}$, we compute the sum of squared errors (SSE) as follows:
\begin{equation}
SSE = \sum_{i=1}^{n} (y_i-h(x_i))^2 = \sum_{i=1}^{n} (y_i-\beta_{0}-\beta_{1}x_i)^2
\end{equation}
\item To find the parameters that minimize the error, we calculate the partial derivatives of SSE with respect to $\beta_{0}$ and $\beta_{1}$, set them equal to zero, and solve for the parameter values.
\begin{equation}
\frac{\partial SSE}{ \partial \beta_0} = -2\sum_{i=1}^{n}(y_i-\beta_{0}-\beta_{1}x_i)=0
\end{equation}
\begin{equation}
\frac{\partial SSE}{ \partial \beta_1} = -2\sum_{i=1}^{n}(y_i-\beta_{0}-\beta_{1}x_i)x_i=0
\end{equation}
\end{itemize}
}
\end{frame}
\begin{frame}{Least Squares}
\scriptsize{
\begin{itemize}
\item From the above system of equations the normal solutions are obtained:
\begin{equation}
\hat{\beta}_{1} = \frac{\sum_{i}^{n} (x_i-\overline{x})(y_i-\overline{y}) }{ \sum_{i}^{n} (x_i-\overline{x})^2}
\end{equation}
\begin{equation}
\hat{\beta}_{0} = \overline{y} -\hat{\beta}_{1}\overline{x}
\end{equation}
\item The fitted model represents the line of least squared error.
\begin{figure}[h!]
\centering
\includegraphics[scale=0.35]{pics/Linear_regression.png}
\end{figure}
\end{itemize}
}
\end{frame}
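\begin{frame}[fragile]{Least Squares by Hand in R}
\scriptsize{
\begin{itemize}
\item As an illustrative sketch (using simulated data, not the example developed later), we can compute $\hat{\beta}_1$ and $\hat{\beta}_0$ directly from the normal solutions and check that they match the coefficients returned by \verb+lm+:
\end{itemize}
\begin{verbatim}
# simulated data for illustration only
set.seed(1)
x <- runif(100, 0, 10)
y <- 2 + 0.5 * x + rnorm(100, sd = 1)
# closed-form least squares estimates
b1 <- sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))^2)
b0 <- mean(y) - b1 * mean(x)
c(b0, b1)
# should agree with
coef(lm(y ~ x))
\end{verbatim}
}
\end{frame}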
\begin{frame}{Coefficient of Determination $R^2$}
\scriptsize{
\begin{itemize}
\item How can we evaluate the quality of our fitted model?
\item A very common metric is the coefficient of determination $R^2$.
\item It is calculated from errors that are different from SSE.
\item The total sum of squares (SST) is defined as the predictive error made when we use the mean $\overline{y}$ to predict the response variable $y$ (up to a scaling factor, it is the variance of the variable):
\begin{displaymath}
\text{SST} = \sum_{i}^{n}(y_i-\overline{y})^2
\end{displaymath}
\item The regression sum of squares (SSM), on the other hand, indicates the variability of the model's predictions with respect to the mean:
\begin{displaymath}
\text{SSM} = \sum_{i}^{n}(\hat{y}_i-\overline{y})^2
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{Coefficient of Determination $R^2$}
\scriptsize{
\begin{itemize}
\item It can be shown that these sums of squares are related as follows: \begin{equation}
SST = SSE + SSM
\end{equation}
\item The coefficient of determination for a linear model $R^2$ is defined as:
\begin{equation}
R^2= \frac{\text{SSM}}{\text{SST}} = \frac{\sum_{i}^{n}(\hat{y}_i-\overline{y})^2 }{\sum_{i}^{n}(y_i-\overline{y})^2 }
\end{equation}
\item An alternative but equivalent definition:
\begin{equation}
R^2= 1-\frac{\text{SSE}}{\text{SST}} = 1- \frac{\sum_{i}^{n}(y_i-\hat{y}_i)^2 }{\sum_{i}^{n}(y_i-\overline{y})^2 }
\label{eq:r2}
\end{equation}
\end{itemize}
}
\end{frame}
\begin{frame}{Coefficient of Determination $R^2$}
\scriptsize{
\begin{itemize}
\item Another alternative but equivalent definition:
\begin{equation}
R^2= 1-\frac{\text{var}(\epsilon)}{\text{var}(y)}
\end{equation}
\item It is often interpreted as the ``variance explained'' by the model.
\item The coefficient usually takes values between $0$ and $1$, and the closer its value is to 1, the higher the quality of the model\footnote{It can take negative values when the model's predictions are worse than simply using $\overline{y}$ for every prediction.}.
\item The value of $R^2$ is equivalent to the squared linear (Pearson) correlation between $y$ and $\hat{y}$.
\begin{displaymath}
R^2=\text{cor}(y,\hat{y})^2
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{Assumptions of the Linear Model}
\scriptsize{
Whenever we fit a linear model we are implicitly making certain assumptions about the data.
\begin{block}{Assumptions}
\begin{enumerate}
\item Linearity: the response variable is linearly related to the attributes.
\item Normality: errors have a zero mean normal distribution: $\epsilon_{i} \sim N(0,\sigma^2)$.
\item Homoscedasticity: errors have constant variance (same value of $\sigma^2$).
\item Independence: errors are independent of each other.
\end{enumerate}
\end{block}
}
\end{frame}
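\begin{frame}[fragile]{Checking the Assumptions in R}
\scriptsize{
\begin{itemize}
\item These assumptions are usually inspected through the residuals of a fitted model.
\item A minimal sketch, assuming \verb+m+ is any model fitted with \verb+lm+ (generic code, not part of the example developed later):
\end{itemize}
\begin{verbatim}
# residuals vs. fitted values: look for non-linearity
# and non-constant variance (heteroscedasticity)
plot(fitted(m), residuals(m))
abline(h = 0, lty = 2)
# normal quantile-quantile plot of the residuals: points far
# from the line suggest departures from normality
qqnorm(residuals(m))
qqline(residuals(m))
\end{verbatim}
}
\end{frame}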
\begin{frame}{Probabilistic Interpretation}
\scriptsize{
\begin{itemize}
\item Considering the above assumptions, we are saying that the probability density (PDF) of the errors $\epsilon$ is a Gaussian with zero mean and constant variance $\sigma^2$:
\begin{displaymath}
\text{PDF}(\epsilon_{i})=\frac{1}{\sqrt{2\pi} \sigma} \exp \left(- \frac{\epsilon_{i}^{2}}{2\sigma^2}\right)
\end{displaymath}
\item Moreover all $\epsilon_{i}$ are IID (independent and identically distributed).
\item This implies that:
\begin{displaymath}
\text{PDF}(y_i|x_{i};\beta)=\frac{1}{\sqrt{2\pi} \sigma} \exp \left(- \frac{(y_i - h_{\beta}(x_{i}) )^{2}}{2\sigma^2}\right)
\end{displaymath}
\item In other words, the distribution of $\mathbf{y}$ given the values of $\mathbf{x}$, parameterized by $\beta$, is also normal.
\item Let's estimate the parameters of the model ($\beta$) using maximum likelihood estimation.
\end{itemize}
}
\end{frame}
\begin{frame}{Probabilistic Interpretation}
\scriptsize{
\begin{itemize}
\item The likelihood function of $\beta$ can be written as:
\begin{displaymath}
\mathcal{L}(\beta) = \prod_{i=1}^{n}\frac{1}{\sqrt{2\pi} \sigma} \exp \left(- \frac{(y_i - h_{\beta}(x_{i}) )^{2}}{2\sigma^2}\right)
\end{displaymath}
\item and the log-likelihood $l_n(\beta)$:
\begin{align}
l_n(\beta) & = \log \mathcal{L}(\beta) \\
& = \log \prod_{i=1}^{n}\frac{1}{\sqrt{2\pi} \sigma} \exp \left(- \frac{(y_i - h_{\beta}(x_{i}) )^{2}}{2\sigma^2}\right) \\
& = \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi} \sigma} \exp \left(- \frac{(y_i - h_{\beta}(x_{i}) )^{2}}{2\sigma^2}\right) \\
& = n \log\left(\frac{1}{\sqrt{2\pi} \sigma}\right) - \frac{1}{\sigma^2}\times \frac{1}{2}\sum_{i=1}^n(y_i - h_{\beta}(x_{i}) )^{2}
\end{align}
\end{itemize}
}
\end{frame}
\begin{frame}{Probabilistic Interpretation}
\scriptsize{
\begin{itemize}
\item After ignoring the constants, we realize that maximizing $l_n(\beta)$ is equivalent to minimizing:
\begin{displaymath}
\sum_{i=1}^n(y_i - h_{\beta}(x_{i}))^{2}
\end{displaymath}
\item Which is equivalent to minimizing SSE!
\item Then, if one estimates the parameters $\beta$ using maximum likelihood estimation, one arrives at the same result as with least squares estimation.
\item Therefore, every time we perform least squares estimation, we are implicitly making the same probabilistic assumptions mentioned above (e.g., normality, homoscedasticity).
\end{itemize}
}
\end{frame}
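\begin{frame}[fragile]{Maximum Likelihood vs. Least Squares in R}
\scriptsize{
\begin{itemize}
\item A minimal sketch of this equivalence, using simulated data (not the example developed later): we maximize the Gaussian log-likelihood numerically with \verb+optim+ and compare the resulting $\beta$ with the least squares estimates from \verb+lm+:
\end{itemize}
\begin{verbatim}
set.seed(1)
x <- runif(100, 0, 10)
y <- 2 + 0.5 * x + rnorm(100, sd = 1)
# negative Gaussian log-likelihood of (beta0, beta1, log(sigma))
negloglik <- function(par) {
  mu <- par[1] + par[2] * x
  -sum(dnorm(y, mean = mu, sd = exp(par[3]), log = TRUE))
}
fit <- optim(c(0, 0, 0), negloglik)
fit$par[1:2]    # ML estimates of beta0 and beta1
coef(lm(y ~ x)) # least squares estimates (essentially the same)
\end{verbatim}
}
\end{frame}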
\begin{frame}{Standard errors for regression models}
\scriptsize{
\begin{itemize}
\item If we want to make inferences about the regression parameter estimates, then we also need an estimate of their variability \cite{poldrack2019statistical}.
\item To compute this, we first need to compute the \textbf{mean squared error} $MS_{error}$ of the model:
\begin{displaymath}
MS_{error} = \frac{SSE}{df} = \frac{\sum_{i=1}^{n} (y_i-h(x_i))^2}{n-p}
\end{displaymath}
\item The degrees of freedom ($df$) are determined by subtracting the number of estimated parameters (2 in this case: $\beta_0$ and $\beta_1$) from the number of observations ($n$).
\item We can then compute the standard error for the model as:
\begin{displaymath}
SE_{model} = \sqrt{MS_{error}}
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{A significance test for $\beta$}
\scriptsize{
\begin{itemize}
\item In order to get the standard error for a specific regression parameter estimate, $SE_{\hat{\beta}_x}$, we need to rescale the standard error of the model by the square root of the sum of squares of the $x$ variable:
\begin{displaymath}
SE_{\hat{\beta}_x} = \frac{SE_{model}}{\sqrt{\sum_{i=1}^n(x_i-\overline{x})^2}}
\end{displaymath}
\item Now we can compute a $t$ statistic to tell us the likelihood of the observed parameter estimates compared to some expected value under the null hypothesis.
\item In this case we will test against the null hypothesis of no effect (i.e. $\beta_{H_0}=0$):
\begin{displaymath}
t_{n-p} = \frac{\hat{\beta}-\beta_{H_0}}{ SE_{\hat{\beta}_x}} = \frac{\hat{\beta}}{ SE_{\hat{\beta}_x}}
\end{displaymath}
\item Later we will see that R automatically reports the t-statistics and p-values of all coefficients of a linear model.
\item This allows us to determine whether the linear relationship between the two variables ($y$ and $x$) is significant.
\end{itemize}
}
\end{frame}
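\begin{frame}[fragile]{Computing the $t$ Statistic by Hand in R}
\scriptsize{
\begin{itemize}
\item The following sketch (again with simulated data, not the example developed next) reproduces $SE_{model}$, $SE_{\hat{\beta}_x}$ and the $t$ statistic of the slope, and compares them with the values reported by \verb+summary+:
\end{itemize}
\begin{verbatim}
set.seed(1)
x <- runif(100, 0, 10)
y <- 2 + 0.5 * x + rnorm(100, sd = 1)
m <- lm(y ~ x)
n <- length(y); p <- 2               # two estimated parameters
SE.model <- sqrt(sum(residuals(m)^2) / (n - p))
SE.b1 <- SE.model / sqrt(sum((x - mean(x))^2))
t.b1 <- coef(m)["x"] / SE.b1         # null hypothesis: beta = 0
c(SE.b1, t.b1)
summary(m)$coefficients              # compare with the "x" row
\end{verbatim}
}
\end{frame}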
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item We are going to work with the dataset \verb+Howell1+ that has demographic data from Kalahari !Kung San people collected by Nancy Howell in the late 1960s.
\item The !Kung San are the most famous foraging population of the twentieth century, largely because of detailed quantitative studies by people like Howell. \cite{mcelreath2020statistical}
\end{itemize}
\begin{figure}[h!]
\centering
\includegraphics[scale=0.6]{pics/San_Schmuck.jpg}
\caption{By Staehler - Own work, CC BY-SA 4.0, \url{https://commons.wikimedia.org/w/index.php?curid=45076017}}
\end{figure}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item Each observation corresponds to an individual.
\item The variables of the dataset are:
\begin{enumerate}
\scriptsize{
\item height: height in cm
\item weight: weight in kg
\item age: age in years
\item male: gender indicator (1 for male, 0 for female)
}
\end{enumerate}
\item Let's explore the linear correlations between the variables
\begin{verbatim}
> d <- read.csv("howell1.csv")
> cor(d)
height weight age male
height 1.0000000 0.9408222 0.683688567 0.139229021
weight 0.9408222 1.0000000 0.678335313 0.155442866
age 0.6836886 0.6783353 1.000000000 0.005887126
male 0.1392290 0.1554429 0.005887126 1.000000000
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item We can see that there is a positive correlation between \verb+height+ and \verb+age+.
\item Let's filter out the non-adult examples because we know that height is strongly correlated with age before adulthood.
\begin{verbatim}
d2 <- d[ d$age >= 18 , ]
\end{verbatim}
\item Now age is only weakly correlated with height:
\begin{verbatim}
> cor(d2)
height weight age male
height 1.0000000 0.7547479 -0.10183776 0.69999340
weight 0.7547479 1.0000000 -0.17290430 0.52445271
age -0.1018378 -0.1729043 1.00000000 0.02845498
male 0.6999934 0.5244527 0.02845498 1.00000000
\end{verbatim}
\item Let's model height as a function of weight using a simple linear regression:
\begin{displaymath}
\text{height}(\text{weight})=\beta_0+\beta_1\cdot\text{weight}
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item In R the linear models are created with the command \verb+lm+ that receives formulas of the form \verb+y~x+ ($y=f(x)$).
\begin{verbatim}
> reg1<-lm(height~weight,d2)
> reg1
Call:
lm(formula = height ~ weight, data = d2)
Coefficients:
(Intercept) weight
113.879 0.905
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item We can see that the coefficients of the model are $\beta_{0}=113.879$ and $\beta_{1}=0.905$.
\item The estimate $\beta_{0}=113.879$ indicates that a person of weight 0 should be around $114$ cm tall, which we know is nonsense, but it is what our linear model believes.
\item Since $\beta_1$ is a slope, the value $0.905$ can be read as follows: a person who is 1 kg heavier is expected to be about $0.9$ cm taller.
\item We can directly access the coefficients and store them in a variable:
\begin{verbatim}
> reg1.coef<-reg1$coefficients
> reg1.coef
(Intercept) weight
113.8793936 0.9050291
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item We can view various indicators about the linear model with the command \textbf{summary}:
\begin{verbatim}
> summary(reg1)
Call:
lm(formula = height ~ weight, data = d2)
Residuals:
Min 1Q Median 3Q Max
-19.7464 -2.8835 0.0222 3.1424 14.7744
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 113.87939 1.91107 59.59 <2e-16 ***
weight 0.90503 0.04205 21.52 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.086 on 350 degrees of freedom
Multiple R-squared: 0.5696, Adjusted R-squared: 0.5684
F-statistic: 463.3 on 1 and 350 DF, p-value: < 2.2e-16
\end{verbatim}
\item We can see that $\beta_0$ and $\beta_1$ are both statistically significantly different from zero.
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item We see that the coefficient of determination $R^2$ has a value of $0.57$, which indicates a moderate fit: not great, but acceptable.
\item We can conclude that weight, while providing useful information to model part of the variability of the height of the !Kung people, is not enough to build a highly reliable model.
\item We can store the result of the command \verb+summary+ in a variable and then access the coefficient of determination:
\begin{verbatim}
> sum.reg1<-summary(reg1)
> sum.reg1$r.squared
[1] 0.5696444
\end{verbatim}
\item We can also access the fitted values, which are the values predicted by the model for the data used to fit it:
\begin{verbatim}
> reg1$fitted.values
1 2 3 4 5 6
157.1630 146.9001 142.7180 161.8839 151.2362 170.8895
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item We can check now that all definitions given for $R^2$ are equivalent.
\begin{verbatim}
> SSE<-sum(reg1$residuals^2)
> SST<-sum((d2$height-mean(d2$height))^2)
> SSM<-sum((reg1$fitted.values-mean(d2$height))^2)
> SSM/SST
[1] 0.5696444
> 1-SSE/SST
[1] 0.5696444
> 1-var(reg1$residuals)/var(d2$height)
[1] 0.5696444
> cor(d2$height,reg1$fitted.values)^2
[1] 0.5696444
\end{verbatim}
\item Or manually calculate the standard error for the model $SE_{\text{model}}$:
\begin{verbatim}
> df <- dim(d2)[1]-2
> SE <- sqrt(SSE/df)
> SE
[1] 5.086336
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Example: a model of height}
\scriptsize{
\begin{itemize}
\item Suppose now that we know the weight for two !Kung people but we don't know their height.
\item We could use our linear model to predict the height of these two people.
\item To do this in R we must use the command \verb+predict.lm+ which receives the linear model and a data.frame with the new data:
\begin{verbatim}
> new.weights<-data.frame(weight=c(50,62))
> predict.lm(object=reg1,newdata=new.weights)
1 2
159.1308 169.9912
> # this is equivalent to:
> reg1.coef[1]+reg1.coef[2]*new.weights[1:2,]
[1] 159.1308 169.9912
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}{Multivariate Linear Regression}
\scriptsize{
\begin{itemize}
\item Suppose we have $m$ independent variables: $x_1,x_2,\dots,x_m$.
\item In many cases more variables can better explain the variability of the response variable $\mathbf{y}$ than a single one.
\item A multivariate linear model is defined as follows:
\begin{displaymath}
y_i=\beta_{0}+\beta_{1}x_{i,1}+\beta_{2}x_{i,2} + \cdots + \beta_{m}x_{i,m} + \epsilon_i \quad \forall i \in \{1,\dots,n\}
\end{displaymath}
\item In essence we are adding a parameter for each independent variable.
\item Then, we multiply the parameter by the variable and add that term to the linear model.
\item In the multivariate model, all the properties of the simple linear model are extended.
\item The problem can be represented in a matrix form:
\begin{displaymath}
Y=X\beta+\epsilon
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{Multivariate Linear Regression}
\scriptsize{
\begin{itemize}
\item Here $Y$ is an $n\times 1$ vector of response variables:
\begin{displaymath}
Y =
\begin{pmatrix}
y_{1} \\
y_{2} \\
\vdots \\
y_{n}
\end{pmatrix}
\end{displaymath}
\item $X$ is an $n \times (m+1)$ matrix with the explanatory variables. We have $n$ observations of the $m$ variables. The first column is constant and equal to $1$ ($x_{i,0}=1 \quad \forall i$) to model the intercept $\beta_0$.
\begin{displaymath}
X =
\begin{pmatrix}
x_{1,0} & x_{1,1} & x_{1,2} & \cdots & x_{1,m} \\
x_{2,0} & x_{2,1} & x_{2,2} & \cdots & x_{2,m} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
x_{n,0} & x_{n,1} & x_{n,2} & \cdots & x_{n,m}
\end{pmatrix}
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{Multivariate Linear Regression}
\scriptsize{
\begin{itemize}
\item Then, $\beta$ is a $(m+1) \times 1$ vector of parameters.
\begin{displaymath}
\beta =
\begin{pmatrix}
\beta_{0} \\
\beta_{1} \\
\vdots \\
\beta_{m}
\end{pmatrix}
\end{displaymath}
\item Finally, $\epsilon$ is an $n \times 1$ vector with the model errors.
\begin{displaymath}
\epsilon =
\begin{pmatrix}
\epsilon_{1} \\
\epsilon_{2} \\
\vdots \\
\epsilon_{n}
\end{pmatrix}
\end{displaymath}
\item Using matrix notation, we can see that the sum of squared errors (SSE) can be expressed as:
\begin{displaymath}
\text{SSE} = (Y - X\beta)^{T}(Y-X\beta)
\end{displaymath}
\item Minimizing this expression by differentiating it with respect to $\beta$ and setting the result equal to zero leads to the normal equations:
\begin{displaymath}
\hat{\beta} = (X^{T}X)^{-1} X^{T}Y
\end{displaymath}
\end{itemize}
}
\end{frame}
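\begin{frame}[fragile]{Normal Equations in R}
\scriptsize{
\begin{itemize}
\item As a sketch (simulated data, not the \verb+Howell1+ example), we can build the design matrix $X$ with a column of ones, compute $\hat{\beta} = (X^{T}X)^{-1} X^{T}Y$ explicitly, and compare it with \verb+lm+:
\end{itemize}
\begin{verbatim}
set.seed(1)
n <- 100
x1 <- runif(n); x2 <- runif(n)
y <- 1 + 2 * x1 - 3 * x2 + rnorm(n, sd = 0.5)
X <- cbind(1, x1, x2)      # first column models the intercept
beta.hat <- solve(t(X) %*% X) %*% t(X) %*% y
t(beta.hat)
coef(lm(y ~ x1 + x2))      # should match
\end{verbatim}
}
\end{frame}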
\begin{frame}[fragile]{Multivariate Linear Regression in R}
\scriptsize{
\begin{itemize}
\item Now we will study a multiple linear regression for the \verb+Howell1+ data.
\item Let's add the variable \textbf{age} as an additional predictor for \textbf{height}.
\item We know that age is good at predicting height for non-adults so we will work with the original dataset.
\item Let's fit the following multivariate linear model:
\begin{displaymath}
\text{height}(\text{weight},\text{age})=\beta_{0}+\beta_{1}\cdot\text{weight}+\beta_{2}\cdot\text{age}
\end{displaymath}
\item In R, we add more variables to the linear model with the operator \textbf{+} :
\begin{verbatim}
reg2<-lm(height~weight+age,d)
\end{verbatim}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Multivariate Linear Regression in R}
\scriptsize{
\begin{verbatim}
> summary(reg2)
Call:
lm(formula = height ~ weight + age, data = d)
Residuals:
Min 1Q Median 3Q Max
-29.0350 -5.4228 0.7333 6.4919 19.6964
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 75.96329 1.04216 72.890 < 2e-16 ***
weight 1.65710 0.03656 45.324 < 2e-16 ***
age 0.11212 0.02594 4.322 1.84e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9.214 on 541 degrees of freedom
Multiple R-squared: 0.889, Adjusted R-squared: 0.8886
F-statistic: 2166 on 2 and 541 DF, p-value: < 2.2e-16
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{Multivariate Linear Regression in R}
\scriptsize{
\begin{itemize}
\item When we had a simple regression we could see the fitted model as a line.
\item Now that we have two independent variables we can see the fitted model as a plane.
\item If we had more independent variables, our model would be a hyperplane.
\item We can plot the plane of our linear model of two independent variables and one dependent variable in R as follows:
\end{itemize}
\begin{verbatim}
library("scatterplot3d")
s3d <- scatterplot3d(d[,c("weight","age","height")],
type="h", highlight.3d=TRUE,
angle=55, scale.y=0.7, pch=16,
main="height~weight+age")
s3d$plane3d(reg2, lty.box = "solid")
\end{verbatim}
}
\end{frame}
\begin{frame}{Multivariate Linear Regression in R}