-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathR tutorial 2.R
208 lines (157 loc) · 7.17 KB
/
R tutorial 2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
### ------------------------------------- ###
### Data Management ###
### ------------------------------------- ###
# 0. Set directory
setwd("/YOUR DATA LOCATION/")
# 1. Example 1: combine data files based on subjects' ID
## Import files:
library(readxl)
WhichFile<- "Data 2/hdp_abc.xlsx"
excel_sheets(path = WhichFile)
# [1] "hdp_a" "hdp_b" "hdp_c"
df_a=read_xlsx(WhichFile, sheet="hdp_a" )
df_b=read_xlsx(WhichFile, sheet="hdp_b" )
## Combine data
### If we want to obtain the frequency of remission in married and not married groups,
### we need to combine data from df_a and df_b:
df_ab=merge(df_a,df_b, by="ID" )
table(df_ab$remission)
# 0 1
# 6004 2521
table(df_ab$Married)
# 0 1 married Married Not Married NotMarried
# 3408 5113 1 1 1 1
class(df_ab$Married)
# [1] "character"
### Clean data:
#### Convert all letters to lower case:
df_ab$Married_new=tolower(df_ab$Married)
table(df_ab$Married_new)
# 0 1 married not married notmarried
# 3408 5113 2 1 1
df_ab$Married_new1=gsub(" ", "", x=df_ab$Married_new, fixed = TRUE)
table(df_ab$Married_new1)
# 0 1 married notmarried
# 3408 5113 2 2
df_ab$Married_new2[df_ab$Married_new1 %in% c("married")]="Married"
table(df_ab$Married_new2)
# Married
# 2
df_ab$Married_new3[df_ab$Married_new1 %in% c("married", "1")]="Married"
table(df_ab$Married_new3)
# Married
# 5115
df_ab$Married_new3[df_ab$Married_new1 %in% c("notmarried", "0")]="Not Married"
table(df_ab$Married_new3)
# Married Not Married
# 5115 3410
sum(table(df_ab$Married_new3))
# [1] 8525
### Obtain the frequency table of remission in marriage groups:
table(df_ab$remission, df_ab$Married_new3)
# Married Not Married
# 0 3566 2438
# 1 1549 972
### Add labels:
df_ab$remission_label[df_ab$remission==0]="NO"
df_ab$remission_label[df_ab$remission==1]="Yes"
table(df_ab$remission_label, df_ab$Married_new3)
# Married Not Married
# NO 3566 2438
# Yes 1549 972
# 2. Example 2: convert data between wide and long formats
tolerance=read.csv("Data 1/tolerance1.csv", header = T, stringsAsFactors = F )
head(tolerance)
## This data is a so called "WIDE" data - each row contains the values for one individual
# id tol11 tol12 tol13 tol14 tol15 male exposure
# 1 9 2.23 1.79 1.90 2.12 2.66 0 1.54
# 2 45 1.12 1.45 1.45 1.45 1.99 1 1.16
# 3 268 1.45 1.34 1.99 1.79 1.34 1 0.90
# 4 314 1.22 1.22 1.55 1.12 1.12 0 0.81
# 5 442 1.45 1.99 1.45 1.67 1.90 0 1.13
# 6 514 1.34 1.67 2.23 2.12 2.44 1 0.90
## Some times we need to convert the "WIDE" data to "LONG" data for analysis.
## There are several ways to convert data between wide and long.
### a. Use "reshape2": wide to long
library(reshape2)
tolerance.long.1=reshape(data = tolerance, idvar = c("id", "male", "exposure"),
varying = c( "tol11","tol12","tol13","tol14","tol15"),
timevar = "year", v.names = "tolerance", direction = "long" )
## Let's check if the data was correctly tranformed
tolerance.long.1[tolerance.long.1$id==9, ]
# id male exposure year tolerance
# 9.0.1.54.1 9 0 1.54 1 2.23
# 9.0.1.54.2 9 0 1.54 2 1.79
# 9.0.1.54.3 9 0 1.54 3 1.90
# 9.0.1.54.4 9 0 1.54 4 2.12
# 9.0.1.54.5 9 0 1.54 5 2.66
### b. Use "tidyr": wide to long
library(tidyr)
tolerance.long.2=gather(data = tolerance, key = "year", value = "tolerance",
c( "tol11","tol12","tol13","tol14","tol15"),
factor_key=TRUE )
tolerance.long.2[tolerance.long.2$id==9, ]
# id male exposure year tolerance
# 1 9 0 1.54 tol11 2.23
# 17 9 0 1.54 tol12 1.79
# 33 9 0 1.54 tol13 1.90
# 49 9 0 1.54 tol14 2.12
# 65 9 0 1.54 tol15 2.66
## If we have a long data and want to tranform it to wide data, we can also use the same packages.
### c. Use "reshape2": long to wide
tolerance.wide.1=reshape(data = tolerance.long.1, idvar = c("id", "male", "exposure"),
timevar = "year",
direction = "wide" )
### d. Use "tidyr": long to wide
tolerance.wide.2=spread(data = tolerance.long.2, key = "year", value = "tolerance")
# 3. Example 3: duplicate data
tolerance2=read.csv("Data 2/tolerance2.csv" , header = T, stringsAsFactors = F )
head(tolerance2)
# id tol11 tol12 tol13 tol14 tol15 male exposure
# 1 9 2.23 1.79 1.90 2.12 2.66 0 1.54
# 2 9 2.23 NA 1.90 2.12 2.66 a 0 1.54
# 3 45 1.12 1.45 1.45 1.45 1.99 1 1.16
# 4 268 1.45 1.34 1.99 1.79 1.34 1 0.90
# 5 314 1.22 1.22 1.55 1.12 1.12 0 0.81
# 6 442 1.45 1.99 1.45 1.67 1.9 0 1.13
sapply(tolerance2, class)
# id tol11 tol12 tol13 tol14 tol15 male exposure
# "integer" "numeric" "numeric" "numeric" "numeric" "character" "integer" "numeric"
## Why tol15 is character but not numeric?
## The second observation is 2.66 a, causing the whole column imported as character.
## Also, the data have duplicates (e.g., two observations found for subject 9).
## First, let's clean the letters in the numeric variable:
tolerance2$tol15=gsub("[a-zA-Z ]", "", tolerance2$tol15)
tolerance2$tol15
# [1] 2.66 2.66 1.99 1.34 1.12 1.90 2.44 NA 1.99 1.22 1.12 1.22 1.55 3.32 3.32 2.12
# [17] 2.12 1.55 2.12
class(tolerance2$tol15)
# [1] "character"
## "tol15" is still character after removing letters in the column values, so we have to convert it to numeric:
tolerance2$tol15=as.numeric(tolerance2$tol15)
class(tolerance2$tol15)
# [1] "numeric"
## Second, let's check out how many and which subjects have duplicates:
tolerance2[duplicated(tolerance2$id), ]
# id tol11 tol12 tol13 tol14 tol15 male exposure
# 2 9 2.23 NA 1.9 2.12 2.66 a 0 1.54
# 9 569 1.79 1.90 1.9 1.99 1.99 0 1.99
# 15 978 1.22 1.34 NA NA 3.32 1 1.59
dim(tolerance2[duplicated(tolerance2$id), ])
# [1] 3 8
nrow(tolerance2[duplicated(tolerance2$id), ])
# [1] 3
## 3 subjects have duplicated observations
## Let's find out all the duplicated observations for them. First, we identify their IDs:
ind=tolerance2$id[duplicated(tolerance2$id)]
ind
tolerance2[which(tolerance2$id %in% ind ), ]
## The duplicates all have same values for each subject except some of them have NAs.
## So we can keep those with complete data (i.e., rows have all column values available).
## We identify that the rows need to be removed are row 2,8,and 15.
tolerance2.clean=tolerance2[-c(2,8,15),]
## We check again if there is duplicate.
nrow(tolerance2.clean[duplicated(tolerance2.clean$id), ])
# [1] 0
## 0 row shows that there is no duplicate any more.
write.csv(tolerance2.clean, "Data 2/tolerance2_clean.csv", row.names = F)