-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathwhitewine.r
262 lines (158 loc) · 7.47 KB
/
whitewine.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
library(ggplot2)
library(car)
library(GGally)
library(tidyverse)
library(mvoutlier)
library(car)
library(outliers)
library(pastecs)
library(moments)
library(corrplot)
# Prerequisite, done manually in Excel: convert the semicolon-separated data
# into columns before loading the file here.
df <- read.csv(file = "winequality-white.csv")
head(df)
colnames(df)
# Recorded output of the column names:
#  [1] "fixed.acidity"        "volatile.acidity" "citric.acid"
#  [4] "residual.sugar"       "chlorides"        "free.sulfur.dioxide"
#  [7] "total.sulfur.dioxide" "density"          "pH"
# [10] "sulphates"            "alcohol"          "quality"
###########################################
# Univariate and Multivariate Analysis
###########################################
# Penalise scientific notation and print two significant digits, then build
# a detailed descriptive summary of every column with stat.desc().
options(scipen = 100, digits = 2)
dmalesummary <- data.frame(stat.desc(df))
# Inspect the summary table
dmalesummary
# The stat.desc() summary reports 19 "null" values for citric.acid.
# NOTE(review): the "nbr.null" row appears to count zero values, not missing
# ones (missing values are reported in "nbr.na") - confirm in the pastecs docs.
# Total number of NA cells across the whole data frame
sum(is.na(df))
# 0
# NA count for each column
na_per_col <- vapply(df, function(col) sum(is.na(col)), integer(1))
na_per_col
# Overall tally of NA (TRUE) versus non-NA (FALSE) cells
table(is.na(df))
# Report only the columns that actually contain missing values
# (approach adapted from Stack Overflow)
for (col_name in names(na_per_col)[na_per_col > 0]) {
  print(c(col_name, na_per_col[[col_name]]))
}
# Normal Q-Q plot for every column, used to spot aberrant measurements.
# Each entry maps a column name to the label used in that plot's title; the
# labels are kept exactly as they were written for each individual plot.
qq_labels <- c(
  fixed.acidity        = "Fixed Acidity",
  volatile.acidity     = "Volatile Acidity",
  citric.acid          = "citric acid",
  residual.sugar       = "Residual sugar",
  chlorides            = "Chlorides",
  free.sulfur.dioxide  = "free sulfur dioxide",
  total.sulfur.dioxide = "total sulfur dioxide",
  density              = "density",
  pH                   = "ph",
  sulphates            = "sulphates",
  alcohol              = "alcohol",
  quality              = "quality"
)
for (col_name in names(qq_labels)) {
  col_values <- df[[col_name]]
  qqnorm(col_values, main = paste("Normal Q-Q plot for", qq_labels[[col_name]]))
  # Dashed red reference line through the first and third quartiles
  qqline(col_values, lty = 2, col = "red")
}
# Histogram of every column, used to study the shape of each distribution.
hist_cols <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol", "quality"
)
for (col_name in hist_cols) {
  # hist() derives its default title and x-axis label from the expression it
  # is called with; pass them explicitly so they still read "df$<column>".
  expr_label <- paste0("df$", col_name)
  hist(
    df[[col_name]],
    main = paste("Histogram of", expr_label),
    xlab = expr_label
  )
}
# Test for skewness and kurtosis
# Skewness measures the asymmetry of the probability distribution of a random
# variable about its mean, i.e. the amount and direction of skew (departure
# from horizontal symmetry). It can be positive, negative, or even undefined.
# A skewness of 0 means perfectly symmetrical data, which is quite unlikely
# for real-world data. As a general rule of thumb:
#   - less than -1 or greater than 1: the distribution is highly skewed
#   - between -1 and -0.5 or between 0.5 and 1: moderately skewed
#   - between -0.5 and 0.5: approximately symmetric
# skewness() is evaluated once per column, giving one named value per column;
# no loop over the column names is needed.
a <- vapply(df, skewness, numeric(1))
print(a)
# Recorded output:
# fixed.acidity volatile.acidity citric.acid
# 0.65 1.58 1.28
# residual.sugar chlorides free.sulfur.dioxide
# 1.08 5.02 1.41
# total.sulfur.dioxide density pH
# 0.39 0.98 0.46
# sulphates alcohol quality
# 0.98 0.49 0.16
# Now write a description for each column using the rules above, e.g.:
# fixed.acidity is moderately skewed to the right (positive side).
# Kurtosis
# Kurtosis tells you the height and sharpness of the central peak, relative
# to that of a standard bell curve.
#   - greater than 3: the distribution is leptokurtic
#   - less than 3: the distribution is platykurtic
# kurtosis() is evaluated once per column, giving one named value per column;
# no loop over the column names is needed.
b <- vapply(df, kurtosis, numeric(1))
print(b)
# Recorded output:
# fixed.acidity volatile.acidity citric.acid
# 5.2 8.1 9.2
# residual.sugar chlorides free.sulfur.dioxide
# 6.5 40.5 14.5
# total.sulfur.dioxide density pH
# 3.6 12.8 3.5
# sulphates alcohol quality
# 4.6 2.3 3.2
# Descriptions for each column as per the rules above:
# leptokurtic = all columns except alcohol
# platykurtic = just alcohol
# Shapiro-Wilk normality test, applied to every column of df in one call.
# sapply() collects the per-column test results into a single printed object.
sapply(df, shapiro.test)
# Outlier detection with the chi-squared outlier test.
# chisq.out.test() performs a chi-squared test for detection of one outlier
# in a vector.
# The calls are written out column by column instead of going through
# sapply() because extra arguments are passed: `variance` is set to the
# column's own sample variance and `opposite = FALSE`.
# NOTE(review): opposite = FALSE presumably tests the value furthest from the
# mean - confirm against the outliers package documentation.
chisq.out.test(df$fixed.acidity, variance = var(df$fixed.acidity), opposite = FALSE)
chisq.out.test(df$volatile.acidity, variance = var(df$volatile.acidity), opposite = FALSE)
chisq.out.test(df$citric.acid, variance = var(df$citric.acid), opposite = FALSE)
chisq.out.test(df$residual.sugar, variance = var(df$residual.sugar), opposite = FALSE)
chisq.out.test(df$chlorides, variance = var(df$chlorides), opposite = FALSE)
chisq.out.test(df$free.sulfur.dioxide, variance = var(df$free.sulfur.dioxide), opposite = FALSE)
chisq.out.test(df$total.sulfur.dioxide, variance = var(df$total.sulfur.dioxide), opposite = FALSE)
chisq.out.test(df$density, variance = var(df$density), opposite = FALSE)
chisq.out.test(df$pH, variance = var(df$pH), opposite = FALSE)
chisq.out.test(df$sulphates, variance = var(df$sulphates), opposite = FALSE)
chisq.out.test(df$alcohol, variance = var(df$alcohol), opposite = FALSE)
chisq.out.test(df$quality, variance = var(df$quality), opposite = FALSE)
########################################
##############################
##############
# Boxplot method: flag outliers in fixed.acidity with the 1.5 * IQR rule.
# Remember: Boxplot() with a capital B comes from the car package.
Boxplot(df$fixed.acidity, col = rgb(0, 0, 1, 0.5))
# Base-graphics boxplot of the same column
boxplot(df$fixed.acidity)
summary(df$fixed.acidity)
# Both fences are derived from the data rather than typed in from the printed
# summary, so they stay correct if the data (or the print precision) changes.
fixed_acidity_iqr <- IQR(df$fixed.acidity)
# Upper fence: third quartile + 1.5 * IQR
bm <- quantile(df$fixed.acidity, 0.75, names = FALSE) + 1.5 * fixed_acidity_iqr
bm
# 8.8 (recorded output). Anything above this value is an outlier.
# There are outliers below the first quartile as well.
# Lower fence: first quartile - 1.5 * IQR
bmbelow <- quantile(df$fixed.acidity, 0.25, names = FALSE) - 1.5 * fixed_acidity_iqr
bmbelow
# 4.8 (recorded output). Anything below this value is an outlier.
high <- bm
low <- bmbelow
# Keep every value lying on or between the fences; only values strictly
# beyond a fence count as outliers.
keep_mask <- df$fixed.acidity <= high & df$fixed.acidity >= low
newfixedacidity <- df$fixed.acidity[keep_mask]
# Repeat these steps for each column to build a new data frame. The number of
# outliers removed will vary per column, so to make the lengths equal before
# putting everything into a data frame, replace the blanks / NA values with
# the column means.