-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathNYC_Open_Data_Analysis.Rmd
143 lines (117 loc) · 4.28 KB
/
NYC_Open_Data_Analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
---
title: "Heatseek NYC DB - Analysis Script"
author: "Jolene Lim"
date: "15 November 2019"
output: html_document
---
# Setup
```{r, message = FALSE, warning = FALSE}
library(tidyverse)
```
# Load Data
```{r, message = FALSE}
# Read the three input tables from the local data/ directory.
# Paths are relative, so the document must be knit from the project root.
hs = readr::read_csv("data/heatseek_geocoded.csv")
complaints = readr::read_csv("data/heatseek_complaints.csv")
violations = readr::read_csv("data/heatseek_violations.csv")
```
```{r}
# One row per (user_id, address, zip_code, apartment) combination, with
# n_violations = the sum of the `violation` column over that combination's rows.
users = hs %>%
  # Drop rows without a user before grouping, so no NA-user group is formed.
  filter(!is.na(user_id)) %>%
  group_by(user_id, address, zip_code, apartment) %>%
  summarize(n_violations = sum(violation)) %>%
  # summarize() only removes the last grouping level; without ungroup() the
  # result would stay grouped by user_id/address/zip_code and every later
  # mutate() or join on `users` would silently run per group.
  ungroup()
```
```{r}
# Show the rows whose violation total exceeds 1000 (printed for inspection).
users %>%
  filter(n_violations > 1000)
```
# COMPLAINTS
## Question: How many users with violations (recorded by the sensor) filed a complaint in 2019?
```{r}
# Keep only the complaints whose ReceivedDate text contains "2019", then
# collect the distinct user ids that appear in them.
complaints_2019 = filter(complaints, str_detect(ReceivedDate, "2019"))
users_who_filed_2019 = unique(complaints_2019$user_id)

# Add two labelled flags to `users`:
#   faced_violation - based on the sensor violation total
#   filed_complaint - whether the user_id appears among the 2019 complainants
# NOTE(review): the threshold is n_violations > 1, so a user with exactly one
# recorded violation is labelled "Did Not Face Violation" - confirm intended.
users = users %>%
  mutate(
    faced_violation = factor(
      n_violations > 1,
      levels = c(TRUE, FALSE),
      labels = c("Faced Violation", "Did Not Face Violation")
    ),
    filed_complaint = factor(
      user_id %in% users_who_filed_2019,
      levels = c(TRUE, FALSE),
      labels = c("Filed Complaint", "Did Not File Complaint")
    )
  )
```
```{r}
# Cross-tabulate the sensor flag (rows) against the complaint flag (columns).
table(users[["faced_violation"]], users[["filed_complaint"]])
```
```{r}
# Jittered strip plot: complaint status on x, sensor violation total on y,
# points coloured by the faced_violation flag.
ggplot(users) +
  geom_jitter(
    aes(
      x = filed_complaint,
      y = n_violations,
      color = as.factor(faced_violation)
    ),
    width = 0.2
  ) +
  theme_minimal() +
  scale_color_discrete(name = "Sensor Recorded Violation") +
  labs(
    x = "Complaint Status",
    y = "Number of Violations",
    title = "Whether Users Filed Complaint and the Number of Violations They Faced"
  )
```
```{r}
# Number of 2019 complaints per user_id.
n_complaints = complaints_2019 %>%
  group_by(user_id) %>%
  summarize(n_complaints = n())

# Attach the per-user count to `users`.
# The join key is stated explicitly: without `by`, left_join() matches on
# every column name the two tables share, which changes silently if either
# table gains a common column, and prints a "Joining, by" message.
users = users %>%
  left_join(n_complaints, by = "user_id") %>%
  # Users with no 2019 complaint have no match in the join; count them as 0.
  mutate(n_complaints = coalesce(n_complaints, 0L))
```
```{r}
# Scatter plot of sensor violation totals (x) against 2019 complaint counts
# (y), coloured by the filed_complaint flag.
ggplot(users) +
  geom_point(
    aes(x = n_violations, y = n_complaints, color = filed_complaint)
  ) +
  theme_bw() +
  scale_color_discrete(name = "Whether User Filed Complaint") +
  labs(
    x = "Number of Violations Recorded by Sensor",
    y = "Number of NYCDB Complaints",
    title = "Number of Violations Recorded by Sensor v. Number of NYCDB Complaints"
  )
```
## Question: Among the users who filed complaints in 2019, how many had not filed any complaint earlier in the last 2 years?
I.e., did the sensors motivate them to file a complaint?
```{r}
# All user ids that appear anywhere in the complaints table.
users_who_filed = unique(complaints$user_id)

# User ids with at least one complaint whose ReceivedDate does NOT contain
# "2019". Rows with a missing ReceivedDate are dropped by filter().
# NOTE(review): this presumes `complaints` also holds pre-2019 records -
# confirm the date range of the extract.
complaints_before_2019 = complaints %>%
  filter(!str_detect(ReceivedDate, "2019"))
users_who_filed_before_2019 = unique(complaints_before_2019$user_id)

# Count the 2019 filers with no earlier complaint.
# The previous check compared the 2019 filers against `users_who_filed`, which
# already contains every 2019 filer, so it returned 0 whatever the data.
# Re-check any written conclusion against the number printed here.
new_filers_2019 = setdiff(users_who_filed_2019, users_who_filed_before_2019)
sum(!is.na(new_filers_2019))
```
There are no such users.
# VIOLATIONS
## Question: How many users with violations (recorded by the sensor) eventually ended up as a violation case in 2019?
Note: The violation counts include both open and closed violations.
```{r}
# Keep only the violations whose NOVIssuedDate text contains "2019", then
# collect the distinct user ids that appear in them.
violations_2019 = filter(violations, str_detect(NOVIssuedDate, "2019"))
users_who_filedv_2019 = unique(violations_2019$user_id)

# Label each row of `users` by whether its user_id has a 2019 violation record.
users = users %>%
  mutate(
    filed_violation = factor(
      user_id %in% users_who_filedv_2019,
      levels = c(TRUE, FALSE),
      labels = c("Filed Violation", "Did Not File Violation")
    )
  )
```
```{r}
# Cross-tabulate the sensor flag (rows) against the violation-record flag
# (columns).
table(users[["faced_violation"]], users[["filed_violation"]])
```
```{r}
# Jittered strip plot: violation-record status on x, sensor violation total
# on y, points coloured by the faced_violation flag.
ggplot(users) +
  geom_jitter(
    aes(x = filed_violation, y = n_violations, color = faced_violation),
    width = 0.2
  ) +
  theme_minimal() +
  scale_color_discrete(name = "Sensor Recorded Violation") +
  labs(
    x = "Violation Status",
    y = "Number of Violations (Detected by Sensor)",
    title = "Whether Users Filed Violation and the Number of Violations They Faced"
  )
```
```{r}
# Number of 2019 violation records per user_id.
n_dbviolations = violations_2019 %>%
  group_by(user_id) %>%
  summarize(n_dbviolations = n())

# Attach the per-user count to `users`.
# The join key is stated explicitly: without `by`, left_join() matches on
# every column name the two tables share, which changes silently if either
# table gains a common column, and prints a "Joining, by" message.
users = users %>%
  left_join(n_dbviolations, by = "user_id") %>%
  # Users with no 2019 violation record have no match in the join; count
  # them as 0.
  mutate(n_dbviolations = coalesce(n_dbviolations, 0L))
```
```{r}
# Scatter plot of sensor violation totals (x) against 2019 violation-record
# counts (y), coloured by the filed_violation flag.
ggplot(users) +
  geom_point(
    aes(x = n_violations, y = n_dbviolations, color = filed_violation)
  ) +
  theme_bw() +
  scale_color_discrete(name = "Whether User Filed 311 Violation") +
  labs(
    x = "Number of Violations Recorded by Sensor",
    y = "Number of NYCDB Violations",
    title = "Number of Violations Recorded by Sensor v. Number of NYCDB Violations"
  )
```