-
Notifications
You must be signed in to change notification settings - Fork 1
/
01_data-cleaning.Rmd
179 lines (149 loc) · 6.46 KB
/
01_data-cleaning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
---
editor_options:
  chunk_output_type: console
---
# Data cleaning
In this script, we clean the annotation data for dawn and dusk and combine them into a single dataset, in which a `time_of_day` column distinguishes dawn annotations from dusk annotations.
## Load required libraries
```{r}
library(tidyverse)
library(dplyr)
library(stringr)
library(vegan)
library(ggplot2)
library(scico)
library(data.table)
library(extrafont)
library(sf)
library(raster)
# for plotting
library(scales)
library(ggplot2)
library(ggspatial)
library(colorspace)
```
## Loading acoustic data
The acoustic data consist of annotations of 10-s chunks of audio files across summer and winter months. We load the annotation datasets for both dawn and dusk and process them so that measures of vocal activity can be extracted quickly in later scripts. For the analyses, we define dawn as acoustic data annotated between 6am and 10am, and dusk as acoustic data annotated between 4pm and 7pm. (Please note that later scripts subsample the data so that uniform sampling periods are compared across dawn and dusk.)
```{r}
# The dawn and dusk annotation .csv files are not uploaded to GitHub and can
# be provided upon request.
# Please write to Vijay Ramesh if you would like to access the raw data
# (e-mail: [email protected])
# However, all outputs emerging from this script, including the processed
# annotations, are public and can be used for analysis.

# Tidy the raw annotations for one time of day. The dawn and dusk tables go
# through exactly the same steps, so they share this helper.
#
# annotations:  data frame read from the raw annotation .csv files
# late_codes:   names of species-code columns to move directly after "CORO",
#               so that all species codes appear one after another
# period_label: value written to the `time_of_day` column
#
# Returns `annotations` with `Filename` split on "_" into site_id, date, time
# and splits, the columns Birds:Notes removed, and the restoration-type and
# time-of-day columns given short names.
clean_annotations <- function(annotations, late_codes, period_label) {
  annotations %>%
    relocate(all_of(late_codes), .after = "CORO") %>%
    separate(
      col = Filename,
      into = c("site_id", "date", "time", "splits"),
      sep = "_"
    ) %>%
    # namespace-qualified as in the original; raster is attached after dplyr
    # and also exports a `select`
    dplyr::select(-(Birds:Notes)) %>%
    rename(
      restoration_type = Restoration.Type..Benchmark.Active.Passive.,
      time_of_day = Time..Morning.Evening.Night.
    ) %>%
    # overwrite whatever was recorded in the raw column with a single label
    mutate(time_of_day = period_label)
}

# dawn data
summer_dawn <- read.csv("data/summer-dawn-annotations.csv")
winter_dawn <- read.csv("data/winter-dawn-annotations.csv")
dawn <- bind_rows(summer_dawn, winter_dawn)
names(dawn)
dawn <- clean_annotations(
  dawn,
  late_codes = c("BFOW", "SBEO", "JUNI", "ASKO", "HSWO"),
  period_label = "dawn"
)

# dusk data
summer_dusk <- read.csv("data/summer-dusk-annotations.csv")
winter_dusk <- read.csv("data/winter-dusk-annotations.csv")
dusk <- bind_rows(summer_dusk, winter_dusk)
names(dusk)
dusk <- clean_annotations(
  dusk,
  late_codes = "GREN",
  period_label = "dusk"
)

# create a single dataframe
acoustic_data <- bind_rows(dawn, dusk)
```
## Add standardized eBird species codes for future analysis
```{r}
# lookup table pairing each annotation code with its eBird species code
species_codes <- read.csv("data/species-annotation-codes.csv")

## Using the splice operator from an example on stackOverflow
## https://stackoverflow.com/questions/60113369/change-column-names-in-dataframe-based-on-matching-to-another-dataframe-by-dplyr

# named vector: names are the eBird codes (new column names), values are the
# annotation codes currently used as column names
eBirdCodes <- deframe(
  species_codes[, c("eBird_codes", "species_annotation_codes")]
)

# replace current annotation codes with eBird codes; splicing the named
# vector expands to rename(new_name = "old_name", ...)
acoustic_data <- rename(acoustic_data, !!!eBirdCodes)
```
## How much data has been annotated for dawn and dusk across sites and days?
```{r}
# Number of unique dates (visits) with annotated data per site, computed
# separately for dawn and dusk.

# author's observation: OLCAP5B has the least amount of data annotated
# (~3 unique days/visits) at dawn
dawnSampling <- acoustic_data %>%
  filter(time_of_day == "dawn") %>%
  distinct(site_id, date) %>% # one row per site-date combination
  arrange(site_id) %>%
  count(site_id)

# author's observation: OLCAP5B has the least amount of data annotated
# (~3 unique visits)
duskSampling <- acoustic_data %>%
  filter(time_of_day == "dusk") %>%
  distinct(site_id, date) %>% # one row per site-date combination
  arrange(site_id) %>%
  count(site_id)
```
## More data cleaning
Removing NAs and renaming columns
```{r}
# Here, we collapse the data to one row per clip (site, date, time, split)
# and species, in long format, and drop species that were not detected.
# Missing values in the species columns count as zero detections: they are
# ignored by sum(na.rm = TRUE). NAs in the key columns are left untouched.
acoustic_data <- acoustic_data %>%
  group_by(site_id, date, time, splits, time_of_day, restoration_type) %>%
  summarise(
    across(compea:grenig1, ~ sum(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = compea:grenig1,
    names_to = "eBird_codes",
    values_to = "number"
  ) %>%
  filter(number != 0) %>% # keep detections only
  # rename the `time` column as `start_time` to distinguish it from
  # `time_of_day`
  rename(start_time = time)

# Pad start_time with leading zeros to six characters (HHMMSS). This must
# happen BEFORE the binning below: the bins are found by comparing strings,
# which only matches chronological order when every value has the same width.
acoustic_data <- acoustic_data %>%
  mutate(start_time = str_pad(start_time, width = 6, pad = "0"))

# add hour_of_day column to indicate the time-window in which the
# acoustic visit was started. Windows are half-open (start inclusive, end
# exclusive), except the last window of each period, which also includes its
# end time. Start times outside the dawn/dusk windows get NA.
acoustic_data <- acoustic_data %>%
  mutate(
    hour_of_day = case_when(
      start_time >= "060000" & start_time < "070000" ~ "6AM to 7AM",
      start_time >= "070000" & start_time < "080000" ~ "7AM to 8AM",
      start_time >= "080000" & start_time < "090000" ~ "8AM to 9AM",
      start_time >= "090000" & start_time <= "100000" ~ "9AM to 10AM",
      start_time >= "160000" & start_time < "170000" ~ "4PM to 5PM",
      # end is exclusive so that a visit starting at exactly 18:00:00 falls
      # in "6PM to 7PM", consistent with the other hour boundaries
      start_time >= "170000" & start_time < "180000" ~ "5PM to 6PM",
      start_time >= "180000" & start_time <= "190000" ~ "6PM to 7PM"
    )
  )

# keep only species that were detected on more than 20 unique site-date
# combinations
# we chose this arbitrary threshold to include only species that are highly
# vocal and can be used for comparative analyses
spp_subset <- acoustic_data %>%
  distinct(site_id, date, eBird_codes) %>% # one row per site-date-species
  count(eBird_codes) %>% # n = number of site-date combinations
  filter(n > 20)

## subset to only include species that passed the threshold above
acoustic_data <- acoustic_data %>%
  filter(eBird_codes %in% spp_subset$eBird_codes)
## author's note: only 69 species remain after including highly vocal
## species alone
```
## Write to file
```{r}
# save the cleaned, long-format annotations for use in later scripts
# (row names are not written; `FALSE` is spelled out because `F` can be
# reassigned)
write.csv(acoustic_data, "results/acoustic_data.csv", row.names = FALSE)
```