-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhrws_survey.py
75 lines (61 loc) · 3.46 KB
/
hrws_survey.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#! /usr/bin/env python
# Import python modules that will do useful stuff for us
import collections, csv
# Code written for use by/for the Huron River Watershed Council (HRWC) (A2 DataDive 2013)
# Looks at the species table in the database, and outputs a file describing whether or not a given species is at a given site
# (value 0 = no, 1 = yes)
# Also prints out a list of the 10 surveys that showed the most species
# TODOS:
# - Adapt GIS software to show these results on a map [GIS, d3]
#
# Code written and discussed by Nick Krabbenhoeft, @bernease/Bernease Herman; refactoring by Andy Boughton
# To use: export the database table for site surveys to comma-separated values (CSV) format and save it as
# "species.csv" in the directory the script is run from; we based this analysis on the provided Excel file
# "6- species list.csv"
## NOTES:
# final map likely ought to be spot-checked by ecologists/other workers or volunteers familiar with the data
#################
def _open_csv(path, mode):
    """Open *path* in the way the csv module expects on Python 2 or Python 3.

    *mode* is 'r' or 'w'. On Python 3 the file is opened with ``newline=''`` so
    the csv module sees and writes line endings itself. On Python 2, reads use
    universal-newline mode ('rU') and writes use plain 'w'.
    """
    import sys  # local import: only this version switch needs it

    if sys.version_info[0] >= 3:
        return open(path, mode, newline='')
    return open(path, 'rU' if mode == 'r' else mode)


def _load_surveys(species_path):
    """Read the species table and index it by survey.

    Returns a tuple ``(all_species, all_surveys)``:

    * ``all_species`` is a sorted list of every distinct, non-blank,
      lower-cased species name found in columns 5 onward of any row.
    * ``all_surveys`` maps each survey ID (column 0) to a dict holding
      ``'survey_id'`` plus one ``species_name: 1`` entry per species listed
      on that row.

    If the same ID appears on several rows, the last row wins.
    """
    all_species = set()
    all_surveys = {}
    with _open_csv(species_path, 'r') as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for a blank line, which has no
            # ID column to key on.
            if not row:
                continue
            # This next line is what connects a survey with an ID; change to
            # row[1] to use the BioReserve ID instead of site ID.
            # The ID field (column 0) may not be the same as the UniqueID
            # field, which we think is the name of the BioReserve. So the
            # output reports the most species-rich surveys, rather than the
            # most species-rich bioreserves. --abought
            survey_id = row[0]
            # Lower-casing only merges obvious duplicates (ELM and elm). To
            # detect more subtle similarities (Am. Elm vs American elm), the
            # final output will need to be spot-checked by HRWC.
            names = {item.lower() for item in row[5:]}
            # We're not interested in blank columns, so get rid of those.
            names.discard('')
            all_species.update(names)
            # Presence flags rather than counts: a species listed twice on
            # one row must still be written as 1 (0 = no, 1 = yes).
            record = dict.fromkeys(names, 1)
            record['survey_id'] = survey_id
            all_surveys[survey_id] = record
    return sorted(all_species), all_surveys


def _write_presence_table(output_path, all_species, all_surveys):
    """Write one tab-separated row per survey to *output_path*.

    The first column is ``survey_id`` ("ID" = column 0 in the database); the
    remaining columns are the species names, holding 1 where the survey lists
    the species and 0 (the DictWriter ``restval``) where it does not.
    """
    with _open_csv(output_path, 'w') as f:
        writer = csv.DictWriter(f, delimiter='\t',
                                fieldnames=['survey_id'] + all_species,
                                restval=0)
        writer.writeheader()
        for record in all_surveys.values():
            writer.writerow(record)


def _rank_surveys(all_surveys, top_n):
    """Return up to *top_n* ``(survey_id, species_count)`` pairs, richest first."""
    def species_count(survey_id):
        # Every record carries one non-species key, 'survey_id'.
        return len(all_surveys[survey_id]) - 1

    ranked = sorted(all_surveys, key=species_count, reverse=True)
    # Slicing (instead of indexing 0..9) copes with fewer than top_n surveys.
    return [(survey_id, species_count(survey_id)) for survey_id in ranked[:top_n]]


def main(species_path='species.csv', output_path='survey_species2.tsv',
         top_n=10, pause=True):
    """Build the survey/species presence table and report the richest surveys.

    Parameters:
        species_path: CSV export of the species table; the first row is a
            header, column 0 is the survey ID, columns 5+ are species names.
        output_path: tab-separated file to write.
        top_n: maximum number of surveys to list in the printed ranking.
        pause: when true, wait for the return key before exiting so the
            output window stays open.

    Returns the printed ranking as a list of ``(survey_id, species_count)``.
    """
    all_species, all_surveys = _load_surveys(species_path)
    _write_presence_table(output_path, all_species, all_surveys)

    # Lastly: the surveys that seem to have the most unique species.
    richest = _rank_surveys(all_surveys, top_n)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("Most species-rich surveys:")
    print("Survey_id #species")
    for survey_id, count in richest:
        print("%s %d" % (survey_id, count))

    if pause:
        # Don't close the output window until the user is done looking at it.
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        read_line("Analysis done! Press the return key to exit.")
    return richest


if __name__ == "__main__":
    main()