-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_files.py
62 lines (51 loc) · 1.76 KB
/
generate_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import csv
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
# import progressbar
import sqlite3
# Absolute locations of the input CSV files and the generated artifacts.
# os.path.abspath resolves each relative path against the current working
# directory at the time the script starts.
_DATA_DIR = "data"
sequence_file_path = os.path.abspath(os.path.join(_DATA_DIR, "sequences.csv"))
occurrence_file_path = os.path.abspath(os.path.join(_DATA_DIR, "occurrences.csv"))
fasta_file_path = os.path.abspath(os.path.join(_DATA_DIR, "sequences.fasta"))
sqlite_file_path = os.path.abspath(os.path.join(_DATA_DIR, "occurrence.sqlite"))
# fasta file
# Export every row of the sequences CSV as one FASTA record.

# Raise the csv module's per-field size limit to 100,000,000 characters.
# The limit is process-wide, so it also applies to every csv reader created
# later in this script.
csv.field_size_limit(100000000)

# Drop any previous output so the export always starts from a fresh file.
try:
    os.remove(fasta_file_path)
except FileNotFoundError:
    pass

# newline="" is what the csv module asks for: the reader then handles line
# endings itself, so newlines embedded in quoted fields are parsed correctly.
with open(fasta_file_path, "w") as fasta_file, \
        open(sequence_file_path, newline="") as csv_file:
    reader = csv.DictReader(csv_file)
    # Lazily build one record per CSV row. Every "-" character is removed
    # from the "sequence" column (presumably alignment gaps - confirm), and
    # the "hash" column is used as both the record id and its description.
    records = (
        SeqRecord(
            Seq(str(row["sequence"]).replace("-", "")),
            id=row["hash"],
            description=row["hash"],
        )
        for row in reader
    )
    # A single SeqIO.write call streams the whole iterator and returns the
    # number of records written, so no manual counter is needed.
    count = SeqIO.write(records, fasta_file, "fasta")
# sqlite (occurrence IDs)
# Rebuild the occurrence database from scratch: delete any previous file,
# create the table, load every row of the occurrences CSV, then index "hash".
try:
    os.remove(sqlite_file_path)
except FileNotFoundError:
    pass

con = sqlite3.connect(sqlite_file_path)
try:
    cur = con.cursor()
    cur.execute(
        'create table occurrence (hash, decimallongitude real, '
        'decimallatitude real, dataset_id, phylum, class, "order", family, '
        'genus, scientificname, count int)'
    )

    # newline="" is what the csv module asks for: the reader then handles
    # line endings itself, so quoted fields with newlines parse correctly.
    with open(occurrence_file_path, newline="") as csv_file:
        reader = csv.DictReader(csv_file)
        # Values are bound positionally, so the CSV columns must appear in
        # the same order as the 11 table columns declared above.
        count = 0
        for count, row in enumerate(reader, start=1):
            cur.execute(
                "insert into occurrence values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                list(row.values()),
            )

    # Build the index after the bulk load rather than maintaining it on
    # every insert.
    cur.execute("create index idx_hash on occurrence (hash)")
    con.commit()
finally:
    # Always release the database handle, even when the CSV is missing or a
    # row cannot be inserted.
    con.close()