forked from Hua-CM/HuaSmallTools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_BioProject.py
54 lines (48 loc) · 1.98 KB
/
parse_BioProject.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: utf-8 -*-
# @Time : 2019/9/30 11:03
# @Author : Zhongyi Hua
# @FileName: parse_BioProject.py
# @Usage:
# @Note:
# @E-mail: [email protected]
import re
import pandas as pd
# Regular expressions used to pull individual fields out of the text of one
# BioSample record. Every pattern has exactly one capture group holding the
# field value. The "Label:" style fields stop at the first ';' or newline;
# the "name=value" style fields ("host", "source") run to the end of the line.
# NOTE: the patterns are applied with re.search, so "source=" also matches
# longer attribute names that merely end in "source".
biosample_pattern = {"biosample": "BioSample:(.*?)[;\n]",
                     "sra_id": "SRA:(.*?)[;\n]",
                     "host": "host=(.*?)\n",
                     "source": "source=(.*?)\n",
                     "sample_name": "Sample name:(.*?)[;\n]",
                     "organism": "Organism:(.*?)[;\n]"
                     }


def biosample(biosample_record):
    """Extract the known fields from a single BioSample record.

    Args:
        biosample_record: Iterable of strings making up one record (typically
            its lines, each still carrying its trailing newline). The pieces
            are concatenated without any separator before matching.

    Returns:
        dict: One entry per key of ``biosample_pattern``. The value is the
        whitespace-stripped text captured by the first match of that key's
        pattern, or the string ``"NA"`` when the pattern does not match.
    """
    biosample_str = "".join(biosample_record)
    biosample_dict = {}
    for key, pattern in biosample_pattern.items():
        match = re.search(pattern, biosample_str)
        # A missing field is an expected situation, so test for it explicitly
        # instead of hiding every possible error behind a bare ``except``.
        biosample_dict[key] = match.group(1).strip() if match else "NA"
    return biosample_dict
def split_records(lines):
    """Group text lines into blank-line-separated records.

    Args:
        lines: Iterable of strings, e.g. an open text file.

    Yields:
        list[str]: The non-blank lines of one record, in order, followed by a
        single ``"\n"`` terminator. The terminator guarantees that the last
        content line is newline-terminated even when the input ends without a
        trailing newline.

    Lines that are empty or whitespace-only act as separators. Runs of
    separators never produce an empty record, and a record that is still open
    when the input ends is yielded as well.
    """
    record = []
    for line in lines:
        if line.strip():
            record.append(line)
        elif record:
            record.append("\n")
            yield record
            record = []
    # Flush the final record when the input does not end with a blank line.
    if record:
        record.append("\n")
        yield record


def main(argv=None):
    """Command-line entry point: convert a BioProject txt file to a TSV table.

    Args:
        argv: Optional list of command-line arguments; ``None`` makes argparse
            read them from ``sys.argv``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="This is the script for extracting Biosample information from "
                    "Bioproject txt. Since Bio could not parse its gbk result correctly")
    parser.add_argument('-i', '--input_txt', required=True,
                        help='<file_path> The Bioproject txt downloaded from NCBI')
    parser.add_argument('-o', '--output_tsv', required=True,
                        help='<file_path> The result table')
    args = parser.parse_args(argv)

    columns = ["biosample", "sra_id", "host", "source", "sample_name", "organism"]
    with open(args.input_txt, 'r', encoding='utf8') as f:
        rows = [biosample(record) for record in split_records(f)]

    # Build the table in one step: DataFrame.append no longer exists in
    # pandas >= 2.0 and copied the whole frame for every record.
    parse_results = pd.DataFrame(rows, columns=columns)
    parse_results.to_csv(args.output_tsv, sep="\t", index=False)


if __name__ == '__main__':
    main()