-
Notifications
You must be signed in to change notification settings - Fork 26
/
cat_mark_duplicates_stats.py
executable file
·111 lines (91 loc) · 2.91 KB
/
cat_mark_duplicates_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
import glob
import sys
import os
import argparse
# Command-line interface. RawDescriptionHelpFormatter prints the description
# below verbatim (line breaks preserved) in the --help output.
parser = argparse.ArgumentParser(description= """
DESCRIPTION
Concatenate duplicate stats from picard MarkDuplicates. Stats go to stdout, histogram to
file given via -H.
Both stats and histogram files will have filename left-bound.
USAGE
cat_mark_duplicates_stats.py -i *.markDuplicates.txt -H hist.txt > concat.stats
""", prog= 'cat_mark_duplicates_stats.py', formatter_class= argparse.RawDescriptionHelpFormatter)
# -----------------------------------------------------------------------------
parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')

# One or more stat files; this option is mandatory.
parser.add_argument('--input', '-i',
                    required= True,
                    nargs= '+',
                    help='''Stat files to concatenate.
''')

# Optional output file for the histograms; when omitted args.hist is None.
parser.add_argument('--hist', '-H',
                    required= False,
                    default= None,
                    help='''File with the concatenated *histograms*.
''')

# Parsed at module level: the rest of the script reads args.input / args.hist.
args = parser.parse_args()
def parseStats(x):
    """Extract the header and stats lines from a MarkDuplicates file.

    The metrics table is located through the first line starting with
    '## METRICS CLASS': the line right after it is taken as the column
    header and the following one as the stats row. Only that single row
    is returned.

    x:
        stat file name from MarkDuplicates
    return:
        Tuple with two lists: Header and stats
    exit:
        Terminates via sys.exit (SystemExit) with a message if the file
        cannot be opened or if the header and stats lines are not found.
    """
    try:
        fin = open(x)
    except IOError:
        sys.exit('I cannot open file %s' %x)
    # Close the handle as soon as the content has been read.
    with fin:
        lines = fin.readlines()

    header = None
    stats = None
    for i, line in enumerate(lines):
        if line.startswith('## METRICS CLASS'):
            # Slice instead of indexing: a file truncated right after the
            # marker falls through to the error message below rather than
            # raising IndexError.
            table = lines[i + 1:i + 3]
            if len(table) == 2:
                # Remove only the line terminator. A plain strip() would
                # also eat leading/trailing tabs, i.e. empty first or last
                # fields, leaving the stats row shorter than the header.
                header = table[0].rstrip('\r\n').split('\t')
                stats = table[1].rstrip('\r\n').split('\t')
            break
    if header is None or stats is None:
        sys.exit('Cannot locate header and or stats line in file %s' %x)
    return (header, stats)
def parseHist(x):
    """Extract the histogram rows from a MarkDuplicates file.

    The histogram is located through the first line starting with
    '## HISTOGRAM'. The line right after that marker (the histogram's own
    column header) is skipped; every following non-blank line is split on
    tabs and returned.

    x:
        stat file name from MarkDuplicates
    return:
        List of rows, each row being the list of tab-separated fields of
        one histogram line. Empty list if the file has no histogram.
    exit:
        Terminates via sys.exit (SystemExit) with a message if the file
        cannot be opened.
    """
    try:
        fin = open(x)
    except IOError:
        sys.exit('I cannot open file %s' %x)
    # Close the handle as soon as the content has been read.
    with fin:
        lines = fin.readlines()

    for i, line in enumerate(lines):
        if line.startswith('## HISTOGRAM'):
            # i + 1 is the histogram column header, data starts at i + 2.
            return [row.strip().split('\t')
                    for row in lines[i + 2:] if row.strip() != '']
    # No histogram section: give the caller nothing to iterate over
    # instead of failing on an unassigned variable.
    return []
# Expand every --input value as a glob pattern, drop duplicate matches and
# process the files in sorted order.
matched = set()
for pattern in args.input:
    matched.update(glob.glob(pattern))
infiles = sorted(matched)

# The histogram file is only created when -H was given.
if args.hist:
    hout = open(args.hist, 'w')
    hout.write('FILENAME\tBIN\tVALUE\n')

header_printed = False
for path in infiles:
    columns, values = parseStats(path)

    # Column names are printed once, taken from the first file.
    if not header_printed:
        print('\t'.join(['FILENAME'] + columns))
        header_printed = True

    # Histogram rows are tagged with the file path as matched by glob.
    if args.hist:
        for row in parseHist(path):
            hout.write('\t'.join([path] + row) + '\n')

    # Stats rows are tagged with the base name of the file only.
    print('\t'.join([os.path.basename(path)] + values))

if args.hist:
    hout.close()

sys.exit()