-
Notifications
You must be signed in to change notification settings - Fork 1
/
qc_Snakefile
51 lines (41 loc) · 1.89 KB
/
qc_Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# vim: ft=python
configfile: 'hpv_config.yaml'
workdir: os.environ['PWD']
shell.executable('bash')
localrules: all_qc
rule all_qc:
input:
expand('posttrim_qc/{sampleID}.filtered_fastqc.zip', sampleID=sampleIDs),
'reports/filtered_read_count.tsv'
rule pretrim_qc:
input: 'bams/{sampleID}.bam'
output: 'pretrim_qc/{sampleID}_fastqc.zip'
threads: 4
run:
shell('fastqc {input} -t {threads} --outdir=pretrim_qc')
rule posttrim_qc:
input: 'mapq_filter/{sampleID}.filtered.bam'
output: 'posttrim_qc/{sampleID}.filtered_fastqc.zip'
threads: 4
run:
shell('fastqc {input} -t {threads} --outdir=posttrim_qc')
rule fastqc_report:
input:
expand('pretrim_qc/{sampleID}_fastqc.zip', sampleID=sampleIDs),
expand('posttrim_qc/{sampleID}.filtered_fastqc.zip', sampleID=sampleIDs)
output: 'multiqc/fastqc_report.html'
run:
shell('multiqc -d pretrim_qc posttrim_qc -o multiqc -n fastqc_report')
rule filtered_count:
input: rules.fastqc_report.output
output: 'reports/filtered_read_count.tsv'
run:
df = pandas.read_table('multiqc/fastqc_report_data/multiqc_general_stats.txt', sep='\t')
df['sampleID'] = df['Sample'].apply(lambda x: x.split(' | ')[1].split('.')[0])
pre = df[df['Sample'].str.contains('pretrim')][['sampleID', 'FastQC_total_sequences']].copy()
post = df[df['Sample'].str.contains('posttrim')][['sampleID', 'FastQC_total_sequences']].copy()
dd = pre.merge(post, on='sampleID', suffixes=('_pre', '_post'))
dd['lowq_reads'] = dd['FastQC_total_sequences_pre'].astype(float) - dd['FastQC_total_sequences_post'].astype(float)
dd['lowq_perc'] = 100 - (dd['FastQC_total_sequences_post'].astype(float) / dd['FastQC_total_sequences_pre'].astype(float) * 100)
shell('mkdir -p reports')
dd.to_csv('reports/filtered_read_count.tsv', sep='\t', index=False)