-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloop_f1.py
More file actions
94 lines (87 loc) · 3.19 KB
/
loop_f1.py
File metadata and controls
94 lines (87 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sys
import numpy as np
import os
from collections import defaultdict
def extract_loc(pred_detect_path):
overall_dict = defaultdict(list)
with open(pred_detect_path) as f:
for line in f:
if line.startswith("#"):
continue
line = line.strip().split()
chrom1 = line[0]
try:
start1 = int(float(line[1]))
end1 = int(float(line[2]))
except:
continue
chrom2 = line[3]
try:
start2 = int(float(line[4]))
end2 = int(float(line[5]))
except:
continue
if "chr" not in chrom1:
chrom1 = "chr"+chrom1
overall_dict[chrom1].append([start1, start2])
return overall_dict
def calculate_chromosome_loop_f1(predict_loc, gt_loc,max_scope=5):
"""
predict_loc: nd array N*3
gt_loc: nd array M*3
max_scope: max distance to match
"""
recall = np.zeros(len(gt_loc))
precision = np.zeros(len(predict_loc))
for i,pred_peak in enumerate(predict_loc):
for j,tgt_peak in enumerate(gt_loc):
dist=np.linalg.norm(pred_peak-tgt_peak)
if dist<=max_scope:
recall[j]=1
precision[i]=1
recall = np.mean(recall)
precision = np.mean(precision)
f1score = 2*precision*recall/(precision+recall)
return f1score
def calculate_loop_f1(true_dict, pred_dict, resolution,max_scope=5):
max_scope = max_scope*resolution
loop_f1 = {}
for chrom in true_dict:
if chrom not in pred_dict:
#loop_f1[chrom] = 0
print("Chromosome ",chrom," not in prediction")
continue
true_loc = np.array(true_dict[chrom])
pred_loc = np.array(pred_dict[chrom])
loop_f1[chrom] = calculate_chromosome_loop_f1(pred_loc, true_loc,max_scope)
return loop_f1
#input two bed files, one with the true peaks and one with the predicted peaks
#output the f1 score of the predicted peaks
"""
```
python3 loop_f1.py [true.bed] [pred.bed] [resolution]
```
[true.bed]: the true peaks, in bed format <br>
[pred.bed]: the predicted peaks, in bed format <br>
[resolution]: the resolution of the Hi-C data <br>
"""
if __name__ == '__main__':
if len(sys.argv) != 4:
print("Usage: python3 loop_f1.py true.bed pred.bed resolution")
print("ture.bed: the true peaks, in bed format")
print("pred.bed: the predicted peaks, in bed format")
print("resolution: the resolution of the Hi-C data")
sys.exit(1)
true_bed = sys.argv[1]
pred_bed = sys.argv[2]
resolution = int(sys.argv[3])
true_dict = extract_loc(true_bed)
pred_dict = extract_loc(pred_bed)
count_true_loop = sum([len(true_dict[chrom]) for chrom in true_dict])
count_pred_loop = sum([len(pred_dict[chrom]) for chrom in pred_dict])
print("Number of true loops: ", count_true_loop)
print("Number of predicted loops: ", count_pred_loop)
loop_f1=calculate_loop_f1(true_dict, pred_dict, resolution)
print("Chromosome-wise F1: ",loop_f1)
#average across different chromosomes
print("Overall F1:",np.mean(list(loop_f1.values() ) ) )