hic-helper/analysis/loop_f1.py at main · Noble-Lab/hic-helper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

import sys
import numpy as np
import os
from collections import defaultdict

def extract_loc(pred_detect_path):
    overall_dict = defaultdict(list)
    with open(pred_detect_path) as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip().split()
            chrom1 = line[0]
            try:
                start1 = int(float(line[1]))
                end1 = int(float(line[2]))
            except:
                continue
            chrom2 = line[3]
            try:
                start2 = int(float(line[4]))
                end2 = int(float(line[5]))
            except:
                continue
            if "chr" not in chrom1:
                chrom1 = "chr"+chrom1
            overall_dict[chrom1].append([start1, start2])

    return overall_dict
def calculate_chromosome_loop_f1(predict_loc, gt_loc,max_scope=5):
    """
    predict_loc: nd array N*3
    gt_loc: nd array M*3
    max_scope: max distance to match
    """
    recall = np.zeros(len(gt_loc))
    precision = np.zeros(len(predict_loc))
    for i,pred_peak in enumerate(predict_loc):
        for j,tgt_peak in enumerate(gt_loc):
            dist=np.linalg.norm(pred_peak-tgt_peak)
            if dist<=max_scope:
                recall[j]=1
                precision[i]=1
    recall = np.mean(recall)
    precision = np.mean(precision)
    f1score = 2*precision*recall/(precision+recall)
    return f1score
def calculate_loop_f1(true_dict, pred_dict, resolution,max_scope=5):
    max_scope = max_scope*resolution
    loop_f1 = {}
    for chrom in true_dict:
        if chrom not in pred_dict:
            #loop_f1[chrom] = 0
            print("Chromosome ",chrom," not in prediction")
            continue
        true_loc = np.array(true_dict[chrom])
        pred_loc = np.array(pred_dict[chrom])
        loop_f1[chrom] = calculate_chromosome_loop_f1(pred_loc, true_loc,max_scope)
    return loop_f1


#input two bed files, one with the true peaks and one with the predicted peaks
#output the f1 score of the predicted peaks
"""
```
python3 loop_f1.py [true.bed] [pred.bed] [resolution]
```
[true.bed]: the true peaks, in bed format <br>
[pred.bed]: the predicted peaks, in bed format <br>
[resolution]: the resolution of the Hi-C data <br>

"""
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: python3 loop_f1.py true.bed pred.bed resolution")
        print("ture.bed: the true peaks, in bed format")
        print("pred.bed: the predicted peaks, in bed format")
        print("resolution: the resolution of the Hi-C data")
        sys.exit(1)
    true_bed = sys.argv[1]
    pred_bed = sys.argv[2]
    resolution = int(sys.argv[3])
    true_dict = extract_loc(true_bed)
    pred_dict = extract_loc(pred_bed)
    count_true_loop = sum([len(true_dict[chrom]) for chrom in true_dict])
    count_pred_loop = sum([len(pred_dict[chrom]) for chrom in pred_dict])
    print("Number of true loops: ", count_true_loop)
    print("Number of predicted loops: ", count_pred_loop)
    loop_f1=calculate_loop_f1(true_dict, pred_dict, resolution)
    print("Chromosome-wise F1: ",loop_f1)
    #average across different chromosomes
    print("Overall F1:",np.mean(list(loop_f1.values() ) ) )