Commit 291f630

proof of concept for computeMatrix

1 parent 5ced25d

Showing 6 changed files with 818 additions and 4 deletions.

Cargo.toml

Lines changed: 2 additions & 1 deletion

@@ -14,4 +14,5 @@ rust-htslib = "0.47.0"
 rayon = "1.10.0"
 itertools = "0.12.1"
 bigtools = "0.5.3"
-tokio = "*"
+tokio = "*"
+flate2 = "*"

pydeeptools/deeptools/computeMatrix2.py

Lines changed: 416 additions & 0 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -84,4 +84,5 @@ plotHeatmap = "deeptools.plotHeatmap:main"
 plotPCA = "deeptools.plotPCA:main"
 plotProfile = "deeptools.plotProfile:main"
 bamCoverage2 = "deeptools.bamCoverage2:main"
-bamCompare2 = "deeptools.bamCompare2:main"
+bamCompare2 = "deeptools.bamCompare2:main"
+computeMatrix2 = "deeptools.computeMatrix2:main"

src/computematrix.rs

Lines changed: 165 additions & 0 deletions

@@ -0,0 +1,165 @@
use pyo3::prelude::*;
use pyo3::types::PyList;
use crate::filehandler::{read_bedfiles, chrombounds_from_bw, bwintervals, header_matrix, write_matrix};
use rayon::prelude::*;
use rayon::ThreadPoolBuilder;
use std::collections::HashMap;
use std::path::Path;

#[pyfunction]
pub fn r_computematrix(
    mode: &str,
    bedlis: Py<PyList>,
    bwlis: Py<PyList>,
    upstream: u64,
    downstream: u64,
    unscaled5prime: u64,
    unscaled3prime: u64,
    regionbodylength: u64,
    binsize: u64,
    missingdatazero: bool,
    referencepoint: &str,
    nproc: usize,
    verbose: bool,
    ofile: &str
) -> PyResult<()> {

    // Extract the bed and bigwig files from the PyLists into Vecs.
    let mut bed_files: Vec<String> = Vec::new();
    let mut bw_files: Vec<String> = Vec::new();
    Python::with_gil(|py| {
        bed_files = bedlis.extract(py).expect("Failed to retrieve bed file strings.");
        bw_files = bwlis.extract(py).expect("Failed to retrieve bigwig file strings.");
    });
    // Get chromosome boundaries from the first bigwig file.
    let chromsizes = chrombounds_from_bw(&bw_files.get(0).unwrap());
    // Compute the number of basepairs covered per region (used to derive the number of columns).
    let bpsum = &upstream + &downstream + &unscaled5prime + &unscaled3prime + &regionbodylength;
    // Get the 'basepaths' of the bed files to use as labels later on.
    let mut bedlabels: Vec<String> = Vec::new();
    for bed in bed_files.iter() {
        let entryname = Path::new(bed)
            .file_stem()
            .unwrap()
            .to_string_lossy()
            .into_owned();
        bedlabels.push(entryname);
    }
    // Define the scaling regions in a struct.
    let scale_regions = Scalingregions {
        upstream: upstream,
        downstream: downstream,
        unscaled5prime: unscaled5prime,
        unscaled3prime: unscaled3prime,
        regionbodylength: regionbodylength,
        binsize: binsize,
        cols_expected: ((bw_files.len() as u64 * bpsum) / binsize) as usize,
        missingdata_as_zero: missingdatazero,
        referencepoint: referencepoint.to_string(),
        mode: mode.to_string(),
        bwfiles: bw_files.len(),
        avgtype: "mean".to_string(),
        verbose: verbose,
        proc_number: nproc,
        bedlabels: bedlabels
    };
    // Parse regions from the bed files. Note that we retain the name of the bed file (in case there is more than one).
    // Additionally, score and strand are retained; if it's a 3-column bed file we just fill in '.'.
    let (regions, regionsizes) = read_bedfiles(&bed_files);
    let slopregions = slop_regions(
        &regions,
        &scale_regions,
        &chromsizes
    );
    // Process the bigwig files in parallel and concatenate the per-region rows column-wise.
    let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
    let matrix: Vec<Vec<f64>> = pool.install(|| {
        bw_files.par_iter()
            .map(|i| bwintervals(&i, &regions, &slopregions, &scale_regions))
            .reduce(
                || vec![vec![]; regions.len()],
                |mut acc, vec_of_vecs| {
                    for (i, inner_vec) in vec_of_vecs.into_iter().enumerate() {
                        acc[i].extend(inner_vec);
                    }
                    acc
                },
            )
    });
    write_matrix(
        header_matrix(&scale_regions, regionsizes),
        matrix,
        ofile,
        regions
    );
    Ok(())
}

fn slop_regions(
    regions: &Vec<(String, u64, u64, String, String, String)>,
    scale_regions: &Scalingregions,
    chromsizes: &HashMap<String, u64>
) -> Vec<Vec<(u64, u64)>> {
    let mut regionranges: Vec<Vec<(u64, u64)>> = Vec::new();
    // The idea is to create a vector of tuples with the start and end of every bin (binsize passed by computeMatrix).
    // The number of columns needs to be fixed per region.
    // Note that the upstream / downstream stretches could mean that we run out of chromosome.
    // Invalid bins (later to be encoded as NA or 0) are pushed as (0, 0) tuples.
    let col_expected = scale_regions.cols_expected / scale_regions.bwfiles;
    for region in regions.iter() {
        // To implement:
        // // scale-regions + unscaled 5' and 3'
        // // + and - strand encodings
        let chromend: u64 = *chromsizes.get(&region.0).unwrap();
        assert!(region.2 <= chromend, "Region end goes beyond chromosome boundary. Fix your bed files. {:?} > {}", region, chromend);
        assert!(region.1 <= chromend, "Region start goes beyond chromosome boundary. Fix your bed files. {:?} > {}", region, chromend);

        let anchorpoint = match scale_regions.referencepoint.as_str() {
            "TSS" => region.1,
            "TES" => region.2,
            "center" => (region.1 + region.2) / 2,
            _ => panic!("Reference point should be either TSS, TES or center. {:?} is not supported.", scale_regions.referencepoint),
        };

        let mut regionsizes: Vec<(u64, u64)> = Vec::new();
        let mut absstart: i64 = anchorpoint as i64 - scale_regions.upstream as i64;
        let absstop: i64 = anchorpoint as i64 + scale_regions.downstream as i64;
        while absstart < absstop {
            let bin = absstart + scale_regions.binsize as i64;
            if absstart < 0 || bin > chromend as i64 {
                regionsizes.push((0, 0));
            } else {
                regionsizes.push((absstart as u64, bin as u64))
            }
            absstart = bin;
        }
        assert!(
            regionsizes.len() == col_expected,
            "Number of bins does not match the expected number of columns: (CHROMLEN = {}) {:?} \n \n {:?}, \n \n {} != {}",
            *chromsizes.get(&region.0).unwrap(),
            region,
            regionsizes,
            regionsizes.len(),
            col_expected,
        );
        regionranges.push(regionsizes);
    }
    return regionranges;
}
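
// Worked example of the column bookkeeping (numbers chosen purely for illustration):
// in reference-point mode with upstream = downstream = 1000 bp, binsize = 50 bp and
// two bigwig files, bpsum = 2000, so cols_expected = (2 * 2000) / 50 = 80 columns in
// the final matrix and col_expected = 80 / 2 = 40 (start, end) tuples per region.
// Bins that fall before position 0 or past the chromosome end are stored as (0, 0)
// and are later encoded as NA or 0, depending on missingdata_as_zero.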

pub struct Scalingregions {
    pub upstream: u64,
    pub downstream: u64,
    pub unscaled5prime: u64,
    pub unscaled3prime: u64,
    pub regionbodylength: u64,
    pub binsize: u64,
    pub cols_expected: usize,
    pub missingdata_as_zero: bool,
    pub referencepoint: String,
    pub mode: String,
    pub bwfiles: usize,
    pub avgtype: String,
    pub verbose: bool,
    pub proc_number: usize,
    pub bedlabels: Vec<String>
}
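
The excerpt above adds computematrix.rs itself but does not show how r_computematrix is exposed to Python for pydeeptools/deeptools/computeMatrix2.py to call. A minimal sketch of that registration follows; the crate module name (deeptools_rs) and the Bound-based pyo3 signature are assumptions and depend on how lib.rs and the pinned pyo3 version are actually set up.

// Hypothetical lib.rs excerpt: register r_computematrix as a pyo3 function so the
// Python wrapper can import it. Module name and pyo3 signature are assumptions.
use pyo3::prelude::*;

mod computematrix;
mod filehandler; // referenced by computematrix.rs

#[pymodule]
fn deeptools_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(computematrix::r_computematrix, m)?)?;
    Ok(())
}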
