1+ use pyo3:: prelude:: * ;
2+ use pyo3:: types:: PyList ;
3+ use crate :: filehandler:: { read_bedfiles, chrombounds_from_bw, bwintervals, header_matrix, write_matrix} ;
4+ use rayon:: prelude:: * ;
5+ use rayon:: ThreadPoolBuilder ;
6+ use std:: collections:: HashMap ;
7+ use std:: path:: Path ;
8+
9+ #[ pyfunction]
10+ pub fn r_computematrix (
11+ mode : & str ,
12+ bedlis : Py < PyList > ,
13+ bwlis : Py < PyList > ,
14+ upstream : u64 ,
15+ downstream : u64 ,
16+ unscaled5prime : u64 ,
17+ unscaled3prime : u64 ,
18+ regionbodylength : u64 ,
19+ binsize : u64 ,
20+ missingdatazero : bool ,
21+ referencepoint : & str ,
22+ nproc : usize ,
23+ verbose : bool ,
24+ ofile : & str
25+ ) -> PyResult < ( ) > {
26+
27+ // Extract the bed and bigwig files from pyList to Vec.
28+ let mut bed_files: Vec < String > = Vec :: new ( ) ;
29+ let mut bw_files: Vec < String > = Vec :: new ( ) ;
30+ Python :: with_gil ( |py| {
31+ bed_files = bedlis. extract ( py) . expect ( "Failed to retrieve bed file strings." ) ;
32+ bw_files = bwlis. extract ( py) . expect ( "Failed to retrieve bed file strings." ) ;
33+ } ) ;
34+ // Get chromosome boundaries from first bigwig file.
35+ let chromsizes = chrombounds_from_bw ( & bw_files. get ( 0 ) . unwrap ( ) ) ;
36+ // compute number of columns
37+ let bpsum = & upstream + & downstream + & unscaled5prime + & unscaled3prime + & regionbodylength;
38+ // Get the 'basepaths' of the bed files to use as labels later on.
39+ let mut bedlabels: Vec < String > = Vec :: new ( ) ;
40+ for bed in bed_files. iter ( ) {
41+ let entryname = Path :: new ( bed)
42+ . file_stem ( )
43+ . unwrap ( )
44+ . to_string_lossy ( )
45+ . into_owned ( ) ;
46+ bedlabels. push ( entryname) ;
47+ }
48+ // Define the scaling regions in a struct
49+ let scale_regions = Scalingregions {
50+ upstream : upstream,
51+ downstream : downstream,
52+ unscaled5prime : unscaled5prime,
53+ unscaled3prime : unscaled3prime,
54+ regionbodylength : regionbodylength,
55+ binsize : binsize,
56+ cols_expected : ( ( bw_files. len ( ) as u64 * bpsum) / binsize) as usize ,
57+ missingdata_as_zero : missingdatazero,
58+ referencepoint : referencepoint. to_string ( ) ,
59+ mode : mode. to_string ( ) ,
60+ bwfiles : bw_files. len ( ) ,
61+ avgtype : "mean" . to_string ( ) ,
62+ verbose : verbose,
63+ proc_number : nproc,
64+ bedlabels : bedlabels
65+ } ;
66+ // Parse regions from bed files. Note that we retain the name of the bed file (in case there are more then 1)
67+ // Additionaly, score and strand are also retained, if it's a 3-column bed file we just fill in '.'
68+ let ( regions, regionsizes) = read_bedfiles ( & bed_files) ;
69+ let slopregions = slop_regions (
70+ & regions,
71+ & scale_regions,
72+ & chromsizes
73+ ) ;
74+ let pool = ThreadPoolBuilder :: new ( ) . num_threads ( nproc) . build ( ) . unwrap ( ) ;
75+ let matrix: Vec < Vec < f64 > > = pool. install ( || {
76+ bw_files. par_iter ( )
77+ . map ( |i| bwintervals ( & i, & regions, & slopregions, & scale_regions) )
78+ . reduce (
79+ || vec ! [ vec![ ] ; regions. len( ) ] ,
80+ |mut acc, vec_of_vecs| {
81+ for ( i, inner_vec) in vec_of_vecs. into_iter ( ) . enumerate ( ) {
82+ acc[ i] . extend ( inner_vec) ;
83+ }
84+ acc
85+ } ,
86+ )
87+ } ) ;
88+ write_matrix (
89+ header_matrix ( & scale_regions, regionsizes) ,
90+ matrix,
91+ ofile,
92+ regions
93+ ) ;
94+ Ok ( ( ) )
95+ }
96+
97+ fn slop_regions (
98+ regions : & Vec < ( String , u64 , u64 , String , String , String ) > ,
99+ scale_regions : & Scalingregions ,
100+ chromsizes : & HashMap < String , u64 >
101+ ) -> Vec < Vec < ( u64 , u64 ) > > {
102+ let mut regionranges: Vec < Vec < ( u64 , u64 ) > > = Vec :: new ( ) ;
103+ // Idea is to create a vector of tuples with start and end of every bin (binsize passed by computeMatrix).
104+ // The number of columns per region needs to be fixed per region.
105+ // Note that the before / after could mean that we run out of chromosome.
106+ // Invalid regions (later to be encoded as NA or 0), will be pushed as (0,0) tuples.
107+ let col_expected = scale_regions. cols_expected / scale_regions. bwfiles ;
108+ for region in regions. iter ( ) {
109+ // To implement:
110+ // // scale-regions + unscaled 5 and 3
111+ // // + and - encodings
112+ let chromend: u64 = * chromsizes. get ( & region. 0 ) . unwrap ( ) ;
113+ assert ! ( region. 2 <= chromend, "Region end goes beyond chromosome boundary. Fix your bed files. {:?} > {}" , region, chromend) ;
114+ assert ! ( region. 1 <= chromend, "Region start goes beyond chromosome boundary. Fix your bed files. {:?} > {}" , region, chromend) ;
115+
116+ let anchorpoint = match scale_regions. referencepoint . as_str ( ) {
117+ "TSS" => region. 1 ,
118+ "TES" => region. 2 ,
119+ "center" => ( region. 1 + region. 2 ) / 2 ,
120+ _ => panic ! ( "Reference should either be TSS, TES or center. {:?} is not supported." , scale_regions. referencepoint) ,
121+ } ;
122+
123+ let mut regionsizes: Vec < ( u64 , u64 ) > = Vec :: new ( ) ;
124+ let mut absstart: i64 = anchorpoint as i64 - scale_regions. upstream as i64 ;
125+ let absstop: i64 = anchorpoint as i64 + scale_regions. downstream as i64 ;
126+ while absstart < absstop {
127+ let bin = absstart + scale_regions. binsize as i64 ;
128+ if absstart < 0 || bin > chromend as i64 {
129+ regionsizes. push ( ( 0 , 0 ) ) ;
130+ } else {
131+ regionsizes. push ( ( absstart as u64 , bin as u64 ) )
132+ }
133+ absstart = bin;
134+ }
135+ assert ! (
136+ regionsizes. len( ) == col_expected,
137+ "Number of bins does not match expected number of columns: (CHROMLEN = {}) {:?} \n \n {:?}, \n \n {} != {}" ,
138+ * chromsizes. get( & region. 0 ) . unwrap( ) ,
139+ region,
140+ regionsizes,
141+ regionsizes. len( ) ,
142+ col_expected,
143+ ) ;
144+ regionranges. push ( regionsizes) ;
145+ }
146+ return regionranges;
147+ }
148+
149+ pub struct Scalingregions {
150+ pub upstream : u64 ,
151+ pub downstream : u64 ,
152+ pub unscaled5prime : u64 ,
153+ pub unscaled3prime : u64 ,
154+ pub regionbodylength : u64 ,
155+ pub binsize : u64 ,
156+ pub cols_expected : usize ,
157+ pub missingdata_as_zero : bool ,
158+ pub referencepoint : String ,
159+ pub mode : String ,
160+ pub bwfiles : usize ,
161+ pub avgtype : String ,
162+ pub verbose : bool ,
163+ pub proc_number : usize ,
164+ pub bedlabels : Vec < String >
165+ }
0 commit comments