Skip to content

Commit

Permalink
update splice junction calculation implementation in rust script
Browse files Browse the repository at this point in the history
  • Loading branch information
pre-mRNA committed Apr 15, 2024
1 parent 3f13655 commit 874dbb6
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 39 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ nalgebra = "0.32.4"
bio = "1.6.0"
clap = "4.5.4"
multimap = "0.9.1"
rayon = "1.5.0"

[dev-dependencies]
env_logger = "0.11.3"
16 changes: 15 additions & 1 deletion manuscript_demo/process_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,19 @@ export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}
bash ~/R2Dtool/scripts/cheui_to_bed.sh ${methylation_calls} "${wd}/methylation_calls.bed"

# annotate the methylation calls against gencode v38
# temporarty
cd ~/R2Dtool && rm -rf target && cargo build --release
annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf"
cd $wd; rm splice_sites_map.txt 2>/dev/null
time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed"
cat splice_sites_map.txt | grep "ENST00000000233" -A 20 -B 20

# compare to Rscript version
module load R
# time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylation_calls.bed" "${annotation}" "${wd}/methylation_calls_annotated_R.bed"
cd $wd
for i in *; do head $i > ~/toLocal/${i##*/}; done


# liftover the annotated called to genomic coordinates
time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed"
Expand All @@ -28,10 +40,12 @@ module load R
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper"

# plotMetaJunction
time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/out2.png" "probability" "0.9999" "upper"
time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper"


# annotate methylation calls using Gencode v38


time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylationCalls.bed" "${annotation}" "${wd}/methylationCalls_annotated.bed"
# takes about 9 minutes

Expand Down
2 changes: 1 addition & 1 deletion scripts/R2_annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ meta <- merge_out %>%
mutate(cds_start = utr5_len,
cds_end = utr5_len + cds_len,
tx_end = cds_end + utr3_len) %>%
mutate(rel_pos = ifelse(tx_coord < cds_start, # if the site is in the 5' utr
mutate(transcript_metacoordinate = ifelse(tx_coord < cds_start, # if the site is in the 5' utr
((tx_coord)/(cds_start)), # the relative position is simply the position of the site in the UTR
ifelse(tx_coord < cds_end, # else, if the site is in the CDS
(1 + (tx_coord - utr5_len)/(cds_len)), # the relative position is betwee 1 and 2 and represents the fraction of the cds the site is in
Expand Down
86 changes: 64 additions & 22 deletions src/annotate.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use std::fs::File;
use std::fs::{File,OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::error::Error;
use std::collections::HashMap;
use crate::parse_gtf::{Transcript, read_annotation_file};
use crate::parse_gtf::{Transcript, read_annotation_file, Exon};
use rayon::prelude::*;
use std::sync::{Arc, Mutex};


#[derive(Debug, Clone)]
Expand All @@ -13,7 +15,7 @@ pub struct SpliceSite {

// Additional functions to calculate missing features
fn calculate_tx_len(utr5_len: u64, cds_len: u64, utr3_len: u64) -> u64 {
utr5_len + cds_len + utr3_len + 3
utr5_len + cds_len + utr3_len
}

fn calculate_cds_start(utr5_len: u64) -> u64 {
Expand Down Expand Up @@ -47,56 +49,93 @@ fn calculate_meta_coordinates(tx_coord: u64, utr5_len: u64, cds_len: u64, utr3_l
(rel_pos, abs_cds_start, abs_cds_end)
}

fn generate_splice_sites(transcripts: &HashMap<String, Transcript>) -> HashMap<String, Vec<SpliceSite>> {
let mut splice_sites_map: HashMap<String, Vec<SpliceSite>> = HashMap::new();

for (transcript_id, transcript) in transcripts {
fn generate_splice_sites(transcripts: &HashMap<String, Transcript>) -> Arc<Mutex<HashMap<String, Vec<SpliceSite>>>> {
let splice_sites_map = Arc::new(Mutex::new(HashMap::new()));

let file_path = "splice_sites_map.txt";
let file = OpenOptions::new().write(true).create(true).truncate(true).open(file_path).expect("Unable to open file");
let file = Arc::new(Mutex::new(file));

transcripts.par_iter().for_each(|(transcript_id, transcript)| {
let transcript_id_no_version = transcript_id.split('.').next().unwrap().to_string();
let mut splice_sites: Vec<SpliceSite> = Vec::new();
let mut cumulative_length: u64 = 0;

if !transcript.exons.is_empty() {
let ref exons = transcript.exons;
let mut cumsum_width = 0;
let mut exons: Vec<&Exon> = transcript.exons.iter()
.filter(|exon| exon.feature.as_ref().map_or(false, |ft| ft == "exon"))
.collect();

if transcript.strand.as_deref() == Some("+") {
exons.sort_by_key(|exon| exon.start);
} else if transcript.strand.as_deref() == Some("-") {
exons.sort_by_key(|exon| std::cmp::Reverse(exon.start));
}

for (i, exon) in exons.iter().enumerate() {
let exon_length = exon.end - exon.start + 1;
cumsum_width += exon_length;

if i < exons.len() - 1 {
// Update cumulative length to include exon length
cumulative_length += exon.length;
if i < exons.len() - 1 { // Ignore the last exon
splice_sites.push(SpliceSite {
transcript_id: transcript_id_no_version.clone(),
tx_coord: cumsum_width as u64,
tx_coord: cumulative_length, // Position within the transcript
});
}
// Debug output
// println!("Transcript ID: {}, Exon Length: {}, Cumulative Length: {}", transcript_id_no_version, exon.length, cumulative_length);
}
}
splice_sites_map.insert(transcript_id_no_version, splice_sites);
}

// Writing to file (thread-safe)
let mut file = file.lock().unwrap();
writeln!(file, "Transcript ID: {}", transcript_id_no_version).expect("Unable to write to file");
for splice_site in &splice_sites {
writeln!(file, "Splice Site: {}", splice_site.tx_coord).expect("Unable to write to file");
}
writeln!(file).expect("Unable to write to file"); // Write a newline for better readability

let mut map = splice_sites_map.lock().unwrap();
map.insert(transcript_id_no_version, splice_sites);
});

splice_sites_map
}

fn splice_site_distances(tx_coord: u64, splice_sites: &[SpliceSite]) -> (Option<i64>, Option<i64>) {

// Initialize upstream and downstream distances as None
let mut upstream_distance: Option<i64> = None;
let mut downstream_distance: Option<i64> = None;

// Iterate over all splice sites
for splice_site in splice_sites {
let site_distance = tx_coord as i64 - splice_site.tx_coord as i64;

// Calculate the distance from the current transcript coordinate to the splice site
let site_distance = splice_site.tx_coord as i64 - tx_coord as i64;

if site_distance >= 0 {
// Determine if the site is upstream or downstream
// If site_distance is negative, the splice site is upstream
if site_distance < 0 {
let positive_distance = site_distance.abs();
if upstream_distance.is_none() || positive_distance < upstream_distance.unwrap() {
upstream_distance = Some(positive_distance);
}
}
// Similar logic for downstream
else if site_distance > 0 {
if downstream_distance.is_none() || site_distance < downstream_distance.unwrap() {
downstream_distance = Some(site_distance);
}
} else {
if upstream_distance.is_none() || site_distance.abs() < upstream_distance.unwrap() {
upstream_distance = Some(site_distance.abs());
}
}
}

// Return the smallest positive distances for upstream and downstream junctions
(upstream_distance, downstream_distance)
}


pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Box<dyn Error>> {

// eprintln!("Running the annotate functionality...");
Expand Down Expand Up @@ -174,9 +213,12 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
abs_cds_start = calculated_values.1.to_string();
abs_cds_end = calculated_values.2.to_string();
}


// Lock the mutex to safely access the shared hashmap
let map_lock = splice_sites.lock().expect("Failed to lock the mutex");

// Handle splice sites if available
if let Some(splice_sites) = splice_sites.get(transcript_id) {
if let Some(splice_sites) = map_lock.get(transcript_id) {
let calculated_distances = splice_site_distances(tx_coord, splice_sites);
up_junc_dist = calculated_distances.0.map_or("NA".to_string(), |x| x.to_string());
down_junc_dist = calculated_distances.1.map_or("NA".to_string(), |x| x.to_string());
Expand Down
4 changes: 2 additions & 2 deletions src/parse_gtf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
"UTR" => {

// Construct a UTR feature similar to how you construct an Exon
let utr_feature = Exon { // Assuming UTR features have a similar structure; adjust as needed
let utr_feature = Exon {
seq_id: record.seqname().to_string(),
source: record.source().to_string(),
feature_type: record.feature_type().to_string(),
Expand All @@ -227,7 +227,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
length: feature_length,
score: record.score().map(|s| s as f64),
strand: record.strand().map(|s| s.strand_symbol().to_string()).unwrap_or_else(|| ".".to_string()),
frame: None, // UTRs typically do not have a frame, adjust according to your data structure
frame: None,
attributes: attributes.clone(),
feature: Some("UTR".to_string()), // Explicitly marking the feature as UTR
};
Expand Down
Loading

0 comments on commit 874dbb6

Please sign in to comment.