|
| 1 | +library(rtracklayer); |
| 2 | +library(GenomicRanges); |
| 3 | +library(R.utils); |
| 4 | + |
| 5 | +devtools::load_all(); |
| 6 | +source('config.R') # see project PRAD-000101-MethySubtypes/PrCaMethy/config.R |
| 7 | + |
| 8 | +cpg.450k <- readRDS(arg$path.cpg.annotation.450k); |
| 9 | +cpg.850k <- readRDS(arg$path.cpg.annotation.850k); |
| 10 | +cpg.850k <- cpg.850k[!cpg.850k$cpg %in% cpg.450k$cpg, ]; |
| 11 | +cpg.850k <- cpg.850k[!duplicated(cpg.850k$cpg), ]; |
| 12 | + |
| 13 | +cpg.450k$array <- '450k'; |
| 14 | +cpg.850k$array <- '850k'; |
| 15 | + |
| 16 | +colnames(cpg.450k)[which(colnames(cpg.450k) == 'chr')] <- 'chr.hg19'; |
| 17 | +colnames(cpg.450k)[which(colnames(cpg.450k) == 'pos')] <- 'pos.hg19'; |
| 18 | +colnames(cpg.850k)[which(colnames(cpg.850k) == 'chr')] <- 'chr.hg38'; |
| 19 | +colnames(cpg.850k)[which(colnames(cpg.850k) == 'pos')] <- 'pos.hg38'; |
| 20 | + |
| 21 | + |
| 22 | +cpg.450k.sub <- cpg.450k[, c('cpg', 'chr.hg19', 'pos.hg19', 'strand', 'cpg.type', 'promoter', 'array','UCSC_RefGene_Name')]; |
| 23 | +cpg.850k.sub <- cpg.850k[, c('cpg', 'chr.hg38', 'pos.hg38', 'strand', 'cpg.type', 'promoter', 'array','UCSC_RefGene_Name')]; |
| 24 | + |
| 25 | +levels(cpg.850k.sub$cpg.type)[levels(cpg.850k.sub$cpg.type) == 'OpenSea'] <- 'Open sea'; |
| 26 | +stopifnot(all(levels(cpg.450k.sub$cpg.type) == levels(cpg.850k.sub$cpg.type))); |
| 27 | +for (i in 1:ncol(cpg.450k.sub)) { |
| 28 | + if (is.factor(cpg.450k.sub[, i])) { |
| 29 | + #print(i); |
| 30 | + stopifnot(all(levels(cpg.450k.sub[, i]) == levels(cpg.850k.sub[, i]))); |
| 31 | + } |
| 32 | + } |
| 33 | + |
| 34 | + |
| 35 | +########## convert 450k coordinates to hg38 |
| 36 | +# Create GRanges object from hg19 coordinates |
| 37 | +gr.hg19 <- GRanges( |
| 38 | + seqnames = paste0('chr', as.character(cpg.450k.sub$chr.hg19)), |
| 39 | + ranges = IRanges(start = cpg.450k.sub$pos.hg19, end = cpg.450k.sub$pos.hg19), |
| 40 | + strand = cpg.450k.sub$strand, |
| 41 | + cpg = cpg.450k.sub$cpg |
| 42 | + ); |
| 43 | + |
| 44 | +# Download the liftover chain file |
| 45 | +# path.file <- file.path(arg$path.save.annot, paste0(chain.file, '.gz')); |
| 46 | +# download.file( |
| 47 | +# 'http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz', |
| 48 | +# destfile = path.file |
| 49 | +# ); |
| 50 | +# gunzip(path.file, remove = FALSE); |
| 51 | + |
| 52 | + |
| 53 | +# Load the chain file and perform liftOver |
| 54 | +chain <- import.chain(arg$path.hg19.to.hg38.chain); |
| 55 | +gr.hg38 <- liftOver(gr.hg19, chain) |
| 56 | +gr.hg38 <- unlist(gr.hg38) |
| 57 | + |
| 58 | +# Convert back to data frame and merge with original |
| 59 | +lifted.df <- data.frame( |
| 60 | + cpg = gr.hg38$cpg, |
| 61 | + chr.hg38 = gsub('chr', '', as.character(seqnames(gr.hg38))), |
| 62 | + pos.hg38 = start(gr.hg38) |
| 63 | + ); |
| 64 | + |
| 65 | +# Merge hg38 coordinates back into original data |
| 66 | +cpg.450k.hg38 <- merge(cpg.450k.sub, lifted.df, by = 'cpg', all.x = TRUE) |
| 67 | +mean(is.na(cpg.450k.hg38$pos.hg38)); |
| 68 | + |
| 69 | + |
| 70 | + |
| 71 | +# pool 450k and 850k annotations |
| 72 | +cpg.annotation <- data.frame(data.table::rbindlist(list(cpg.450k.hg38, cpg.850k.sub), fill = TRUE), check.names = FALSE) |
| 73 | +cpg.annotation <- cpg.annotation[, c('cpg', 'chr.hg38', 'pos.hg38', 'chr.hg19', 'pos.hg19', 'cpg.type', 'promoter', 'array','UCSC_RefGene_Name')]; |
| 74 | +mean(cpg.annotation$chr.hg38 == cpg.annotation$chr.hg19, na.rm = TRUE); |
| 75 | + |
| 76 | +saveRDS( |
| 77 | + cpg.annotation, |
| 78 | + file = file.path(arg$path.save.annot, paste0(Sys.Date(), '_cpg_annotation_450k-850k.rds')), |
| 79 | + ); |
| 80 | + |
| 81 | +data.table::fwrite( |
| 82 | + cpg.annotation, |
| 83 | + file = file.path(arg$path.save.annot, paste0(Sys.Date(), '_cpg_annotation_450k-850k.csv.gz')), |
| 84 | + row.names = FALSE, |
| 85 | + col.names = TRUE |
| 86 | + ); |
0 commit comments