Skip to content

Commit 7d3c1b7

Browse files
authored
Merge pull request #30 from uclahs-cds/jarbet-illumina-850K-support
Support for CpGs in Illumina EPIC 850k human methylation array
2 parents 7edfabe + 39c4c53 commit 7d3c1b7

16 files changed

+218
-34
lines changed

DESCRIPTION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
Package: PrCaMethy
22
Title: Prostate Cancer Methylation
3-
Version: 0.1.0
4-
Date: 2025-01-16
3+
Version: 0.2.0
4+
Date: 2025-05-15
55
Maintainer: Jaron Arbet <[email protected]>
6-
Description: Resources for predicting clinical and molecular features using prostate cancer DNA methylation data generated from the Illumina 450K array.
6+
Description: Resources for predicting clinical and molecular features using prostate cancer DNA methylation data.
77
Authors@R: c(
88
person("Jaron Arbet", role = c("aut", "cre"), email = "[email protected]"),
99
person("Paul C. Boutros", role = "aut"))

NEWS.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## PrCaMethy 0.2.0 (2025-05-15)
2+
3+
### New Features
4+
5+
* Support CpGs from the Illumina 850K array when using `gene.methylation` to calculate gene-level methylation. Previously, only the 450K array was supported.
6+
7+
### Enhancements
8+
9+
* Documentation updated to explain that both the Illumina 450K and 850K arrays are supported (previously, only the 450K array was supported).
10+
111
## PrCaMethy 0.1.0 (2025-02-12)
212

313
* First release of the package.

R/gene.methylation.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#' Gene-level methylation
22
#'
3-
#' Calculate gene-level methylation for a dataset containing CpGs from the Illumina 450k methylation array.
3+
#' Calculate gene-level methylation for a dataset containing CpGs from the Illumina 450k and 850k human methylation arrays.
44
#' Gene-level methylation is calculated as the median beta-value among CpG islands in the gene promoter region.
5+
#' Note that any hyphen (-) in a gene name is replaced with a period (.).
56
#'
67
#' @param methy methylation dataset where rownames give patient ids and columns use CpG ids
78
#' @param print.progress TRUE/FALSE to show progress bar

R/gene.promoter.cpgi.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#' CpG islands in the gene promoter region
22
#'
3-
#' List of genes where each element gives a vector of CpG islands in the promoter region of that gene.
3+
#' List of genes where each element gives a vector of CpG islands in the promoter region of that gene. The Illumina cg IDs come from the 450k and 850k arrays.
44
'gene.promoter.cpgi'

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# PrCaMethy
22

3-
The `PrCaMethy` R package offers tools for predicting clinical and molecular features using prostate cancer DNA methylation data generated from the Illumina 450K array.
3+
The `PrCaMethy` R package offers tools for predicting clinical and molecular features using prostate cancer DNA methylation data.
4+
5+
The user provides beta-values for CpGs contained in the Illumina 450K and/or 850K human methylation arrays. See the tutorial below for more details.
46

57
## Tutorial
68

data/example.data.gene.methy.rda

476 Bytes
Binary file not shown.

data/gene.promoter.cpgi.rda

1.23 KB
Binary file not shown.

inst/cpg.annotation.450k.850k.R

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
library(rtracklayer);
2+
library(GenomicRanges);
3+
library(R.utils);
4+
5+
devtools::load_all();
6+
source('config.R') # see project PRAD-000101-MethySubtypes/PrCaMethy/config.R
7+
8+
cpg.450k <- readRDS(arg$path.cpg.annotation.450k);
9+
cpg.850k <- readRDS(arg$path.cpg.annotation.850k);
10+
cpg.850k <- cpg.850k[!cpg.850k$cpg %in% cpg.450k$cpg, ];
11+
cpg.850k <- cpg.850k[!duplicated(cpg.850k$cpg), ];
12+
13+
cpg.450k$array <- '450k';
14+
cpg.850k$array <- '850k';
15+
16+
colnames(cpg.450k)[which(colnames(cpg.450k) == 'chr')] <- 'chr.hg19';
17+
colnames(cpg.450k)[which(colnames(cpg.450k) == 'pos')] <- 'pos.hg19';
18+
colnames(cpg.850k)[which(colnames(cpg.850k) == 'chr')] <- 'chr.hg38';
19+
colnames(cpg.850k)[which(colnames(cpg.850k) == 'pos')] <- 'pos.hg38';
20+
21+
22+
cpg.450k.sub <- cpg.450k[, c('cpg', 'chr.hg19', 'pos.hg19', 'strand', 'cpg.type', 'promoter', 'array','UCSC_RefGene_Name')];
23+
cpg.850k.sub <- cpg.850k[, c('cpg', 'chr.hg38', 'pos.hg38', 'strand', 'cpg.type', 'promoter', 'array','UCSC_RefGene_Name')];
24+
25+
levels(cpg.850k.sub$cpg.type)[levels(cpg.850k.sub$cpg.type) == 'OpenSea'] <- 'Open sea';
26+
stopifnot(all(levels(cpg.450k.sub$cpg.type) == levels(cpg.850k.sub$cpg.type)));
27+
for (i in 1:ncol(cpg.450k.sub)) {
28+
if (is.factor(cpg.450k.sub[, i])) {
29+
#print(i);
30+
stopifnot(all(levels(cpg.450k.sub[, i]) == levels(cpg.850k.sub[, i])));
31+
}
32+
}
33+
34+
35+
########## convert 450k coordinates to hg38
36+
# Create GRanges object from hg19 coordinates
37+
gr.hg19 <- GRanges(
38+
seqnames = paste0('chr', as.character(cpg.450k.sub$chr.hg19)),
39+
ranges = IRanges(start = cpg.450k.sub$pos.hg19, end = cpg.450k.sub$pos.hg19),
40+
strand = cpg.450k.sub$strand,
41+
cpg = cpg.450k.sub$cpg
42+
);
43+
44+
# Download the liftover chain file
45+
# path.file <- file.path(arg$path.save.annot, paste0(chain.file, '.gz'));
46+
# download.file(
47+
# 'http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz',
48+
# destfile = path.file
49+
# );
50+
# gunzip(path.file, remove = FALSE);
51+
52+
53+
# Load the chain file and perform liftOver
54+
chain <- import.chain(arg$path.hg19.to.hg38.chain);
55+
gr.hg38 <- liftOver(gr.hg19, chain)
56+
gr.hg38 <- unlist(gr.hg38)
57+
58+
# Convert back to data frame and merge with original
59+
lifted.df <- data.frame(
60+
cpg = gr.hg38$cpg,
61+
chr.hg38 = gsub('chr', '', as.character(seqnames(gr.hg38))),
62+
pos.hg38 = start(gr.hg38)
63+
);
64+
65+
# Merge hg38 coordinates back into original data
66+
cpg.450k.hg38 <- merge(cpg.450k.sub, lifted.df, by = 'cpg', all.x = TRUE)
67+
mean(is.na(cpg.450k.hg38$pos.hg38));
68+
69+
70+
71+
# pool 450k and 850k annotations
72+
cpg.annotation <- data.frame(data.table::rbindlist(list(cpg.450k.hg38, cpg.850k.sub), fill = TRUE), check.names = FALSE)
73+
cpg.annotation <- cpg.annotation[, c('cpg', 'chr.hg38', 'pos.hg38', 'chr.hg19', 'pos.hg19', 'cpg.type', 'promoter', 'array','UCSC_RefGene_Name')];
74+
mean(cpg.annotation$chr.hg38 == cpg.annotation$chr.hg19, na.rm = TRUE);
75+
76+
saveRDS(
77+
cpg.annotation,
78+
file = file.path(arg$path.save.annot, paste0(Sys.Date(), '_cpg_annotation_450k-850k.rds')),
79+
);
80+
81+
data.table::fwrite(
82+
cpg.annotation,
83+
file = file.path(arg$path.save.annot, paste0(Sys.Date(), '_cpg_annotation_450k-850k.csv.gz')),
84+
row.names = FALSE,
85+
col.names = TRUE
86+
);
Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,15 @@
1-
# https://www.bioconductor.org/packages/release/data/annotation/html/IlluminaHumanMethylation450kanno.ilmn12.hg19.html
2-
# v 0.6.1
31
devtools::load_all();
42
library(IlluminaHumanMethylation450kanno.ilmn12.hg19);
3+
source('config.R') # see project PRAD-000101-MethySubtypes/PrCaMethy/config.R
54

65
data(Other);
76
cpg.annotation <- as.data.frame(Other);
8-
cpg.annotation$genomic.location <- factor(ifelse(
9-
test = cpg.annotation$UCSC_RefGene_Name == '',
10-
yes = 'Intergenic',
11-
no = ifelse(
7+
cpg.annotation$promoter <- ifelse(
128
test = grepl('TSS200|TSS1500|5\'UTR', cpg.annotation$UCSC_RefGene_Group) | cpg.annotation$Regulatory_Feature_Group == 'Promoter_Associated',
13-
yes = 'Promoter',
14-
no = 'Body'
15-
)
16-
));
17-
table(cpg.annotation$genomic.location);
9+
yes = 'yes',
10+
no = 'no'
11+
);
12+
table(cpg.annotation$promoter);
1813

1914
cpg.annotation$cpg <- rownames(cpg.annotation);
2015

@@ -61,4 +56,9 @@ cpg.annotation <- merge(
6156
all.x = TRUE
6257
);
6358

64-
usethis::use_data(cpg.annotation, overwrite = TRUE, compress = 'xz');
59+
cpg.annotation.450k <- cpg.annotation;
60+
#usethis::use_data(cpg.annotation.450k, overwrite = TRUE, compress = 'xz');
61+
saveRDS(
62+
cpg.annotation.450k,
63+
file = file.path(arg$path.save.annot, paste0(Sys.Date(), '_cpg_annotation_450k.rds')),
64+
);

inst/cpg.annotation.850k.R

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
devtools::load_all();
2+
library(IlluminaHumanMethylationEPICv2anno.20a1.hg38);
3+
source('config.R') # see project PRAD-000101-MethySubtypes/PrCaMethy/config.R
4+
5+
data(Other);
6+
cpg.annotation <- as.data.frame(Other);
7+
cpg.annotation$promoter <- ifelse(
8+
test = grepl('TSS200|TSS1500|5\'UTR', cpg.annotation$UCSC_RefGene_Group) | cpg.annotation$Regulatory_Feature_Group == 'Promoter_Associated',
9+
yes = 'yes',
10+
no = 'no'
11+
);
12+
13+
cpg.annotation$cpg.id.full <- rownames(cpg.annotation);
14+
cpg.annotation$cpg <- sapply(strsplit(cpg.annotation$cpg.id.full, '_'), function(x) x[1]);
15+
16+
data(Locations);
17+
locations <- as.data.frame(Locations);
18+
locations$chr <- gsub('chr', '',locations$chr);
19+
stopifnot(all(as.character(c(1:22, 'X', 'Y')) %in% locations$chr));
20+
locations$chr <- factor(
21+
x = locations$chr,
22+
levels = as.character(c(1:22, 'X', 'Y'))
23+
);
24+
locations$cpg.id.full <- rownames(locations);
25+
locations$cpg <- sapply(strsplit(locations$cpg.id.full, '_'), function(x) x[1]);
26+
stopifnot(all(locations$cpg %in% cpg.annotation$cpg));
27+
cpg.annotation <- merge(
28+
x = cpg.annotation,
29+
y = locations,
30+
by = 'cpg.id.full'
31+
);
32+
stopifnot(nrow(cpg.annotation) == nrow(locations));
33+
34+
# island, open sea, shelf, shore
35+
data(Islands.UCSC);
36+
islands <- as.data.frame(Islands.UCSC);
37+
islands$cpg.type <- islands$Relation_to_Island
38+
islands$cpg.type <- factor(islands$cpg.type);
39+
islands$cpg.id.full <- rownames(islands);
40+
islands$cpg <- sapply(strsplit(islands$cpg.id.full, '_'), function(x) x[1]);
41+
cpg.annotation <- merge(
42+
x = cpg.annotation,
43+
y = islands,
44+
by = 'cpg.id.full',
45+
all.x = TRUE
46+
);
47+
stopifnot(nrow(cpg.annotation) == nrow(islands));
48+
49+
cpg.annotation.850k <- cpg.annotation;
50+
#usethis::use_data(cpg.annotation.850k, overwrite = TRUE, compress = 'xz');
51+
saveRDS(
52+
cpg.annotation.850k,
53+
file = file.path(arg$path.save.annot, paste0(Sys.Date(), '_cpg_annotation_850k.rds')),
54+
);

0 commit comments

Comments
 (0)