Skip to content

Commit 04ef0fd

Browse files
authored
Merge pull request #5 from uclahs-cds/maotian-fix-bugs
Issue fixes for data.frame check and file check
2 parents 71d150e + 656d3fc commit 04ef0fd

19 files changed

+226
-206
lines changed

R/apply.scaling.R

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
apply.scaling <- function(data.matrices, scaling.factors) {
22

3-
# data.matrices can be a single matrix or a list of matrices
4-
# if the data is a single matrix then the class will be 'matrix'
5-
if (class(data.matrices)[1] == 'matrix') {
6-
3+
# `data.matrices` can be either 1. a single matrix or data frame, or 2. a list of matrices or data frames
4+
5+
if (is.matrix(data.matrices) || is.data.frame(data.matrices)) {
6+
77
# check that scaling.factors is have elements with the names "center" and "scale"
88
if (all(!c('center','scale') %in% names(scaling.factors))) {
99
stop('for each data matrix, scaling.factor needs to be a list with center and scale ');
1010
}
1111

12-
1312
# check that scaling.factors are the correct format
1413
if (length(scaling.factors$center) != nrow(data.matrices)) {
1514
stop('the length of scaling.factors$center needs to match the number of rows in data.matrices');
@@ -19,55 +18,53 @@ apply.scaling <- function(data.matrices, scaling.factors) {
1918
}
2019

2120
# if necessary adjust the format of the scaling factors for a single matrix
22-
if (class(scaling.factors$center) == 'list') {
21+
if (is.list(scaling.factors$center)) {
2322
scaling.factors$center <- scaling.factors$center[[1]];
2423
warning('the first item from the scaling.factor$center list was used for scaling');
2524
}
26-
if (class(scaling.factors$scale) == 'list') {
25+
if (is.list(scaling.factors$scale)) {
2726
scaling.factors$scale <- scaling.factors$scale[[1]];
2827
warning('the first item from the scaling.factor$scale list was used for scaling');
2928
}
3029

3130
# scale each row in the matrix by the corresponding scaling factors
32-
for(i in 1:nrow(data.matrices)) {
31+
for (i in 1:nrow(data.matrices)) {
3332
center.adjustment <- scaling.factors$center[rownames(data.matrices)[i]];
3433
scale.adjustment <- 1;
35-
if(scaling.factors$scale[rownames(data.matrices)[i]] > 0) {
34+
if (scaling.factors$scale[rownames(data.matrices)[i]] > 0) {
3635
scale.adjustment <- scaling.factors$scale[rownames(data.matrices)[i]];
3736
}
38-
data.matrices[i,] <- (data.matrices[i,] - center.adjustment) / scale.adjustment;
37+
data.matrices[i, ] <- (data.matrices[i, ] - center.adjustment) / scale.adjustment;
3938
}
4039

4140
# return the scaled single matrix
4241
return(data.matrices);
43-
}
44-
else if (class(data.matrices) == 'list') {
45-
# if you make it to this point then data.matrices is a list
46-
# so check the format of the input and then recurse on each matrix
47-
48-
# check that scaling.factors are the correct format
49-
if (any(sort(names(data.matrices)) != sort(names(scaling.factors)))) {
50-
stop('the scaling.factors list needs to have the same names as the data.matrices list');
51-
}
52-
53-
# if you get to this point then data.matrices is a list of matrices
54-
for(data.type in names(data.matrices)) {
42+
} else if (is.list(data.matrices)) {
43+
# if you make it to this point then data.matrices is a list
44+
# so check the format of the input and then recurse on each matrix
5545

5646
# check that scaling.factors are the correct format
57-
if (length(scaling.factors[[data.type]]$center) != nrow(data.matrices[[data.type]])) {
58-
stop(paste0('scaling.factors$',data.type,'$center does not match the number of rows in data.matrices$',data.type));
47+
if (any(sort(names(data.matrices)) != sort(names(scaling.factors)))) {
48+
stop('the scaling.factors list needs to have the same names as the data.matrices list');
5949
}
60-
if (length(scaling.factors[[data.type]]$scale) != nrow(data.matrices[[data.type]])) {
61-
stop(paste0('scaling.factors$',data.type,'$scale does not match the number of rows in data.matrices$',data.type));
50+
51+
# if you get to this point then data.matrices is a list of matrices
52+
for (data.type in names(data.matrices)) {
53+
54+
# check that scaling.factors are the correct format
55+
if (length(scaling.factors[[data.type]]$center) != nrow(data.matrices[[data.type]])) {
56+
stop(paste0('scaling.factors$', data.type,'$center does not match the number of rows in data.matrices$',data.type));
57+
}
58+
if (length(scaling.factors[[data.type]]$scale) != nrow(data.matrices[[data.type]])) {
59+
stop(paste0('scaling.factors$', data.type,'$scale does not match the number of rows in data.matrices$',data.type));
60+
}
61+
62+
# call the function for each data type
63+
data.matrices[[data.type]] <- apply.scaling(data.matrices[[data.type]], scaling.factors[[data.type]]);
6264
}
63-
64-
# call the function for each data type
65-
data.matrices[[data.type]] <- apply.scaling(data.matrices[[data.type]],scaling.factors[[data.type]]);
66-
}
6765

68-
# return the scaled list of matrices
69-
return(data.matrices);
66+
# return the scaled list of matrices
67+
return(data.matrices);
68+
}
69+
stop('`data.matrices` must be a matrix, a data frame, or a list of matrices or data frames');
7070
}
71-
stop('data.matrices needs to be a matrix or a list of matrices');
72-
}
73-

R/calculate.cis.matrix.R

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ calculate.cis.matrix <- function(
1111
num.iterations = 10,
1212
print.intermediary.similarity.matrices.to.file = TRUE,
1313
print.dir = '.',
14-
patient.proportion.seeds = seq(1,num.iterations),
15-
feature.proportion.seeds = seq(1,num.iterations)
14+
patient.proportion.seeds = seq(1, num.iterations),
15+
feature.proportion.seeds = seq(1, num.iterations)
1616
) {
1717

1818
# pull out the patients to use
1919
patients <- NULL;
20-
for(data.type in data.types) {
20+
for (data.type in data.types) {
2121
if (filter.to.common.patients) {
2222
if (is.null(patients)) {
2323
patients <- colnames(data.matrices[[data.type]])[grep('\\d', colnames(data.matrices[[data.type]]))];
@@ -40,16 +40,16 @@ calculate.cis.matrix <- function(
4040

4141
# repeatly subsample the dataset and calculate integrative similarity
4242
per.patient.data.type.corr <- list();
43-
for(i in 1:num.iterations) {
43+
for (i in 1:num.iterations) {
4444
set.seed(patient.proportion.seeds[i]);
45-
selected.patients <- sample(patients.for.correlations,round(length(patients.for.correlations)*patient.proportion));
45+
selected.patients <- sample(patients.for.correlations, round(length(patients.for.correlations) * patient.proportion));
4646
data.matrices.subset <- data.matrices;
4747
# if the feature proportion is 1, then we don't need to filter the features
4848
# if its not 1, then the features need to be selected for the iteration
4949
if (feature.proportion != 1) {
50-
for(data.type in data.types) {
50+
for (data.type in data.types) {
5151
set.seed(feature.proportion.seeds[i]);
52-
selected.features <- sample(rownames(data.matrices[[data.type]]), ceiling(nrow(data.matrices[[data.type]])*feature.proportion));
52+
selected.features <- sample(rownames(data.matrices[[data.type]]), ceiling(nrow(data.matrices[[data.type]]) * feature.proportion));
5353
data.matrices.subset[[data.type]] <- data.matrices.subset[[data.type]][selected.features,];
5454
}
5555
}
@@ -65,7 +65,7 @@ calculate.cis.matrix <- function(
6565
if (print.intermediary.similarity.matrices.to.file) {
6666
write.table(
6767
per.patient.data.type.corr[[i]],
68-
file = paste0(print.dir,'/',Sys.Date(),'_correlation_matrix_seed_',i,'.txt'),
68+
file = paste0(print.dir, '/', Sys.Date(), '_correlation_matrix_seed_', i, '.txt'),
6969
col.names = TRUE,
7070
row.names = TRUE,
7171
sep = '\t',
@@ -80,9 +80,15 @@ calculate.cis.matrix <- function(
8080
nrow = nrow(per.patient.data.type.corr[[1]]),
8181
ncol = ncol(per.patient.data.type.corr[[1]])
8282
);
83-
for(i in 1:nrow(median.per.patient.data.type.corr)) {
84-
for(j in 1:ncol(median.per.patient.data.type.corr)) {
85-
median.per.patient.data.type.corr[i,j] <- median(sapply(per.patient.data.type.corr, function(x) {x[i,j]}));
83+
for (i in 1:nrow(median.per.patient.data.type.corr)) {
84+
for (j in 1:ncol(median.per.patient.data.type.corr)) {
85+
median.per.patient.data.type.corr[i, j] <- median(
86+
sapply(
87+
per.patient.data.type.corr,
88+
function(x) {
89+
x[i, j]
90+
}
91+
), na.rm = TRUE);
8692
}
8793
}
8894
rownames(median.per.patient.data.type.corr) <- rownames(per.patient.data.type.corr[[1]]);

R/calculate.integrative.similarity.matrix.R

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ calculate.integrative.similarity.matrix <- function(
66
filter.to.common.patients = FALSE,
77
patients.to.return = NULL,
88
patients.for.correlations = NULL
9-
) {
9+
) {
1010

1111
# pull out the patients to use
1212
patients <- NULL;
13-
for(data.type in data.types) {
13+
for (data.type in data.types) {
1414
if (filter.to.common.patients) {
1515
# assume patient IDs have at least one number in them and annotation columns don't
1616
if (is.null(patients)) {
@@ -36,81 +36,86 @@ calculate.integrative.similarity.matrix <- function(
3636
}
3737
else {
3838
patients.to.return <- intersect(patients.to.return, patients);
39-
}
39+
}
4040
if (is.null(patients.for.correlations)) {
4141
patients.for.correlations <- patients;
4242
}
4343
else {
4444
patients.for.correlations <- intersect(patients.for.correlations, patients);
4545
}
46-
patient.pairs <- as.character(sapply(1:(length(patients.for.correlations)), function(x) {paste0(patients.for.correlations[x], ':', patients.to.return)}));
46+
patient.pairs <- as.character(sapply(1:(length(patients.for.correlations)), function(x) {
47+
paste0(patients.for.correlations[x], ':', patients.to.return)
48+
}
49+
));
4750
patients <- sort(unique(c(patients.to.return, patients.for.correlations)));
4851

4952
# calculate pair-wise distances
5053
patient.paired.dists <- list();
5154
patient.paired.dists.matrix <- matrix(NA, ncol = length(data.types), nrow = length(patient.pairs));
5255
colnames(patient.paired.dists.matrix) <- data.types;
5356
rownames(patient.paired.dists.matrix) <- patient.pairs;
54-
for(data.type in data.types) {
57+
for (data.type in data.types) {
5558
# filter to required patients
56-
data.matrices[[data.type]] <- data.matrices[[data.type]][,sort(colnames(data.matrices[[data.type]])[colnames(data.matrices[[data.type]]) %in% patients])];
59+
data.matrices[[data.type]] <- data.matrices[[data.type]][, sort(colnames(data.matrices[[data.type]])[colnames(data.matrices[[data.type]]) %in% patients])];
5760

5861
# set up the matrix of distance pairs that are required
5962
dist.matrix <- matrix(NA, ncol = length(patients.for.correlations), nrow = length(patients.to.return));
6063
colnames(dist.matrix) <- patients.for.correlations;
6164
rownames(dist.matrix) <- patients.to.return;
6265

6366
# determine the most efficient approach for calculating the distances
64-
# we need pr.num x pc.num distances but calculating distances creates
67+
# we need pr.num x pc.num distances but calculating distances creates
6568
# matrix with the same columns and rows which would be (pr.num + pc.num)^2
6669
# calculating distances in sets can mean less unnecessary calculations are done
6770
# comparing the each of the patients.to.return to the patients.for.correlation
6871
# select the number of patients.to.return to compare to patients.for.correlation at a time
6972
# (pc.num + k) is the patients per comparison
70-
# then there will be (ceiling(pr.num/k)-1) comparisons
73+
# then there will be (ceiling(pr.num/k) - 1) comparisons
7174
# and then an additional (pc.num + j) where j is the remainder not calculated
7275
dist.calc.operations <- list();
7376
pr.num <- length(patients.to.return);
7477
pc.num <- length(patients.for.correlations);
7578
opt.num.of.return.to.calc.at.once <- order(sapply(
7679
1:pr.num,
77-
function(k) {(pc.num + k)^2 * (ceiling(pr.num/k)-1) + (pc.num + ifelse((pr.num%%k) == 0, k, pr.num%%k))^2}
78-
))[1];
80+
function(k) {
81+
(pc.num + k)^2 * (ceiling(pr.num / k) - 1) + (pc.num + ifelse((pr.num %% k) == 0, k, pr.num %% k))^2
82+
}
83+
))[1];
7984
pr.tracker <- 0;
80-
while(pr.tracker < pr.num) {
85+
while (pr.tracker < pr.num) {
8186
if ((pr.tracker + opt.num.of.return.to.calc.at.once) < pr.num) {
82-
dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker+1):(pr.tracker+opt.num.of.return.to.calc.at.once)];
87+
dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker + 1):(pr.tracker + opt.num.of.return.to.calc.at.once)];
8388
pr.tracker <- pr.tracker + opt.num.of.return.to.calc.at.once;
8489
}
8590
else {
86-
dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker+1):pr.num];
91+
dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker + 1):pr.num];
8792
pr.tracker <- pr.num;
8893
}
8994
}
9095

9196
# calculate distances and fill in patient by patient distance matrix
92-
for(dist.op in 1:length(dist.calc.operations)) {
97+
for (dist.op in 1:length(dist.calc.operations)) {
9398
if (class(dist.metrics[[data.type]]) == 'character') {
9499

95-
if (dist.metrics[[data.type]] %in% c('pearson','spearman')) {
100+
if (dist.metrics[[data.type]] %in% c('pearson', 'spearman')) {
96101
# if the distance metric is a correlation, convert the correlation into a distance
97102
dist.result <- as.dist(
98-
1 - cor(data.matrices[[data.type]][,intersect(colnames(data.matrices[[data.type]]),unique(c(dist.calc.operations[dist.op][[1]],patients.for.correlations)))],
103+
1 - cor(data.matrices[[data.type]][, intersect(colnames(data.matrices[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))],
99104
use = 'pairwise',
100105
method = dist.metrics[[data.type]])
101106
);
102107
}
103108
else {
104109
# for distances other than correlations use the distance function
105110
dist.result <- distance(
106-
t(data.matrices[[data.type]][,intersect(colnames(data.matrices[[data.type]]),unique(c(dist.calc.operations[dist.op][[1]],patients.for.correlations)))]),
111+
t(data.matrices[[data.type]][, intersect(colnames(data.matrices[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))]),
107112
method = dist.metrics[[data.type]],
108113
use.row.names = TRUE
109114
);
110115
}
111116
}
112117
else if (class(dist.metrics[[data.type]]) == 'function') {
113-
dist.result <- as.dist((dist.metrics[[data.type]])(t(data.matrices[[data.type]][,intersect(colnames(dist.metrics[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))])));
118+
dist.result <- as.dist((dist.metrics[[data.type]])(t(data.matrices[[data.type]][, intersect(colnames(dist.metrics[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))])));
114119
}
115120
else {
116121
stop(paste0('invalid option for ', data.type, ' distance metric: ', dist.metrics[[data.type]]));
@@ -125,29 +130,39 @@ calculate.integrative.similarity.matrix <- function(
125130
}
126131

127132
patient.paired.dists[[data.type]] <- dist.matrix;
128-
patient.pairs <- as.character(sapply(1:(ncol(dist.matrix)),function(x) {paste0(colnames(dist.matrix)[x], ':', rownames(dist.matrix))}));
133+
patient.pairs <- as.character(sapply(1:(ncol(dist.matrix)),function(x) {
134+
paste0(colnames(dist.matrix)[x], ':', rownames(dist.matrix))
135+
}
136+
));
129137
patient.paired.dists.matrix[patient.pairs,data.type] <- as.numeric(dist.matrix);
130138
}
131139
# find the rows with at least one value (not na) beyond the patient by patient comparison
132-
patient.paired.dists.matrix <- patient.paired.dists.matrix[apply(patient.paired.dists.matrix, 1, function(x) {sum(!is.na(x))}) > 1,];
140+
patient.paired.dists.matrix <- patient.paired.dists.matrix[apply(patient.paired.dists.matrix, 1, function(x) {
141+
sum(!is.na(x))
142+
}
143+
) > 1,];
133144

134145
split.rownames <- strsplit(rownames(patient.paired.dists.matrix), ':');
135-
pair.patient1 <- sapply(1:length(split.rownames), function(i) {split.rownames[[i]][1]});
136-
pair.patient2 <- sapply(1:length(split.rownames), function(i) {split.rownames[[i]][2]});
146+
pair.patient1 <- sapply(1:length(split.rownames), function(i) {
147+
split.rownames[[i]][1]
148+
});
149+
pair.patient2 <- sapply(1:length(split.rownames), function(i) {
150+
split.rownames[[i]][2]
151+
});
137152

138153
# calculate correlations (or integrative similarity) between data types
139-
per.patient.data.type.corr <- matrix(NA, nrow = length(patients.to.return), ncol = length(data.types)*(length(data.types)-1)/2);
154+
per.patient.data.type.corr <- matrix(NA, nrow = length(patients.to.return), ncol = length(data.types) * (length(data.types) - 1) / 2);
140155
rownames(per.patient.data.type.corr) <- patients.to.return;
141156
colnames(per.patient.data.type.corr) <- seq(1, ncol(per.patient.data.type.corr));
142157
data.type.pair.counter <- 0;
143-
for(i in 1:(length(data.types)-1)) {
144-
for(j in (i+1):length(data.types)) {
158+
for (i in 1:(length(data.types) - 1)) {
159+
for (j in (i + 1):length(data.types)) {
145160

146161
data.type.pair.counter <- data.type.pair.counter + 1;
147162
colnames(per.patient.data.type.corr)[data.type.pair.counter] <- paste0(data.types[i], ':', data.types[j]);
148163
not.na.rows <- (!is.na(patient.paired.dists.matrix[,i])) & (!is.na(patient.paired.dists.matrix[,j]));
149164

150-
for(patient in rownames(per.patient.data.type.corr)) {
165+
for (patient in rownames(per.patient.data.type.corr)) {
151166
rows.to.use <- which(pair.patient2 == patient & not.na.rows);
152167
if (length(rows.to.use) > 1) {
153168
per.patient.data.type.corr[patient,data.type.pair.counter] <- cor(
@@ -159,7 +174,10 @@ calculate.integrative.similarity.matrix <- function(
159174
}
160175
}
161176
}
162-
per.patient.data.type.corr <- per.patient.data.type.corr[which(apply(per.patient.data.type.corr, 1, function(x) { sum(!is.na(x)) }) > 0), , drop=FALSE];
177+
per.patient.data.type.corr <- per.patient.data.type.corr[which(apply(per.patient.data.type.corr, 1, function(x) {
178+
sum(!is.na(x))
179+
}
180+
) > 0), , drop = FALSE];
163181

164182
return(per.patient.data.type.corr);
165183
}

R/calculate.scaling.R

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
calculate.scaling <- function(data.matrices) {
22
# if there is only one data type to scale
3-
if (class(data.matrices)[1] == 'matrix') {
3+
if (is.matrix(data.matrices) || is.data.frame(data.matrices)) {
44
# return the mean and sd of each row
55
return(list(
6-
center = apply(data.matrices,1,mean),
7-
scale = apply(data.matrices,1,sd)
8-
));
6+
center = apply(data.matrices, 1, mean),
7+
scale = apply(data.matrices, 1, sd)
8+
))
99
}
10-
if (class(data.matrices) == 'list') {
10+
if (is.list(data.matrices)) {
1111
# if there are multiple data types to scale
1212
# return the mean and sd of each row for each data matrix
1313
scaling.factors <- list();
14-
for(data.type in names(data.matrices)) {
14+
for (data.type in names(data.matrices)) {
1515
scaling.factors[[data.type]] <- calculate.scaling(data.matrices[[data.type]]);
1616
}
1717
return(scaling.factors);
1818
}
19-
# if not a list or a matrix return an error message to let the user know how to correct the input
20-
stop('data.matrices needs to be a matrix or a list');
19+
# if `data.matrices` is not a list, a matrix, or a data frame, return an error message to let the user know how to correct the input
20+
stop('`data.matrices` must be a matrix, a data frame, or a list of matrices or data frames');
2121
}

0 commit comments

Comments
 (0)