uclahs-cds
diff --git a/R/apply.scaling.R b/R/apply.scaling.R
Lines changed: 32 additions & 35 deletions
diff --git a/R/calculate.cis.matrix.R b/R/calculate.cis.matrix.R
Lines changed: 17 additions & 11 deletions
diff --git a/R/calculate.integrative.similarity.matrix.R b/R/calculate.integrative.similarity.matrix.R
Lines changed: 45 additions & 27 deletions
diff --git a/R/calculate.scaling.R b/R/calculate.scaling.R
Lines changed: 8 additions & 8 deletions
@@ -1,15 +1,14 @@
 apply.scaling <- function(data.matrices, scaling.factors) {
 
-	# data.matrices can be a single matrix or a list of matrices
-	# if the data is a single matrix then the class will be 'matrix'	
-	if (class(data.matrices)[1] == 'matrix') {
-	
+	# `data.matrices` can be either 1. a single matrix or data frame, or 2. a list of matrices or data frames
+
+	if (is.matrix(data.matrices) || is.data.frame(data.matrices)) {
+
 		# check that scaling.factors is have elements with the names "center" and "scale"
 		if (all(!c('center','scale') %in% names(scaling.factors))) {
 			stop('for each data matrix, scaling.factor needs to be a list with center and scale ');
 			}
 
-
 		# check that scaling.factors are the correct format
 		if (length(scaling.factors$center) != nrow(data.matrices)) {
 			stop('the length of scaling.factors$center needs to match the number of rows in data.matrices');
@@ -19,55 +18,53 @@ apply.scaling <- function(data.matrices, scaling.factors) {
 			}
 
 		# if necessary adjust the format of the scaling factors for a single matrix
-		if (class(scaling.factors$center) == 'list') {
+		if (is.list(scaling.factors$center)) {
 			scaling.factors$center <- scaling.factors$center[[1]];
 			warning('the first item from the scaling.factor$center list was used for scaling');
 			}
-		if (class(scaling.factors$scale) == 'list') {
+		if (is.list(scaling.factors$scale)) {
 			scaling.factors$scale <- scaling.factors$scale[[1]];
 			warning('the first item from the scaling.factor$scale list was used for scaling');
 			}
 
 		# scale each row in the matrix by the corresponding scaling factors
-		for(i in 1:nrow(data.matrices)) {
+		for (i in 1:nrow(data.matrices)) {
 			center.adjustment <- scaling.factors$center[rownames(data.matrices)[i]];
 			scale.adjustment <- 1;
-			if(scaling.factors$scale[rownames(data.matrices)[i]] > 0) {
+			if (scaling.factors$scale[rownames(data.matrices)[i]] > 0) {
 				scale.adjustment <- scaling.factors$scale[rownames(data.matrices)[i]];
 				}
-			data.matrices[i,] <- (data.matrices[i,] - center.adjustment) / scale.adjustment;
+			data.matrices[i, ] <- (data.matrices[i, ] - center.adjustment) / scale.adjustment;
 			}
 
 		# return the scaled single matrix
 		return(data.matrices);
-		}
-	else if (class(data.matrices) == 'list') {
-		# if you make it to this point then data.matrices is a list
-		# so check the format of the input and then recurse on each matrix
-
-		# check that scaling.factors are the correct format
-		if (any(sort(names(data.matrices)) != sort(names(scaling.factors)))) {
-			stop('the scaling.factors list needs to have the same names as the data.matrices list');
-			}
-
-		# if you get to this point then data.matrices is a list of matrices
-		for(data.type in names(data.matrices)) {
+		} else if (is.list(data.matrices)) {
+			# if you make it to this point then data.matrices is a list
+			# so check the format of the input and then recurse on each matrix
 
 			# check that scaling.factors are the correct format
-			if (length(scaling.factors[[data.type]]$center) != nrow(data.matrices[[data.type]])) {
-				stop(paste0('scaling.factors$',data.type,'$center does not match the number of rows in data.matrices$',data.type));
+			if (any(sort(names(data.matrices)) != sort(names(scaling.factors)))) {
+				stop('the scaling.factors list needs to have the same names as the data.matrices list');
 				}
-			if (length(scaling.factors[[data.type]]$scale) != nrow(data.matrices[[data.type]])) {
-				stop(paste0('scaling.factors$',data.type,'$scale does not match the number of rows in data.matrices$',data.type));
+
+			# if you get to this point then data.matrices is a list of matrices
+			for (data.type in names(data.matrices)) {
+
+				# check that scaling.factors are the correct format
+				if (length(scaling.factors[[data.type]]$center) != nrow(data.matrices[[data.type]])) {
+					stop(paste0('scaling.factors$', data.type,'$center does not match the number of rows in data.matrices$',data.type));
+					}
+				if (length(scaling.factors[[data.type]]$scale) != nrow(data.matrices[[data.type]])) {
+					stop(paste0('scaling.factors$', data.type,'$scale does not match the number of rows in data.matrices$',data.type));
+					}
+
+				# call the function for each data type
+				data.matrices[[data.type]] <- apply.scaling(data.matrices[[data.type]], scaling.factors[[data.type]]);
 				}
-		
-			# call the function for each data type
-			data.matrices[[data.type]] <- apply.scaling(data.matrices[[data.type]],scaling.factors[[data.type]]);
-			}
 
-		# return the scaled list of matrices
-		return(data.matrices);
+			# return the scaled list of matrices
+			return(data.matrices);
+			}
+		stop('`data.matrices` must be a matrix, a data frame, or a list of matrices or data frames');
 		}
-	stop('data.matrices needs to be a matrix or a list of matrices');
-	}
-
@@ -11,13 +11,13 @@ calculate.cis.matrix <- function(
 	num.iterations = 10,
 	print.intermediary.similarity.matrices.to.file = TRUE,
 	print.dir = '.',
-	patient.proportion.seeds = seq(1,num.iterations),
-	feature.proportion.seeds = seq(1,num.iterations)
+	patient.proportion.seeds = seq(1, num.iterations),
+	feature.proportion.seeds = seq(1, num.iterations)
 	) {
 
 	# pull out the patients to use
 	patients <- NULL;
-	for(data.type in data.types) {
+	for (data.type in data.types) {
 		if (filter.to.common.patients) {
 			if (is.null(patients)) {
 				patients <- colnames(data.matrices[[data.type]])[grep('\\d', colnames(data.matrices[[data.type]]))];
@@ -40,16 +40,16 @@ calculate.cis.matrix <- function(
 
 	# repeatly subsample the dataset and calculate integrative similarity
 	per.patient.data.type.corr <- list();
-	for(i in 1:num.iterations) {
+	for (i in 1:num.iterations) {
 		set.seed(patient.proportion.seeds[i]);
-		selected.patients <- sample(patients.for.correlations,round(length(patients.for.correlations)*patient.proportion));
+		selected.patients <- sample(patients.for.correlations, round(length(patients.for.correlations) * patient.proportion));
 		data.matrices.subset <- data.matrices;
 		# if the feature proportion is 1, then we don't need to filter the features
 		# if its not 1, then the features need to be selected for the iteration
 		if (feature.proportion != 1) {
-			for(data.type in data.types) {
+			for (data.type in data.types) {
 				set.seed(feature.proportion.seeds[i]);
-				selected.features <- sample(rownames(data.matrices[[data.type]]), ceiling(nrow(data.matrices[[data.type]])*feature.proportion));
+				selected.features <- sample(rownames(data.matrices[[data.type]]), ceiling(nrow(data.matrices[[data.type]]) * feature.proportion));
 				data.matrices.subset[[data.type]] <- data.matrices.subset[[data.type]][selected.features,];
 				}
 			}
@@ -65,7 +65,7 @@ calculate.cis.matrix <- function(
 		if (print.intermediary.similarity.matrices.to.file) {
 			write.table(
 				per.patient.data.type.corr[[i]],
-				file = paste0(print.dir,'/',Sys.Date(),'_correlation_matrix_seed_',i,'.txt'),
+				file = paste0(print.dir, '/', Sys.Date(), '_correlation_matrix_seed_', i, '.txt'),
 				col.names = TRUE,
 				row.names = TRUE,
 				sep = '\t',
@@ -80,9 +80,15 @@ calculate.cis.matrix <- function(
 		nrow = nrow(per.patient.data.type.corr[[1]]),
 		ncol = ncol(per.patient.data.type.corr[[1]])
 		);
-	for(i in 1:nrow(median.per.patient.data.type.corr)) {
-		for(j in 1:ncol(median.per.patient.data.type.corr)) {
-			median.per.patient.data.type.corr[i,j] <- median(sapply(per.patient.data.type.corr, function(x) {x[i,j]}));
+	for (i in 1:nrow(median.per.patient.data.type.corr)) {
+		for (j in 1:ncol(median.per.patient.data.type.corr)) {
+			median.per.patient.data.type.corr[i, j] <- median(
+				sapply(
+					per.patient.data.type.corr,
+					function(x) {
+						x[i, j]
+						}
+						), na.rm = TRUE);
 			}
 		}
 	rownames(median.per.patient.data.type.corr) <- rownames(per.patient.data.type.corr[[1]]);
 
@@ -6,11 +6,11 @@ calculate.integrative.similarity.matrix <- function(
 	filter.to.common.patients = FALSE,
 	patients.to.return = NULL,
 	patients.for.correlations = NULL
-	) { 
+	) {
 
 	# pull out the patients to use
 	patients <- NULL;
-	for(data.type in data.types) {
+	for (data.type in data.types) {
 		if (filter.to.common.patients) {
 			# assume patient IDs have at least one number in them and annotation columns don't
 			if (is.null(patients)) {
@@ -36,81 +36,86 @@ calculate.integrative.similarity.matrix <- function(
 		}
 	else {
 		patients.to.return <- intersect(patients.to.return, patients);
-		} 
+		}
 	if (is.null(patients.for.correlations)) {
 		patients.for.correlations <- patients;
 		}
 	else {
 		patients.for.correlations <- intersect(patients.for.correlations, patients);
 		}
-	patient.pairs <- as.character(sapply(1:(length(patients.for.correlations)), function(x) {paste0(patients.for.correlations[x], ':', patients.to.return)}));
+	patient.pairs <- as.character(sapply(1:(length(patients.for.correlations)), function(x) {
+																					paste0(patients.for.correlations[x], ':', patients.to.return)
+																					}
+																					));
 	patients <- sort(unique(c(patients.to.return, patients.for.correlations)));
 
 	# calculate pair-wise distances
 	patient.paired.dists <- list();
 	patient.paired.dists.matrix <- matrix(NA, ncol = length(data.types), nrow = length(patient.pairs));
 	colnames(patient.paired.dists.matrix) <- data.types;
 	rownames(patient.paired.dists.matrix) <- patient.pairs;
-	for(data.type in data.types) {
+	for (data.type in data.types) {
 		# filter to required patients
-		data.matrices[[data.type]] <- data.matrices[[data.type]][,sort(colnames(data.matrices[[data.type]])[colnames(data.matrices[[data.type]]) %in% patients])];
+		data.matrices[[data.type]] <- data.matrices[[data.type]][, sort(colnames(data.matrices[[data.type]])[colnames(data.matrices[[data.type]]) %in% patients])];
 
 		# set up the matrix of distance pairs that are required
 		dist.matrix <- matrix(NA, ncol = length(patients.for.correlations), nrow = length(patients.to.return));
 		colnames(dist.matrix) <- patients.for.correlations;
 		rownames(dist.matrix) <- patients.to.return;
 
 		# determine the most efficient approach for calculating the distances
-		# we need pr.num x pc.num distances but calculating distances creates 
+		# we need pr.num x pc.num distances but calculating distances creates
 		# matrix with the same columns and rows which would be (pr.num + pc.num)^2
 		# calculating distances in sets can mean less unnecessary calculations are done
 		# comparing the each of the patients.to.return to the patients.for.correlation
 		# select the number of patients.to.return to compare to patients.for.correlation at a time
 		# (pc.num + k) is the patients per comparison
-		# then there will be (ceiling(pr.num/k)-1) comparisons
+		# then there will be (ceiling(pr.num/k) - 1) comparisons
 		# and then an additional (pc.num + j) where j is the remainder not calculated
 		dist.calc.operations <- list();
 		pr.num <- length(patients.to.return);
 		pc.num <- length(patients.for.correlations);
 		opt.num.of.return.to.calc.at.once <- order(sapply(
 			1:pr.num,
-			function(k) {(pc.num + k)^2 * (ceiling(pr.num/k)-1) + (pc.num + ifelse((pr.num%%k) == 0, k, pr.num%%k))^2}
-			))[1];
+			function(k) {
+				(pc.num + k)^2 * (ceiling(pr.num / k) - 1) + (pc.num + ifelse((pr.num %% k) == 0, k, pr.num %% k))^2
+				}
+				))[1];
 		pr.tracker <- 0;
-		while(pr.tracker < pr.num) {
+		while (pr.tracker < pr.num) {
 			if ((pr.tracker + opt.num.of.return.to.calc.at.once) < pr.num) {
-				dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker+1):(pr.tracker+opt.num.of.return.to.calc.at.once)];
+				dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker + 1):(pr.tracker + opt.num.of.return.to.calc.at.once)];
 				pr.tracker <- pr.tracker + opt.num.of.return.to.calc.at.once;
 				}
 			else {
-				dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker+1):pr.num];
+				dist.calc.operations[[as.character(pr.tracker)]] <- patients.to.return[(pr.tracker + 1):pr.num];
 				pr.tracker <- pr.num;
 				}
 			}
 
 		# calculate distances and fill in patient by patient distance matrix
-		for(dist.op in 1:length(dist.calc.operations)) {
+		for (dist.op in 1:length(dist.calc.operations)) {
 			if (class(dist.metrics[[data.type]]) == 'character') {
 
-				if (dist.metrics[[data.type]] %in% c('pearson','spearman')) {
+				if (dist.metrics[[data.type]] %in% c('pearson', 'spearman')) {
 					# if the distance metric is a correlation, convert the correlation into a distance
 					dist.result <- as.dist(
-						1 - cor(data.matrices[[data.type]][,intersect(colnames(data.matrices[[data.type]]),unique(c(dist.calc.operations[dist.op][[1]],patients.for.correlations)))],
+						1 - cor(data.matrices[[data.type]][, intersect(colnames(data.matrices[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))],
 						use = 'pairwise',
 						method = dist.metrics[[data.type]])
 						);
 					}
 				else {
 					# for distances other than correlations use the distance function
 					dist.result <- distance(
-						t(data.matrices[[data.type]][,intersect(colnames(data.matrices[[data.type]]),unique(c(dist.calc.operations[dist.op][[1]],patients.for.correlations)))]),
+						t(data.matrices[[data.type]][, intersect(colnames(data.matrices[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))]),
 						method = dist.metrics[[data.type]],
 						use.row.names = TRUE
 						);
 					}
 				}
 			else if (class(dist.metrics[[data.type]]) == 'function') {
-				dist.result <- as.dist((dist.metrics[[data.type]])(t(data.matrices[[data.type]][,intersect(colnames(dist.metrics[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))])));
+				dist.result <- as.dist((dist.metrics[[data.type]])(t(data.matrices[[data.type]][, intersect(colnames(dist.metrics[[data.type]]), unique(c(dist.calc.operations[dist.op][[1]], patients.for.correlations)))])));
 				}
 			else {
 				stop(paste0('invalid option for ', data.type, ' distance metric: ', dist.metrics[[data.type]]));
@@ -125,29 +130,39 @@ calculate.integrative.similarity.matrix <- function(
 			}
 
 		patient.paired.dists[[data.type]] <- dist.matrix;
-		patient.pairs <- as.character(sapply(1:(ncol(dist.matrix)),function(x) {paste0(colnames(dist.matrix)[x], ':', rownames(dist.matrix))}));
+		patient.pairs <- as.character(sapply(1:(ncol(dist.matrix)),function(x) {
+																		paste0(colnames(dist.matrix)[x], ':', rownames(dist.matrix))
+																		}
+																		));
 		patient.paired.dists.matrix[patient.pairs,data.type] <- as.numeric(dist.matrix);
 		}
 	# find the rows with at least one value (not na) beyond the patient by patient comparison
-	patient.paired.dists.matrix <- patient.paired.dists.matrix[apply(patient.paired.dists.matrix, 1, function(x) {sum(!is.na(x))}) > 1,];
+	patient.paired.dists.matrix <- patient.paired.dists.matrix[apply(patient.paired.dists.matrix, 1, function(x) {
+																										sum(!is.na(x))
+																										}
+																										) > 1,];
 
 	split.rownames <- strsplit(rownames(patient.paired.dists.matrix), ':');
-	pair.patient1 <- sapply(1:length(split.rownames), function(i) {split.rownames[[i]][1]});
-	pair.patient2 <- sapply(1:length(split.rownames), function(i) {split.rownames[[i]][2]});
+	pair.patient1 <- sapply(1:length(split.rownames), function(i) {
+														split.rownames[[i]][1]
+														});
+	pair.patient2 <- sapply(1:length(split.rownames), function(i) {
+														split.rownames[[i]][2]
+														});
 
 	# calculate correlations (or integrative similarity) between data types
-	per.patient.data.type.corr <- matrix(NA, nrow = length(patients.to.return), ncol = length(data.types)*(length(data.types)-1)/2);
+	per.patient.data.type.corr <- matrix(NA, nrow = length(patients.to.return), ncol = length(data.types) * (length(data.types) - 1) / 2);
 	rownames(per.patient.data.type.corr) <- patients.to.return;
 	colnames(per.patient.data.type.corr) <- seq(1, ncol(per.patient.data.type.corr));
 	data.type.pair.counter <- 0;
-	for(i in 1:(length(data.types)-1)) {
-		for(j in (i+1):length(data.types)) {
+	for (i in 1:(length(data.types) - 1)) {
+		for (j in (i + 1):length(data.types)) {
 
 			data.type.pair.counter <- data.type.pair.counter + 1;
 			colnames(per.patient.data.type.corr)[data.type.pair.counter] <- paste0(data.types[i], ':', data.types[j]);
 			not.na.rows <- (!is.na(patient.paired.dists.matrix[,i])) & (!is.na(patient.paired.dists.matrix[,j]));
 
-			for(patient in rownames(per.patient.data.type.corr)) {
+			for (patient in rownames(per.patient.data.type.corr)) {
 				rows.to.use <- which(pair.patient2 == patient & not.na.rows);
 				if (length(rows.to.use) > 1) {
 					per.patient.data.type.corr[patient,data.type.pair.counter] <- cor(
@@ -159,7 +174,10 @@ calculate.integrative.similarity.matrix <- function(
 				}
 			}
 		}
-	per.patient.data.type.corr <- per.patient.data.type.corr[which(apply(per.patient.data.type.corr, 1, function(x) { sum(!is.na(x)) }) > 0), , drop=FALSE];
+	per.patient.data.type.corr <- per.patient.data.type.corr[which(apply(per.patient.data.type.corr, 1, function(x) {
+																											sum(!is.na(x))
+																											}
+																											) > 0), , drop = FALSE];
 
 	return(per.patient.data.type.corr);
 	}
@@ -1,21 +1,21 @@
 calculate.scaling <- function(data.matrices) {
 	# if there is only one data type to scale
-	if (class(data.matrices)[1] == 'matrix') {
+	if (is.matrix(data.matrices) || is.data.frame(data.matrices)) {
 		# return the mean and sd of each row
 		return(list(
-			center = apply(data.matrices,1,mean),
-			scale = apply(data.matrices,1,sd)
-			));
+			center = apply(data.matrices, 1, mean),
+			scale = apply(data.matrices, 1, sd)
+			))
 		}
-	if (class(data.matrices) == 'list') {
+	if (is.list(data.matrices)) {
 		# if there are multiple data types to scale
 		# return the mean and sd of each row for each data matrix
 		scaling.factors <- list();
-		for(data.type in names(data.matrices)) {
+		for (data.type in names(data.matrices)) {
 			scaling.factors[[data.type]] <- calculate.scaling(data.matrices[[data.type]]);
 			}
 		return(scaling.factors);
 		}
-	# if not a list or a matrix return an error message to let the user know how to correct the input
-	stop('data.matrices needs to be a matrix or a list');
+	# if `data.matrices` is not a matrix, a data frame, or a list, stop with an error that tells the user how to correct the input
+	stop('`data.matrices` must be a matrix, a data frame, or a list of matrices or data frames');
 	}