-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathFigure4.R
165 lines (115 loc) · 7.39 KB
/
Figure4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#Figure 4c 4g
library( "ggplot2" )
library( "reshape2" )
library( "data.table" )
geneinfo <- read.table( "geneinfo.txt", header=TRUE, check.names=FALSE, sep="\t", quote="", comment.char="#" );
system.time( expPan <- fread( "TcgaTargetGtex_fpkm", data.table=FALSE ) )
row.names( expPan ) <- as.character( expPan[ , 1 ] );
expPan <- expPan[ , 2 : ncol( expPan ) ];
phenotype <- read.table( "TcgaTargetGTEX_phenotype.txt", header=TRUE, check.names=FALSE, sep="\t", quote="", comment.char="" )
colnames( phenotype ) <- gsub( "^_", "", colnames( phenotype ) );
# do not use TARGET
phenotype <- subset( phenotype, study %in% c( "GTEX", "TCGA" ) );
expPan <- expPan[ , as.character( phenotype$sample ) ];
# do not use Testis, Bone Marrow, Lymphocytes or Fibroblasts from GTEx
phenotype <- rbind( subset( phenotype, study == "TCGA" ), subset( phenotype, study == "GTEX" & sample_type != "Cell Line" & primary_site != "Testis" ) );
expPan <- expPan[ , as.character( phenotype$sample ) ];
# define normal group and each cancer type group
gdc_cases <- read.table( "gdc_cases_update04072019.txt", header=TRUE, check.names=FALSE, sep="\t", quote="", comment.char="" )
phenotype$Group <- unlist( lapply( as.character( phenotype$sample ), function(x) {
if( grepl( "^GTEX", x ) ) {
site <- as.character( subset( phenotype, sample == x )$primary_site );
if( site == "" ) {
tissue <- as.character( subset( phenotype, sample == x )$`primary disease or tissue` );
site <- c( "Skin", "Stomach", "Esophagus" )[ match( tissue, c( "Skin - Sun Exposed (Lower Leg)", "Stomach", "Esophagus - Mucosa" ) ) ];
}
return( paste( "NormalGTEX", site, sep=", " ) );
}
t <- unlist( strsplit( x, "-" ) )[ 4 ];
p <- paste( unlist( strsplit( x, "-" ) )[ 1:3 ], collapse="-" );
tt <- as.character( subset( gdc_cases, submitter_id == p )$project_id );
if( grepl( "^1", t ) ) { return( paste( "NormalTCGA", as.character( subset( phenotype, sample == x )$primary_site ), sep=", " ) ); }
if( grepl( "^2", t ) ) { return( NA ); }
if( length( tt ) == 0 ) { stop( x, "\n" ); }
return( tt );
} ) )
tab <- table( sort( as.character( phenotype$Group ) ) );
write.table( as.data.frame( tab ), file="tmp2.txt", append=FALSE, quote=F, row.names=FALSE, col.names=TRUE, sep="\t" );
row.names( expPan ) <- unlist( lapply( strsplit( row.names( expPan ), "\\." ), function(x) {x[1]} ) )
# load medianFPKM matrix generated by "C:\Users\linzhang\Box Sync\Jiao0_work\2019 Apr 20 Cancer Specificity\Tools\0.MyAnalysis\SP.by5.correct.R"
expMedian <- read.table( "expMedian.txt", header=TRUE, check.names=FALSE, row.names=1, sep="\t", quote="", comment.char="" )
tlst <- colnames( expMedian );
tlst0 <- tlst[ grepl( "^NormalGTEX", tlst ) ]
tlst1 <- tlst[ grepl( "^TCGA", tlst ) ];
print( length( tlst0 ) ); # 29
print( length( tlst1 ) ); # 33
tlst0 <- c( "NormalGTEX, Kidney", "NormalGTEX, Prostate", "NormalGTEX, Bladder", "NormalGTEX, Lung", "NormalGTEX, Esophagus", "NormalGTEX, Stomach", "NormalGTEX, Colon", "NormalGTEX, Small Intestine", "NormalGTEX, Pancreas", "NormalGTEX, Liver", "NormalGTEX, Adrenal Gland", "NormalGTEX, Thyroid", "NormalGTEX, Breast", "NormalGTEX, Fallopian Tube", "NormalGTEX, Ovary", "NormalGTEX, Cervix Uteri", "NormalGTEX, Uterus", "NormalGTEX, Vagina", "NormalGTEX, Brain", "NormalGTEX, Nerve", "NormalGTEX, Blood", "NormalGTEX, Skin", "NormalGTEX, Adipose Tissue", "NormalGTEX, Blood Vessel", "NormalGTEX, Heart", "NormalGTEX, Muscle", "NormalGTEX, Pituitary", "NormalGTEX, Salivary Gland", "NormalGTEX, Spleen" );
tlst1 <- c( "TCGA-KIRC", "TCGA-KIRP", "TCGA-KICH", "TCGA-PRAD", "TCGA-BLCA", "TCGA-THYM", "TCGA-MESO", "TCGA-LUAD", "TCGA-LUSC", "TCGA-HNSC", "TCGA-ESCA", "TCGA-STAD", "TCGA-COAD", "TCGA-READ", "TCGA-PAAD", "TCGA-LIHC", "TCGA-CHOL", "TCGA-ACC", "TCGA-PCPG", "TCGA-THCA", "TCGA-BRCA", "TCGA-OV", "TCGA-CESC", "TCGA-UCEC", "TCGA-UCS", "TCGA-SARC", "TCGA-TGCT", "TCGA-LAML", "TCGA-DLBC", "TCGA-LGG", "TCGA-GBM", "TCGA-UVM", "TCGA-SKCM" );
# load SP gene list, by Zhongyi, after filtering out immune-related genes
df <- read.table( "SP.by5.L3.txt", header=TRUE, check.names=FALSE, sep="\t", quote="", comment.char="#" );
# do not consider subtype
dfP <- subset( df, grepl( "^TCGA", CancerType ) );
dfP$Tier <- factor( as.character( dfP$Tier ), levels=c( "L1", "L2", "L3" ) );
dfP$CancerType <- factor( as.character( dfP$CancerType ), levels=tlst1 );
dfP <- dfP[ with( dfP, order( Tier, CancerType ) ), ];
# plot examples
plotEg0 <- function( GeneA ) {
# GeneA <- "CLDN6";
print( GeneA );
dfPlot <- subset( phenotype, Group %in% c( tlst0, tlst1 ) );
dfPlot$Class <- unlist( lapply( as.character( dfPlot$Group ), function(x) { return( ifelse( grepl( "^TCGA", x ), 1, 0 ) ); } ) );
if( GeneA %in% row.names( expPan ) ) { dfPlot$expA <- as.vector( as.matrix( expPan[ GeneA, as.character( dfPlot$sample ) ] ) );
} else { GeneA <- as.character( subset( geneinfo, gene == GeneA )$ensg );
dfPlot$expA <- as.vector( as.matrix( expPan[ GeneA, as.character( dfPlot$sample ) ] ) ); }
print( GeneA );
xTick <- sort( unique( as.character( dfPlot$Group ) ) );
t <- subset( dfP, ensg == GeneA );
if( nrow( t ) > 0 ) { fn <- unlist( apply( t[ 1, ], 1, function(x) { return( paste( "t", x[ "gene" ], x[ "Tier" ], sep=" " ) ); } ) );
} else { fn <- unlist( apply( t[ 1, ], 1, function(x) { return( paste( "t", GeneA, x[ "Tier" ], sep=" " ) ); } ) ); }
ymax <- max( unlist( lapply( c( tlst0, tlst1 ), function(x) {
qt <- quantile( subset( dfPlot, Group == x )$expA );
return( ( qt[ "75%" ] - qt[ "25%" ] ) * 1.5 + qt[ "75%" ] ); } ) ) );
yTick <- function(y) {
s <- 10^floor( log10( y ) );
Exp_tick <- 0;
Exp_tick0 <- seq( 0, ceiling( y / s ) ) * s;
Exp_tick1 <- seq( 0, ceiling( y / s / 4 ) ) * s * 4;
Exp_tick2 <- seq( 0, ceiling( y / s / 5 ) ) * s * 5;
if( length( Exp_tick0 ) %in% c( 3, 4, 5 ) ) { Exp_tick <- Exp_tick0; return( Exp_tick ); }
if( length( Exp_tick1 ) %in% c( 3, 4, 5 ) ) { Exp_tick <- Exp_tick1; return( Exp_tick ); }
if( length( Exp_tick2 ) %in% c( 3, 4, 5 ) ) { Exp_tick <- Exp_tick2; return( Exp_tick ); }
return( Exp_tick );
}
Exp_tick <- yTick( ymax );
if( length( Exp_tick ) == 1 ) {
s <- 10^( floor( log10( ymax ) ) - 1 );
Exp_tick <- 0;
Exp_tick0 <- seq( 0, ceiling( ymax / s ) ) * s;
Exp_tick1 <- seq( 0, ceiling( ymax / s / 4 ) ) * s * 4;
Exp_tick2 <- seq( 0, ceiling( ymax / s / 5 ) ) * s * 5;
if( length( Exp_tick0 ) == 4 ) { Exp_tick <- Exp_tick0; }
if( length( Exp_tick0 ) == 5 ) { Exp_tick <- Exp_tick0; }
if( length( Exp_tick1 ) == 4 ) { Exp_tick <- Exp_tick1; }
if( length( Exp_tick1 ) == 5 ) { Exp_tick <- Exp_tick1; }
if( length( Exp_tick2 ) == 4 ) { Exp_tick <- Exp_tick2; }
if( length( Exp_tick2 ) == 5 ) { Exp_tick <- Exp_tick2; }
}
print( ymax );
print( Exp_tick );
qt <- do.call( rbind, lapply( c( tlst0, tlst1 ), function(t) {
qt <- quantile( subset( dfPlot, Group == t )$expA );
return( cbind( data.frame( Tissue=t ), data.frame( t( as.data.frame( qt ) ), check.names=FALSE ) ) );
} ) )
qt$Tissue <- unlist( lapply( as.character( qt$Tissue ), function(y) { return( gsub( "^NormalGTEX, ", "GTEx-", y ) ); } ) );
write.table( qt, file=paste( fn, ".txt", sep="" ), append=FALSE, quote=F, row.names=FALSE, col.names=TRUE, sep="\t" );
}
plotEg0( "MSLN" )
plotEg0( "TM4SF4" )
plotEg0( "GPC3" )
plotEg0( "TM4SF5" )
# --> Figure 4c
plotEg0( "CA9" )
plotEg0( "SLC26A9" )
plotEg0( "GPA33" )
plotEg0( "TMIGD1" )
# --> Figure 4g