Skip to content

Latest commit

 

History

History
201 lines (188 loc) · 14 KB

CABANA_2020.md

File metadata and controls

201 lines (188 loc) · 14 KB


CABANA Workshop San Jose Costa Rica @ University of Costa Rica

Virome Network Analysis (V i N A) framework adapoted for VirusDetect

Code developed by Ricardo I Alcala Ph.D. candidate | @Garrett lab University of Florida


# R version 3.6.2

#@------------------------ SET UP ENVIRONMENT ------------------------ #@------ packages #uncomment to install ----- # Install Libraries: # install.packages("tidyr") # install.packages("igraph") # install.packages("bipartite") # install.packages("XML") # install.packages("htmltab") #@ load libraries library(plyr) library(tidyverse) library(tidyr) library(igraph) library(bipartite) library(XML) library(htmltab)

Notas:

Jan no confio cuando hay minimo dos mistmatches en los siRNAs

Los 21 y 22 nt son mayoritariamente de virus
Los de 24 son mayoritariamente de origen vegetal

En```R
#@---- Parsing function -----
dat.parse <- function(df, location, host){

```R
# Formatting metadata
df$sampleID <- location
df$host <- host
# ReGex and residuals removal
names <- str_extract(df$Description, ".*virus[:space:][:alnum:]")
a <- gsub("virus i", "virus", names)
a <- gsub("virus s", "virus", a)
# if there are more 'CHARACTER' after virus, add another line as follows
# a <- gsub("virus s", "virus", 'CHARACTER')
df$Species <- gsub("virus p", "virus", a)
# Generating pseudo-acronyms
df$acronym <- abbreviate(df$Species)
print(df)
}

#----- setwd -----
setwd("…/ViNA_for_CABANA-master/")
#------------------------ DATA MANIPULATION ------------------------
#---- Loading data from Virus Detect -----
loaded_files <- list(
vd.re1 <- readHTMLTable(‘Cca59_blastn.html’),
vd.re2 <- readHTMLTable(‘Czo24_blastn.html’),
vd.re3 <- readHTMLTable(‘Czo25_blastn.html’),
vd.re4 <- readHTMLTable(‘Czo42_blastn.html’),
vd.re5 <- readHTMLTable(‘Czo56_blastn.html’),
vd.re6 <- readHTMLTable(‘Hua25_blastn.html’),
vd.re7 <- readHTMLTable(‘Ica87_blastn.html’),
vd.re8 <- readHTMLTable(‘Jin116_blastn.html’)
)

> Note:
#loaded_files[[1]] # change this number to check other inputs [['n']]
#@---- Formatting as tibble and numbers -----
results.ls = temp = list(NULL)
for (k in seq_along(loaded_files)){
	temp[[k]] <- as_tibble(loaded_files[[k]]$`NULL`[1:dim(loaded_files[[k]]$`NULL`)[2]])
	results.ls[[k]] <- temp[[k]] %>%
		mutate(`Depth (Norm)` = as.numeric(as.character(`Depth (Norm)`)) )
}
results.ls
#@--- Adding metadata to Virus Detect cuando no se parecen a nada, se van a un 'archivo' especial de no 'matches' que 
pueden tener secuencias de interés.

Badnavirus en camote están a muy bajos titulos Mastrevirus en camote no se sabe que hace, pero esta bajo y tiene muy poca variación.

SPCSV the HIV of sweetpotato

  • no introducir ciertas variantes de un lugar a otro por los tipos de cultivares

Results ----- host <- rep("potato", length(results.ls)) # add the number of samples to study locations <- c("Cca5", "Czo24", "Czo25", "Czo45", "Czo56", "Hua25", "Ica87", "Jin16") # add the name of the different locations

LOCATION KEY
– CZO = Cuzco
– HUA = HUANCAVELICA
– ICA =ICA
– JIN = JUNIN
– CCA =CAJAMARCA

#--- Generating final data -----
res <- list(NULL)
for (i in seq_along(results.ls)){
	print(i)
	res[[i]] <- dat.parse(results.ls[[i]], locations[i], host[i])
}
#@------ Create a data frame with all individual information
df <- na.omit(as_tibble(rbind.fill(res)))
#@----- Create summary data and incidence matrix -----
dfs <- ddply(df, .(sampleID, acronym), summarise, cov = mean(`Depth (Norm)`))
dfs <- t(spread(dfs, sampleID, cov, drop=TRUE , fill = 0))
in.mat = t(apply(dfs[2:8,], 1, as.numeric))
colnames(in.mat) <- dfs[1,]
#@--- incidence matrix results
in.mat
#@------------------------ IGRAPH ------------------------
#@---- Generating graph object -----
g <- graph.incidence(in.mat, weighted= TRUE)

#@ Network attributes V(g)$name # Check the vertex names V(g)$type # Check vertex types #@ plotting plot(g, vertex.label.color='black', vertex.label.dist= ((V(g)$type 4)-2)-1 , layout = layout_as_bipartite) plot(g, vertex.label.color='black') #---- Adding attributes #@ Shapes to nodes shapes = c(rep("square", length(V(g)$type[V(g)$type == "FALSE"])), rep("circle", length(V(g)$type[V(g)$type == "TRUE"]))) #@ plotting plot(g, vertex.shape=shapes, vertex.label.color='black') #------------------------ #@ Add colors to nodes V(g)$color <- c(rep("#046A38", length(V(g)$type[V(g)$type == "FALSE"])), rep("#FA4616", length(V(g)$type[V(g)$type == "TRUE"]))) #@ plotting plot(g, vertex.shape=shapes, vertex.label.color='black')

#@------------------------ BIPARTITE ------------------------
#library(bipartite)
#@ Plot bipartite network using bipartite package
#@ data in incidence matrix format
plotweb(sortweb(in.mat, sort.order="inc"), method="normal")

#@ Plot in matrix format
visweb(sortweb(in.mat, sort.order="dec"), type= "none", # change to nested or diagonal
labsize= 2, square= "interaction", text= "none", textsize= 4)

#@----- Calculating metrics ----
#@ Node metrics
node.metrics <- specieslevel(round(in.mat))
#@ Exploring metrics
str(node.metrics)
#@ How many levels are in the list?
#@ node.metrics$`higher level` # Want to know about the metrics? Call ?specieslevel
#@ Exploring $`higher level`
(h.nd <- node.metrics$`higher level`[1]) # node degree OR node.metrics$`higher level`$degree
(h.bc <- node.metrics$`higher level`[2]) # species strength
#@ Exploring $`lower level`
(l.nd <- node.metrics$`lower level`[1]) # node degree
(l.bc <- node.metrics$`lower level`[2]) # species strength
#@ Network metrics
network.metrics <- networklevel(round(in.mat))
#@ network.metrics # Want to know about the metrics? Call ?networklevel
#@ Exploring by metric
network.metrics["connectance"] # Connectance
network.metrics["weighted nestedness"] # Nestedness *weighted
#@ Computing modularity
computeModules(in.mat) # Default method: Becket
(modularity <- LPA_wb_plus(in.mat))
mod <- convert2moduleWeb(in.mat, modularity)
plotModuleWeb(mod, weighted = F)
#@----Plotting with attributes ----
#@ Selecting metadata from Virus Detect
meta <- df %>% select(Genus, host, Species, acronym) %>%
			unique()
str(meta)
#@ Adding colors
meta$colors <- ifelse( meta$Genus == "ilarvirus ", "#FA4616",
	ifelse( meta$Genus == 'polerovirus ', "yellow",
		ifelse( meta$Genus == "potexvirus", "brown",
			ifelse( meta$Genus == "nepovirus", "orange",
				ifelse( meta$Genus == "ipomovirus", "red",
					ifelse( meta$Genus == "badnavirus", "gray",
						ifelse( meta$Genus == "potyvirus", "green", "purple")))))))
#@ Combining node attributes from bipartite and igraph
V(g)$color <- c(rep("#046A38", length(V(g)$type[V(g)$type == "FALSE"])),
						meta$colors)
V(g)$xx <- c(unlist(l.nd), unlist(h.bc)+5) # adding node degree + species strength plus a constant
#@ Types of layout algorithms
#@ Default: Kamada-Kawai
plot(g, vertex.shape=shapes, vertex.label=NA, vertex.size= as.numeric(V(g)$xx))
#@ Davidson-Harel
plot(g, vertex.shape=shapes, vertex.size= as.numeric(V(g)$xx), layout=layout_with_dh(g) )
#@ Distributed Recursive
plot(g, vertex.shape=shapes, edge.width= log(E(g)$weight), vertex.size= as.numeric(V(g)$xx), layout=layout_with_drl(g) )