# code_networks_ML

library(phyloseq)
library(indicspecies)
library(igraph)
library(RColorBrewer)
library(ggraph)
library(qgraph)
library(vegan)
library(MCL)
library(metagMisc)
library(ggplot2)
library(dplyr)
library(magrittr)
library(scales)
library(grid)
library(reshape2)
library(randomForest)
library(caret)
library(ggiraphExtra)
library(ggiraph)

##### Import Data #####
# Read the BIOM feature table into a phyloseq object.
# The path has to be a quoted string: written bare, R parses
# `feature-table.biom` as the subtraction `feature - table.biom` and stops
# because neither object exists.
ps <- import_biom("feature-table.biom")

##### Indicator species analysis #####
# Filter the raw phyloseq object: keep taxa that pass the prevalence (0.20)
# AND the total-abundance (100) thresholds.
ps.filt <- phyloseq_filter_prevalence(
  ps,
  prev.trh = 0.20,
  abund.trh = 100,
  threshold_condition = "AND",
  abund.type = "total"
)

# Extract the OTU table of the filtered object as a plain matrix and transpose
# it into a data frame.
# NOTE(review): presumably this puts samples in rows, as multipatt() expects --
# confirm against the orientation of the imported table.
OTU1 <- as(otu_table(ps.filt), "matrix")
OTU1.trans <- as.data.frame(t(OTU1))

# Sample metadata; the first column of the file supplies the row names.
design_16s <- read.table(
  "./metadata_2years.txt",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE,
  na.strings = "NA"
)
# Grouping vector for the indicator analysis.
# NOTE(review): nothing here checks that the metadata rows are in the same
# order as the rows of OTU1.trans -- verify before trusting the result.
indic_soil_groups_16s <- design_16s$Treatment
length(unique(indic_soil_groups_16s))

# Calculate indicator species (point-biserial "r.g", 9999 permutations)
indicatorsp_soil_16s <- multipatt(
  OTU1.trans,
  indic_soil_groups_16s,
  func = "r.g",
  control = how(nperm = 9999)
)
summary(indicatorsp_soil_16s, alpha = 1, indvalcomp = TRUE)
# Per-OTU association table (group membership flags, stat, p.value)
indic_soil_df_16s <- indicatorsp_soil_16s$sign

# Rows of the indicator table that are significant at p < 0.05, split by the
# treatment flag they carry (an OTU flagged for several treatments appears in
# several subsets). which() drops rows whose test result is NA.
sig_16s <- indic_soil_df_16s$p.value < 0.05
MC_16s <- as.matrix(indic_soil_df_16s[which(indic_soil_df_16s$s.MC == 1 & sig_16s), ])
MN_16s <- as.matrix(indic_soil_df_16s[which(indic_soil_df_16s$s.MN == 1 & sig_16s), ])
MO_16s <- as.matrix(indic_soil_df_16s[which(indic_soil_df_16s$s.MO == 1 & sig_16s), ])
MT_16s <- as.matrix(indic_soil_df_16s[which(indic_soil_df_16s$s.MT == 1 & sig_16s), ])

# Stack the four subsets and give the first four (flag) columns short names.
soil_r_values_16s <- rbind(MC_16s, MN_16s, MO_16s, MT_16s)
colnames(soil_r_values_16s)[1:4] <- c("MC", "MN", "MO", "MT")

## Construct the edge table (treatment -> indicator OTU) for bulk soil
## bacteria communities from the indicator species data.
# Columns:
#   from  treatment label (MC, MN, MO or MT)
#   to    row name (OTU id) of an indicator flagged for that treatment
#   r     the "stat" value of that indicator row
# The three columns are built from the same per-treatment row indices, so a
# label can no longer drift away from its rows. Previously the `from` labels
# for MO and MT were swapped: MO indicators were labelled "MT" and vice versa.
soil_groups_16s <- c("MC", "MN", "MO", "MT")
soil_hits_16s <- lapply(
  soil_groups_16s,
  function(g) which(soil_r_values_16s[, g] == 1)
)
soil_hit_rows_16s <- unlist(soil_hits_16s, use.names = FALSE)

soil_bi_finaltable_16s <- data.frame(
  from = rep(soil_groups_16s, times = lengths(soil_hits_16s)),
  to = rownames(soil_r_values_16s)[soil_hit_rows_16s],
  r = soil_r_values_16s[soil_hit_rows_16s, "stat"]
)


# Build an undirected graph from the treatment -> OTU edge table, then drop
# duplicated edges and self-loops.
net_16s <- graph_from_data_frame(soil_bi_finaltable_16s, directed = FALSE)
net_16s <- simplify(net_16s, remove.multiple = TRUE, remove.loops = TRUE)
net_16s # get number of nodes and edges
# Console output recorded by the author (kept as a comment; as bare text it is
# a syntax error that stops the script):
# UN-- 442 794 --

#add taxonomy as attribute
# NOTE(review): `node_taxonomy` is not defined in the visible part of this
# script; it presumably holds one taxonomy label per vertex, in vertex order --
# confirm before running.
V(net_16s)$taxonomy <- as.character(node_taxonomy$taxonomy)

# Compute the layout for this graph here. The plot used `layout=l`, but `l` is
# only assigned further down (for other graphs), so a top-to-bottom run failed
# at this point. A Fruchterman-Reingold layout from qgraph is used, as for the
# module networks later in the script.
e <- as_edgelist(net_16s, names = FALSE)
l <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(net_16s))

# One colour per taxonomy label.
# NOTE(review): "Dark2" presumably offers at most 8 colours; with more labels
# some vertices would get no colour -- confirm the number of labels.
taxonomy_levels <- as.factor(vertex_attr(net_16s, "taxonomy"))
pal <- brewer.pal(length(unique(V(net_16s)$taxonomy)), "Dark2")
plot(
  net_16s,
  layout = l,
  vertex.color = pal[as.numeric(taxonomy_levels)],
  edge.arrow.size = 0.5,
  vertex.label = V(net_16s)$name,
  vertex.label.cex = 0.3,
  vertex.shape = "circle",
  vertex.size = 4,
  vertex.label.color = "black",
  edge.width = 0.5
)

##Farming system networks
# Re-import and re-filter the feature table, then export the filtered OTU
# table as text.
# The file name is quoted: bare, R would evaluate `feature - table.biom`.
ps <- import_biom("feature-table.biom")
ps.filt <- phyloseq_filter_prevalence(
  ps,
  prev.trh = 0.2,
  abund.trh = 100,
  threshold_condition = "AND",
  abund.type = "total"
)
# `ps.filt.otu` was written out without ever being created; it is taken here
# to be the OTU matrix of the filtered object (same extraction as used for the
# indicator analysis) -- confirm this is the table that was exported.
ps.filt.otu <- as(otu_table(ps.filt), "matrix")
write.table(ps.filt.otu, "./ps.filt.txt")

#Networks computed through Metagenomenets using Pearson correlation (p < 0.01) and a minimum of 30 reads, resulting in 540 features.
#Abundances normalized by total sum scaling and transformed to centered log-ratio (clr)

#Import MC_edgelist file from Metagenomenets
MC_edgelist <- read.table("./MC_edgelist", header = FALSE) # read file
MC_edgelist.matrix <- as.matrix(MC_edgelist)
MC_graph <- graph_from_edgelist(MC_edgelist.matrix, directed = FALSE)
MC_graph_degree <- degree(MC_graph, mode = "total") # calculate degree
# store each node's degree as a vertex attribute, matched by node name
V(MC_graph)$degree <- MC_graph_degree[match(V(MC_graph)$name, names(MC_graph_degree))]

#MC modules
# NOTE(review): no seed is set before cluster_louvain(), so module ids may
# presumably differ between runs -- confirm the id in ToHighlight.
MC_modules_louvain <- cluster_louvain(MC_graph)

#prepare attributes to plot highlight
ToHighlight <- c(1) # membership id to highlight
MC_n_modules <- length(unique(MC_modules_louvain$membership))
# Every node is drawn light gray; a highlighted module is shown only by its
# hull. The palette is built gray directly instead of colouring the module
# with rainbow() and then resetting it by matching the string "#FF0000FF":
# that match needs rainbow() to append an "FF" alpha suffix, which R >= 4.0
# does not do (it returns "#FF0000"), so the nodes stayed red there.
PCol <- rep("lightgray", MC_n_modules)
new_cols <- PCol[membership(MC_modules_louvain)]
# HCol controls the highlight (hull border and fill) per module; NA = no hull
HCol <- rep(NA, MC_n_modules)
HCol[ToHighlight] <- "orange"

#define layout
e <- as_edgelist(MC_graph, names = FALSE)
MC_n_nodes <- vcount(MC_graph)
l1 <- qgraph.layout.fruchtermanreingold(
  e,
  vcount = MC_n_nodes,
  area = 8 * (MC_n_nodes^2),
  repulse.rad = (MC_n_nodes^3.1)
)

#plot network
# Only nodes with degree >= 50 are labelled; all nodes have size 3.
plot(
  MC_modules_louvain,
  MC_graph,
  col = new_cols,
  mark.border = HCol,
  mark.col = HCol,
  layout = l1,
  vertex.label = ifelse(V(MC_graph)$degree >= 50, V(MC_graph)$name, NA),
  vertex.size = 3,
  edge.color = "grey70",
  vertex.label.cex = 0.5
)

#Import MT_edgelist file from Metagenomenets
MT_edgelist <- read.table("MT_edgelist", header = FALSE) # read file
MT_edgelist.matrix <- as.matrix(MT_edgelist)
MT_graph <- graph_from_edgelist(MT_edgelist.matrix, directed = FALSE)
MT_graph_degree <- degree(MT_graph, mode = "total") # calculate degree
# store each node's degree as a vertex attribute, matched by node name
V(MT_graph)$degree <- MT_graph_degree[match(V(MT_graph)$name, names(MT_graph_degree))]
# delete less relevant nodes (degree <= 5) to improve visualization; the
# `degree` attribute of the remaining nodes keeps the value computed above,
# i.e. the degree in the unfiltered graph
MT_graph <- delete_vertices(MT_graph, V(MT_graph)[degree(MT_graph) <= 5])

#MT modules
# NOTE(review): no seed is set before cluster_louvain(), so module ids may
# presumably differ between runs -- confirm the ids in ToHighlight.
MT_modules_louvain <- cluster_louvain(MT_graph)

#prepare attributes to plot highlight
ToHighlight <- c(1, 5, 6, 7) # membership id to highlight
MT_n_modules <- length(unique(MT_modules_louvain$membership))
# Every node is drawn light gray; highlighted modules are shown only by their
# hulls. The palette is built gray directly instead of colouring the modules
# with rainbow(4) and resetting them by matching hex strings: two of the
# patterns used ("#00FF00FF", "#0000FFFF") are not rainbow(4) colours at all,
# and the other two need an "FF" alpha suffix that R >= 4.0 does not append,
# so some or all highlighted nodes stayed coloured despite the
# "set back node color to gray" intent.
PCol <- rep("lightgray", MT_n_modules)
new_cols <- PCol[membership(MT_modules_louvain)]
# hull border/fill per module: translucent rainbow colours, NA = no hull
HCol <- rep(NA, MT_n_modules)
HCol[ToHighlight] <- rainbow(length(ToHighlight), alpha = 0.15)

#define layout
e <- as_edgelist(MT_graph, names = FALSE)
l <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(MT_graph))

# Only nodes with degree >= 50 are labelled; all nodes have size 3.
plot(
  MT_modules_louvain,
  MT_graph,
  col = new_cols,
  mark.border = HCol,
  mark.col = HCol,
  layout = l,
  vertex.label = ifelse(V(MT_graph)$degree >= 50, V(MT_graph)$name, NA),
  vertex.size = 3,
  edge.color = "grey70",
  vertex.label.cex = 0.5
)

#Import MN_edgelist file from Metagenomenets
MN_edgelist <- read.table("MN_edgelist", header = FALSE) # read file
MN_edgelist.matrix <- as.matrix(MN_edgelist)
MN_graph <- graph_from_edgelist(MN_edgelist.matrix, directed = FALSE)
MN_graph_degree <- degree(MN_graph, mode = "total") # calculate degree
# store each node's degree as a vertex attribute, matched by node name
V(MN_graph)$degree <- MN_graph_degree[match(V(MN_graph)$name, names(MN_graph_degree))]
# delete less relevant nodes (degree <= 5) to improve visualization; the
# `degree` attribute of the remaining nodes keeps the value computed above,
# i.e. the degree in the unfiltered graph
MN_graph <- delete_vertices(MN_graph, V(MN_graph)[degree(MN_graph) <= 5])

#MN modules
# NOTE(review): no seed is set before cluster_louvain(), so module ids may
# presumably differ between runs -- confirm the ids in ToHighlight.
MN_modules_louvain <- cluster_louvain(MN_graph)

#prepare attributes to plot highlight
ToHighlight <- c(1, 2, 5) # membership id to highlight
# The palette is sized from the MN modules. It was sized from
# MT_modules_louvain (copy-paste), which gives NA node colours whenever MN has
# more modules than MT.
MN_n_modules <- length(unique(MN_modules_louvain$membership))
# Every node is drawn light gray; highlighted modules are shown only by their
# hulls. The palette is built gray directly instead of colouring the modules
# with rainbow(3) and resetting them by matching "#RRGGBBFF" strings, which
# only works when rainbow() appends an "FF" alpha suffix (R >= 4.0 does not).
PCol <- rep("lightgray", MN_n_modules)
new_cols <- PCol[membership(MN_modules_louvain)]
# hull border/fill per module: translucent rainbow colours, NA = no hull
HCol <- rep(NA, MN_n_modules)
HCol[ToHighlight] <- rainbow(length(ToHighlight), alpha = 0.15)

#define layout
e <- as_edgelist(MN_graph, names = FALSE)
l <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(MN_graph))

# Only nodes with degree >= 50 are labelled; all nodes have size 3.
plot(
  MN_modules_louvain,
  MN_graph,
  col = new_cols,
  mark.border = HCol,
  mark.col = HCol,
  layout = l,
  vertex.label = ifelse(V(MN_graph)$degree >= 50, V(MN_graph)$name, NA),
  vertex.size = 3,
  edge.color = "grey70",
  vertex.label.cex = 0.5
)


#Import MO_edgelist file from Metagenomenets
MO_edgelist <- read.table("MO_edgelist", header = FALSE) # read file
MO_edgelist.matrix <- as.matrix(MO_edgelist)
MO_graph <- graph_from_edgelist(MO_edgelist.matrix, directed = FALSE)
MO_graph_degree <- degree(MO_graph, mode = "total") # calculate degree
# store each node's degree as a vertex attribute, matched by node name
V(MO_graph)$degree <- MO_graph_degree[match(V(MO_graph)$name, names(MO_graph_degree))]
# delete less relevant nodes (degree <= 5) to improve visualization; the
# `degree` attribute of the remaining nodes keeps the value computed above,
# i.e. the degree in the unfiltered graph
MO_graph <- delete_vertices(MO_graph, V(MO_graph)[degree(MO_graph) <= 5])

#MO modules
# NOTE(review): no seed is set before cluster_louvain(), so module ids may
# presumably differ between runs -- confirm the id in ToHighlight.
MO_modules_louvain <- cluster_louvain(MO_graph)

#prepare attributes to plot highlight
ToHighlight <- c(1) # membership id to highlight
MO_n_modules <- length(unique(MO_modules_louvain$membership))
# Every node is drawn light gray; the highlighted module is shown only by its
# hull. The palette is built gray directly instead of colouring the module
# with rainbow() and resetting it by matching "#FF0000FF", which only works
# when rainbow() appends an "FF" alpha suffix (R >= 4.0 does not).
PCol <- rep("lightgray", MO_n_modules)
new_cols <- PCol[membership(MO_modules_louvain)]
# hull border/fill per module: translucent rainbow colour, NA = no hull
HCol <- rep(NA, MO_n_modules)
HCol[ToHighlight] <- rainbow(length(ToHighlight), alpha = 0.15)

#define layout
e <- as_edgelist(MO_graph, names = FALSE)
l <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(MO_graph))

# Only nodes with degree >= 50 are labelled; all nodes have size 3.
plot(
  MO_modules_louvain,
  MO_graph,
  col = new_cols,
  mark.border = HCol,
  mark.col = HCol,
  layout = l,
  vertex.label = ifelse(V(MO_graph)$degree >= 50, V(MO_graph)$name, NA),
  vertex.size = 3,
  edge.color = "grey70",
  vertex.label.cex = 0.5
)

#Random forest models

library(ggplot2)
library(vegan)
library(dplyr)
library(magrittr)
library(scales)
library(grid)
library(reshape2)
library(randomForest)
library(caret)

# Fix the random seed used by the data partitions and model fits below.
set.seed(1234)

# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `farming` column.
FS_ALL <- read.table("all_ASV_table_FS")
FS_caret_train <- createDataPartition(FS_ALL$farming, p = 0.6, list = FALSE)
FS_caret_training <- FS_ALL[FS_caret_train, ]
FS_caret_validation <- FS_ALL[-FS_caret_train, ]

#42 samples were assigned to train; 27 to validation

# Train a random forest on all features of the training samples (the 60% split)
FS_rf_caret <- train(farming ~ ., data = FS_caret_training, method = "rf")
FS_feature_importance <- varImp(FS_rf_caret)

# Rank features by importance (first column, descending) and keep the names of
# the 30 best; backticks are stripped from the names.
FS_importance_df <- data.frame(
  FS_feature_importance$importance,
  feature = rownames(FS_feature_importance$importance)
)
FS_ranked <- FS_importance_df[order(FS_importance_df[, 1], decreasing = TRUE), ]
FS_top30 <- gsub("`", "", head(FS_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
FS_top30 <- FS_caret_training[, FS_top30]
FS_top30$farming <- FS_caret_training$farming

# Train a new model using only the best 30 ASVs
FS_rf_top30 <- train(farming ~ ., data = FS_top30, method = "rf")

# Predict the validation samples with the top-30 model
FS_top30_pred <- predict(FS_rf_top30, FS_caret_validation)

#define generic plotting function
# Draw a confusion matrix as a ggplot2 tile plot.
#
# Arguments:
#   Actual    vector of true class labels.
#   Predict   vector of predicted class labels, parallel to `Actual`.
#   colors    three colours: [1] midpoint (0 %), [2] low end (misclassified
#             cells), [3] high end (correctly classified cells).
#   text.scl  text size of the percentage labels.
#
# Each cell shows the share of its actual class (in %) that received the given
# prediction. Returns the ggplot object (visibly, so it prints when called at
# top level).
plot_confusion_matrix <- function(Actual, Predict, colors = c("white", "red4", "dodgerblue3"), text.scl = 5) {
  # cases per actual class: the denominator of the percentages
  actual <- as.data.frame(table(Actual))
  names(actual) <- c("Actual", "ActualFreq")

  # build confusion matrix in long form
  confusion <- as.data.frame(table(Actual, Predict))
  names(confusion) <- c("Actual", "Predicted", "Freq")

  # calculate percentage of test cases based on actual frequency
  confusion <- merge(confusion, actual, by = "Actual")
  confusion$Percent <- confusion$Freq / confusion$ActualFreq * 100

  # Signed fill value: positive on the diagonal, negative elsewhere.
  # Labels are compared as character because `==` on two factors stops with an
  # error when their level sets differ (e.g. a class missing from one side).
  is_correct <- as.character(confusion$Actual) == as.character(confusion$Predicted)
  confusion$ColorScale <- ifelse(is_correct, confusion$Percent, -confusion$Percent)
  confusion$Label <- paste(round(confusion$Percent, 0), "%")

  tile <- ggplot() +
    geom_tile(aes(x = Actual, y = Predicted, fill = ColorScale), data = confusion, color = "black", size = 0.1) +
    labs(x = "Actual", y = "Predicted") +
    geom_text(aes(x = Actual, y = Predicted, label = Label), data = confusion, size = text.scl, colour = "black") +
    scale_fill_gradient2(low = colors[2], high = colors[3], mid = colors[1], midpoint = 0, guide = "none")

  # The function used to end on an assignment, which returns invisibly.
  tile
}

# Confusion matrix of the farming-system top-30 model on the validation samples.
# print() is explicit so the plot is also drawn when the script is run through
# source(), where a parenthesised expression is not auto-printed.
Actual <- FS_caret_validation$farming
Predict <- FS_top30_pred
print(plot_confusion_matrix(Actual = Actual, Predict = Predict))


#repeat to predict treatyear (TY)
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `treatyear` column.
TY_ALL <- read.table("all_ASV_table_TY")
TY_caret_train <- createDataPartition(TY_ALL$treatyear, p = 0.6, list = FALSE)
TY_caret_training <- TY_ALL[TY_caret_train, ]
TY_caret_validation <- TY_ALL[-TY_caret_train, ]

#46 samples were assigned to train; 23 to validation

# Train a random forest on all features of the training samples (the 60% split)
TY_rf_caret <- train(treatyear ~ ., data = TY_caret_training, method = "rf")
TY_feature_importance <- varImp(TY_rf_caret)

# Rank features by importance (first column, descending) and keep the names of
# the 30 best; backticks are stripped from the names.
TY_importance_df <- data.frame(
  TY_feature_importance$importance,
  feature = rownames(TY_feature_importance$importance)
)
TY_ranked <- TY_importance_df[order(TY_importance_df[, 1], decreasing = TRUE), ]
TY_top30 <- gsub("`", "", head(TY_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
TY_top30 <- TY_caret_training[, TY_top30]
TY_top30$treatyear <- TY_caret_training$treatyear

# Train a new model using only the best 30 ASVs
TY_rf_top30 <- train(treatyear ~ ., data = TY_top30, method = "rf")

# Predict the validation samples with the top-30 model
TY_top30_pred <- predict(TY_rf_top30, TY_caret_validation)

# Confusion matrix on the validation samples. print() is explicit so the plot
# is also drawn when the script is run through source(), where a parenthesised
# expression is not auto-printed.
Actual <- TY_caret_validation$treatyear
Predict <- TY_top30_pred
print(plot_confusion_matrix(Actual = Actual, Predict = Predict))

#### Predict continuous variables
# Prediction on the validation set is only performed for models whose R2 is at
# least 50%.

#Inorganic N content

# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `Ninorg` column.
Ninorg_ALL <- read.table("all_ASV_table_Ninorg")
Ninorg_caret_train <- createDataPartition(Ninorg_ALL$Ninorg, p = 0.6, list = FALSE)
Ninorg_caret_training <- Ninorg_ALL[Ninorg_caret_train, ]
Ninorg_caret_validation <- Ninorg_ALL[-Ninorg_caret_train, ]

#44 samples were assigned to train; 25 to validation

# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
Ninorg_rf_caret <- train(
  Ninorg ~ .,
  data = Ninorg_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 3% only

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
Ninorg_feature_importance <- varImp(Ninorg_rf_caret)
Ninorg_importance_df <- data.frame(
  Ninorg_feature_importance$importance,
  feature = rownames(Ninorg_feature_importance$importance)
)
Ninorg_ranked <- Ninorg_importance_df[order(Ninorg_importance_df[, 1], decreasing = TRUE), ]
Ninorg_top30 <- gsub("`", "", head(Ninorg_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
Ninorg_top30 <- Ninorg_caret_training[, Ninorg_top30]
Ninorg_top30$Ninorg <- Ninorg_caret_training$Ninorg

Ninorg_rf_top30 <- train(
  Ninorg ~ .,
  data = Ninorg_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)

#Filtered top30 model explained 28.85% of Var of Ninorg values within the model
#Due to the low inner R2 no prediction was performed for Ninorg values

#Betaglucosidase
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `Beta` column.
Beta_ALL <- read.table("all_ASV_table_Beta")
Beta_caret_train <- createDataPartition(Beta_ALL$Beta, p = 0.6, list = FALSE)
Beta_caret_training <- Beta_ALL[Beta_caret_train, ]
Beta_caret_validation <- Beta_ALL[-Beta_caret_train, ]

#44 samples were assigned to train; 25 to validation

# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
Beta_rf_caret <- train(
  Beta ~ .,
  data = Beta_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 44.07%

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
Beta_feature_importance <- varImp(Beta_rf_caret)
Beta_importance_df <- data.frame(
  Beta_feature_importance$importance,
  feature = rownames(Beta_feature_importance$importance)
)
Beta_ranked <- Beta_importance_df[order(Beta_importance_df[, 1], decreasing = TRUE), ]
Beta_top30 <- gsub("`", "", head(Beta_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
Beta_top30 <- Beta_caret_training[, Beta_top30]
Beta_top30$Beta <- Beta_caret_training$Beta

Beta_rf_top30 <- train(
  Beta ~ .,
  data = Beta_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#Filtered top30 model explained 54.43% of Var of Beta values within the model; proceed to prediction

# Predict the validation samples and cross-tabulate actual vs. predicted.
# The actual values are read by column name rather than as "column 1", so the
# comparison stays correct wherever `Beta` sits in the table. The table is
# built once, shown, and written to file.
Beta_top30_pred <- predict(Beta_rf_top30, Beta_caret_validation)
predicted_Beta_final <- table(
  actual_Beta = Beta_caret_validation$Beta,
  predicted_Beta = Beta_top30_pred
)
print(predicted_Beta_final)
write.table(predicted_Beta_final, file = "Beta_predicted_actual")

#CNinorg
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `CNinorg` column.
CNinorg_ALL <- read.table("all_ASV_table_CNinorg")
CNinorg_caret_train <- createDataPartition(CNinorg_ALL$CNinorg, p = 0.6, list = FALSE)
CNinorg_caret_training <- CNinorg_ALL[CNinorg_caret_train, ]
CNinorg_caret_validation <- CNinorg_ALL[-CNinorg_caret_train, ]

#44 samples were assigned to train; 25 to validation

# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
CNinorg_rf_caret <- train(
  CNinorg ~ .,
  data = CNinorg_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 0%
#Due to the low inner R2 no prediction was performed for CNinorg values

#Ptot
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `Ptot` column.
Ptot_ALL <- read.table("all_ASV_table_Ptot")
Ptot_caret_train <- createDataPartition(Ptot_ALL$Ptot, p = 0.6, list = FALSE)
Ptot_caret_training <- Ptot_ALL[Ptot_caret_train, ]
Ptot_caret_validation <- Ptot_ALL[-Ptot_caret_train, ]

#43 samples were assigned to train; 26 to validation

# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
Ptot_rf_caret <- train(
  Ptot ~ .,
  data = Ptot_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 42.56%

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
Ptot_feature_importance <- varImp(Ptot_rf_caret)
Ptot_importance_df <- data.frame(
  Ptot_feature_importance$importance,
  feature = rownames(Ptot_feature_importance$importance)
)
Ptot_ranked <- Ptot_importance_df[order(Ptot_importance_df[, 1], decreasing = TRUE), ]
Ptot_top30 <- gsub("`", "", head(Ptot_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
Ptot_top30 <- Ptot_caret_training[, Ptot_top30]
Ptot_top30$Ptot <- Ptot_caret_training$Ptot

Ptot_rf_top30 <- train(
  Ptot ~ .,
  data = Ptot_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#Filtered top30 model explained 45.26% of Var of Ptot values within the model; Prediction not done

#OM
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `OM` column.
OM_ALL <- read.table("all_ASV_table_OM")
OM_caret_train <- createDataPartition(OM_ALL$OM, p = 0.6, list = FALSE)
OM_caret_training <- OM_ALL[OM_caret_train, ]
OM_caret_validation <- OM_ALL[-OM_caret_train, ]

#43 samples were assigned to train; 26 to validation
# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
OM_rf_caret <- train(
  OM ~ .,
  data = OM_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 0%
#Due to the low inner R2 no prediction was performed for OM values

#pH
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `pH` column.
pH_ALL <- read.table("all_ASV_table_pH")
pH_caret_train <- createDataPartition(pH_ALL$pH, p = 0.6, list = FALSE)
pH_caret_training <- pH_ALL[pH_caret_train, ]
pH_caret_validation <- pH_ALL[-pH_caret_train, ]

#43 samples were assigned to train; 26 to validation

# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
pH_rf_caret <- train(
  pH ~ .,
  data = pH_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 87.38%

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
pH_feature_importance <- varImp(pH_rf_caret)
pH_importance_df <- data.frame(
  pH_feature_importance$importance,
  feature = rownames(pH_feature_importance$importance)
)
pH_ranked <- pH_importance_df[order(pH_importance_df[, 1], decreasing = TRUE), ]
pH_top30 <- gsub("`", "", head(pH_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
pH_top30 <- pH_caret_training[, pH_top30]
pH_top30$pH <- pH_caret_training$pH

pH_rf_top30 <- train(
  pH ~ .,
  data = pH_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#Filtered top30 model explained 91.13% of Var of pH values within the model; proceed to prediction

# Predict the validation samples and regress actual on predicted values.
# The actual values are read by column name rather than as "column 1", so the
# comparison stays correct wherever `pH` sits in the table. The model is
# fitted once and then printed.
actual_pH <- pH_caret_validation$pH
pH_top30_pred <- predict(pH_rf_top30, pH_caret_validation)
fit_pH <- lm(formula = actual_pH ~ pH_top30_pred)
print(fit_pH)


#Ktotal
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `Ktot` column.
Ktot_ALL <- read.table("all_ASV_table_Ktot")
Ktot_caret_train <- createDataPartition(Ktot_ALL$Ktot, p = 0.6, list = FALSE)
Ktot_caret_training <- Ktot_ALL[Ktot_caret_train, ]
Ktot_caret_validation <- Ktot_ALL[-Ktot_caret_train, ]

#44 samples were assigned to train; 25 to validation

# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
Ktot_rf_caret <- train(
  Ktot ~ .,
  data = Ktot_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 57.49%

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
Ktot_feature_importance <- varImp(Ktot_rf_caret)
Ktot_importance_df <- data.frame(
  Ktot_feature_importance$importance,
  feature = rownames(Ktot_feature_importance$importance)
)
Ktot_ranked <- Ktot_importance_df[order(Ktot_importance_df[, 1], decreasing = TRUE), ]
Ktot_top30 <- gsub("`", "", head(Ktot_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
Ktot_top30 <- Ktot_caret_training[, Ktot_top30]
Ktot_top30$Ktot <- Ktot_caret_training$Ktot

Ktot_rf_top30 <- train(
  Ktot ~ .,
  data = Ktot_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#Filtered top30 model explained 68.92% of Var of Ktot values within the model; proceed to prediction

# Predict the validation samples and regress actual on predicted values.
# The actual values are read by column name rather than as "column 1", so the
# comparison stays correct wherever `Ktot` sits in the table. The model is
# fitted once and then printed.
actual_Ktot <- Ktot_caret_validation$Ktot
Ktot_top30_pred <- predict(Ktot_rf_top30, Ktot_caret_validation)
fit_Ktot <- lm(formula = actual_Ktot ~ Ktot_top30_pred)
print(fit_Ktot)

#Palk
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `Palk` column.
Palk_ALL <- read.table("all_ASV_table_Palk")
Palk_caret_train <- createDataPartition(Palk_ALL$Palk, p = 0.6, list = FALSE)
Palk_caret_training <- Palk_ALL[Palk_caret_train, ]
Palk_caret_validation <- Palk_ALL[-Palk_caret_train, ]

#44 samples were assigned to train; 25 to validation
# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
Palk_rf_caret <- train(
  Palk ~ .,
  data = Palk_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 67.28%

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
Palk_feature_importance <- varImp(Palk_rf_caret)
Palk_importance_df <- data.frame(
  Palk_feature_importance$importance,
  feature = rownames(Palk_feature_importance$importance)
)
Palk_ranked <- Palk_importance_df[order(Palk_importance_df[, 1], decreasing = TRUE), ]
Palk_top30 <- gsub("`", "", head(Palk_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
Palk_top30 <- Palk_caret_training[, Palk_top30]
Palk_top30$Palk <- Palk_caret_training$Palk

Palk_rf_top30 <- train(
  Palk ~ .,
  data = Palk_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#Filtered top30 model explained 73.54% of Var of Palk values within the model; proceed to prediction

# Predict the validation samples and regress actual on predicted values.
# The actual values are read by column name rather than as "column 1", so the
# comparison stays correct wherever `Palk` sits in the table. The model is
# fitted once and then printed.
actual_Palk <- Palk_caret_validation$Palk
Palk_top30_pred <- predict(Palk_rf_top30, Palk_caret_validation)
fit_Palk <- lm(formula = actual_Palk ~ Palk_top30_pred)
print(fit_Palk)

#Pac
# Read the full table and split its rows 0.6/0.4 into training and validation
# sets, based on the `Pac` column.
Pac_ALL <- read.table("all_ASV_table_Pac")
Pac_caret_train <- createDataPartition(Pac_ALL$Pac, p = 0.6, list = FALSE)
Pac_caret_training <- Pac_ALL[Pac_caret_train, ]
Pac_caret_validation <- Pac_ALL[-Pac_caret_train, ]

#44 samples were assigned to train; 25 to validation
# Train a random forest (10-fold cross-validation) on all features of the
# training samples (the 60% split)
Pac_rf_caret <- train(
  Pac ~ .,
  data = Pac_caret_training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#rf model with all ASVs had R2 of 56.03%

# Filter top 30 features: rank by importance (first column, descending) and
# keep the 30 best names, with backticks stripped.
Pac_feature_importance <- varImp(Pac_rf_caret)
Pac_importance_df <- data.frame(
  Pac_feature_importance$importance,
  feature = rownames(Pac_feature_importance$importance)
)
Pac_ranked <- Pac_importance_df[order(Pac_importance_df[, 1], decreasing = TRUE), ]
Pac_top30 <- gsub("`", "", head(Pac_ranked, n = 30)$feature)

# top30 data selection: the training rows restricted to those features, plus
# the response column
Pac_top30 <- Pac_caret_training[, Pac_top30]
Pac_top30$Pac <- Pac_caret_training$Pac

Pac_rf_top30 <- train(
  Pac ~ .,
  data = Pac_top30,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
#Filtered top30 model explained 66.78% of Var of Pac values within the model; proceed to prediction

# Predict the validation samples and regress actual on predicted values.
# The actual values are read by column name rather than as "column 1", so the
# comparison stays correct wherever `Pac` sits in the table. The model is
# fitted once and then printed.
actual_Pac <- Pac_caret_validation$Pac
Pac_top30_pred <- predict(Pac_rf_top30, Pac_caret_validation)
fit_Pac <- lm(formula = actual_Pac ~ Pac_top30_pred)
print(fit_Pac)


# Scatter plot of a fitted linear model with its regression line and the fit
# statistics in the title.
#
# Arguments:
#   fit  a fitted `lm` object; the first column of `fit$model` (the response)
#        goes on the y axis and the second column (the predictor) on the x axis.
#
# Returns the ggplot object. The title reports adjusted R2, intercept, slope
# and the p-value of the slope (row 2, column 4 of the coefficient table).
ggplotRegression <- function(fit) {
  # Fail loudly if ggplot2 is missing; require() would only return FALSE and
  # let the function break later on an unrelated-looking error.
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplotRegression() needs the 'ggplot2' package", call. = FALSE)
  }

  y_name <- names(fit$model)[1]
  x_name <- names(fit$model)[2]
  fit_summary <- summary(fit) # computed once, used for R2 and the p-value
  fit_coef <- coef(fit)

  # Columns are looked up by name through the .data pronoun instead of the
  # deprecated aes_string(); axis labels are set explicitly to the column names.
  ggplot2::ggplot(
    fit$model,
    ggplot2::aes(x = .data[[x_name]], y = .data[[y_name]])
  ) +
    ggplot2::geom_point() +
    ggplot2::stat_smooth(method = "lm", col = "black", se = FALSE) +
    ggplot2::labs(
      x = x_name,
      y = y_name,
      title = paste(
        "Adj R2 = ", signif(fit_summary$adj.r.squared, 5),
        "Intercept =", signif(fit_coef[[1]], 5),
        " Slope =", signif(fit_coef[[2]], 5),
        " P =", signif(fit_summary$coefficients[2, 4], 5)
      )
    )
}
# Draw the actual-vs-predicted regression plots. print() is explicit so the
# plots are also drawn when the script is run through source(), where
# top-level ggplot objects are not auto-printed.
print(ggplotRegression(fit_pH))
print(ggplotRegression(fit_Ktot))
print(ggplotRegression(fit_Palk))
print(ggplotRegression(fit_Pac))