## ---- eval = FALSE------------------------------------------------------------ # install.packages("BiocManager") # BiocManager::install("Rcpi") ## ---- eval = FALSE------------------------------------------------------------ # BiocManager::install("Rcpi", dependencies = c("Imports", "Enhances")) ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # # # Load FASTA files # extracell <- readFASTA(system.file( # "vignettedata/extracell.fasta", # package = "Rcpi" # )) # mitonchon <- readFASTA(system.file( # "vignettedata/mitochondrion.fasta", # package = "Rcpi" # )) ## ---- eval = FALSE------------------------------------------------------------ # length(extracell) ## ---- eval = FALSE------------------------------------------------------------ # length(mitonchon) ## ---- eval = FALSE------------------------------------------------------------ # extracell <- extracell[(sapply(extracell, checkProt))] # mitonchon <- mitonchon[(sapply(mitonchon, checkProt))] ## ---- eval = FALSE------------------------------------------------------------ # length(extracell) ## ---- eval = FALSE------------------------------------------------------------ # length(mitonchon) ## ---- eval = FALSE------------------------------------------------------------ # # Calculate APAAC descriptors # x1 <- t(sapply(extracell, extractProtAPAAC)) # x2 <- t(sapply(mitonchon, extractProtAPAAC)) # x <- rbind(x1, x2) # # # Make class labels # labels <- as.factor(c(rep(0, length(extracell)), rep(1, length(mitonchon)))) ## ---- eval = FALSE------------------------------------------------------------ # set.seed(1001) # # # Split training and test set # tr.idx <- c( # sample(1:nrow(x1), round(nrow(x1) * 0.75)), # sample(nrow(x1) + 1:nrow(x2), round(nrow(x2) * 0.75)) # ) # te.idx <- setdiff(1:nrow(x), tr.idx) # # x.tr <- x[tr.idx, ] # x.te <- x[te.idx, ] # y.tr <- labels[tr.idx] # y.te <- labels[te.idx] ## ---- eval = FALSE------------------------------------------------------------ # library("randomForest") # rf.fit <- randomForest(x.tr, y.tr, cv.fold = 5) # print(rf.fit) ## ---- eval = FALSE------------------------------------------------------------ # # Predict on test set # rf.pred <- predict(rf.fit, newdata = x.te, type = "prob")[, 1] # # # Plot ROC curve # library("pROC") # plot.roc(y.te, rf.pred, grid = TRUE, print.auc = TRUE) ## ---- echo=FALSE, out.width="65%", fig.cap="Figure 1: ROC curve for the test set of protein subcellular localization data."---- knitr::include_graphics("figures/ex1-1.png") ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # # RI.smi <- system.file( # "vignettedata/RI.smi", # package = "Rcpi" # ) # RI.csv <- system.file( # "vignettedata/RI.csv", # package = "Rcpi" # ) # # x.mol <- readMolFromSmi(RI.smi, type = "mol") # x.tab <- read.table(RI.csv, sep = "\t", header = TRUE) # y <- x.tab$RI ## ---- eval = FALSE------------------------------------------------------------ # # Calculate selected molecular descriptors # x <- suppressWarnings(cbind( # extractDrugALOGP(x.mol), # extractDrugApol(x.mol), # extractDrugECI(x.mol), # extractDrugTPSA(x.mol), # extractDrugWeight(x.mol), # extractDrugWienerNumbers(x.mol), # extractDrugZagrebIndex(x.mol) # )) ## ---- eval = FALSE------------------------------------------------------------ # # Run regression on training set # library("caret") # library("pls") # # # Cross-validation settings # ctrl <- trainControl( # method = "repeatedcv", number = 5, repeats = 10, # summaryFunction = defaultSummary # ) # # # Train a PLS model # set.seed(1002) # pls.fit <- train( # x, y, # method = "pls", tuneLength = 10, trControl = ctrl, # metric = "RMSE", preProc = c("center", "scale") # ) # # # Print cross-validation results # print(pls.fit) ## ---- eval = FALSE------------------------------------------------------------ # # Number of components vs. RMSE # print(plot(pls.fit, asp = 0.5)) ## ---- echo=FALSE, out.width="85%", fig.cap="Figure 2: Number of principal components vs. RMSE for the PLS regression model."---- knitr::include_graphics("figures/ex2-1.png") ## ---- eval = FALSE------------------------------------------------------------ # # Plot experimental RIs vs predicted RIs # plot(y, predict(pls.fit, x), # xlim = range(y), ylim = range(y), # xlab = "Experimental RIs", ylab = "Predicted RIs" # ) # abline(a = 0, b = 1) ## ---- echo=FALSE, out.width="65%", fig.cap="Figure 3: Experimental RIs vs. predicted RIs."---- knitr::include_graphics("figures/ex2-2.png") ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # # fdamdd.smi <- system.file("vignettedata/FDAMDD.smi", package = "Rcpi") # fdamdd.csv <- system.file("vignettedata/FDAMDD.csv", package = "Rcpi") # # x.mol <- readMolFromSmi(fdamdd.smi, type = "mol") # x.smi <- readMolFromSmi(fdamdd.smi, type = "text") # y <- as.factor(paste0("class", scan(fdamdd.csv))) ## ---- eval = FALSE------------------------------------------------------------ # # Calculate molecular fingerprints # x1 <- extractDrugEstateComplete(x.mol) # x2 <- extractDrugMACCSComplete(x.mol) # x3 <- extractDrugOBFP4(x.smi, type = "smile") ## ---- eval = FALSE------------------------------------------------------------ # library("caret") # # # Remove near-zero variance variables # x1 <- x1[, -nearZeroVar(x1)] # x2 <- x2[, -nearZeroVar(x2)] # x3 <- x3[, -nearZeroVar(x3)] # # # Split training and test set # set.seed(1003) # tr.idx <- sample(1:nrow(x1), round(nrow(x1) * 0.75)) # te.idx <- setdiff(1:nrow(x1), tr.idx) # x1.tr <- x1[tr.idx, ] # x1.te <- x1[te.idx, ] # x2.tr <- x2[tr.idx, ] # x2.te <- x2[te.idx, ] # x3.tr <- x3[tr.idx, ] # x3.te <- x3[te.idx, ] # y.tr <- y[tr.idx] # y.te <- y[te.idx] ## ---- eval = FALSE------------------------------------------------------------ # # SVM classification on training sets # library("kernlab") # # # Cross-validation settings # ctrl <- trainControl( # method = "cv", number = 5, repeats = 10, # classProbs = TRUE, # summaryFunction = twoClassSummary # ) # # # SVM with RBF kernel # svm.fit1 <- train( # x1.tr, y.tr, # method = "svmRadial", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale") # ) # svm.fit2 <- train( # x2.tr, y.tr, # method = "svmRadial", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale") # ) # svm.fit3 <- train( # x3.tr, y.tr, # method = "svmRadial", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale") # ) # # # Print cross-validation results # print(svm.fit1) # print(svm.fit2) # print(svm.fit3) ## ---- eval = FALSE------------------------------------------------------------ # # Predict on test set # svm.pred1 <- predict(svm.fit1, newdata = x1.te, type = "prob")[, 1] # svm.pred2 <- predict(svm.fit2, newdata = x2.te, type = "prob")[, 1] # svm.pred3 <- predict(svm.fit3, newdata = x3.te, type = "prob")[, 1] # # # Generate colors # library("RColorBrewer") # pal <- brewer.pal(3, "Set1") # # # ROC curves of different fingerprints # library("pROC") # plot(smooth(roc(y.te, svm.pred1)), col = pal[1], grid = TRUE) # plot(smooth(roc(y.te, svm.pred2)), col = pal[2], grid = TRUE, add = TRUE) # plot(smooth(roc(y.te, svm.pred3)), col = pal[3], grid = TRUE, add = TRUE) ## ---- echo=FALSE, out.width="65%", fig.cap="Figure 4: Smoothed ROC curves for different fingerprint types."---- knitr::include_graphics("figures/ex3-1.png") ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # mols <- readMolFromSDF(system.file( # "compseq/tyrphostin.sdf", # package = "Rcpi" # )) ## ---- eval = FALSE------------------------------------------------------------ # simmat <- diag(length(mols)) # # for (i in 1:length(mols)) { # for (j in i:length(mols)) { # fp1 <- extractDrugEstate(mols[[i]]) # fp2 <- extractDrugEstate(mols[[j]]) # tmp <- calcDrugFPSim(fp1, fp2, fptype = "compact", metric = "tanimoto") # simmat[i, j] <- tmp # simmat[j, i] <- tmp # } # } ## ---- eval = FALSE------------------------------------------------------------ # mol.hc <- hclust(as.dist(1 - simmat), method = "ward.D") # # library("ape") # Tree visualization of clusters # clus5 <- cutree(mol.hc, 5) # Cut dendrogram into 5 clusters # # # Generate colors # library("RColorBrewer") # pal5 <- brewer.pal(5, "Set1") # plot(as.phylo(mol.hc), # type = "fan", # tip.color = pal5[clus5], # label.offset = 0.1, cex = 0.7 # ) ## ---- echo=FALSE, out.width="65%", fig.cap="Figure 5: Tree visualization of the molecular clustering result."---- knitr::include_graphics("figures/ex4-1.png") ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # # mol <- system.file("compseq/DB00530.sdf", package = "Rcpi") # moldb <- system.file("compseq/tyrphostin.sdf", package = "Rcpi") ## ---- eval = FALSE------------------------------------------------------------ # rank1 <- searchDrug( # mol, moldb, # cores = 4, method = "fp", # fptype = "maccs", fpsim = "tanimoto" # ) # rank2 <- searchDrug( # mol, moldb, # cores = 4, method = "fp", # fptype = "fp2", fpsim = "cosine" # ) # rank3 <- searchDrug( # mol, moldb, # cores = 4, method = "mcs", # mcssim = "tanimoto" # ) ## ---- eval = FALSE------------------------------------------------------------ # head(rank1) ## ---- eval = FALSE------------------------------------------------------------ # head(rank2) ## ---- eval = FALSE------------------------------------------------------------ # head(rank3) ## ---- eval = FALSE------------------------------------------------------------ # # Convert SDF format to SMILES format # convMolFormat( # infile = mol, outfile = "DB00530.smi", from = "sdf", to = "smiles" # ) # convMolFormat( # infile = moldb, outfile = "tyrphostin.smi", from = "sdf", to = "smiles" # ) # # smi1 <- readLines("DB00530.smi") # smi2 <- readLines("tyrphostin.smi")[92] # Select the #92 molecule # calcDrugMCSSim(smi1, smi2, type = "smile", plot = TRUE) ## ---- echo=FALSE, out.width="50%", fig.cap="Figure 6: Maximum common structure of the query molecule and the #92 molecule in the chemical database."---- knitr::include_graphics("figures/ex5-1.png") ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # # gpcr <- read.table(system.file( # "vignettedata/GPCR.csv", # package = "Rcpi" # ), # header = FALSE, as.is = TRUE # ) ## ---- eval = FALSE------------------------------------------------------------ # head(gpcr) ## ---- eval = FALSE------------------------------------------------------------ # library("igraph") # library("reshape") # # remotes::install_github("gastonstat/arcdiagram") # library("arcdiagram") # # g <- graph.data.frame(gpcr[1:(nrow(gpcr) / 2), ], directed = FALSE) # edgelist <- get.edgelist(g) # vlabels <- V(g)$name # vgroups <- c(rep(0, 95), rep(1, 223)) # vfill <- c(rep("#8B91D4", 95), rep("#B2C771", 223)) # vborders <- c(rep("#6F74A9", 95), rep("#8E9F5A", 223)) # degrees <- degree(g) # # xx <- data.frame(vgroups, degrees, vlabels, ind = 1:vcount(g)) # yy <- arrange(xx, desc(vgroups), desc(degrees)) # new.ord <- yy$ind # # arcplot( # edgelist, # ordering = new.ord, labels = vlabels, # cex.labels = 0.1, show.nodes = TRUE, # col.nodes = vborders, bg.nodes = vfill, # cex.nodes = log10(degrees) + 0.1, # pch.nodes = 21, line = -0.5, col.arcs = hsv(0, 0, 0.2, 0.25) # ) ## ---- echo=FALSE, out.width="85%", fig.cap="Figure 7: Arc diagram visualization of the GPCR drug-target interaction network."---- knitr::include_graphics("figures/ex6-1.png") ## ---- eval = FALSE------------------------------------------------------------ # library("Rcpi") # # gpcr <- read.table(system.file( # "vignettedata/GPCR.csv", # package = "Rcpi" # ), # header = FALSE, as.is = TRUE # ) # # protid <- unique(gpcr[, 1]) # drugid <- unique(gpcr[, 2]) # # protseq <- getSeqFromKEGG(protid, parallel = 5) # drugseq <- getSmiFromKEGG(drugid, parallel = 50) ## ---- eval = FALSE------------------------------------------------------------ # x0.prot <- cbind( # t(sapply(unlist(protseq), extractProtAPAAC)), # t(sapply(unlist(protseq), extractProtCTriad)) # ) # # x0.drug <- cbind( # extractDrugEstateComplete(readMolFromSmi(textConnection(drugseq))), # extractDrugMACCSComplete(readMolFromSmi(textConnection(drugseq))), # extractDrugOBFP4(drugseq, type = "smile") # ) ## ---- eval = FALSE------------------------------------------------------------ # # Generate drug x / protein x / y # x.prot <- matrix(NA, nrow = nrow(gpcr), ncol = ncol(x0.prot)) # x.drug <- matrix(NA, nrow = nrow(gpcr), ncol = ncol(x0.drug)) # for (i in 1:nrow(gpcr)) x.prot[i, ] <- x0.prot[which(gpcr[, 1][i] == protid), ] # for (i in 1:nrow(gpcr)) x.drug[i, ] <- x0.drug[which(gpcr[, 2][i] == drugid), ] # # y <- as.factor(c(rep("pos", nrow(gpcr) / 2), rep("neg", nrow(gpcr) / 2))) ## ---- eval = FALSE------------------------------------------------------------ # x <- getCPI(x.prot, x.drug, type = "combine") ## ---- eval = FALSE------------------------------------------------------------ # library("caret") # x <- x[, -nearZeroVar(x)] # # # Cross-validation settings # ctrl <- trainControl( # method = "cv", number = 5, repeats = 10, # classProbs = TRUE, # summaryFunction = twoClassSummary # ) # # # Train a random forest classifier # library("randomForest") # # set.seed(1006) # rf.fit <- train( # x, y, # method = "rf", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale") # ) ## ---- eval = FALSE------------------------------------------------------------ # rf.pred <- predict(rf.fit$finalModel, x, type = "prob")[, 1] # # library("pROC") # plot(smooth(roc(y, rf.pred)), grid = TRUE, print.auc = TRUE) ## ---- echo=FALSE, out.width="65%", fig.cap="Figure 8: ROC curve for predicting on the training set of the GPCR drug-target interaction dataset using random forest."---- knitr::include_graphics("figures/ex6-2.png")