## ---- eval = FALSE------------------------------------------------------- # source("https://bioconductor.org/biocLite.R") # biocLite("Rcpi") ## ---- eval = FALSE------------------------------------------------------- # source("https://bioconductor.org/biocLite.R") # biocLite("Rcpi", dependencies = c("Imports", "Enhances")) ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # # # load FASTA files # extracell = readFASTA(system.file( # "vignettedata/extracell.fasta", package = "Rcpi")) # mitonchon = readFASTA(system.file( # "vignettedata/mitochondrion.fasta", package = "Rcpi")) ## ---- eval = FALSE------------------------------------------------------- # length(extracell) ## ---- eval = FALSE------------------------------------------------------- # ## [1] 325 ## ---- eval = FALSE------------------------------------------------------- # length(mitonchon) ## ---- eval = FALSE------------------------------------------------------- # ## [1] 306 ## ---- eval = FALSE------------------------------------------------------- # extracell = extracell[(sapply(extracell, checkProt))] # mitonchon = mitonchon[(sapply(mitonchon, checkProt))] ## ---- eval = FALSE------------------------------------------------------- # length(extracell) ## ---- eval = FALSE------------------------------------------------------- # ## [1] 323 ## ---- eval = FALSE------------------------------------------------------- # length(mitonchon) ## ---- eval = FALSE------------------------------------------------------- # ## [1] 304 ## ---- eval = FALSE------------------------------------------------------- # # calculate APAAC descriptors # x1 = t(sapply(extracell, extractProtAPAAC)) # x2 = t(sapply(mitonchon, extractProtAPAAC)) # x = rbind(x1, x2) # # # make class labels # labels = as.factor(c(rep(0, length(extracell)), rep(1, length(mitonchon)))) ## ---- eval = FALSE------------------------------------------------------- # set.seed(1001) # # # split training and test set # tr.idx = c( # sample(1:nrow(x1), round(nrow(x1) * 0.75)), # sample(nrow(x1) + 1:nrow(x2), round(nrow(x2) * 0.75))) # te.idx = setdiff(1:nrow(x), tr.idx) # # x.tr = x[tr.idx, ] # x.te = x[te.idx, ] # y.tr = labels[tr.idx] # y.te = labels[te.idx] ## ---- eval = FALSE------------------------------------------------------- # library("randomForest") # rf.fit = randomForest(x.tr, y.tr, cv.fold = 5) # print(rf.fit) ## ---- eval = FALSE------------------------------------------------------- # ## Call: # ## randomForest(x = x.tr, y = y.tr, cv.fold = 5) # ## Type of random forest: classification # ## Number of trees: 500 # ## No. of variables tried at each split: 8 # ## # ## OOB estimate of error rate: 25.11% # ## Confusion matrix: # ## 0 1 class.error # ## 0 196 46 0.1900826 # ## 1 72 156 0.3157895 ## ---- eval = FALSE------------------------------------------------------- # # predict on test set # rf.pred = predict(rf.fit, newdata = x.te, type = "prob")[, 1] # # # plot ROC curve # library("pROC") # plot.roc(y.te, rf.pred, grid = TRUE, print.auc = TRUE) ## ---- eval = FALSE------------------------------------------------------- # ## Call: # ## plot.roc.default(x = y.te, predictor = rf.pred, col = "#0080ff", # ## grid = TRUE, print.auc = TRUE) # ## # ## Data: rf.pred in 81 controls (y.te 0) > 76 cases (y.te 1). # ## Area under the curve: 0.8697 ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # # RI.smi = system.file( # "vignettedata/FDAMDD.smi", package = "Rcpi") # RI.csv = system.file( # "vignettedata/RI.csv", package = "Rcpi") # # x.mol = readMolFromSmi(RI.smi, type = "mol") # x.tab = read.table(RI.csv, sep = "\t", header = TRUE) # y = x.tab$RI ## ---- eval = FALSE------------------------------------------------------- # # calculate selected molecular descriptors # x = suppressWarnings(cbind( # extractDrugALOGP(x.mol), # extractDrugApol(x.mol), # extractDrugECI(x.mol), # extractDrugTPSA(x.mol), # extractDrugWeight(x.mol), # extractDrugWienerNumbers(x.mol), # extractDrugZagrebIndex(x.mol))) ## ---- eval = FALSE------------------------------------------------------- # # regression on training set # library("caret") # library("pls") # # # cross-validation settings # ctrl = trainControl( # method = "repeatedcv", number = 5, repeats = 10, # summaryFunction = defaultSummary) # # # train a pls model # set.seed(1002) # pls.fit = train( # x, y, method = "pls", tuneLength = 10, trControl = ctrl, # metric = "RMSE", preProc = c("center", "scale")) # # # print cross-validation result # print(pls.fit) ## ---- eval = FALSE------------------------------------------------------- # ## Partial Least Squares # ## # ## 297 samples # ## 10 predictors # ## # ## Pre-processing: centered, scaled # ## Resampling: Cross-Validated (5 fold, repeated 10 times) # ## # ## Summary of sample sizes: 237, 237, 237, 238, 239, 238, ... # ## # ## Resampling results across tuning parameters: # ## # ## ncomp RMSE Rsquared RMSE SD Rsquared SD # ## 1 104 0.884 9.44 0.0285 # ## 2 86.4 0.92 6.99 0.0194 # ## 3 83.8 0.924 6.56 0.0185 # ## 4 79.6 0.931 6.98 0.0194 # ## 5 76.3 0.937 7.45 0.0187 # ## 6 74.7 0.94 6.85 0.0162 # ## 7 73.7 0.941 6.75 0.0159 # ## 8 73.5 0.942 6.5 0.0142 # ## 9 72.5 0.944 6.18 0.0137 # ## # ## RMSE was used to select the optimal model using the smallest value. # ## The final value used for the model was ncomp = 9. ## ---- eval = FALSE------------------------------------------------------- # # # of components vs RMSE # print(plot(pls.fit, asp = 0.5)) ## ---- eval = FALSE------------------------------------------------------- # # plot experimental RIs vs predicted RIs # plot(y, predict(pls.fit, x), xlim = range(y), ylim = range(y), # xlab = "Experimental RIs", ylab = "Predicted RIs") # abline(a = 0, b = 1) ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # # fdamdd.smi = system.file("vignettedata/FDAMDD.smi", package = "Rcpi") # fdamdd.csv = system.file("vignettedata/FDAMDD.csv", package = "Rcpi") # # x.mol = readMolFromSmi(fdamdd.smi, type = "mol") # x.smi = readMolFromSmi(fdamdd.smi, type = "text") # y = as.factor(paste0("class", scan(fdamdd.csv))) ## ---- eval = FALSE------------------------------------------------------- # # calculate molecular fingerprints # x1 = extractDrugEstateComplete(x.mol) # x2 = extractDrugMACCSComplete(x.mol) # x3 = extractDrugOBFP4(x.smi, type = "smile") ## ---- eval = FALSE------------------------------------------------------- # library("caret") # # # remove near-zero variance variables # x1 = x1[, -nearZeroVar(x1)] # x2 = x2[, -nearZeroVar(x2)] # x3 = x3[, -nearZeroVar(x3)] # # # split training and test set # set.seed(1003) # tr.idx = sample(1:nrow(x1), round(nrow(x1) * 0.75)) # te.idx = setdiff(1:nrow(x1), tr.idx) # x1.tr = x1[tr.idx, ] # x1.te = x1[te.idx, ] # x2.tr = x2[tr.idx, ] # x2.te = x2[te.idx, ] # x3.tr = x3[tr.idx, ] # x3.te = x3[te.idx, ] # y.tr = y[tr.idx] # y.te = y[te.idx] ## ---- eval = FALSE------------------------------------------------------- # # svm classification on training sets # library("kernlab") # # # cross-validation settings # ctrl = trainControl(method = "cv", number = 5, repeats = 10, # classProbs = TRUE, # summaryFunction = twoClassSummary) # # # SVM with RBF kernel # svm.fit1 = train( # x1.tr, y.tr, method = "svmRadial", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale")) # svm.fit2 = train( # x2.tr, y.tr, method = "svmRadial", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale")) # svm.fit3 = train( # x3.tr, y.tr, method = "svmRadial", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale")) # # # print cross-validation result # print(svm.fit1) # print(svm.fit2) # print(svm.fit3) ## ---- eval = FALSE------------------------------------------------------- # ## Support Vector Machines with Radial Basis Function Kernel # ## # ## 597 samples # ## 23 predictors # ## 2 classes: "class0", "class1" # ## # ## Pre-processing: centered, scaled # ## Resampling: Cross-Validated (5 fold) # ## # ## Summary of sample sizes: 478, 479, 477, 477, 477 # ## # ## Resampling results across tuning parameters: # ## # ## C ROC Sens Spec ROC SD Sens SD Spec SD # ## 0.25 0.797 0.7 0.765 0.0211 0.0442 0.00666 # ## 0.5 0.808 0.696 0.79 0.0173 0.059 0.0236 # ## 1 0.812 0.703 0.781 0.0191 0.0664 0.0228 # ## # ## Tuning parameter "sigma" was held constant at a value of 0.02921559 # ## ROC was used to select the optimal model using the largest value. # ## The final values used for the model were sigma = 0.0292 and C = 1. ## ---- eval = FALSE------------------------------------------------------- # ## Support Vector Machines with Radial Basis Function Kernel # ## # ## 597 samples # ## 126 predictors # ## 2 classes: "class0", "class1" # ## # ## Pre-processing: centered, scaled # ## Resampling: Cross-Validated (5 fold) # ## # ## Summary of sample sizes: 477, 477, 477, 478, 479 # ## # ## Resampling results across tuning parameters: # ## # ## C ROC Sens Spec ROC SD Sens SD Spec SD # ## 0.25 0.834 0.715 0.775 0.0284 0.0994 0.0589 # ## 0.5 0.848 0.726 0.79 0.0299 0.065 0.0493 # ## 1 0.863 0.769 0.793 0.0307 0.0229 0.0561 # ## # ## Tuning parameter "sigma" was held constant at a value of 0.004404305 # ## ROC was used to select the optimal model using the largest value. # ## The final values used for the model were sigma = 0.0044 and C = 1. ## ---- eval = FALSE------------------------------------------------------- # ## Support Vector Machines with Radial Basis Function Kernel # ## # ## 597 samples # ## 58 predictors # ## 2 classes: "class0", "class1" # ## # ## Pre-processing: centered, scaled # ## Resampling: Cross-Validated (5 fold) # ## # ## Summary of sample sizes: 478, 478, 477, 478, 477 # ## # ## Resampling results across tuning parameters: # ## # ## C ROC Sens Spec ROC SD Sens SD Spec SD # ## 0.25 0.845 0.769 0.746 0.0498 0.0458 0.0877 # ## 0.5 0.856 0.744 0.777 0.0449 0.0148 0.0728 # ## 1 0.863 0.751 0.777 0.0428 0.036 0.0651 # ## # ## Tuning parameter "sigma" was held constant at a value of 0.01077024 # ## ROC was used to select the optimal model using the largest value. # ## The final values used for the model were sigma = 0.0108 and C = 1. ## ---- eval = FALSE------------------------------------------------------- # # predict on test set # svm.pred1 = predict(svm.fit1, newdata = x1.te, type = "prob")[, 1] # svm.pred2 = predict(svm.fit2, newdata = x2.te, type = "prob")[, 1] # svm.pred3 = predict(svm.fit3, newdata = x3.te, type = "prob")[, 1] # # # generate colors # library("RColorBrewer") # pal = brewer.pal(3, "Set1") # # # ROC curves of different fingerprints # library("pROC") # plot(smooth(roc(y.te, svm.pred1)), col = pal[1], grid = TRUE) # plot(smooth(roc(y.te, svm.pred2)), col = pal[2], grid = TRUE, add = TRUE) # plot(smooth(roc(y.te, svm.pred3)), col = pal[3], grid = TRUE, add = TRUE) ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # mols = readMolFromSDF(system.file( # "compseq/tyrphostin.sdf", package = "Rcpi")) ## ---- eval = FALSE------------------------------------------------------- # simmat = diag(length(mols)) # # for (i in 1:length(mols)) { # for (j in i:length(mols)) { # fp1 = extractDrugEstate(mols[[i]]) # fp2 = extractDrugEstate(mols[[j]]) # tmp = calcDrugFPSim(fp1, fp2, fptype = "compact", metric = "tanimoto") # simmat[i, j] = tmp # simmat[j, i] = tmp # } # } ## ---- eval = FALSE------------------------------------------------------- # mol.hc = hclust(as.dist(1 - simmat), method = "ward") # # library("ape") # tree visualization of clusters # clus5 = cutree(mol.hc, 5) # cut dendrogram into 5 clusters # # # generate colors # library("RColorBrewer") # pal5 = brewer.pal(5, "Set1") # plot(as.phylo(mol.hc), type = "fan", # tip.color = pal5[clus5], # label.offset = 0.1, cex = 0.7) ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # # mol = system.file("compseq/DB00530.sdf", package = "Rcpi") # moldb = system.file("compseq/tyrphostin.sdf", package = "Rcpi") ## ---- eval = FALSE------------------------------------------------------- # rank1 = searchDrug( # mol, moldb, cores = 4, method = "fp", # fptype = "maccs", fpsim = "tanimoto") # rank2 = searchDrug( # mol, moldb, cores = 4, method = "fp", # fptype = "fp2", fpsim = "cosine") # rank3 = searchDrug( # mol, moldb, cores = 4, method = "mcs", # mcssim = "tanimoto") ## ---- eval = FALSE------------------------------------------------------- # head(rank1) # ## 92 100 83 101 1 36 # ## 0.6491228 0.6491228 0.5882353 0.5660377 0.5000000 0.4861111 # # head(rank2) # ## 100 92 83 101 94 16 # ## 0.8310005 0.8208663 0.5405856 0.5033150 0.4390790 0.4274081 # # head(rank3) # ## 92 100 23 39 94 64 # ## 0.7000000 0.7000000 0.4000000 0.4000000 0.4000000 0.3783784 ## ---- eval = FALSE------------------------------------------------------- # # convert SDF format to SMILES format # convMolFormat( # infile = mol, outfile = "DB00530.smi", from = "sdf", to = "smiles") # convMolFormat( # infile = moldb, outfile = "tyrphostin.smi", from = "sdf", to = "smiles") # # smi1 = readLines("DB00530.smi") # smi2 = readLines("tyrphostin.smi")[92] # select the #92 molecule # calcDrugMCSSim(smi1, smi2, type = "smile", plot = TRUE) ## ---- eval = FALSE------------------------------------------------------- # ## [[1]] # ## An instance of "MCS" # ## Number of MCSs: 1 # ## 530: 29 atoms # ## 4705: 22 atoms # ## MCS: 18 atoms # ## Tanimoto Coefficient: 0.54545 # ## Overlap Coefficient: 0.81818 # ## # ## [[2]] # ## Tanimoto_Coefficient # ## 0.5454545 # ## # ## [[3]] # ## Overlap_Coefficient # ## 0.8181818 ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # # gpcr = read.table(system.file( # "vignettedata/GPCR.csv", package = "Rcpi"), # header = FALSE, as.is = TRUE) ## ---- eval = FALSE------------------------------------------------------- # head(gpcr) ## ---- eval = FALSE------------------------------------------------------- # ## V1 V2 # ## 1 hsa:10161 D00528 # ## 2 hsa:10800 D00411 # ## 3 hsa:10800 D01828 # ## 4 hsa:10800 D05129 # ## 5 hsa:11255 D00234 # ## 6 hsa:11255 D00300 ## ---- eval = FALSE------------------------------------------------------- # library("igraph") # library("arcdiagram") # library("reshape") # # g = graph.data.frame(gpcr[1:(nrow(gpcr)/2), ], directed = FALSE) # edgelist = get.edgelist(g) # vlabels = V(g)$name # vgroups = c(rep(0, 95), rep(1, 223)) # vfill = c(rep("#8B91D4", 95), rep("#B2C771", 223)) # vborders = c(rep("#6F74A9", 95), rep("#8E9F5A", 223)) # degrees = degree(g) # # xx = data.frame(vgroups, degrees, vlabels, ind = 1:vcount(g)) # yy = arrange(xx, desc(vgroups), desc(degrees)) # new.ord = yy$ind # # arcplot( # edgelist, ordering = new.ord, labels = vlabels, # cex.labels = 0.1, show.nodes = TRUE, # col.nodes = vborders, bg.nodes = vfill, # cex.nodes = log10(degrees) + 0.1, # pch.nodes = 21, line = -0.5, col.arcs = hsv(0, 0, 0.2, 0.25)) ## ---- eval = FALSE------------------------------------------------------- # library("Rcpi") # # gpcr = read.table(system.file( # "vignettedata/GPCR.csv", package = "Rcpi"), # header = FALSE, as.is = TRUE) # # protid = unique(gpcr[, 1]) # drugid = unique(gpcr[, 2]) # # protseq = getSeqFromKEGG(protid, parallel = 5) # drugseq = getSmiFromKEGG(drugid, parallel = 50) ## ---- eval = FALSE------------------------------------------------------- # x0.prot = cbind( # t(sapply(unlist(protseq), extractProtAPAAC)), # t(sapply(unlist(protseq), extractProtCTriad))) # # x0.drug = cbind( # extractDrugEstateComplete(readMolFromSmi(textConnection(drugseq))), # extractDrugMACCSComplete(readMolFromSmi(textConnection(drugseq))), # extractDrugOBFP4(drugseq, type = "smile")) ## ---- eval = FALSE------------------------------------------------------- # # generate drug x / protein x / y # x.prot = matrix(NA, nrow = nrow(gpcr), ncol = ncol(x0.prot)) # x.drug = matrix(NA, nrow = nrow(gpcr), ncol = ncol(x0.drug)) # for (i in 1:nrow(gpcr)) x.prot[i, ] = x0.prot[which(gpcr[, 1][i] == protid), ] # for (i in 1:nrow(gpcr)) x.drug[i, ] = x0.drug[which(gpcr[, 2][i] == drugid), ] # # y = as.factor(c(rep("pos", nrow(gpcr)/2), rep("neg", nrow(gpcr)/2))) ## ---- eval = FALSE------------------------------------------------------- # x = getCPI(x.prot, x.drug, type = "combine") ## ---- eval = FALSE------------------------------------------------------- # library("caret") # x = x[, -nearZeroVar(x)] # # # cross-validation settings # ctrl = trainControl( # method = "cv", number = 5, repeats = 10, # classProbs = TRUE, # summaryFunction = twoClassSummary) # # # train a random forest classifier # library("randomForest") # # set.seed(1006) # rf.fit = train( # x, y, method = "rf", trControl = ctrl, # metric = "ROC", preProc = c("center", "scale")) # \end{CodeInput} # # Print the cross-validation result: # # \begin{CodeInput} # print(rf.fit) ## ---- eval = FALSE------------------------------------------------------- # ## Random Forest # ## # ## 1270 samples # ## 562 predictors # ## 2 classes: "neg", "pos" # ## # ## Pre-processing: centered, scaled # ## Resampling: Cross-Validated (5 fold) # ## # ## Summary of sample sizes: 1016, 1016, 1016, 1016, 1016 # ## # ## Resampling results across tuning parameters: # ## # ## mtry ROC Sens Spec ROC SD Sens SD Spec SD # ## 2 0.83 0.726 0.778 0.0221 0.044 0.0395 # ## 33 0.882 0.795 0.82 0.018 0.0522 0.0443 # ## 562 0.893 0.822 0.844 0.0161 0.0437 0.0286 # ## # ## ROC was used to select the optimal model using the largest value. # ## The final value used for the model was mtry = 562. ## ---- eval = FALSE------------------------------------------------------- # rf.pred = predict(rf.fit$finalModel, x, type = "prob")[, 1] # # library("pROC") # plot(smooth(roc(y, rf.pred)), grid = TRUE, print.auc = TRUE)