Hi,
I’d like to generate an AUC for a validation set to assess model performance and support the predicted error rate after using the predict function. I created the below method for use with large two-factor classification independent training and validation datasets and I want to confirm if it’s appropriate to use? I simplified the case study into a two-factor classification problem as an example of the method.
Thanks for the help.
Tamara
#mixOmics Test Case
library(mixOmics) # import the mixOmics library
#> Loading required package: MASS
#> Loading required package: lattice
#> Loading required package: ggplot2
#>
#> Loaded mixOmics 6.20.0
#> Thank you for using mixOmics!
#> Tutorials: http://mixomics.org
#> Bookdown vignette: https://mixomicsteam.github.io/Bookdown
#> Questions, issues: Follow the prompts at http://mixomics.org/contact-us
#> Cite us: citation('mixOmics')
library(pROC)
#> Type 'citation("pROC")' for a citation.
#>
#> Attaching package: 'pROC'
#> The following objects are masked from 'package:stats':
#>
#> cov, smooth, var
set.seed(5249) # for reproducibility, remove for normal use
data(srbct) # extract the small round bull cell tumour data
X <- srbct$gene # use the gene expression data as the X matrix
Y <- srbct$class # use the class data as the Y matrix
dim(X) # check the dimensions of the X dataframe
#> [1] 63 2308
EWSorRMS <- which(Y == "EWS" | Y == "RMS") #Subset to a two factor classification problem, selected for the largest class sizes
X <- X[EWSorRMS,]
Y <- Y[EWSorRMS]
dim(X)
#> [1] 43 2308
length(Y)
#> [1] 43
train <- sample(1:nrow(X), 30) # randomly select 30 samples in training
test <- setdiff(1:nrow(X), train) # rest is part of the test set
# store matrices into training and test set:
X.train <- X[train, ]
X.test <- X[test,]
Y.train <- Y[train]
Y.test <- Y[test]
# grid of possible keepX values that will be tested for each component
list.keepX <- c(1:10, seq(20, 300, 10))
# undergo the tuning process to determine the optimal number of variables
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 4, # calculate for first 4 components
validation = 'Mfold',
folds = 5, nrepeat = 10, # use repeated cross-validation
dist = 'max.dist', # use max.dist measure
measure = "BER", # use balanced error rate of dist measure
test.keepX = list.keepX,
cpus = 2) # allow for paralleliation to decrease runtime
optimal.ncomp <- tune.splsda.srbct$choice.ncomp$ncomp
#since optimal ncomp = 1, reassigning it to meet the minimum required to run the algorithm
optimal.ncomp <- 2
optimal.keepX <- tune.splsda.srbct$choice.keepX[1:optimal.ncomp]
train.splsda.srbct <- splsda(X.train, Y.train, ncomp = optimal.ncomp, keepX = optimal.keepX)
predict.splsda.srbct <- predict(train.splsda.srbct, X.test,
dist = "mahalanobis.dist")
predict.comp2 <- predict.splsda.srbct$class$mahalanobis.dist[,2]
table(factor(predict.comp2, levels = levels(Y)), Y.test)
#> Y.test
#> EWS BL NB RMS
#> EWS 6 0 0 0
#> BL 0 0 0 0
#> NB 0 0 0 0
#> RMS 0 0 0 7
##### Generate AUC in the test data
predictionsdat <- data.frame(predict.splsda.srbct$predict[,,2])
predictionsdat$predict <- predict.splsda.srbct$class$mahalanobis.dist[,2]
predictionsdat$observed <- Y.test
roc_hrd <- pROC::roc(ifelse(predictionsdat$observed=="EWS", 1, 0), as.numeric(predictionsdat$EWS))
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_hrd$auc
#> Area under the curve: 1
Created on 2023-01-13 with reprex v2.0.2
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23 ucrt)
#> os Windows 10 x64 (build 19044)
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.utf8
#> ctype English_United States.utf8
#> tz #Hidden
#> date 2023-01-13
#> pandoc 2.18 @ C:/Program Files/RStudio/bin/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.1)
#> BiocParallel 1.30.3 2022-06-07 [1] Bioconductor
#> cli 3.3.0 2022-04-25 [1] CRAN (R 4.2.1)
#> codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
#> colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.1)
#> corpcor 1.6.10 2021-09-16 [1] CRAN (R 4.2.0)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.1)
#> digest 0.6.29 2021-12-01 [1] CRAN (R 4.2.1)
#> dplyr 1.0.9 2022-04-28 [1] CRAN (R 4.2.1)
#> ellipse 0.4.3 2022-05-31 [1] CRAN (R 4.2.1)
#> evaluate 0.16 2022-08-09 [1] CRAN (R 4.2.1)
#> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.1)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.1)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.1)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1)
#> ggplot2 * 3.3.6 2022-05-03 [1] CRAN (R 4.2.1)
#> ggrepel 0.9.1 2021-01-15 [1] CRAN (R 4.2.1)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.1)
#> gridExtra 2.3 2017-09-09 [1] CRAN (R 4.2.1)
#> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.1)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.2.1)
#> htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.1)
#> igraph 1.3.4 2022-07-19 [1] CRAN (R 4.2.1)
#> knitr 1.40 2022-08-24 [1] CRAN (R 4.2.1)
#> lattice * 0.20-45 2021-09-22 [2] CRAN (R 4.2.1)
#> lifecycle 1.0.2 2022-09-09 [1] CRAN (R 4.2.1)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.1)
#> MASS * 7.3-57 2022-04-22 [2] CRAN (R 4.2.1)
#> Matrix 1.5-1 2022-09-13 [1] CRAN (R 4.2.1)
#> matrixStats 0.62.0 2022-04-19 [1] CRAN (R 4.2.1)
#> mixOmics * 6.20.0 2022-04-26 [1] Bioconductor (R 4.2.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.1)
#> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.1)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.1)
#> plyr 1.8.7 2022-03-24 [1] CRAN (R 4.2.1)
#> pROC * 1.18.0 2021-09-03 [1] CRAN (R 4.2.1)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.2.1)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.1)
#> rARPACK 0.11-0 2016-03-10 [1] CRAN (R 4.2.1)
#> RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.2.0)
#> Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.1)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.2)
#> reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.2.1)
#> rlang 1.0.5 2022-08-31 [1] CRAN (R 4.2.1)
#> rmarkdown 2.16 2022-08-24 [1] CRAN (R 4.2.1)
#> RSpectra 0.16-1 2022-04-24 [1] CRAN (R 4.2.1)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.1)
#> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.1)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.1)
#> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.1)
#> stringr 1.4.1 2022-08-20 [1] CRAN (R 4.2.1)
#> tibble 3.1.8 2022-07-22 [1] CRAN (R 4.2.1)
#> tidyr 1.2.0 2022-02-01 [1] CRAN (R 4.2.1)
#> tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.2.1)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.1)
#> vctrs 0.4.1 2022-04-13 [1] CRAN (R 4.2.1)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.1)
#> xfun 0.32 2022-08-10 [1] CRAN (R 4.2.1)
#> yaml 2.3.5 2022-02-21 [1] CRAN (R 4.2.1)
#>
#> [1] #Hidden
#> [2] C:/Program Files/R/R-4.2.1/library
#>
#> ──────────────────────────────────────────────────────────────────────────────