####---------------------Section 3 Bookdown------------PCA& sPCA------------
# combined data: liver.toxicity is a list of 4 df: gene, clinic, treatment, gene.ID
data(liver.toxicity)
names(liver.toxicity)
#[1] "gene" "clinic" "treatment" "gene.ID"
#Separate liver.toxicity into indiv df's'
genedf <- liver.toxicity$gene # no zeroes data looks centered
treatdf<- liver.toxicity$treatment
clinicdf<- liver.toxicity$clinic
gene.IDdf<- liver.toxicity$gene.ID
#Note: if the names of the samples are not meaningful at this stage, they can be replaced
#by symbols (ind.names=TRUE) test if setting ind.names=F forces the names to show
#PCA
MyResult.pca <- pca(genedf) # 1 Run the method
plotIndiv(MyResult.pca) # 2 Plot the samples orig form plots circles
plotVar(MyResult.pca, cutoff = 0.8) # 3 Plot the variables
##Customize PLots-----
#note Dose.Group is a column header in the dataframe liver.toxicity$treatments
#orig using imbedded data name
plotIndiv(MyResult.pca, group = liver.toxicity$treatment$Dose.Group,
legend = TRUE)
#without the nested file name
#plots exaclty same as above command
plotIndiv(MyResult.pca, group = treatdf$Dose.Group,
legend = TRUE)
#cutomize legends, Titles
#Additionally, two factors can be displayed using both colours (argument group) and symbols (argument pch)
#here exposure is the Time.Group column
plotIndiv(MyResult.pca, ind.names = FALSE,
group = liver.toxicity$treatment$Dose.Group,
pch = as.factor(liver.toxicity$treatment$Time.Group),
legend = TRUE, title = 'Liver toxicity: genes, PCA comp 1 - 2',
legend.title = 'Dose', legend.title.pch = 'Exposure')
# not using the nested file names # same graph as above
plotIndiv(MyResult.pca, ind.names = FALSE,
group = treatdf$Dose.Group,
pch = as.factor(treatdf$Time.Group),
legend = TRUE, title = 'Liver toxicity: genes, PCA comp 1 - 2',
legend.title = 'Dose', legend.title.pch = 'Exposure')
#PCA with 3 components
MyResult.pca2 <- pca(genedf, ncomp = 3)
plotIndiv(MyResult.pca2, comp = c(1,3), legend = TRUE,
group = liver.toxicity$treatment$Time.Group,
title = 'Multidrug transporter, PCA comp 1 - 3')
#Note the 3rd component in Y axis highlights an exposure effect
# amount of variance explained & # of components
plot(MyResult.pca2)
# first 3 components explain approx 65% variance see below exace 62.7
MyResult.pca2
# Eigenvalues for the first 3 principal components, see object$sdev^2:
# PC1 PC2 PC3
# 17.971416 9.079234 4.567709
#
# Proportion of explained variance for the first 3 principal components, see object$prop_expl_var:
# PC1 PC2 PC3
# 0.35684128 0.18027769 0.09069665
#
# Cumulative proportion of explained variance for the first 3 principal components, see object$cum.var:
# PC1 PC2 PC3
# 0.3568413 0.5371190 0.6278156
#
# Other available components:
#
# loading vectors: see object$rotation
# Other functions:
# plotIndiv, plot, plotVar, selectVar, biplot
#other plots
# The loading weights are represented in decreasing order
# from bottom to top in plotLoadings. Their absolute value
# indicates the importance of each variable to define each
# PC, as represented by the length of each bar
# a customized example to only show the top 100 genes
# and their gene name
plotLoadings(MyResult.pca, ndisplay = 100,
name.var = liver.toxicity$gene.ID[, "geneBank"],
size.name = rel(0.3))
#variable selection with sparse PCS
#Apply PCA and identify the key variables that contribute
#to the explanation of most variance in the data set
# The user needs to provide the number of variables to
# select on each PC. Here, for example, we ask to select
# the top 15 genes contributing to the definition of
# PC1, the top 10 genes contributing to PC2 and the
# top 5 genes for PC3 (keepX=c(15,10,5)).
X <-genedf
MyResult.spca <- spca(X, ncomp = 3, keepX = c(15,10,5)) # 1 Run the method
plotIndiv(MyResult.spca, group = liver.toxicity$treatment$Dose.Group, # 2 Plot the samples
pch = as.factor(liver.toxicity$treatment$Time.Group),
legend = TRUE, title = 'Liver toxicity: genes, sPCA comp 1 - 2',
legend.title = 'Dose', legend.title.pch = 'Exposure')
# since axi PCA values above show O use below to get actual values
MyResult.spca
# sparse PCA with 3 principal components.
# Input data X of dimensions: 64 3116
# Number of selected variables on each prinicipal components:
# PC1 PC2 PC3
# 15 10 5
# Proportion of adjusted explained variance for the first 3 principal components, see object$prop_expl_var:
# PC1 PC2 PC3
# 0.003856574 0.002213243 0.001108384
#
# Other available components:
#
# loading vectors: see object$rotation
# Other functions:
#
# tune.spca, plotIndiv, plot, plotVar, selectVar, biplot
plotVar(MyResult.spca, cex = 1) # 3 Plot the variables
#Selected variables can be identified on each component
#with the selectVar function. Here the coefficient
#values are extracted,see ?selectVar:
selectVar(MyResult.spca, comp = 1)$value
# Those values correspond to the loading weights that
# are used to define each component. A large absolute
# value indicates the importance of the variable in this
# PC. Selected variables are ranked from the most
# important (top) to the least important
# value.var
# A_43_P20281 -0.39077443
# A_43_P16829 -0.38898291
# A_43_P21269 -0.37452039
# A_43_P20475 -0.32482960
# A_43_P20891 -0.31740002
# A_43_P14037 -0.27681845
# A_42_P751969 -0.26140533
# A_43_P15845 -0.22392912
# A_42_P814129 -0.18838954
# A_42_P680505 -0.18672610
# A_43_P21483 -0.16202222
# A_43_P21243 -0.13259471
# A_43_P22469 -0.12493156
# A_43_P23061 -0.12255308
# A_43_P11409 -0.09768656
plotLoadings(MyResult.spca) # comp 1
#for component 2
selectVar(MyResult.spca, comp=2)$value
plotLoadings(MyResult.spca, comp = 2)
#Tuning PCA
tune.pca(X)# does nothing on MixOmix vs 6.17.28
####---------------------Section 4 Bookdown-------PLS-DA------------
#I am analysing a single data set (e.g. transcriptomics data) and I would like to classify
#my samples into known groups and predict the class of new samples. In addition, I am interested
#in identifying the key variables that drive such discrimination.
data(srbct) #the expression levels of 2,308 genes measured on 63 samples
names(srbct)
#[1] "gene" "class" "gene.name"
X <- srbct$gene #a data frame with 63 rows and 2308 columns. The expression levels of 2,308 genes in 63 subjects.
Y <- srbct$class #a class vector containing the class tumour of each individual (4 classes in total)
Z <- srbct$gene.name # a data frame with 2,308 rows and 2 columns containing further information on the genes.
summary(Y) ## class summary
# EWS BL NB RMS
# 23 8 12 20
dim(X) ## number of samples and features
#[1] 63 2308
length(Y) ## length of class memebrship factor = number of samples
#[1] 63
#sLPCA-DA
MyResult.splsda <- splsda(X, Y, keepX = c(50,50)) # 1 Run the method
plotIndiv(MyResult.splsda) # 2 Plot the samples (coloured by classes automatically)
plotVar(MyResult.splsda) # 3 Plot the variables
#select var Comp1
selectVar(MyResult.splsda, comp=1)$name # Selected variables on component 1
#PLS-DA without variable selection
MyResult.plsda <- plsda(X,Y) # 1 Run the method
plotIndiv(MyResult.plsda) # 2 Plot the samples
plotVar(MyResult.plsda, cutoff = 0.7) # 3 Plot the variables
#Customizing Sample Plots
#The sample plots can be improved in various ways. First, if the names of the samples are
#not meaningful at this stage, they can be replaced by symbols (ind.names=TRUE).
#Confidence ellipses can be plotted for each sample (ellipse = TRUE, confidence level set to 95%
#by default, see the argument ellipse.level), Additionally, a star plot displays arrows from eac
#group centroid towards each individual sample (star = TRUE)
plotIndiv(MyResult.splsda, ind.names = FALSE, legend=TRUE,
ellipse = TRUE, star = TRUE, title = 'sPLS-DA on SRBCT',
X.label = 'PLS-DA 1', Y.label = 'PLS-DA 2')
#Predictions(classification)
background <- background.predict(MyResult.splsda, comp.predicted=2,
dist = "max.dist")
plotIndiv(MyResult.splsda, comp = 1:2, group = srbct$class,
ind.names = FALSE, title = "Maximum distance",
legend = TRUE, background = background)
#Variable selection
MyResult.splsda2 <- splsda(X,Y, ncomp=3, keepX=c(15,10,5))
selectVar(MyResult.splsda2, comp=1)$value
# value.var
# g123 0.53516982
# g846 0.41271455
# g335 0.30309695
# g1606 0.30194141
# g836 0.29365241
# g783 0.26329876
# g758 0.25826903
# g1386 0.23702577
# g1158 0.15283961
# g585 0.13838913
# g589 0.12738682
# g1387 0.12202390
# g1884 0.08458869
# g1295 0.03150351
# g1036 0.00224886
plotLoadings(MyResult.splsda2, contrib = 'max', method = 'mean')
#tunin Parameters and numerical output
MyResult.plsda2 <- plsda(X,Y, ncomp=10)
set.seed(30) # for reproducbility in this vignette, otherwise increase nrepeat
MyPerf.plsda <- perf(MyResult.plsda2, validation = "Mfold", folds = 3,
progressBar = FALSE, nrepeat = 10) # we suggest nrepeat = 50
plot(MyPerf.plsda, col = color.mixo(5:7), sd = TRUE, legend.position = "horizontal")
type or paste code here