Chapter 7 Artificial Intelligence and Machine Learning
7.1 ML Regression in R
7.1.1 Linear Regression with R
library(reshape2)
library(tidyverse)
library(tidymodels)
library(plotly)
data(tips)
<- tips$tip
y <- tips$total_bill
X
set.seed(123)
<- initial_split(tips)
tips_split <- tips_split %>%
tips_training training()
<- tips_split %>%
tips_test testing()
<- linear_reg() %>%
lm_model set_engine('lm') %>%
set_mode('regression') %>%
fit(tip ~ total_bill, data = tips_training)
<- seq(min(X), max(X), length.out = 100)
x_range <- matrix(x_range, nrow=100, ncol=1)
x_range <- data.frame(x_range)
xdf colnames(xdf) <- c('total_bill')
<- lm_model %>%
ydf predict(xdf)
colnames(ydf) <- c('tip')
<- data.frame(xdf, ydf)
xy
<- plot_ly(data = tips_training, x = ~total_bill, y = ~tip, type = 'scatter', name = 'train', mode = 'markers', alpha = 0.65) %>%
fig add_trace(data = tips_test, x = ~total_bill, y = ~tip, type = 'scatter', name = 'test', mode = 'markers', alpha = 0.65 ) %>%
add_trace(data = xy, x = ~total_bill, y = ~tip, name = 'prediction', mode = 'lines', alpha = 1)
fig
7.2 ROC and PR Curves
7.2.1 ROC and PR Curves
library(plotly)
library(tidymodels)
set.seed(0)
<- matrix(rnorm(10000),nrow=500)
X <- sample(0:1, 500, replace=TRUE)
y <- data.frame(X,y)
data $y <- as.factor(data$y)
data<- subset(data,select = -c(y))
X <-
logistic_glm logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification") %>%
fit(y ~ ., data = data)
<- logistic_glm %>%
y_scores predict(X, type = 'prob')
<- y_scores$.pred_1
y_score <- data.frame(data$y, y_score)
db
<- roc_curve(data = db, 'data.y', 'y_score')
z $specificity <- 1 - z$specificity
zcolnames(z) <- c('threshold', 'tpr', 'fpr')
<- plot_ly(x= y_score, color = data$y, colors = c('blue', 'red'), type = 'histogram', alpha = 0.5, nbinsx = 50) %>%
fig1 layout(barmode = "overlay")
fig1
<- plot_ly(data = z, x = ~threshold) %>%
fig2 add_trace(y = ~fpr, mode = 'lines', name = 'False Positive Rate', type = 'scatter')%>%
add_trace(y = ~tpr, mode = 'lines', name = 'True Positive Rate', type = 'scatter')%>%
layout(title = 'TPR and FPR at every threshold')
<- fig2 %>% layout(legend=list(title=list(text='<b> Rate </b>')))
fig2 fig2
7.2.2 Basic binary ROC curve
library(dplyr)
library(ggplot2)
library(plotly)
library(pROC)
set.seed(0)
<- matrix(rnorm(10000),nrow=500)
X <- sample(0:1, 500, replace=TRUE)
y <- data.frame(X,y)
db $y <- as.factor(db$y)
db= db[1:20]
test_data
<- logistic_reg() %>%
modelset_engine("glm") %>%
set_mode("classification") %>%
# Fit the model
fit(y ~., data = db)
<- predict(model,
ypred new_data = test_data,
type = "prob")
<- data.frame(ypred$.pred_0)
yscore <- cbind(db$y,yscore)
rdb colnames(rdb) = c('y','yscore')
<- roc_curve(rdb, y, yscore)
pdb $specificity <- 1 - pdb$specificity
pdb= roc_auc(rdb, y, yscore)
auc = auc$.estimate
auc
= paste('ROC Curve (AUC = ',toString(round(auc,2)),')',sep = '')
tit
<- plot_ly(data = pdb ,x = ~specificity, y = ~sensitivity, type = 'scatter', mode = 'lines', fill = 'tozeroy') %>%
fig layout(title = tit,xaxis = list(title = "False Positive Rate"), yaxis = list(title = "True Positive Rate")) %>%
add_segments(x = 0, xend = 1, y = 0, yend = 1, line = list(dash = "dash", color = 'black'),inherit = FALSE, showlegend = FALSE)
fig
7.2.3 Multiclass ROC Curve
library(plotly)
library(tidymodels)
library(fastDummies)
# Artificially add noise to make task harder
data(iris)
<- sample.int(150, 50)
ind <- sample(x = iris$Species, size = 50)
samples 'Species'] = samples
iris[ind,
# Define the inputs and outputs
<- subset(iris, select = -c(Species))
X $Species <- as.factor(iris$Species)
iris
# Fit the model
<-
logistic multinom_reg() %>%
set_engine("nnet") %>%
set_mode("classification") %>%
fit(Species ~ ., data = iris)
<- logistic %>%
y_scores predict(X, type = 'prob')
# One hot encode the labels in order to plot them
<- dummy_cols(iris$Species)
y_onehot colnames(y_onehot) <- c('drop', 'setosa', 'versicolor', 'virginica')
<- subset(y_onehot, select = -c(drop))
y_onehot
= cbind(y_scores, y_onehot)
z
$setosa <- as.factor(z$setosa)
z<- roc_curve(data = z, setosa, .pred_setosa)
roc_setosa $specificity <- 1 - roc_setosa$specificity
roc_setosacolnames(roc_setosa) <- c('threshold', 'tpr', 'fpr')
<- roc_auc(data = z, setosa, .pred_setosa)
auc_setosa <- auc_setosa$.estimate
auc_setosa <- paste('setosa (AUC=',toString(round(1-auc_setosa,2)),')',sep = '')
setosa
$versicolor <- as.factor(z$versicolor)
z<- roc_curve(data = z, versicolor, .pred_versicolor)
roc_versicolor $specificity <- 1 - roc_versicolor$specificity
roc_versicolorcolnames(roc_versicolor) <- c('threshold', 'tpr', 'fpr')
<- roc_auc(data = z, versicolor, .pred_versicolor)
auc_versicolor <- auc_versicolor$.estimate
auc_versicolor <- paste('versicolor (AUC=',toString(round(1-auc_versicolor,2)),')', sep = '')
versicolor
$virginica <- as.factor(z$virginica)
z<- roc_curve(data = z, virginica, .pred_virginica)
roc_virginica $specificity <- 1 - roc_virginica$specificity
roc_virginicacolnames(roc_virginica) <- c('threshold', 'tpr', 'fpr')
<- roc_auc(data = z, virginica, .pred_virginica)
auc_virginica <- auc_virginica$.estimate
auc_virginica <- paste('virginica (AUC=',toString(round(1-auc_virginica,2)),')',sep = '')
virginica
# Create an empty figure, and iteratively add a line for each class
<- plot_ly()%>%
fig add_segments(x = 0, xend = 1, y = 0, yend = 1, line = list(dash = "dash", color = 'black'), showlegend = FALSE) %>%
add_trace(data = roc_setosa,x = ~fpr, y = ~tpr, mode = 'lines', name = setosa, type = 'scatter')%>%
add_trace(data = roc_versicolor,x = ~fpr, y = ~tpr, mode = 'lines', name = versicolor, type = 'scatter')%>%
add_trace(data = roc_virginica,x = ~fpr, y = ~tpr, mode = 'lines', name = virginica, type = 'scatter')%>%
layout(xaxis = list(
title = "False Positive Rate"
yaxis = list(
), title = "True Positive Rate"
legend = list(x = 100, y = 0.5))
), fig
7.3 PCA Visulization
7.3.1 Visualize all the original dimensions
library(plotly)
data(iris)
= list(showline=FALSE,
axis zeroline=FALSE,
gridcolor='#ffff',
ticklen=4,
titlefont=list(size=13))
<- iris %>%
fig plot_ly()
<- fig %>%
fig add_trace(
type = 'splom',
dimensions = list(
list(label='sepal length', values=~Sepal.Length),
list(label='sepal width', values=~Sepal.Width),
list(label='petal length', values=~Petal.Length),
list(label='petal width', values=~Petal.Width)
),color = ~Species, colors = c('#636EFA','#EF553B','#00CC96') ,
marker = list(
size = 7,
line = list(
width = 1,
color = 'rgb(230,230,230)'
)
)
)<- fig %>% style(diagonal = list(visible = FALSE))
fig <- fig %>%
fig layout(
hovermode='closest',
dragmode= 'select',
plot_bgcolor='rgba(240,240,240, 0.95)',
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis,
xaxis3=axis,
xaxis4=axis,
yaxis2=axis,
yaxis3=axis,
yaxis4=axis
)
fig
7.3.2 Visualize all the principal components
library(plotly)
library(stats)
data(iris)
<- subset(iris, select = -c(Species))
X <- prcomp(X)
prin_comp <- summary(prin_comp)[["importance"]]['Proportion of Variance',]
explained_variance_ratio <- 100 * explained_variance_ratio
explained_variance_ratio <- prin_comp[["x"]]
components <- data.frame(components)
components <- cbind(components, iris$Species)
components $PC3 <- -components$PC3
components$PC2 <- -components$PC2
components
= list(showline=FALSE,
axis zeroline=FALSE,
gridcolor='#ffff',
ticklen=4,
titlefont=list(size=13))
<- components %>%
fig plot_ly() %>%
add_trace(
type = 'splom',
dimensions = list(
list(label=paste('PC 1 (',toString(round(explained_variance_ratio[1],1)),'%)',sep = ''), values=~PC1),
list(label=paste('PC 2 (',toString(round(explained_variance_ratio[2],1)),'%)',sep = ''), values=~PC2),
list(label=paste('PC 3 (',toString(round(explained_variance_ratio[3],1)),'%)',sep = ''), values=~PC3),
list(label=paste('PC 4 (',toString(round(explained_variance_ratio[4],1)),'%)',sep = ''), values=~PC4)
),color = ~iris$Species, colors = c('#636EFA','#EF553B','#00CC96')
%>%
) style(diagonal = list(visible = FALSE)) %>%
layout(
legend=list(title=list(text='color')),
hovermode='closest',
dragmode= 'select',
plot_bgcolor='rgba(240,240,240, 0.95)',
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis,
xaxis3=axis,
xaxis4=axis,
yaxis2=axis,
yaxis3=axis,
yaxis4=axis
)
fig
7.3.3 Visualize a subset of the principal components
library(plotly)
library(stats)
library(MASS)
= Boston
db
<- prcomp(db, rank. = 4)
prin_comp
<- prin_comp[["x"]]
components <- data.frame(components)
components <- cbind(components, db$medv)
components $PC2 <- -components$PC2
componentscolnames(components)[5] = 'Median_Price'
<- summary(prin_comp)[["importance"]]['Proportion of Variance',]
tot_explained_variance_ratio <- 100 * sum(tot_explained_variance_ratio)
tot_explained_variance_ratio
= 'Total Explained Variance = 99.56'
tit
= list(showline=FALSE,
axis zeroline=FALSE,
gridcolor='#ffff',
ticklen=4)
<- components %>%
fig plot_ly() %>%
add_trace(
type = 'splom',
dimensions = list(
list(label='PC1', values=~PC1),
list(label='PC2', values=~PC2),
list(label='PC3', values=~PC3),
list(label='PC4', values=~PC4)
),color=~Median_Price,
marker = list(
size = 7
)%>% style(diagonal = list(visible = F)) %>%
) layout(
title= tit,
hovermode='closest',
dragmode= 'select',
plot_bgcolor='rgba(240,240,240, 0.95)',
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis,
xaxis3=axis,
xaxis4=axis,
yaxis2=axis,
yaxis3=axis,
yaxis4=axis
)options(warn=-1)
fig
7.3.4 Visualize PCA with scatter3d
data("iris")
<- subset(iris, select = -c(Species))
X
<- prcomp(X, rank. = 3)
prin_comp
<- prin_comp[["x"]]
components <- data.frame(components)
components $PC2 <- -components$PC2
components$PC3 <- -components$PC3
components= cbind(components, iris$Species)
components
<- summary(prin_comp)[["importance"]]['Proportion of Variance',]
tot_explained_variance_ratio <- 100 * sum(tot_explained_variance_ratio)
tot_explained_variance_ratio
= 'Total Explained Variance = 99.48'
tit
<- plot_ly(components, x = ~PC1, y = ~PC2, z = ~PC3, color = ~iris$Species, colors = c('#636EFA','#EF553B','#00CC96') ) %>%
fig add_markers(size = 12)
<- fig %>%
fig layout(
title = tit,
scene = list(bgcolor = "#e5ecf6")
)
fig
7.3.5 Visualize Loadings
library(plotly)
library(stats)
data(iris)
<- subset(iris, select = -c(Species))
X <- prcomp(X, rank = 2)
prin_comp <- prin_comp[["x"]]
components <- data.frame(components)
components <- cbind(components, iris$Species)
components $PC2 <- -components$PC2
components<- summary(prin_comp)[["sdev"]]
explained_variance <- explained_variance[1:2]
explained_variance <- prin_comp[["rotation"]]
comp 'PC2'] <- - comp[,'PC2']
comp[,<- comp
loadings for (i in seq(explained_variance)){
<- comp[,i] * explained_variance[i]
loadings[,i]
}
= c('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
features
<- plot_ly(components, x = ~PC1, y = ~PC2, color = ~iris$Species, colors = c('#636EFA','#EF553B','#00CC96'), type = 'scatter', mode = 'markers') %>%
fig layout(
legend=list(title=list(text='color')),
plot_bgcolor = "#e5ecf6",
xaxis = list(
title = "0"),
yaxis = list(
title = "1"))
for (i in seq(4)){
<- fig %>%
fig add_segments(x = 0, xend = loadings[i, 1], y = 0, yend = loadings[i, 2], line = list(color = 'black'),inherit = FALSE, showlegend = FALSE) %>%
add_annotations(x=loadings[i, 1], y=loadings[i, 2], ax = 0, ay = 0,text = features[i], xanchor = 'center', yanchor= 'bottom')
}
fig
7.4 t-SNE and UMAP projections
7.4.1 Basic t-SNE projections
library(plotly)
library(stats)
data(iris)
<- subset(iris, select = -c(Species))
X = list(showline=FALSE,
axis zeroline=FALSE,
gridcolor='#ffff',
ticklen=4)
<- iris %>%
fig plot_ly() %>%
add_trace(
type = 'splom',
dimensions = list(
list(label = 'sepal_width',values=~Sepal.Width),
list(label = 'sepal_length',values=~Sepal.Length),
list(label ='petal_width',values=~Petal.Width),
list(label = 'petal_length',values=~Petal.Length)),
color = ~Species, colors = c('#636EFA','#EF553B','#00CC96')
) <- fig %>%
fig layout(
legend=list(title=list(text='species')),
hovermode='closest',
dragmode= 'select',
plot_bgcolor='rgba(240,240,240,0.95)',
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis,
xaxis3=axis,
xaxis4=axis,
yaxis2=axis,
yaxis3=axis,
yaxis4=axis
) fig
7.4.2 Project data into 2D with t-SNE
library(tsne)
library(plotly)
data("iris")
<- subset(iris, select = -c(Species))
features
set.seed(0)
<- tsne(features, initial_dims = 2)
tsne <- data.frame(tsne)
tsne <- cbind(tsne,iris$Species)
pdb options(warn = -1)
<- plot_ly(data = pdb ,x = ~X1, y = ~X2, type = 'scatter', mode = 'markers', split = ~iris$Species)
fig
<- fig %>%
fig layout(
plot_bgcolor = "#e5ecf6"
)
fig
7.4.3 Project data into 3D with t-SNE
library(tsne)
library(plotly)
data("iris")
<- subset(iris, select = -c(Species))
features
#set.seed(0)
<- tsne(features, initial_dims = 3, k =3)
tsne <- data.frame(tsne)
tsne <- cbind(tsne,iris$Species)
pdb options(warn = -1)
<- plot_ly(data = pdb ,x = ~X1, y = ~X2, z = ~X3, color = ~iris$Species, colors = c('#636EFA','#EF553B','#00CC96') ) %>%
fig add_markers(size = 8) %>%
layout(
xaxis = list(
zerolinecolor = "#ffff",
zerolinewidth = 2,
gridcolor='#ffff'),
yaxis = list(
zerolinecolor = "#ffff",
zerolinewidth = 2,
gridcolor='#ffff'),
scene =list(bgcolor = "#e5ecf6"))
fig
7.4.4 Projections with UMAP
library(plotly)
library(umap)
= iris[, grep("Sepal|Petal", colnames(iris))]
iris.data = iris[, "Species"]
iris.labels = umap(iris.data, n_components = 2, random_state = 15)
iris.umap <- iris.umap[["layout"]]
layout <- data.frame(layout)
layout <- cbind(layout, iris$Species)
final
<- plot_ly(final, x = ~X1, y = ~X2, color = ~iris$Species, colors = c('#636EFA','#EF553B','#00CC96'), type = 'scatter', mode = 'markers')%>%
fig layout(
plot_bgcolor = "#e5ecf6",
legend=list(title=list(text='species')),
xaxis = list(
title = "0"),
yaxis = list(
title = "1"))
= umap(iris.data, n_components = 3, random_state = 15)
iris.umap <- iris.umap[["layout"]]
layout <- data.frame(layout)
layout <- cbind(layout, iris$Species)
final
<- plot_ly(final, x = ~X1, y = ~X2, z = ~X3, color = ~iris$Species, colors = c('#636EFA','#EF553B','#00CC96'))
fig2 <- fig2 %>% add_markers()
fig2 <- fig2 %>% layout(scene = list(xaxis = list(title = '0'),
fig2 yaxis = list(title = '1'),
zaxis = list(title = '2')))
fig
fig2
7.4.5 Visualizing image datasets
library(rsvd)
library(plotly)
library(umap)
data('digits')
= digits[, grep("pixel", colnames(digits))]
digits.data = digits[, "label"]
digits.labels = umap(digits.data, n_components = 2, k = 10)
digits.umap <- digits.umap[["layout"]]
layout <- data.frame(layout)
layout <- cbind(layout, digits[,'label'])
final colnames(final) <- c('X1', 'X2', 'label')
<- plot_ly(final, x = ~X1, y = ~X2, split = ~label, type = 'scatter', mode = 'markers')%>%
fig layout(
plot_bgcolor = "#e5ecf6",
legend=list(title=list(text='digit')),
xaxis = list(
title = "0"),
yaxis = list(
title = "1"))
fig