machine.Rmd
### From Jeppe's code and DTU dataset
### 2021, december 3 rd
### Julien Rodriguez, Ifremer
### François Danhiez, CapGemini Engineering
### Jeppe Olsen, DTU Aqua
### Contact
### julien.rodriguez@ifremer.fr
### What is it about?
# This code aims at giving a toy example on how home-made machine-learning methods and cross-validation can be applied to a dataset
# This code and the associated functions were produced during ICES workshop on using high resolution spatial data for SSF monitoring,
# SO, please take into account it was produced in less than 2 days of coding activity!
# All the necessary reflexion regarding the purpose of the algorithm (and so, the cross-validation process to be applied for proper qualification),
# is an absolute prerequisite before performing this kind of process.
# For this example, the decision was made to apply a ping-level decision rule on fishing/non-fishing.
# depending on the objective, choosing a different level of aggregation might prove to be relevant (for example, a "fishing event", to identify a fishing gear)
# then different aggregated index have to be produced = average, quantiles, skewness, kurthosis....
# Also a reflexion has to be done on the covariates that can be computed.
# In that case, these covariates, calculated from the associated functions, have been created for the purpose of the demonstration and are suggestions,
# but probably not the most relevant! Even sometimes, their computation might be wrong, but this wasn't the objective of the exercice!
# remotes::install_github("ices-eg/WKSSFGEO", ref = "pkg")
library(WKSSFGEO)
#> Error in library(WKSSFGEO): there is no package called 'WKSSFGEO'
library(sf)
library(ranger)
library(caret)
#> Error in library(caret): there is no package called 'caret'
library(sfc)
#> Error in library(sfc): there is no package called 'sfc'
library(dplyr)
fld <- getwd()
library(doMC)
#> Error in library(doMC): there is no package called 'doMC'
registerDoMC(20)
#> Error in registerDoMC(20): could not find function "registerDoMC"
tracks <-
track01 %>%
data.table::setDT() %>%
# only unique time per veseel, first record retained - which may be the wrong one
unique(by = c("vessel_id", "time_stamp")) %>%
WKSSFGEO::add_harbours(harbours) %>%
WKSSFGEO::interpolate_ais() %>%
WKSSFGEO::define_trips_pol(min_dur = 0.8, max_dur = 48,
split_trips = T, preserve_all = F)
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
### Identify missing values to retrieve it in the final dataset used to build the model
nnai <- apply( tracks[, c("speed", "course", "behaviour" )], 1, function(x) !anyNA(x))
#> Error: object 'tracks' not found
trips <- unique(tracks$trip_id)
#> Error: object 'tracks' not found
out2 <- tracks
#> Error: object 'tracks' not found
out.ComNewCovar <- do.call( rbind, lapply( trips, function(tp) {
out.trip <- out2[nnai, ] %>% as.data.frame
out.trip <- out.trip[ out.trip$trip_id %in% tp, ]
out.NewCovar <- CalcAcceleration(
CalcDistBetweenNearestNeighbours(
CalcStraigthness(
CalcHeading( st_as_sf( out.trip,
coords = c("lon", "lat"), crs = 4326))
, col.Dir = "course"),
nnn = 9))
return(out.NewCovar)
}))
#> Error: object 'trips' not found
glimpse(out.ComNewCovar)
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar %>%
gather(var, value) %>%
ggplot(aes(x=value)) +
geom_histogram(fill="steelblue", alpha=.7) +
theme_minimal() +
facet_wrap(~key, scales="free")
#> Error in ggplot(., aes(x = value)): could not find function "ggplot"
out.ComNewCovar$behaviour <- factor(as.character(out.ComNewCovar$behaviour ))
#> Error: object 'out.ComNewCovar' not found
form <- as.formula( paste( "behaviour", paste("speed", collapse = "+"), sep = "~"))
optim.rf <- tune_RF(formula = form, sf::st_set_geometry(out.ComNewCovar[nnai, ], NULL))
#> Error in tune_RF(formula = form, sf::st_set_geometry(out.ComNewCovar[nnai, : could not find function "tune_RF"
nnn <- 9 # Number of nearest neighbours chosen
covar.names <-
c("speed", "course", "abs.HeadingChange", "HEADING.deg", "acceleration",
paste0( "DistWithNeighbour_", WKSSFGEO:::set_0nbr(1:nnn), 1:nnn))
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
form <- as.formula( paste( "behaviour", paste(covar.names, collapse = "+"), sep = "~"))
#> Error: object 'covar.names' not found
out.ComNewCovar$behaviour <- factor(as.character(out.ComNewCovar$behaviour ))
#> Error: object 'out.ComNewCovar' not found
# Identify indexes without missing values (normally, none of them should contain missing values)
nnai <- apply( out.ComNewCovar[, c("behaviour", covar.names)], 1, function(x) !anyNA(x))
#> Error: object 'out.ComNewCovar' not found
!nnai %>% any
#> Error: object 'nnai' not found
# EINAR: the {tune_RF} finds the "optimimum" min.node.size and mtry to be passed
# to {ranger} in the next step
optim.rf <- WKSSFGEO::tune_RF(formula = form, sf::st_set_geometry(out.ComNewCovar[nnai, ], NULL))
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
optim.rf # Automatically selected hyper-parameters with tune_RF function
#> Error: object 'optim.rf' not found
mod.rf <- ranger::ranger(form,
sf::st_set_geometry(out.ComNewCovar[nnai, ], NULL),
importance = "impurity",
mtry = optim.rf$mtry,
min.node.size = optim.rf$min.node.size,
num.trees = 500,
write.forest = TRUE)
#> Error: object 'out.ComNewCovar' not found
#saveRDS( list(optim.rf = optim.rf, mod.rf = mod.rf), file= sprintf( "%s/RandomForestModel.rds", fld))
mod.rf$prediction.error
#> Error: object 'mod.rf' not found
# OOB prediction error 11.17%
# Shows variable importance regarding how they contribute to the model
tibble::tibble(var = names(mod.rf$variable.importance),
val = mod.rf$variable.importance) %>%
ggplot2::ggplot() +
ggplot2::geom_pointrange(ggplot2::aes(reorder(var, val), val, ymin = 0, ymax = val)) +
ggplot2::coord_flip()
#> Error: object 'mod.rf' not found
# Error matrix
table(mod.rf$predictions, out.ComNewCovar$behaviour[nnai])
#> Error: object 'mod.rf' not found
# Select only the most relevant variables for LDA and QDA
mod.rf$variable.importance %>% sort(decreasing = TRUE)
#> Error: object 'mod.rf' not found
# Course and heading to be avoided because it may lead to overfitting
# These variables have been kept in randomforest which is less sensitive to colinearity and overfitting
covar.names <- c("speed", "abs.HeadingChange", "acceleration", paste0( "DistWithNeighbour_", c(8,6,2,9)))
form.lda <- as.formula( paste( "behaviour", paste(covar.names, collapse = "+"), sep = "~"))
mod.lda <- MASS::lda(form, out.ComNewCovar[nnai, ])
#> Error in eval(expr, p): object 'out.ComNewCovar' not found
mod.qda <- MASS::qda(form, out.ComNewCovar[nnai, ])
#> Error in eval(expr, p): object 'out.ComNewCovar' not found
#saveRDS( list(mod.lda = mod.lda, mod.qda = mod.qda), file= sprintf( "%s/LDAModel.rds", fld))
pred.lda <- predict(mod.lda)
#> Error: object 'mod.lda' not found
pred.qda <- predict(mod.qda)
#> Error: object 'mod.qda' not found
mod.lda %>% names
#> Error: object 'mod.lda' not found
mod.lda$prior
#> Error: object 'mod.lda' not found
out.ComNewCovar$behaviour[nnai] %>% table
#> Error: object 'out.ComNewCovar' not found
table(pred.lda$class, out.ComNewCovar$behaviour[nnai])
#> Error: object 'pred.lda' not found
table(pred.qda$class, out.ComNewCovar$behaviour[nnai])
#> Error: object 'pred.qda' not found
out.ComNewCovar %>% colnames
#> Error: object 'out.ComNewCovar' not found
nnn = 9 ## Number of nearest neighbours chosen
covar.names <- c("speed", "course", "abs.HeadingChange", "HEADING.deg", "acceleration", paste0( "DistWithNeighbour_", WKSSFGEO:::set_0nbr(1:nnn), 1:nnn))
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
form <- as.formula( paste( "behaviour", paste(covar.names, collapse = "+"), sep = "~"))
# Identify indexes without missing values (normally, none of them should contain missing values)
nnai <- apply( out.ComNewCovar[, c("behaviour", covar.names)], 1, function(x) !anyNA(x))
#> Error: object 'out.ComNewCovar' not found
!nnai %>% any
#> Error: object 'nnai' not found
## Create a CV design
# Create Subset with 5 folds with a selection based on trip
# If we want to test the capacity to make predictions for a new boat, we could perform CV for boats instead of trips.
trips <- unique(out.ComNewCovar$trip_id)
#> Error: object 'out.ComNewCovar' not found
db.index <- 1:nrow(out.ComNewCovar)
#> Error: object 'out.ComNewCovar' not found
set.seed(021221)
n.samp <- floor(length(trips)/5)
#> Error: object 'trips' not found
CV.subset <- vector(mode = "list", length = 5)
for(k in 1:5){
if(k < 5){
samp.ids <- sample( trips, size = n.samp)
trips <- trips[ !trips %in% samp.ids]
}else{
samp.ids <- trips
}
samp.index <- which( out.ComNewCovar$trip_id[nnai] %in% samp.ids)
CV.subset[[k]] <- samp.index
}
#> Error: object 'trips' not found
# Chech the CV.subset and its consistency for CV purpose
trips <- unique(out.ComNewCovar$trip_id)
#> Error: object 'out.ComNewCovar' not found
lapply(CV.subset, length)
#> [[1]]
#> [1] 0
#>
#> [[2]]
#> [1] 0
#>
#> [[3]]
#> [1] 0
#>
#> [[4]]
#> [1] 0
#>
#> [[5]]
#> [1] 0
lapply(CV.subset, function(x) unique(out.ComNewCovar$trip_id[x]))
#> Error in FUN(X[[i]], ...): object 'out.ComNewCovar' not found
do.call(c, lapply(CV.subset, function(x) unique(out.ComNewCovar$trip_id[x]))) %>% duplicated %>% any
#> Error in FUN(X[[i]], ...): object 'out.ComNewCovar' not found
any(!trips %in% do.call(c, lapply(CV.subset, function(x) unique(out.ComNewCovar$trip_id[x]))))
#> Error: object 'trips' not found
### Create empty columns in dataset to retrieve CV predictions
if( !"predCV.rf" %in% colnames(out.ComNewCovar)){
out.ComNewCovar$predCV.rf <- rep(NA, nrow(out.ComNewCovar))
out.ComNewCovar$predCV.lda <- rep(NA, nrow(out.ComNewCovar))
out.ComNewCovar$predCV.qda <- rep(NA, nrow(out.ComNewCovar))
# Perform 5-folds CV
for(k in 1:5){
samp.index <- sort(CV.subset[[k]])
training.dataset <- sf::st_set_geometry(out.ComNewCovar[ -samp.index, ], NULL)
ap.dataset <- sf::st_set_geometry(out.ComNewCovar[ samp.index, ], NULL)
mod.rf.CV <- ranger::ranger( form, training.dataset ,
importance = "impurity", mtry = optim.rf$mtry, min.node.size = optim.rf$min.node.size, num.trees = 500, write.forest = TRUE)
out.ComNewCovar$predCV.rf[ samp.index ] <- predict(mod.rf.CV, ap.dataset)$predictions
mod.lda.CV <- MASS::lda(form, training.dataset)
mod.qda.CV <- MASS::qda(form, training.dataset)
out.ComNewCovar$predCV.lda[ samp.index ] <- predict(mod.lda.CV, ap.dataset)$class
out.ComNewCovar$predCV.qda[ samp.index ] <- predict(mod.qda.CV, ap.dataset)$class
#saveRDS(out.ComNewCovar , file= sprintf( "%s/SavedDataWithCVResults.rds", fld))
}
}
#> Error: object 'out.ComNewCovar' not found
### Error contingency matrix
table( out.ComNewCovar$behaviour)
#> Error: object 'out.ComNewCovar' not found
with(out.ComNewCovar[nnai,], table(predCV.lda, behaviour))
#> Error: object 'out.ComNewCovar' not found
with(out.ComNewCovar[nnai,], table(predCV.qda, behaviour))
#> Error: object 'out.ComNewCovar' not found
mod.rf$prediction.error
#> Error: object 'mod.rf' not found
ErrMat.RF <- with(out.ComNewCovar[nnai,], table(predCV.rf, behaviour))
#> Error: object 'out.ComNewCovar' not found
ErrMat.RF
#> Error: object 'ErrMat.RF' not found
# Compute overall accuracy
sum(diag(ErrMat.RF))/(sum(ErrMat.RF))
#> Error: object 'ErrMat.RF' not found
# From 11.2 % error, it is evaluated to 27% with this validation process!
# which is a more realistic appreciation of that could be achieved from a new dataset
# You can guess the result if cross-validation was performed on boats instead of fishing trips !
out.ComNewCovar %>% dim
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar$vessel_id %>% unique %>% length
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar$gear %>% table
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar$trip_id %>% unique %>% length
#> Error: object 'out.ComNewCovar' not found
# too few boats in this toy example to do so, but this can be interesting depending on the objective
# This also shows the interest of being able to manage its own validation process instead of being dependent on micro-waved methods for whom the qualification unit is the line
# but if you look closer, most of the good qualification comes from the non-fishing pings which are not the most interesting ones!
# Event if OOB prediction error could be well to qualify algorithm quality ,
# the input selection is based on rows index which may not be the purpose of the model qualification
# the result shows it is best to make its own validation procedure based on the purpose of machine-learning application
# In my opinion, to be decided depending on user application
# The confusion can also be checked by fishing gear
gears <- out.ComNewCovar$gear %>% unique
#> Error: object 'out.ComNewCovar' not found
for( g in 1:length(gears)){
print(gears[g])
print( with(out.ComNewCovar[out.ComNewCovar$gear %in% gears[g],], table(predCV.rf, behaviour)))
}
#> Error: object 'gears' not found
ErrMat.RF
#> Error: object 'ErrMat.RF' not found
### Plot for one trip
k = 10
trip = trips[k]
#> Error: object 'trips' not found
plot.new()
print(
sp::spplot( out.ComNewCovar[out.ComNewCovar$trip_id %in% trip,] %>% as_Spatial, "behaviour"),
position = c(0,0,.5,1),more=T)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'obj' in selecting a method for function 'spplot': object 'out.ComNewCovar' not found
print(
sp::spplot( out.ComNewCovar[out.ComNewCovar$trip_id %in% trip,] %>% as_Spatial, "predCV.rf"),
position = c(.5,0,1,1),more = T)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'obj' in selecting a method for function 'spplot': object 'out.ComNewCovar' not found
#Loading data
import <- out.ComNewCovar
#> Error: object 'out.ComNewCovar' not found
data <- st_drop_geometry(import)
#> Error: object 'import' not found
#Selecting features for the prediction
predictors <-
c("speed", "HEADING.deg", "abs.HeadingChange", "DistWithNeighbour_1",
"DistWithNeighbour_2", "DistWithNeighbour_3", "DistWithNeighbour_4",
"DistWithNeighbour_5", "DistWithNeighbour_6", "DistWithNeighbour_7",
"DistWithNeighbour_8", "acceleration")
#Define the classification target
target <- c("behaviour")
#Selecting the dataset
variables <- select(data,c(predictors))
#> Error in UseMethod("select"): no applicable method for 'select' applied to an object of class "function"
#Select the target to be predict
target <- data.frame(behaviour=data[,target])
#> Error in data[, target]: object of type 'closure' is not subsettable
#Spliting the dataset into training and validation dataset
set.seed(123)
#Applying a split ratio of 70/30
train_index <- sample(1:nrow(data), nrow(data)*0.70)
#> Error in 1:nrow(data): argument of length 0
# Creating the training set
train.data <- variables[train_index,]
#> Error: object 'variables' not found
train.label <- target[train_index,]
#> Error: object 'train_index' not found
train.data <- cbind(train.data, data.frame(behaviour = train.label))
#> Error: object 'train.data' not found
train.data$behaviour <- as.character(train.data$behaviour)
#> Error: object 'train.data' not found
# Creating the validation set
test.data <- variables[-train_index,]
#> Error: object 'variables' not found
test.label <- target[-train_index,]
#> Error: object 'train_index' not found
test.data <- cbind(test.data,data.frame(behaviour = test.label))
#> Error: object 'test.data' not found
test.data$behaviour <- as.character(test.data$behaviour)
#> Error: object 'test.data' not found
# Training and validation for RandomForest
rf.model <- train(behaviour ~., data = train.data, method = "ranger",
trControl = trainControl("cv", number = 10),
preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "ranger", trControl = trainControl("cv", : could not find function "train"
rf.pred <- predict(rf.model, test.data)
#> Error: object 'rf.model' not found
rf.conf.mat <- confusionMatrix(as.factor(rf.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(rf.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("rf done")
#> [1] "rf done"
# Training and validation for SVM
# svm.model <- train(behaviour ~., data = train.data, method = "svmRadial",
# trControl = trainControl("cv", number = 10),
# preProcess = c("center", "scale"))
# svm.pred <- predict(svm.model, test.data)
# svm.conf.mat <- confusionMatrix(as.factor(svm.pred), as.factor(test.label))
# Training and validation for C5.0
c5.model <- train(behaviour ~., data = train.data, method = "C5.0",
trControl = trainControl("cv", number = 10),
preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "C5.0", trControl = trainControl("cv", : could not find function "train"
c5.pred <- predict(c5.model, test.data)
#> Error: object 'c5.model' not found
c5.conf.mat <- confusionMatrix(as.factor(c5.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(c5.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("c5 done")
#> [1] "c5 done"
# Training and validation for XGBoost
xgb.model <- train(behaviour ~., data = train.data, method = "xgbTree",
trControl = trainControl("cv", number = 10),
preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "xgbTree", trControl = trainControl("cv", : could not find function "train"
xgb.pred <- predict(xgb.model, test.data)
#> Error: object 'xgb.model' not found
xgb.conf.mat <- confusionMatrix(as.factor(xgb.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(xgb.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("xgb done")
#> [1] "xgb done"
# Training and validation for Decision Tree
treebag.model <- train(behaviour ~., data = train.data, method = "treebag",
trControl = trainControl("cv", number = 10),
preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "treebag", trControl = trainControl("cv", : could not find function "train"
treebag.pred <- predict(treebag.model, test.data)
#> Error: object 'treebag.model' not found
treebag.conf.mat <- confusionMatrix(as.factor(treebag.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(treebag.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("treebag done")
#> [1] "treebag done"
# Construction statistics data frame
stats.table <- cbind(data.frame(Model="RF"), t(rf.conf.mat$overall))
#> Error: object 'rf.conf.mat' not found
#stats.table <- rbind(stats.table, cbind(data.frame(Model = "SVM"), t(svm.conf.mat$overall)))
stats.table <- rbind(stats.table, cbind(data.frame(Model = "C5.0"), t(c5.conf.mat$overall)))
#> Error: object 'stats.table' not found
stats.table <- rbind(stats.table, cbind(data.frame(Model = "XGB"), t(xgb.conf.mat$overall)))
#> Error: object 'stats.table' not found
stats.table <- rbind(stats.table, cbind(data.frame(Model = "DT"), t(treebag.conf.mat$overall)))
#> Error: object 'stats.table' not found
print(stats.table)
#> Error: object 'stats.table' not found