### From Jeppe's code and DTU dataset 


### 2021, december 3 rd
### Julien Rodriguez, Ifremer
### François Danhiez, CapGemini Engineering
### Jeppe Olsen, DTU Aqua 


### Contact
### julien.rodriguez@ifremer.fr 

### What is it about?

# This code aims at giving a toy example on how home-made machine-learning methods and cross-validation can be applied to a dataset
# This code and the associated functions were produced during ICES workshop on using high resolution spatial data for SSF monitoring, 
# SO, please take into account it was produced in less than 2 days of coding activity!
# All the necessary reflexion regarding the purpose of the algorithm (and so, the cross-validation process to be applied for proper qualification), 
# is an absolute prerequisite before performing this kind of process. 
# For this example, the decision was made to apply a ping-level decision rule on fishing/non-fishing.
# depending on the objective, choosing a different level of aggregation might prove to be relevant (for example, a "fishing event", to identify a fishing gear)
# then different aggregated index have to be produced =  average, quantiles, skewness, kurthosis....
# Also a reflexion has to be done on the covariates that can be computed. 
# In that case, these covariates, calculated from the associated functions, have been created for the purpose of the demonstration and are suggestions, 
# but probably not the most relevant! Even sometimes, their computation might be wrong, but this wasn't the objective of the exercice!

Comparison on trip calculation

# remotes::install_github("ices-eg/WKSSFGEO", ref = "pkg")
library(WKSSFGEO)
#> Error in library(WKSSFGEO): there is no package called 'WKSSFGEO'
library(sf)
library(ranger)
library(caret)
#> Error in library(caret): there is no package called 'caret'
library(sfc)
#> Error in library(sfc): there is no package called 'sfc'
library(dplyr)
fld <- getwd()
library(doMC)
#> Error in library(doMC): there is no package called 'doMC'
registerDoMC(20)
#> Error in registerDoMC(20): could not find function "registerDoMC"

The data:

track01 %>% dplyr::glimpse()
#> Error: object 'track01' not found
harbours
#> Error: object 'harbours' not found

Processing:

tracks <- 
  track01 %>% 
  data.table::setDT() %>% 
  # only unique time per veseel, first record retained - which may be the wrong one
  unique(by = c("vessel_id", "time_stamp")) %>% 
  WKSSFGEO::add_harbours(harbours) %>% 
  WKSSFGEO::interpolate_ais() %>% 
  WKSSFGEO::define_trips_pol(min_dur = 0.8, max_dur = 48, 
                             split_trips = T, preserve_all = F)
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
### Identify missing values to retrieve it in the final dataset used to build the model
nnai <- apply( tracks[, c("speed", "course", "behaviour" )], 1, function(x) !anyNA(x))
#> Error: object 'tracks' not found
trips <- unique(tracks$trip_id)
#> Error: object 'tracks' not found
out2 <- tracks
#> Error: object 'tracks' not found
out.ComNewCovar <- do.call( rbind, lapply( trips, function(tp) {
  
  out.trip <- out2[nnai, ] %>% as.data.frame
  out.trip <- out.trip[ out.trip$trip_id %in% tp, ]
  out.NewCovar <- CalcAcceleration(
    CalcDistBetweenNearestNeighbours(
      CalcStraigthness(
        CalcHeading( st_as_sf( out.trip, 
                               coords = c("lon", "lat"), crs = 4326))
        ,   col.Dir = "course"),
      nnn = 9))
  return(out.NewCovar)
  
}))
#> Error: object 'trips' not found
glimpse(out.ComNewCovar)
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar %>% 
  gather(var, value) %>%
  ggplot(aes(x=value)) + 
  geom_histogram(fill="steelblue", alpha=.7) +
  theme_minimal() +
  facet_wrap(~key, scales="free")
#> Error in ggplot(., aes(x = value)): could not find function "ggplot"

Part 2 Build customized randomForest model

out.ComNewCovar$behaviour <-  factor(as.character(out.ComNewCovar$behaviour ))
#> Error: object 'out.ComNewCovar' not found
form <- as.formula( paste( "behaviour", paste("speed", collapse = "+"), sep = "~"))
optim.rf <- tune_RF(formula = form, sf::st_set_geometry(out.ComNewCovar[nnai, ], NULL))
#> Error in tune_RF(formula = form, sf::st_set_geometry(out.ComNewCovar[nnai, : could not find function "tune_RF"
nnn <- 9 # Number of nearest neighbours chosen
covar.names <- 
  c("speed", "course", "abs.HeadingChange", "HEADING.deg", "acceleration", 
    paste0( "DistWithNeighbour_", WKSSFGEO:::set_0nbr(1:nnn), 1:nnn))
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
form <- as.formula( paste( "behaviour", paste(covar.names, collapse = "+"), sep = "~"))
#> Error: object 'covar.names' not found
out.ComNewCovar$behaviour <-  factor(as.character(out.ComNewCovar$behaviour ))
#> Error: object 'out.ComNewCovar' not found

# Identify indexes without missing values (normally, none of them should contain missing values)
nnai <- apply( out.ComNewCovar[, c("behaviour", covar.names)], 1, function(x) !anyNA(x))
#> Error: object 'out.ComNewCovar' not found
!nnai %>% any
#> Error: object 'nnai' not found

# EINAR: the {tune_RF} finds the "optimimum" min.node.size and mtry to be passed
#        to {ranger} in the next step
optim.rf <- WKSSFGEO::tune_RF(formula = form, sf::st_set_geometry(out.ComNewCovar[nnai, ], NULL))
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
optim.rf # Automatically selected hyper-parameters with tune_RF function
#> Error: object 'optim.rf' not found
mod.rf <- ranger::ranger(form, 
                         sf::st_set_geometry(out.ComNewCovar[nnai, ], NULL),  
                         importance = "impurity", 
                         mtry =  optim.rf$mtry, 
                         min.node.size =  optim.rf$min.node.size, 
                         num.trees = 500,
                         write.forest = TRUE)
#> Error: object 'out.ComNewCovar' not found
#saveRDS( list(optim.rf = optim.rf, mod.rf = mod.rf), file= sprintf( "%s/RandomForestModel.rds", fld))



mod.rf$prediction.error
#> Error: object 'mod.rf' not found
# OOB prediction error 11.17%
# Shows variable importance regarding how they contribute to the model
tibble::tibble(var = names(mod.rf$variable.importance),
               val = mod.rf$variable.importance) %>% 
  ggplot2::ggplot() +
  ggplot2::geom_pointrange(ggplot2::aes(reorder(var, val), val, ymin = 0, ymax = val)) +
  ggplot2::coord_flip()
#> Error: object 'mod.rf' not found

# Error matrix
table(mod.rf$predictions, out.ComNewCovar$behaviour[nnai])
#> Error: object 'mod.rf' not found

# Select only the most relevant variables for LDA and QDA
mod.rf$variable.importance %>% sort(decreasing = TRUE)
#> Error: object 'mod.rf' not found
# Course and heading to be avoided because it may lead to overfitting 
# These variables have been kept in randomforest which is less sensitive to colinearity and overfitting

covar.names <- c("speed", "abs.HeadingChange", "acceleration", paste0( "DistWithNeighbour_", c(8,6,2,9)))
form.lda <- as.formula( paste( "behaviour", paste(covar.names, collapse = "+"), sep = "~"))

mod.lda <- MASS::lda(form,  out.ComNewCovar[nnai, ])
#> Error in eval(expr, p): object 'out.ComNewCovar' not found
mod.qda <- MASS::qda(form,  out.ComNewCovar[nnai, ])
#> Error in eval(expr, p): object 'out.ComNewCovar' not found

#saveRDS( list(mod.lda = mod.lda, mod.qda = mod.qda), file= sprintf( "%s/LDAModel.rds", fld))

pred.lda <- predict(mod.lda)
#> Error: object 'mod.lda' not found
pred.qda <- predict(mod.qda)
#> Error: object 'mod.qda' not found
mod.lda %>% names
#> Error: object 'mod.lda' not found
mod.lda$prior
#> Error: object 'mod.lda' not found

out.ComNewCovar$behaviour[nnai] %>% table
#> Error: object 'out.ComNewCovar' not found

table(pred.lda$class, out.ComNewCovar$behaviour[nnai])
#> Error: object 'pred.lda' not found
table(pred.qda$class, out.ComNewCovar$behaviour[nnai])
#> Error: object 'pred.qda' not found

Perform 5 folds Cross-validation on trips to evaluate a realistic accuracy for new predictions

out.ComNewCovar %>% colnames
#> Error: object 'out.ComNewCovar' not found
nnn = 9 ## Number of nearest neighbours chosen
covar.names <- c("speed", "course", "abs.HeadingChange", "HEADING.deg", "acceleration", paste0( "DistWithNeighbour_", WKSSFGEO:::set_0nbr(1:nnn), 1:nnn))
#> Error in loadNamespace(x): there is no package called 'WKSSFGEO'
form <- as.formula( paste( "behaviour", paste(covar.names, collapse = "+"), sep = "~"))

# Identify indexes without missing values (normally, none of them should contain missing values)
nnai <- apply( out.ComNewCovar[, c("behaviour", covar.names)], 1, function(x) !anyNA(x))
#> Error: object 'out.ComNewCovar' not found
!nnai %>% any
#> Error: object 'nnai' not found

## Create a CV design

# Create Subset with 5 folds with a selection based on trip
# If we want to test the capacity to make predictions for a new boat, we could perform CV for boats instead of trips.
trips <- unique(out.ComNewCovar$trip_id)
#> Error: object 'out.ComNewCovar' not found

db.index <- 1:nrow(out.ComNewCovar)
#> Error: object 'out.ComNewCovar' not found
set.seed(021221)
n.samp <- floor(length(trips)/5)
#> Error: object 'trips' not found

CV.subset <- vector(mode = "list", length = 5)

for(k in 1:5){
  
  if(k < 5){
    samp.ids <- sample( trips, size = n.samp)
    trips <- trips[ !trips %in% samp.ids]
  }else{
    samp.ids <- trips
  }
  
  samp.index <- which( out.ComNewCovar$trip_id[nnai] %in% samp.ids)
  CV.subset[[k]] <- samp.index
  
}
#> Error: object 'trips' not found

# Chech the CV.subset and its consistency for CV purpose
trips <- unique(out.ComNewCovar$trip_id)
#> Error: object 'out.ComNewCovar' not found
lapply(CV.subset, length)
#> [[1]]
#> [1] 0
#> 
#> [[2]]
#> [1] 0
#> 
#> [[3]]
#> [1] 0
#> 
#> [[4]]
#> [1] 0
#> 
#> [[5]]
#> [1] 0
lapply(CV.subset, function(x) unique(out.ComNewCovar$trip_id[x]))
#> Error in FUN(X[[i]], ...): object 'out.ComNewCovar' not found
do.call(c, lapply(CV.subset, function(x) unique(out.ComNewCovar$trip_id[x]))) %>% duplicated %>% any
#> Error in FUN(X[[i]], ...): object 'out.ComNewCovar' not found
any(!trips %in% do.call(c, lapply(CV.subset, function(x) unique(out.ComNewCovar$trip_id[x]))))
#> Error: object 'trips' not found

### Create empty columns in dataset to retrieve CV predictions
if( !"predCV.rf" %in% colnames(out.ComNewCovar)){
  
  out.ComNewCovar$predCV.rf <- rep(NA, nrow(out.ComNewCovar))
  out.ComNewCovar$predCV.lda <- rep(NA, nrow(out.ComNewCovar))
  out.ComNewCovar$predCV.qda <- rep(NA, nrow(out.ComNewCovar))
  
  # Perform 5-folds CV
  for(k in 1:5){
    
    samp.index <- sort(CV.subset[[k]])
    training.dataset <- sf::st_set_geometry(out.ComNewCovar[ -samp.index, ], NULL)
    ap.dataset <- sf::st_set_geometry(out.ComNewCovar[ samp.index, ], NULL)
    
    mod.rf.CV <- ranger::ranger( form,   training.dataset ,  
                                 importance = "impurity", mtry =  optim.rf$mtry, min.node.size =  optim.rf$min.node.size, num.trees = 500, write.forest = TRUE)
    
    out.ComNewCovar$predCV.rf[ samp.index ] <- predict(mod.rf.CV, ap.dataset)$predictions
    
    mod.lda.CV <- MASS::lda(form,  training.dataset)
    mod.qda.CV <- MASS::qda(form,  training.dataset)
    
    out.ComNewCovar$predCV.lda[ samp.index ] <- predict(mod.lda.CV, ap.dataset)$class
    out.ComNewCovar$predCV.qda[ samp.index ] <- predict(mod.qda.CV, ap.dataset)$class
    
    #saveRDS(out.ComNewCovar , file= sprintf( "%s/SavedDataWithCVResults.rds", fld))
    
  }
}
#> Error: object 'out.ComNewCovar' not found

### Error contingency matrix
table( out.ComNewCovar$behaviour)
#> Error: object 'out.ComNewCovar' not found
with(out.ComNewCovar[nnai,], table(predCV.lda, behaviour))
#> Error: object 'out.ComNewCovar' not found
with(out.ComNewCovar[nnai,], table(predCV.qda, behaviour))
#> Error: object 'out.ComNewCovar' not found

mod.rf$prediction.error
#> Error: object 'mod.rf' not found
ErrMat.RF <- with(out.ComNewCovar[nnai,], table(predCV.rf, behaviour))
#> Error: object 'out.ComNewCovar' not found
ErrMat.RF
#> Error: object 'ErrMat.RF' not found

# Compute overall accuracy
sum(diag(ErrMat.RF))/(sum(ErrMat.RF))
#> Error: object 'ErrMat.RF' not found

# From 11.2 % error, it is evaluated to 27% with this validation process! 
# which is a more realistic appreciation of that could be achieved from a new dataset
# You can guess the result if cross-validation was performed on boats instead of fishing trips !
out.ComNewCovar %>% dim
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar$vessel_id %>% unique %>% length
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar$gear %>% table
#> Error: object 'out.ComNewCovar' not found
out.ComNewCovar$trip_id %>% unique %>% length
#> Error: object 'out.ComNewCovar' not found
# too few boats in this toy example to do so, but this can be interesting depending on the objective
# This also shows the interest of being able to manage its own validation process instead of being dependent on micro-waved methods for whom the qualification unit is the line 

# but if you look closer, most of the good qualification comes from the non-fishing pings which are not the most interesting ones!

# Event if OOB prediction error could be well to qualify algorithm quality , 
# the input selection is based on rows index which may not be the purpose of the model qualification
# the result shows it is best to make its own validation procedure based on the purpose of machine-learning application
# In my opinion, to be decided depending on user application

# The confusion can also be checked by fishing gear
gears <- out.ComNewCovar$gear  %>% unique
#> Error: object 'out.ComNewCovar' not found

for( g in 1:length(gears)){
  
  print(gears[g])
  print( with(out.ComNewCovar[out.ComNewCovar$gear %in% gears[g],], table(predCV.rf, behaviour)))
  
}
#> Error: object 'gears' not found

ErrMat.RF
#> Error: object 'ErrMat.RF' not found


### Plot for one trip

k = 10
trip = trips[k]
#> Error: object 'trips' not found

plot.new()

print(
  sp::spplot( out.ComNewCovar[out.ComNewCovar$trip_id %in% trip,] %>% as_Spatial, "behaviour"),
  position = c(0,0,.5,1),more=T)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'obj' in selecting a method for function 'spplot': object 'out.ComNewCovar' not found
print(
  sp::spplot( out.ComNewCovar[out.ComNewCovar$trip_id %in% trip,] %>% as_Spatial, "predCV.rf"),
  position = c(.5,0,1,1),more = T)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'obj' in selecting a method for function 'spplot': object 'out.ComNewCovar' not found

Using Caret batch of functions to calibrate more machine learning methods.

#Loading data
import <- out.ComNewCovar
#> Error: object 'out.ComNewCovar' not found
data <- st_drop_geometry(import)
#> Error: object 'import' not found

#Selecting features for the prediction
predictors <-
  c("speed", "HEADING.deg", "abs.HeadingChange", "DistWithNeighbour_1",
    "DistWithNeighbour_2", "DistWithNeighbour_3", "DistWithNeighbour_4",
    "DistWithNeighbour_5", "DistWithNeighbour_6", "DistWithNeighbour_7",
    "DistWithNeighbour_8", "acceleration")

#Define the classification target
target <- c("behaviour")


#Selecting the dataset
variables <- select(data,c(predictors))
#> Error in UseMethod("select"): no applicable method for 'select' applied to an object of class "function"
#Select the target to be predict
target <- data.frame(behaviour=data[,target])
#> Error in data[, target]: object of type 'closure' is not subsettable


#Spliting the dataset into training and validation dataset
set.seed(123)

#Applying a split ratio of 70/30
train_index <- sample(1:nrow(data), nrow(data)*0.70)
#> Error in 1:nrow(data): argument of length 0

# Creating the training set
train.data <- variables[train_index,]
#> Error: object 'variables' not found
train.label <- target[train_index,]
#> Error: object 'train_index' not found
train.data <- cbind(train.data, data.frame(behaviour = train.label))
#> Error: object 'train.data' not found
train.data$behaviour <- as.character(train.data$behaviour)
#> Error: object 'train.data' not found

# Creating the validation set
test.data <- variables[-train_index,]
#> Error: object 'variables' not found
test.label <- target[-train_index,]
#> Error: object 'train_index' not found
test.data <- cbind(test.data,data.frame(behaviour = test.label))
#> Error: object 'test.data' not found
test.data$behaviour <- as.character(test.data$behaviour)
#> Error: object 'test.data' not found

# Training and validation for RandomForest
rf.model <- train(behaviour ~., data = train.data, method = "ranger",
                  trControl = trainControl("cv", number = 10),
                  preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "ranger", trControl = trainControl("cv", : could not find function "train"
rf.pred <- predict(rf.model, test.data)
#> Error: object 'rf.model' not found
rf.conf.mat <- confusionMatrix(as.factor(rf.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(rf.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("rf done")
#> [1] "rf done"
# Training and validation for SVM
# svm.model <- train(behaviour ~., data = train.data, method = "svmRadial",
#                    trControl = trainControl("cv", number = 10),
#                    preProcess = c("center", "scale"))
# svm.pred <- predict(svm.model, test.data)
# svm.conf.mat <- confusionMatrix(as.factor(svm.pred), as.factor(test.label))

# Training and validation for C5.0
c5.model <- train(behaviour ~., data = train.data, method = "C5.0",
                  trControl = trainControl("cv", number = 10),
                  preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "C5.0", trControl = trainControl("cv", : could not find function "train"
c5.pred <- predict(c5.model, test.data)
#> Error: object 'c5.model' not found
c5.conf.mat <- confusionMatrix(as.factor(c5.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(c5.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("c5 done")
#> [1] "c5 done"
# Training and validation for XGBoost
xgb.model <- train(behaviour ~., data = train.data, method = "xgbTree",
                   trControl = trainControl("cv", number = 10),
                   preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "xgbTree", trControl = trainControl("cv", : could not find function "train"
xgb.pred <- predict(xgb.model, test.data)
#> Error: object 'xgb.model' not found
xgb.conf.mat <- confusionMatrix(as.factor(xgb.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(xgb.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("xgb done")
#> [1] "xgb done"
# Training and validation for Decision Tree
treebag.model <- train(behaviour ~., data = train.data, method = "treebag",
                       trControl = trainControl("cv", number = 10),
                       preProcess = c("center", "scale"))
#> Error in train(behaviour ~ ., data = train.data, method = "treebag", trControl = trainControl("cv", : could not find function "train"
treebag.pred <- predict(treebag.model, test.data)
#> Error: object 'treebag.model' not found
treebag.conf.mat <- confusionMatrix(as.factor(treebag.pred), as.factor(test.label))
#> Error in confusionMatrix(as.factor(treebag.pred), as.factor(test.label)): could not find function "confusionMatrix"
print("treebag done")
#> [1] "treebag done"
# Construction statistics data frame
stats.table <- cbind(data.frame(Model="RF"), t(rf.conf.mat$overall))
#> Error: object 'rf.conf.mat' not found
#stats.table <- rbind(stats.table, cbind(data.frame(Model = "SVM"),  t(svm.conf.mat$overall)))
stats.table <- rbind(stats.table, cbind(data.frame(Model = "C5.0"), t(c5.conf.mat$overall)))
#> Error: object 'stats.table' not found
stats.table <- rbind(stats.table, cbind(data.frame(Model = "XGB"),  t(xgb.conf.mat$overall)))
#> Error: object 'stats.table' not found
stats.table <- rbind(stats.table, cbind(data.frame(Model = "DT"),   t(treebag.conf.mat$overall)))
#> Error: object 'stats.table' not found
print(stats.table)
#> Error: object 'stats.table' not found