datacall

Preamble

This document attempts to emulate the process behind the (2024) datacall. There are some deviations in the code flow, in particular where a less terse approach can easily be implemented.

libraries

library(vmstools)
library(sf)
library(tidyverse)
source("~/stasi/ices/ICES-VMS-and-Logbook-Data-Call/0_global_functions.R")
library(conflicted)
conflicts_prefer(dplyr::filter, dplyr::lag)
# keep the features of x that intersect y
rb_st_keep <- function(x, y) {
  i <- sf::st_intersects(x, y) |> lengths() > 0
  x <- x[i, ]
  return(x)
}
# drop the features of x that intersect y
rb_st_remove <- function(x, y) {
  i <- lengths(sf::st_intersects(x, y)) == 0
  x <- x[i, ]
  return(x)
}
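
Both helpers are thin wrappers around sf::st_intersects(): rb_st_keep() keeps the features of x that intersect y, rb_st_remove() drops them. A minimal usage sketch (pts and polys are hypothetical sf objects):

pts_inside  <- rb_st_keep(pts, polys)    # keep features of pts that fall within polys (hypothetical objects)
pts_outside <- rb_st_remove(pts, polys)  # drop features of pts that fall within polys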

Globals

# Setting thresholds
spThres       <- 20   # Maximum speed threshold used in analyses, in knots
intThres      <- 5    # Minimum difference in time interval in minutes to prevent pseudo duplicates
intvThres     <- 240  # Maximum difference in time interval in minutes to prevent unrealistic intervals
lanThres      <- 1.5  # Maximum difference in log10-transformed sorted weights

# Set the years to submit
yearsToSubmit <- c(1803:1804)

# Set the gear names for which automatic fishing activity is wanted
autoDetectionGears <- c("TBB","OTB","OTT", "OTM","SSC","SDN","DRB","PTB","HMD", "MIS")

# Decide if you want to visually analyze speed-histograms to identify fishing activity peaks
visualInspection <- FALSE

# Specify how landings should be distributed over the VMS pings
linkEflaloTacsat <- c("trip")

# Extract valid level 6 metiers 
valid_metiers <- data.table::fread("https://raw.githubusercontent.com/ices-eg/RCGs/master/Metiers/Reference_lists/RDB_ISSG_Metier_list.csv")$Metier_level6

ices_geartype        <-  icesVocab::getCodeList("GearType")
ices_fishingactivity <-  icesVocab::getCodeList("Metier6_FishingActivity")

# speed allocation
speed <-
  tribble(~met, ~s1, ~s2)
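
The speed table is left empty at this point; once per-metier speed thresholds have been decided it could be filled along these lines (the metier codes and bounds below are purely illustrative, not taken from the datacall):

# illustrative values only - not from the datacall
speed <-
  tribble(~met,                ~s1, ~s2,
          "OTB_DEF_>=120_0_0",   2,   4,
          "TBB_DEF_70-99_0_0",   2.5, 7)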

Auxiliary data

data(harbours)
data(ICESareas)

ices <- 
  ia <-
  ICESareas |> 
  select(area = Area_27)
sf_use_s2(TRUE)
harbours <- 
  harbours |> 
  dplyr::mutate(harbour = iconv(harbour, from = "latin1", to = "UTF-8")) |> 
  tibble::as_tibble() |> 
  sf::st_as_sf(coords = c("lon", "lat"),
               crs = 4326) |> 
  sf::st_buffer(dist = 3000) |>  # radius
  dplyr::select(harbour)
#sf_use_s2(FALSE)
# may be used to map between the short column names and the vmstools names
dictionary <- 
  tribble(~iso, ~vt,
          "dttm", "SI_DATIM",
          "date", "SI_DATE",
          "time", "SI_TIME",
          "vid", "VE_REF",
          "D1", "FT_DDAT",
          "D2", "FT_LDAT")

Uses vmstools dataset

data(tacsat)
# leap-year issue
tacsat <- 
  tacsat |> 
  dplyr::mutate(SI_DATE = stringr::str_replace(SI_DATE, "1800", "1803"),
                SI_DATE = stringr::str_replace(SI_DATE, "1801", "1804")) |> 
  dplyr::mutate(tmp = dmy_hms(paste(SI_DATE, SI_TIME), tz = "UTC")) |> 
  # get order upfront (could have used sortTacsat)
  dplyr::arrange(VE_REF, tmp) |> 
  dplyr::select(-tmp) |> 
  dplyr::mutate(.ridt = 1:n(), 
                .before = VE_COU)
# for the alternative (tidyverse) flow
ais <- 
  tacsat |> 
  tibble::as_tibble() |> 
  tidyr::unite(col = "time", SI_DATE, SI_TIME, sep = " ") |>
  dplyr::mutate(time = lubridate::dmy_hms(time, tz = "UTC")) |> 
  dplyr::select(.ridt,
                vid = VE_REF,
                time,
                lon = SI_LONG,
                lat = SI_LATI,
                speed = SI_SP,
                hd = SI_HE)

data(eflalo)
# leap-year issue
eflalo <- 
  eflalo |> 
  dplyr::mutate(FT_DDAT = stringr::str_replace(FT_DDAT, "1800", "1803"),
                FT_DDAT = stringr::str_replace(FT_DDAT, "1801", "1804"),
                FT_LDAT = stringr::str_replace(FT_LDAT, "1800", "1803"),
                FT_LDAT = stringr::str_replace(FT_LDAT, "1801", "1804"),
                LE_CDAT = stringr::str_replace(LE_CDAT, "1800", "1803"),
                LE_CDAT = stringr::str_replace(LE_CDAT, "1801", "1804")) |> 
  # get order upfront
  dplyr::mutate(tmp = dmy_hms(paste(FT_DDAT, FT_DTIME), tz = "UTC")) |> 
  dplyr::arrange(VE_REF, tmp) |> 
  dplyr::select(-tmp) |> 
  dplyr::mutate(.ride = 1:n(), 
                .before = VE_REF) |> 
  dplyr::rename(LE_MET = LE_MET_level6)
lb <- 
  eflalo |> 
  tibble::as_tibble() |> 
  dplyr::rename(d1 = LE_CDAT) |> 
  # probably not kosher - impute missing haul start/end times with the day's bounds
  dplyr::mutate(LE_STIME = dplyr::case_when(is.na(LE_STIME) ~ "00:00:01",
                                            .default = LE_STIME),
                LE_ETIME = dplyr::case_when(is.na(LE_ETIME) ~ "23:59:59",
                                            .default = LE_ETIME)) |> 
  tidyr::unite(col = "T1", FT_DDAT, FT_DTIME, sep = " ") |> 
  tidyr::unite(col = "T2", FT_LDAT, FT_LTIME, sep = " ") |> 
  dplyr::mutate(T1 = lubridate::dmy_hms(T1, tz = "UTC"),
                T2 = lubridate::dmy_hms(T2, tz = "UTC"),
                t1 = dmy_hms(paste(d1, LE_STIME), tz = "UTC"),
                t2 = dmy_hms(paste(d1, LE_ETIME), tz = "UTC"),
                d1 = lubridate::dmy(d1),
                .after = LE_ID) |> 
  # "haul can not be before nor after trip start-end
  dplyr::mutate(t1 = dplyr::case_when(t1 < T1 ~ T1,
                                      .default = t1),
                t2 = dplyr::case_when(t2 > T2 ~ T2,
                                      .default = t2)) |> 
  select(-c(VE_COU, VE_LEN, VE_KW, VE_TON,
            FT_DCOU, FT_DHAR, FT_LCOU, FT_LHAR,
            LE_STIME, LE_ETIME,
            LE_SLAT, LE_SLON, LE_ELAT, LE_ELON,
            LE_EFF_VMS))
# get rid of species stuff for now
lb <- 
  lb |> 
  dplyr::select(-(LE_KG_ANE:LE_EURO_SWO)) |> 
  # in the end just do this
  dplyr::select(.ride, 
                vid = VE_REF,
                fleet = VE_FLT,
                T1,
                T2,
                d1,
                .tid = FT_REF,
                gid = LE_GEAR,
                ir = LE_RECT,
                .sid = LE_ID,
                # t1, t2,
                mesh = LE_MSZ,
                met6 = LE_MET,
                div = LE_DIV,
                effort = LE_EFF,
                unit = LE_UNIT)

Preprocessing

Positions

tacsat <- formatTacsat(tacsat)
# 1.2 Clean the TACSAT data ----------------------------------------------------

## 1.2.0 Keep track of removed points ------------------------------------------
#    not done here

## 1.2.1 Remove VMS pings outside the ICES areas -------------------------------
#  ia <- transform_to_sf(ICESareas, coords = c("SI_LONG", "SI_LATI"))
tacsat <- transform_to_sf(tacsat, coords = c("SI_LONG", "SI_LATI"))
# Make ia valid and transform it - redundant, already done
#ia <- ia %>%
#  sf::st_make_valid() %>%
#  sf::st_transform(4326) |> 
#  sf::st_zm()
# Find intersections
overs <- sf::st_intersects(tacsat, ia)
tacsat <- tacsat[lengths(overs) > 0,]

## 1.2.2 Remove duplicate records ----------------------------------------------
# Convert SI_DATE and SI_TIME to POSIXct
tacsat$SI_DATIM <- 
  as.POSIXct(paste(tacsat$SI_DATE, tacsat$SI_TIME), tz = "GMT", format = "%d/%m/%Y  %H:%M")
# Create a unique identifier for each row
tacsat$unique_id <- paste(tacsat$VE_REF, tacsat$SI_LATI, tacsat$SI_LONG, tacsat$SI_DATIM)
# Remove duplicates based on the unique identifier
tacsat <- tacsat[!duplicated(tacsat$unique_id), ]

## 1.2.3 Remove points that have impossible coordinates ------------------------
# Extract coordinates from tacsat
coords <- st_coordinates(tacsat)
# Check for impossible positions
invalid_positions <- which(coords[,2] > 90 | coords[,2] < -90 | coords[,1] > 180 | coords[,1] < -180)
if (length(invalid_positions) > 0) {
  # Print the invalid positions
  print(tacsat[invalid_positions,])
  # Remove points with impossible positions
  tacsat <- tacsat[-invalid_positions,]
}

## 1.2.4 Remove points which are pseudo duplicates -----------------------------
#    as they have an interval rate < x minutes ---
# Sort tacsat and calculate intervals
tacsat <- sfsortTacsat(tacsat)
tacsat$INTV <- 
  intervalTacsat(as.data.frame(tacsat), 
                 level = "vessel", 
                 fill.na = TRUE)$INTV
# Remove rows with small intervals
tacsat <- tacsat[tacsat$INTV >= intThres, ]
# Remove INTV column from tacsat
tacsat$INTV <- NULL

## 1.2.5 Remove points in harbour ----------------------------------------------
# Find intersections
overs <- sf::st_intersects(tacsat, harbours)
# Filter tacsat
tacsat <- tacsat[!(lengths(overs) > 0),]
tacsat <- as.data.frame(tacsat)
tacsat <- tacsat %>% dplyr::select(-geometry)
tacsat <- tacsat %>% dplyr::select(-unique_id)
ais <- 
  ais |> 
  sf::st_as_sf(coords = c("lon", "lat"),
               crs = 4326,
               remove = FALSE) |> 
  # should already be done
  dplyr::arrange(vid, time) |> 
  ## 1.2.1 Remove VMS pings outside the ICES areas -----------------------------
  #        takes a bit of time
  rb_st_keep(ices) |> 
  ## 1.2.2 Remove duplicate records --------------------------------------------
  dplyr::distinct(vid, time, lon, lat, .keep_all = TRUE) |> 
  ## 1.2.3 Remove points that have impossible coordinates ----------------------
  #        redundant given step 1.2.1
  ## 1.2.4 Remove points which are pseudo duplicates ----------------------------
  dplyr::group_by(vid) |> 
  # this is not by convention
  dplyr::mutate(dt = difftime(time, lag(time), units = "mins"),
                dt = as.numeric(dt)) |> 
  tidyr::fill(dt, .direction = "up") |> 
  dplyr::ungroup() |> 
  dplyr::filter(dt >= intThres) |> 
  ## 1.2.5 Remove points in harbour ---------------------------------------------
  rb_st_remove(harbours)
#dplyr::filter(lengths(st_intersects(geometry, harbours$geometry)) == 0)
identical(tacsat$.ridt, ais$.ridt)
[1] TRUE

Logbooks

## 1.3.1 Keep track of removed points -----------------------------------------
#   not done here

## 1.3.2 Warn for outlying catch records ---------------------------------------

## 1.3.3 Remove non-unique trip numbers ----------------------------------------

# Apply the trip ID function to the eflalo data frame
#   the function name is a bit of a misnomer
trip_id <- create_trip_id(eflalo)
# Remove records with non-unique trip identifiers
eflalo <- eflalo[!duplicated(trip_id), ]

## 1.3.4 Remove impossible time stamp records ----------------------------------
# Apply the convert to date-time function to the FT_DDAT and FT_DTIME columns
eflalo$FT_DDATIM <- convert_to_datetime(eflalo$FT_DDAT, eflalo$FT_DTIME)
# Apply the function to the FT_LDAT and FT_LTIME columns
eflalo$FT_LDATIM <- convert_to_datetime(eflalo$FT_LDAT, eflalo$FT_LTIME)
# Remove records with NA in either FT_DDATIM or FT_LDATIM
eflalo <- eflalo[!is.na(eflalo$FT_DDATIM) & !is.na(eflalo$FT_LDATIM), ]

## 1.3.5 Remove trip starting before 1st Jan -----------------------------------
# Call the remove before january function with the appropriate arguments
# not run in test
if(FALSE) eflalo <- remove_before_jan(eflalo, year)

## 1.3.6 Remove records with arrival date before departure date  ---------------
# Find the indices of rows where 'FT_LDATIM' is greater than or equal to 'FT_DDATIM'
idx <- which(eflalo$FT_LDATIM >= eflalo$FT_DDATIM)
# Keep only the rows in 'eflalo' where 'FT_LDATIM' is greater than or equal to 'FT_DDATIM'
eflalo <- eflalo[idx,]

## 1.3.7 Remove trip with overlap with another trip ----------------------------
# Order 'eflalo' by 'VE_COU', 'VE_REF', 'FT_DDATIM', and 'FT_LDATIM'
eflalo <- orderBy(~ VE_COU + VE_REF + FT_DDATIM + FT_LDATIM, data = eflalo)

# If a trip (same departure and return times) has more than one FT_REF, assign all of them the same (first) FT_REF.
dt1 <- data.table::data.table(eflalo)[,.(VE_REF, FT_REF, FT_DDATIM, FT_LDATIM)]
dt1 <- unique(dt1, by = c("VE_REF", "FT_REF"))
data.table::setkey(dt1, VE_REF, FT_DDATIM, FT_LDATIM)
dt2 <- dt1[, ref := .N > 1, by = data.table::key(dt1)][ref == T]
dt3 <- dt2[,.(FT_REF_NEW = FT_REF[1]), by = .(VE_REF, FT_DDATIM, FT_LDATIM)]
dt4 <- merge(dt2, dt3)
eflalo2 <- merge(data.table::data.table(eflalo), dt4, all.x = T)
eflalo2[!is.na(FT_REF_NEW), FT_REF := FT_REF_NEW]
eflalo2[, FT_REF_NEW := NULL]
eflalo <- data.frame(eflalo2)
eflalo <- eflalo %>% select(-ref)

# Create a data table 'dt1' with the necessary columns from 'eflalo'
dt1 <- data.table::data.table(ID = eflalo$VE_REF, FT = eflalo$FT_REF,
                              startdate = eflalo$FT_DDATIM,
                              enddate = eflalo$FT_LDATIM)
# Remove duplicate rows from 'dt1'
dt1 <- dt1[!duplicated(paste(dt1$ID, dt1$FT)), ]
# Set keys for 'dt1' for efficient joining and overlapping
data.table::setkey(dt1, ID, startdate, enddate)
# Find overlapping trips in 'dt1'
result <- data.table::foverlaps(dt1, dt1, by.x = c("ID", "startdate", "enddate"),
                                by.y = c("ID", "startdate", "enddate"))
# Filter 'result' to get only the rows where trips overlap
overlapping.trips <- subset(result, startdate < i.enddate & enddate > i.startdate & FT != i.FT)
# If there are overlapping trips, remove them from 'eflalo' and save them to a file
if (nrow(overlapping.trips) > 0) {
  eflalo <- eflalo[!eflalo$FT_REF %in% overlapping.trips$FT, ]
  print("THERE ARE OVERLAPPING TRIPS IN THE DATASET -> SEE THE FILE overlappingTrips SAVED IN THE RESULTS FOLDER")
  save(overlapping.trips, file = file.path(outPath, paste0("overlappingTrips", year, ".RData")))
} 

#'----------------------------------------------------------------------------
# 1.4 METIERS ICES Vocabulary Quality Control ----------------------------------------
#'----------------------------------------------------------------------------
#'
#' Check the fields with the related metier ICES vocabularies prior to the analysis block (2_eflalo_tacsat_analysis.R).
#' Some functions in that analysis will raise errors if values are not in the valid controlled vocabulary.
#' 

## 1.4.1 Check Metier L4 (Gear) categories are accepted ------------------------
m4_ices <- icesVocab::getCodeList("GearType")
table(eflalo$LE_GEAR %in% m4_ices$Key)   # TRUE records accepted in DATSU, FALSE aren't

TRUE 
4446 
# Get summary of DATSU valid/not valid records
eflalo[!eflalo$LE_GEAR %in% m4_ices$Key, ] %>% group_by(LE_GEAR) %>% dplyr::select(LE_GEAR) %>% tally()
# A tibble: 0 × 2
# ℹ 2 variables: LE_GEAR <chr>, n <int>
# Correct or dismiss invalid records (if any) and keep only the valid ones
eflalo <- eflalo %>% filter(LE_GEAR %in% m4_ices$Key)

## 1.4.2 Check Metier L6 (Fishing Activity) categories are accepted -----------
m6_ices <- icesVocab::getCodeList("Metier6_FishingActivity")
table(eflalo$LE_MET %in% m6_ices$Key)   # TRUE records accepted in DATSU, FALSE aren't

FALSE  TRUE 
  738  3708 
# Get summary of DATSU valid/not valid records
eflalo[!eflalo$LE_MET %in% m6_ices$Key, ] %>% group_by(LE_MET) %>% dplyr::select(LE_MET) %>% tally()
# A tibble: 8 × 2
  LE_MET                 n
  <chr>              <int>
1 DRB_MOL_0_0_0         58
2 FPO_CRU_0_0_0         24
3 GNS_DEF_0_0_0         11
4 GNS_DEF_UND_0_0       14
5 LH_FIN_0_0_0           9
6 MIS_UND_UND_0_0      581
7 OTM_SPF_UND_0_0       36
8 TBB_DEF_90-119_0_0     5
# Correct any invalid records and keep only the valid ones
eflalo <- eflalo %>% filter(LE_MET %in% m6_ices$Key)
## 1.3.1 Keep track of removed points -----------------------------------------
#   not done here
## 1.3.2 Warn for outlying catch records ---------------------------------------
## 1.3.3 Remove non-unique trip numbers -----------------------------------------
lb <- 
  lb |> 
  # I would also use vessel id here, but that is another matter
  distinct(.sid, d1, .keep_all = TRUE) |> 
  ## 1.3.4 Remove impossible time stamp records --------------------------------
  filter(!is.na(T1) & !is.na(T2)) |> 
  ## 1.3.5 Remove trip starting before 1st Jan ----------------------------------
  #        not run here, have multiple years
  ## 1.3.6 Remove records with arrival date before departure date ---------------
  filter(T2 >= T1) |> 
  ## 1.3.7 Remove trip with overlap with another trip ---------------------------
  #        NOT IMPLEMENTED because of lack of understanding
  #        If a trip (same depart and return times) has more than one FT_REF, 
  #        make them all into the same (first) FT_REF
  #        (a possible tidyverse sketch is given below)
  nest(.by = c(vid, fleet, .tid, T1, T2)) |> 
  group_by(vid, fleet) |> 
  mutate(issues = 
           case_when(
             T1 > T2 ~ "arrival before departure",
             # NOTE: SHOULD REALLY BE A CAUSE FOR A DROP
             #       so in production remove the prefix
             T1 == T2 ~ "0_arrival same as departure",    
             T2 > lead(T1) ~ "next departure before current arrival",
             lag(T2) > T1 ~ "previous arrival after current departure",
             row_number() == 1 ~ "0_first row in a group", 
             row_number() == max(row_number()) ~ "0_last row in a group",
             .default = "0_no issues")) |> 
  ungroup() |> 
  filter(stringr::str_starts(issues, "0_")) |> 
  select(-issues) |> 
  unnest(data) |> 
  ## 1.4.1 Check Metier L4 (Gear) categories are accepted -----------------------
  filter(gid %in% ices_geartype$Key) |> 
  ## 1.4.2 Check Metier L6 (Fishing Activity) categories are accepted -----------
  filter(met6 %in% ices_fishingactivity$Key)
eflalo <- eflalo |> arrange(.ride)
near(eflalo$.ride, lb$.ride) |> table()

TRUE 
3708 
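
The FT_REF consolidation skipped in step 1.3.7 of the lb pipeline could be mirrored in the tidyverse flow roughly as follows, analogous to the data.table code used on eflalo above: trips of the same vessel that share departure and return times all get the .tid of the first such trip. A sketch only, not run here:

# sketch only, not run: give all trips of a vessel with identical departure
# and return times the first .tid within that group
lb |> 
  group_by(vid, T1, T2) |> 
  mutate(.tid = first(.tid)) |> 
  ungroup()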

Analysis

bla, bla, bla, …

# 2.2 Assign EFLALO fishing trip information (gear, vessel, length, etc.) to VMS records in TACSAT ----
# 
# Assign an EFLALO trip identifier (FT_REF) to each VMS record in TACSAT.
# The method assigns a fishing trip to those VMS records whose date/time falls between the trip's departure and return-to-port dates.
tacsatp <- mergeEflalo2Tacsat(eflalo,tacsat)
tacsatp <- data.frame(tacsatp)

This can be done via:

ais2 <- 
  ais |> 
  left_join(lb |> select(vid, .tid, T1, T2) |> distinct(),
            by = join_by(vid, between(time, T1, T2))) |> 
  select(-c(T1, T2)) |> 
  # to be compliant with vmstools:mergeEflalo2Tacsat
  mutate(.tid = replace_na(.tid, "0"))
near(tacsat$.ridt, ais2$.ridt) |> table(useNA = "ifany")

 TRUE 
67699 
(tacsatp$FT_REF == ais2$.tid) |> table(useNA = "ifany")

 TRUE 
67699