Assign event variables to the vms pings

Libraries and functions

library(data.table)
library(tidyverse)

The trip_assign function below is based on issue #52. Only change:

Functions documentation initiated
The ‘haul_logbook’ part was stripped (consider this temporary)
Some opinionated comments where added/amended
Added a suppressWarning in one line

#' Assign logbook events to vessel positions
#'
#' 
#' The script section associated with haul_logbooks has been removed from this
#' version of the function.Thus the argument 'haul_logbooks' is inactive.
#' 
#' The approach:
#' 
#' * For pings where catch date is reported in the logbooks allocate the gear with the highest
#' recorded catches within the catch date (no aggregation none a priori within a catch date).
#' * For pings outside reported logbook catch dates allocate the gear with the highest trip
#' catch.
#' 
#' @param tacsatp Vessel position, tacsat format 
#' @param eflalo Logbooks, eflalo format
#' @param col An atomic character (default "LE_GEAR"), normally "LE_GEAR", "LE_MZS", "LE_MET"
#' @param haul_logbook Boolean (default FALSE), **not active** in this version
#'ot
#' @return A dataframe, returning only rows where trip has more than one gear (or other variables
#' specified by argument 'col', with additional/overwritten variable specified in the col argument.
#' @export
#'
trip_assign <- function(tacsatp, eflalo, col = "LE_GEAR", haul_logbook = FALSE) {
  
  # Count the number of gear (or other variables specified by argument 'col') per trip
  #  The name of the count variable is by default 'V1'
  tst <- data.table(eflalo)[!is.na(get(col)),.(uniqueN(get(col))), by=.(FT_REF)]
  
  # Use/filter only **trips** with more then **one** gear (or other ...)
  # Einar's comment: Think this works where trips has only **one** gear (or other ...)
  #                  So the usage of conditional if is questionable
  #                  If dropped amend/simplify the code in the script, e.g. no row-bind needed
  if(nrow(tst[V1>1]) > 0 ) {
    
    e <- data.table(eflalo)[FT_REF %in% tst[V1>1]$FT_REF]
    tz <- data.table(tacsatp)[FT_REF  %in% tst[V1>1]$FT_REF]
    suppressWarnings(tz[, (col) := NULL])  # Any prior allocation is nullified
    
    # Allocate gear (or other variable) with the highest catch within a **date** (LE_CDAT) 
    #  Note this leaves pings within a trip on other dates than catch dates as NA, that being dealt with downstream
    order_col <- "LE_KG_TOT"  # the column you want to order by (string)
    e2 <- e[order(-get(order_col)),               # order the records by total LE_KG, the negative ensures descending order (highest first) 
            .(get(col)[1L]),                      # get the gear corresponding with the higest catch
            by = .(FT_REF, LE_CDAT)]              #   .... within a trip and catch date
    names(e2) <- c("FT_REF", "LE_CDAT", col)
    # Einar's comment; I think usage of "many-to-many" and followed by the unique argument may invite trouble
    #                  At minimum a justification for usage of these two is warranted. I.e. why?
    tz <- tz |>
      left_join(e2, by = c("FT_REF" = "FT_REF", "SI_DATE" = "LE_CDAT"), relationship = "many-to-many")
    tz <- unique(tz)  #%>%  as.data.frame()
    
    # Einar - STRIPPED if(haul_logbook)
    
    # For vms **dates** where logbook catch is not reported we still have not allocated any gear (those
    #  pings are NA's at the moment). For those pings (non-reported fishing days) allocate the 
    #  gear that has the highest catch within the **trip** (FT_REF) 
    
    if(nrow(  tz[is.na(get(col))] ) > 0){
      ft_ref_isna = tz %>%  filter ( is.na ( get(col)  )) %>%  distinct(FT_REF) %>% pull()
      tz2 = tz %>%  filter ( FT_REF %in% ft_ref_isna & is.na ( get ( col)  )  ) %>%  as.data.frame()
      e2 = e %>%  filter ( FT_REF %in% ft_ref_isna )
      
      if(!"LE_KG_TOT" %in% names(e2)){
        idxkgeur <- colnames(e2)[grepl("LE_KG_|LE_EURO_", colnames(e2))]
        # Calculate the total KG and EURO for each row
        e2$LE_KG_TOT <- rowSums(e2[,..idxkgeur], na.rm = TRUE)
      }
      
      highvalue <- e2[,.(LE_KG_TOT = sum(LE_KG_TOT, na.rm = T)), by = .(FT_REF, get(col))]
      highvalue <- highvalue[,.(get[which.max(LE_KG_TOT)]), by = .(FT_REF)]
      names(highvalue) <-  c("FT_REF", col)
      
      tx2 <- tz2
      tx2 = tx2 %>%  dplyr::select ( - any_of( col )  )
      tz2 <- tx2 %>%  inner_join ( highvalue, by =  "FT_REF")
      
    }
    
    tz = tz %>% filter(!is.na(get(col)))
    if(exists("tz2")) {tz = rbind(tz, tz2)}
    tz = tz |> as.data.frame()
    
    return(tz)
    # Einar's comment: The else can be dropped if I am right about the stuff mentioned
    #                  at the start of the if-statement
  } else {
    warning(paste("No more than one value for ", col, " in EFLALO trips"))
    return(data.frame())
  }
}

The `trip_assign` function - digging under the hood

One way to try to understand or test a function is to generate short bare minimum datasets. Here we just focus on gear, creating two trips, one where single gear is used another with two gears and three event records (one gear has catch reported in two different rectangles):

eflalo <- 
  tribble(
    ~VE_REF,  ~FT_REF, ~LE_GEAR, ~LE_ID,     ~LE_CDAT, ~LE_RECT, ~LE_KG_TOT,
      "two",  "trip1",    "GNS",    "1", "01/01/1901",  "one",    10000,   # trip w.  1 gear
      "two",  "trip2",    "OTB",    "1", "02/01/2001",  "one",       99,   # trip w. >1 gear
      "two",  "trip2",    "OTM",    "2", "02/01/2001",  "two",      100,
      "two",  "trip2",    "OTB",    "3", "02/01/2001",  "two",       99)
tacsatp <-
  tribble(~.id, ~VE_REF,  ~FT_REF,     ~SI_DATE, ~comment,
          1L,   "two",  "trip1", "01/01/1901",   "this gets dropped",
          2L,   "two",  "trip2", "01/01/2001",   "outbound, no lb record",
          3L,   "two",  "trip2", "02/01/2001",   "lb record",
          4L,   "two",  "trip2", "02/01/2001",   "lb record",
          5L,   "two",  "trip2", "02/01/2001",   "lb record",
          6L,   "two",  "trip2", "03/01/2001",   "inbound, no lb record"
          )

Take note:

Trip 1 has only a single gear
Trip 2 contains more than one gear. Here the highest individual event record of catches on day “02/01/2001” is 100 kg where gear OTM was used.
On that date however the gear with the highest total catches is OTB (99 + 99 = 198 kg)

Let’s see what happens when confronting these data with trip_assign function.

trip_assign(tacsatp, eflalo) |> 
  arrange(.id)

  .id VE_REF FT_REF    SI_DATE                comment LE_GEAR
1   2    two  trip2 01/01/2001 outbound, no lb record     OTB
2   3    two  trip2 02/01/2001              lb record     OTM
3   4    two  trip2 02/01/2001              lb record     OTM
4   5    two  trip2 02/01/2001              lb record     OTM
5   6    two  trip2 03/01/2001  inbound, no lb record     OTB

So, this confirms that the function is correctly interpreted as indicated in the above function documentation. Just to repeat it using slightly different wording:

If fishing is reported on a particular catch date (LE_CDAT) allocate the gear that has the highest event catches within that date to the pings for that date.
For pings falling on dates where no catch is reported use the gear that has the highest summed catches over the whole trip.

Other things to note:

The function does not return trip records where the a single gear was used
The order of the returned dataframe is not the same as in the original dataframe.

For now I refrain from having an opinion about why things are done this way, the objective here was just to test my understanding of what the trip_assign function does.

Trial using pure tidyverse approach

If I were to emulate this in pure tidyverse I would try this:

Create a table from logbooks only containing the gear with the highest reported event catch within a fishing date
Create a table from logbooks only containing the gear with the highest trip catches
Join both these tables to the vms first and then do the final gear allocation to each ping

Let’s just take this step at a time:

highest_event_catches <-
  eflalo |> 
  arrange(desc(LE_KG_TOT)) |> 
  group_by(VE_REF, FT_REF, LE_CDAT) |> 
  slice(1) |> 
  ungroup() |> 
  select(VE_REF, FT_REF, LE_CDAT, gear_maxkg_event = LE_GEAR)
tacsatp |> 
  left_join(highest_event_catches,
            by = join_by(VE_REF, FT_REF, SI_DATE == LE_CDAT),
            relationship = "many-to-one") |> 
  knitr::kable(caption = "Step 1: Gear allocation based on the highest event catces")

Step 1: Gear allocation based on the highest event catces
.id	VE_REF	FT_REF	SI_DATE	comment	gear_maxkg_event
1	two	trip1	01/01/1901	this gets dropped	GNS
2	two	trip2	01/01/2001	outbound, no lb record	NA
3	two	trip2	02/01/2001	lb record	OTM
4	two	trip2	02/01/2001	lb record	OTM
5	two	trip2	02/01/2001	lb record	OTM
6	two	trip2	03/01/2001	inbound, no lb record	NA

Gear with highest trip catch

highest_trip_catches <-
  eflalo |> 
  group_by(VE_REF, FT_REF, LE_GEAR) |> 
  summarise(LE_KG_TOT = sum(LE_KG_TOT),
            .groups = "drop") |> 
  arrange(desc(LE_KG_TOT)) |> 
  group_by(VE_REF, FT_REF) |> 
  slice(1) |> 
  ungroup() |> 
  select(VE_REF, FT_REF, gear_maxkg_trip = LE_GEAR)
tacsatp |> 
  left_join(highest_trip_catches,
            by = join_by(VE_REF, FT_REF),
            relationship = "many-to-one") |> 
  knitr::kable(caption = "Step 2: Gear allocation based on the highest trip catces")

Step 2: Gear allocation based on the highest trip catces
.id	VE_REF	FT_REF	SI_DATE	comment	gear_maxkg_trip
1	two	trip1	01/01/1901	this gets dropped	GNS
2	two	trip2	01/01/2001	outbound, no lb record	OTB
3	two	trip2	02/01/2001	lb record	OTB
4	two	trip2	02/01/2001	lb record	OTB
5	two	trip2	02/01/2001	lb record	OTB
6	two	trip2	03/01/2001	inbound, no lb record	OTB

All steps above and then munge:

tacsatp |> 
  left_join(highest_event_catches,
            by = join_by(VE_REF, FT_REF, SI_DATE == LE_CDAT),
            relationship = "many-to-one") |> 
  left_join(highest_trip_catches,
            by = join_by(VE_REF, FT_REF),
            relationship = "many-to-one") |> 
  mutate(LE_GEAR = case_when(!is.na(gear_maxkg_event) ~ gear_maxkg_event,
                             is.na(gear_maxkg_event) & !is.na(gear_maxkg_trip) ~ gear_maxkg_trip,
                             # capture possible bugs
                             .default = "Unexpected"),
         .after = SI_DATE) |> 
  select(.id, VE_REF, FT_REF, SI_DATE, LE_GEAR) |> 
  knitr::kable(caption = "Step 1, 2 & 3: Final gear allocations to pings")

Step 1, 2 & 3: Final gear allocations to pings
.id	VE_REF	FT_REF	SI_DATE	LE_GEAR
1	two	trip1	01/01/1901	GNS
2	two	trip2	01/01/2001	OTB
3	two	trip2	02/01/2001	OTM
4	two	trip2	02/01/2001	OTM
5	two	trip2	02/01/2001	OTM
6	two	trip2	03/01/2001	OTB

So we have reproduce the process, but here with the added bonus that:

No dichotomy in the process, single and multiple gear trips done in one sweep (the trip_assign and the datacall process could likely easily be modified doing the same).
The original order of the vms data remains

One could easily generalize the process to work with other variables (LE_MZS, LE_MET6). Or just do the three variable allocation in one sweep.

Comparison using the vmstools datasets

Just an added test testing the two approaches using the built in vmstools datasets

library(vmstools)
data("eflalo")
data("tacsat")

eflalo_dcall <- 
  eflalo |> 
  mutate(weight = rowSums(dplyr::across(dplyr::starts_with("LE_KG_")))) |> 
  select(-c(starts_with("LE_KG_"), starts_with("LE_EURO"))) |> 
  rename(LE_KG_TOT = weight)
eflalo_tvers <- eflalo_dcall # to be used in the tidyverse test
  
tacsatp_dcall <- 
  vmstools::mergeEflalo2Tacsat(eflalo_dcall, tacsat) |> 
  filter(FT_REF != "0") |> 
  mutate(.rowid = 1:n(),
         .before = VE_COU)
tacsatp_tvers <- tacsatp_dcall # to be used in the tidyverse test

Datacall flow

# Define the columns to be added
cols <- c("LE_GEAR", "LE_MSZ", "VE_LEN", "VE_KW", "LE_RECT", "LE_MET", "LE_WIDTH", "VE_FLT", "VE_COU")

# Use a loop to add each column
for (col in cols) {
  # Match 'FT_REF' values in 'tacsatp' and 'eflalo' and use these to add the column from 'eflalo' to 'tacsatp'
  tacsatp_dcall[[col]] <- eflalo_dcall[[col]][match(tacsatp_dcall$FT_REF, eflalo_dcall$FT_REF)]
}
tacsatpa_dcall_LE_GEAR <- trip_assign(tacsatp_dcall, eflalo_dcall, col = "LE_GEAR",  haul_logbook = F)
tacsatp_dcall <- 
  rbindlist(list(tacsatp_dcall[!tacsatp_dcall$FT_REF %in% tacsatpa_dcall_LE_GEAR$FT_REF,],
                 tacsatpa_dcall_LE_GEAR), fill = T)

Pure tidyverse

gear_max_catch_per_day <- 
  eflalo_tvers |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_GEAR, LE_CDAT) |> 
  summarise(catch = sum(LE_KG_TOT),
            .groups = "drop") |> 
  arrange(desc(catch)) |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_CDAT) |> 
  slice(1) |> 
  ungroup() |> 
  rename(gear_date = LE_GEAR)
# unfortunately these are the same
gear_max_catch_per_trip <- 
  eflalo_tvers |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_GEAR) |> 
  summarise(catch = sum(LE_KG_TOT),
            .groups = "drop") |> 
  arrange(desc(catch)) |> 
  group_by(VE_COU, VE_REF, FT_REF) |> 
  slice(1) |> 
  ungroup() |> 
  rename(gear_trip = LE_GEAR)
# now for the join
tacsatp_tvers <- 
  tacsatp_tvers |> 
  left_join(gear_max_catch_per_day |> select(-catch),
            by = join_by(VE_COU, VE_REF, FT_REF, SI_DATE == LE_CDAT),
            relationship = "many-to-one") |> 
  left_join(gear_max_catch_per_trip |> select(-catch),
            by = join_by(VE_COU, VE_REF, FT_REF),
            relationship = "many-to-one") |> 
  mutate(gear_tvers = case_when(!is.na(gear_date) ~ gear_date,
                          !is.na(gear_trip) ~ gear_trip,
                          .default = "unexpected"))

Comparison

Join the vms datasets by rowid and make a comparison of gear allocation:

nrow(tacsatp_tvers) == nrow(tacsatp_dcall)

[1] TRUE

tacsatp_tvers |> 
  as_tibble() |> 
  select(.rowid, gear_tvers) |> 
  full_join(tacsatp_dcall |> select(.rowid, gear_dcall = LE_GEAR),
            by = join_by(.rowid)) |> 
  count(gear_tvers, gear_dcall) |> 
  knitr::kable(caption = "Comparions of allocation of gear to pings, datacall vs tverse")

Comparions of allocation of gear to pings, datacall vs tverse
gear_tvers	gear_dcall	n
DRB	DRB	680
GNS	GNS	9
MIS	MIS	123
OTB	OTB	13494
OTM	OTM	12249
PTB	PTB	91
TBB	TBB	54173

So things looking dandy. Now even though the two approaches give the same results further comparative test on diverse datasets is warranted.

Reuse

CC BY 4.0