library(data.table)
library(tidyverse)
Libraries and functions
The trip_assign
function below is based on issue #52. Only change:
- Functions documentation initiated
- The ‘haul_logbook’ part was stripped (consider this temporary)
- Some opinionated comments where added/amended
- Added a suppressWarning in one line
#' Assign logbook events to vessel positions
#'
#'
#' The script section associated with haul_logbooks has been removed from this
#' version of the function.Thus the argument 'haul_logbooks' is inactive.
#'
#' The approach:
#'
#' * For pings where catch date is reported in the logbooks allocate the gear with the highest
#' recorded catches within the catch date (no aggregation none a priori within a catch date).
#' * For pings outside reported logbook catch dates allocate the gear with the highest trip
#' catch.
#'
#' @param tacsatp Vessel position, tacsat format
#' @param eflalo Logbooks, eflalo format
#' @param col An atomic character (default "LE_GEAR"), normally "LE_GEAR", "LE_MZS", "LE_MET"
#' @param haul_logbook Boolean (default FALSE), **not active** in this version
#'ot
#' @return A dataframe, returning only rows where trip has more than one gear (or other variables
#' specified by argument 'col', with additional/overwritten variable specified in the col argument.
#' @export
#'
<- function(tacsatp, eflalo, col = "LE_GEAR", haul_logbook = FALSE) {
trip_assign
# Count the number of gear (or other variables specified by argument 'col') per trip
# The name of the count variable is by default 'V1'
<- data.table(eflalo)[!is.na(get(col)),.(uniqueN(get(col))), by=.(FT_REF)]
tst
# Use/filter only **trips** with more then **one** gear (or other ...)
# Einar's comment: Think this works where trips has only **one** gear (or other ...)
# So the usage of conditional if is questionable
# If dropped amend/simplify the code in the script, e.g. no row-bind needed
if(nrow(tst[V1>1]) > 0 ) {
<- data.table(eflalo)[FT_REF %in% tst[V1>1]$FT_REF]
e <- data.table(tacsatp)[FT_REF %in% tst[V1>1]$FT_REF]
tz suppressWarnings(tz[, (col) := NULL]) # Any prior allocation is nullified
# Allocate gear (or other variable) with the highest catch within a **date** (LE_CDAT)
# Note this leaves pings within a trip on other dates than catch dates as NA, that being dealt with downstream
<- "LE_KG_TOT" # the column you want to order by (string)
order_col <- e[order(-get(order_col)), # order the records by total LE_KG, the negative ensures descending order (highest first)
e2 get(col)[1L]), # get the gear corresponding with the higest catch
.(= .(FT_REF, LE_CDAT)] # .... within a trip and catch date
by names(e2) <- c("FT_REF", "LE_CDAT", col)
# Einar's comment; I think usage of "many-to-many" and followed by the unique argument may invite trouble
# At minimum a justification for usage of these two is warranted. I.e. why?
<- tz |>
tz left_join(e2, by = c("FT_REF" = "FT_REF", "SI_DATE" = "LE_CDAT"), relationship = "many-to-many")
<- unique(tz) #%>% as.data.frame()
tz
# Einar - STRIPPED if(haul_logbook)
# For vms **dates** where logbook catch is not reported we still have not allocated any gear (those
# pings are NA's at the moment). For those pings (non-reported fishing days) allocate the
# gear that has the highest catch within the **trip** (FT_REF)
if(nrow( tz[is.na(get(col))] ) > 0){
= tz %>% filter ( is.na ( get(col) )) %>% distinct(FT_REF) %>% pull()
ft_ref_isna = tz %>% filter ( FT_REF %in% ft_ref_isna & is.na ( get ( col) ) ) %>% as.data.frame()
tz2 = e %>% filter ( FT_REF %in% ft_ref_isna )
e2
if(!"LE_KG_TOT" %in% names(e2)){
<- colnames(e2)[grepl("LE_KG_|LE_EURO_", colnames(e2))]
idxkgeur # Calculate the total KG and EURO for each row
$LE_KG_TOT <- rowSums(e2[,..idxkgeur], na.rm = TRUE)
e2
}
<- e2[,.(LE_KG_TOT = sum(LE_KG_TOT, na.rm = T)), by = .(FT_REF, get(col))]
highvalue <- highvalue[,.(get[which.max(LE_KG_TOT)]), by = .(FT_REF)]
highvalue names(highvalue) <- c("FT_REF", col)
<- tz2
tx2 = tx2 %>% dplyr::select ( - any_of( col ) )
tx2 <- tx2 %>% inner_join ( highvalue, by = "FT_REF")
tz2
}
= tz %>% filter(!is.na(get(col)))
tz if(exists("tz2")) {tz = rbind(tz, tz2)}
= tz |> as.data.frame()
tz
return(tz)
# Einar's comment: The else can be dropped if I am right about the stuff mentioned
# at the start of the if-statement
else {
} warning(paste("No more than one value for ", col, " in EFLALO trips"))
return(data.frame())
} }
The trip_assign
function - digging under the hood
One way to try to understand or test a function is to generate short bare minimum datasets. Here we just focus on gear, creating two trips, one where single gear is used another with two gears and three event records (one gear has catch reported in two different rectangles):
<-
eflalo tribble(
~VE_REF, ~FT_REF, ~LE_GEAR, ~LE_ID, ~LE_CDAT, ~LE_RECT, ~LE_KG_TOT,
"two", "trip1", "GNS", "1", "01/01/1901", "one", 10000, # trip w. 1 gear
"two", "trip2", "OTB", "1", "02/01/2001", "one", 99, # trip w. >1 gear
"two", "trip2", "OTM", "2", "02/01/2001", "two", 100,
"two", "trip2", "OTB", "3", "02/01/2001", "two", 99)
<-
tacsatp tribble(~.id, ~VE_REF, ~FT_REF, ~SI_DATE, ~comment,
1L, "two", "trip1", "01/01/1901", "this gets dropped",
2L, "two", "trip2", "01/01/2001", "outbound, no lb record",
3L, "two", "trip2", "02/01/2001", "lb record",
4L, "two", "trip2", "02/01/2001", "lb record",
5L, "two", "trip2", "02/01/2001", "lb record",
6L, "two", "trip2", "03/01/2001", "inbound, no lb record"
)
Take note:
- Trip 1 has only a single gear
- Trip 2 contains more than one gear. Here the highest individual event record of catches on day “02/01/2001” is 100 kg where gear OTM was used.
- On that date however the gear with the highest total catches is OTB (99 + 99 = 198 kg)
Let’s see what happens when confronting these data with trip_assign
function.
trip_assign(tacsatp, eflalo) |>
arrange(.id)
.id VE_REF FT_REF SI_DATE comment LE_GEAR
1 2 two trip2 01/01/2001 outbound, no lb record OTB
2 3 two trip2 02/01/2001 lb record OTM
3 4 two trip2 02/01/2001 lb record OTM
4 5 two trip2 02/01/2001 lb record OTM
5 6 two trip2 03/01/2001 inbound, no lb record OTB
So, this confirms that the function is correctly interpreted as indicated in the above function documentation. Just to repeat it using slightly different wording:
- If fishing is reported on a particular catch date (LE_CDAT) allocate the gear that has the highest event catches within that date to the pings for that date.
- For pings falling on dates where no catch is reported use the gear that has the highest summed catches over the whole trip.
Other things to note:
- The function does not return trip records where the a single gear was used
- The order of the returned dataframe is not the same as in the original dataframe.
For now I refrain from having an opinion about why things are done this way, the objective here was just to test my understanding of what the trip_assign
function does.
Trial using pure tidyverse approach
If I were to emulate this in pure tidyverse I would try this:
- Create a table from logbooks only containing the gear with the highest reported event catch within a fishing date
- Create a table from logbooks only containing the gear with the highest trip catches
- Join both these tables to the vms first and then do the final gear allocation to each ping
Let’s just take this step at a time:
<-
highest_event_catches |>
eflalo arrange(desc(LE_KG_TOT)) |>
group_by(VE_REF, FT_REF, LE_CDAT) |>
slice(1) |>
ungroup() |>
select(VE_REF, FT_REF, LE_CDAT, gear_maxkg_event = LE_GEAR)
|>
tacsatp left_join(highest_event_catches,
by = join_by(VE_REF, FT_REF, SI_DATE == LE_CDAT),
relationship = "many-to-one") |>
::kable(caption = "Step 1: Gear allocation based on the highest event catces") knitr
.id | VE_REF | FT_REF | SI_DATE | comment | gear_maxkg_event |
---|---|---|---|---|---|
1 | two | trip1 | 01/01/1901 | this gets dropped | GNS |
2 | two | trip2 | 01/01/2001 | outbound, no lb record | NA |
3 | two | trip2 | 02/01/2001 | lb record | OTM |
4 | two | trip2 | 02/01/2001 | lb record | OTM |
5 | two | trip2 | 02/01/2001 | lb record | OTM |
6 | two | trip2 | 03/01/2001 | inbound, no lb record | NA |
- Gear with highest trip catch
<-
highest_trip_catches |>
eflalo group_by(VE_REF, FT_REF, LE_GEAR) |>
summarise(LE_KG_TOT = sum(LE_KG_TOT),
.groups = "drop") |>
arrange(desc(LE_KG_TOT)) |>
group_by(VE_REF, FT_REF) |>
slice(1) |>
ungroup() |>
select(VE_REF, FT_REF, gear_maxkg_trip = LE_GEAR)
|>
tacsatp left_join(highest_trip_catches,
by = join_by(VE_REF, FT_REF),
relationship = "many-to-one") |>
::kable(caption = "Step 2: Gear allocation based on the highest trip catces") knitr
.id | VE_REF | FT_REF | SI_DATE | comment | gear_maxkg_trip |
---|---|---|---|---|---|
1 | two | trip1 | 01/01/1901 | this gets dropped | GNS |
2 | two | trip2 | 01/01/2001 | outbound, no lb record | OTB |
3 | two | trip2 | 02/01/2001 | lb record | OTB |
4 | two | trip2 | 02/01/2001 | lb record | OTB |
5 | two | trip2 | 02/01/2001 | lb record | OTB |
6 | two | trip2 | 03/01/2001 | inbound, no lb record | OTB |
- All steps above and then munge:
|>
tacsatp left_join(highest_event_catches,
by = join_by(VE_REF, FT_REF, SI_DATE == LE_CDAT),
relationship = "many-to-one") |>
left_join(highest_trip_catches,
by = join_by(VE_REF, FT_REF),
relationship = "many-to-one") |>
mutate(LE_GEAR = case_when(!is.na(gear_maxkg_event) ~ gear_maxkg_event,
is.na(gear_maxkg_event) & !is.na(gear_maxkg_trip) ~ gear_maxkg_trip,
# capture possible bugs
.default = "Unexpected"),
.after = SI_DATE) |>
select(.id, VE_REF, FT_REF, SI_DATE, LE_GEAR) |>
::kable(caption = "Step 1, 2 & 3: Final gear allocations to pings") knitr
.id | VE_REF | FT_REF | SI_DATE | LE_GEAR |
---|---|---|---|---|
1 | two | trip1 | 01/01/1901 | GNS |
2 | two | trip2 | 01/01/2001 | OTB |
3 | two | trip2 | 02/01/2001 | OTM |
4 | two | trip2 | 02/01/2001 | OTM |
5 | two | trip2 | 02/01/2001 | OTM |
6 | two | trip2 | 03/01/2001 | OTB |
So we have reproduce the process, but here with the added bonus that:
- No dichotomy in the process, single and multiple gear trips done in one sweep (the
trip_assign
and the datacall process could likely easily be modified doing the same). - The original order of the vms data remains
One could easily generalize the process to work with other variables (LE_MZS, LE_MET6). Or just do the three variable allocation in one sweep.
Comparison using the vmstools datasets
Just an added test testing the two approaches using the built in vmstools datasets
library(vmstools)
data("eflalo")
data("tacsat")
<-
eflalo_dcall |>
eflalo mutate(weight = rowSums(dplyr::across(dplyr::starts_with("LE_KG_")))) |>
select(-c(starts_with("LE_KG_"), starts_with("LE_EURO"))) |>
rename(LE_KG_TOT = weight)
<- eflalo_dcall # to be used in the tidyverse test
eflalo_tvers
<-
tacsatp_dcall ::mergeEflalo2Tacsat(eflalo_dcall, tacsat) |>
vmstoolsfilter(FT_REF != "0") |>
mutate(.rowid = 1:n(),
.before = VE_COU)
<- tacsatp_dcall # to be used in the tidyverse test tacsatp_tvers
Datacall flow
# Define the columns to be added
<- c("LE_GEAR", "LE_MSZ", "VE_LEN", "VE_KW", "LE_RECT", "LE_MET", "LE_WIDTH", "VE_FLT", "VE_COU")
cols
# Use a loop to add each column
for (col in cols) {
# Match 'FT_REF' values in 'tacsatp' and 'eflalo' and use these to add the column from 'eflalo' to 'tacsatp'
<- eflalo_dcall[[col]][match(tacsatp_dcall$FT_REF, eflalo_dcall$FT_REF)]
tacsatp_dcall[[col]]
}<- trip_assign(tacsatp_dcall, eflalo_dcall, col = "LE_GEAR", haul_logbook = F)
tacsatpa_dcall_LE_GEAR <-
tacsatp_dcall rbindlist(list(tacsatp_dcall[!tacsatp_dcall$FT_REF %in% tacsatpa_dcall_LE_GEAR$FT_REF,],
fill = T) tacsatpa_dcall_LE_GEAR),
Pure tidyverse
<-
gear_max_catch_per_day |>
eflalo_tvers group_by(VE_COU, VE_REF, FT_REF, LE_GEAR, LE_CDAT) |>
summarise(catch = sum(LE_KG_TOT),
.groups = "drop") |>
arrange(desc(catch)) |>
group_by(VE_COU, VE_REF, FT_REF, LE_CDAT) |>
slice(1) |>
ungroup() |>
rename(gear_date = LE_GEAR)
# unfortunately these are the same
<-
gear_max_catch_per_trip |>
eflalo_tvers group_by(VE_COU, VE_REF, FT_REF, LE_GEAR) |>
summarise(catch = sum(LE_KG_TOT),
.groups = "drop") |>
arrange(desc(catch)) |>
group_by(VE_COU, VE_REF, FT_REF) |>
slice(1) |>
ungroup() |>
rename(gear_trip = LE_GEAR)
# now for the join
<-
tacsatp_tvers |>
tacsatp_tvers left_join(gear_max_catch_per_day |> select(-catch),
by = join_by(VE_COU, VE_REF, FT_REF, SI_DATE == LE_CDAT),
relationship = "many-to-one") |>
left_join(gear_max_catch_per_trip |> select(-catch),
by = join_by(VE_COU, VE_REF, FT_REF),
relationship = "many-to-one") |>
mutate(gear_tvers = case_when(!is.na(gear_date) ~ gear_date,
!is.na(gear_trip) ~ gear_trip,
.default = "unexpected"))
Comparison
Join the vms datasets by rowid and make a comparison of gear allocation:
nrow(tacsatp_tvers) == nrow(tacsatp_dcall)
[1] TRUE
|>
tacsatp_tvers as_tibble() |>
select(.rowid, gear_tvers) |>
full_join(tacsatp_dcall |> select(.rowid, gear_dcall = LE_GEAR),
by = join_by(.rowid)) |>
count(gear_tvers, gear_dcall) |>
::kable(caption = "Comparions of allocation of gear to pings, datacall vs tverse") knitr
gear_tvers | gear_dcall | n |
---|---|---|
DRB | DRB | 680 |
GNS | GNS | 9 |
MIS | MIS | 123 |
OTB | OTB | 13494 |
OTM | OTM | 12249 |
PTB | PTB | 91 |
TBB | TBB | 54173 |
So things looking dandy. Now even though the two approaches give the same results further comparative test on diverse datasets is warranted.