Dataflow: Going -> dplyr -> duckdb

Code
# setup ------------------------------------------------------------------------
## libraries -------------------------------------------------------------------
library(vmstools)
library(tidyverse)
library(duckdb)
## ad-hoc functions ------------------------------------------------------------
### ramb functions
source("https://raw.githubusercontent.com/einarhjorleifsson/ramb/refs/heads/main/R/format.R")
source("https://raw.githubusercontent.com/einarhjorleifsson/ramb/refs/heads/main/R/sum_across.R")
### datacall functions - can not source global.R, here a datacall trip_assign as a temporary gist
source("https://gist.githubusercontent.com/einarhjorleifsson/deee46b493f1f95e95e905c15f1c2432/raw/e2d2a0a8241ac8f9c86c3971de677ff260d7319d/rb_trip_assign.R")

rb_getCodeList <- function(code_type, from_file = TRUE) {
  if(from_file) {
    readr::read_rds(paste0(here::here("data/icesVocab"), "/", code_type, ".rds"))
  } else {
    icesVocab::getCodeList(code_type)
  }
}
iv_gear <-   rb_getCodeList("GearType")$Key
iv_target <- rb_getCodeList("TargetAssemblage")$Key
iv_met6 <-   rb_getCodeList("Metier6_FishingActivity")$Key

# Data pre-processing-----------------------------------------------------------
#  Use eflalo and tacsat in the vmstools package as a demo
#  Only do some minimal stuff here
## logbooks --------------------------------------------------------------------
data(eflalo)
# consolidate
eflalop <- 
  eflalo |> 
  as_tibble() |>
  rb_sum_across("LE_KG", remove = TRUE) |> 
  rb_sum_across("LE_EURO", remove = TRUE) |> 
  mutate(D_DATIM = rb_create_timestamp(FT_DDAT, FT_DTIME),
         L_DATIM = rb_create_timestamp(FT_LDAT, FT_LTIME),
         # LE_CDAT = dmy(LE_CDAT), # should be, but ...
         .after = FT_REF) |> 
  select(-c(FT_DDAT, FT_DTIME, FT_LDAT, FT_LTIME)) |> 
  rename(LE_MET6 = LE_MET_level6)

# NEW
eflalop <- 
  eflalop |> 
  # only needed if one wants to do additional testing
  separate(LE_MET6, into = c(".m6gear", ".m6target", ".m6mesh", ".m6rest"), sep = "_", remove = FALSE, extra = "merge") |> 
  separate(.m6mesh, into = c(".m1", ".m2"), sep = "-", remove = FALSE, fill = "right", convert = TRUE) |> 
  separate(LE_ID, into = c(".etid", ".egear", ".eir"), sep = "-", remove = FALSE) |> 
  # TODO: Deal with .m1 or .m2 being characters
  #mutate(.m2 = case_when(is.na(.m2) & as.character(.m1) == "0" ~ "0",
  #                       .default = .m2)) |> 
  mutate(
    checks = case_when(                                                   # datacall
      # Catch record outlier - code pending                               # 1.3.2
      base::duplicated(paste(VE_REF, LE_ID, LE_CDAT)) ~ "01 duplicated events",   # 1.3.3
      is.na(D_DATIM) | is.na(L_DATIM) ~ "02 impossible time",                       # 1.3.4
      year(D_DATIM) == (year(L_DATIM) - 1) ~ "03 new years trip",                   # 1.3.5
      D_DATIM > L_DATIM ~ "04 departure before arrival",                           # 1.3.6
      # "05 overlapping trips  - code pending                             # 1.3.7
      !LE_GEAR %in% iv_gear ~ "06 gear (metier 4) invalid",                  # 1.4.1
      !LE_MET6 %in% iv_met6 ~ "07 metier 6 invalid",                         # 3.5.5
      # Addendum, needs discussions
      !between(dmy(LE_CDAT), as_date(D_DATIM), as_date(L_DATIM)) ~ "ok - 08 catch date not within trip",
      # metier level 6 component checks
      LE_GEAR != .m6gear ~ "ok - 01 met6 gear different from gear",
      !.m6target %in% iv_target ~ "ok - 10 met6 target invalid",
      # event id component checks
      FT_REF != .etid ~ "ok - 11 event id tripid different from tripid",
      LE_GEAR != .egear ~ "ok - 12 event id gear different from gear",
      LE_RECT != .eir ~ "ok - 13 event id rectangle different from rectangle",
      # Pending solution prior to case_when applied
      #!between(mesh, as.integer(.m1), .m2) ~ "11 mesh size not in the met6 range",
      .default = "ok")
  )
eflalop |> 
  count(checks) |> 
  mutate('%' = round(n / sum(n) * 100, 2)) |> 
  knitr::kable(caption = "Logbooks: All records not starting with 'ok' will be filtered downstream")
Logbooks: All records not starting with ‘ok’ will be filtered downstream
checks n %
01 duplicated events 3 0.07
07 metier 6 invalid 741 16.33
ok 3775 83.17
ok - 01 met6 gear different from gear 20 0.44
Code
eflalop <-
  eflalop |> 
  select(-c(.etid, .egear, .eir,
            .m6gear, .m6target, .m6mesh, .m1, .m2, .m6rest)) |> 
  filter(str_starts(checks, "ok")) |> 
  select(-checks)
# End of NEW

# create a tidy trip table
#  eflalo is not a tidy structure - can be split into a tidy trip table and then
#   a detail event table. tverse pipe code flow becomes more succinct downstream
trips <-
  eflalop |> 
  group_by(VE_COU, VE_REF, FT_REF) |> 
  mutate(weight = sum(LE_KG),
         price = sum(LE_EURO)) |> 
  ungroup() |> 
  select(VE_COU, VE_REF, VE_LEN, VE_KW, VE_TON, FT_REF, D_DATIM, L_DATIM, weight, price) |> 
  distinct(VE_COU, VE_REF, FT_REF, .keep_all = TRUE)
duck_trips <- trips |> arrow::to_duckdb()  # for later
# The detail table, remove vessel and trip variables except those used as keys 
events <- 
  eflalop |> 
  select(VE_COU, VE_REF, FT_REF, starts_with("LE_")) |> 
  # reduce the clutter for this demo test
  janitor::remove_empty(which = "cols")
duck_events <- events |> arrow::to_duckdb() # for later

## trails ----------------------------------------------------------------------
data(tacsat)
tacsat <- 
  tacsat |> 
  as_tibble() |> 
  mutate(SI_DATIM = ramb::rb_create_timestamp(SI_DATE, SI_TIME),
         .after = VE_REF) |> 
  # select(-c(SI_DATE, SI_TIME)) |> assign_trip only works if these are available
  arrange(VE_COU, VE_REF, SI_DATIM) |> 
  distinct(VE_COU, VE_REF, SI_DATIM, .keep_all = TRUE) |> 
  arrange(VE_COU, VE_COU, SI_DATIM) |> 
  mutate(.pid = 1L:n(),
         .before = VE_COU)

if(FALSE) {
  it_min <- 5 * 60 # Interval threshold minimum [units: seconds]
  tacsat <- 
    tacsat |> 
    # TODO: There are different ways to skin a cat - here calculating sequential statistics
    # group_by(cid, vid, time) |> 
    #  see: 
    # mutate(dt = as.numeric(difftime(lead(time), time, units = "sec"))) |> 
    # fill(dt, .direction = "down") |> 
    # ungroup() |> 
    mutate(
      checks =
        case_when(                                                                            # datacall
          # will argue that these comes first (now done in 2.3.2)
          is.na(VE_COU) ~ "00 1 no country id",
          is.na(VE_REF) ~ "00 2 no vessel id",
          is.na(SI_DATIM) ~ "00 3 no time",
          is.na(SI_LONG) ~  "00 4 no lon",
          is.na(SI_LATI) ~  "00 5 no lat",
          # Would put question mark to this, one can derive speed
          is.na(SI_SP) ~ "00 6 no speed",
          # TODO: Outside ICES area",                                                               # 1.2.1
          base::duplicated(paste(VE_COU, VE_REF, SI_LONG, SI_LATI, SI_DATIM)) ~ "02 duplicates",                     # 1.2.2
          # This already inclusive in check 01, should be dropped
          !between(SI_LONG, -180, 180) | !between(SI_LATI, -90, 90) ~ "03 coordinates out of bound",  # 1.2.3
          c(it_min, diff(unclass(SI_DATIM))) < it_min ~ "04 time interval exceeds threshhold",            # 1.2.4
          # TODO: Points in harbour                                                                # 1.2.5
          .default = "ok")
    )
  
  
  tacsat |> 
    count(checks) |> 
    mutate('%' = round(n / sum(n) * 100, 2)) |> 
    knitr::kable(caption = "VMS: All records not 'ok' will be filtered downstream")
  
  tacsat <- 
    tacsat |> 
    filter(str_starts(checks, "ok")) |>
    select(-c(checks))
}

trails <- tacsat

# Analysis ---------------------------------------------------------------------
## Add trip information to pings -----------------------------------------------
### datacallway ----------------------------------------------------------------
tacsatp <- 
  mergeEflalo2Tacsat(eflalop |> as.data.frame(),
                     tacsat |> as.data.frame())
# NOTE: even information added downstream - separated to clarify thinking
cols <- c("VE_LEN", "VE_KW")
for (col in cols) {
  tacsatp[[col]] <- eflalo[[col]][match(tacsatp$FT_REF, eflalo$FT_REF)]
}
### myway ----------------------------------------------------------------------
trails <- 
  trails |> 
  left_join(trips,
            by = join_by(VE_COU, VE_REF, between(SI_DATIM, D_DATIM, L_DATIM)),
            relationship = "many-to-one") |> 
  select(-c(D_DATIM, L_DATIM))
duck_trails <- trails |> arrow::to_duckdb() # for later

### comparisons ----------------------------------------------------------------
identical(tacsatp$.pid, trails$.pid)
[1] FALSE
Code
tacsatp <- tacsatp |> arrange(.pid)
identical(tacsatp$VE_REF, replace_na(trails$VE_REF, "0")) # not really needed
[1] TRUE
Code
identical(tacsatp$FT_REF, replace_na(trails$FT_REF, "0")) # not really needed
[1] TRUE
Code
# TODO: check these
identical(tacsatp$VE_LEN, trails$VE_LEN)
[1] FALSE
Code
identical(tacsatp$VE_KW, trails$VE_KW)
[1] FALSE
Code
# check non-identicals
data(eflalo) # reload eflalo
comparison <- 
  tacsatp |> 
  as_tibble() |> 
  select(.pid, VE_COU, VE_REF, FT_REF, length_dc = VE_LEN, kw_dc = VE_KW) |> 
  left_join(trails |> select(.pid, VE_COU, VE_REF, FT_REF, length_mw = VE_LEN, kw_mw = VE_KW)) |> 
  filter(length_dc != length_mw | kw_dc != kw_mw) |> 
  select(-c(.pid, FT_REF)) |> 
  distinct()
comparison |> 
  left_join(eflalo |> 
              filter(VE_REF %in% unique(comparison$VE_REF)) |> 
              select(VE_REF, VE_LEN, VE_KW) |> 
              distinct()) |> 
  knitr::kable(caption = "Assigning vessel info to pings - discrepancies\n'_dc': datacall, '_mw: myway, original from eflalo\nSeems like myway same as the original")
Assigning vessel info to pings - discrepancies ’_dc’: datacall, ’_mw: myway, original from eflalo Seems like myway same as the original
VE_COU VE_REF length_dc kw_dc length_mw kw_mw VE_LEN VE_KW
Atlantis 729 24.73 221 25.00000 221 25.00000 221
Atlantis 915 24.50 349 24.00000 221 24.00000 221
Atlantis 864 25.27 184 17.52212 111 17.52212 111
Code
# myway seems correct, so something to check in the datacall flow


## Add gear, mesh and met6 to pings --------------------------------------------
#  this is ambiguously called trip assign in the datacall flow
### datacallway ----------------------------------------------------------------
# Now add the event info - this step needed so that next step works - could be simplified
cols <- c("LE_GEAR", "LE_MSZ", "LE_RECT", "LE_MET6")
for (col in cols) {
  tacsatp[[col]] <- eflalop[[col]][match(tacsatp$FT_REF, eflalop$FT_REF)]
}

library(data.table)  # needed in the trip_assign

tacsatpa_LE_GEAR <- trip_assign(tacsatp, eflalop, col = "LE_GEAR",  haul_logbook = F)
# Here and subsequently, replaced %!in% with %in%, moving the ! upfront
tacsatp <- rbindlist(list(tacsatp[!tacsatp$FT_REF %in% tacsatpa_LE_GEAR$FT_REF,], tacsatpa_LE_GEAR), fill = T)

tacsatpa_LE_MSZ <- trip_assign(tacsatp, eflalop, col = "LE_MSZ",  haul_logbook = F)
tacsatp <- rbindlist(list(tacsatp[!tacsatp$FT_REF %in% tacsatpa_LE_MSZ$FT_REF,], tacsatpa_LE_MSZ), fill = T)

tacsatpa_LE_RECT <- trip_assign(tacsatp, eflalop, col = "LE_RECT",  haul_logbook = F)
tacsatp <- rbindlist(list(tacsatp[!tacsatp$FT_REF %in% tacsatpa_LE_RECT$FT_REF,], tacsatpa_LE_RECT), fill = T)

tacsatpa_LE_MET <- trip_assign(tacsatp, eflalop, col = "LE_MET6",  haul_logbook = F)
tacsatp <- rbindlist(list(tacsatp[!tacsatp$FT_REF %in% tacsatpa_LE_MET$FT_REF,], tacsatpa_LE_MET), fill = T)

### myway ----------------------------------------------------------------------
# 1. For each fishing day, add the values of gear, mesh and met6 from the 
#    highest event catch record within the day (no summation done a priori)
# 2. For non-fishing days, add the values of gear, mesh and met6 from the 
#    highest trip catches
highest_event_catch_per_day <-
  events |> 
  arrange(desc(LE_KG)) |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_CDAT) |> 
  slice(1) |> 
  ungroup() |> 
  select(VE_COU, VE_REF, FT_REF, LE_CDAT, LE_GEAR, LE_MSZ, LE_MET6, LE_RECT)
highest_trip_catches <-
  events |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_GEAR, LE_MSZ, LE_MET6, LE_RECT) |> 
  summarise(LE_KG = sum(LE_KG),
            .groups = "drop") |> 
  arrange(desc(LE_KG)) |> 
  group_by(VE_COU, VE_REF, FT_REF) |> 
  slice(1) |> 
  ungroup() |> 
  select(VE_COU, VE_REF, FT_REF, .LE_GEAR = LE_GEAR, .LE_MSZ = LE_MSZ, 
         .LE_MET6 = LE_MET6, .LE_RECT = LE_RECT)
trails_copy <- trails
trails <- 
  trails |> 
  left_join(highest_event_catch_per_day,
            by = join_by(VE_COU, VE_REF, FT_REF, SI_DATE == LE_CDAT),
            relationship = "many-to-one") |> 
  left_join(highest_trip_catches,
            by = join_by(VE_COU, VE_REF, FT_REF),
            relationship = "many-to-one") |> 
  mutate(LE_GEAR = case_when(is.na(LE_GEAR) & !is.na(.LE_GEAR) ~ .LE_GEAR,
                             .default = LE_GEAR),
         LE_MSZ = case_when(is.na(LE_MSZ) & !is.na(.LE_MSZ) ~ .LE_MSZ,
                            .default = LE_MSZ),
         LE_MET6 = case_when(is.na(LE_MET6) & !is.na(.LE_MET6) ~ .LE_MET6,
                             .default = LE_MET6),
         LE_RECT = case_when(is.na(LE_RECT) & !is.na(.LE_RECT) ~ .LE_RECT,
                             .default = LE_RECT)) |> 
  select(-c(.LE_GEAR, .LE_MSZ, .LE_MET6, .LE_RECT))
### comparisons ----------------------------------------------------------------        
identical(tacsatp$.pid, trails$.pid)
[1] FALSE
Code
tacsatp <- tacsatp |> arrange(.pid)
identical(tacsatp$.pid, trails$.pid)
[1] TRUE
Code
identical(tacsatp$FT_REF, replace_na(trails$FT_REF, "0"))
[1] TRUE
Code
identical(tacsatp$LE_MET6, trails$LE_MET6)
[1] TRUE
Code
# TODO: check these
# so some work needed, do not get identical
identical(tacsatp$LE_GEAR, trails$LE_GEAR)
[1] FALSE
Code
identical(tacsatp$LE_MSZ, trails$LE_MSZ)
[1] FALSE
Code
identical(tacsatp$LE_RECT, trails$LE_RECT)
[1] FALSE
Code
comparison <- 
  tacsatp |> 
  as_tibble() |> 
  select(.pid, VE_COU, VE_REF, FT_REF, LE_GEAR, LE_MSZ, LE_MET6, LE_RECT) |> 
  left_join(trails |> select(.pid, VE_COU, VE_REF, FT_REF, 
                             .LE_GEAR = LE_GEAR, .LE_MSZ = LE_MSZ, 
                             .LE_MET6 = LE_MET6, .LE_RECT = LE_RECT)) |> 
  # reorder
  select(.pid, VE_COU, VE_REF, FT_REF, LE_GEAR, .LE_GEAR, LE_MSZ, .LE_MSZ, LE_MET6, .LE_MET6, LE_RECT, .LE_RECT)
comparison |> 
  #filter(LE_GEAR != .LE_GEAR  | LE_MSZ != .LE_MSZ | LE_MET6 != .LE_MET6) |> 
  mutate(gear = if_else(LE_GEAR == .LE_GEAR, "yes", "no", "yes_na"),
         mesh = if_else(LE_MSZ  == .LE_MSZ, "yes", "no", "yes_na"),
         met6 = if_else(LE_MET6 == .LE_MET6, "yes", "no", "yes_na"),
         rect = if_else(LE_RECT   == .LE_RECT, "yes", "no", "yes_na")) |> 
  count(gear, mesh, met6, rect) |> 
  mutate('%' = round(n / sum(n) * 100, 2)) |> 
  knitr::kable(caption = "Assigning events info to pings - discrepancies between datacall vs. myway\nNeeds a revisit, at least the rectangles")
Assigning events info to pings - discrepancies between datacall vs. myway Needs a revisit, at least the rectangles
gear mesh met6 rect n %
no yes yes no 33 0.04
yes no yes no 841 0.98
yes yes yes no 28628 33.29
yes yes yes yes 38377 44.63
yes_na yes_na yes_na yes_na 18109 21.06
Code
# For now let's look at the event records were gear and rectangles are not the same (33 records)
problem_trips <- 
  comparison |> 
  #filter(LE_GEAR != .LE_GEAR  | LE_MSZ != .LE_MSZ | LE_MET6 != .LE_MET6) |> 
  filter(LE_GEAR != .LE_GEAR  & LE_RECT != .LE_RECT) |> 
  select(VE_COU, VE_REF, FT_REF) |> 
  distinct()
problem_events <- 
  problem_trips |> 
  left_join(events |> select(VE_REF, FT_REF, LE_CDAT, LE_GEAR, LE_RECT, LE_KG)) |> 
  arrange(LE_CDAT, desc(LE_KG))
problem_events |> knitr::kable(caption = "Events table, original data")
Events table, original data
VE_COU VE_REF FT_REF LE_CDAT LE_GEAR LE_RECT LE_KG
Atlantis 1527 270892 29/04/1800 TBB 36F5 688.5230
Atlantis 1527 270892 29/04/1800 OTB 34F4 373.7451
Atlantis 1527 270892 29/04/1800 TBB 34F4 319.9824
Code
problem_events |> 
  inner_join(trails) |> 
  arrange(SI_DATIM) |> 
  select(.pid, SI_DATIM, VE_REF, FT_REF, LE_CDAT, SI_DATE, LE_GEAR, LE_RECT, LE_KG) |> 
  #distinct(VE_REF, FT_REF, LE_CDAT, SI_DATE, LE_GEAR, LE_RECT, LE_KG, .keep_all = TRUE) |> 
  knitr::kable(caption = "Discrepancies - myway")
Discrepancies - myway
.pid SI_DATIM VE_REF FT_REF LE_CDAT SI_DATE LE_GEAR LE_RECT LE_KG
3696 1800-04-28 09:46:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3699 1800-04-28 11:42:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3702 1800-04-28 13:36:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3705 1800-04-28 15:32:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3708 1800-04-28 17:28:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3711 1800-04-28 19:22:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3714 1800-04-28 21:18:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3716 1800-04-28 23:12:00 1527 270892 29/04/1800 28/04/1800 TBB 36F5 688.523
3718 1800-04-29 01:08:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3720 1800-04-29 03:04:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3722 1800-04-29 04:58:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3724 1800-04-29 06:54:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3726 1800-04-29 08:48:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3728 1800-04-29 10:44:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3731 1800-04-29 14:34:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3734 1800-04-29 18:24:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3736 1800-04-29 20:20:00 1527 270892 29/04/1800 29/04/1800 TBB 36F5 688.523
3739 1800-04-30 00:10:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3741 1800-04-30 02:06:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3744 1800-04-30 05:56:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3746 1800-04-30 07:50:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3748 1800-04-30 09:46:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3750 1800-04-30 11:42:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3752 1800-04-30 13:36:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3754 1800-04-30 15:32:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3756 1800-04-30 17:28:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3760 1800-04-30 19:22:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3764 1800-04-30 21:18:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3768 1800-04-30 23:12:00 1527 270892 29/04/1800 30/04/1800 TBB 36F5 688.523
3772 1800-05-01 01:08:00 1527 270892 29/04/1800 01/05/1800 TBB 36F5 688.523
3776 1800-05-01 03:02:00 1527 270892 29/04/1800 01/05/1800 TBB 36F5 688.523
3780 1800-05-01 04:58:00 1527 270892 29/04/1800 01/05/1800 TBB 36F5 688.523
3784 1800-05-01 06:54:00 1527 270892 29/04/1800 01/05/1800 TBB 36F5 688.523
Code
problem_events |> 
  inner_join(tacsatp) |> 
  arrange(SI_DATIM) |> 
  select(.pid, SI_DATIM, VE_REF, FT_REF, LE_CDAT, SI_DATE, LE_GEAR, LE_RECT, LE_KG) |> 
  #distinct(VE_REF, FT_REF, LE_CDAT, SI_DATE, LE_GEAR, LE_RECT, LE_KG, .keep_all = TRUE) |> 
  knitr::kable(caption = "Discrepancies - datacall")
Discrepancies - datacall
.pid SI_DATIM VE_REF FT_REF LE_CDAT SI_DATE LE_GEAR LE_RECT LE_KG
3696 1800-04-28 09:46:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3699 1800-04-28 11:42:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3702 1800-04-28 13:36:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3705 1800-04-28 15:32:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3708 1800-04-28 17:28:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3711 1800-04-28 19:22:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3714 1800-04-28 21:18:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3716 1800-04-28 23:12:00 1527 270892 29/04/1800 28/04/1800 OTB 34F4 373.7451
3718 1800-04-29 01:08:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3720 1800-04-29 03:04:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3722 1800-04-29 04:58:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3724 1800-04-29 06:54:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3726 1800-04-29 08:48:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3728 1800-04-29 10:44:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3731 1800-04-29 14:34:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3734 1800-04-29 18:24:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3736 1800-04-29 20:20:00 1527 270892 29/04/1800 29/04/1800 OTB 34F4 373.7451
3739 1800-04-30 00:10:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3741 1800-04-30 02:06:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3744 1800-04-30 05:56:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3746 1800-04-30 07:50:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3748 1800-04-30 09:46:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3750 1800-04-30 11:42:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3752 1800-04-30 13:36:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3754 1800-04-30 15:32:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3756 1800-04-30 17:28:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3760 1800-04-30 19:22:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3764 1800-04-30 21:18:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3768 1800-04-30 23:12:00 1527 270892 29/04/1800 30/04/1800 OTB 34F4 373.7451
3772 1800-05-01 01:08:00 1527 270892 29/04/1800 01/05/1800 OTB 34F4 373.7451
3776 1800-05-01 03:02:00 1527 270892 29/04/1800 01/05/1800 OTB 34F4 373.7451
3780 1800-05-01 04:58:00 1527 270892 29/04/1800 01/05/1800 OTB 34F4 373.7451
3784 1800-05-01 06:54:00 1527 270892 29/04/1800 01/05/1800 OTB 34F4 373.7451
Code
# TODO: Check why something is wrotten in the State of Denmark
print("There is a pending issue, the ways do not give the same results")
[1] "There is a pending issue, the ways do not give the same results"
Code
# Fireworks - running via duckdb -----------------------------------------------
duck_highest_event_catch_per_day <-
  duck_events |> 
  arrange(desc(LE_KG)) |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_CDAT) |> 
  filter(row_number()==1) |> # instead of slice
  ungroup() |> 
  select(VE_COU, VE_REF, FT_REF, LE_CDAT, LE_GEAR, LE_MSZ, LE_MET6, LE_RECT)
duck_highest_trip_catches <-
  duck_events |> 
  group_by(VE_COU, VE_REF, FT_REF, LE_GEAR, LE_MSZ, LE_MET6, LE_RECT) |> 
  summarise(LE_KG = sum(LE_KG),
            .groups = "drop") |> 
  arrange(desc(LE_KG)) |> 
  group_by(VE_COU, VE_REF, FT_REF) |> 
  filter(row_number()==1) |> # instead of slice
  ungroup() |> 
  select(VE_COU, VE_REF, FT_REF, .LE_GEAR = LE_GEAR, .LE_MSZ = LE_MSZ, 
         .LE_MET6 = LE_MET6, .LE_RECT = LE_RECT)
duck_trails <- 
  duck_trails |> 
  left_join(duck_highest_event_catch_per_day,
            by = join_by(VE_COU, VE_REF, FT_REF, SI_DATE == LE_CDAT)) |> 
  # relationship = "many-to-one") |> 
  left_join(duck_highest_trip_catches,
            by = join_by(VE_COU, VE_REF, FT_REF)) |> 
  # relationship = "many-to-one") |> 
  mutate(LE_GEAR = case_when(is.na(LE_GEAR) & !is.na(.LE_GEAR) ~ .LE_GEAR,
                             .default = LE_GEAR),
         LE_MSZ = case_when(is.na(LE_MSZ) & !is.na(.LE_MSZ) ~ .LE_MSZ,
                            .default = LE_MSZ),
         LE_MET6 = case_when(is.na(LE_MET6) & !is.na(.LE_MET6) ~ .LE_MET6,
                             .default = LE_MET6),
         LE_RECT = case_when(is.na(LE_RECT) & !is.na(.LE_RECT) ~ .LE_RECT,
                             .default = LE_RECT)) |> 
  select(-c(.LE_GEAR, .LE_MSZ, .LE_MET6, .LE_RECT))
Code
# Once we have few more things sorted out, like state and split-among-ping
#  we would in end sum the stuff, like:
dx <- dy <- 0.05 # csquare
final <- 
  duck_trails |> 
  mutate(year = year(SI_DATIM),
         month = month(SI_DATIM),
         lon = SI_LONG%/%dx * dx + dx/2,
         lat = SI_LATI%/%dy * dy + dy/2) |> 
  group_by(year, month, lon, lat, LE_GEAR, LE_MET6) |> 
  summarise(pings = n(),
            .groups = "drop") # |> 
# collect() # get an error if not interactive