library(vmstools)
library(sf)
library(tidyverse)
source("~/stasi/ices/ICES-VMS-and-Logbook-Data-Call/0_global_functions.R")
library(conflicted)
conflicts_prefer(dplyr::filter, dplyr::lag)
<- function(x, y) {
rb_st_keep <- sf::st_intersects(x, y) |> lengths() > 0
i <- x[i, ]
x return(x)
}<- function(x, y) {
rb_st_remove <- lengths(st_intersects(x, y)) == 0
i <- x[i, ]
x return(x)
}
datacall
- try to separate out mutation process and filter/qc process. i.e. don’t do both at the same time
Preamble
This document is trying to emulate the process behind the (2024) datacall. Some deviations in the code flow, in particular if less terse approach can easily be implemented.
libraries
Globals
# Setting thresholds
<- 20 # Maximum speed threshold in analyses in nm
spThres <- 5 # Minimum difference in time interval in minutes to prevent pseudo duplicates
intThres <- 240 # Maximum difference in time interval in minutes to prevent unrealistic intervals
intvThres <- 1.5 # Maximum difference in log10-transformed sorted weights
lanThres
# Set the years to submit
<- c(1803:1804)
yearsToSubmit
# Set the gear names for which automatic fishing activity is wanted
<- c("TBB","OTB","OTT", "OTM","SSC","SDN","DRB","PTB","HMD", "MIS")
autoDetectionGears
# Decide if you want to visually analyze speed-histograms to identify fishing activity peaks
<- FALSE
visualInspection
# Specify how landings should be distributed over the VMS pings
<- c("trip")
linkEflaloTacsat
# Extract valid level 6 metiers
<- data.table::fread("https://raw.githubusercontent.com/ices-eg/RCGs/master/Metiers/Reference_lists/RDB_ISSG_Metier_list.csv")$Metier_level6
valid_metiers
<- icesVocab::getCodeList("GearType")
ices_geartype <- icesVocab::getCodeList("Metier6_FishingActivity")
ices_fishingactivity
# speed allocation
<-
speed tribble(~met, ~s1, ~s2)
Auxillary data
data(harbours)
data(ICESareas)
<-
ices <-
ia |>
ICESareas select(area = Area_27)
sf_use_s2(TRUE)
<-
harbours |>
harbours ::mutate(harbour = iconv(harbour, from = "latin1", to = "UTF-8")) |>
dplyr::as_tibble() |>
tibble::st_as_sf(coords = c("lon", "lat"),
sfcrs = 4326) |>
::st_buffer(dist = 3000) |> # radius
sf::select(harbour)
dplyr#sf_use_s2(FALSE)
# may use
<-
dictionary tribble(~iso, ~vt,
"dttm", "SI_DATIM",
"date", "SI_DATE",
"time", "SI_TIME",
"vid", "VE_REF",
"D1", "FT_DDAT",
"D2", "FT_LDAT")
Uses vmstools dataset
data(tacsat)
# leap-year issue
<-
tacsat |>
tacsat ::mutate(SI_DATE = stringr::str_replace(SI_DATE, "1800", "1803"),
dplyrSI_DATE = stringr::str_replace(SI_DATE, "1801", "1804")) |>
::mutate(tmp = dmy_hms(paste(SI_DATE, SI_TIME), tz = "UTC")) |>
dplyr# get order upfront (could have used sortTacsat)
::arrange(VE_REF, tmp) |>
dplyr::select(-tmp) |>
dplyr::mutate(.ridt = 1:n(),
dplyr.before = VE_COU)
# for the alt
<-
ais |>
tacsat ::as_tibble() |>
tibble::unite(col = "time", SI_DATE, SI_TIME, sep = " ") |>
tidyr::mutate(time = lubridate::dmy_hms(time, tz = "UTC")) |>
dplyr::select(.ridt,
dplyrvid = VE_REF,
time,lon = SI_LONG,
lat = SI_LATI,
speed = SI_SP,
hd = SI_HE)
data(eflalo)
# leap-year issue
<-
eflalo |>
eflalo ::mutate(FT_DDAT = stringr::str_replace(FT_DDAT, "1800", "1803"),
dplyrFT_DDAT = stringr::str_replace(FT_DDAT, "1801", "1804"),
FT_LDAT = stringr::str_replace(FT_LDAT, "1800", "1803"),
FT_LDAT = stringr::str_replace(FT_LDAT, "1801", "1804"),
LE_CDAT = stringr::str_replace(LE_CDAT, "1800", "1803"),
LE_CDAT = stringr::str_replace(LE_CDAT, "1801", "1804")) |>
# get order upfront
::mutate(tmp = dmy_hms(paste0(FT_DDAT, FT_DTIME), tz = "UTC")) |>
dplyr::arrange(VE_REF, tmp) |>
dplyr::select(-tmp) |>
dplyr::mutate(.ride = 1:n(),
dplyr.before = VE_REF) |>
::rename(LE_MET = LE_MET_level6)
dplyr<-
lb |>
eflalo ::as_tibble() |>
tibble::rename(d1 = LE_CDAT) |>
dplyr# probably not kosher -
::mutate(LE_STIME = dplyr::case_when(is.na(LE_STIME) ~ "00:00:01",
dplyr.default = LE_STIME),
LE_ETIME = dplyr::case_when(is.na(LE_ETIME) ~ "23:59:59")) |>
::unite(col = "T1", FT_DDAT, FT_DTIME, sep = " ") |>
tidyr::unite(col = "T2", FT_LDAT, FT_LTIME, sep = " ") |>
tidyr::mutate(T1 = lubridate::dmy_hms(T1, tz = "UTC"),
dplyrT2 = lubridate::dmy_hms(T2, tz = "UTC"),
t1 = dmy_hms(paste(d1, LE_STIME), tz = "UTC"),
t2 = dmy_hms(paste(d1, LE_ETIME), tz = "UTC"),
d1 = lubridate::dmy(d1),
.after = LE_ID) |>
# "haul can not be before nor after trip start-end
::mutate(t1 = dplyr::case_when(t1 < T1 ~ T1,
dplyr.default = t1),
t2 = dplyr::case_when(t2 > T2 ~ T2,
.default = t2)) |>
select(-c(VE_COU, VE_LEN, VE_KW, VE_TON,
FT_DCOU, FT_DHAR, FT_LCOU, FT_LHAR,
LE_STIME, LE_ETIME,
LE_SLAT, LE_SLON, LE_ELAT, LE_ELON,
LE_EFF_VMS))# get rid of species stuff for now
<-
lb |>
lb ::select(-(LE_KG_ANE:LE_EURO_SWO)) |>
dplyr# in the end just do this
::select(.ride,
dplyrvid = VE_REF,
fleet = VE_FLT,
T1,
T2,
d1,.tid = FT_REF,
gid = LE_GEAR,
ir = LE_RECT,
.sid = LE_ID,
# t1, t2,
mesh = LE_MSZ,
met6 = LE_MET,
div = LE_DIV,
effort = LE_EFF,
unit = LE_UNIT)
Preprocessing
Positions
<- formatTacsat(tacsat)
tacsat # 1.2 Clean the TACSAT data ----------------------------------------------------
## 1.2.0 Keep track of removed points ------------------------------------------
# not done here
## 1.2.1 Remove VMS pings outside the ICES areas -------------------------------
# ia <- transform_to_sf(ICESareas, coords = c("SI_LONG", "SI_LATI"))
<- transform_to_sf(tacsat, coords = c("SI_LONG", "SI_LATI"))
tacsat # Make ia valid and transform it - redundant, already done
#ia <- ia %>%
# sf::st_make_valid() %>%
# sf::st_transform(4326) |>
# sf::st_zm()
# Find intersections
<- sf::st_intersects(tacsat, ia)
overs <- tacsat[lengths(overs) > 0,]
tacsat
## 1.2.2 Remove duplicate records ----------------------------------------------
# Convert SI_DATE and SI_TIME to POSIXct
$SI_DATIM <-
tacsatas.POSIXct(paste(tacsat$SI_DATE, tacsat$SI_TIME), tz = "GMT", format = "%d/%m/%Y %H:%M")
# Create a unique identifier for each row
$unique_id <- paste(tacsat$VE_REF, tacsat$SI_LATI, tacsat$SI_LONG, tacsat$SI_DATIM)
tacsat# Remove duplicates based on the unique identifier
<- tacsat[!duplicated(tacsat$unique_id), ]
tacsat
## 1.2.3 Remove points that have impossible coordinates ------------------------
# Extract coordinates from tacsat
<- st_coordinates(tacsat)
coords # Check for impossible positions
<- which(coords[,2] > 90 | coords[,2] < -90 | coords[,1] > 180 | coords[,1] < -180)
invalid_positions if (length(invalid_positions) > 0) {
# Print the invalid positions
print(tacsat[invalid_positions,])
# Remove points with impossible positions
<- tacsat[-invalid_positions,]
tacsat
}
## 1.2.4 Remove points which are pseudo duplicates -----------------------------
# as they have an interval rate < x minutes ---
# Sort tacsat and calculate intervals
<- sfsortTacsat(tacsat)
tacsat $INTV <-
tacsatintervalTacsat(as.data.frame(tacsat),
level = "vessel",
fill.na = TRUE)$INTV
# Remove rows with small intervals
<- tacsat[tacsat$INTV >= intThres, ]
tacsat # Remove INTV column from tacsat
$INTV <- NULL
tacsat
## 1.2.5 Remove points in harbour ----------------------------------------------
# Find intersections
<- sf::st_intersects(tacsat, harbours)
overs # Filter tacsat
<- tacsat[!(lengths(overs) > 0),]
tacsat <- as.data.frame(tacsat)
tacsat <- tacsat %>% dplyr::select(-geometry)
tacsat <- tacsat %>% dplyr::select(-unique_id) tacsat
<-
ais |>
ais ::st_as_sf(coords = c("lon", "lat"),
sfcrs = 4326,
remove = FALSE) |>
# should already be done
::arrange(vid, time) |>
dplyr## 1.2.1 Remove VMS pings outside the ICES areas -----------------------------
# takes a bit of time
rb_st_keep(ices) |>
## 1.2.2 Remove duplicate records --------------------------------------------
::distinct(vid, time, lon, lat, .keep_all = TRUE) |>
dplyr## 1.2.3 Remove points that have impossible coordinates ----------------------
# Redundant given step 1.2.1
## 1.2.4 Remove points which are pseudo duplicates ---------------------------
::group_by(vid) |>
dplyr# this is not by convention
::mutate(dt = difftime(time, lag(time), units = "mins"),
dplyrdt = as.numeric(dt)) |>
::fill(dt, .direction = "up") |>
tidyr::ungroup() |>
dplyr::filter(dt >= intThres) |>
dplyr## 1.2.5 Remove points in harbour --------------------------------------------
rb_st_remove(harbours)
#dplyr::filter(lengths(st_intersects(geometry, harbours$geometry)) == 0)
identical(tacsat$.ridt, ais$.ridt)
[1] TRUE
Logbooks
## 1.3.1 Keep track of removed points -----------------------------------------
# not done here
## 1.3.2 Warn for outlying catch records ---------------------------------------
## 1.3.3 Remove non-unique trip numbers ----------------------------------------
# Apply the trip ID function to the eflalo data frame
# function name is bit of a misnomer
<- create_trip_id(eflalo)
trip_id # Remove records with non-unique trip identifiers
<- eflalo[!duplicated(trip_id), ]
eflalo
## 1.3.4 Remove impossible time stamp records ----------------------------------
# Apply the convert to date-time function to the FT_DDAT and FT_DTIME columns
$FT_DDATIM <- convert_to_datetime(eflalo$FT_DDAT, eflalo$FT_DTIME)
eflalo# Apply the function to the FT_LDAT and FT_LTIME columns
$FT_LDATIM <- convert_to_datetime(eflalo$FT_LDAT, eflalo$FT_LTIME)
eflalo# Remove records with NA in either FT_DDATIM or FT_LDATIM
<- eflalo[!is.na(eflalo$FT_DDATIM) & !is.na(eflalo$FT_LDATIM), ]
eflalo
## 1.3.5 Remove trip starting before 1st Jan -----------------------------------
# Call the remove before january function with the appropriate arguments
# not run in test
if(FALSE) eflalo <- remove_before_jan(eflalo, year)
## 1.3.6 Remove records with arrival date before departure date ---------------
# Find the indices of rows where 'FT_LDATIM' is greater than or equal to 'FT_DDATIM'
<- which(eflalo$FT_LDATIM >= eflalo$FT_DDATIM)
idx # Keep only the rows in 'eflalo' where 'FT_LDATIM' is greater than or equal to 'FT_DDATIM'
<- eflalo[idx,]
eflalo
## 1.3.7 Remove trip with overlap with another trip ----------------------------
# Order 'eflalo' by 'VE_COU', 'VE_REF', 'FT_DDATIM', and 'FT_LDATIM'
<- orderBy(~ VE_COU + VE_REF + FT_DDATIM + FT_LDATIM, data = eflalo)
eflalo
# If a trip (same depart and return times) has more than one FT_REF, make them all into the same (first) FT_REF.
<- data.table::data.table(eflalo)[,.(VE_REF, FT_REF, FT_DDATIM, FT_LDATIM)]
dt1 <- unique(dt1, by = c("VE_REF", "FT_REF"))
dt1 ::setkey(dt1, VE_REF, FT_DDATIM, FT_LDATIM)
data.table<- dt1[, ref := .N > 1, by = data.table::key(dt1)][ref == T]
dt2 <- dt2[,.(FT_REF_NEW = FT_REF[1]), by = .(VE_REF, FT_DDATIM, FT_LDATIM)]
dt3 <- merge(dt2, dt3)
dt4 <- merge(data.table::data.table(eflalo), dt4, all.x = T)
eflalo2 !is.na(FT_REF_NEW), FT_REF := FT_REF_NEW]
eflalo2[:= NULL]
eflalo2[, FT_REF_NEW <- data.frame(eflalo2)
eflalo <- eflalo %>% select(-ref)
eflalo
# Create a data table 'dt1' with the necessary columns from 'eflalo'
<- data.table::data.table(ID = eflalo$VE_REF, FT = eflalo$FT_REF,
dt1 startdate = eflalo$FT_DDATIM,
enddate = eflalo$FT_LDATIM)
# Remove duplicate rows from 'dt1'
<- dt1[!duplicated(paste(dt1$ID, dt1$FT)), ]
dt1 # Set keys for 'dt1' for efficient joining and overlapping
::setkey(dt1, ID, startdate, enddate)
data.table# Find overlapping trips in 'dt1'
<- data.table::foverlaps(dt1, dt1, by.x = c("ID", "startdate", "enddate"),
result by.y = c("ID", "startdate", "enddate"))
# Filter 'result' to get only the rows where trips overlap
<- subset(result, startdate < i.enddate & enddate > i.startdate & FT != i.FT)
overlapping.trips # If there are overlapping trips, remove them from 'eflalo' and save them to a file
if (nrow(overlapping.trips) > 0) {
<- eflalo[!eflalo$FT_REF %in% overlapping.trips$FT, ]
eflalo print("THERE ARE OVERLAPPING TRIPS IN THE DATASET -> SEE THE FILE overlappingTrips SAVED IN THE RESULTS FOLDER")
save(overlapping.trips, file = file.path(outPath, paste0("overlappingTrips", year, ".RData")))
}
#'----------------------------------------------------------------------------
# 1.4 METIERS ICES Vocabulary Quality Control ----------------------------------------
#'----------------------------------------------------------------------------
#'
#' Check the fields with related Metier ICES Vocabularies prior the analysis block (2_eflalo_tacsat_analysis.R)
#' Some functions in this analysis will rise errors if there are values with not valid controlled vocabulary
#'
## 1.4.1 Check Metier L4 (Gear) categories are accepted ------------------------
<- icesVocab::getCodeList("GearType")
m4_ices table ( eflalo$LE_GEAR %in%m4_ices$Key ) # TRUE records accepted in DATSU, FALSE aren't
TRUE
4446
# Get summary of DATSU valid/not valid records
! eflalo$LE_GEAR %in%m4_ices$Key,]%>%group_by(LE_GEAR)%>%dplyr::select(LE_GEAR)%>%tally() eflalo [
# A tibble: 0 × 2
# ℹ 2 variables: LE_GEAR <chr>, n <int>
# Correct or dismiss not valid records (if any) and filter only valid ones
<- eflalo%>%filter(LE_GEAR %in% m4_ices$Key)
eflalo
## 3.5.5 Check Metier L6 (Fishing Activity) categories are accepted -----------
<- icesVocab::getCodeList("Metier6_FishingActivity")
m6_ices table ( eflalo$LE_MET %in%m6_ices$Key ) # TRUE records accepted in DATSU, FALSE aren't
FALSE TRUE
738 3708
# Get summary of DATSU valid/not valid records
! eflalo$LE_MET %in%m6_ices$Key,]%>%group_by(LE_MET)%>%dplyr::select(LE_MET)%>%tally() eflalo [
# A tibble: 8 × 2
LE_MET n
<chr> <int>
1 DRB_MOL_0_0_0 58
2 FPO_CRU_0_0_0 24
3 GNS_DEF_0_0_0 11
4 GNS_DEF_UND_0_0 14
5 LH_FIN_0_0_0 9
6 MIS_UND_UND_0_0 581
7 OTM_SPF_UND_0_0 36
8 TBB_DEF_90-119_0_0 5
# Correct them if any not valid and filter only valid ones
<- eflalo%>%filter(LE_MET %in% m6_ices$Key) eflalo
## 1.3.1 Keep track of removed points -----------------------------------------
# not done here
## 1.3.2 Warn for outlying catch records ---------------------------------------
## 1.3.3 Remove non-unique trip numbers ----------------------------------------
<-
lb |>
lb # I would also use vessel id here, but that is another matter
distinct(.sid, d1, .keep_all = TRUE) |>
## 1.3.4 Remove impossible time stamp records ----------------------------------
filter(!is.na(T1) & !is.na(T2)) |>
## 1.3.5 Remove trip starting before 1st Jan -----------------------------------
# not run here, have mupltiple years
## 1.3.6 Remove records with arrival date before departure date ---------------
filter(T2 >= T1) |>
## 1.3.7 Remove trip with overlap with another trip --------------------------
# NOT IMPLEMENTED because of lack of understanding
# If a trip (same depart and return times) has more than one FT_REF,
# make them all into the same (first) FT_REF
nest(.by = c(vid, fleet, .tid, T1, T2)) |>
group_by(vid, fleet) |>
mutate(issues =
case_when(
> T2 ~ "arrival before departure",
T1 # NOTE: SHOULD REALLY BE A CAUSE FOR A DROP
# so in production remove the prefix
== T2 ~ "0_arrival same as departure",
T1 > lead(T1) ~ "next departure before current arrival",
T2 lag(T2) > T1 ~ "previous arrival after current departure",
row_number() == 1 ~ "0_first row in a group",
row_number() == max(row_number()) ~ "0_last row in a group",
.default = "0_no issues")) |>
ungroup() |>
filter(stringr::str_starts(issues, "0_")) |>
select(-issues) |>
unnest(data) |>
## 1.4.1 Check Metier L4 (Gear) categories are accepted ----------------------
filter(gid %in% ices_geartype$Key) |>
## 3.5.5 Check Metier L6 (Fishing Activity) categories are accepted ----------
filter(met6 %in% ices_fishingactivity$Key)
<- eflalo |> arrange(.ride)
eflalo near(eflalo$.ride, lb$.ride) |> table()
TRUE
3708
Analysis
bla, bla, bla, …
# 2.2 Assign EFLALO Fishing trip information (gear, vessel, lenght, etc. ) to VMS records in TACSAT ----
#
# Assign a EFLALO trip identifier (FT_REF) at each VMS record in TACSAT .
# Methods assgn fishing trip to VMS records which date/time is between the Trip dates of departure and return to port .
<- mergeEflalo2Tacsat(eflalo,tacsat)
tacsatp <- data.frame(tacsatp) tacsatp
This can be done via:
<-
ais2 |>
ais left_join(lb |> select(vid, .tid, T1, T2) |> distinct(),
by = join_by(vid, between(time, T1, T2))) |>
select(-c(T1, T2)) |>
# to be compliant with vmstools:mergeEflalo2Tacsat
mutate(.tid = replace_na(.tid, "0"))
near(tacsat$.ridt, ais2$.ridt) |> table(useNA = "ifany")
TRUE
67699
$FT_REF == ais2$.tid) |> table(useNA = "ifany") (tacsatp
TRUE
67699