On trip overlap

While using and testing section ‘1.3.7 Remove trip with overlap with another trip’ of ‘1_eflalo_tacsat_preprocessing.R’, I found that more trips were being dropped than I expected, so I ran a small test on synthetic data to understand why. It turns out that the code in the workflow can be quite generous when dropping trips, even when only a single trip's start or end time is the issue. Below I add an alternative approach that tests for time overlaps between sequential trips only (apologies for not using data.table). What to do about the overlaps once they are found is, of course, another matter.
code
rtip
Author

Einar Hjörleifsson

Published

July 19, 2025

library(tidyverse)
library(data.table)
# Synthetic logbook: one vessel, one departure per day (00:04) in May 2024.
# Every trip lasts 10 hours except trip 5, which lasts 360 hours and thereby
# runs into the trips that follow it.
eflalo_org <- 
  tibble(FT_DDATIM = seq(from = ymd_hms("2024-05-01 00:04:00"),
                         to = ymd_hms("2024-05-31 00:00:00"),
                         by = "1 day")) |> 
  mutate(VE_COU = "myland",
         VE_REF = "myship",
         FT_REF = row_number(),
         # Trip duration in hours: 360 for the deliberately overlapping trip 5
         FT_LDATIM = FT_DDATIM + dhours(if_else(FT_REF == 5L, 360, 10)))


# Work on a copy; 'eflalo_org' is kept untouched for the later comparison
eflalo <- eflalo_org
# Base plot: one horizontal black segment per trip, from departure to arrival
p <- 
  eflalo |> 
  ggplot(aes(x = FT_DDATIM, xend = FT_LDATIM,
             y = FT_REF, yend = FT_REF)) +
  geom_segment(colour = "black", linewidth = 1)
p + labs(caption = "Synthetic data: Trip 5 overlaps with trips 6 through 19")

# 1.3.7 Remove trip with overlap with another trip ---------------------------
# Workflow step reproduced here to show its behaviour on the synthetic data.
# It has two parts:
#   (a) trips of one vessel with identical departure and arrival times but
#       different FT_REF are collapsed onto a single FT_REF;
#   (b) every trip that strictly overlaps in time with another trip of the
#       same vessel is removed from 'eflalo'.

# Order 'eflalo' by 'VE_COU', 'VE_REF', 'FT_DDATIM', and 'FT_LDATIM'
eflalo <- doBy::orderBy(~ VE_COU + VE_REF + FT_DDATIM + FT_LDATIM, data = eflalo)
# (a) If a trip (same depart and return times) has more than one FT_REF, make
# them all into the same (first) FT_REF.
# One row per vessel/trip with its departure and arrival times
dt1 <- data.table(eflalo)[,.(VE_REF, FT_REF, FT_DDATIM, FT_LDATIM)]
dt1 <- unique(dt1, by = c("VE_REF", "FT_REF"))
setkey(dt1, VE_REF, FT_DDATIM, FT_LDATIM)
# 'ref' is added to 'dt1' by reference and is TRUE when more than one row
# shares the same vessel, departure and arrival; 'dt2' keeps only those rows
dt2 <- dt1[, ref := .N > 1, by = key(dt1)][ref == T]
# For each such group, the first FT_REF becomes the replacement reference
dt3 <- dt2[,.(FT_REF_NEW = FT_REF[1]), by = .(VE_REF, FT_DDATIM, FT_LDATIM)]
# No 'by' is given to the merges below, so data.table's default join columns
# are used -- NOTE(review): presumably the shared vessel/time columns; confirm
dt4 <- merge(dt2, dt3)
# all.x = T keeps every row of 'eflalo'; this brings in 'ref' and 'FT_REF_NEW'
eflalo2 <- merge(data.table(eflalo), dt4, all.x = T)
# Overwrite FT_REF only where a replacement exists, then drop the helper column
eflalo2[!is.na(FT_REF_NEW), FT_REF := FT_REF_NEW]
eflalo2[, FT_REF_NEW := NULL]
eflalo <- data.frame(eflalo2)
# 'ref' came along with the merge of 'dt4' and is no longer needed
eflalo <- eflalo %>% select(-ref)
# (b) Create a data table 'dt1' with the necessary columns from 'eflalo'
# (this reuses and overwrites the name 'dt1' from part (a))
dt1 <- data.table(ID = eflalo$VE_REF, FT = eflalo$FT_REF,
                  startdate = eflalo$FT_DDATIM,
                  enddate = eflalo$FT_LDATIM)
# Keep one row per vessel/trip combination in 'dt1'
dt1 <- dt1[!duplicated(paste(dt1$ID, dt1$FT)), ]
# Set keys for 'dt1' for efficient joining and overlapping
setkey(dt1, ID, startdate, enddate)
# Find overlapping trips in 'dt1': interval self-join of 'dt1' with itself
# within the same ID; columns from one side of the join carry the 'i.' prefix
result <- foverlaps(dt1, dt1, by.x = c("ID", "startdate", "enddate"),
                    by.y = c("ID", "startdate", "enddate"))
# Filter 'result' to get only the rows where two different trips overlap.
# The inequalities are strict, so trips that merely touch (one ends exactly
# when the other starts) are not counted; FT != i.FT drops self-matches.
overlapping.trips <- subset(result, startdate < i.enddate & enddate > i.startdate & FT != i.FT)
# If there are overlapping trips, remove them from 'eflalo' and print a notice.
# Every FT_REF listed in 'overlapping.trips$FT' is dropped, and the match is on
# FT_REF alone (VE_REF is not part of the filter). Although the message refers
# to a saved file, nothing is written to disk in this excerpt.
if (nrow(overlapping.trips) > 0) {
  eflalo <- eflalo[!eflalo$FT_REF %in% overlapping.trips$FT, ]
  print("THERE ARE OVERLAPPING TRIPS IN THE DATASET -> SEE THE FILE overlappingTrips SAVED IN THE RESULTS FOLDER")
} 
[1] "THERE ARE OVERLAPPING TRIPS IN THE DATASET -> SEE THE FILE overlappingTrips SAVED IN THE RESULTS FOLDER"
# Overlay the trips still present in 'eflalo' (red) on the base plot of all
# trips (black); the new layer reuses the aesthetics defined in 'p'
p +
  labs(caption = "Red: Trips retained") +
  geom_segment(data = eflalo, linewidth = 1, colour = "red")

So the algorithm removes every trip involved in an overlap (here trips 5 through 19), although only one of them (trip 5) may be suspect.

# One could think about using sequential tests: within each vessel, compare
# every trip only with the trip immediately before and after it.
# 'issues' holds the first matching label per trip; labels starting with "0_"
# mean that no problem was found.
altverse <- 
  eflalo_org |> 
  # lead()/lag() look at neighbouring rows, so trips must be in chronological
  # order within each vessel for the comparisons to be meaningful
  arrange(VE_COU, VE_REF, FT_DDATIM, FT_LDATIM) |> 
  group_by(VE_COU, VE_REF) |> 
  mutate(issues = 
           # Conditions are tried top to bottom and the first TRUE wins. At the
           # group edges lead()/lag() return NA, which case_when() treats as
           # "no match", so those rows fall through to the "0_" labels.
           case_when(FT_DDATIM > FT_LDATIM ~ "arrival before departure",
                     FT_DDATIM == FT_LDATIM ~ "arrival same as departure",
                     FT_LDATIM > lead(FT_DDATIM) ~ "next departure before current arrival",
                     lag(FT_LDATIM) > FT_DDATIM ~ "previous arrival after current departure",
                     row_number() == 1 ~ "0_first row in a group", 
                     row_number() == max(row_number()) ~ "0_last row in a group",
                     .default = "0_no issues")) |> 
  ungroup()

# Same segment plot as before, now coloured by the label in 'issues'
altverse |> 
  ggplot(aes(x = FT_DDATIM, xend = FT_LDATIM,
             y = FT_REF, yend = FT_REF,
             colour = issues)) +
  geom_segment(linewidth = 1) +
  labs(caption = "Specific case: Here remove the violet trip (trip 5)") +
  scale_colour_brewer(palette = "Set1")

Using the above we would only remove trip 5 (labelled “next departure before current arrival”). Before adopting this kind of algorithm it would, however, be valuable to check other cases.

An alternative would be to attempt some kind of correction; in this specific case one might set the end of trip 5 to just before the start of trip 6.

Lastly, let’s do a quick check on the vmstools inbuilt dataset:

# Trip table from the eflalo data in the 'ramb' package: build departure and
# arrival date-times from the separate date and time columns (parsed as
# day-month-year), keep the trip identifiers, sort chronologically within
# vessel and keep the first row per vessel/trip
lb <- 
  ramb::eflalo |> 
  mutate(FT_DDATIM = dmy_hms(paste(FT_DDAT, FT_DTIME)),
         FT_LDATIM = dmy_hms(paste(FT_LDAT, FT_LTIME))) |> 
  select(VE_COU, VE_REF, FT_REF, FT_DDATIM, FT_LDATIM) |> 
  arrange(VE_COU, VE_REF, FT_DDATIM, FT_LDATIM) |> 
  distinct(VE_COU, VE_REF, FT_REF, .keep_all = TRUE)
# Label each trip with the first timing problem found when comparing it with
# the previous and next trip of the same vessel; labels starting with "0_"
# mean that no problem was found
lb <- 
  group_by(lb, VE_COU, VE_REF) |> 
  mutate(
    issues = case_when(
      FT_LDATIM < FT_DDATIM ~ "arrival before departure",
      FT_LDATIM == FT_DDATIM ~ "arrival same as departure",
      lead(FT_DDATIM) < FT_LDATIM ~ "next departure before current arrival",
      FT_DDATIM < lag(FT_LDATIM) ~ "previous arrival after current departure",
      row_number() == 1 ~ "0_first row in a group",
      row_number() == n() ~ "0_last row in a group",
      .default = "0_no issues"
    )
  ) |> 
  ungroup()
# To list only the flagged trips instead:
# lb |> filter(!str_starts(issues, "0_"))
# Show all trips of vessel 1784 as a table
knitr::kable(
  filter(lb, VE_REF == 1784),
  caption = "Eflalo demo data: Vessels with trip time overlap issues"
)
Eflalo demo data: Vessels with trip time overlap issues
VE_COU VE_REF FT_REF FT_DDATIM FT_LDATIM issues
Atlantis 1784 290397 1803-10-11 08:00:00 1803-10-11 08:00:00 arrival same as departure
Atlantis 1784 290398 1803-10-19 13:00:00 1803-10-19 15:00:00 0_no issues
Atlantis 1784 312252 1804-07-04 10:00:00 1804-07-09 12:00:00 0_no issues
Atlantis 1784 312253 1804-07-10 11:00:00 1804-07-15 10:00:00 0_last row in a group