A dummy dataset

We generate here a simple test dataset of 100 million coordinates. We store the data as a parquet file, which, if you have not heard of it, is kind of like a csv-file on steroids.

```r
# Not run
N <- 1e8
# A tibble that covers the bounding box of FAO area 27
tibble::tibble(lon = runif(N, -44.00, 68.50),
               lat = runif(N, 36.00, 89.99)) |>
  dplyr::mutate(.id = 1:n(),
                .before = lon) |>
  duckdbfs::write_dataset(here::here("data/test/test_dataset.parquet"))
```

In addition we will use the eusm sediment spatial dataset, the same one used in the ICES VMS datacall. This dataset has 3.17 million polygons comprising a total of 119 million point-coordinates! Furthermore, some of the polygons are not valid (meaning e.g. that a polygon may intersect itself) and there are "holes" between some of the polygons. Not ideal for proper analysis, but a good candidate for the test setup here.
Library loading

We will be using a number of packages, but instead of loading them we will call each function directly via its namespace, just to limit ambiguity. We start off by turning Google's s2geometry.io library off, because otherwise the conventional spatial join will not work, given that the eusm data contain non-valid geometries (see above).

```r
sf::sf_use_s2(FALSE)
```

duckdb
In this section the data is not loaded into R memory; instead, a temporary duckdb database is generated with a pointer to the dataset on disk. The process is the same as before, except that here we run the test using up to 100 million ais points, and the data is only imported into R at the end of the join process.
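For context, {duckdbfs} keeps a single cached duckdb connection behind the scenes, and (to the best of my understanding) `open_dataset()` registers a view over the files on disk against that connection. A minimal sketch:

```r
# A sketch: the cached duckdb connection that duckdbfs works through
con <- duckdbfs::cached_connection()
class(con)
```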
```r
eusm <-
  duckdbfs::open_dataset(here::here("data/auxillary/ICESareas.gpkg")) |>
  dplyr::select(area = Area_Full, geom)
ais <-
  duckdbfs::open_dataset(here::here("data/test/test_dataset.parquet"))
```
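At this point neither object holds any data in R; both are lazy duckdb tables. A quick sanity check (not part of the timing runs):

```r
# A sketch: printing a lazy table only scans a handful of rows from disk
class(ais)
ais |> head(3) |> dplyr::collect()
```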
```r
o <- seq(4, 8, by = 0.2)
N <- 10^o          # test sizes: 10 thousand up to 100 million points
tictoc::tic.clearlog()
for(i in 1:length(N)) {
  print(i)
  tictoc::tic()
  X <- N[i]
  ais_join <-
    ais |>
    dplyr::filter(.id <= X) |>
    dplyr::mutate(geom = ST_POINT(lon, lat)) |>
    duckdbfs::spatial_join(eusm, by = "st_intersects", join = "left") |>
    dplyr::select(-geom_1) |>   # drop the polygon geometry added by the join
    dplyr::collect()            # only now is the result pulled into R
  tictoc::toc(log = TRUE, quiet = TRUE)
}
log.lst <- tictoc::tic.log(format = FALSE)
tictoc::tic.clearlog()
timings <- unlist(lapply(log.lst, function(x) x$toc - x$tic))
timings

duckdb_eusm <-
  tibble::tibble(n = N / 1e6,
                 timing = timings |> as.vector())
duckdb_eusm |> readr::write_csv("test_duckdbfs.csv")
```
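One detail worth flagging in the loop above: `ST_POINT()` is not an R function. {dbplyr} passes functions it cannot translate verbatim to the database, where duckdb's spatial extension resolves them ({duckdbfs} provides `duckdbfs::load_spatial()` if the extension needs loading explicitly). A minimal sketch of the same pass-through pattern on a trivial subset:

```r
# A sketch: ST_POINT is duckdb spatial SQL, not R, and is forwarded
# as-is to the database engine
ais |>
  dplyr::filter(.id <= 3) |>
  dplyr::mutate(geom = ST_POINT(lon, lat)) |>
  dplyr::collect()
```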
The conventional way

In this section we are going to read the data into memory. Because of the limitations that this creates, we are not going to test more than 10 million ais records. The steps are as follows:

- Load the dataset into memory
- Run a loop with an ever-increasing number of ais-records, ending with a dataframe of 10 million records
- Within each loop iteration, generate an sf-tibble and join it with the eusm-polygons to get the sediment type for each ping
- Keep track of the time elapsed using {tictoc}
- Summarise the timing and plot the results
```r
# Here we read the data into memory
eusm <-
  sf::read_sf(here::here("data/auxillary/ICESareas.gpkg")) |>
  dplyr::select(fao = Area_Full)
ais <-
  nanoparquet::read_parquet(here::here("data/test/test_dataset.parquet")) |>
  # Let's just limit the case to 10 million records,
  # which is enough to get the point across
  dplyr::filter(.id <= 1e7)
```
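Unlike in the duckdb case, `ais` now lives fully in R memory. A quick way to see the footprint, using base R (not in the original timing code):

```r
# A sketch: report the in-memory size of the 10 million row tibble
format(object.size(ais), units = "Mb")
```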
dplyr<- seq(4, 7, by = 0.2)
o <- 10^o
N ::tic.clearlog()
tictocfor(i in 1:length(N)) {
::tic()
tictoc<- N[i]
X <-
ais_join |>
ais ::filter(.id <= X) |>
dplyr::st_as_sf(coords = c("lon", "lat"),
sfcrs = 4326,
remove = FALSE) |>
::st_join(eusm, join = sf::st_intersects, left = TRUE)
sf::toc(log = TRUE, quiet = TRUE)
tictoc
}<- tictoc::tic.log(format = FALSE)
log.lst ::tic.clearlog()
tictoc<- unlist(lapply(log.lst, function(x) x$toc - x$tic))
timings
timings<-
dplyr_eusm ::tibble(n = N / 1e6,
tibbletiming = timings |> as.vector())
Comparison

```r
dplyr::bind_rows(duckdb_eusm |> dplyr::mutate(method = "duckdb"),
                 dplyr_eusm |> dplyr::mutate(method = "dplyr")) |>
  ggplot2::ggplot(ggplot2::aes(n, timing / 60, colour = method)) +
  ggplot2::geom_point() +
  ggplot2::labs(x = "Millions of pings",
                y = "Timing [minutes]",
                colour = "Method") +
  ggplot2::scale_colour_brewer(palette = "Set1")
```
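A tabular view of the same comparison can also be useful; a minimal sketch that computes the speed ratio at the sizes both methods were run on (the floating-point `n` values join cleanly here because both are computed by the same expression):

```r
# A sketch: timing of the conventional join relative to duckdb
dplyr::inner_join(duckdb_eusm, dplyr_eusm,
                  by = "n", suffix = c("_duckdb", "_dplyr")) |>
  dplyr::mutate(ratio = timing_dplyr / timing_duckdb)
```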
Conclusion

Generally, doing things via {duckdb} is better than via {dplyr}/{sf}: the join runs without ever loading the full dataset into R memory, which is what allows it to scale to the full 100 million points, while the conventional in-memory approach had to be capped at 10 million records.