points <-
"https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/metro.fgb" |>
duckdbfs:: open_dataset (format = "sf" ) |>
dplyr:: select (city = name, geom)
polygons <-
"https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb" |>
duckdbfs:: open_dataset (url, format = "sf" ) |>
dplyr:: select (country = name, geom)
system.time (
points |>
duckdbfs:: spatial_join (polygons, by = "st_intersects" , join = "left" ) |>
dplyr:: select (city, country) |>
dplyr:: glimpse ()
)
Rows: ??
Columns: 2
Database: DuckDB v1.3.0 [unknown@Linux 5.10.0-33-amd64:R 4.4.1/:memory:]
$ city <chr> "Auckland", "Melbourne", "Adelaide", "Sydney", "Brisbane", "Pe…
$ country <chr> "New Zealand", "Australia", "Australia", "Australia", "Austral…
user system elapsed
0.416 0.037 10.705
…, hmmm this took little longer than I thought
Let’s try a bigger set:
library (duckdbfs)
library (duckdb)
if (FALSE ) {
N <- 1e9
tibble:: tibble (id = 1 : N,
lon = runif (N, - 179 , 179 ),
lat = runif (N, - 89 , 89 )) |>
arrow:: write_parquet ("tmp.parquet" )
}
points <-
duckdbfs:: open_dataset (here:: here ("tmp.parquet" )) |>
dplyr:: mutate (geom = st_point (lon, lat))
system.time (
points |>
duckdbfs:: spatial_join (polygons, by = "st_intersects" , join = "left" ) |>
dplyr:: select (id, country) |>
dplyr:: glimpse ()
)
Rows: ??
Columns: 2
Database: DuckDB v1.3.0 [unknown@Linux 5.10.0-33-amd64:R 4.4.1/:memory:]
$ id <int> 1, 15, 17, 21, 22, 30, 38, 42, 43, 44, 48, 51, 52, 55, 60, 61,…
$ country <chr> "Brazil", "Antarctica", "Turkey", "Antarctica", "Antarctica", …
user system elapsed
19.619 2.406 6.419
Not run:
library (sf)
library (arrow)
# Sample data
df <- data.frame (
id = 1 : 2 ,
name = c ("Los Angeles" , "Las Vegas" ),
lon = c (- 118.24 , - 115.14 ),
lat = c (34.05 , 36.17 )
)
# Convert to sf object (geometry column)
sf_df <- st_as_sf (df, coords = c ("lon" , "lat" ), crs = 4326 ) # WGS84
# Convert geometry to WKT
sf_df_wkt <-
sf_df %>%
dplyr:: mutate (wkt = st_as_text (geometry)) %>%
st_drop_geometry () # drop sf geometry column
# Save to Parquet
write_dataset (sf_df_wkt, "location2" , format = "parquet" )
duckdbfs:: write_dataset (sf_df_wkt, "location3" , format = "parquet" )
duckdbfs:: open_dataset ("location3" ) |> duckdbfs:: to_sf ()
q <- arrow:: open_dataset ("location2" )
q |> dplyr:: glimpse () |> print ()
q$ metadata
q <- duckdbfs:: open_dataset ("location2" )
q |> dplyr:: glimpse ()
q |> dplyr:: collect ()