mining_u2merki.Rmd
library(fs)
library(omar)
library(lubridate)
library(staroddi)
library(tidyverse)
source("~/stasi/taggart/scripts/read_dst.R")
Although the DST-recaptures have systematically been reported along with conventional recaptures in the MFRI database, the DST profile records (time, temperature, depth, …) have not.
Fortunately, many of the DST profiles (.DAT files) have been stored in a common directory (/u2/merki) since the beginning of the century. The problem, though, is that this space has been used both for raw-data storage and as a processing area. Hence one may find multiple identical copies of an original file, or various extracts of it, in different folders.
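To get a feel for how much of this is pure duplication, one could hash the files and flag byte-identical copies. A minimal sketch only, not part of the original workflow; it assumes tools::md5sum (base R) is acceptable and it may be slow on large files:
# a sketch: flag byte-identical .DAT files by checksum
paths <- fs::dir_ls("/u2/merki/gogn", recurse = TRUE, glob = "*.DAT", fail = FALSE)
dups <-
  tibble(path = as.character(paths),
         md5 = unname(tools::md5sum(path))) %>%
  add_count(md5, name = "n_copies") %>%
  filter(n_copies > 1) %>%
  arrange(md5, path)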
The objective here is to locate the original .DAT profile of each recorder, copy the unique set of files to a single directory, and then cross-check what is available on disk against the DST recaptures recorded in the MFRI database.
What is not (yet) considered here is:
All headers of the .DAT files within /u2/merki are first gathered.
TODO:
Problem with the date-time: this should be solvable - just need to check the format in the header file.
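If the issue is simply that some headers use a different date format, something along these lines might do once the actual formats are known. A sketch only; the candidate formats below are placeholders, not checked against the files:
# a sketch: try a few candidate date-time formats (placeholders, to be confirmed)
parse_dst_datetime <- function(x) {
  lubridate::parse_date_time(x, orders = c("dmy HMS", "ymd HMS", "dmy HM"), quiet = TRUE)
}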
files <-
fs::dir_info("/u2/merki/gogn", recurse = TRUE, fail = FALSE, glob = "*.DAT") %>%
mutate(path = as.character(path),
file = basename(path),
ID = 1:n())
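A quick look at what was picked up (just an inspection sketch; the size column comes with fs::dir_info):
# number of .DAT files found and the disk space they occupy
files %>%
  summarise(n_files = n(), total_size = sum(size))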
header <-
map(files$path, read_dst_header, long = FALSE) %>%
bind_rows() %>%
janitor::clean_names()
header %>% glimpse()
d <-
header %>%
mutate(loc = str_locate(recorder, "[a-zA-Z]+")[,1],
utgafa = str_sub(recorder, 1, loc-1),
audkenni = stringr::str_extract(recorder, "[a-zA-Z]+") |> stringr::str_to_upper(),
numerc = stringr::str_sub(recorder, loc + 1),
numer = as.integer(numerc),
utgafa = as.integer(utgafa)) %>%
group_by(recorder) %>%
mutate(n.recorder = n()) %>%
ungroup() %>%
arrange(audkenni, numer, utgafa, desc(n), modification_time) %>%
group_by(recorder) %>%
mutate(n.tmp = 1:n()) %>%
# identify the profile that has the minimum modification_time, earliest date_time and maximum n
mutate(use = ifelse(n == max(n) &
modification_time == min(modification_time) &
date_time == min(date_time) &
n.tmp == min(n.tmp),
TRUE,
FALSE)) %>%
ungroup() %>%
select(-n.tmp)
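As a sanity check (a sketch, not part of the original run), no recorder should end up with more than one file flagged for use:
# recorders with more than one file flagged use = TRUE - should return zero rows
d %>%
  filter(use) %>%
  count(recorder, name = "n_files") %>%
  filter(n_files > 1)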
Problem with the algorithm, to be fixed upstream - some headers come back without a date_time:
d %>%
filter(is.na(date_time)) %>%
glimpse()
# the files to check
d %>%
filter(is.na(date_time)) %>%
pull(path)
… for the time being, filter these out:
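A minimal way to do that, if it is not already handled elsewhere (a sketch):
# drop headers where the date-time could not be parsed (to be fixed upstream)
d <- d %>% filter(!is.na(date_time))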
# what does modification_time vs change_time mean??
d %>%
select(recorder, date_time, modification_time, change_time, path) %>%
mutate(diff = difftime(change_time, modification_time, units = "days")) %>%
arrange(diff)
… filtering
d.filtered <-
d %>%
select(recorder, audkenni, numer, utgafa, n, modification_time, everything()) %>%
arrange(audkenni, numer, utgafa, desc(n), modification_time) %>%
group_by(recorder) %>%
# filter by minimum modification_time and maximum n
filter(n == max(n) &
modification_time == min(modification_time) &
date_time == min(date_time)) %>%
# all of the above being equal, take the first row
# one could consider prioritizing which directory the data is taken from
slice(1) %>%
ungroup()
… we should now have a unique recorder per file (checked in the sketch below)
… how do we preserve the file modification time when copying?
… check out file_link as an alternative to copying
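A quick check that the deduplication really left one file per recorder (a sketch):
# any recorder still represented by more than one file? should return zero rows
d.filtered %>%
  count(recorder, name = "n_files") %>%
  filter(n_files > 1)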
# copy the selected files to a common directory
walk(d.filtered$path, file_copy, "/u2/reikn/einarhj/tgs/DAT")
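On the file_link note above: instead of copying one could create symbolic links, which leaves the original files (and hence their timestamps) untouched. A sketch, assuming symlinks are acceptable on /u2 and that the target directory already exists; it uses fs::link_create rather than any particular file_link helper:
# a sketch: link instead of copy, keeping the originals and their times intact
walk(d.filtered$path,
     ~ fs::link_create(.x, fs::path("/u2/reikn/einarhj/tgs/DAT", basename(.x))))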
"/u2/reikn/einarhj/tgs"
files <-
fs::dir_info("/u2/reikn/einarhj/tgs/DAT", recurse = TRUE, fail = FALSE, glob = "*.DAT") %>%
mutate(path = as.character(path),
file = basename(path),
ID = 1:n())
# read all the files
res <- map(files$path, read_dst_header, long = FALSE)
d <-
bind_rows(res) %>%
janitor::clean_names() %>%
mutate(loc = stringr::str_locate(recorder, "[a-zA-Z]+")[,1],
utgafa = str_sub(recorder, 1, loc-1),
audkenni = stringr::str_extract(recorder, "[a-zA-Z]+") |> stringr::str_to_upper(),
numerc = stringr::str_sub(recorder, loc + 1),
numer = as.integer(numerc),
utgafa = as.integer(utgafa)) %>%
group_by(recorder) %>%
mutate(n.recorder = n()) %>%
ungroup()
d %>%
filter(n.recorder > 1) %>%
nrow()
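If that count is not zero it helps to see which recorders are affected (a sketch):
# recorders that still have more than one .DAT file in the copied set
d %>%
  filter(n.recorder > 1) %>%
  count(recorder, name = "n_files") %>%
  arrange(desc(n_files))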
library(omar)
con <- connect_mar()
ak <- c("ISL.ELDI.", "UTE.DUMMY", "ISL.MILLI.", "ISL.RAF.",
"ESKIFJORDUR", "ISL.GPS.", "ISL.MICRO.", "BIOPOL",
"Hafro.Selur", "SL.EHJ", "Pit tag", "DST", "Pop-up",
"RAF")
rec.dst <-
taggart(con) %>%
filter(audkenni %in% ak) %>%
collect(n = Inf)
… just the taggart bookkeeping:
rec.dst %>%
mutate(recaptured = ifelse(!is.na(rdate), TRUE, FALSE),
onrecord = ifelse(!is.na(dst_id), TRUE, FALSE)) %>%
count(rsid, audkenni, recaptured, onrecord) %>%
knitr::kable()
… join with the mined data:
rec.dst %>%
mutate(recaptured = ifelse(!is.na(rdate), TRUE, FALSE),
onrecord = ifelse(!is.na(dst_id), TRUE, FALSE)) %>%
full_join(d %>%
# distinct to avoid duplicating rows should a recorder still appear more than once
distinct(recorder) %>%
mutate(on.disk = TRUE),
by = c("dst_id" = "recorder")) %>%
mutate(on.disk = replace_na(on.disk, FALSE)) %>%
count(rsid, audkenni, recaptured, onrecord, on.disk) %>%
knitr::kable()
… hmmmmm,
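One way to make that concrete (a sketch, not part of the original analysis): list the recaptured tags that taggart has on record but for which no .DAT profile was found on disk:
# recaptured DSTs on record in taggart but with no profile on disk
rec.dst %>%
  mutate(recaptured = !is.na(rdate),
         onrecord = !is.na(dst_id)) %>%
  left_join(d %>%
              distinct(recorder) %>%
              mutate(on.disk = TRUE),
            by = c("dst_id" = "recorder")) %>%
  mutate(on.disk = replace_na(on.disk, FALSE)) %>%
  filter(recaptured, onrecord, !on.disk) %>%
  select(audkenni, dst_id, rdate)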