# One-off setup, if the packages are missing:
# install.packages(c("readxl", "janitor"))
library(tidyverse)
library(readxl)
library(janitor)

# Import the raw workbook ----
d <- read_excel("data-raw/DATASET_srilanka.xlsx")
glimpse(d)

# Clean the column names ----
d <- clean_names(d)
glimpse(d)

# Inspect the imported date column ----
d |> pull(date)

# The date column as imported is all over the place. We have:
#  * values that look like a human-readable date, like "22/05/2019";
#    most of those appear to be in the format "DD/MM/YYYY"
#  * values that are just an integer, like "43471";
#    these are Excel's numerical representation of a date
# To get a proper date in R we therefore have to convert in two ways:
# entries that coerce to an integer go through excel_numeric_to_date(),
# everything else is parsed as day-month-year text.
d <- d |>
  rename(date_org = date) |>
  mutate(
    date = if_else(
      is.na(as.integer(date_org)),
      dmy(date_org),
      excel_numeric_to_date(as.numeric(date_org))
    )
  )

# Let's see what we got:
knitr::kable(d)

# We know the data were entered so that the dates were supposed to be in
# chronological order. Even after converting the values to a date format that
# R understands, that order is broken: e.g. right after the row dated
# "2019-05-31" we get the date "2019-01-06". This means that the date as
# originally entered in Excel was wrong, so we need to do something other
# than what was tried above.
# Second attempt at the dates ----
#
# Here we do:
#  * Convert each raw entry to a date (Excel serial numbers with
#    excel_numeric_to_date(), text entries with dmy()).
#  * Split the date into year, month and day.
#  * Where the date entry is wrong, interpret day as month and month as day.
#  * Where it is right, just keep that value.
#
# NOTE(review): "wrong" is taken to mean "arrived as an Excel serial number".
# The serial 43471 decodes to 2019-01-06 where 2019-06-01 is expected, i.e.
# Excel read the typed DD/MM/YYYY as MM/DD/YYYY. Confirm that this holds for
# every serial-number row before relying on the dates.
#
# Let's start from scratch; only the three variables used below are kept, so
# the raw date column and the helper columns are dropped.
d <- read_excel("data-raw/DATASET_srilanka.xlsx") |>
  janitor::clean_names() |>
  rename(date_org = date) |>
  mutate(
    # Numeric value of the raw entry; NA (by design) for text entries, hence
    # the suppressed coercion warning.
    serial = suppressWarnings(as.numeric(date_org)),
    is_serial = !is.na(serial),
    from_serial = excel_numeric_to_date(serial),
    # Blank out the serial entries first so that dmy() only warns about text
    # entries that genuinely fail to parse.
    from_text = dmy(replace(date_org, is_serial, NA)),
    date = if_else(
      is_serial,
      # Swap day and month; make_date() yields NA if the swap is not a
      # valid calendar date.
      make_date(
        year = year(from_serial),
        month = day(from_serial),
        day = month(from_serial)
      ),
      from_text
    )
  ) |>
  select(
    date,
    boats = number_of_boats,
    catch = catch_number_of_individuals
  )

# Let's check boats vs catch ----
d |>
  ggplot(aes(boats, catch)) +
  geom_point() +
  # fit a linear model:
  geom_smooth(method = "lm")

# Let's calculate cpue (catch per boat) and then plot the histogram ----
d |>
  mutate(cpue = catch / boats) |>
  ggplot(aes(cpue)) +
  geom_histogram()

# Let's look at the cpue over time ----
d |>
  mutate(cpue = catch / boats) |>
  ggplot(aes(date, cpue)) +
  geom_point() +
  geom_smooth()

# So most observations are in June and July.
# Let's look at the number of "observations" (total boats) by month ----
d |>
  mutate(date = floor_date(date, "month")) |>
  summarise(boats = sum(boats), .by = date) |>
  ggplot(aes(date, boats)) +
  geom_col()