library(tidyverse)
library(readxl)
library(janitor)

# IMPORT -----------------------------------------------------------------------
# the read_excel function looks at the first 1000 rows of data to
#  obtain the likely type (character, numerical, date, ...) of a variable
# in order to overwrite that default we increase the guess_max value
d <- read_excel("DATASET_kenya.xlsx", guess_max = 1e5)
# lets see what we go
d |> glimpse()
# let actually clean up the variable names a bit

# MAKE NICE VARIABLE NAMES -----------------------------------------------------
d <- 
  d |> 
  clean_names()
d |> glimpse()
d |> view()
# lets just look at a subset of the data
d2 <- 
  d |> 
  select(date_org = date_dd_mm_yyyy,
         year,
         month = month_name_in_full,
         latin = species_name,
         fao = fao_code,
         length = tl_cm,
         wt_org = ungutted_fish_weight_kg)
d2 |> glimpse()
# So we have date in a format that looks like and integer
#  and wt_raw is a character

# CHECK DATES ------------------------------------------------------------------
# Lets just look at some dates
d2$date_org |> unique()   # unique gives me unique values
# so date is a mixture of numbers and something that looks like a date format
# we could try to fix this by
d2 <- 
  d2 |> 
  mutate(date = case_when(!is.na(as.integer(date_org)) ~ excel_numeric_to_date(as.numeric(date_org)),
                          .default = dmy(date_org)),
         .after = date_org)
# Lets check if we have any missing dates (which we do not expect)
d2_check <- 
  d2 |> 
  filter(is.na(date))
# note that here we have 10694 records with missing "derived" date
d2_check |> view()
# seems like we have date records which were of the format mdy (even though the
#  original variable name on date was explicitly written as: "date_dd_mm_yyyy".
# We could try to fix this by using "mdy" function
d2 <-
  d2 |> 
  mutate(date = case_when(is.na(date) ~ mdy(date_org),
                          .default = date))
d2 |> 
  filter(is.na(date))
# ok, so now only two dates missing but that is because there is nothing in
#  the date_org

# even though we now managed to create a proper R-date one should never trust
# blindly that one has created the correct date. this has nothing to do with
# R but excel which can make a real mess of things when it comes to dates

# in the above we actually selected some few columns (object d2), lets here
# repeat the convertion do date on the dataframe that has all the variables
d <-
  d |> 
  mutate(date = case_when(!is.na(as.integer(date_dd_mm_yyyy)) ~ excel_numeric_to_date(as.numeric(date_dd_mm_yyyy)),
                          .default = dmy(date_dd_mm_yyyy)),
         .after = date_dd_mm_yyyy) |> 
  mutate(date = case_when(is.na(date) ~ mdy(date_dd_mm_yyyy),
                          .default = date))
  
d |> glimpse()
d |> filter(is.na(date)) |> glimpse()


# One thing we notice is that the wt_org is a character but we expected a
#  numeric. To convert it to numeric we do:
d2 <- 
  d2 |> 
  mutate(wt = as.numeric(wt_org))
d2
# lets check if some numbers got missing in the convertions:
d2 |> 
  filter(!is.na(wt_org) & is.na(wt))
# does not seem to be the case, so lets do our first exploratory analysis
d2 |> 
  ggplot(aes(length, wt)) +
  geom_point(size = 2)
# hmm..., some fishes that are less than 50 cm almost half a tonne
#  these entries need to be corrected or fixed before we proceed

# lets for now just filter out weights more than 10 kg
d2 |> 
  filter(wt < 10) |> 
  ggplot(aes(length, wt)) +
  geom_point(size = 0.5, alpha = 0.5) +
  facet_wrap(~ latin)
# hmmmm, ...

# if we plot things on a log-log scale we should expect a straigh line:
d2 |> 
  filter(wt < 10) |> 
  ggplot(aes(length, wt)) +
  geom_hex() +
  facet_wrap(~ latin) +
  scale_x_log10() +
  scale_y_log10() +
  scale_fill_viridis_c()

# say this because
tibble(length = 1:100) |> 
  mutate(wt = 0.01 * length^3) |> 
  ggplot(aes(length, wt)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

# so I will make a guess that length is a length class and weights could be
# a sum of weights of fishes that fall in that length class
#  so question is: in which column does one find the number of fish reported
#                  within a given length class
# Lesson is: one has to know the data, a data-monkey like me needs to know
#            the details