library(tidyverse) library(readxl) library(janitor) # IMPORT ----------------------------------------------------------------------- # the read_excel function looks at the first 1000 rows of data to # obtain the likely type (character, numerical, date, ...) of a variable # in order to overwrite that default we increase the guess_max value d <- read_excel("DATASET_kenya.xlsx", guess_max = 1e5) # lets see what we go d |> glimpse() # let actually clean up the variable names a bit # MAKE NICE VARIABLE NAMES ----------------------------------------------------- d <- d |> clean_names() d |> glimpse() d |> view() # lets just look at a subset of the data d2 <- d |> select(date_org = date_dd_mm_yyyy, year, month = month_name_in_full, latin = species_name, fao = fao_code, length = tl_cm, wt_org = ungutted_fish_weight_kg) d2 |> glimpse() # So we have date in a format that looks like and integer # and wt_raw is a character # CHECK DATES ------------------------------------------------------------------ # Lets just look at some dates d2$date_org |> unique() # unique gives me unique values # so date is a mixture of numbers and something that looks like a date format # we could try to fix this by d2 <- d2 |> mutate(date = case_when(!is.na(as.integer(date_org)) ~ excel_numeric_to_date(as.numeric(date_org)), .default = dmy(date_org)), .after = date_org) # Lets check if we have any missing dates (which we do not expect) d2_check <- d2 |> filter(is.na(date)) # note that here we have 10694 records with missing "derived" date d2_check |> view() # seems like we have date records which were of the format mdy (even though the # original variable name on date was explicitly written as: "date_dd_mm_yyyy". # We could try to fix this by using "mdy" function d2 <- d2 |> mutate(date = case_when(is.na(date) ~ mdy(date_org), .default = date)) d2 |> filter(is.na(date)) # ok, so now only two dates missing but that is because there is nothing in # the date_org # even though we now managed to create a proper R-date one should never trust # blindly that one has created the correct date. this has nothing to do with # R but excel which can make a real mess of things when it comes to dates # in the above we actually selected some few columns (object d2), lets here # repeat the convertion do date on the dataframe that has all the variables d <- d |> mutate(date = case_when(!is.na(as.integer(date_dd_mm_yyyy)) ~ excel_numeric_to_date(as.numeric(date_dd_mm_yyyy)), .default = dmy(date_dd_mm_yyyy)), .after = date_dd_mm_yyyy) |> mutate(date = case_when(is.na(date) ~ mdy(date_dd_mm_yyyy), .default = date)) d |> glimpse() d |> filter(is.na(date)) |> glimpse() # One thing we notice is that the wt_org is a character but we expected a # numeric. To convert it to numeric we do: d2 <- d2 |> mutate(wt = as.numeric(wt_org)) d2 # lets check if some numbers got missing in the convertions: d2 |> filter(!is.na(wt_org) & is.na(wt)) # does not seem to be the case, so lets do our first exploratory analysis d2 |> ggplot(aes(length, wt)) + geom_point(size = 2) # hmm..., some fishes that are less than 50 cm almost half a tonne # these entries need to be corrected or fixed before we proceed # lets for now just filter out weights more than 10 kg d2 |> filter(wt < 10) |> ggplot(aes(length, wt)) + geom_point(size = 0.5, alpha = 0.5) + facet_wrap(~ latin) # hmmmm, ... # if we plot things on a log-log scale we should expect a straigh line: d2 |> filter(wt < 10) |> ggplot(aes(length, wt)) + geom_hex() + facet_wrap(~ latin) + scale_x_log10() + scale_y_log10() + scale_fill_viridis_c() # say this because tibble(length = 1:100) |> mutate(wt = 0.01 * length^3) |> ggplot(aes(length, wt)) + geom_point() + scale_x_log10() + scale_y_log10() # so I will make a guess that length is a length class and weights could be # a sum of weights of fishes that fall in that length class # so question is: in which column does one find the number of fish reported # within a given length class # Lesson is: one has to know the data, a data-monkey like me needs to know # the details