Climate Data

In part three, I mentioned that I didn't have actual sunlight hours, so I used proxy measures such as specific photovoltaic power output. Further research led me to the Bureau of Meteorology's average climate maps.

This had the data I was after (daily average sunshine hours for each month), as well as several related layers: frost frequency, Köppen climate classifications, temperature zones, and seasonal rainfall.


The files were available as ASCII grids, a format I hadn't worked with before this project. They are formatted .txt files with a structure similar to the one below. The raster::raster function imports them without issue; however, there is no CRS attached.



ncols         178
nrows         139
xllcorner     111.875
yllcorner     -44.625
cellsize      0.25
NODATA_value  -9999
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3
3 4 4 4 5 5 5 6 6 6 7 7 8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 
7 7 7 7 7 7 7 7 7 7 7 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
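
Once unpacked, reading any one of the grids confirms the missing CRS. A minimal check, assuming sunjan.txt as an example path:

sun_jan = raster::raster("input/09 climate grids/grids/sun/sunjan.txt")
raster::crs(sun_jan)
# expect NA: the ASCII grid format carries no projection information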


As always, I worked backwards from the desired data structure: a wide-format data frame with a column for each variable, expressed as booleans for factor levels and numerics for quantitative variables. Scaling would be handled in the data preparation stages.
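
For illustration, a couple of hypothetical rows of the target structure (the column names follow the conventions used later in this post; the values are made up):

tibble::tribble(
  ~sunshine_hours_in_jan, ~kpnall_tropical_savanna, ~kpnall_equatorial_savanna,
                     9.1,                        1,                          0,
                     8.4,                        0,                          1
)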

The first step was to download and unpack the zips:



pacman::p_load(tidyverse, curl, sf, glue, fasterize)
# raster, zip and data.table are called via :: below; they need to be
# installed but are deliberately left unattached (raster masks dplyr::select)
root = "input/09 climate grids"
#http://www.bom.gov.au/jsp/ncc/climate_averages/climate-classifications/index.jsp?maptype=tmp_zones#maps

climate_files = c("http://www.bom.gov.au/web01/ncc/www/climatology/climate-classification/seasgrpb.zip",
          "http://www.bom.gov.au/web01/ncc/www/climatology/climate-classification/seasb.zip",
          "http://www.bom.gov.au/web01/ncc/www/climatology/climate-classification/kpngrp.zip",
          "http://www.bom.gov.au/web01/ncc/www/climatology/climate-classification/kpn.zip",
          "http://www.bom.gov.au/web01/ncc/www/climatology/climate-classification/tmp_zones.zip")

sun_files = month.abb %>%
  str_to_lower() %>%
  paste0("http://www.bom.gov.au/web01/ncc/www/climatology/sunshine/sun",.,".zip")

frost_files = c('http://www.bom.gov.au/web01/ncc/www/climatology/frost/frost-ltm5deg-7605-an.zip',
                'http://www.bom.gov.au/web01/ncc/www/climatology/frost/frost-ltm2deg-7605-an.zip',
                'http://www.bom.gov.au/web01/ncc/www/climatology/frost/frost-lt0deg-7605-an.zip',
                'http://www.bom.gov.au/web01/ncc/www/climatology/frost/frost-lt2deg-7605-an.zip')

zip_files = c(climate_files,sun_files,frost_files)

download_zip_and_unpack = function(zip){
  file_name = zip %>% str_extract("[^\\/]+(?=\\.zip)")
  zip_path = glue("{root}/zip/{file_name}.zip")
  # Create the target folders on the first run
  if(!dir.exists(glue("{root}/zip"))){dir.create(glue("{root}/zip"), showWarnings = FALSE, recursive = TRUE)}
  if(!dir.exists(glue("{root}/grids"))){dir.create(glue("{root}/grids"), showWarnings = FALSE, recursive = TRUE)}
  # Only download archives we don't already have
  if(!file.exists(zip_path)){curl_download(zip, zip_path)}
  # The sun and frost layers share one folder each; classification zips get their own
  folder_name = case_when(str_detect(zip, "sunshine") ~ "sun",
                          str_detect(zip, "frost") ~ "frost",
                          TRUE ~ file_name)
  zip::unzip(zip_path, exdir = glue("{root}/grids/{folder_name}"))
  unzipped_files = list.files(path = glue("{root}/grids/{folder_name}"), full.names = TRUE)
  # Return the path to each zip's .txt grid, skipping any readme files
  ifelse(str_detect(zip, "(sunshine|frost)"),
         glue("{root}/grids/{folder_name}/{file_name}.txt"),
         unzipped_files[!str_detect(unzipped_files, "readme")])
}

zip_files %>% map_chr(download_zip_and_unpack)

all_files = list.files(glue("{root}/grids"),
                       recursive = TRUE,
                       pattern = "\\.txt$", # escape the dot so only .txt files match
                       full.names = TRUE) %>%
  str_subset("readme", negate = TRUE)
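
A quick, illustrative sanity check that everything unpacked (the exact paths and count depend on the downloads above):

length(all_files) # one .txt per layer: 12 sun months, 4 frost grids, plus the classification grids
head(all_files, 2)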


The download_zip_and_unpack function handles creating a folder for each layer and unzipping the archive into it, returning the path to each zip's .txt file. This leaves us with a folder structure like so:


──── 09 climate grids
     ├─── grids
     │    ├─── frost
     │    ├─── kpn
     │    ├─── kpngrp
     │    ├─── seasb
     │    ├─── seasgrpb
     │    ├─── sun
     │    └─── tmp_zones
     └─── zip


I then wrote a function that:

  1. Imported the raw ASCII file as a raster object
  2. Set the CRS to match the project
  3. Wrote the raster to a GeoTIFF (if you know a better way, please make a PR)
  4. Translated the GeoTIFF to the resolution of the project using GDAL
  5. Translated the GeoTIFF from (4) to the extent of the project
  6. Removed the intermediate files and returned the path to the corrected file



raster_preprocessing = function(file){
  res = 360 / 8640 # target resolution of the project, in degrees
  raw = raster::raster(file)
  # The ASCII grids carry no CRS, so set it to match the project
  raster::crs(raw) = "+proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs"
  tif_path = str_replace_all(file, "\\.txt$", ".tif")
  raster::writeRaster(raw, filename = tif_path, overwrite = TRUE)
  # Resample to the project resolution
  sf::gdal_utils("translate",
                 source = tif_path,
                 destination = str_replace(tif_path, "\\.tif", "_temp.tif"),
                 options = c("-tr", res, res))
  # gdal_translate -projwin ulx uly lrx lry inraster.tif outraster.tif
  # Crop to the project extent
  sf::gdal_utils("translate",
                 source = str_replace(tif_path, "\\.tif", "_temp.tif"),
                 destination = tif_path,
                 options = c("-projwin",
                             112,
                             -10,
                             154,
                             -44))
  file.remove(str_replace(tif_path, "\\.tif", "_temp.tif"))
  tif_path
}
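
As a quick illustration (again assuming sunjan.txt as an example path), the returned GeoTIFF should now be at the project resolution:

fixed = raster_preprocessing("input/09 climate grids/grids/sun/sunjan.txt")
raster::res(raster::raster(fixed))
# expect 0.04166667 0.04166667, i.e. 360 / 8640 degrees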


I then revisited the raster extraction code from part 3. Because raster_preprocessing returns the path to the corrected file, I could pass its output directly into singleband_raster_to_dataframe.



singleband_raster_to_dataframe = function(img){
  # Read the GeoTIFF and coerce it to a brick to access its values and band name
  img_brick = img %>%
    raster::raster() %>%
    raster::brick()

  # The band name defaults to the file name, which becomes the column name
  col_name = img_brick@data@names[1]

  img_brick@data@values %>%
    as.data.frame() %>%
    set_names(nm = str_to_lower(col_name))
}

df = all_files %>%
  map_chr(raster_preprocessing) %>%
  map_dfc(singleband_raster_to_dataframe)


This was fine; however, the column names were unintuitive, which would make interpretation harder down the line:



df %>% glimpse(width = 80)
Observations: 822,528
Variables: 20
$ flt0.ann   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ fltm2.ann  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ fltm5.ann  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ frostann   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ kpnall     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ kpngrp     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ seasrain   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ sunapr     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunaug     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sundec     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunfeb     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunjan     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunjul     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunjun     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunmar     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunmay     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunnov     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunoct     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ sunsep     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ clim.zones <dbl> NA, 5, 5, 2, 5, 5, 2, 2, 5, 5, 2, 2, 5, 2, 2, 5, 2, 2, 5, 


To counter this, I used a CSV with tidied names and joined it by category and factor level.



category_map = glue("{root}/grids/category_map.csv") %>% data.table::fread()


The category column matches the feature names in df; thus, category_map.csv acts as a lookup table keyed on category and value (from), mapping each pair to a tidy column name (to). This allows for a neat left_join with no duplicates.



|category | from|to                                          |
|:--------|----:|:-------------------------------------------|
|kpnall   |   42|kpnall_equatorial_rainforest_monsoonal      |
|kpnall   |   41|kpnall_equatorial_savanna                   |
|kpnall   |   37|kpnall_tropical_rainforest_persistently_wet |
|kpnall   |   36|kpnall_tropical_rainforest_monsoonal        |
|kpnall   |   35|kpnall_tropical_savanna                     |
|kpnall   |   34|kpnall_subtropical_no_dry_season            |
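
As a minimal illustration of the keyed join, using the rows from the table above:

tibble(column = c(35, 34), category = "kpnall") %>%
  left_join(category_map, by = c("column" = "from", "category" = "category"))
# maps 35 to kpnall_tropical_savanna and 34 to kpnall_subtropical_no_dry_season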


I perform this join inside the categorical_column_to_wide function and exploit the features of pivot_wider to achieve my target wide data frame directly.


Stepping through this function: it takes a data frame (df) and a column (col), adds a row index to maintain identity, and pivots a dummy value column by the levels in col. It then removes the original column from df, appends the wide level columns, and returns the modified data frame, ready to be piped into the next call.



categorical_column_to_wide = function(df, col){
  col_enq = enquo(col)

  vector_levels = df %>%
    select(!!col_enq) %>%
    set_names("column") %>%
    # The row index preserves identity through the pivot; value is the dummy
    mutate(n = row_number(),
           value = 1,
           category = col) %>%
    # Attach the tidy column names from the category map
    left_join(category_map,
              by = c("column" = "from", "category" = "category")) %>%
    # One boolean column per factor level, filled with 0 where absent
    pivot_wider(names_from = "to",
                values_from = value,
                values_fill = list(value = 0)) %>%
    select(-n, -category, -column)
  # Unmapped values pivot into an NA column; drop it if present
  if ("NA" %in% names(vector_levels)) {
    vector_levels = vector_levels %>% select(-`NA`)
  }
  # Replace the original column with its wide representation
  df %>% select(-!!col_enq) %>% bind_cols(vector_levels)
}
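
As a toy example, assuming category_map is loaded and using the values from the table above:

tibble(kpnall = c(35, 34)) %>%
  categorical_column_to_wide("kpnall")
# returns columns kpnall_tropical_savanna and kpnall_subtropical_no_dry_season,
# holding 1 where a row matched that level and 0 elsewhere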


I then pipe the data frame through this function once for each categorical column, before binding on the lat-lon data in df_points.csv and writing a wide data frame of the climate data.



df_wide = df %>%
  categorical_column_to_wide("clim.zones") %>%
  categorical_column_to_wide("seasrain") %>%
  categorical_column_to_wide("kpngrp") %>%
  categorical_column_to_wide("kpnall") %>%
  # Replacement order matters: "fltm" must be handled before "flt"
  set_names(names(.) %>% str_replace_all(c("fltm" = "frost_days_less_than_minus_",
                                           "\\.ann" = "",
                                           "flt"  = "frost_days_less_than_",
                                           "frostann" = "frost_days_per_annum",
                                           "sun" = "sunshine_hours_in_")))

df_wide %>%
  bind_cols(data.table::fread("input/df_points.csv")) %>%
  data.table::fwrite(glue("{root}/df_climate.csv"))
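
A quick sanity check of the renaming against two of the raw names:

c("fltm2.ann", "sunjan") %>%
  str_replace_all(c("fltm" = "frost_days_less_than_minus_",
                    "\\.ann" = "",
                    "flt" = "frost_days_less_than_",
                    "frostann" = "frost_days_per_annum",
                    "sun" = "sunshine_hours_in_"))
# [1] "frost_days_less_than_minus_2" "sunshine_hours_in_jan"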