auscensus - A more complex case

This vignette shows a more complex use case of auscensus. Let’s assume we want to extract the percentage of Australian Citizens for all Commonwealth Electoral Divisions, as measured in last 4 Censuses (2006-2021).

An initial exploration shows that this data can be found in table 01 (across all four censuses) - which provided an statistical summary. However, is not published aggregated by electorate across all censuses.


#| eval: true

census_tables <- list_census_tables() |>
                 filter(Number %in% c("01")) 

tables_summary <- census_tables |>
                  select(-contains("Table")) |>
                  pivot_longer(-Number, names_to="Year",values_to = "Value") |>
                  filter(!is.na(Value)) |>
                  select(-Value)


list_census_geo_tables(year = tables_summary$Year,geo="CED|CED_2007|CED_2004",table_number = tables_summary$Number) |>
  mutate(Year=as.character(Year)) |>
  right_join(tables_summary, by=c("table_number"="Number","Year"="Year"))
#> # A tibble: 4 × 3
#>   Year  table_number CED  
#>   <chr> <chr>        <lgl>
#> 1 2011  01           TRUE 
#> 2 2016  01           TRUE 
#> 3 2021  01           TRUE 
#> 4 2006  01           NA

Therefore, we will retrieve the data from the lowest statistical unit. However, SA1 were not available in 2006 - where the smallest area was a “CD”.


#| eval: true

list_census_geo_tables(year = tables_summary$Year,geo=c("SA1|CD"),table_number = tables_summary$Number) |>
   mutate(Year=as.character(Year)) |>
  right_join(tables_summary, by=c("table_number"="Number","Year"="Year")) 
#> # A tibble: 4 × 4
#>   Year  table_number CD    SA1  
#>   <chr> <chr>        <lgl> <lgl>
#> 1 2006  01           TRUE  NA   
#> 2 2011  01           NA    TRUE 
#> 3 2016  01           NA    TRUE 
#> 4 2021  01           NA    TRUE

The next step is to figure the attributes for the numbers of Australian citizen and total population, which are presented below:


citizenship_attributes <- tibble()

for(i in 1:nrow(tables_summary)){

  table_i <- tables_summary[i,]$Number
  year_i  <- tables_summary[i,]$Year

  attr_i <- list_census_attributes(table_i) |> 
             pivot_longer(-c(Table,Attribute),
                           names_to="Year",values_to = "Value") |>
              filter(Year==year_i) |>
              filter(!is.na(Value)) |>
              select(-Value)

  citizenship_attributes <- bind_rows(attr_i,citizenship_attributes)

}



citizenship_attributes |>
  head()
#> # A tibble: 6 × 3
#>   Table Attribute                     Year 
#>   <chr> <chr>                         <chr>
#> 1 01    Total Persons_males           2006 
#> 2 01    Total Persons_females         2006 
#> 3 01    Total Persons_persons         2006 
#> 4 01    Age Groups: 0-4 Years_males   2006 
#> 5 01    Age Groups: 0-4 Years_females 2006 
#> 6 01    Age Groups: 0-4 Years_persons 2006

citizenship_attributes <- citizenship_attributes   |> 
                       distinct(Attribute,Year) |>
                      filter(str_detect(Attribute,"[Aa]ustralian")) |>
                      filter(str_detect(Attribute,"[Pp]erson")) |>
                      mutate(dummy=TRUE) |>
                      pivot_wider(names_from = Year, values_from = dummy)  |>
                      bind_rows(
                         citizenship_attributes   |> 
                       distinct(Attribute,Year) |>
                      filter(str_detect(Attribute,"^[Tt]otal")) |>
                      filter(str_detect(Attribute,"[Pp]ersons$")) |>
                      mutate(dummy=TRUE) |>
                      pivot_wider(names_from = Year, values_from = dummy)
                      )


citizenship_attributes 
#> # A tibble: 4 × 5
#>   Attribute                  `2006` `2011` `2016` `2021`
#>   <chr>                      <lgl>  <lgl>  <lgl>  <lgl> 
#> 1 Australian Citizen_persons TRUE   NA     NA     NA    
#> 2 Australian_citizen_persons NA     TRUE   TRUE   TRUE  
#> 3 Total Persons_persons      TRUE   NA     NA     NA    
#> 4 Total_persons_persons      NA     TRUE   TRUE   TRUE

Using attribute_tibble_to_list, this data frame can be converted into the required format.


citizenship_attributes <- citizenship_attributes |>
                          select(Attribute)      |>
                          mutate(AttrNew = case_when(
                            str_detect(Attribute,"Australian") ~ "Australian Citizens",
                            str_detect(Attribute,"Total")      ~ "Total"
                          ))

levels <- attribute_tibble_to_list(citizenship_attributes)

Now, we can cycle through the four censuses and extract the data. Please note that CDs and SA1s are not equivalent, but they are stored together for convenience:

census_years <- c("2006","2011","2016","2021")

citizenship <- tibble()

for(year in census_years){

  if(year=="2006"){
    geo_structure_x <- "CD"
  } else{
    geo_structure_x <- "SA1"
  }

  citizenship_i <- get_census_summary(census_table = census_tables,
                                   selected_years = year,
                                   geo_structure=geo_structure_x,
                                   attribute = levels) 


  citizenship <- bind_rows(citizenship,citizenship_i)


}
#> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/CD_NSW_B01.csv': resource busy or locked 
#> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/CD_NSW_B01.csv': resource busy or locked 
#> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/2011Census_B01_AUST_SA1_short.csv': resource busy or locked 
#> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/2016Census_G01_AUS_SA1.csv': resource busy or locked 
#> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/2021Census_G01_AUST_SA1.csv': resource busy or locked

rm(citizenship_i,geo_structure_x,levels,citizenship_attributes)

To aggregate the data, aussiemaps::geo_aggregate() can help using area to apportion on non-overalpping cases. Then, this package’s calculate_percentage() will take the totals from the list and calculate percentages.


citizenship_ced<- tibble()
codes <- c("CD_CODE_2006","SA1_7DIGITCODE_2011","SA1_7DIGITCODE_2016","SA1_CODE_2021")
ceds  <- c("CED_NAME_2006","CED_NAME_2011","CED_NAME_2016","CED_NAME_2021")

 for(i in 1:length(census_years)){

   year <- as.double(census_years[i])

   value_i <- citizenship |> 
                  filter(Year==year)     |>
                  select(-Unit)                    |>
                  rename(!!codes[i]:="Census_Code") |>
                  collect() |>
                  aussiemaps::geo_aggregate(
                              values_col="Value",
                              original_geo=codes[i],
                              new_geo=ceds[i],
                              grouping_col = c("Year","Attribute"),
                              year=census_years[i]) |>
              rename("Unit"=ceds[i])  |>
              filter(!is.na(Unit))    |>
              filter(str_detect(Unit,"[Ss]hipping",TRUE))|>
              filter(str_detect(Unit,"[Uu]sual",TRUE))   |>
              filter(str_detect(Unit,"[Aa]pplicable",TRUE)) 

    value_i <- value_i  |> 
               auscensus::calculate_percentage(key_col = "Attribute",
                                                value_col = "Value",
                                                key_value = "Total",
                                                percentage_scale = 100)

   citizenship_ced <- bind_rows(citizenship_ced,value_i)


 }


citizenship_ced 
#> # A tibble: 595 × 6
#>    Unit       Year Attribute            Value  Total Percentage
#>    <chr>     <dbl> <chr>                <dbl>  <dbl>      <dbl>
#>  1 Adelaide   2006 Australian Citizens 115908 140151       82.7
#>  2 Aston      2006 Australian Citizens 115010 129463       88.8
#>  3 Ballarat   2006 Australian Citizens 118247 129221       91.5
#>  4 Banks      2006 Australian Citizens 104181 118371       88.0
#>  5 Barker     2006 Australian Citizens 133033 145410       91.5
#>  6 Barton     2006 Australian Citizens 105287 131234       80.2
#>  7 Bass       2006 Australian Citizens  85631  94270       90.8
#>  8 Batman     2006 Australian Citizens 105561 128107       82.4
#>  9 Bendigo    2006 Australian Citizens 121930 131885       92.5
#> 10 Bennelong  2006 Australian Citizens 104473 129340       80.8
#> # … with 585 more rows