--- title: "auscensus - A more complex case" resource_files: - vignetttes/house-preference-flow*.png author: "Carlos YANEZ SANTIBANEZ" date: "2023-01-14" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{auscensus - A more complex case} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette shows a more complex use case of auscensus. Let's assume we want to extract the percentage of Australian Citizens for all Commonwealth Electoral Divisions, as measured in last 4 Censuses (2006-2021). An initial exploration shows that this data can be found in table 01 (across all four censuses) - which provided an statistical summary. However, is not published aggregated by electorate across all censuses. ```r #| eval: true census_tables <- list_census_tables() |> filter(Number %in% c("01")) tables_summary <- census_tables |> select(-contains("Table")) |> pivot_longer(-Number, names_to="Year",values_to = "Value") |> filter(!is.na(Value)) |> select(-Value) list_census_geo_tables(year = tables_summary$Year,geo="CED|CED_2007|CED_2004",table_number = tables_summary$Number) |> mutate(Year=as.character(Year)) |> right_join(tables_summary, by=c("table_number"="Number","Year"="Year")) #> # A tibble: 4 × 3 #> Year table_number CED #> #> 1 2011 01 TRUE #> 2 2016 01 TRUE #> 3 2021 01 TRUE #> 4 2006 01 NA ``` Therefore, we will retrieve the data from the lowest statistical unit. However, SA1 were not available in 2006 - where the smallest area was a "CD". ```r #| eval: true list_census_geo_tables(year = tables_summary$Year,geo=c("SA1|CD"),table_number = tables_summary$Number) |> mutate(Year=as.character(Year)) |> right_join(tables_summary, by=c("table_number"="Number","Year"="Year")) #> # A tibble: 4 × 4 #> Year table_number CD SA1 #> #> 1 2006 01 TRUE NA #> 2 2011 01 NA TRUE #> 3 2016 01 NA TRUE #> 4 2021 01 NA TRUE ``` The next step is to figure the attributes for the numbers of Australian citizen and total population, which are presented below: ```r citizenship_attributes <- tibble() for(i in 1:nrow(tables_summary)){ table_i <- tables_summary[i,]$Number year_i <- tables_summary[i,]$Year attr_i <- list_census_attributes(table_i) |> pivot_longer(-c(Table,Attribute), names_to="Year",values_to = "Value") |> filter(Year==year_i) |> filter(!is.na(Value)) |> select(-Value) citizenship_attributes <- bind_rows(attr_i,citizenship_attributes) } citizenship_attributes |> head() #> # A tibble: 6 × 3 #> Table Attribute Year #> #> 1 01 Total Persons_males 2006 #> 2 01 Total Persons_females 2006 #> 3 01 Total Persons_persons 2006 #> 4 01 Age Groups: 0-4 Years_males 2006 #> 5 01 Age Groups: 0-4 Years_females 2006 #> 6 01 Age Groups: 0-4 Years_persons 2006 ``` ```r citizenship_attributes <- citizenship_attributes |> distinct(Attribute,Year) |> filter(str_detect(Attribute,"[Aa]ustralian")) |> filter(str_detect(Attribute,"[Pp]erson")) |> mutate(dummy=TRUE) |> pivot_wider(names_from = Year, values_from = dummy) |> bind_rows( citizenship_attributes |> distinct(Attribute,Year) |> filter(str_detect(Attribute,"^[Tt]otal")) |> filter(str_detect(Attribute,"[Pp]ersons$")) |> mutate(dummy=TRUE) |> pivot_wider(names_from = Year, values_from = dummy) ) citizenship_attributes #> # A tibble: 4 × 5 #> Attribute `2006` `2011` `2016` `2021` #> #> 1 Australian Citizen_persons TRUE NA NA NA #> 2 Australian_citizen_persons NA TRUE TRUE TRUE #> 3 Total Persons_persons TRUE NA NA NA #> 4 Total_persons_persons NA TRUE TRUE TRUE ``` Using *attribute_tibble_to_list*, this data frame can be converted into the required format. ```r citizenship_attributes <- citizenship_attributes |> select(Attribute) |> mutate(AttrNew = case_when( str_detect(Attribute,"Australian") ~ "Australian Citizens", str_detect(Attribute,"Total") ~ "Total" )) levels <- attribute_tibble_to_list(citizenship_attributes) ``` Now, we can cycle through the four censuses and extract the data. Please note that CDs and SA1s are not equivalent, but they are stored together for convenience: ```r census_years <- c("2006","2011","2016","2021") citizenship <- tibble() for(year in census_years){ if(year=="2006"){ geo_structure_x <- "CD" } else{ geo_structure_x <- "SA1" } citizenship_i <- get_census_summary(census_table = census_tables, selected_years = year, geo_structure=geo_structure_x, attribute = levels) citizenship <- bind_rows(citizenship,citizenship_i) } #> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/CD_NSW_B01.csv': resource busy or locked #> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/CD_NSW_B01.csv': resource busy or locked #> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/2011Census_B01_AUST_SA1_short.csv': resource busy or locked #> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/2016Census_G01_AUS_SA1.csv': resource busy or locked #> ERROR : [EBUSY] Failed to remove 'C:/Users/carlo/OneDrive/Documents/.auscensus_cache/2021Census_G01_AUST_SA1.csv': resource busy or locked rm(citizenship_i,geo_structure_x,levels,citizenship_attributes) ``` To aggregate the data, **[aussiemaps::geo_aggregate()](https://gh.carlosyanez.id.au/aussiemaps/)** can help using area to apportion on non-overalpping cases. Then, this package's *calculate_percentage()* will take the totals from the list and calculate percentages. ```r citizenship_ced<- tibble() codes <- c("CD_CODE_2006","SA1_7DIGITCODE_2011","SA1_7DIGITCODE_2016","SA1_CODE_2021") ceds <- c("CED_NAME_2006","CED_NAME_2011","CED_NAME_2016","CED_NAME_2021") for(i in 1:length(census_years)){ year <- as.double(census_years[i]) value_i <- citizenship |> filter(Year==year) |> select(-Unit) |> rename(!!codes[i]:="Census_Code") |> collect() |> aussiemaps::geo_aggregate( values_col="Value", original_geo=codes[i], new_geo=ceds[i], grouping_col = c("Year","Attribute"), year=census_years[i]) |> rename("Unit"=ceds[i]) |> filter(!is.na(Unit)) |> filter(str_detect(Unit,"[Ss]hipping",TRUE))|> filter(str_detect(Unit,"[Uu]sual",TRUE)) |> filter(str_detect(Unit,"[Aa]pplicable",TRUE)) value_i <- value_i |> auscensus::calculate_percentage(key_col = "Attribute", value_col = "Value", key_value = "Total", percentage_scale = 100) citizenship_ced <- bind_rows(citizenship_ced,value_i) } citizenship_ced #> # A tibble: 595 × 6 #> Unit Year Attribute Value Total Percentage #> #> 1 Adelaide 2006 Australian Citizens 115908 140151 82.7 #> 2 Aston 2006 Australian Citizens 115010 129463 88.8 #> 3 Ballarat 2006 Australian Citizens 118247 129221 91.5 #> 4 Banks 2006 Australian Citizens 104181 118371 88.0 #> 5 Barker 2006 Australian Citizens 133033 145410 91.5 #> 6 Barton 2006 Australian Citizens 105287 131234 80.2 #> 7 Bass 2006 Australian Citizens 85631 94270 90.8 #> 8 Batman 2006 Australian Citizens 105561 128107 82.4 #> 9 Bendigo 2006 Australian Citizens 121930 131885 92.5 #> 10 Bennelong 2006 Australian Citizens 104473 129340 80.8 #> # … with 585 more rows ```