csfmt_rts_data_v2
`csfmt_rts_data_v2` (see `vignette("csfmt_rts_data_v2", package = "cstidy")`) is a data format for real-time surveillance.
# Generate the package's test dataset
d <- cstidy::generate_test_data()
# Apply the csfmt_rts_data_v2 format. The return value is not assigned, so `d`
# is presumably converted by reference (data.table semantics) -- TODO confirm
cstidy::set_csfmt_rts_data_v2(d)
# Looking at the dataset (the empty `[]` makes the table print)
d[]
#> granularity_time granularity_geo country_iso3 location_code border age
#> 1: isoyearweek county nor county_nor42 NA <NA>
#> 2: isoyearweek county nor county_nor32 NA <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA>
#> 4: isoyearweek county nor county_nor56 NA <NA>
#> 5: isoyearweek county nor county_nor34 NA <NA>
#> 6: isoyearweek county nor county_nor15 NA <NA>
#> 7: isoyearweek county nor county_nor18 NA <NA>
#> 8: isoyearweek county nor county_nor03 NA <NA>
#> 9: isoyearweek county nor county_nor11 NA <NA>
#> 10: isoyearweek county nor county_nor40 NA <NA>
#> 11: isoyearweek county nor county_nor55 NA <NA>
#> 12: isoyearweek county nor county_nor50 NA <NA>
#> 13: isoyearweek county nor county_nor39 NA <NA>
#> 14: isoyearweek county nor county_nor46 NA <NA>
#> 15: isoyearweek county nor county_nor31 NA <NA>
#> 16: isoyearweek county nor county_nor42 NA total
#> 17: isoyearweek county nor county_nor32 NA total
#> 18: isoyearweek county nor county_nor33 NA total
#> 19: isoyearweek county nor county_nor56 NA total
#> 20: isoyearweek county nor county_nor34 NA total
#> 21: isoyearweek county nor county_nor15 NA total
#> 22: isoyearweek county nor county_nor18 NA total
#> 23: isoyearweek county nor county_nor03 NA total
#> 24: isoyearweek county nor county_nor11 NA total
#> 25: isoyearweek county nor county_nor40 NA total
#> 26: isoyearweek county nor county_nor55 NA total
#> 27: isoyearweek county nor county_nor50 NA total
#> 28: isoyearweek county nor county_nor39 NA total
#> 29: isoyearweek county nor county_nor46 NA total
#> 30: isoyearweek county nor county_nor31 NA total
#> 31: isoyearweek county nor county_nor42 NA 000_005
#> 32: isoyearweek county nor county_nor32 NA 000_005
#> 33: isoyearweek county nor county_nor33 NA 000_005
#> 34: isoyearweek county nor county_nor56 NA 000_005
#> 35: isoyearweek county nor county_nor34 NA 000_005
#> 36: isoyearweek county nor county_nor15 NA 000_005
#> 37: isoyearweek county nor county_nor18 NA 000_005
#> 38: isoyearweek county nor county_nor03 NA 000_005
#> 39: isoyearweek county nor county_nor11 NA 000_005
#> 40: isoyearweek county nor county_nor40 NA 000_005
#> 41: isoyearweek county nor county_nor55 NA 000_005
#> 42: isoyearweek county nor county_nor50 NA 000_005
#> 43: isoyearweek county nor county_nor39 NA 000_005
#> 44: isoyearweek county nor county_nor46 NA 000_005
#> 45: isoyearweek county nor county_nor31 NA 000_005
#> granularity_time granularity_geo country_iso3 location_code border age
#> sex isoyear isoweek isoyearweek isoquarter isoyearquarter season
#> 1: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 2: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 3: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 4: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 5: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 6: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 7: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 8: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 9: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 10: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 11: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 12: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 13: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 14: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 15: <NA> 2022 3 2022-03 1 2022-Q1 2021/2022
#> 16: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 17: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 18: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 19: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 20: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 21: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 22: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 23: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 24: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 25: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 26: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 27: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 28: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 29: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 30: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 31: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 32: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 33: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 34: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 35: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 36: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 37: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 38: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 39: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 40: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 41: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 42: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 43: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 44: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> 45: total 2022 3 2022-03 1 2022-Q1 2021/2022
#> sex isoyear isoweek isoyearweek isoquarter isoyearquarter season
#> seasonweek calyear calmonth calyearmonth date deaths_n
#> 1: 26 NA NA <NA> 2022-01-23 4
#> 2: 26 NA NA <NA> 2022-01-23 7
#> 3: 26 NA NA <NA> 2022-01-23 6
#> 4: 26 NA NA <NA> 2022-01-23 3
#> 5: 26 NA NA <NA> 2022-01-23 4
#> 6: 26 NA NA <NA> 2022-01-23 8
#> 7: 26 NA NA <NA> 2022-01-23 8
#> 8: 26 NA NA <NA> 2022-01-23 4
#> 9: 26 NA NA <NA> 2022-01-23 7
#> 10: 26 NA NA <NA> 2022-01-23 4
#> 11: 26 NA NA <NA> 2022-01-23 7
#> 12: 26 NA NA <NA> 2022-01-23 4
#> 13: 26 NA NA <NA> 2022-01-23 1
#> 14: 26 NA NA <NA> 2022-01-23 3
#> 15: 26 NA NA <NA> 2022-01-23 4
#> 16: 26 NA NA <NA> 2022-01-23 4
#> 17: 26 NA NA <NA> 2022-01-23 7
#> 18: 26 NA NA <NA> 2022-01-23 6
#> 19: 26 NA NA <NA> 2022-01-23 3
#> 20: 26 NA NA <NA> 2022-01-23 4
#> 21: 26 NA NA <NA> 2022-01-23 8
#> 22: 26 NA NA <NA> 2022-01-23 8
#> 23: 26 NA NA <NA> 2022-01-23 4
#> 24: 26 NA NA <NA> 2022-01-23 7
#> 25: 26 NA NA <NA> 2022-01-23 4
#> 26: 26 NA NA <NA> 2022-01-23 7
#> 27: 26 NA NA <NA> 2022-01-23 4
#> 28: 26 NA NA <NA> 2022-01-23 1
#> 29: 26 NA NA <NA> 2022-01-23 3
#> 30: 26 NA NA <NA> 2022-01-23 4
#> 31: 26 NA NA <NA> 2022-01-23 4
#> 32: 26 NA NA <NA> 2022-01-23 7
#> 33: 26 NA NA <NA> 2022-01-23 6
#> 34: 26 NA NA <NA> 2022-01-23 3
#> 35: 26 NA NA <NA> 2022-01-23 4
#> 36: 26 NA NA <NA> 2022-01-23 8
#> 37: 26 NA NA <NA> 2022-01-23 8
#> 38: 26 NA NA <NA> 2022-01-23 4
#> 39: 26 NA NA <NA> 2022-01-23 7
#> 40: 26 NA NA <NA> 2022-01-23 4
#> 41: 26 NA NA <NA> 2022-01-23 7
#> 42: 26 NA NA <NA> 2022-01-23 4
#> 43: 26 NA NA <NA> 2022-01-23 1
#> 44: 26 NA NA <NA> 2022-01-23 3
#> 45: 26 NA NA <NA> 2022-01-23 4
#> seasonweek calyear calmonth calyearmonth date deaths_n
Smart assignment
`csfmt_rts_data_v2` does smart assignment for time and geography.
When one of the variables that heads a list below (`location_code`, `isoyear`, `isoyearweek` or `date`) is assigned using `:=`, the variables listed under it will be automatically imputed.
location_code:
- granularity_geo
- country_iso3
isoyear:
- granularity_time
- isoweek
- isoyearweek
- season
- seasonweek
- calyear
- calmonth
- calyearmonth
- date
isoyearweek:
- granularity_time
- isoyear
- isoweek
- season
- seasonweek
- calyear
- calmonth
- calyearmonth
- date
date:
- granularity_time
- isoyear
- isoweek
- isoyearweek
- season
- seasonweek
- calyear
- calmonth
- calyearmonth
# Keep only the first five rows of the test dataset so the printed examples
# below stay short
d <- cstidy::generate_test_data()[1:5]
# Apply the csfmt_rts_data_v2 format. The return value is not assigned, so `d`
# is presumably converted by reference (data.table semantics) -- TODO confirm
cstidy::set_csfmt_rts_data_v2(d)
# Looking at the dataset (the empty `[]` makes the table print)
d[]
#> granularity_time granularity_geo country_iso3 location_code border age sex
#> 1: isoyearweek county nor county_nor42 NA <NA> <NA>
#> 2: isoyearweek county nor county_nor32 NA <NA> <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA> <NA>
#> 4: isoyearweek county nor county_nor56 NA <NA> <NA>
#> 5: isoyearweek county nor county_nor34 NA <NA> <NA>
#> isoyear isoweek isoyearweek isoquarter isoyearquarter season seasonweek
#> 1: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 2: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 3: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 4: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 5: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> calyear calmonth calyearmonth date deaths_n
#> 1: NA NA <NA> 2022-01-23 9
#> 2: NA NA <NA> 2022-01-23 4
#> 3: NA NA <NA> 2022-01-23 3
#> 4: NA NA <NA> 2022-01-23 4
#> 5: NA NA <NA> 2022-01-23 5
# Smart assignment via isoyearweek (row 1): isoyear, isoweek, isoyearquarter,
# season, seasonweek and date are re-derived from the new value.
# granularity_time was already "isoyearweek", so it stays the same.
d[1, isoyearweek := "2021-01"]
d
#> granularity_time granularity_geo country_iso3 location_code border age sex
#> 1: isoyearweek county nor county_nor42 NA <NA> <NA>
#> 2: isoyearweek county nor county_nor32 NA <NA> <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA> <NA>
#> 4: isoyearweek county nor county_nor56 NA <NA> <NA>
#> 5: isoyearweek county nor county_nor34 NA <NA> <NA>
#> isoyear isoweek isoyearweek isoquarter isoyearquarter season seasonweek
#> 1: 2021 1 2021-01 1 2021-Q1 2020/2021 24
#> 2: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 3: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 4: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 5: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> calyear calmonth calyearmonth date deaths_n
#> 1: NA NA <NA> 2021-01-10 9
#> 2: NA NA <NA> 2022-01-23 4
#> 3: NA NA <NA> 2022-01-23 3
#> 4: NA NA <NA> 2022-01-23 4
#> 5: NA NA <NA> 2022-01-23 5
# Smart assignment via isoyear (row 2): granularity_time becomes "isoyear",
# isoweek/isoyearweek/date move to week 52 of 2019, and season/seasonweek
# become NA. In the output below isoquarter/isoyearquarter keep their old values.
d[2, isoyear := 2019]
d
#> granularity_time granularity_geo country_iso3 location_code border age sex
#> 1: isoyearweek county nor county_nor42 NA <NA> <NA>
#> 2: isoyear county nor county_nor32 NA <NA> <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA> <NA>
#> 4: isoyearweek county nor county_nor56 NA <NA> <NA>
#> 5: isoyearweek county nor county_nor34 NA <NA> <NA>
#> isoyear isoweek isoyearweek isoquarter isoyearquarter season seasonweek
#> 1: 2021 1 2021-01 1 2021-Q1 2020/2021 24
#> 2: 2019 52 2019-52 1 2022-Q1 <NA> NA
#> 3: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 4: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 5: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> calyear calmonth calyearmonth date deaths_n
#> 1: NA NA <NA> 2021-01-10 9
#> 2: NA NA <NA> 2019-12-29 4
#> 3: NA NA <NA> 2022-01-23 3
#> 4: NA NA <NA> 2022-01-23 4
#> 5: NA NA <NA> 2022-01-23 5
# Smart assignment via date (rows 4-5): granularity_time becomes "date" and the
# iso*, season* and cal* columns are all derived from the new date.
d[4:5, date := as.Date("2020-01-01")]
d
#> granularity_time granularity_geo country_iso3 location_code border age sex
#> 1: isoyearweek county nor county_nor42 NA <NA> <NA>
#> 2: isoyear county nor county_nor32 NA <NA> <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA> <NA>
#> 4: date county nor county_nor56 NA <NA> <NA>
#> 5: date county nor county_nor34 NA <NA> <NA>
#> isoyear isoweek isoyearweek isoquarter isoyearquarter season seasonweek
#> 1: 2021 1 2021-01 1 2021-Q1 2020/2021 24
#> 2: 2019 52 2019-52 1 2022-Q1 <NA> NA
#> 3: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 4: 2020 1 2020-01 1 2020-Q1 2019/2020 24
#> 5: 2020 1 2020-01 1 2020-Q1 2019/2020 24
#> calyear calmonth calyearmonth date deaths_n
#> 1: NA NA <NA> 2021-01-10 9
#> 2: NA NA <NA> 2019-12-29 4
#> 3: NA NA <NA> 2022-01-23 3
#> 4: 2020 1 2020-M01 2020-01-01 4
#> 5: 2020 1 2020-M01 2020-01-01 5
# Setting more than one time column in a single `:=` call disables smart
# assignment; a warning is raised instead of imputing the other columns.
d[1, c("isoyear", "isoyearweek") := .(2021, "2021-01")]
#> Warning in `[.csfmt_rts_data_v2`(d, 1, `:=`(c("isoyear", "isoyearweek"), :
#> Multiple time variables specified. Smart-assignment disabled.
d
#> granularity_time granularity_geo country_iso3 location_code border age sex
#> 1: isoyearweek county nor county_nor42 NA <NA> <NA>
#> 2: isoyear county nor county_nor32 NA <NA> <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA> <NA>
#> 4: date county nor county_nor56 NA <NA> <NA>
#> 5: date county nor county_nor34 NA <NA> <NA>
#> isoyear isoweek isoyearweek isoquarter isoyearquarter season seasonweek
#> 1: 2021 1 2021-01 1 2021-Q1 2020/2021 24
#> 2: 2019 52 2019-52 1 2022-Q1 <NA> NA
#> 3: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 4: 2020 1 2020-01 1 2020-Q1 2019/2020 24
#> 5: 2020 1 2020-01 1 2020-Q1 2019/2020 24
#> calyear calmonth calyearmonth date deaths_n
#> 1: NA NA <NA> 2021-01-10 9
#> 2: NA NA <NA> 2019-12-29 4
#> 3: NA NA <NA> 2022-01-23 3
#> 4: 2020 1 2020-M01 2020-01-01 4
#> 5: 2020 1 2020-M01 2020-01-01 5
# Smart assignment of geo columns: setting location_code to "norge" on row 1
# also updates granularity_geo (county -> nation in the output below).
d[1, c("location_code") := .("norge")]
d
#> granularity_time granularity_geo country_iso3 location_code border age sex
#> 1: isoyearweek nation nor norge NA <NA> <NA>
#> 2: isoyear county nor county_nor32 NA <NA> <NA>
#> 3: isoyearweek county nor county_nor33 NA <NA> <NA>
#> 4: date county nor county_nor56 NA <NA> <NA>
#> 5: date county nor county_nor34 NA <NA> <NA>
#> isoyear isoweek isoyearweek isoquarter isoyearquarter season seasonweek
#> 1: 2021 1 2021-01 1 2021-Q1 2020/2021 24
#> 2: 2019 52 2019-52 1 2022-Q1 <NA> NA
#> 3: 2022 3 2022-03 1 2022-Q1 2021/2022 26
#> 4: 2020 1 2020-01 1 2020-Q1 2019/2020 24
#> 5: 2020 1 2020-01 1 2020-Q1 2019/2020 24
#> calyear calmonth calyearmonth date deaths_n
#> 1: NA NA <NA> 2021-01-10 9
#> 2: NA NA <NA> 2019-12-29 4
#> 3: NA NA <NA> 2022-01-23 3
#> 4: 2020 1 2020-M01 2020-01-01 4
#> 5: 2020 1 2020-M01 2020-01-01 5
# Collapse to one row per granularity_time (summing deaths_n, with location_code
# fixed to "norge"), then "heal" the result by re-applying the
# csfmt_rts_data_v2 format so it can be worked on further for real-time
# surveillance. With create_unified_columns = FALSE only `date` is added in the
# output below.
d[, .(deaths_n = sum(deaths_n), location_code = "norge"),
  keyby = .(granularity_time)] %>%
  cstidy::set_csfmt_rts_data_v2(create_unified_columns = FALSE) %>%
  print()
#> granularity_time deaths_n location_code date
#> 1: date 9 norge <NA>
#> 2: isoyear 4 norge <NA>
#> 3: isoyearweek 12 norge <NA>
# Same collapse, but here the csfmt_rts_data_v2 class is removed from the
# result, because it is going to be used in new output/analyses.
d[, .(deaths_n = sum(deaths_n), location_code = "norge"),
  keyby = .(granularity_time)] %>%
  cstidy::remove_class_csfmt_rts_data() %>%
  print()
#> granularity_time deaths_n location_code
#> 1: date 9 norge
#> 2: isoyear 4 norge
#> 3: isoyearweek 12 norge
Summary
We need a way to easily summarize the data structure of a dataset; `summary()` on a `csfmt_rts_data_v2` object provides this.
# Print a per-column error check followed by a per-column structure overview
# for the freshly formatted test dataset.
cstidy::generate_test_data() %>%
  cstidy::set_csfmt_rts_data_v2() %>%
  summary()
#>
#> granularity_time
#> ✅ No errors
#>
#> granularity_geo
#> ✅ No errors
#>
#> country_iso3
#> ✅ No errors
#>
#> location_code
#> ✅ No errors
#>
#> border
#> ❌ Errors:
#> - NA exists (not allowed)
#>
#> age
#> ✅ No errors
#>
#> sex
#> ✅ No errors
#>
#> isoyear
#> ✅ No errors
#>
#> isoweek
#> ✅ No errors
#>
#> isoyearweek
#> ✅ No errors
#>
#> isoquarter
#> ✅ No errors
#>
#> isoyearquarter
#> ✅ No errors
#>
#> season
#> ✅ No errors
#>
#> seasonweek
#> ✅ No errors
#>
#> calyear
#> ✅ No errors
#>
#> calmonth
#> ✅ No errors
#>
#> calyearmonth
#> ✅ No errors
#>
#> date
#> ✅ No errors
#> granularity_time (character):
#> - isoyearweek (n = 45)
#> granularity_geo (character):
#> - county (n = 45)
#> country_iso3 (character):
#> - nor (n = 45)
#> location_code (character)
#> border (integer):
#> - <NA> (n = 45)
#> age (character):
#> - <NA> (n = 15)
#> - 000_005 (n = 15)
#> - total (n = 15)
#> sex (character):
#> - <NA> (n = 15)
#> - total (n = 30)
#> isoyear (integer):
#> - 2022 (n = 45)
#> isoweek (integer)
#> isoyearweek (character)
#> isoquarter (integer)
#> isoyearquarter (character)
#> season (character):
#> - 2021/2022 (n = 45)
#> seasonweek (numeric)
#> calyear (integer)
#> calmonth (integer)
#> calyearmonth (character)
#> date (Date)
#> deaths_n (integer)
Identifying data structure of one column
We need a way to easily summarize the data structure of one column inside a dataset; `identify_data_structure()` followed by `plot()` provides this.
# Identify the data structure of the single column deaths_n in the formatted
# test dataset and plot the result.
cstidy::generate_test_data() %>%
  cstidy::set_csfmt_rts_data_v2() %>%
  cstidy::identify_data_structure("deaths_n") %>%
  plot()