# Checking if the packages you need are installed -- if not, it will install for you
packages <- c("tidyverse", "usethis", "devtools",
"remotes", "lubridate", "knitr",
"janitor", "geofacet", "ggrepel",
"png", "gifski", "sf", "tigris",
"cowplot", "biscale", "ggtext")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())),
repos = "https://cran.us.r-project.org")
}
devtools::install_github('wpinvestigative/arcos')
usethis::use_course("https://github.com/r-journalism/nicar-2023-fancier-viz/archive/master.zip")Visualizing relationships five different ways
One data set, Five visualizations (technically two data sets)
Before you do anything at all…
Update your software
Make sure you have the latest versions of R and RStudio installed.
A lot of these new tools we’re using depend on the latest builds of R and RStudio to work correctly.
No, seriously, do it.
Install packages
Copy and paste the following lines of code into your RStudio console.
The last line of code you ran will walk you through installing the class materials to your desktop. Select yes at the right points.
When you’re done downloading the files, you can move on to geofacet.
Later on you can follow these steps on how to download and transform the data used in these walkthroughs below.
Data for the walkthroughs
The opioid transaction data from the DEA and opioid deaths data from the CDC needed for these walkthroughs have already been downloaded and transformed. The steps below exist for transparency.
Preparing state data
Get annual opioid sales data from ARCOS api
The Washington Post published a significant portion of a database that tracks the path of every opioid pain pill, from manufacturer to pharmacy, in the United States between 2006 and 2014. The data below is pulled from an API that the Post created to make it easier to query the data.
We’ll pull opioid sales data by year.
library(tidyverse)
library(knitr)
library(janitor)
devtools::install_github('wpinvestigative/arcos')
library(arcos)
states <- combined_buyer_annual(key="WaPo") %>%
clean_names()
states %>%
head() %>%
kable()| buyer_dea_no | buyer_bus_act | buyer_county | buyer_state | year | dosage_unit |
|---|---|---|---|---|---|
| A90777889 | RETAIL PHARMACY | KINGS | NY | 2006 | 110700 |
| A90777889 | RETAIL PHARMACY | KINGS | NY | 2007 | 119100 |
| A90777889 | RETAIL PHARMACY | KINGS | NY | 2008 | 100300 |
| A90777889 | RETAIL PHARMACY | KINGS | NY | 2009 | 92630 |
| A90777889 | RETAIL PHARMACY | KINGS | NY | 2010 | 65860 |
| A90777889 | RETAIL PHARMACY | KINGS | NY | 2011 | 72030 |
Transform the data
Next, we’ll summarize the data by state and year.
state_names <- c(state.name, "District of Columbia")
state_abbr <- c(state.abb, "DC")
state_names <- data.frame(state=state_names, buyer_state=state_abbr)
annual_states <- states %>%
group_by(buyer_state, year) %>%
summarize(pills=sum(dosage_unit)) %>%
left_join(state_names) %>%
filter(!is.na(state))
annual_states %>%
head() %>%
kable()| buyer_state | year | pills | state |
|---|---|---|---|
| AK | 2006 | 15667010 | Alaska |
| AK | 2007 | 17272433 | Alaska |
| AK | 2008 | 18707811 | Alaska |
| AK | 2009 | 20160949 | Alaska |
| AK | 2010 | 21012703 | Alaska |
| AK | 2011 | 22444775 | Alaska |
Get annual opioid overdose data from the CDC
NCHS data is compiled in the CDC WONDER online tool. To retrieve these data from the tool:
• Select the Multiple Cause of Death (Detailed Mortality) query system; • Select table layout (for example, by year, state, county); and • Supply the appropriate underlying codes (X and Y code) and contributing codes (T codes)
- X40 - X44
- X60 - X64
- X85
- Y10 - Y14
- T40.0 - T40.4, T40.6
The data imported below was exported from the CDC Wonder data portal.
state_cdc <- read_tsv("data/od_deaths_state.csv") %>%
filter(is.na(Notes)) %>%
clean_names()
state_cdc %>%
head() %>%
kable()| notes | state | state_code | year | year_code | deaths | population | crude_rate |
|---|---|---|---|---|---|---|---|
| NA | Alabama | 01 | 2006 | 2006 | 124 | 4628981 | 2.7 |
| NA | Alabama | 01 | 2007 | 2007 | 165 | 4672840 | 3.5 |
| NA | Alabama | 01 | 2008 | 2008 | 185 | 4718206 | 3.9 |
| NA | Alabama | 01 | 2009 | 2009 | 206 | 4757938 | 4.3 |
| NA | Alabama | 01 | 2010 | 2010 | 187 | 4779736 | 3.9 |
| NA | Alabama | 01 | 2011 | 2011 | 176 | 4802740 | 3.7 |
Join the state data
Bringing together the sales data with the deaths data.
state_joined <- annual_states %>%
left_join(state_cdc) %>%
select(-notes, -state_code, -year_code)Joining with `by = join_by(year, state)`
state_joined %>%
head() %>%
kable()| buyer_state | year | pills | state | deaths | population | crude_rate |
|---|---|---|---|---|---|---|
| AK | 2006 | 15667010 | Alaska | 29 | 675302 | 4.3 |
| AK | 2007 | 17272433 | Alaska | 15 | 680300 | Unreliable |
| AK | 2008 | 18707811 | Alaska | 88 | 687455 | 12.8 |
| AK | 2009 | 20160949 | Alaska | 90 | 698895 | 12.9 |
| AK | 2010 | 21012703 | Alaska | 62 | 710231 | 8.7 |
| AK | 2011 | 22444775 | Alaska | 66 | 722718 | 9.1 |
Wrangle the data
Let’s calculate rates.
Additionally, let’s add in state categories and divisions.
Then, we’ll save it.
state_joined <- state_joined %>%
mutate(pills_per_person=round(pills/population,1)) %>%
mutate(death_per_1m=round(deaths/population*1000000,1)) %>%
# rename(death_rate=crude_rate) %>%
pivot_longer(cols=c("death_per_1m", "pills_per_person"),
names_to="type",
values_to="rate")
regions <- read_csv("https://github.com/cphalpert/census-regions/raw/master/us%20census%20bureau%20regions%20and%20divisions.csv") %>%
clean_names()Rows: 51 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): State, State Code, Region, Division
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
state_joined <- left_join(state_joined, regions)Joining with `by = join_by(state)`
write_csv(state_joined, "data/opioids_states.csv", na="")County data annual
Let’s do the same thing but with county levle data.
Get annual opioid sales data from ARCOS api
Once again, we’ll pull from the Post ARCOS api.
pharm_c <- summarized_county_annual(key="WaPo") %>%
clean_names()
pharm_c %>%
head() %>%
kable()| buyer_county | buyer_state | year | count | dosage_unit | countyfips |
|---|---|---|---|---|---|
| ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
| ABBEVILLE | SC | 2011 | 1327 | 566560 | 45001 |
Transform the data
Get annual opioid overdose data from the CDC
Followed the same steps as we did for state data but for counties.
county_cdc <- read_tsv("data/od_deaths_county.csv") %>%
filter(is.na(Notes)) %>%
clean_names() %>%
rename(countyfips=county_code)
county_cdc %>%
head() %>%
kable()| notes | state | state_code | county | countyfips | year | year_code | deaths | population | crude_rate | age_adjusted_rate |
|---|---|---|---|---|---|---|---|---|---|---|
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 2006 | 2006 | 12 | 168121 | Unreliable | Unreliable |
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 2007 | 2007 | 14 | 172404 | Unreliable | Unreliable |
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 2008 | 2008 | 14 | 175827 | Unreliable | Unreliable |
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 2009 | 2009 | 10 | 179406 | Unreliable | Unreliable |
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 2010 | 2010 | 10 | 182265 | Unreliable | Unreliable |
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 2011 | 2011 | 11 | 186717 | Unreliable | Unreliable |
Join the county data
county_joined <- pharm_c %>%
full_join(county_cdc) %>%
select(-notes, -state_code, -year_code, -age_adjusted_rate)Joining with `by = join_by(year, countyfips)`
county_joined %>%
head() %>%
kable()| buyer_county | buyer_state | year | count | dosage_unit | countyfips | state | county | deaths | population | crude_rate |
|---|---|---|---|---|---|---|---|---|---|---|
| ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 | NA | NA | NA | NA | NA |
| ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 | NA | NA | NA | NA | NA |
| ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 | NA | NA | NA | NA | NA |
| ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 | NA | NA | NA | NA | NA |
| ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 | NA | NA | NA | NA | NA |
| ABBEVILLE | SC | 2011 | 1327 | 566560 | 45001 | NA | NA | NA | NA | NA |
Wrangle the data
Calculating rates and saving the data.
# first have to bring in accurate population counts for counties annually
county_pops <- county_population(key = "WaPo") %>%
select(countyfips, year, population)
county_joined <- county_joined %>%
select(-population) %>%
left_join(county_pops) %>%
mutate(pills_per_person=round(dosage_unit/population,1)) %>%
mutate(death_per_100k=round(deaths/population*100000,1)) %>%
# rename(death_rate=crude_rate) %>%
pivot_longer(cols=c("death_per_100k", "pills_per_person"),
names_to="type",
values_to="rate")Joining with `by = join_by(year, countyfips)`
write_csv(county_joined, "data/opioids_counties.csv", na="")County data combined
Let’s do it one more time but for all deaths between 2006 and 2014 combined.
Get annual opioid sales data from ARCOS api
This time we’ll summarize it.
pharm_c <- summarized_county_annual(key="WaPo") %>%
clean_names()
pharm_c_combined <- pharm_c %>%
group_by(buyer_county, buyer_state, countyfips) %>%
summarize(count=sum(count, na.rm=T),
dosage_unit=sum(dosage_unit, na.rm=T))`summarise()` has grouped output by 'buyer_county', 'buyer_state'. You can
override using the `.groups` argument.
pharm_c_combined %>%
head() %>%
kable()| buyer_county | buyer_state | countyfips | count | dosage_unit |
|---|---|---|---|---|
| ABBEVILLE | SC | 45001 | 10749 | 4591000 |
| ACADIA | LA | 22001 | 55952 | 22197708 |
| ACCOMACK | VA | 51001 | 19536 | 6660470 |
| ADA | ID | 16001 | 290484 | 136806828 |
| ADAIR | IA | 19001 | 5752 | 1783165 |
| ADAIR | KY | 21001 | 21985 | 8783100 |
Get annual opioid overdose data from the CDC
county_cdc <- read_tsv("data/od_deaths_counties_combined.csv") %>%
filter(is.na(Notes)) %>%
clean_names() %>%
rename(countyfips=county_code)
county_cdc %>%
head() %>%
kable()| notes | state | state_code | county | countyfips | deaths | population | crude_rate |
|---|---|---|---|---|---|---|---|
| NA | Alabama | 01 | Baldwin County, AL | 01003 | 105 | 1651181 | 6.4 |
| NA | Alabama | 01 | Bibb County, AL | 01007 | 11 | 203479 | Unreliable |
| NA | Alabama | 01 | Blount County, AL | 01009 | 48 | 514537 | 9.3 |
| NA | Alabama | 01 | Calhoun County, AL | 01015 | 10 | 1053553 | Unreliable |
| NA | Alabama | 01 | Chilton County, AL | 01021 | 10 | 391161 | Unreliable |
| NA | Alabama | 01 | Cullman County, AL | 01043 | 13 | 722726 | Unreliable |
Join the county data
county_joined <- pharm_c_combined %>%
full_join(county_cdc) %>%
select(-notes, -state_code, -crude_rate)Joining with `by = join_by(countyfips)`
county_joined %>%
head() %>%
kable()| buyer_county | buyer_state | countyfips | count | dosage_unit | state | county | deaths | population |
|---|---|---|---|---|---|---|---|---|
| ABBEVILLE | SC | 45001 | 10749 | 4591000 | NA | NA | NA | NA |
| ACADIA | LA | 22001 | 55952 | 22197708 | Louisiana | Acadia Parish, LA | 39 | 554207 |
| ACCOMACK | VA | 51001 | 19536 | 6660470 | Virginia | Accomack County, VA | 22 | 303140 |
| ADA | ID | 16001 | 290484 | 136806828 | Idaho | Ada County, ID | 269 | 3555029 |
| ADAIR | IA | 19001 | 5752 | 1783165 | NA | NA | NA | NA |
| ADAIR | KY | 21001 | 21985 | 8783100 | Kentucky | Adair County, KY | 14 | 168051 |
Wrangle the data
Calculating rates.
county_pops <- county_pops %>%
filter(year==2014) %>%
select(countyfips, year, population)
county_joined <- county_joined %>%
select(-population) %>%
left_join(county_pops) %>%
mutate(pills_per_person=round(dosage_unit/population,1)) %>%
mutate(death_per_100k=round(deaths/population*100000,1)) %>%
# rename(death_rate=crude_rate) %>%
pivot_longer(cols=c("death_per_100k", "pills_per_person"),
names_to="type",
values_to="rate")Joining with `by = join_by(countyfips)`
write_csv(county_joined, "data/opioids_counties_combined.csv", na="")Next, we’ll move on to first steps in visualizing these data sets with geofacet.