# Checking if the packages you need are installed -- if not, this will install them for you
packages <- c("tidyverse", "usethis", "devtools",
              "remotes", "lubridate", "knitr",
              "janitor", "geofacet", "ggrepel",
              "png", "gifski", "sf", "tigris",
              "cowplot", "biscale", "ggtext")

if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())),
                   repos = "https://cran.us.r-project.org")
}

devtools::install_github('wpinvestigative/arcos')

usethis::use_course("https://github.com/r-journalism/nicar-2023-fancier-viz/archive/master.zip")
Visualizing relationships five different ways
One data set, five visualizations (technically two data sets)
Before you do anything at all…
Update your software
Make sure you have the latest versions of R and RStudio installed.
A lot of these new tools we’re using depend on the latest builds of R and RStudio to work correctly.
No, seriously, do it.
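If you’re not sure which versions you’re running, you can check from the R console. This is just a quick sketch; the second line assumes you’re working inside RStudio and have the rstudioapi package installed.

# Check which versions you're currently running
R.version.string          # your R version
rstudioapi::versionInfo() # your RStudio version (only works inside RStudio)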
Install packages
Copy and paste the lines of code at the top of this page into your RStudio console.
The last line of code you ran will walk you through installing the class materials to your desktop. Select yes at the right points.
When you’re done downloading the files, you can move on to geofacet.
Later on, you can follow the steps below to see how the data used in these walkthroughs was downloaded and transformed.
Data for the walkthroughs
The opioid transaction data from the DEA and opioid deaths data from the CDC needed for these walkthroughs have already been downloaded and transformed. The steps below exist for transparency.
Preparing state data
Get annual opioid sales data from the ARCOS API
The Washington Post published a significant portion of a database that tracks the path of every opioid pain pill, from manufacturer to pharmacy, in the United States between 2006 and 2014. The data below is pulled from an API that the Post created to make it easier to query the data.
We’ll pull opioid sales data by year.
library(tidyverse)
library(knitr)
library(janitor)

devtools::install_github('wpinvestigative/arcos')
library(arcos)

states <- combined_buyer_annual(key="WaPo") %>%
  clean_names()

states %>%
  head() %>%
  kable()
buyer_dea_no | buyer_bus_act | buyer_county | buyer_state | year | dosage_unit |
---|---|---|---|---|---|
A90777889 | RETAIL PHARMACY | KINGS | NY | 2006 | 110700 |
A90777889 | RETAIL PHARMACY | KINGS | NY | 2007 | 119100 |
A90777889 | RETAIL PHARMACY | KINGS | NY | 2008 | 100300 |
A90777889 | RETAIL PHARMACY | KINGS | NY | 2009 | 92630 |
A90777889 | RETAIL PHARMACY | KINGS | NY | 2010 | 65860 |
A90777889 | RETAIL PHARMACY | KINGS | NY | 2011 | 72030 |
Transform the data
Next, we’ll summarize the data by state and year.
state_names <- c(state.name, "District of Columbia")
state_abbr <- c(state.abb, "DC")

state_names <- data.frame(state=state_names, buyer_state=state_abbr)

annual_states <- states %>%
  group_by(buyer_state, year) %>%
  summarize(pills=sum(dosage_unit)) %>%
  left_join(state_names) %>%
  filter(!is.na(state))

annual_states %>%
  head() %>%
  kable()
buyer_state | year | pills | state |
---|---|---|---|
AK | 2006 | 15667010 | Alaska |
AK | 2007 | 17272433 | Alaska |
AK | 2008 | 18707811 | Alaska |
AK | 2009 | 20160949 | Alaska |
AK | 2010 | 21012703 | Alaska |
AK | 2011 | 22444775 | Alaska |
Get annual opioid overdose data from the CDC
NCHS data is compiled in the CDC WONDER online tool. To retrieve these data from the tool:
- Select the Multiple Cause of Death (Detailed Mortality) query system
- Select the table layout (for example, by year, state, county)
- Supply the appropriate underlying cause codes (X and Y codes) and contributing cause codes (T codes):
- X40 - X44
- X60 - X64
- X85
- Y10 - Y14
- T40.0 - T40.4, T40.6
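Those filters are applied inside the WONDER web tool, not in R. If you want to keep a record of the query alongside your code, one optional approach (not part of the original walkthrough) is to write the codes out as vectors:

# ICD-10 codes used in the CDC WONDER query, recorded here for documentation only
underlying_cause_codes <- c(paste0("X", 40:44),  # accidental poisoning
                            paste0("X", 60:64),  # intentional self-poisoning
                            "X85",               # assault by drugs
                            paste0("Y", 10:14))  # undetermined intent
contributing_codes <- paste0("T40.", c(0:4, 6))  # opioid-related contributing causes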
The data imported below was exported from the CDC WONDER data portal.
<- read_tsv("data/od_deaths_state.csv") %>%
state_cdc filter(is.na(Notes)) %>%
clean_names()
%>%
state_cdc head() %>%
kable()
notes | state | state_code | year | year_code | deaths | population | crude_rate |
---|---|---|---|---|---|---|---|
NA | Alabama | 01 | 2006 | 2006 | 124 | 4628981 | 2.7 |
NA | Alabama | 01 | 2007 | 2007 | 165 | 4672840 | 3.5 |
NA | Alabama | 01 | 2008 | 2008 | 185 | 4718206 | 3.9 |
NA | Alabama | 01 | 2009 | 2009 | 206 | 4757938 | 4.3 |
NA | Alabama | 01 | 2010 | 2010 | 187 | 4779736 | 3.9 |
NA | Alabama | 01 | 2011 | 2011 | 176 | 4802740 | 3.7 |
Join the state data
Bringing together the sales data with the deaths data.
state_joined <- annual_states %>%
  left_join(state_cdc) %>%
  select(-notes, -state_code, -year_code)
Joining with `by = join_by(year, state)`
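That message is dplyr reporting which columns it matched on. If you’d rather be explicit about the join keys (which also quiets the message), the same join could be written like this (a sketch, not a change to the walkthrough’s output):

# Same join, with the keys spelled out explicitly
state_joined <- annual_states %>%
  left_join(state_cdc, by = c("year", "state")) %>%
  select(-notes, -state_code, -year_code)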
state_joined %>%
  head() %>%
  kable()
buyer_state | year | pills | state | deaths | population | crude_rate |
---|---|---|---|---|---|---|
AK | 2006 | 15667010 | Alaska | 29 | 675302 | 4.3 |
AK | 2007 | 17272433 | Alaska | 15 | 680300 | Unreliable |
AK | 2008 | 18707811 | Alaska | 88 | 687455 | 12.8 |
AK | 2009 | 20160949 | Alaska | 90 | 698895 | 12.9 |
AK | 2010 | 21012703 | Alaska | 62 | 710231 | 8.7 |
AK | 2011 | 22444775 | Alaska | 66 | 722718 | 9.1 |
Wrangle the data
Let’s calculate rates.
Additionally, let’s add in state categories and divisions.
Then, we’ll save it.
state_joined <- state_joined %>%
  mutate(pills_per_person=round(pills/population,1)) %>%
  mutate(death_per_1m=round(deaths/population*1000000,1)) %>%
  # rename(death_rate=crude_rate) %>%
  pivot_longer(cols=c("death_per_1m", "pills_per_person"),
               names_to="type",
               values_to="rate")
regions <- read_csv("https://github.com/cphalpert/census-regions/raw/master/us%20census%20bureau%20regions%20and%20divisions.csv") %>%
  clean_names()
Rows: 51 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): State, State Code, Region, Division
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
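As that message suggests, you can quiet the column-specification printout by passing show_col_types = FALSE (or by declaring the column types yourself). For example:

# Same read as above, without the column specification message
regions <- read_csv("https://github.com/cphalpert/census-regions/raw/master/us%20census%20bureau%20regions%20and%20divisions.csv",
                    show_col_types = FALSE) %>%
  clean_names()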
state_joined <- left_join(state_joined, regions)
Joining with `by = join_by(state)`
write_csv(state_joined, "data/opioids_states.csv", na="")
County data annual
Let’s do the same thing, but with county-level data.
Get annual opioid sales data from the ARCOS API
Once again, we’ll pull from the Post’s ARCOS API.
pharm_c <- summarized_county_annual(key="WaPo") %>%
  clean_names()

pharm_c %>%
  head() %>%
  kable()
buyer_county | buyer_state | year | count | dosage_unit | countyfips |
---|---|---|---|---|---|
ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
ABBEVILLE | SC | 2011 | 1327 | 566560 | 45001 |
Transform the data
Get annual opioid overdose data from the CDC
We followed the same steps as we did for the state data, but for counties.
<- read_tsv("data/od_deaths_county.csv") %>%
county_cdc filter(is.na(Notes)) %>%
clean_names() %>%
rename(countyfips=county_code)
%>%
county_cdc head() %>%
kable()
notes | state | state_code | county | countyfips | year | year_code | deaths | population | crude_rate | age_adjusted_rate |
---|---|---|---|---|---|---|---|---|---|---|
NA | Alabama | 01 | Baldwin County, AL | 01003 | 2006 | 2006 | 12 | 168121 | Unreliable | Unreliable |
NA | Alabama | 01 | Baldwin County, AL | 01003 | 2007 | 2007 | 14 | 172404 | Unreliable | Unreliable |
NA | Alabama | 01 | Baldwin County, AL | 01003 | 2008 | 2008 | 14 | 175827 | Unreliable | Unreliable |
NA | Alabama | 01 | Baldwin County, AL | 01003 | 2009 | 2009 | 10 | 179406 | Unreliable | Unreliable |
NA | Alabama | 01 | Baldwin County, AL | 01003 | 2010 | 2010 | 10 | 182265 | Unreliable | Unreliable |
NA | Alabama | 01 | Baldwin County, AL | 01003 | 2011 | 2011 | 11 | 186717 | Unreliable | Unreliable |
Join the county data
county_joined <- pharm_c %>%
  full_join(county_cdc) %>%
  select(-notes, -state_code, -year_code, -age_adjusted_rate)
Joining with `by = join_by(year, countyfips)`
county_joined %>%
  head() %>%
  kable()
buyer_county | buyer_state | year | count | dosage_unit | countyfips | state | county | deaths | population | crude_rate |
---|---|---|---|---|---|---|---|---|---|---|
ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 | NA | NA | NA | NA | NA |
ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 | NA | NA | NA | NA | NA |
ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 | NA | NA | NA | NA | NA |
ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 | NA | NA | NA | NA | NA |
ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 | NA | NA | NA | NA | NA |
ABBEVILLE | SC | 2011 | 1327 | 566560 | 45001 | NA | NA | NA | NA | NA |
Wrangle the data
Calculating rates and saving the data.
# first have to bring in accurate population counts for counties annually
county_pops <- county_population(key = "WaPo") %>%
  select(countyfips, year, population)

county_joined <- county_joined %>%
  select(-population) %>%
  left_join(county_pops) %>%
  mutate(pills_per_person=round(dosage_unit/population,1)) %>%
  mutate(death_per_100k=round(deaths/population*100000,1)) %>%
  # rename(death_rate=crude_rate) %>%
  pivot_longer(cols=c("death_per_100k", "pills_per_person"),
               names_to="type",
               values_to="rate")
Joining with `by = join_by(year, countyfips)`
write_csv(county_joined, "data/opioids_counties.csv", na="")
County data combined
Let’s do it one more time but for all deaths between 2006 and 2014 combined.
Get annual opioid sales data from the ARCOS API
This time we’ll summarize it.
pharm_c <- summarized_county_annual(key="WaPo") %>%
  clean_names()

pharm_c_combined <- pharm_c %>%
  group_by(buyer_county, buyer_state, countyfips) %>%
  summarize(count=sum(count, na.rm=T),
            dosage_unit=sum(dosage_unit, na.rm=T))
`summarise()` has grouped output by 'buyer_county', 'buyer_state'. You can
override using the `.groups` argument.
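That message is just dplyr explaining how the result stays grouped. If you’d rather return an ungrouped result (and silence the note), you can say so explicitly with the .groups argument; a sketch of the same summary:

# Same summary, with the grouping dropped explicitly
pharm_c_combined <- pharm_c %>%
  group_by(buyer_county, buyer_state, countyfips) %>%
  summarize(count=sum(count, na.rm=T),
            dosage_unit=sum(dosage_unit, na.rm=T),
            .groups="drop")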
pharm_c_combined %>%
  head() %>%
  kable()
buyer_county | buyer_state | countyfips | count | dosage_unit |
---|---|---|---|---|
ABBEVILLE | SC | 45001 | 10749 | 4591000 |
ACADIA | LA | 22001 | 55952 | 22197708 |
ACCOMACK | VA | 51001 | 19536 | 6660470 |
ADA | ID | 16001 | 290484 | 136806828 |
ADAIR | IA | 19001 | 5752 | 1783165 |
ADAIR | KY | 21001 | 21985 | 8783100 |
Get annual opioid overdose data from the CDC
<- read_tsv("data/od_deaths_counties_combined.csv") %>%
county_cdc filter(is.na(Notes)) %>%
clean_names() %>%
rename(countyfips=county_code)
%>%
county_cdc head() %>%
kable()
notes | state | state_code | county | countyfips | deaths | population | crude_rate |
---|---|---|---|---|---|---|---|
NA | Alabama | 01 | Baldwin County, AL | 01003 | 105 | 1651181 | 6.4 |
NA | Alabama | 01 | Bibb County, AL | 01007 | 11 | 203479 | Unreliable |
NA | Alabama | 01 | Blount County, AL | 01009 | 48 | 514537 | 9.3 |
NA | Alabama | 01 | Calhoun County, AL | 01015 | 10 | 1053553 | Unreliable |
NA | Alabama | 01 | Chilton County, AL | 01021 | 10 | 391161 | Unreliable |
NA | Alabama | 01 | Cullman County, AL | 01043 | 13 | 722726 | Unreliable |
Join the county data
county_joined <- pharm_c_combined %>%
  full_join(county_cdc) %>%
  select(-notes, -state_code, -crude_rate)
Joining with `by = join_by(countyfips)`
county_joined %>%
  head() %>%
  kable()
buyer_county | buyer_state | countyfips | count | dosage_unit | state | county | deaths | population |
---|---|---|---|---|---|---|---|---|
ABBEVILLE | SC | 45001 | 10749 | 4591000 | NA | NA | NA | NA |
ACADIA | LA | 22001 | 55952 | 22197708 | Louisiana | Acadia Parish, LA | 39 | 554207 |
ACCOMACK | VA | 51001 | 19536 | 6660470 | Virginia | Accomack County, VA | 22 | 303140 |
ADA | ID | 16001 | 290484 | 136806828 | Idaho | Ada County, ID | 269 | 3555029 |
ADAIR | IA | 19001 | 5752 | 1783165 | NA | NA | NA | NA |
ADAIR | KY | 21001 | 21985 | 8783100 | Kentucky | Adair County, KY | 14 | 168051 |
Wrangle the data
Calculating rates.
# use the most recent year of county population data (2014) as the denominator
county_pops <- county_pops %>%
  filter(year==2014) %>%
  select(countyfips, year, population)

county_joined <- county_joined %>%
  select(-population) %>%
  left_join(county_pops) %>%
  mutate(pills_per_person=round(dosage_unit/population,1)) %>%
  mutate(death_per_100k=round(deaths/population*100000,1)) %>%
  # rename(death_rate=crude_rate) %>%
  pivot_longer(cols=c("death_per_100k", "pills_per_person"),
               names_to="type",
               values_to="rate")
Joining with `by = join_by(countyfips)`
write_csv(county_joined, "data/opioids_counties_combined.csv", na="")
Next, we’ll move on to the first steps of visualizing these data sets with geofacet.
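As a preview of where this is headed (a rough sketch rather than the next section’s actual code), a first geofacet chart of the state file saved above might look something like this; the column names match data/opioids_states.csv as built in this walkthrough.

# A sketch of a geofacet small-multiples chart using the saved state data
library(tidyverse)
library(geofacet)

opioids_states <- read_csv("data/opioids_states.csv")

opioids_states %>%
  filter(type == "pills_per_person") %>%
  ggplot(aes(x = year, y = rate)) +
  geom_line() +
  facet_geo(~ state, grid = "us_state_grid2") +
  labs(title = "Opioid pills per person, 2006-2014",
       x = "", y = "Pills per person")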