This is the exploratory visualization behind the Trend CT story: Who in CT is being cited for marijuana possession and by whom?
Visit the repo for the data used in this analysis. (Also, check out the reproducible scripts and data behind many of our other stories in our central data stories repo.)
The data used in this analysis (marijuana citations between 2011 and 2014) is from the Connecticut State Police via a FOIA request made by Evan Anderson via Muckrock.com.
What’s in this walkthrough
Exploratory analysis and visualizations of marijuana citations.
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
library(stringr)
library(extrafont)
library(ggalt)
library(scales)
library(gridExtra)
library(grid)
library(knitr)
#install.packages("devtools")
#devtools::install_github("trendct/ctnamecleaner")
library(ctnamecleaner)
#devtools::install_github("hrecht/censusapi")
library("censusapi")
source("keys.R")
# Load the raw citation export and give its six columns short names.
mj <- read.csv(
  "data/15-218_Marijuana_Arrests_by_Agency_2011-2013_NIBRS.csv",
  stringsAsFactors = FALSE
)
names(mj) <- c("date", "description", "gender", "race", "ethnicity", "age")

# Drop rows whose first field is empty or contains any of the report
# boilerplate fragments; everything else is kept for the next steps.
mj2 <- mj %>%
  filter(
    date != "",
    !grepl("Unit of|NSS |Report |Arrests |IncidentDate", date)
  )
# Attach a department name to every row.
# Any row whose `date` field does not contain "/" is treated as a department
# header, and its text is carried forward to the rows that follow until the
# next header. Rows that precede the first header get the placeholder "temp".
# Vectorized (cumsum over the header flags) so it also works on zero rows and
# avoids a row-by-row loop over the data frame.
is_header <- !grepl("/", mj2$date)
department_labels <- c("temp", mj2$date[is_header])
mj2$department <- department_labels[cumsum(is_header) + 1L]
# Kept so the script leaves the same variable behind as the original loop did:
# the last department seen (or "temp" when there was none).
department_name <- department_labels[length(department_labels)]
# Keep only rows with a recorded gender of M or F, then parse the
# month/day/year date string and derive year and month columns from it.
mj2 <- mj2 %>%
  filter(gender %in% c("M", "F")) %>%
  mutate(
    date = mdy(date),
    year = year(date),
    month = month(date)
  )
# Number of citations per gender, rendered as a markdown table.
mj_sex <- mj2 %>%
  group_by(gender) %>%
  tally() %>%
  rename(citations = n)
kable(mj_sex)
| gender | citations |
|---|---|
| F | 1674 |
| M | 10061 |
# Histogram of citations by age, one bar per year of age, shaded by bar height.
# as.numeric() turns any non-numeric age entry into NA.
mj2$age <- as.numeric(mj2$age)
# Map the column by name: `aes(mj2$age)` bypasses the `data` argument and
# labels the axis "mj2$age".
ggplot(mj2, aes(x = age)) +
  geom_histogram(binwidth = 1, aes(fill = ..count..)) +
  ggtitle("Marijuana citations by age in Connecticut")
# Citation counts per department, largest first; show the top ten.
mj_most <- mj2 %>%
  group_by(department) %>%
  tally() %>%
  rename(arrests = n) %>%
  arrange(desc(arrests))
kable(head(mj_most, n = 10))
| department | arrests |
|---|---|
| Connecticut State Police | 1480 |
| New Haven Police Department | 1451 |
| Stamford Police Department | 770 |
| Norwalk Police Department | 761 |
| West Hartford Police Department | 401 |
| New London Police Department | 384 |
| Middletown Police Department | 356 |
| East Hartford Police Department | 328 |
| Southington Police Department | 289 |
| Glastonbury Police Department | 282 |
# Derive a bare town name by stripping the police-department suffix.
# fixed = TRUE matches the suffixes literally; as a regex, the "." in
# " Police Dept." would match any character, not just a period.
mj_most$town <- gsub(" Police Department", "", mj_most$department, fixed = TRUE)
mj_most$town <- gsub(" Police Dept.", "", mj_most$town, fixed = TRUE)
# ctpopulator() comes from the ctnamecleaner package; the code further down
# reads a `pop2013` column from its result.
# NOTE(review): presumably it joins town populations by name — confirm.
mj_most2 <- ctpopulator(town, mj_most)
## [1] "Checking to see if names match..."
# Split departments by whether a 2013 population figure is available.
non_mj_most <- mj_most2[is.na(mj_most2$pop2013), ]
mj_most2_map <- mj_most2[!is.na(mj_most2$pop2013), ]

# Citations per 10,000 residents, with presentation-friendly column names,
# title-cased town names, and the highest rate first.
mj_most2_map <- mj_most2_map %>%
  mutate(per_capita = (arrests / pop2013) * 10000) %>%
  select(town, per_capita, arrests) %>%
  rename(
    Town = town,
    `Per capita arrests` = per_capita,
    `Total arrests` = arrests
  ) %>%
  mutate(Town = str_to_title(Town)) %>%
  arrange(desc(`Per capita arrests`))
kable(head(mj_most2_map, n = 10))
| Town | Per capita arrests | Total arrests |
|---|---|---|
| Clinton | 179.01654 | 237 |
| New London | 139.19095 | 384 |
| Derby | 132.38844 | 170 |
| New Haven | 111.32594 | 1451 |
| Granby | 104.57285 | 118 |
| Norwalk | 87.97790 | 761 |
| Windsor | 84.87973 | 247 |
| Glastonbury | 81.61848 | 282 |
| Farmington | 81.00987 | 206 |
| Old Saybrook | 79.93761 | 82 |
# Citation counts per department and year.
mj_arrests_years <- mj2 %>%
  group_by(department, year) %>%
  summarise(arrests = n())
# Derive a bare town name by stripping the police-department suffix.
# fixed = TRUE matches the suffixes literally; as a regex, the "." in
# " Police Dept." would match any character, not just a period.
mj_arrests_years$town <- gsub(" Police Department", "", mj_arrests_years$department, fixed = TRUE)
mj_arrests_years$town <- gsub(" Police Dept.", "", mj_arrests_years$town, fixed = TRUE)
# ctpopulator() comes from the ctnamecleaner package; the code further down
# reads a `pop2013` column from its result.
# NOTE(review): presumably it joins town populations by name — confirm.
mj_arrests_years <- ctpopulator(town, mj_arrests_years)
## [1] "Checking to see if names match..."
# Keep only rows with a population figure and add citations per 10,000 residents.
mj_arrests_years2 <- mj_arrests_years[!is.na(mj_arrests_years$pop2013), ]
mj_arrests_years2$per_capita <- with(
  mj_arrests_years2,
  arrests / pop2013 * 10000
)
# Small multiples: yearly citation totals, one panel per department.
gg <- ggplot(mj_arrests_years, aes(x = year, y = arrests)) +
  geom_col() +
  facet_wrap(~department, ncol = 3) +
  labs(
    x = NULL,
    y = NULL,
    title = "Total marijuana citations",
    subtitle = "Between 2011 and 2014.",
    caption = "SOURCE: National Incident-Based Reporting System, U.S. Census \nAndrew Ba Tran/TrendCT.org"
  ) +
  theme_bw(base_family = "Lato Regular") +
  theme(
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = "bold", family = "Lato Regular", size = 22),
    plot.caption = element_text(
      face = "bold", family = "Lato Regular", size = 9, color = "gray",
      margin = margin(t = 10, r = 80)
    ),
    legend.position = "none"
  )
gg
# Small multiples: yearly citations per 10,000 residents, one panel per
# department (only departments with a population figure).
gg <- ggplot(mj_arrests_years2, aes(x = year, y = per_capita)) +
  geom_col() +
  facet_wrap(~department, ncol = 3) +
  labs(
    x = NULL,
    y = NULL,
    title = "Per capita marijuana citations",
    subtitle = "Per 10,000 residents. Between 2011 and 2014.",
    caption = "SOURCE: National Incident-Based Reporting System, U.S. Census \nAndrew Ba Tran/TrendCT.org"
  ) +
  theme_bw(base_family = "Lato Regular") +
  theme(
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = "bold", family = "Lato Regular", size = 22),
    plot.caption = element_text(
      face = "bold", family = "Lato Regular", size = 9, color = "gray",
      margin = margin(t = 10, r = 80)
    ),
    legend.position = "none"
  )
gg
# One row per department with a column of citation counts for each year, plus
# the percent change from 2011 to 2013 (NA when either year is missing).
# Note: this reuses, and therefore overwrites, the name `mj_arrests_years`.
mj_arrests_years <- mj2 %>%
  group_by(department, year) %>%
  tally() %>%
  rename(arrests = n) %>%
  spread(year, arrests) %>%
  mutate(per_change = round((`2013` - `2011`) / `2011` * 100, 2)) %>%
  arrange(desc(per_change))
kable(head(mj_arrests_years, n = 10))
| department | 2011 | 2012 | 2013 | per_change |
|---|---|---|---|---|
| Thomaston Police Department | 2 | 2 | 7 | 250.00 |
| C.C.S.U. Police Department | 2 | 3 | 5 | 150.00 |
| Orange Police Department | 16 | 33 | 26 | 62.50 |
| Ansonia Police Department | 25 | 28 | 39 | 56.00 |
| Plainfield Police Department | 11 | NA | 16 | 45.45 |
| Norwalk Police Department | 221 | 225 | 315 | 42.53 |
| Granby Police Department | 29 | 48 | 41 | 41.38 |
| Clinton Police Department | 61 | 93 | 83 | 36.07 |
| East Hartford Police Department | 117 | 61 | 150 | 28.21 |
| New Milford Police Department | 22 | 21 | 28 | 27.27 |
# Build a single race/ethnicity label: ethnicity code "H" takes precedence and
# becomes "Hispanic"; otherwise the one-letter race code is expanded through
# the lookup below. Codes not in the lookup are left unchanged.
race_labels <- c(
  A = "Asian",
  B = "Black",
  W = "White",
  I = "Indian",
  U = "Unknown"
)
mj2 <- mj2 %>%
  mutate(
    race_ethnicity = ifelse(ethnicity == "H", "Hispanic", race),
    race_ethnicity = ifelse(
      race_ethnicity %in% names(race_labels),
      unname(race_labels[race_ethnicity]),
      race_ethnicity
    )
  )
# Citation counts per department and race/ethnicity label.
mj_arrests_race <- mj2 %>%
  group_by(department, race_ethnicity) %>%
  tally() %>%
  rename(arrests = n)

## chart
# Horizontal bars per race/ethnicity, one panel per department.
ggplot(mj_arrests_race, aes(x = race_ethnicity, y = arrests)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~department, ncol = 4, scales = "free_x")
# Census variables requested below:
# B02001_001E - Total
# B02001_002E - White
# B02001_003E - Black
# B02001_004E - Indian
# B02001_005E - Asian
# B03001_003E - Hispanic
race_towns <- getCensus(
  name = "acs5",
  vintage = 2014,
  key = census_key,
  vars = c(
    "NAME", "B02001_001E", "B02001_002E", "B02001_003E",
    "B02001_004E", "B02001_005E", "B03001_003E"
  ),
  region = "county subdivision:*",
  regionin = "state:09"
)

# Positional rename of the ten returned columns.
# NOTE(review): assumes getCensus() returns the columns in exactly this order — confirm.
names(race_towns) <- c(
  "town", "state", "county", "countysub", "total_pop",
  "White", "Black", "Indian", "Asian", "Hispanic"
)

# Keep the name and population columns, drop the "County subdivisions" rows,
# and cut each name off at " town".
race_towns <- race_towns[, c("town", "total_pop", "White", "Black", "Indian", "Asian", "Hispanic")]
race_towns <- race_towns[!grepl("County subdivisions", race_towns$town), ]
race_towns$town <- gsub(" town.*", "", race_towns$town)
# Reshape to one row per town and race/ethnicity group (the five columns
# White through Hispanic) and compute each group's share of the town total.
race_towns_long <- race_towns %>%
  gather("race_ethnicity", "population", White:Hispanic) %>%
  mutate(percent_population = round(population / total_pop * 100, 2))
# Each race/ethnicity group's share of its department's citations.
mj_arrests_race_spread <- mj_arrests_race %>%
  group_by(department) %>%
  mutate(
    total = sum(arrests, na.rm = TRUE),
    percent = round(arrests / total * 100, 2)
  )

# Derive a bare town name by stripping the police-department suffix.
# fixed = TRUE matches the suffixes literally; as a regex, the "." in
# " Police Dept." would match any character, not just a period.
mj_arrests_race_spread$town <- gsub(" Police Department", "", mj_arrests_race_spread$department, fixed = TRUE)
mj_arrests_race_spread$town <- gsub(" Police Dept.", "", mj_arrests_race_spread$town, fixed = TRUE)

# Attach the census population share for the same town and group. The join
# keys are spelled out so the join cannot silently change if either table
# gains another shared column name.
mj_arrests_race_spread <- left_join(
  mj_arrests_race_spread,
  race_towns_long,
  by = c("town", "race_ethnicity")
)

# Keep rows that found a census match, and leave out the "Indian" group.
mj_arrests_filtered <- subset(mj_arrests_race_spread, !is.na(total_pop))
mj_arrests_filtered <- filter(mj_arrests_filtered, race_ethnicity != "Indian")

# Just the two plotted measures, with grouping dropped and no race_ethnicity
# column, for use as a separate point layer.
mj_filtered <- mj_arrests_filtered %>%
  ungroup() %>%
  select(percent, percent_population)
# Scatter plot, one panel per race/ethnicity group: each point is a department,
# x = that group's share of the department's citations, y = the group's share
# of the town's population.
gg <- ggplot(mj_arrests_filtered, aes(percent, percent_population)) +
  # Parity reference line y = x (share cited equals share of population).
  # The previous `intercept = 1` drew y = x + 1 instead.
  geom_abline(intercept = 0, slope = 1, color = "grey65") +
  # mj_filtered has no race_ethnicity column, so these grey points are drawn
  # in every panel as a backdrop.
  geom_point(data = mj_filtered, color = "grey85") +
  geom_point(aes(color = race_ethnicity)) +
  facet_wrap(~race_ethnicity, ncol = 4) +
  labs(
    y = "Percent population",
    x = "Percent cited",
    title = "Marijuana citations by race compared to town's population",
    subtitle = "Minorities tend to be cited disproportionately to the proportion of the population of the towns they live in.\nBased on citations between 2011 and 2014.",
    caption = "SOURCE: National Incident-Based Reporting System, U.S. Census \nAndrew Ba Tran/TrendCT.org"
  ) +
  theme_bw(base_family = "Lato Regular") +
  theme(
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = "bold", family = "Lato Regular", size = 22),
    plot.caption = element_text(
      face = "bold", family = "Lato Regular", size = 9, color = "gray",
      margin = margin(t = 10, r = 80)
    ),
    legend.position = "none"
  )
gg