This is the exploratory visualization behind the Trend CT story: Who in CT is being cited for marijuana possession and by whom?

Visit the repo for the data used in this analysis. (Also, check out the reproducible scripts and data behind many of our other stories in our central data stories repo.)

The data used in this analysis (marijuana citations between 2011 and 2014) is from the Connecticut State Police via a FOIA request made by Evan Anderson via Muckrock.com.

What’s in this walkthrough

Exploratory analysis and visualizations of marijuana citations.

library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
library(stringr)
library(extrafont)
library(ggalt)
library(scales)
library(gridExtra)
library(grid)
library(knitr)
#install.packages("devtools")
#devtools::install_github("trendct/ctnamecleaner")
library(ctnamecleaner)
#devtools::install_github("hrecht/censusapi")
library("censusapi")
source("keys.R")

Loading and prepping the data

# Read the raw export. TRUE/FALSE are spelled out because the T/F shorthands
# are ordinary variables that can be reassigned.
mj <- read.csv(
  "data/15-218_Marijuana_Arrests_by_Agency_2011-2013_NIBRS.csv",
  stringsAsFactors = FALSE
)

# Columns are renamed by position.
colnames(mj) <- c("date", "description", "gender", "race", "ethnicity", "age")

# Drop rows whose first column is blank or holds one of the report's
# boilerplate strings. The rows that remain carry either a date (contains "/")
# or some other text, which the next step treats as a department name.
mj2 <- filter(
  mj,
  date != "",
  !grepl("Unit of", date),
  !grepl("NSS ", date),
  !grepl("Report ", date),
  !grepl("Arrests ", date),
  !grepl("IncidentDate", date)
)

# Attach a department to every row. A row whose `date` field contains no "/"
# is treated as a department header: it names the department for itself and
# for every row after it, until the next header appears.
is_header <- !grepl("/", mj2$date)
header_names <- mj2$date[is_header]

# cumsum() counts the headers seen so far. Adding 1 shifts that count into an
# index where position 1 is the "temp" placeholder, used for any rows that
# come before the first header. This is a vectorised fill-down, so it also
# works on a zero-row data frame, where a 1:nrow() loop would not.
mj2$department <- c("temp", header_names)[cumsum(is_header) + 1L]

# Leave `department_name` holding the last department seen, as the
# row-by-row version of this step did.
department_name <- if (nrow(mj2) > 0) mj2$department[nrow(mj2)] else "temp"

# Keep only rows with a recorded gender of "M" or "F", then parse the
# month/day/year strings and derive the calendar year and month from them.
# NOTE(review): the gender filter presumably also removes the department
# header rows, which would have no gender value -- confirm against the data.
mj2 <- mj2 %>%
  filter(gender %in% c("M", "F")) %>%
  mutate(
    date = mdy(date),
    year = year(date),
    month = month(date)
  )

Men and women

# Count citations for each gender value.
mj_sex <- summarise(group_by(mj2, gender), citations = n())

kable(mj_sex)
gender citations
F 1674
M 10061

Age distribution

# Coerce age to numeric; any value that is not a number becomes NA (with a
# warning) and is left out of the histogram.
mj2$age <- as.numeric(mj2$age)

# One bar per year of age, shaded by the bar's own count.
# The column is mapped by name (`age`) instead of `mj2$age`: aesthetics are
# meant to be looked up in the `data` argument, and the x axis is then labelled
# "age" rather than "mj2$age".
# NOTE(review): `..count..` is the older spelling of a computed-variable
# reference; it is kept so the script runs on the ggplot2 version it was
# written for.
ggplot(mj2, aes(x = age)) +
  geom_histogram(binwidth = 1, aes(fill = ..count..)) +
  ggtitle("Marijuana citations by age in Connecticut")

Departments that arrested the most

# Total citations per department, largest first.
mj_most <- arrange(
  summarise(group_by(mj2, department), arrests = n()),
  desc(arrests)
)

# Show the ten departments with the most citations.
kable(head(mj_most, n = 10))
department arrests
Connecticut State Police 1480
New Haven Police Department 1451
Stamford Police Department 770
Norwalk Police Department 761
West Hartford Police Department 401
New London Police Department 384
Middletown Police Department 356
East Hartford Police Department 328
Southington Police Department 289
Glastonbury Police Department 282

After adjusting for population

# Reduce department names to bare town names so they can be matched against
# town populations. fixed = TRUE makes the "." in "Dept." a literal period;
# as a regular expression it would match any character.
mj_most$town <- gsub(" Police Department", "", mj_most$department, fixed = TRUE)
mj_most$town <- gsub(" Police Dept.", "", mj_most$town, fixed = TRUE)

# ctpopulator() comes from the ctnamecleaner package. The code that follows
# relies on the result carrying a `pop2013` column, which is NA where a name
# has no population match.
mj_most2 <- ctpopulator(town, mj_most)
## [1] "Checking to see if names match..."
# Rows with no population figure, kept aside for inspection.
non_mj_most <- mj_most2[is.na(mj_most2$pop2013), , drop = FALSE]

# Rows with a population figure: citations per 10,000 residents.
mj_most2_map <- mj_most2[!is.na(mj_most2$pop2013), , drop = FALSE]
mj_most2_map$per_capita <- with(mj_most2_map, (arrests / pop2013) * 10000)

# Keep the three display columns and give them presentation headings.
mj_most2_map <- mj_most2_map[c("town", "per_capita", "arrests")]
names(mj_most2_map) <- c("Town", "Per capita arrests", "Total arrests")

# Title-case the town names, rank by the per-capita rate, show the top ten.
mj_most2_map$Town <- str_to_title(mj_most2_map$Town)
mj_most2_map <- arrange(mj_most2_map, desc(`Per capita arrests`))
kable(head(mj_most2_map, n = 10))
Town Per capita arrests Total arrests
Clinton 179.01654 237
New London 139.19095 384
Derby 132.38844 170
New Haven 111.32594 1451
Granby 104.57285 118
Norwalk 87.97790 761
Windsor 84.87973 247
Glastonbury 81.61848 282
Farmington 81.00987 206
Old Saybrook 79.93761 82

Arrests over time by department (total)

# Citations per department per year.
mj_arrests_years <- mj2 %>%
  group_by(department, year) %>%
  summarise(arrests = n())

# Reduce department names to bare town names for the population lookup.
# fixed = TRUE makes the "." in "Dept." a literal period; as a regular
# expression it would match any character.
mj_arrests_years$town <- gsub(
  " Police Department", "", mj_arrests_years$department,
  fixed = TRUE
)
mj_arrests_years$town <- gsub(
  " Police Dept.", "", mj_arrests_years$town,
  fixed = TRUE
)

# ctpopulator() comes from the ctnamecleaner package. The code that follows
# relies on the result carrying a `pop2013` column.
mj_arrests_years <- ctpopulator(town, mj_arrests_years)
## [1] "Checking to see if names match..."
# Keep only rows that have a population figure, then express the yearly
# counts per 10,000 residents.
mj_arrests_years2 <- mj_arrests_years[
  !is.na(mj_arrests_years$pop2013), ,
  drop = FALSE
]
mj_arrests_years2$per_capita <- with(
  mj_arrests_years2,
  arrests / pop2013 * 10000
)


# Bar chart of yearly citation counts, one panel per department.
gg <- ggplot(mj_arrests_years, aes(x = year, y = arrests)) +
  geom_bar(stat = "identity") +
  facet_wrap(~department, ncol = 3) +
  labs(
    x = NULL,
    y = NULL,
    title = "Total marijuana citations",
    subtitle = "Between 2011 and 2014.",
    caption = "SOURCE: National Incident-Based Reporting System, U.S. Census \nAndrew Ba Tran/TrendCT.org"
  ) +
  theme_bw(base_family = "Lato Regular") +
  # All theme overrides in one call; each sets a different element.
  theme(
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = "bold", family = "Lato Regular", size = 22),
    plot.caption = element_text(
      face = "bold", family = "Lato Regular", size = 9, color = "gray",
      margin = margin(t = 10, r = 80)
    ),
    legend.position = "none"
  )
gg

Arrests over time by department (per capita)

# Bar chart of yearly citations per 10,000 residents, one panel per
# department. Uses the table restricted to rows with a population figure.
gg <- ggplot(mj_arrests_years2, aes(x = year, y = per_capita)) +
  geom_bar(stat = "identity") +
  facet_wrap(~department, ncol = 3) +
  labs(
    x = NULL,
    y = NULL,
    title = "Per capita marijuana citations",
    subtitle = "Per 10,000 residents. Between 2011 and 2014.",
    caption = "SOURCE: National Incident-Based Reporting System, U.S. Census \nAndrew Ba Tran/TrendCT.org"
  ) +
  theme_bw(base_family = "Lato Regular") +
  # All theme overrides in one call; each sets a different element.
  theme(
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = "bold", family = "Lato Regular", size = 22),
    plot.caption = element_text(
      face = "bold", family = "Lato Regular", size = 9, color = "gray",
      margin = margin(t = 10, r = 80)
    ),
    legend.position = "none"
  )
gg

# Percent change in citations from 2011 to 2013, per department.
# This reassigns `mj_arrests_years`, replacing the per-year table built
# earlier in the script.
mj_arrests_years <- summarise(group_by(mj2, department, year), arrests = n())

# One row per department with a column for each year.
mj_arrests_years <- spread(mj_arrests_years, year, arrests)

# A department with no 2011 or 2013 count gets NA here.
mj_arrests_years <- mutate(
  mj_arrests_years,
  per_change = round((`2013` - `2011`) / `2011` * 100, 2)
)
mj_arrests_years <- arrange(mj_arrests_years, desc(per_change))

kable(head(mj_arrests_years, n = 10))
department 2011 2012 2013 per_change
Thomaston Police Department 2 2 7 250.00
C.C.S.U. Police Department 2 3 5 150.00
Orange Police Department 16 33 26 62.50
Ansonia Police Department 25 28 39 56.00
Plainfield Police Department 11 NA 16 45.45
Norwalk Police Department 221 225 315 42.53
Granby Police Department 29 48 41 41.38
Clinton Police Department 61 93 83 36.07
East Hartford Police Department 117 61 150 28.21
New Milford Police Department 22 21 28 27.27

Arrests by race by department

# Build a single race/ethnicity label per citation. local() keeps the helper
# variables out of the script's workspace.
mj2$race_ethnicity <- local({
  # Single-letter codes and the labels they expand to.
  labels <- c(
    A = "Asian",
    B = "Black",
    W = "White",
    I = "Indian",
    U = "Unknown"
  )

  # Ethnicity "H" takes precedence over race; otherwise use the race code.
  combined <- ifelse(mj2$ethnicity == "H", "Hispanic", mj2$race)

  # Expand codes found in the table; every other value passes through as is.
  pos <- match(combined, names(labels))
  ifelse(is.na(pos), combined, unname(labels[pos]))
})


# Citations per department, broken down by race/ethnicity label.
mj_arrests_race <- summarise(
  group_by(mj2, department, race_ethnicity),
  arrests = n()
)

## chart
# Horizontal bars per race/ethnicity, one panel per department. The count
# axis is scaled independently in each panel.
ggplot(mj_arrests_race, aes(x = race_ethnicity, y = arrests)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~department, ncol = 4, scales = "free_x")

Bringing in census population data

# Census variables requested below and the label each is given here:
#   B02001_001E - Total
#   B02001_002E - White
#   B02001_003E - Black
#   B02001_004E - Indian
#   B02001_005E - Asian
#   B03001_003E - Hispanic

# Pull the 2014-vintage ACS 5-year figures for every county subdivision in
# state 09. `census_key` is not defined in this part of the script.
race_towns <- getCensus(
  name = "acs5",
  vintage = 2014,
  key = census_key,
  vars = c(
    "NAME",
    "B02001_001E", "B02001_002E", "B02001_003E",
    "B02001_004E", "B02001_005E", "B03001_003E"
  ),
  region = "county subdivision:*",
  regionin = "state:09"
)

# NOTE(review): the columns are renamed by position, which assumes the result
# has exactly these ten columns in this order -- confirm against the installed
# censusapi version.
names(race_towns) <- c(
  "town", "state", "county", "countysub",
  "total_pop", "White", "Black", "Indian", "Asian", "Hispanic"
)

# Drop the geography code columns.
race_towns <- race_towns[
  c("town", "total_pop", "White", "Black", "Indian", "Asian", "Hispanic")
]

# Drop rows whose name contains "County subdivisions", then strip " town" and
# everything after it from the remaining names.
race_towns <- race_towns[
  !grepl("County subdivisions", race_towns$town), ,
  drop = FALSE
]
race_towns$town <- gsub(" town.*", "", race_towns$town)

# Stack columns 3 to 7 (the five race/ethnicity counts) into one row per town
# and group, then express each count as a percentage of the town total.
race_towns_long <- gather(race_towns, "race_ethnicity", "population", 3:7)
race_towns_long <- mutate(
  race_towns_long,
  percent_population = round(population / total_pop * 100, 2)
)

Percent of tickets by race compared to percent of population by race

# Share of each department's citations that went to each race/ethnicity
# group. TRUE is spelled out because the T shorthand is an ordinary variable
# that can be reassigned.
mj_arrests_race_spread <- mj_arrests_race %>%
  group_by(department) %>%
  mutate(
    total = sum(arrests, na.rm = TRUE),
    percent = round(arrests / total * 100, 2)
  )

# Reduce department names to bare town names for the join with the census
# table. fixed = TRUE makes the "." in "Dept." a literal period; as a regular
# expression it would match any character.
mj_arrests_race_spread$town <- gsub(
  " Police Department", "", mj_arrests_race_spread$department,
  fixed = TRUE
)
mj_arrests_race_spread$town <- gsub(
  " Police Dept.", "", mj_arrests_race_spread$town,
  fixed = TRUE
)



# Attach each town's population share for the matching race/ethnicity group.
# The join keys are named explicitly, so the join cannot silently change if
# either table later gains another column with a shared name.
mj_arrests_race_spread <- left_join(
  mj_arrests_race_spread,
  race_towns_long,
  by = c("town", "race_ethnicity")
)

# Keep rows that found a census match, and leave out the "Indian" group.
mj_arrests_filtered <- subset(mj_arrests_race_spread, !is.na(total_pop))
mj_arrests_filtered <- filter(mj_arrests_filtered, race_ethnicity != "Indian")

# The two plotted columns only, without the race/ethnicity label.
mj_filtered <- mj_arrests_filtered[c("percent", "percent_population")]

# Scatter of percent cited (x) against percent of town population (y), one
# panel per race/ethnicity group.
gg <- ggplot(mj_arrests_filtered, aes(percent, percent_population))

# Reference line where the share cited equals the share of population
# (y = x). With `intercept = 1` alone the line drawn is y = x + 1, offset one
# percentage point from parity.
gg <- gg + geom_abline(intercept = 0, slope = 1, color = "grey65")

# Grey backdrop of all points. NOTE(review): `mj_filtered` has no
# `race_ethnicity` column, so this layer is presumably repeated in every
# panel -- confirm in the rendered plot.
gg <- gg + geom_point(data = mj_filtered, color = "grey85")

# Coloured points for the panel's own group.
gg <- gg + geom_point(aes(color = race_ethnicity))
gg <- gg + facet_wrap(~race_ethnicity, ncol = 4)
gg <- gg + labs(
  y = "Percent population",
  x = "Percent cited",
  title = "Marijuana citations by race compared to town's population",
  subtitle = "Minorities tend to be cited disproportionately to the proportion of the population of the towns they live in.\nBased on citations between 2011 and 2014.",
  caption = "SOURCE: National Incident-Based Reporting System, U.S. Census \nAndrew Ba Tran/TrendCT.org"
)
gg <- gg + theme_bw(base_family = "Lato Regular")
gg <- gg + theme(
  axis.ticks.y = element_blank(),
  panel.border = element_blank(),
  legend.key = element_blank(),
  plot.title = element_text(face = "bold", family = "Lato Regular", size = 22),
  plot.caption = element_text(
    face = "bold", family = "Lato Regular", size = 9, color = "gray",
    margin = margin(t = 10, r = 80)
  ),
  legend.position = "none"
)
gg