R Plots

Various plots I've made using R with publicly available datasets.

R Logo

R Plot
View R Code

 # R Code for NHANES plot
 llibrary(tidyverse)

 cdcdata <- read_csv("data/National_Health_and_Nutrition_Examination_Survey__NHANES__-_National_Cardiovascular_Disease_Surveillance_System.csv")
 
 count(cdcdata, Class, Topic)
 
 overall_heart_disease <- filter(
   cdcdata, 
   Topic == "Hypertension",
   Break_Out_Category == "Overall", 
   Data_Value_Type == "Age-Standardized",
   QuestionId != "NH034",
   QuestionId != "NH024",
   !is.na(Data_Value)
 )
 
 hd_clean <- separate(overall_heart_disease, YearStart, c("start", "end"), sep = "-") %>%
   mutate(
     start = as.numeric(start),
     end = as.numeric(end)
   ) %>% 
   mutate(
     label = case_when(
       QuestionId == "NH033" ~ "BP Control among US adults (BP=130/90)",
       QuestionId == "NH023" ~ "BP Control among US adults (BP=140/90)",
       QuestionId == "NH030" ~ "Prevalence of HTN (BP=130/90)",
       QuestionId == "NH020" ~ "Prevalence of HTN (BP=140/90)",
       QuestionId == "NH031" ~ "HTN awareness among adults with HTN (BP=130/90)",
       QuestionId == "NH021" ~ "HTN awareness among adults with HTN (BP=140/90)",
       QuestionId == "NH032" ~ "HTN Medication Use in adults with HTN (BP=130/90)",
       QuestionId == "NH022" ~ "HTN Medication Use in adults with HTN (BP=140/90)"
       
     )
   )
 
 library(showtext)
 font_add_google(name = "Montserrat", family = "Montserrat")
 showtext_auto()
 
 ggplot(hd_clean, aes(start, Data_Value)) + 
   geom_smooth(method = "lm", color = "#A3C7D6", fill = "#83A2FF") + 
   geom_errorbar(
     aes(ymin = Low_Confidence_Limit, ymax = High_Confidence_Limit), 
     width = 0.7,
     color = "#9F73AB"
   ) + 
   geom_point(color = "#3F3B6C") +
   facet_wrap("label", scales = "free_y", ncol = 3) +
   theme_light() +
   labs(
     title = "National Cardiovascular Disease (1999 - 2016)",
     subtitle = "Data from NHANES provided by CDC",
     # title = "NHANES Data on Cardiovascular Disease",
     x = "Year", y = "Age Standardized Rate", 
   ) +
   theme(
     plot.title = element_text(hjust = 0.0),
     text = element_text(family = "Montserrat"),
     strip.background = element_rect(fill = "#3F3B6C")
   )

         
R Plot
View R Code

 # R Code for Chloropleth Map
library(sf)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)

library(showtext)
font_add_google(name = "Montserrat", family = "Montserrat")
showtext_auto()

# get US Counties data
counties <- tigris::counties(cb = TRUE) |> 
  # make the map less detailed so it is easier to plot
  st_simplify(dTolerance = 1000) |>
  # create the fips code
  mutate(FIPS = paste0(STATEFP, COUNTYFP))


# read in the hospital locations
hospitals_raw <- read_csv("rawdata/best_hospitals_states.csv", skip = 1) |> 
  # remove hawaii and alaska
  filter(!State %in% c("AK", "HI")) |> 
  # turn it into a spatial dataframe
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) |> 
  # reproject to conic albers for plotting
  st_transform(5070)

# read USDA data
groceries_stores <- read_csv("rawdata/StateAndCountyData.csv") |> 
  # filter to the grocery variable
  filter(Variable_Code == "GROCPTH16") |> 
  # select only the fips and our variable of interest, renaming it
  select(FIPS, grocery_store_count = Value) |> 
  # create a FIPS code by adding 0s to the left
  mutate(FIPS = sprintf("%05s", FIPS)) 

# read in the social vulnerability index data we cleaned
# join to our dataset
all_variables <- read_csv("cleandata/joined-svi-places.csv") |> 
  left_join(groceries_stores)

# join all of the variables to the US counties
# remove hawaii, alaska, and territories
all_vars_sf <- left_join(counties, all_variables) |> 
  filter(!STUSPS %in% c("HI", "AK", "AS", "MP", "VI", "PR", "GU")) |> 
  st_transform(5070)


# create a plot of health disease and locate hospitals on the map
ggplot() +
  geom_sf(aes(fill = CHD), data = all_vars_sf, lwd = 0, alpha = 1) +
  geom_sf(data = hospitals_raw, color = "yellow", pch = 8) +
  scale_fill_viridis_c(option = "F") +
  theme_void() +
  theme(legend.position = "bottom") + 
  labs(fill = "Coronary Heart Disease rate",
       subtitle = "*Locations of AI-Powered Healthcare") +
  theme(
    legend.title = element_text(family = "Montserrat", face = "bold", size = 11), 
    legend.position = "right",
    plot.subtitle =element_text(hjust = 0.5, size = 10, family = "Montserrat"),
    axis.title = element_text(family = "Montserrat", face = "bold", size = 11),  
    legend.key.size = unit(2, "lines")  
  )


# remove all chracter columns for regression
model_data <- all_vars_sf |> 
  st_drop_geometry() |> 
  select(-where(is.character), -ALAND, -AWATER) 

# predict obesity by everything else 
mod <- lm(OBESITY ~ ., model_data) 

mod_coefs <- broom::tidy(mod)

model_data
skimr::skim(model_data)
         
R Plot
View R Code

 # R Code for Georgia Plot showing percentile percentage of Uninsured Persons
 library(sf)

# print out the layes that are available
st_layers("data/SVI2020_GEORGIA_tract.gdb/")

# read in as sf object
tracts <- read_sf("data/SVI2020_GEORGIA_tract.gdb", layer = "SVI2020_GEORGIA_tract")

library(showtext)
font_add_google(name = "Montserrat", family = "Montserrat")
showtext_auto()

library(tidyverse)

map <- ggplot(tracts, aes(fill = EPL_UNINSUR)) +
  geom_sf(lwd = 0.15, color = "black")

# we use filter to remove outliers
tracts %>% 
  filter(EPL_UNINSUR > -200) %>% 
  ggplot(aes(fill = EPL_UNINSUR)) +
  geom_sf(lwd = 0.15, color = "black")

# instead of removing the values, we make the values missing
tracts %>% 
  mutate(EPL_UNINSUR = ifelse(EPL_UNINSUR > -200, EPL_UNINSUR, NA)) %>%
  select(EPL_UNINSUR) %>% 
  ggplot(aes(fill = EPL_UNINSUR)) +
  geom_sf(lwd = 0.15, color = "black") +
  # using pre-defined color scheme
  scale_fill_viridis_c(option = "viridis") +
  #specify desired color gradient
  #scale_fill_gradient(low = "red", high = "yellow") + 
  theme_minimal() +
  labs(
    title = "Percentile Percentage of Uninsured Persons",
    #subtitle = "Ranking values range from 0 to 1, with higher values indicating greater
    #vulnerability",
    caption = "*Data from CDC/ATSDR Social Vulnerability Index (2020)
    Higher values indicating greater vulnerability", 
    fill= "Ranking Values"
  ) + 
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(family = "Montserrat")
  )


# make the map interactive 
#plotly::ggplotly(map)

         
R Plot
View R Code

 # R Code for Spatial Regression plot
 #This is my regression 

library(sf) # for spatial analysis
library(tidyverse)
install.packages("tigris")

# read the cleaned csv
places_df_raw <- read_csv("cleandata/joined-svi-places.csv") 

# get the counties geometries from tigirs R package
counties <- tigris::counties(cb = TRUE)

# combine the state fips code with the county fips code
# to create the full FIPS code. 
# need to convert to longitude and latitude (crs 4326) 
# to compare with hospital locations
counties_clean <- counties |> 
  transmute(FIPS = paste0(STATEFP, COUNTYFP)) |> 
  st_transform(crs = 4326)

# create the data frame with locations of health care systems
ai_hc_systems <- data.frame(
  health_system = c(
    "Kaiser Permanente", "Stanford Health", "UC San Diego", "Mayo Clinic",
    "Mass General Brigham", "NYU Langone Health", "Duke Health",
    "Vanderbilt Health", "University of Wisconsin Madison Health"
  ),
  latitude = c(
    37.82833, 37.433868, 32.87624, 44.022446, 42.3471718, 40.7420735, 36.0070213,
    36.1409237, 43.0763751
  ),
  longitude = c(
    -122.23247, -122.18, -117.210327, -92.4680463, -71.0825069, -73.9743366,
    -78.9373479, -86.8016342, -89.4320049
  )
) |> 
  st_as_sf(crs = 4326, coords = c("longitude", "latitude"))

# we want to filter to locations that are 200 miles away
one_mile_meters <- 1609.34 # one mile 
two_hundred_miles <- one_mile_meters * 200 # two hundred miles away

# create a 200 mile radius around each hospital
hc_two_hundred_miles_buffer <- st_buffer(ai_hc_systems, two_hundred_miles)

# spatial filter all counties to the areas that are within 200 miles
counties_nearish_ai_hc <- st_filter(
  counties_clean, 
  hc_two_hundred_miles_buffer
)

# calculate distance from AI health care system
dist_matrix <- st_distance(counties_nearish_ai_hc, ai_hc_systems)

# find the minimum distance for each county to a AI hc system
distance_from_ai_facility <- Rfast::rowMins(dist_matrix, value = TRUE)


places_with_distance <- inner_join(
  places_df_raw,
  # create a new column called nearest_ai_facility which calculates the distances to nearest hospital in miles
  mutate(counties_nearish_ai_hc, nearest_ai_facility = distance_from_ai_facility / one_mile_meters)
)

# fit a regression that estimates 
mod <- lm(OBESITY ~ nearest_ai_facility + CHD + CASTHMA, data = places_with_distance)
summary(mod)


ggplot(places_with_distance, aes(x = nearest_ai_facility, y = OBESITY, color = CHD)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  labs(
    title = "Obesity Rates vs. Distance to AI Healthcare Facility",
    x = "Distance to Nearest AI Facility (Miles)",
    y = "Obesity Rate (%)",
    color = "Coronary Heart Disease Rate (%)"
  ) +
  theme_minimal() +
  scale_color_gradient(low = "yellow", high = "red") +
  theme(legend.position = "bottom")