We are using a common dataset across the R and Python coding examples.
The code below reads in the csv file final_with_deceased.csv. It also calls the libraries we are going to use.
library(ggplot2)
library(dplyr)
library(tidyr)
mydata <- as.data.frame(read.csv("final_with_deceased.csv"))
The first step when working with a data file is an exploratory data analysis.
head(mydata, 5) # prints first five rows
tail(mydata, 5) # prints last five rows
sample_n(mydata, 5) # prints five randomly selected rows
print(dim(mydata)) # print number of columns and rows
[1] 156030 27
summary(mydata) # overview of each variable in the dataset
person_id birth_datetime race_source_value ethnicity_source_value gender_source_value visit_occurrence_id
Min. : 1 Length:156030 Length:156030 Length:156030 Length:156030 Min. : 1
1st Qu.: 30962 Class :character Class :character Class :character Class :character 1st Qu.: 782279
Median : 62112 Mode :character Mode :character Mode :character Mode :character Median :1566715
Mean : 62078 Mean :1566862
3rd Qu.: 93254 3rd Qu.:2358876
Max. :124150 Max. :3139398
visit_start_date visit_end_date visit_type condition observation_source age_at_visit_years
Length:156030 Length:156030 Length:156030 Length:156030 Length:156030 Min. : 0.01
Class :character Class :character Class :character Class :character Class :character 1st Qu.: 16.16
Mode :character Mode :character Mode :character Mode :character Mode :character Median : 36.07
Mean : 37.86
3rd Qu.: 57.17
Max. :110.73
measurement_Date body_height_cm bmi body_temperature_c body_weight_kg systolic diastolic
Length:156030 Min. : 50.6 Min. :12.70 Min. :36.1 Min. : 1.80 Min. : 97.0 Min. : 67.00
Class :character 1st Qu.:159.7 1st Qu.:27.20 1st Qu.:38.5 1st Qu.: 66.80 1st Qu.:113.0 1st Qu.: 76.00
Mode :character Median :167.7 Median :28.00 Median :39.7 Median : 78.00 Median :120.0 Median : 80.00
Mean :163.1 Mean :27.24 Mean :39.7 Mean : 73.22 Mean :121.3 Mean : 80.42
3rd Qu.:176.1 3rd Qu.:29.80 3rd Qu.:40.9 3rd Qu.: 87.40 3rd Qu.:128.0 3rd Qu.: 84.00
Max. :198.7 Max. :53.30 Max. :42.2 Max. :181.20 Max. :201.0 Max. :121.00
NA's :152974 NA's :153062 NA's :70085 NA's :76295 NA's :76285 NA's :76285
heart_rate_bpm oxygen_saturation_percent respiratory_rate_per_minute flu_last_administered tdap_last_administered
Min. : 50.0 Min. :66.10 Min. :12.00 Length:156030 Length:156030
1st Qu.: 85.1 1st Qu.:78.50 1st Qu.:18.20 Class :character Class :character
Median :122.0 Median :82.00 Median :25.50 Mode :character Mode :character
Mean :123.4 Mean :82.01 Mean :25.56
3rd Qu.:161.5 3rd Qu.:85.50 3rd Qu.:32.70
Max. :200.0 Max. :89.00 Max. :40.00
NA's :76295 NA's :79221 NA's :76295
mmr_last_administered polio_last_administered deceased
Length:156030 Length:156030 Length:156030
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
All of our dates are character variables, so let’s convert those to dates.
mydata <- mydata %>%
mutate(visit_start_date = as.Date(visit_start_date)) %>%
mutate(visit_end_date = as.Date(visit_end_date) ) %>%
mutate(birth_datetime = as.Date(birth_datetime)) %>%
mutate(flu_last_administered = as.Date(flu_last_administered) ) %>%
mutate(tdap_last_administered = as.Date(tdap_last_administered) ) %>%
mutate(mmr_last_administered = as.Date(mmr_last_administered)) %>%
mutate(polio_last_administered = as.Date(polio_last_administered))
summary(mydata)
person_id birth_datetime race_source_value ethnicity_source_value gender_source_value visit_occurrence_id
Min. : 1 Min. :1909-06-24 Length:156030 Length:156030 Length:156030 Min. : 1
1st Qu.: 30962 1st Qu.:1953-03-09 Class :character Class :character Class :character 1st Qu.: 782279
Median : 62112 Median :1971-06-14 Mode :character Mode :character Mode :character Median :1566715
Mean : 62078 Mean :1971-12-17 Mean :1566862
3rd Qu.: 93254 3rd Qu.:1994-04-29 3rd Qu.:2358876
Max. :124150 Max. :2020-04-21 Max. :3139398
visit_start_date visit_end_date visit_type condition observation_source age_at_visit_years
Min. :1909-09-17 Min. :1909-09-17 Length:156030 Length:156030 Length:156030 Min. : 0.01
1st Qu.:2007-05-16 1st Qu.:2007-10-26 Class :character Class :character Class :character 1st Qu.: 16.16
Median :2020-02-22 Median :2020-02-24 Mode :character Mode :character Mode :character Median : 36.07
Mean :2009-10-28 Mean :2009-12-29 Mean : 37.86
3rd Qu.:2020-03-07 3rd Qu.:2020-03-09 3rd Qu.: 57.17
Max. :2020-05-26 Max. :2020-05-27 Max. :110.73
measurement_Date body_height_cm bmi body_temperature_c body_weight_kg systolic diastolic
Length:156030 Min. : 50.6 Min. :12.70 Min. :36.1 Min. : 1.80 Min. : 97.0 Min. : 67.00
Class :character 1st Qu.:159.7 1st Qu.:27.20 1st Qu.:38.5 1st Qu.: 66.80 1st Qu.:113.0 1st Qu.: 76.00
Mode :character Median :167.7 Median :28.00 Median :39.7 Median : 78.00 Median :120.0 Median : 80.00
Mean :163.1 Mean :27.24 Mean :39.7 Mean : 73.22 Mean :121.3 Mean : 80.42
3rd Qu.:176.1 3rd Qu.:29.80 3rd Qu.:40.9 3rd Qu.: 87.40 3rd Qu.:128.0 3rd Qu.: 84.00
Max. :198.7 Max. :53.30 Max. :42.2 Max. :181.20 Max. :201.0 Max. :121.00
NA's :152974 NA's :153062 NA's :70085 NA's :76295 NA's :76285 NA's :76285
heart_rate_bpm oxygen_saturation_percent respiratory_rate_per_minute flu_last_administered tdap_last_administered
Min. : 50.0 Min. :66.10 Min. :12.00 Min. :1908-10-06 Min. :1921-06-24
1st Qu.: 85.1 1st Qu.:78.50 1st Qu.:18.20 1st Qu.:2006-10-14 1st Qu.:2009-09-19
Median :122.0 Median :82.00 Median :25.50 Median :2019-09-13 Median :2013-06-11
Mean :123.4 Mean :82.01 Mean :25.56 Mean :2009-04-26 Mean :2007-07-04
3rd Qu.:161.5 3rd Qu.:85.50 3rd Qu.:32.70 3rd Qu.:2019-11-07 3rd Qu.:2016-11-04
Max. :200.0 Max. :89.00 Max. :40.00 Max. :2019-12-31 Max. :2020-05-25
NA's :76295 NA's :79221 NA's :76295 NA's :31146
mmr_last_administered polio_last_administered deceased
Min. :1910-07-21 Min. :1909-09-11 Length:156030
1st Qu.:1956-12-04 1st Qu.:1957-01-03 Class :character
Median :1975-01-21 Median :1975-04-08 Mode :character
Mean :1975-06-06 Mean :1975-08-21
3rd Qu.:1997-09-01 3rd Qu.:1998-01-02
Max. :2020-05-19 Max. :2020-05-25
NA's :1807 NA's :228
Next, lets look into any missing data.
n <- nrow(mydata) # n is number of rows (observations)
missing_count <- colSums(is.na(mydata)) # calculate number missing
missing_pct <- missing_count/n * 100 # calculate percent missing
non_missing_count <- n - missing_count # calculate number non-missing
print(cbind(missing_count, missing_pct, non_missing_count))
missing_count missing_pct non_missing_count
person_id 0 0.0000000 156030
birth_datetime 0 0.0000000 156030
race_source_value 0 0.0000000 156030
ethnicity_source_value 0 0.0000000 156030
gender_source_value 0 0.0000000 156030
visit_occurrence_id 0 0.0000000 156030
visit_start_date 0 0.0000000 156030
visit_end_date 0 0.0000000 156030
visit_type 0 0.0000000 156030
condition 0 0.0000000 156030
observation_source 0 0.0000000 156030
age_at_visit_years 0 0.0000000 156030
measurement_Date 0 0.0000000 156030
body_height_cm 152974 98.0414023 3056
bmi 153062 98.0978017 2968
body_temperature_c 70085 44.9176440 85945
body_weight_kg 76295 48.8976479 79735
systolic 76285 48.8912389 79745
diastolic 76285 48.8912389 79745
heart_rate_bpm 76295 48.8976479 79735
oxygen_saturation_percent 79221 50.7729283 76809
respiratory_rate_per_minute 76295 48.8976479 79735
flu_last_administered 0 0.0000000 156030
tdap_last_administered 31146 19.9615459 124884
mmr_last_administered 1807 1.1581106 154223
polio_last_administered 228 0.1461257 155802
deceased 0 0.0000000 156030
Now we need to prepare our data for analysis.
# create a variable for length of stay
mydata$los = as.numeric(mydata$visit_end_date - mydata$visit_start_date)
summary(mydata$los)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 61.74 0.00 38325.00
# modify labels for the deceased column
mydata$deceased_flag = mydata$deceased
mydata = mydata %>%
mutate(deceased_flag = recode(deceased_flag, 'Y' = 'Deceased', 'N' = 'Alive'))
# make a column for visit month and year
mydata$visit_year = as.numeric(format(mydata$visit_start_date, "%Y"))
mydata$visit_month = as.numeric(format(mydata$visit_start_date, "%m"))
# make gender, race, and ethnicity factors
mydata$race_source_value = as.factor(mydata$race_source_value)
mydata$ethnicity_source_value = as.factor(mydata$ethnicity_source_value)
mydata$gender_source_value = as.factor(mydata$gender_source_value)
# make other categorical variables into factors
mydata$visit_type = as.factor(mydata$visit_type)
mydata$deceased = as.factor(mydata$deceased)
mydata$deceased_flag = as.factor(mydata$deceased_flag)
After we do this work, we can summarize the data again!
summary(mydata)
person_id birth_datetime race_source_value ethnicity_source_value gender_source_value visit_occurrence_id
Min. : 1 Min. :1909-06-24 asian : 10813 hispanic : 17192 F:77339 Min. : 1
1st Qu.: 30962 1st Qu.:1953-03-09 black : 13118 nonhispanic:138838 M:78691 1st Qu.: 782279
Median : 62112 Median :1971-06-14 native: 805 Median :1566715
Mean : 62078 Mean :1971-12-17 other : 158 Mean :1566862
3rd Qu.: 93254 3rd Qu.:1994-04-29 white :131136 3rd Qu.:2358876
Max. :124150 Max. :2020-04-21 Max. :3139398
visit_start_date visit_end_date visit_type condition observation_source
Min. :1909-09-17 Min. :1909-09-17 Emergency Room Visit: 20 Length:156030 Length:156030
1st Qu.:2007-05-16 1st Qu.:2007-10-26 Inpatient Visit : 21056 Class :character Class :character
Median :2020-02-22 Median :2020-02-24 Outpatient Visit :134954 Mode :character Mode :character
Mean :2009-10-28 Mean :2009-12-29
3rd Qu.:2020-03-07 3rd Qu.:2020-03-09
Max. :2020-05-26 Max. :2020-05-27
age_at_visit_years measurement_Date body_height_cm bmi body_temperature_c body_weight_kg
Min. : 0.01 Length:156030 Min. : 50.6 Min. :12.70 Min. :36.1 Min. : 1.80
1st Qu.: 16.16 Class :character 1st Qu.:159.7 1st Qu.:27.20 1st Qu.:38.5 1st Qu.: 66.80
Median : 36.07 Mode :character Median :167.7 Median :28.00 Median :39.7 Median : 78.00
Mean : 37.86 Mean :163.1 Mean :27.24 Mean :39.7 Mean : 73.22
3rd Qu.: 57.17 3rd Qu.:176.1 3rd Qu.:29.80 3rd Qu.:40.9 3rd Qu.: 87.40
Max. :110.73 Max. :198.7 Max. :53.30 Max. :42.2 Max. :181.20
NA's :152974 NA's :153062 NA's :70085 NA's :76295
systolic diastolic heart_rate_bpm oxygen_saturation_percent respiratory_rate_per_minute
Min. : 97.0 Min. : 67.00 Min. : 50.0 Min. :66.10 Min. :12.00
1st Qu.:113.0 1st Qu.: 76.00 1st Qu.: 85.1 1st Qu.:78.50 1st Qu.:18.20
Median :120.0 Median : 80.00 Median :122.0 Median :82.00 Median :25.50
Mean :121.3 Mean : 80.42 Mean :123.4 Mean :82.01 Mean :25.56
3rd Qu.:128.0 3rd Qu.: 84.00 3rd Qu.:161.5 3rd Qu.:85.50 3rd Qu.:32.70
Max. :201.0 Max. :121.00 Max. :200.0 Max. :89.00 Max. :40.00
NA's :76285 NA's :76285 NA's :76295 NA's :79221 NA's :76295
flu_last_administered tdap_last_administered mmr_last_administered polio_last_administered deceased los
Min. :1908-10-06 Min. :1921-06-24 Min. :1910-07-21 Min. :1909-09-11 N:139602 Min. : 0.00
1st Qu.:2006-10-14 1st Qu.:2009-09-19 1st Qu.:1956-12-04 1st Qu.:1957-01-03 Y: 16428 1st Qu.: 0.00
Median :2019-09-13 Median :2013-06-11 Median :1975-01-21 Median :1975-04-08 Median : 0.00
Mean :2009-04-26 Mean :2007-07-04 Mean :1975-06-06 Mean :1975-08-21 Mean : 61.74
3rd Qu.:2019-11-07 3rd Qu.:2016-11-04 3rd Qu.:1997-09-01 3rd Qu.:1998-01-02 3rd Qu.: 0.00
Max. :2019-12-31 Max. :2020-05-25 Max. :2020-05-19 Max. :2020-05-25 Max. :38325.00
NA's :31146 NA's :1807 NA's :228
deceased_flag visit_year visit_month
Alive :139602 Min. :1909 Min. : 1.000
Deceased: 16428 1st Qu.:2007 1st Qu.: 3.000
Median :2020 Median : 3.000
Mean :2009 Mean : 4.689
3rd Qu.:2020 3rd Qu.: 7.000
Max. :2020 Max. :12.000
The last step in data cleaning is separating out the conditions! Right now, they are all in one string together, separated by a colon (:).
head(mydata$condition, 10)
[1] "Dyspnea:Pneumonia:Respiratory distress:Wheezing" "Viral sinusitis"
[3] "Sore throat symptom:Dyspnea:Wheezing" "Perennial allergic rhinitis"
[5] "Cough" "Chronic sinusitis"
[7] "Dyspnea:Wheezing:Cough" "Chronic sinusitis"
[9] "Cough" "Respiratory distress:Sore throat symptom:Cough:Pneumonia"
# we're going to separate each condition into a row
longdata <- mydata %>% mutate(condition = strsplit(condition, ':')) %>%
unnest(condition) %>%
group_by(person_id) %>%
mutate(row=row_number())
longdata$condition <- as.factor(longdata$condition)
summary(longdata$condition)
Acute bacterial sinusitis Acute bronchitis
939 7097
Acute respiratory distress syndrome Acute respiratory failure
2454 9139
Acute viral pharyngitis Asthma
8347 202
Childhood asthma Chronic sinusitis
1715 26783
Cough Cystic fibrosis
63448 39
Dyspnea Emphysematous bronchitis
18533 1875
Hemoptysis Nasal congestion
904 4324
Non-small cell carcinoma of lung, TNM stage 1 Non-small cell lung cancer
1424 1429
Perennial allergic rhinitis Perennial allergic rhinitis with seasonal variation
2935 2964
Pneumonia Primary small cell malignant neoplasm of lung, TNM stage 1
20464 275
Pulmonary emphysema Respiratory distress
2130 19139
Seasonal allergic rhinitis Sinusitis
1506 943
Small cell carcinoma of lung Sore throat symptom
274 13166
Streptococcal sore throat Viral sinusitis
1840 16232
Wheezing
18533
# look at conditions by visit types
longdata %>% group_by(visit_type, condition) %>% summarize(n = n())
NA
NA
NA
Distributions can be shown several ways.
One way is a column histogram. Let’s look at age at visit (in years).
# this histogram has 30 bins
mydata %>%
ggplot(aes(x=age_at_visit_years)) +
geom_histogram(bins=30, color="#e9ecef", alpha=0.4) +
labs(title = "Distribution of Age at Visit (years)") +
xlab("Age (years)")
# this histogram only has 25 bins
mydata %>%
ggplot(aes(x=age_at_visit_years)) +
geom_histogram(bins=25, color="#e9ecef", alpha=0.4) +
labs(title = "Distribution of Age at Visit (years)") +
xlab("Age (years)")
NA
NA
Boxplots are another way to show distributions.
# boxplot of BMI
ggplot(data=subset(mydata, !is.na(bmi)), aes(y=bmi)) +
geom_boxplot() +
ggtitle("BMI Distribution") +
ylab("BMI")
Let’s also look at the distribution of the length of stay.
mydata %>%
ggplot(aes(x=los)) +
geom_histogram(bins=30, color="#e9ecef", alpha=0.4) +
labs(title = "Distribution of Length of Stay (Days)") +
xlab("Days")
NA
NA
Running this code shows there is clearly a value that makes no sense!
We can look in the data for which observations may be causing this issue. We’ll look at visits that are not outpatient visits that do have more than 100 days in the hospital.
mydata %>%
filter(los > 100, visit_type != 'Outpatient Visit')
NA
Let’s just look at values that are less than 100 days and that are not outpatient visits
mydata %>%
filter(los < 100, visit_type != 'Outpatient Visit') %>%
ggplot(aes(x=los)) +
geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=30) +
labs(title ="Distribution of Length of Stay (Non-Outpatient Visits)") +
xlab("Length of Stay (Days)")
mydata %>%
filter(los < 100, visit_type != 'Outpatient Visit') %>%
ggplot(aes(x=los)) +
geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
labs(title ="Distribution of Length of Stay (Non-Outpatient Visits)") +
xlab("Length of Stay (Days)")
We can make histograms and density plots of any continuous variable grouped by a second variable.
Here are some of blood pressure by gender.
# create a new dataset that is long-- with a row for systolic and a row for diastolic blood pressure per visit and person
bplong <- mydata %>%
pivot_longer(cols = systolic:diastolic, names_to = 'Blood_Pressure_Type', values_to = 'Blood_Pressure_Value' )
bplong %>%
filter(!is.na(Blood_Pressure_Value)) %>%
ggplot(aes(x=gender_source_value, y=Blood_Pressure_Value , fill=Blood_Pressure_Type)) +
geom_boxplot() +
scale_fill_manual(values=c("darkgreen", "orange")) +
labs(title = "Blood Pressure by Gender") +
xlab("Gender")
We can also use a bar chart to plot the mean and a standard error bar for each of the above categories.
bplong %>%
filter(!is.na(Blood_Pressure_Value)) %>%
ggplot(aes(x = interaction(Blood_Pressure_Type, gender_source_value), y = Blood_Pressure_Value)) +
geom_bar(stat = 'summary', fun = 'mean') +
labs(title = "Mean Blood Pressure by Gender") +
xlab("Blood Pressure and Gender") +
ylab("Blood Pressure (mmHg)")
You can also overlay the distributions of blood pressure by gender and have panels in your plot for systolic and diastolic.
bp <- ggplot(bplong %>% filter(!is.na(Blood_Pressure_Value)), aes(x = Blood_Pressure_Value, fill=gender_source_value)) +
geom_density(alpha = 0.4) +
scale_color_manual(values = c("pink", "lightblue")) +
facet_wrap(~Blood_Pressure_Type)
bp
Now let’s look at a comparison of length of stay for inpatient visits by gender.
# Histogram
mydata %>%
filter(los < 100, visit_type != 'Outpatient Visit') %>%
ggplot(aes(x=los,fill=gender_source_value)) +
geom_histogram(alpha=0.8, bins=30) +
scale_fill_manual(values=c("pink", "lightblue")) +
labs(title ="Distribution of Length of Stay (Non-Outpatient Visits)") +
xlab("Length of Stay (Days)")
# Density plots
mydata %>%
filter(los < 100, visit_type != 'Outpatient Visit') %>%
ggplot(aes(x=los, fill=gender_source_value)) +
geom_density( alpha=0.4) +
scale_fill_manual(values=c("pink", "lightblue"))+
labs(title ="Distribution of Length of Stay (Non-Outpatient Visits)") +
xlab("Length of Stay (Days)")
# Boxplots
mydata %>%
filter(los < 100, visit_type == 'Inpatient Visit') %>%
ggplot(aes(y=los, x=gender_source_value, fill=gender_source_value)) +
geom_boxplot() +
scale_fill_manual(values=c("pink", "lightblue")) +
labs(title = "Length of Stay for Inpatient Visits by Gender") +
xlab("Gender") +
ylab("Length of Stay (days)")
# Violin Plots
mydata %>%
filter(los < 100, visit_type == 'Inpatient Visit') %>%
ggplot(aes(y=los, x=gender_source_value, fill=gender_source_value)) +
geom_violin() +
scale_fill_manual(values=c("pink", "lightblue")) +
labs(title = "Length of Stay for Inpatient Visits by Gender") +
xlab("Gender") +
ylab("Length of Stay (days)")
There are a lot of ways to explore relationships between continuous data, as well.
mydata %>%
filter(!is.na(respiratory_rate_per_minute)) %>%
filter(!is.na(oxygen_saturation_percent)) %>%
ggplot(aes(x = respiratory_rate_per_minute, y = oxygen_saturation_percent)) +
geom_point(color='steelblue', alpha = .1) +
labs(title ="Oxygen Saturation vs. Respiratory Rate") +
xlab("Respiratory Rate (breaths per minute") +
ylab("Oxygen Saturation (%)")
We can also look at age versus oxygen saturation by outcome.
mydata %>%
filter(!is.na(oxygen_saturation_percent)) %>%
filter(!is.na(age_at_visit_years)) %>%
ggplot(aes(x = age_at_visit_years, y = oxygen_saturation_percent, color=deceased_flag)) +
geom_point(alpha = .1) +
labs(title ="Age vs. Oxygen Saturation by Outcome", color = "Outcome") +
xlab("Age (years)") +
ylab("Oxygen Saturation (%)")
NA
Here’s another comparison with age.
mydata %>%
filter(los < 100, visit_type != 'Outpatient Visit', !is.na(age_at_visit_years), !is.na(gender_source_value)) %>%
ggplot(aes(x = age_at_visit_years, y = los, color=gender_source_value)) +
geom_point(alpha = .1) +
labs(title ="Age vs. Length of Stay by Gender for Non-Outpatient Visits", color = "Gender") +
xlab("Age (years)") +
ylab("Length of Stay (Days)")
NA
NA
We can also look at visit counts over time! These data come from a simluated dataset based on COVID data.
data2020 = mydata %>% filter(visit_year == 2020)
visitcounts = count(data2020, visit_start_date)
visitcounts %>%
ggplot(aes(x=visit_start_date, y=n)) +
geom_line() +
xlab("Date") +
ylab("Visit count") +
labs(title = "Daily Visit Counts (2020)")
NA
NA
Let’s categorize visits as COVID-related or non-COVID-related and plot both series.
data2020$COVID = grepl('COVID',data2020$observation_source, fixed = TRUE)
COVIDvisitcounts = count(data2020, visit_start_date, COVID)
COVIDvisitcounts$COVID_formatted = ifelse(COVIDvisitcounts$COVID == TRUE, "COVID-related", "Non-COVID")
COVIDvisitcounts %>%
ggplot(aes(x=visit_start_date, y=n, color=COVID_formatted)) +
geom_line() +
xlab("Date") +
ylab("Visit count") +
labs(title = "Daily Visit Counts (2020)", color="Category")
NA
NA
NA
NA