df <- read.csv("data.csv", stringsAsFactors = FALSE)
df <- janitor::clean_names(df) # optional
age <- df$age_at_visit_years
cat("Missing age:", sum(is.na(age)), "\n")
## Missing age: 0
print(summary(age))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 16.16 36.07 37.86 57.17 110.73
cat("Min age:", suppressWarnings(min(age, na.rm = TRUE)), "\n")
## Min age: 0.01
cat("Max age:", suppressWarnings(max(age, na.rm = TRUE)), "\n")
## Max age: 110.73
df %>%
filter(!is.na(age_at_visit_years)) %>%
ggplot(aes(x = age_at_visit_years)) +
geom_histogram(bins = 40) +
labs(
title = "Age Distribution",
x = "Age at visit (years)",
y = "Count"
) +
theme_minimal()
Pediatric: 0–18, Young Adult: 18–40, Middle Age: 40–65, Elderly: 65+
df <- df %>%
mutate(
age_group = cut(
age_at_visit_years,
breaks = c(0, 18, 40, 65, 100),
labels = c("Pediatric", "Young Adult", "Middle Age", "Elderly"),
right = FALSE,
include.lowest = TRUE
)
)
age_group_counts <- df %>%
count(age_group, sort = TRUE) %>%
mutate(pct = n / sum(n) * 100)
age_group_counts
## age_group n pct
## 1 Middle Age 46359 29.711594
## 2 Pediatric 42555 27.273601
## 3 Young Adult 42008 26.923028
## 4 Elderly 23971 15.363071
## 5 <NA> 1137 0.728706
The condition column contains multiple conditions
separated by colons (:).
df <- df %>%
mutate(
condition_list = str_split(coalesce(condition, ""), ":")
)
all_conditions <- df %>%
select(visit_occurrence_id, condition_list) %>%
unnest(condition_list) %>%
mutate(condition_item = str_trim(condition_list)) %>%
filter(condition_item != "")
condition_counts <- all_conditions %>%
count(condition_item, sort = TRUE)
head(condition_counts, 10)
## # A tibble: 10 × 2
## condition_item n
## <chr> <int>
## 1 Cough 63448
## 2 Chronic sinusitis 26783
## 3 Pneumonia 20464
## 4 Respiratory distress 19139
## 5 Dyspnea 18533
## 6 Wheezing 18533
## 7 Viral sinusitis 16232
## 8 Sore throat symptom 13166
## 9 Acute respiratory failure 9139
## 10 Acute viral pharyngitis 8347
top10 <- condition_counts %>% slice_max(n, n = 10)
ggplot(top10, aes(x = fct_reorder(condition_item, n), y = n)) +
geom_col() +
coord_flip() +
labs(title = "Top 10 Conditions", x = "Condition", y = "Count") +
theme_minimal()
Many vital signs are only recorded for COVID-suspected visits.
covid_df <- df %>%
filter(observation_source == "Suspected COVID-19")
cat("COVID-suspected visits:", nrow(covid_df), "\n")
## COVID-suspected visits: 73304
cat("Total visits:", nrow(df), "\n")
## Total visits: 156030
cat("Percentage:", round(nrow(covid_df) / nrow(df) * 100, 1), "%\n")
## Percentage: 47 %
vital_cols <- c(
"oxygen_saturation_percent",
"respiratory_rate_per_minute",
"heart_rate_bpm",
"body_temperature_c",
"systolic",
"diastolic"
)
vital_summary <- covid_df %>%
summarise(
across(
all_of(vital_cols),
list(
n = ~sum(!is.na(.)),
missing = ~sum(is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
p25 = ~quantile(., 0.25, na.rm = TRUE),
p75 = ~quantile(., 0.75, na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
),
.names = "{.col}_{.fn}"
)
)
vital_summary
## oxygen_saturation_percent_n oxygen_saturation_percent_missing
## 1 73289 15
## oxygen_saturation_percent_mean oxygen_saturation_percent_sd
## 1 82.00605 4.050161
## oxygen_saturation_percent_median oxygen_saturation_percent_p25
## 1 82 78.5
## oxygen_saturation_percent_p75 oxygen_saturation_percent_min
## 1 85.5 75
## oxygen_saturation_percent_max respiratory_rate_per_minute_n
## 1 89 73301
## respiratory_rate_per_minute_missing respiratory_rate_per_minute_mean
## 1 3 25.9982
## respiratory_rate_per_minute_sd respiratory_rate_per_minute_median
## 1 8.073901 26
## respiratory_rate_per_minute_p25 respiratory_rate_per_minute_p75
## 1 19 33
## respiratory_rate_per_minute_min respiratory_rate_per_minute_max
## 1 12 40
## heart_rate_bpm_n heart_rate_bpm_missing heart_rate_bpm_mean heart_rate_bpm_sd
## 1 73301 3 125.1293 43.41808
## heart_rate_bpm_median heart_rate_bpm_p25 heart_rate_bpm_p75
## 1 125.1 87.4 163
## heart_rate_bpm_min heart_rate_bpm_max body_temperature_c_n
## 1 50 200 73292
## body_temperature_c_missing body_temperature_c_mean body_temperature_c_sd
## 1 12 39.94862 1.300445
## body_temperature_c_median body_temperature_c_p25 body_temperature_c_p75
## 1 40 38.8 41.1
## body_temperature_c_min body_temperature_c_max systolic_n systolic_missing
## 1 37.2 42.2 73301 3
## systolic_mean systolic_sd systolic_median systolic_p25 systolic_p75
## 1 121.2707 13.34127 120 113 128
## systolic_min systolic_max diastolic_n diastolic_missing diastolic_mean
## 1 97 201 73301 3 80.39916
## diastolic_sd diastolic_median diastolic_p25 diastolic_p75 diastolic_min
## 1 6.753591 80 76 84 67
## diastolic_max
## 1 121
vitals_long <- covid_df %>%
select(all_of(vital_cols)) %>%
pivot_longer(cols = everything(), names_to = "vital", values_to = "value")
vitals_long %>%
filter(vital %in% c(
"oxygen_saturation_percent",
"respiratory_rate_per_minute",
"heart_rate_bpm",
"body_temperature_c"
)) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 40) +
facet_wrap(~vital, scales = "free", ncol = 2) +
labs(
title = "Vital Signs Distributions (COVID-Suspected)",
x = "Value",
y = "Count"
) +
theme_minimal()
Question: Are there any concerning values in the vital signs? What might explain extreme values?
# Your answer here
Count inpatient vs outpatient visits, compute percentages, and plot.
visit_counts <- df %>%
count(visit_type, sort = TRUE) %>%
mutate(pct = n / sum(n) * 100)
visit_counts
## visit_type n pct
## 1 Outpatient Visit 134954 86.49234122
## 2 Inpatient Visit 21056 13.49484074
## 3 Emergency Room Visit 20 0.01281805
ggplot(visit_counts, aes(x = fct_reorder(visit_type, n), y = n)) +
geom_col() +
coord_flip() +
labs(title = "Visit Type Distribution", x = "Visit type", y = "Count") +
theme_minimal()
outlier_flags <- covid_df %>%
transmute(
temp = body_temperature_c,
o2 = oxygen_saturation_percent,
hr = heart_rate_bpm,
rr = respiratory_rate_per_minute,
temp_extreme = !is.na(temp) & (temp < 34 | temp > 42),
o2_critical = !is.na(o2) & (o2 < 80),
o2_concerning = !is.na(o2) & (o2 < 90),
hr_extreme = !is.na(hr) & (hr > 180),
rr_extreme = !is.na(rr) & (rr > 40)
)
summary(outlier_flags)
## temp o2 hr rr temp_extreme
## Min. :37.20 Min. :75.00 Min. : 50.0 Min. :12 Mode :logical
## 1st Qu.:38.80 1st Qu.:78.50 1st Qu.: 87.4 1st Qu.:19 FALSE:70807
## Median :40.00 Median :82.00 Median :125.1 Median :26 TRUE :2497
## Mean :39.95 Mean :82.01 Mean :125.1 Mean :26
## 3rd Qu.:41.10 3rd Qu.:85.50 3rd Qu.:163.0 3rd Qu.:33
## Max. :42.20 Max. :89.00 Max. :200.0 Max. :40
## NA's :12 NA's :15 NA's :3 NA's :3
## o2_critical o2_concerning hr_extreme rr_extreme
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:47500 FALSE:15 FALSE:63462 FALSE:73304
## TRUE :25804 TRUE :73289 TRUE :9842
##
##
##
##
numeric_cols <- c(
"age_at_visit_years",
"oxygen_saturation_percent",
"respiratory_rate_per_minute",
"heart_rate_bpm",
"body_temperature_c",
"systolic",
"diastolic"
)
corr_df <- covid_df %>%
select(all_of(numeric_cols)) %>%
mutate(across(everything(), as.numeric))
corr_mat <- cor(corr_df, use = "pairwise.complete.obs")
corr_long <- reshape2::melt(corr_mat)
ggplot(corr_long, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(limits = c(-1, 1)) +
coord_equal() +
labs(title = "Correlation Heatmap (COVID-Suspected)", x = "", y = "", fill = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Question: What correlations do you observe? Any surprising or concerning?
# Your answer here
covid_df %>%
filter(!is.na(body_temperature_c), !is.na(oxygen_saturation_percent)) %>%
ggplot(aes(x = body_temperature_c, y = oxygen_saturation_percent, color = deceased)) +
geom_point(alpha = 0.4) +
labs(
title = "Temperature vs Oxygen Saturation (COVID-Suspected)",
x = "Body temperature (C)",
y = "Oxygen saturation (%)",
color = "Deceased"
) +
theme_minimal()
condition_count (colons + 1)df <- df %>%
mutate(
condition_count = if_else(
is.na(condition) | str_trim(condition) == "",
0L,
str_count(condition, ":") + 1L
)
)
df %>% count(condition_count, sort = FALSE)
## condition_count n
## 1 1 116807
## 2 2 13792
## 3 3 11911
## 4 4 5933
## 5 5 3697
## 6 6 2566
## 7 7 597
## 8 8 344
## 9 9 9
## 10 10 216
## 11 12 117
## 12 14 24
## 13 15 3
## 14 16 5
## 15 18 1
## 16 21 1
## 17 28 3
## 18 36 2
## 19 42 2
covid_df <- covid_df %>%
mutate(
condition_count = if_else(
is.na(condition) | str_trim(condition) == "",
0L,
str_count(condition, ":") + 1L
),
high_condition_count = condition_count >= 3
)
mean_vitals_by_group <- covid_df %>%
group_by(high_condition_count) %>%
summarise(across(all_of(vital_cols), ~mean(., na.rm = TRUE)), .groups = "drop")
mean_vitals_by_group
## # A tibble: 2 × 7
## high_condition_count oxygen_saturation_percent respiratory_rate_per_minute
## <lgl> <dbl> <dbl>
## 1 FALSE 82.0 26.0
## 2 TRUE 82.0 26.0
## # ℹ 4 more variables: heart_rate_bpm <dbl>, body_temperature_c <dbl>,
## # systolic <dbl>, diastolic <dbl>
Box plots showing oxygen saturation by visit type.
covid_df %>%
filter(!is.na(oxygen_saturation_percent), !is.na(visit_type)) %>%
ggplot(aes(x = visit_type, y = oxygen_saturation_percent)) +
geom_boxplot(outlier.alpha = 0.2) +
coord_flip() +
labs(
title = "Oxygen Saturation by Visit Type (COVID-Suspected)",
x = "Visit type",
y = "Oxygen saturation (%)"
) +
theme_minimal()
vitals_deceased_summary <- covid_df %>%
group_by(deceased) %>%
summarise(
across(
all_of(vital_cols),
list(mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE)),
.names = "{.col}_{.fn}"
),
.groups = "drop"
)
vitals_deceased_summary
## # A tibble: 2 × 13
## deceased oxygen_saturation_per…¹ oxygen_saturation_pe…² respiratory_rate_per…³
## <chr> <dbl> <dbl> <dbl>
## 1 N 82.0 82 26.0
## 2 Y 82.0 81.9 26.0
## # ℹ abbreviated names: ¹oxygen_saturation_percent_mean,
## # ²oxygen_saturation_percent_median, ³respiratory_rate_per_minute_mean
## # ℹ 9 more variables: respiratory_rate_per_minute_median <dbl>,
## # heart_rate_bpm_mean <dbl>, heart_rate_bpm_median <dbl>,
## # body_temperature_c_mean <dbl>, body_temperature_c_median <dbl>,
## # systolic_mean <dbl>, systolic_median <dbl>, diastolic_mean <dbl>,
## # diastolic_median <dbl>
covid_df %>%
filter(!is.na(oxygen_saturation_percent), !is.na(deceased)) %>%
ggplot(aes(x = deceased, y = oxygen_saturation_percent)) +
geom_boxplot(outlier.alpha = 0.2) +
labs(
title = "Oxygen Saturation by Deceased Status (COVID-Suspected)",
x = "Deceased (Y/N)",
y = "Oxygen saturation (%)"
) +
theme_minimal()
Convert the date columns to Date and extract year/month.
df <- df %>%
mutate(
visit_start_date = ymd(visit_start_date),
visit_end_date = ymd(visit_end_date),
visit_year = year(visit_start_date),
visit_month = month(visit_start_date, label = TRUE, abbr = TRUE)
)
visits_by_year <- df %>% count(visit_year, sort = TRUE)
visits_by_year
## visit_year n
## 1 2020 86706
## 2 2019 16934
## 3 2018 1239
## 4 2016 1215
## 5 2017 1185
## 6 2011 1152
## 7 2012 1152
## 8 2006 1148
## 9 2015 1145
## 10 2010 1139
## 11 2013 1136
## 12 2009 1135
## 13 2005 1126
## 14 2007 1115
## 15 2014 1094
## 16 2008 1068
## 17 2004 1067
## 18 2003 1043
## 19 2002 1013
## 20 1999 980
## 21 2001 963
## 22 1997 918
## 23 1998 906
## 24 2000 896
## 25 1994 880
## 26 1995 860
## 27 1993 858
## 28 1996 856
## 29 1992 838
## 30 1991 820
## 31 1990 777
## 32 1987 749
## 33 1989 749
## 34 1985 725
## 35 1986 717
## 36 1984 700
## 37 1982 695
## 38 1988 669
## 39 1981 657
## 40 1983 656
## 41 1979 610
## 42 1977 603
## 43 1980 602
## 44 1978 592
## 45 1975 578
## 46 1976 576
## 47 1973 538
## 48 1972 536
## 49 1974 535
## 50 1971 496
## 51 1970 484
## 52 1968 483
## 53 1969 481
## 54 1967 471
## 55 1965 430
## 56 1966 427
## 57 1964 421
## 58 1963 395
## 59 1962 370
## 60 1960 337
## 61 1961 336
## 62 1959 329
## 63 1958 321
## 64 1957 310
## 65 1956 295
## 66 1955 284
## 67 1954 265
## 68 1953 228
## 69 1949 225
## 70 1950 219
## 71 1951 219
## 72 1952 212
## 73 1945 185
## 74 1948 184
## 75 1947 181
## 76 1946 179
## 77 1943 170
## 78 1944 141
## 79 1940 136
## 80 1942 127
## 81 1938 115
## 82 1941 114
## 83 1936 111
## 84 1939 106
## 85 1937 100
## 86 1934 98
## 87 1935 98
## 88 1930 88
## 89 1933 79
## 90 1932 72
## 91 1928 66
## 92 1927 65
## 93 1931 65
## 94 1923 61
## 95 1922 56
## 96 1925 56
## 97 1924 55
## 98 1929 52
## 99 1919 48
## 100 1921 47
## 101 1918 46
## 102 1920 46
## 103 1926 46
## 104 1917 37
## 105 1915 33
## 106 1916 24
## 107 1914 20
## 108 1913 14
## 109 1912 11
## 110 1911 7
## 111 1909 1
## 112 1910 1
df %>%
filter(visit_year == 2020) %>%
count(visit_month) %>%
ggplot(aes(x = visit_month, y = n)) +
geom_col() +
labs(title = "Visits by Month (2020)", x = "Month", y = "Count") +
theme_minimal()
# What patterns do you observe?
df <- df %>%
mutate(length_of_stay = as.numeric(difftime(visit_end_date, visit_start_date, units = "days")))
summary(df$length_of_stay)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 61.74 0.00 38325.00
inpatient_df <- df %>% filter(str_detect(visit_type, "Inpatient"))
summary(inpatient_df$length_of_stay)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 11.00 14.00 28.32 18.00 21849.00
inpatient_df %>%
filter(!is.na(length_of_stay)) %>%
ggplot(aes(x = length_of_stay)) +
geom_histogram(bins = 40) +
labs(title = "Length of Stay (Inpatient)", x = "Days", y = "Count") +
theme_minimal()
visits_per_patient <- df %>%
count(person_id, name = "n_visits") %>%
arrange(desc(n_visits))
head(visits_per_patient, 10)
## person_id n_visits
## 1 26705 19
## 2 96990 17
## 3 65183 16
## 4 65946 16
## 5 89696 16
## 6 92962 16
## 7 5952 15
## 8 6160 15
## 9 15283 15
## 10 19844 15
pct_multiple <- mean(visits_per_patient$n_visits > 1) * 100
cat("Percent of patients with multiple visits:", round(pct_multiple, 2), "%\n")
## Percent of patients with multiple visits: 41.84 %
top_patient <- visits_per_patient$person_id[1]
df %>%
filter(person_id == top_patient) %>%
arrange(visit_start_date) %>%
select(person_id, visit_start_date, visit_end_date, visit_type, deceased, observation_source, condition) %>%
head(50)
## person_id visit_start_date visit_end_date visit_type deceased
## 1 26705 1917-08-09 1917-08-09 Outpatient Visit N
## 2 26705 1919-08-18 1919-08-18 Outpatient Visit N
## 3 26705 1924-11-12 1924-11-12 Outpatient Visit N
## 4 26705 1926-10-24 1926-10-24 Outpatient Visit N
## 5 26705 1930-02-01 1930-02-01 Outpatient Visit N
## 6 26705 1934-06-09 1934-06-09 Outpatient Visit N
## 7 26705 1937-09-14 1937-09-14 Outpatient Visit N
## 8 26705 1947-04-17 1947-04-17 Outpatient Visit N
## 9 26705 1948-03-25 1948-03-25 Outpatient Visit N
## 10 26705 1950-04-05 1950-04-05 Outpatient Visit N
## 11 26705 1967-03-05 1967-03-05 Outpatient Visit N
## 12 26705 1968-02-19 1968-02-19 Outpatient Visit N
## 13 26705 1978-08-10 1978-08-10 Outpatient Visit N
## 14 26705 1981-07-14 1981-07-14 Outpatient Visit N
## 15 26705 1994-10-31 1994-10-31 Outpatient Visit N
## 16 26705 1995-06-24 1995-06-24 Outpatient Visit N
## 17 26705 2012-06-17 2012-06-17 Outpatient Visit N
## 18 26705 2020-03-03 2020-03-03 Outpatient Visit N
## 19 26705 2020-05-18 2020-05-18 Outpatient Visit N
## observation_source condition
## 1 Viral sinusitis
## 2 Viral sinusitis
## 3 Viral sinusitis
## 4 Viral sinusitis
## 5 Viral sinusitis
## 6 Viral sinusitis
## 7 Viral sinusitis
## 8 Viral sinusitis
## 9 Viral sinusitis
## 10 Viral sinusitis
## 11 Viral sinusitis
## 12 Viral sinusitis
## 13 Viral sinusitis
## 14 Viral sinusitis
## 15 Viral sinusitis
## 16 Viral sinusitis
## 17 Viral sinusitis
## 18 Suspected COVID-19 Wheezing:Dyspnea:Cough
## 19 Viral sinusitis
df <- df %>%
mutate(
flu_last_administered = ymd(flu_last_administered),
days_since_flu_vaccine = as.numeric(difftime(visit_start_date, flu_last_administered, units = "days"))
)
summary(df$days_since_flu_vaccine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 110.0 156.0 185.7 247.0 486.0
pct_vax_past_year <- df %>%
filter(!is.na(days_since_flu_vaccine)) %>%
summarise(pct = mean(days_since_flu_vaccine <= 365, na.rm = TRUE) * 100) %>%
pull(pct)
cat("Percent vaccinated within past year:", round(pct_vax_past_year, 2), "%\n")
## Percent vaccinated within past year: 91.56 %
overall_mortality_rate <- mean(df$deceased == "Y", na.rm = TRUE) * 100
cat("Overall mortality rate:", round(overall_mortality_rate, 2), "%\n")
## Overall mortality rate: 10.53 %
mortality_by_visit <- df %>%
filter(!is.na(visit_type), !is.na(deceased)) %>%
tabyl(visit_type, deceased) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 2)
mortality_by_visit
## visit_type N Y
## Emergency Room Visit 100.00% 0.00%
## Inpatient Visit 69.74% 30.26%
## Outpatient Visit 92.55% 7.45%
if (!"age_group" %in% names(df)) {
df <- df %>%
mutate(
age_group = cut(
age_at_visit_years,
breaks = c(0, 18, 40, 65, 100),
labels = c("Pediatric", "Young Adult", "Middle Age", "Elderly"),
right = FALSE,
include.lowest = TRUE
)
)
}
mortality_by_age <- df %>%
filter(!is.na(age_group), !is.na(deceased)) %>%
tabyl(age_group, deceased) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 2)
mortality_by_age
## age_group N Y
## Pediatric 98.13% 1.87%
## Young Adult 91.82% 8.18%
## Middle Age 84.33% 15.67%
## Elderly 80.97% 19.03%
visit_by_age <- df %>%
filter(!is.na(age_group), !is.na(visit_type)) %>%
tabyl(age_group, visit_type) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 2)
visit_by_age
## age_group Emergency Room Visit Inpatient Visit Outpatient Visit
## Pediatric 0.02% 2.39% 97.58%
## Young Adult 0.01% 10.28% 89.70%
## Middle Age 0.01% 18.44% 81.55%
## Elderly 0.00% 28.29% 71.70%
Compare average age, inpatient rate, mortality rate.
df <- df %>%
mutate(
is_covid_suspected = observation_source == "Suspected COVID-19",
is_inpatient = str_detect(visit_type, "Inpatient")
)
covid_summary <- df %>%
group_by(is_covid_suspected) %>%
summarise(
avg_age = mean(age_at_visit_years, na.rm = TRUE),
mortality_rate = mean(deceased == "Y", na.rm = TRUE) * 100,
inpatient_rate = mean(is_inpatient, na.rm = TRUE) * 100,
n_visits = n(),
.groups = "drop"
)
covid_summary
## # A tibble: 2 × 5
## is_covid_suspected avg_age mortality_rate inpatient_rate n_visits
## <lgl> <dbl> <dbl> <dbl> <int>
## 1 FALSE 34.6 0.441 5.54 82726
## 2 TRUE 41.5 21.9 22.5 73304
Test difference in oxygen saturation between deceased and non-deceased (COVID-suspected).
deceased_o2 <- covid_df %>% filter(deceased == "Y") %>% pull(oxygen_saturation_percent) %>% na.omit()
survived_o2 <- covid_df %>% filter(deceased == "N") %>% pull(oxygen_saturation_percent) %>% na.omit()
cat("Deceased - Mean:", round(mean(deceased_o2), 2),
"Median:", round(median(deceased_o2), 2),
"n:", length(deceased_o2), "\n")
## Deceased - Mean: 81.97 Median: 81.9 n: 16061
cat("Survived - Mean:", round(mean(survived_o2), 2),
"Median:", round(median(survived_o2), 2),
"n:", length(survived_o2), "\n")
## Survived - Mean: 82.02 Median: 82 n: 57228
# Wilcoxon rank-sum test (equivalent to Mann–Whitney U)
mw_test <- wilcox.test(deceased_o2, survived_o2, alternative = "two.sided")
mw_test
##
## Wilcoxon rank sum test with continuity correction
##
## data: deceased_o2 and survived_o2
## W = 456683836, p-value = 0.2232
## alternative hypothesis: true location shift is not equal to 0
# Interpretation: if p-value < 0.05, conclude a statistically significant difference.
Identify conditions most associated with inpatient admission.
cond_rates <- df %>%
mutate(
condition_item = str_split(coalesce(condition, ""), ":"),
is_inpatient = str_detect(visit_type, "Inpatient")
) %>%
select(visit_occurrence_id, condition_item, is_inpatient) %>%
unnest(condition_item) %>%
mutate(condition_item = str_trim(condition_item)) %>%
filter(condition_item != "") %>%
group_by(condition_item) %>%
summarise(
inpatient_rate = mean(is_inpatient, na.rm = TRUE) * 100,
total_visits = n(),
.groups = "drop"
) %>%
filter(total_visits >= 100) %>%
arrange(desc(inpatient_rate))
head(cond_rates, 10)
## # A tibble: 10 × 3
## condition_item inpatient_rate total_visits
## <chr> <dbl> <int>
## 1 Acute respiratory distress syndrome 100 2454
## 2 Pneumonia 100 20464
## 3 Respiratory distress 100 19139
## 4 Acute respiratory failure 99.9 9139
## 5 Hemoptysis 50.2 904
## 6 Dyspnea 36.5 18533
## 7 Wheezing 36.5 18533
## 8 Cough 20.0 63448
## 9 Sore throat symptom 18.4 13166
## 10 Nasal congestion 14.0 4324