python3 -m venv venv
source venv/bin/activate
pip install jupyter seaborn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

#  Setup 
sns.set(style="whitegrid", context="talk")
plt.rcParams["figure.figsize"] = (10, 6)

# Initialize random number generator for reproducibility, Panda's utilizes NumPy module not "random"
np.random.seed(42)

# Load data
df = pd.read_csv("final_with_deceased.csv")

# Perform basic exploratory data analysis
# df.head(n)  # top n rows, n defaults to 5
# df.tail(n)  # last n rows
# df.sample(5) # sample x rows
df

print("Dataframe shape:",df.shape)
print(df.info())

Dataframe shape: (156030, 27)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156030 entries, 0 to 156029
Data columns (total 27 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   person_id                    156030 non-null  int64  
 1   birth_datetime               156030 non-null  object 
 2   race_source_value            156030 non-null  object 
 3   ethnicity_source_value       156030 non-null  object 
 4   gender_source_value          156030 non-null  object 
 5   visit_occurrence_id          156030 non-null  int64  
 6   visit_start_date             156030 non-null  object 
 7   visit_end_date               156030 non-null  object 
 8   visit_type                   156030 non-null  object 
 9   condition                    156030 non-null  object 
 10  observation_source           78197 non-null   object 
 11  age_at_visit_years           156030 non-null  float64
 12  measurement_Date             88854 non-null   object 
 13  body_height_cm               3056 non-null    float64
 14  bmi                          2968 non-null    float64
 15  body_temperature_c           85945 non-null   float64
 16  body_weight_kg               79735 non-null   float64
 17  systolic                     79745 non-null   float64
 18  diastolic                    79745 non-null   float64
 19  heart_rate_bpm               79735 non-null   float64
 20  oxygen_saturation_percent    76809 non-null   float64
 21  respiratory_rate_per_minute  79735 non-null   float64
 22  flu_last_administered        156030 non-null  object 
 23  tdap_last_administered       124884 non-null  object 
 24  mmr_last_administered        154223 non-null  object 
 25  polio_last_administered      155802 non-null  object 
 26  deceased                     156030 non-null  object 
dtypes: float64(10), int64(2), object(15)
memory usage: 32.1+ MB
None

for c in ["visit_start_date", "visit_end_date", "birth_datetime", "measurement_Date","flu_last_administered","tdap_last_administered","mmr_last_administered","polio_last_administered"]:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")

n_rows = len(df)

missing_table = (   # create a new dataframe
    df.isna()
      .agg(['sum', 'mean'])
      .T
      .rename(columns={'sum': 'missing_count', 'mean': 'missing_percent'})
)

missing_table['missing_percent'] = (missing_table['missing_percent'] * 100).round(2)
missing_table['non_missing_count'] = n_rows - missing_table['missing_count']
missing_table['dtype'] = df.dtypes.astype(str)

missing_table = (
    missing_table
      .reset_index(names='column')
      .sort_values(by=['missing_percent', 'column'], ascending=[False, True])
      .set_index('column')
)

missing_table

# Create a column for visit length - ignoring visit type 
los = (df["visit_end_date"] - df["visit_start_date"]).dt.days
df["length_of_stay_days"] = los.clip(lower=0)


# Modify labels for deceased column
df["deceased_flag"] = df["deceased"].map({"Y": "Deceased", "N": "Alive"}).fillna("Unknown").astype("category")

# columns for year and month
df["visit_year"] = df["visit_start_date"].dt.year
df["visit_month"] = df["visit_start_date"].dt.to_period("M").astype(str)

df['gender_source_value'] = df['gender_source_value'].astype('category')
df['race_source_value'] = df['race_source_value'].astype('category')
df['ethnicity_source_value'] = df['ethnicity_source_value'].astype('category')

df.sample(5)

df.describe()             # for numeric columns

df.describe(include=['object','category'])  # for categorical columns

# Check for duplicates
df.duplicated().sum()

np.int64(0)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156030 entries, 0 to 156029
Data columns (total 31 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   person_id                    156030 non-null  int64         
 1   birth_datetime               156030 non-null  datetime64[ns]
 2   race_source_value            156030 non-null  category      
 3   ethnicity_source_value       156030 non-null  category      
 4   gender_source_value          156030 non-null  category      
 5   visit_occurrence_id          156030 non-null  int64         
 6   visit_start_date             156030 non-null  datetime64[ns]
 7   visit_end_date               156030 non-null  datetime64[ns]
 8   visit_type                   156030 non-null  object        
 9   condition                    156030 non-null  object        
 10  observation_source           78197 non-null   object        
 11  age_at_visit_years           156030 non-null  float64       
 12  measurement_Date             88854 non-null   datetime64[ns]
 13  body_height_cm               3056 non-null    float64       
 14  bmi                          2968 non-null    float64       
 15  body_temperature_c           85945 non-null   float64       
 16  body_weight_kg               79735 non-null   float64       
 17  systolic                     79745 non-null   float64       
 18  diastolic                    79745 non-null   float64       
 19  heart_rate_bpm               79735 non-null   float64       
 20  oxygen_saturation_percent    76809 non-null   float64       
 21  respiratory_rate_per_minute  79735 non-null   float64       
 22  flu_last_administered        156030 non-null  datetime64[ns]
 23  tdap_last_administered       124884 non-null  datetime64[ns]
 24  mmr_last_administered        154223 non-null  datetime64[ns]
 25  polio_last_administered      155802 non-null  datetime64[ns]
 26  deceased                     156030 non-null  object        
 27  length_of_stay_days          156030 non-null  int64         
 28  deceased_flag                156030 non-null  category      
 29  visit_year                   156030 non-null  int32         
 30  visit_month                  156030 non-null  object        
dtypes: category(4), datetime64[ns](8), float64(10), int32(1), int64(3), object(5)
memory usage: 32.1+ MB
None

# Produce quick categorical counts
print(df['gender_source_value'].value_counts(dropna=False), end="\n\n")
print(df['ethnicity_source_value'].value_counts(dropna=False), end="\n\n")
print(df['race_source_value'].value_counts(dropna=False), end="\n\n")
print(df['visit_type'].value_counts(dropna=False), end="\n\n")

gender_source_value
M    78691
F    77339
Name: count, dtype: int64

ethnicity_source_value
nonhispanic    138838
hispanic        17192
Name: count, dtype: int64

race_source_value
white     131136
black      13118
asian      10813
native       805
other        158
Name: count, dtype: int64

visit_type
Outpatient Visit        134954
Inpatient Visit          21056
Emergency Room Visit        20
Name: count, dtype: int64

df['condition']

0           Dyspnea:Pneumonia:Respiratory distress:Wheezing
1                                           Viral sinusitis
2                      Sore throat symptom:Dyspnea:Wheezing
3                               Perennial allergic rhinitis
4                                                     Cough
                                ...                        
156025                                                Cough
156026                                      Viral sinusitis
156027    Acute respiratory failure:Pneumonia:Respirator...
156028                                      Viral sinusitis
156029    Wheezing:Cough:Respiratory distress:Pneumonia:...
Name: condition, Length: 156030, dtype: object

import re

# robust split on ":" allowing extra spaces; keep NaN if empty
def split_conditions(s):
    if pd.isna(s) or str(s).strip() == "":
        return []
    # split on ":" with optional surrounding spaces
    parts = re.split(r"\s*:\s*", str(s))
    # normalize: strip, drop empties, lower (or title-case if you prefer)
    parts = [p.strip() for p in parts if p and p.strip()]
    return parts

# apply once to create a list-typed column
df["condition_list"] = df["condition"].map(split_conditions)

cond_long = (
    df[["visit_occurrence_id", "person_id", "visit_start_date"]]
      .assign(condition_item=df["condition_list"])
      .explode("condition_item", ignore_index=True)
)

# drop rows where no condition exists after cleaning
cond_long = cond_long.dropna(subset=["condition_item"])

# (optional) dedupe within visit in case the same condition appears twice
cond_long = cond_long.drop_duplicates(subset=["visit_occurrence_id", "condition_item"])
cond_long

top_conditions = (
    cond_long["condition_item"]
    .value_counts()
    .head(10)
)

top_conditions

condition_item
Cough                        61710
Chronic sinusitis            26739
Pneumonia                    19502
Respiratory distress         18177
Dyspnea                      17939
Wheezing                     17939
Viral sinusitis              16103
Sore throat symptom          12782
Acute respiratory failure     8714
Acute viral pharyngitis       8292
Name: count, dtype: int64

ax = sns.histplot(data=df, x="age_at_visit_years", bins=30, kde=True)
ax.set_title("Distribution of Age at Visit (years)")
ax.set_xlabel("Age (years)")
plt.show()

# Slightly alternate syntax for setting the title and x-axis label
sns.histplot(data=df, x="age_at_visit_years", bins=30, kde=True).set(title="Distribution of Age at Visit (years)", xlabel = "Age(years)")
plt.show()

# Code to save the image to a png file
def save_show(fig, filename):
    fig.tight_layout()
    fig.savefig(filename, dpi=150, bbox_inches="tight")

fig = plt.figure()
sns.histplot(data=df, x="age_at_visit_years", bins=30, kde=True).set(title="Distribution of Age at Visit (years)")
save_show(fig,"dist_age_at_visit.png")
plt.show()

# BMI Distribution as a box plot
sns.boxplot(data=df, y="bmi")
sns.stripplot(data=df, y="bmi", size=3, alpha=0.4, color="0.3")  # adds jittering to see values
plt.title("BMI Distribution")
plt.ylabel("BMI")
plt.show()

# Distribution of the length of stays
sns.histplot(data=df, x="length_of_stay_days", bins=30, kde=True)
plt.title("Distribution of Length of Stay (Days)")
plt.xlabel("Days")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

# Let's see what data could be causing this.
df[df["length_of_stay_days"] > 100]

# For outpatient visits, assume this is a data issue and the length should be 0
is_outpatient = df['visit_type'].astype(str).str.contains('outpatient', case=False, na=False)

# align dates, then recompute LOS as zero
df.loc[is_outpatient, 'visit_end_date'] = df.loc[is_outpatient, 'visit_start_date']
df.loc[is_outpatient, 'length_of_stay_days'] = 0

sns.histplot(data=df, x="length_of_stay_days", bins=30, kde=True)
plt.title("Distribution of Length of Stay (Days)")
plt.xlabel("Days")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

df[df["length_of_stay_days"] > 100]

# remove any records where length_of_stay_days > 100
df = df[df["length_of_stay_days"] <= 100].copy()

sns.histplot(data=df, x="length_of_stay_days", bins=50, kde=True)
plt.title("Distribution of Length of Stay (Days)")
plt.xlabel("Days")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

# filter to non-outpatient visits
non_outpatient = df[~df["visit_type"].astype(str).str.contains("outpatient", case=False, na=False)]

# basic distribution plot
plt.figure(figsize=(10, 6))
sns.histplot(
    data=non_outpatient,
    x="length_of_stay_days",
    bins=45,
    kde=True,
    color="steelblue"
)
plt.title("Distribution of Length of Stay (Inpatient/Emergency Room Visits)")
plt.xlabel("Length of Stay (days)")
plt.ylabel("Number of Visits")
plt.tight_layout()
plt.show()

# Now, let's look a blood pressure

# Melt systolic/diastolic into long form
bp_long = df.melt(
    id_vars=["gender_source_value"],
    value_vars=["systolic", "diastolic"],
    var_name="Blood Pressure Type",
    value_name="Value"
)

sns.boxplot(
    data=bp_long,
    x="gender_source_value",
    y="Value",
    hue="Blood Pressure Type",
    palette="Set2"
)

plt.title("Blood Pressure Distribution by Gender")
plt.xlabel("Gender")
plt.ylabel("Blood Pressure (mmHg)")
plt.legend(title="Type")

<matplotlib.legend.Legend at 0x110894910>

plt.figure(figsize=(10, 6))
sns.barplot(
    data=bp_long,
    x="gender_source_value",
    y="Value",
    hue="Blood Pressure Type",
    errorbar=('ci', 95),
    palette="coolwarm"
)
plt.title("Mean Blood Pressure by Gender (with 95% CI)")
plt.xlabel("Gender")
plt.ylabel("Blood Pressure (mmHg)")
plt.tight_layout()
plt.show()

g_raw = df["gender_source_value"].astype(str).str.strip().str.lower()
df["gender_clean"] = np.where(
    g_raw.str.startswith("m"), "Male",
    np.where(g_raw.str.startswith("f"), "Female", "Other")
)

# long (tidy) format
bp_long = df.melt(
    id_vars=["gender_clean"],
    value_vars=["systolic", "diastolic"],
    var_name="Blood Pressure Type",
    value_name="Value"
).dropna(subset=["Value"])

# keep only Male/Female as requested
bp_long = bp_long[bp_long["gender_clean"].isin(["Male", "Female"])]

# --- Plot: histograms by gender, split (facet) by BP type ---
palette = {"Male": "#1f77b4", "Female": "#ff69b4"}  # blue / pink

# choose sensible binning; you can tweak the range as needed
bins = np.arange(40, 221, 5)  # 5 mmHg bins from 40 to 220

g = sns.displot(
    data=bp_long,
    x="Value",
    hue="gender_clean",
    col="Blood Pressure Type",
    kind="hist",
    bins=bins,
    element="step",           # overlaid outlines for clarity
    common_bins=True,         # same bins for both facets
    multiple="layer",         # overlay, not stacked
    palette=palette,
    height=5,
    aspect=1.2
)

g.set_axis_labels("Blood Pressure (mmHg)", "Count")
g.set_titles("{col_name}")
plt.tight_layout()
plt.show()

# Comparison Bar Chart
top = (cond_long["condition_item"]
       .dropna()
       .value_counts()
       .head(10)
       .sort_values(ascending=True))  

print(top)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=top.values, y=top.index, orient="h")
ax.set_title("Top 10 Conditions")
ax.set_xlabel("Number of occurrences")
ax.set_ylabel("Condition")

# nice value labels at the end of each bar
for i, v in enumerate(top.values):
    ax.text(v, i, f"{v:,}", va="center", ha="left", fontsize=9)

plt.tight_layout()
plt.show()

condition_item
Acute viral pharyngitis       8292
Acute respiratory failure     8714
Sore throat symptom          12782
Viral sinusitis              16103
Dyspnea                      17939
Wheezing                     17939
Respiratory distress         18177
Pneumonia                    19502
Chronic sinusitis            26739
Cough                        61710
Name: count, dtype: int64

# Comparison of visit length by gender (and showing distribution)
inpatient = df[df["visit_type"].astype(str).str.contains("inpatient", case=False, na=False)]

plt.figure(figsize=(8, 6))
palette = {"Male": "#1f77b4", "Female": "#ff69b4"}
sns.boxplot(
    data=inpatient,
    x="gender_clean",
    y="length_of_stay_days",
    hue="gender_clean",          # ← add hue
    dodge=False,                 # ← keep a single box per category
    palette=palette,
    legend=False                 # ← hide redundant legend
)
plt.title("Length of Stay for Inpatient Visits by Gender")
plt.xlabel("Gender")
plt.ylabel("Length of Stay (days)")
plt.tight_layout()
plt.show()

plt.figure(figsize=(8, 6))
sns.barplot(
    data=inpatient,
    x="gender_clean",
    y="length_of_stay_days",
    hue="gender_clean",          # same trick
    dodge=False,
    errorbar=('ci', 95),
    palette=palette,
    legend=False
)
plt.title("Average Length of Stay (Inpatient) by Gender")
plt.xlabel("Gender")
plt.ylabel("Length of Stay (days)")
plt.tight_layout()
plt.show()

sns.violinplot(
    data=inpatient,
    x="gender_clean",
    y="length_of_stay_days",
    palette=palette,
    hue="gender_clean",
    legend=False
)

<Axes: xlabel='gender_clean', ylabel='length_of_stay_days'>

# Additional Comparisons: Condition and visit type
# Condition counts by visit type / outcome, etc.
# (assuming df has 'visit_type' and 'deceased')
cond_by_visit_type = (
    cond_long
    .merge(df[["visit_occurrence_id", "visit_type", "deceased"]], on="visit_occurrence_id", how="left")
    .groupby(["visit_type", "condition_item"])
    .size()
    .reset_index(name="count")
    .sort_values(["visit_type","count"], ascending=[True,False])
)
cond_by_visit_type.head(8)

TOP_K = 10
ranked = (cond_by_visit_type
          .assign(rank=cond_by_visit_type.groupby("visit_type")["count"]
                  .rank(method="first", ascending=False))
          .query("rank <= @TOP_K"))

# percent share within each visit type
ranked = ranked.merge(
    cond_by_visit_type.groupby("visit_type", as_index=False)["count"].sum().rename(columns={"count": "visit_total"}),
    on="visit_type", how="left"
)
ranked["pct"] = (ranked["count"] / ranked["visit_total"] * 100).round(1)

# order bars by count within each facet
ranked["condition_item"] = ranked["condition_item"].astype(str)
ranked["condition_order"] = ranked.groupby("visit_type")["count"].rank(ascending=True, method="first")
ranked = ranked.sort_values(["visit_type", "condition_order"])

# plot
g = sns.catplot(
    data=ranked,
    x="count", y="condition_item",
    col="visit_type", col_wrap=2,  # wrap if many visit types
    kind="bar", orient="h", sharex=False, height=5, aspect=1.2
)
g.set_axis_labels("Count", "Condition")
g.set_titles("{col_name}")


plt.tight_layout()
plt.show()

# Alternate view as clustered(grouped) horizontal bar
N = 15  # number of top conditions overall to show
H = max(6, N * 0.45)  # figure height that scales with N

# 1) Pick the overall top-N conditions (by total count across visit types)
overall = (cond_by_visit_type.groupby("condition_item", observed=True)["count"]
           .sum()
           .sort_values(ascending=False))
top_conditions = overall.head(N).index

# 2) Filter and order categories so the biggest end up at the bottom
plot_df = cond_by_visit_type[cond_by_visit_type["condition_item"].isin(top_conditions)].copy()

# y-order: ascending total so largest ends up at the top of the chart
y_order = (plot_df.groupby("condition_item", observed=True)["count"]
           .sum()
           .sort_values(ascending=True).index)

# (optional) consistent visit_type order by total volume
hue_order = (plot_df.groupby("visit_type", observed=True)["count"]
             .sum()
             .sort_values(ascending=False).index)

# 3) Plot clustered horizontal bars
plt.figure(figsize=(12, H))
ax = sns.barplot(
    data=plot_df,
    x="count", y="condition_item",
    hue="visit_type", hue_order=hue_order,
    order=y_order, orient="h"
)

ax.set_title(f"Top {N} Conditions by Visit Type (clustered)", pad=12)
ax.set_xlabel("Count")
ax.set_ylabel("Condition")

# Put legend outside and clean up
ax.legend(title="Visit Type", bbox_to_anchor=(1.02, 1), loc="upper left", frameon=True)
sns.despine(left=True, bottom=True)
ax.grid(axis="x", linestyle="--", alpha=0.4)
plt.tight_layout()
plt.show()

# drop missing values
df_rel = df.dropna(subset=["oxygen_saturation_percent", "respiratory_rate_per_minute"]).copy()

# base scatterplot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_rel,
    x="respiratory_rate_per_minute",
    y="oxygen_saturation_percent",
    alpha=0.2,
    color="steelblue"
)
plt.title("Oxygen Saturation vs. Respiratory Rate")
plt.xlabel("Respiratory Rate (breaths per minute)")
plt.ylabel("Oxygen Saturation (%)")
plt.tight_layout()
plt.show()

# Bins (tweak as needed)
rr_bins = np.arange(10, 50, 1)      # 1-bpm bins
spo2_bins = np.arange(70, 101, 1)  # 1% bins

plt.figure(figsize=(9, 6))
ax = sns.histplot(
    data=df_rel,
    x="respiratory_rate_per_minute",
    y="oxygen_saturation_percent",
    bins=[rr_bins, spo2_bins],
    cbar=True,                   # colorbar on the right
    cbar_kws={"label": "Count"},
    stat="count"                 # use "density" if you want probabilities
)
plt.title("Oxygen Saturation vs Respiratory Rate — Heatmap")
plt.xlabel("Respiratory Rate (breaths/min)")
plt.ylabel("Oxygen Saturation (%)")
plt.tight_layout()
plt.show()

sns.scatterplot(
    data=df.dropna(subset=["age_at_visit_years", "oxygen_saturation_percent"]),
    x="age_at_visit_years",
    y="oxygen_saturation_percent",
    hue="deceased_flag",
    alpha=0.7
)
plt.title("Age vs Oxygen Saturation by Outcome")
plt.xlabel("Age (years)")
plt.ylabel("Oxygen Saturation (%)")
plt.legend(title="Outcome", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

sns.scatterplot(
    data=non_outpatient.dropna(subset=["length_of_stay_days", "age_at_visit_years"]),
    x="age_at_visit_years",
    y="length_of_stay_days",
    hue="gender_source_value",
    alpha=0.7
)
plt.title("Age vs Length of Stay by Gender for In-patient/Emergency Visits")
plt.xlabel("Age (years)")
plt.ylabel("Length of Stay (days)")
plt.legend(title="Gender", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

def stacked_bar(df, group, hue, title, x_label=None, y_label="Count",
                normalize=False, palette="Set2"):
    """
    Build stacked bars from categorical counts (or row-wise % if normalize=True).
    """
    # NOTE: observed=False keeps current behavior (includes unused category levels).
    ct = (df.groupby([group, hue], observed=False).size()
            .reset_index(name="count")
            .pivot(index=group, columns=hue, values="count")
            .fillna(0))

    if normalize:
        ct = ct.div(ct.sum(axis=1), axis=0) * 100
        y_label = "Percent"

    group_order = ct.sum(axis=1).sort_values(ascending=False).index
    colors = sns.color_palette(palette, n_colors=ct.shape[1])

    ax = ct.loc[group_order].plot(kind="bar", stacked=True, figsize=(10, 6),
                                  color=colors, edgecolor="none")
    ax.set_title(title, pad=12)
    ax.set_xlabel(x_label or group)
    ax.set_ylabel(y_label)
    ax.legend(title=hue, bbox_to_anchor=(1.02, 1), loc="upper left", frameon=True)
    plt.tight_layout()
    plt.show()

# 1) Visit types by race (stacked counts)
stacked_bar(
    df,
    group="race_source_value",
    hue="visit_type",
    title="Visit Types by Race",
    x_label="Race",
    palette="Set2"
)

# 2) Gender composition by race (stacked counts)
stacked_bar(
    df,
    group="race_source_value",
    hue="gender_source_value",
    title="Gender Composition by Race",
    x_label="Race",
    palette="Pastel2"
)

# 3) Ethnicity composition by visit type (stacked counts)
stacked_bar(
    df,
    group="visit_type",
    hue="ethnicity_source_value",
    title="Ethnicity Composition by Visit Type",
    x_label="Visit Type",
    palette="Set3"
)

# 4) Deceased status by race (stacked counts)
stacked_bar(
    df,
    group="race_source_value",
    hue="deceased",
    title="Deceased Status by Race",
    x_label="Race",
    palette="Set1"
)

# Optional: percentage versions of any of the above
# stacked_bar(df, "race_source_value", "visit_type", "Visit Types by Race (% within race)", x_label="Race", normalize=True)
# stacked_bar(df, "race_source_value", "gender_source_value", "Gender Composition by Race (%)", x_label="Race", normalize=True)
# stacked_bar(df, "visit_type", "ethnicity_source_value", "Ethnicity Composition by Visit Type (%)", x_label="Visit Type", normalize=True)
# stacked_bar(df, "race_source_value", "deceased", "Deceased Status by Race (%)", x_label="Race", normalize=True)

visits_2020 = df[df["visit_start_date"].dt.year == 2020].copy()

# --- Count visits per day ---
daily_visits = (
    visits_2020.groupby(df["visit_start_date"].dt.date)
    .size()
    .reset_index(name="visit_count")
    .rename(columns={"visit_start_date": "date"})
)

# --- Plot time series ---
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=daily_visits,
    x="date",
    y="visit_count",
    marker="o",
    linewidth=1.5
)

plt.title("Daily Visit Counts (2020)")
plt.xlabel("Date")
plt.ylabel("Number of Visits")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

visits_2020 = df[df["visit_start_date"].dt.year == 2020].copy()

# --- COVID classification from observation_source (case-insensitive) ---
# Works even if observation_source is multivalued text
visits_2020["is_covid"] = (
    visits_2020["observation_source"]
      .astype(str)
      .str.contains("covid", case=False, na=False)
)

# --- Count visits per day by COVID status ---
daily = (
    visits_2020
      .groupby([visits_2020["visit_start_date"].dt.date, "is_covid"])
      .size()
      .reset_index(name="visit_count")
      .rename(columns={"visit_start_date": "date"})
)

# Friendly labels for plotting
daily["Category"] = daily["is_covid"].map({True: "COVID-related", False: "Non-COVID"})

# --- Plot daily counts ---
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=daily,
    x="date",
    y="visit_count",
    hue="Category",                 # hue assigned to avoid palette warning
    marker="o",
    linewidth=1.6,
    legend=True
)
plt.title("Daily Visit Counts in 2020 — COVID vs Non-COVID (by observation_source)")
plt.xlabel("Date")
plt.ylabel("Number of Visits")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- CONFIG ---
TOP_K = 10
START = "2020-02-01"
END   = "2020-04-01"  # inclusive
# If you store numeric weights (e.g., a "count" column), set this to that name; else None for simple row counts.
WEIGHT_COL = None  # e.g., "count"

# --- Pick the date column robustly ---
date_col = "visit_start_date" if "visit_start_date" in cond_long.columns else "date"
cond = cond_long.copy()
cond[date_col] = pd.to_datetime(cond[date_col], errors="coerce")

# --- Filter to the window ---
mask = (cond[date_col] >= pd.Timestamp(START)) & (cond[date_col] <= pd.Timestamp(END))
win = cond.loc[mask].dropna(subset=[date_col, "condition_item"])

# --- Determine the TOP_K conditions within the window ---
if WEIGHT_COL is None:
    top_conditions = (win["condition_item"].value_counts().head(TOP_K).index.tolist())
else:
    top_conditions = (win.groupby("condition_item")[WEIGHT_COL]
                        .sum()
                        .sort_values(ascending=False)
                        .head(TOP_K)
                        .index
                        .tolist())

win = win[win["condition_item"].isin(top_conditions)]

# --- Daily counts for the top conditions ---
if WEIGHT_COL is None:
    daily = (win
             .assign(day=win[date_col].dt.floor("D"))
             .groupby(["day", "condition_item"])
             .size()
             .rename("count")
             .reset_index())
else:
    daily = (win
             .assign(day=win[date_col].dt.floor("D"))
             .groupby(["day", "condition_item"])[WEIGHT_COL]
             .sum()
             .rename("count")
             .reset_index())

# Ensure every condition has every day in the range (fill 0s)
full_idx = pd.MultiIndex.from_product(
    [pd.date_range(START, END, freq="D"), top_conditions],
    names=["day", "condition_item"]
)
daily = (daily
         .set_index(["day", "condition_item"])
         .reindex(full_idx, fill_value=0)
         .reset_index())

# ---------- 1) Multi-line chart (Seaborn lineplot) ----------
plt.figure(figsize=(12, 7))
ax = sns.lineplot(data=daily, x="day", y="count", hue="condition_item")

ax.set(
    title=f"Top {TOP_K} Conditions by Day (Counts) • {START} to {END}",
    xlabel="Date",
    ylabel="Daily Count"
)

# Rotate and space out labels
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))   # one tick per week
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))     # e.g. "Feb 01"
plt.xticks(rotation=45, ha='right')

# Lighten gridlines for clarity
sns.despine()
ax.grid(True, linestyle="--", alpha=0.5)

# Move legend outside for readability
ax.legend(title="Condition", bbox_to_anchor=(1.02, 1), loc="upper left")

plt.tight_layout()
plt.show()

# ---------- 2) Stacked AREA (composition over time, % per day) ----------
wide = daily.pivot(index="day", columns="condition_item", values="count").sort_index()
share = wide.div(wide.sum(axis=1).replace(0, np.nan), axis=0).fillna(0) * 100

plt.figure(figsize=(12, 7))
share.plot(kind="area", stacked=True, figsize=(12, 7))
plt.title(f"Composition of Top {TOP_K} Conditions by Day (% of visits) • {START} to {END}")
plt.xlabel("Day")
plt.ylabel("Percent of visits")
plt.legend(title="Condition", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

# ---------- 3) Heatmap (condition x day) ----------
heat = wide.T  # rows=condition, cols=day
plt.figure(figsize=(12, 6))
ax = sns.heatmap(heat, cmap="mako", cbar_kws={"label": "Daily count"})
cols = heat.columns  # should be a DatetimeIndex or array-like of dates
labels = pd.Index(cols).astype(str).str[:10]   # keep 'YYYY-MM-DD'

# choose ~10–12 ticks, skipping the rest
n = len(cols)
step = max(1, n // 12)                         # show ~12 ticks (tweak as needed)
tick_idx = np.arange(0, n, step)

ax.set_xticks(tick_idx)                        # positions correspond to column indices
ax.set_xticklabels(labels[tick_idx], rotation=45, ha='right')

# optional polish
ax.set_xlabel("Day")
ax.set_ylabel("Condition")
sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()

<Figure size 1200x700 with 0 Axes>

# what conditions co-occur?
# Co-occurrence (conditions that tend to appear together per visit)

N = 10
topN = cond_long["condition_item"].value_counts().head(N).index

matrix = (
    cond_long[cond_long["condition_item"].isin(topN)]
    .assign(val=1)
    .pivot_table(index="visit_occurrence_id", columns="condition_item", values="val", fill_value=0)
)

# co-occurrence counts (symmetric)
cooc_cnt = matrix.T.dot(matrix)   # diagonal = occurrences of each condition

np.fill_diagonal(cooc_cnt.values, 0)  # hide self-counts on diagonal

mask = np.triu(np.ones_like(cooc_cnt, dtype=bool), k=1)

plt.figure(figsize=(15, 10))
ax = sns.heatmap(
    cooc_cnt,
    mask=mask,  # remove this line to make it symmetric
    cmap="mako_r",
    square=True,
    linewidths=0,
    cbar_kws={"label": "co-occurrence count"}
)
ax.set_title(f"Condition Co-occurrence (Top {len(cooc_cnt)} conditions)")
ax.set_xlabel("Condition")
ax.set_ylabel("Condition")
ax.grid(False)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Sankey (Condition → Visit Type → Deceased) with robust rendering & diagnostics

import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "notebook_connected"   # works in Jupyter classic/JLab

base = (cond_long[["visit_occurrence_id", "condition_item"]]
        .merge(df[["visit_occurrence_id", "visit_type", "deceased_flag"]],
               on="visit_occurrence_id", how="inner")
        .dropna(subset=["condition_item", "visit_type", "deceased_flag"]))

TOP_K = 12

top_conditions = base["condition_item"].value_counts().head(TOP_K).index
base = base[base["condition_item"].isin(top_conditions)]

# Build node list
conditions   = sorted(base["condition_item"].unique().tolist())
visit_types  = sorted(base["visit_type"].unique().tolist())
deceased_flg = sorted(base["deceased_flag"].unique().tolist())

nodes = conditions + visit_types + deceased_flg
node_idx = {name: i for i, name in enumerate(nodes)}

# Links: condition - visit_type
cv = (base.groupby(["condition_item", "visit_type"], observed=True)
           .size().reset_index(name="count"))
src1 = [node_idx[c] for c in cv["condition_item"]]
tgt1 = [node_idx[v] for v in cv["visit_type"]]
val1 = cv["count"].tolist()

# Links: visit_type - deceased
vd = (base.groupby(["visit_type", "deceased_flag"], observed=True)
           .size().reset_index(name="count"))
src2 = [node_idx[v] for v in vd["visit_type"]]
tgt2 = [node_idx[d] for d in vd["deceased_flag"]]
val2 = vd["count"].tolist()

fig = go.Figure(data=[go.Sankey(
    arrangement="snap",
    node=dict(
        pad=16, thickness=16, line=dict(width=0.5, color="gray"),
        label=nodes
    ),
    link=dict(
        source=src1 + src2,
        target=tgt1 + tgt2,
        value=val1 + val2
    )
)])

fig.update_layout(
    title=f"Sankey: Condition → Visit Type → Deceased (Top {TOP_K} conditions)",
    font=dict(size=12)
)
fig.show()

	missing_count	missing_percent	non_missing_count	dtype
column
bmi	153062.0	98.10	2968.0	float64
body_height_cm	152974.0	98.04	3056.0	float64
oxygen_saturation_percent	79221.0	50.77	76809.0	float64
observation_source	77833.0	49.88	78197.0	object
body_weight_kg	76295.0	48.90	79735.0	float64
heart_rate_bpm	76295.0	48.90	79735.0	float64
respiratory_rate_per_minute	76295.0	48.90	79735.0	float64
diastolic	76285.0	48.89	79745.0	float64
systolic	76285.0	48.89	79745.0	float64
body_temperature_c	70085.0	44.92	85945.0	float64
measurement_Date	67176.0	43.05	88854.0	datetime64[ns]
tdap_last_administered	31146.0	19.96	124884.0	datetime64[ns]
mmr_last_administered	1807.0	1.16	154223.0	datetime64[ns]
polio_last_administered	228.0	0.15	155802.0	datetime64[ns]
age_at_visit_years	0.0	0.00	156030.0	float64
birth_datetime	0.0	0.00	156030.0	datetime64[ns]
condition	0.0	0.00	156030.0	object
deceased	0.0	0.00	156030.0	object
ethnicity_source_value	0.0	0.00	156030.0	object
flu_last_administered	0.0	0.00	156030.0	datetime64[ns]
gender_source_value	0.0	0.00	156030.0	object
person_id	0.0	0.00	156030.0	int64
race_source_value	0.0	0.00	156030.0	object
visit_end_date	0.0	0.00	156030.0	datetime64[ns]
visit_occurrence_id	0.0	0.00	156030.0	int64
visit_start_date	0.0	0.00	156030.0	datetime64[ns]
visit_type	0.0	0.00	156030.0	object

	person_id	birth_datetime	race_source_value	ethnicity_source_value	gender_source_value	visit_occurrence_id	visit_start_date	visit_end_date	visit_type	condition	...	respiratory_rate_per_minute	flu_last_administered	tdap_last_administered	mmr_last_administered	polio_last_administered	deceased	deceased_flag	visit_year	visit_month
102970	82028	1967-08-18	white	nonhispanic	F	2067368	1984-12-11	1984-12-11	Outpatient Visit	Chronic sinusitis	...	NaN	1983-09-04	1979-08-18	1971-08-18	1971-08-18	N	Alive	1984	1984-12
108638	86604	2001-01-08	white	nonhispanic	F	2186424	2020-03-05	2020-03-05	Outpatient Visit	Cough:Sore throat symptom:Cough:Sore throat sy...	...	39.7	2019-10-28	2013-01-08	2005-01-08	2005-01-08	N	Alive	2020	2020-03
102910	81982	2008-12-20	white	nonhispanic	M	2066113	2020-03-08	2020-03-08	Outpatient Visit	Cough:Sore throat symptom	...	15.8	2019-11-28	NaT	2012-12-20	2012-12-20	N	Alive	2020	2020-03
43269	34287	1983-03-01	asian	nonhispanic	F	870705	2019-10-02	2019-10-02	Outpatient Visit	Acute viral pharyngitis	...	NaN	2018-10-07	2015-03-01	1987-03-01	1987-03-01	N	Alive	2019	2019-10
133580	106363	1953-01-15	white	nonhispanic	F	2686366	1957-05-30	1957-05-30	Outpatient Visit	Acute viral pharyngitis	...	NaN	1956-09-20	NaT	1957-01-15	1957-01-15	N	Alive	1957	1957-05

	person_id	birth_datetime	visit_occurrence_id	visit_start_date	visit_end_date	age_at_visit_years	measurement_Date	body_height_cm	bmi	body_temperature_c	...	diastolic	heart_rate_bpm	oxygen_saturation_percent	respiratory_rate_per_minute	flu_last_administered	tdap_last_administered	mmr_last_administered	polio_last_administered	length_of_stay_days	visit_year
count	156030.000000	156030	1.560300e+05	156030	156030	156030.000000	88854	3056.000000	2968.000000	85945.00000	...	79745.000000	79735.000000	76809.000000	79735.000000	156030	124884	154223	155802	156030.000000	156030.000000
mean	62077.565180	1971-12-17 22:48:31.840030784	1.566862e+06	2009-10-28 18:34:32.924437760	2009-12-29 12:17:16.046914048	37.864144	2019-08-24 19:06:15.582415872	163.141132	27.243194	39.70430	...	80.420540	123.435253	82.008590	25.559516	2009-04-26 01:44:21.134397184	2007-07-04 19:40:22.484866048	1975-06-06 06:02:43.726551808	1975-08-21 00:05:41.602803584	61.737999	2009.484920
min	1.000000	1909-06-24 00:00:00	1.000000e+00	1909-09-17 00:00:00	1909-09-17 00:00:00	0.010000	1919-05-11 00:00:00	50.600000	12.700000	36.10000	...	67.000000	50.000000	66.100000	12.000000	1908-10-06 00:00:00	1921-06-24 00:00:00	1910-07-21 00:00:00	1909-09-11 00:00:00	0.000000	1909.000000
25%	30962.000000	1953-03-09 00:00:00	7.822792e+05	2007-05-16 06:00:00	2007-10-26 00:00:00	16.160000	2020-02-28 00:00:00	159.700000	27.200000	38.50000	...	76.000000	85.100000	78.500000	18.200000	2006-10-14 00:00:00	2009-09-19 18:00:00	1956-12-04 00:00:00	1957-01-03 00:00:00	0.000000	2007.000000
50%	62111.500000	1971-06-14 00:00:00	1.566715e+06	2020-02-22 00:00:00	2020-02-24 00:00:00	36.070000	2020-03-05 00:00:00	167.700000	28.000000	39.70000	...	80.000000	122.000000	82.000000	25.500000	2019-09-13 00:00:00	2013-06-11 00:00:00	1975-01-21 00:00:00	1975-04-08 00:00:00	0.000000	2020.000000
75%	93253.500000	1994-04-29 00:00:00	2.358876e+06	2020-03-07 00:00:00	2020-03-09 00:00:00	57.170000	2020-03-10 00:00:00	176.100000	29.800000	40.90000	...	84.000000	161.500000	85.500000	32.700000	2019-11-07 00:00:00	2016-11-04 00:00:00	1997-09-01 12:00:00	1998-01-02 00:00:00	0.000000	2020.000000
max	124150.000000	2020-04-21 00:00:00	3.139398e+06	2020-05-26 00:00:00	2020-05-27 00:00:00	110.730000	2020-05-26 00:00:00	198.700000	53.300000	42.20000	...	121.000000	200.000000	89.000000	40.000000	2019-12-31 00:00:00	2020-05-25 00:00:00	2020-05-19 00:00:00	2020-05-25 00:00:00	38325.000000	2020.000000
std	35919.884529	NaN	9.067587e+05	NaN	NaN	25.103145	NaN	23.250398	4.428474	1.42849	...	6.791986	43.523960	4.050973	8.243366	NaN	NaN	NaN	NaN	995.266731	19.479078

	visit_occurrence_id	person_id	visit_start_date	condition_item
0	1	1	2020-03-11	Dyspnea
1	1	1	2020-03-11	Pneumonia
2	1	1	2020-03-11	Respiratory distress
3	1	1	2020-03-11	Wheezing
4	28	2	2020-05-07	Viral sinusitis
...	...	...	...	...
249048	3139398	124150	2020-03-07	Wheezing
249049	3139398	124150	2020-03-07	Cough
249050	3139398	124150	2020-03-07	Respiratory distress
249051	3139398	124150	2020-03-07	Pneumonia
249052	3139398	124150	2020-03-07	Dyspnea

	person_id	birth_datetime	race_source_value	ethnicity_source_value	gender_source_value	visit_occurrence_id	visit_start_date	visit_end_date	visit_type	condition	...	flu_last_administered	tdap_last_administered	mmr_last_administered	polio_last_administered	deceased	length_of_stay_days	deceased_flag	visit_year	visit_month	condition_list
26	24	1951-09-14	white	nonhispanic	F	573	2019-09-06	2020-01-03	Outpatient Visit	Acute bacterial sinusitis:Viral sinusitis:Acut...	...	2018-10-20	2013-09-14	1955-09-14	1955-09-14	N	119	Alive	2019	2019-09	[Acute bacterial sinusitis, Viral sinusitis, A...
183	128	1940-06-29	white	nonhispanic	F	3314	1946-04-28	1997-08-03	Outpatient Visit	Chronic sinusitis	...	1945-11-24	NaT	1944-06-29	1944-06-29	N	18725	Alive	1946	1946-04	[Chronic sinusitis]
469	370	1977-09-24	white	nonhispanic	F	8406	1995-04-15	2020-05-09	Outpatient Visit	Acute viral pharyngitis	...	1994-12-23	1989-09-24	1981-09-24	1981-09-24	N	9156	Alive	1995	1995-04	[Acute viral pharyngitis]
706	564	1991-04-17	white	nonhispanic	M	13558	2000-05-11	2020-03-05	Outpatient Visit	Cough:Viral sinusitis	...	1999-09-01	NaT	1995-04-17	1995-04-17	N	7238	Alive	2000	2000-05	[Cough, Viral sinusitis]
871	684	1979-05-06	black	nonhispanic	M	16996	2013-05-12	2020-01-12	Outpatient Visit	Chronic sinusitis:Acute bronchitis	...	2012-09-25	2011-05-06	1983-05-06	1983-05-06	N	2436	Alive	2013	2013-05	[Chronic sinusitis, Acute bronchitis]
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
153359	122081	1978-07-29	white	nonhispanic	F	3083995	1997-03-29	2019-06-29	Outpatient Visit	Chronic sinusitis	...	1996-09-16	1990-07-29	1982-07-29	1982-07-29	N	8127	Alive	1997	1997-03	[Chronic sinusitis]
154478	122951	1972-12-31	black	nonhispanic	M	3106400	1978-09-10	2020-05-03	Outpatient Visit	Cough:Chronic sinusitis:Cough:Chronic sinusitis	...	1977-11-26	NaT	1976-12-31	1976-12-31	N	15211	Alive	1978	1978-09	[Cough, Chronic sinusitis, Cough, Chronic sinu...
154542	123001	1935-08-18	white	nonhispanic	M	3107835	1948-04-25	2020-03-29	Outpatient Visit	Chronic sinusitis:Cough:Chronic sinusitis:Cough	...	1947-12-31	1947-08-18	1939-08-18	1939-08-18	N	26271	Alive	1948	1948-04	[Chronic sinusitis, Cough, Chronic sinusitis, ...
155094	123416	1986-09-30	black	nonhispanic	F	3118639	2013-09-17	2019-11-26	Outpatient Visit	Chronic sinusitis:Acute bronchitis	...	2012-12-25	2008-09-30	1990-09-30	1990-09-30	N	2261	Alive	2013	2013-09	[Chronic sinusitis, Acute bronchitis]
155203	123495	1978-03-25	white	nonhispanic	M	3119890	2016-09-18	2019-12-29	Outpatient Visit	Acute bronchitis	...	2015-12-01	2010-03-25	1982-03-25	1982-03-25	N	1197	Alive	2016	2016-09	[Acute bronchitis]

Clinical Data Visaulization in Python¶

Setup¶

Exploratory Data Analysis (EDA)¶

Produce Descriptive Statistics¶

Distributions¶

Comparisons¶

Relationships¶

Composition¶

Time series¶

Composition / Time Series¶

Additional Graphs¶

Co-occurences¶

Sankey¶

	person_id	birth_datetime	race_source_value	ethnicity_source_value	gender_source_value	visit_occurrence_id	visit_start_date	visit_end_date	visit_type	condition	...	systolic	diastolic	heart_rate_bpm	oxygen_saturation_percent	respiratory_rate_per_minute	flu_last_administered	tdap_last_administered	mmr_last_administered	polio_last_administered	deceased
0	1	1958-12-02	white	nonhispanic	F	1	2020-03-11	2020-04-01	Inpatient Visit	Dyspnea:Pneumonia:Respiratory distress:Wheezing	...	132.0	81.0	178.9	84.8	37.0	2019-09-11	2010-12-02	1962-12-02	1962-12-02	Y
1	2	1945-10-02	white	nonhispanic	F	28	2020-05-07	2020-05-07	Outpatient Visit	Viral sinusitis	...	NaN	NaN	NaN	NaN	NaN	2019-12-04	2017-10-02	1949-10-02	1949-10-02	N
2	3	1968-04-20	white	nonhispanic	M	188	2020-03-15	2020-03-15	Outpatient Visit	Sore throat symptom:Dyspnea:Wheezing	...	108.0	76.0	57.1	78.4	32.1	2019-11-19	2010-04-20	1972-04-20	1972-04-20	N
3	5	1988-08-09	white	nonhispanic	F	198	1992-08-15	1992-08-29	Outpatient Visit	Perennial allergic rhinitis	...	NaN	NaN	NaN	NaN	NaN	1991-10-24	NaN	1992-08-09	1992-08-09	N
4	5	1988-08-09	white	nonhispanic	F	206	2020-03-10	2020-03-10	Outpatient Visit	Cough	...	130.0	84.0	132.8	88.4	14.4	2019-10-23	2010-08-09	1992-08-09	1992-08-09	Y
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
156025	124148	2017-08-11	white	nonhispanic	M	3139386	2020-03-08	2020-03-08	Outpatient Visit	Cough	...	108.0	83.0	170.0	84.8	32.6	2019-12-27	NaN	2018-08-11	2018-02-11	N
156026	124149	1948-12-16	black	nonhispanic	F	3139391	2019-10-23	2019-10-23	Outpatient Visit	Viral sinusitis	...	NaN	NaN	NaN	NaN	NaN	2018-12-08	2010-12-16	1952-12-16	1952-12-16	N
156027	124149	1948-12-16	black	nonhispanic	F	3139397	2020-02-14	2020-03-06	Inpatient Visit	Acute respiratory failure:Pneumonia:Respirator...	...	119.0	73.0	149.8	87.5	13.6	2019-12-12	2010-12-16	1952-12-16	1952-12-16	N
156028	124149	1948-12-16	black	nonhispanic	F	3139393	2020-03-17	2020-03-17	Outpatient Visit	Viral sinusitis	...	NaN	NaN	NaN	NaN	NaN	2019-12-04	2010-12-16	1952-12-16	1952-12-16	N
156029	124150	1964-09-09	white	nonhispanic	M	3139398	2020-03-07	2020-03-26	Inpatient Visit	Wheezing:Cough:Respiratory distress:Pneumonia:...	...	122.0	82.0	64.8	80.5	12.8	2019-12-14	2016-09-09	1968-09-09	1968-09-09	N

	person_id	birth_datetime	race_source_value	ethnicity_source_value	gender_source_value	visit_occurrence_id	visit_start_date	visit_end_date	visit_type	condition	...	flu_last_administered	tdap_last_administered	mmr_last_administered	polio_last_administered	deceased	length_of_stay_days	deceased_flag	visit_year	visit_month	condition_list
1070	843	1936-07-30	white	nonhispanic	F	20871	1979-07-20	1994-05-06	Inpatient Visit	Non-small cell carcinoma of lung, TNM stage 1:...	...	1978-11-22	1978-07-30	1940-07-30	1940-07-30	Y	5404	Deceased	1979	1979-07	[Non-small cell carcinoma of lung, TNM stage 1...
2372	1876	1960-07-23	asian	nonhispanic	F	46292	2006-07-12	2018-03-04	Inpatient Visit	Emphysematous bronchitis	...	2005-09-08	2002-07-23	1964-07-23	1964-07-23	N	4253	Alive	2006	2006-07	[Emphysematous bronchitis]
10768	8484	1921-12-19	white	nonhispanic	F	213316	1967-12-09	2003-01-22	Inpatient Visit	Chronic sinusitis	...	1966-10-04	1963-12-19	1925-12-19	1925-12-19	Y	12828	Deceased	1967	1967-12	[Chronic sinusitis]
11824	9327	1955-06-18	white	nonhispanic	M	233448	2018-08-11	2019-06-02	Inpatient Visit	Viral sinusitis	...	2017-11-07	2017-06-18	1959-06-18	1959-06-18	Y	295	Deceased	2018	2018-08	[Viral sinusitis]
12297	9715	1951-12-03	white	nonhispanic	F	242597	2013-11-17	2020-03-17	Inpatient Visit	Wheezing:Dyspnea:Cough	...	2012-12-22	2003-12-03	1955-12-03	1955-12-03	N	2312	Alive	2013	2013-11	[Wheezing, Dyspnea, Cough]
12298	9715	1951-12-03	white	nonhispanic	F	242598	2019-07-18	2020-03-17	Inpatient Visit	Respiratory distress:Pneumonia	...	2018-12-09	2013-12-03	1955-12-03	1955-12-03	Y	243	Deceased	2019	2019-07	[Respiratory distress, Pneumonia]
18926	14848	1992-03-28	white	nonhispanic	F	375742	2006-12-18	2014-07-27	Inpatient Visit	Pulmonary emphysema	...	2005-09-23	2004-03-28	1996-03-28	1996-03-28	N	2778	Alive	2006	2006-12	[Pulmonary emphysema]
20758	16297	1936-08-04	white	hispanic	F	414146	1982-07-24	2020-03-20	Inpatient Visit	Acute respiratory distress syndrome:Acute resp...	...	1981-11-08	1978-08-04	1940-08-04	1940-08-04	N	13754	Alive	1982	1982-07	[Acute respiratory distress syndrome, Acute re...
20759	16297	1936-08-04	white	hispanic	F	414147	1990-04-15	2020-03-20	Inpatient Visit	Chronic sinusitis:Chronic sinusitis	...	1989-09-02	1988-08-04	1940-08-04	1940-08-04	N	10932	Alive	1990	1990-04	[Chronic sinusitis, Chronic sinusitis]
20760	16297	1936-08-04	white	hispanic	F	414144	1996-12-09	2020-03-20	Inpatient Visit	Pneumonia:Respiratory distress:Pneumonia:Acute...	...	1995-10-26	1988-08-04	1940-08-04	1940-08-04	N	8502	Alive	1996	1996-12	[Pneumonia, Respiratory distress, Pneumonia, A...
20761	16297	1936-08-04	white	hispanic	F	414148	2006-06-20	2020-03-20	Inpatient Visit	Sore throat symptom:Cough	...	2005-11-23	1998-08-04	1940-08-04	1940-08-04	Y	5022	Deceased	2006	2006-06	[Sore throat symptom, Cough]
27533	21689	1937-01-29	white	nonhispanic	F	548488	1982-01-18	1982-08-28	Inpatient Visit	Emphysematous bronchitis	...	1981-11-20	1979-01-29	1941-01-29	1941-01-29	Y	222	Deceased	1982	1982-01	[Emphysematous bronchitis]
31302	24783	1919-06-29	white	nonhispanic	F	625032	1965-06-17	2007-01-29	Inpatient Visit	Chronic sinusitis	...	1964-09-26	1961-06-29	1923-06-29	1923-06-29	N	15201	Alive	1965	1965-06	[Chronic sinusitis]
42043	33359	1947-04-02	white	nonhispanic	F	846866	1986-03-23	2008-09-11	Inpatient Visit	Chronic sinusitis	...	1985-12-31	1979-04-02	1951-04-02	1951-04-02	N	8208	Alive	1986	1986-03	[Chronic sinusitis]
44997	35718	1931-11-01	white	nonhispanic	M	905997	1991-10-17	1995-01-16	Inpatient Visit	Non-small cell lung cancer:Non-small cell carc...	...	1990-11-15	1983-11-01	1935-11-01	1935-11-01	Y	1187	Deceased	1991	1991-10	[Non-small cell lung cancer, Non-small cell ca...
51627	40997	1944-06-02	white	nonhispanic	F	1040195	1983-05-24	1984-06-09	Inpatient Visit	Pulmonary emphysema	...	1982-12-06	1976-06-02	1948-06-02	1948-06-02	N	382	Alive	1983	1983-05	[Pulmonary emphysema]
52352	41626	1913-07-02	black	nonhispanic	F	1056587	2002-03-30	2020-04-04	Inpatient Visit	Acute viral pharyngitis:Acute viral pharyngitis	...	2001-09-24	1995-07-02	1917-07-02	1917-07-02	N	6580	Alive	2002	2002-03	[Acute viral pharyngitis, Acute viral pharyngi...
54192	43130	1933-04-25	black	nonhispanic	F	1092254	2007-04-07	2020-02-20	Inpatient Visit	Viral sinusitis	...	2006-12-07	2005-04-25	1937-04-25	1937-04-25	N	4702	Alive	2007	2007-04	[Viral sinusitis]
54193	43130	1933-04-25	black	nonhispanic	F	1092257	2007-05-01	2020-02-20	Inpatient Visit	Respiratory distress:Streptococcal sore throat...	...	2006-11-15	2005-04-25	1937-04-25	1937-04-25	Y	4678	Deceased	2007	2007-05	[Respiratory distress, Streptococcal sore thro...
55949	44501	1910-02-13	white	nonhispanic	F	1125321	1987-01-25	1995-09-04	Inpatient Visit	Chronic sinusitis	...	1986-11-26	1982-02-13	1914-02-13	1914-02-13	N	3144	Alive	1987	1987-01	[Chronic sinusitis]
69089	55038	1963-08-08	white	nonhispanic	F	1386092	2004-07-28	2020-05-08	Inpatient Visit	Respiratory distress:Wheezing:Pneumonia:Dyspnea	...	2003-10-13	1995-08-08	1967-08-08	1967-08-08	Y	5763	Deceased	2004	2004-07	[Respiratory distress, Wheezing, Pneumonia, Dy...
69187	55117	1937-09-07	white	nonhispanic	F	1388448	2019-11-21	2020-04-10	Inpatient Visit	Cough	...	2018-11-25	2019-09-07	1941-09-07	1941-09-07	N	141	Alive	2019	2019-11	[Cough]
69188	55117	1937-09-07	white	nonhispanic	F	1388484	2019-12-31	2020-04-10	Inpatient Visit	Respiratory distress:Pneumonia	...	2018-11-28	2019-09-07	1941-09-07	1941-09-07	Y	101	Deceased	2019	2019-12	[Respiratory distress, Pneumonia]
69538	55393	1961-01-28	white	nonhispanic	F	1395989	2019-04-29	2020-03-29	Inpatient Visit	Acute respiratory distress syndrome	...	2018-12-22	2013-01-28	1965-01-28	1965-01-28	N	335	Alive	2019	2019-04	[Acute respiratory distress syndrome]
69539	55393	1961-01-28	white	nonhispanic	F	1395972	2019-07-13	2020-03-29	Inpatient Visit	Sore throat symptom:Cough:Sore throat symptom:...	...	2018-11-25	2013-01-28	1965-01-28	1965-01-28	N	260	Alive	2019	2019-07	[Sore throat symptom, Cough, Sore throat sympt...
69540	55393	1961-01-28	white	nonhispanic	F	1395980	2019-07-26	2020-03-29	Inpatient Visit	Respiratory distress:Pneumonia:Acute respirato...	...	2018-12-31	2013-01-28	1965-01-28	1965-01-28	Y	247	Deceased	2019	2019-07	[Respiratory distress, Pneumonia, Acute respir...
70633	56265	1948-10-04	white	nonhispanic	M	1416849	2016-09-17	2017-04-04	Inpatient Visit	Chronic sinusitis	...	2015-11-24	2010-10-04	1952-10-04	1952-10-04	Y	199	Deceased	2016	2016-09	[Chronic sinusitis]
74390	59274	1938-01-11	white	nonhispanic	M	1495128	2002-12-29	2020-03-23	Inpatient Visit	Acute respiratory distress syndrome:Acute resp...	...	2001-11-05	2000-01-11	1942-01-11	1942-01-11	N	6294	Alive	2002	2002-12	[Acute respiratory distress syndrome, Acute re...
74391	59274	1938-01-11	white	nonhispanic	M	1495145	2003-01-27	2020-03-23	Inpatient Visit	Acute viral pharyngitis	...	2002-12-05	2000-01-11	1942-01-11	1942-01-11	N	6265	Alive	2003	2003-01	[Acute viral pharyngitis]
74392	59274	1938-01-11	white	nonhispanic	M	1495143	2019-12-12	2020-03-23	Inpatient Visit	Dyspnea:Pneumonia:Acute respiratory failure:Re...	...	2018-09-30	2010-01-11	1942-01-11	1942-01-11	Y	102	Deceased	2019	2019-12	[Dyspnea, Pneumonia, Acute respiratory failure...
76677	61050	1930-04-23	white	nonhispanic	M	1540887	1961-04-29	2020-03-19	Inpatient Visit	Cough:Chronic sinusitis:Wheezing:Dyspnea:Cough...	...	1960-12-08	1952-04-23	1934-04-23	1934-04-23	N	21509	Alive	1961	1961-04	[Cough, Chronic sinusitis, Wheezing, Dyspnea, ...
80540	64138	1930-08-12	white	nonhispanic	F	1616150	2019-09-23	2020-03-31	Inpatient Visit	Acute respiratory distress syndrome	...	2018-11-04	2012-08-12	1934-08-12	1934-08-12	N	190	Alive	2019	2019-09	[Acute respiratory distress syndrome]
87473	69571	1959-05-19	white	nonhispanic	F	1746560	2010-05-06	2020-03-11	Inpatient Visit	Cough:Respiratory distress:Acute respiratory f...	...	2009-10-07	2001-05-19	1963-05-19	1963-05-19	N	3597	Alive	2010	2010-05	[Cough, Respiratory distress, Acute respirator...
88361	70248	1944-05-18	white	nonhispanic	F	1763122	1991-12-26	2017-05-19	Inpatient Visit	Chronic sinusitis	...	1990-09-11	1986-05-18	1948-05-18	1948-05-18	N	9276	Alive	1991	1991-12	[Chronic sinusitis]
92221	73336	1981-11-19	black	nonhispanic	F	1842067	1998-11-15	2020-03-14	Inpatient Visit	Wheezing:Asthma:Pneumonia:Respiratory distress...	...	1997-11-08	1993-11-19	1985-11-19	1985-11-19	Y	7790	Deceased	1998	1998-11	[Wheezing, Asthma, Pneumonia, Respiratory dist...
94229	74994	1936-09-02	white	nonhispanic	F	1885750	2004-08-16	2020-03-13	Inpatient Visit	Dyspnea:Wheezing	...	2003-10-26	1998-09-02	1940-09-02	1940-09-02	N	5688	Alive	2004	2004-08	[Dyspnea, Wheezing]
94230	74994	1936-09-02	white	nonhispanic	F	1885751	2019-11-24	2020-03-13	Inpatient Visit	Respiratory distress:Acute respiratory failure...	...	2018-12-16	2018-09-02	1940-09-02	1940-09-02	Y	110	Deceased	2019	2019-11	[Respiratory distress, Acute respiratory failu...
95880	76324	1954-07-31	white	nonhispanic	F	1919614	1995-07-22	2014-05-19	Inpatient Visit	Chronic sinusitis	...	1994-10-12	1986-07-31	1958-07-31	1958-07-31	N	6876	Alive	1995	1995-07	[Chronic sinusitis]
101074	80504	1935-08-30	white	nonhispanic	F	2028798	1965-08-22	2020-03-18	Inpatient Visit	Acute respiratory failure:Respiratory distress...	...	1964-09-16	1957-08-30	1939-08-30	1939-08-30	Y	19932	Deceased	1965	1965-08	[Acute respiratory failure, Respiratory distre...
105309	83887	1950-09-14	white	nonhispanic	F	2115890	2016-08-28	2020-03-21	Inpatient Visit	Cough:Cough:Cough:Cough:Cough	...	2015-10-16	2012-09-14	1954-09-14	1954-09-14	N	1301	Alive	2016	2016-08	[Cough, Cough, Cough, Cough, Cough]
109968	87652	1946-02-23	white	nonhispanic	F	2212189	2007-02-08	2020-03-01	Inpatient Visit	Respiratory distress:Pneumonia	...	2006-11-04	1998-02-23	1950-02-23	1950-02-23	N	4770	Alive	2007	2007-02	[Respiratory distress, Pneumonia]
109969	87652	1946-02-23	white	nonhispanic	F	2212188	2019-03-30	2020-03-01	Inpatient Visit	Dyspnea:Cough:Wheezing	...	2018-09-17	2018-02-23	1950-02-23	1950-02-23	Y	337	Deceased	2019	2019-03	[Dyspnea, Cough, Wheezing]
115753	92252	1919-10-20	white	hispanic	M	2330818	1981-10-04	1987-01-27	Inpatient Visit	Small cell carcinoma of lung:Primary small cel...	...	1980-09-25	1971-10-20	1923-10-20	1923-10-20	Y	1941	Deceased	1981	1981-10	[Small cell carcinoma of lung, Primary small c...
116748	93017	1931-11-28	black	nonhispanic	F	2351479	1957-11-21	1993-01-31	Inpatient Visit	Chronic sinusitis	...	1956-12-16	1953-11-28	1935-11-28	1935-11-28	N	12855	Alive	1957	1957-11	[Chronic sinusitis]
117611	93745	1940-12-15	white	nonhispanic	F	2370800	1991-12-03	2009-03-30	Inpatient Visit	Chronic sinusitis	...	1990-09-25	1982-12-15	1944-12-15	1944-12-15	N	6327	Alive	1991	1991-12	[Chronic sinusitis]
122136	97244	1964-10-09	white	nonhispanic	F	2459832	2008-09-29	2012-05-20	Inpatient Visit	Pulmonary emphysema	...	2007-12-31	2006-10-09	1968-10-09	1968-10-09	N	1329	Alive	2008	2008-09	[Pulmonary emphysema]
122344	97413	1949-12-28	white	nonhispanic	M	2464005	2009-12-14	2020-04-24	Inpatient Visit	Cough:Cough	...	2008-09-07	2001-12-28	1953-12-28	1953-12-28	Y	3784	Deceased	2009	2009-12	[Cough, Cough]
124208	98886	1914-09-05	asian	nonhispanic	F	2501404	2013-09-02	2020-03-08	Inpatient Visit	Acute respiratory distress syndrome:Cough:Acut...	...	2012-10-15	2006-09-05	1918-09-05	1918-09-05	N	2379	Alive	2013	2013-09	[Acute respiratory distress syndrome, Cough, A...
131297	104562	1912-04-03	white	nonhispanic	M	2644104	1978-03-18	1988-04-22	Inpatient Visit	Chronic sinusitis	...	1977-12-09	1974-04-03	1916-04-03	1916-04-03	N	3688	Alive	1978	1978-03	[Chronic sinusitis]
135825	108173	1937-11-16	white	nonhispanic	F	2729172	1971-11-09	2006-03-02	Inpatient Visit	Non-small cell carcinoma of lung, TNM stage 1:...	...	1970-09-07	1969-11-16	1941-11-16	1941-11-16	N	12532	Alive	1971	1971-11	[Non-small cell carcinoma of lung, TNM stage 1...
137323	109398	1936-07-24	white	nonhispanic	F	2758662	1998-07-09	2020-03-22	Inpatient Visit	Respiratory distress:Pneumonia:Respiratory dis...	...	1997-09-23	1988-07-24	1940-07-24	1940-07-24	N	7927	Alive	1998	1998-07	[Respiratory distress, Pneumonia, Respiratory ...
141096	112381	1920-04-15	white	nonhispanic	F	2834510	1966-04-04	1993-08-27	Inpatient Visit	Chronic sinusitis	...	1965-12-01	1962-04-15	1924-04-15	1924-04-15	Y	10007	Deceased	1966	1966-04	[Chronic sinusitis]
142080	113143	1913-01-05	white	nonhispanic	F	2852934	1953-12-27	1990-06-12	Inpatient Visit	Viral sinusitis:Viral sinusitis:Viral sinusiti...	...	1952-10-19	1945-01-05	1917-01-05	1917-01-05	N	13316	Alive	1953	1953-12	[Viral sinusitis, Viral sinusitis, Viral sinus...
147532	117484	1961-09-07	white	nonhispanic	F	2962896	2005-08-27	2014-09-26	Inpatient Visit	Small cell carcinoma of lung:Primary small cel...	...	2004-11-27	2003-09-07	1965-09-07	1965-09-07	N	3317	Alive	2005	2005-08	[Small cell carcinoma of lung, Primary small c...
147533	117484	1961-09-07	white	nonhispanic	F	2962894	2014-05-02	2014-09-26	Inpatient Visit	Acute bronchitis	...	2013-09-25	2013-09-07	1965-09-07	1965-09-07	Y	147	Deceased	2014	2014-05	[Acute bronchitis]
148117	117947	1918-07-18	white	nonhispanic	F	2976382	1959-07-09	2019-05-04	Inpatient Visit	Chronic sinusitis	...	1958-12-16	1950-07-18	1922-07-18	1922-07-18	N	21849	Alive	1959	1959-07	[Chronic sinusitis]
149292	118859	1944-06-30	white	nonhispanic	F	3002349	2019-07-13	2020-03-23	Inpatient Visit	Viral sinusitis:Viral sinusitis:Viral sinusitis	...	2018-09-18	2016-06-30	1948-06-30	1948-06-30	N	254	Alive	2019	2019-07	[Viral sinusitis, Viral sinusitis, Viral sinus...
149293	118859	1944-06-30	white	nonhispanic	F	3002347	2019-11-19	2020-03-23	Inpatient Visit	Acute respiratory failure:Wheezing:Pneumonia:D...	...	2018-12-31	2016-06-30	1948-06-30	1948-06-30	Y	125	Deceased	2019	2019-11	[Acute respiratory failure, Wheezing, Pneumoni...

	visit_type	condition_item	count
1	Emergency Room Visit	Acute respiratory failure	13
3	Emergency Room Visit	Pulmonary emphysema	3
0	Emergency Room Visit	Acute bronchitis	2
2	Emergency Room Visit	Emphysematous bronchitis	2
19	Inpatient Visit	Pneumonia	19488
22	Inpatient Visit	Respiratory distress	18163
11	Inpatient Visit	Cough	11988
7	Inpatient Visit	Acute respiratory failure	8693