Modelling Diabetes Risk: Social and Behavioural Factors in BRFSS Data

Exploratory Data Analysis

1 Setup

Load packages and helpers
library(here)
library(tidyverse)
library(arrow)
library(gtsummary)
library(janitor)

source(here("R", "utils.R"))       # miss_summary(), fmt_pval(), n_pct(), etc.
source(here("R", "plot_theme.R"))  # theme_epi(), scale_fill_epi(), save_fig()

tidyverse covers data wrangling and visualisation; arrow reads the Parquet file produced by the SQL pipeline; gtsummary generates publication-ready Table 1s and univariable regression summaries. Project-level helpers are sourced from R/.


2 Data Pipeline

The raw BRFSS 2024 data were downloaded as a fixed-width file and converted to Parquet using R/download_raw.R. The mart was built using DuckDB with four SQL scripts run in sequence: a view over the raw Parquet file, a recode and inclusion-criteria step to produce the analytical dataset, validation checks, and export to Parquet for use in R.

01 — Create view over raw Parquet file
-- 01_load_raw.sql
-- Create a view over the raw Parquet file so DuckDB can query it by name.
create or replace view brfss_raw as
select * from read_parquet('data/raw/LLCP2024.parquet');

select count(*) as n_rows from brfss_raw;
02 — Recode variables and apply inclusion criteria
-- 02_build_mart.sql
-- Select and recode all variables, then filter to respondents with a
-- non-missing diabetes outcome (DIABETE4 codes 7, 9, and 2 set to NULL).
create or replace table analytical_dataset as
with base as (
    select
        -- Survey design variables (retained for potential weighted analysis)
        _PSU as psu,
        _STSTR as strata,
        _LLCPWT as llcp_weight,
        -- Raw outcome variable
        DIABETE4 as diabetes_raw,
        -- Demographic predictors
        _AGEG5YR as age_group_raw,
        SEXVAR as sex_raw,
        _RACE as race_raw,
        -- Social determinants
        _EDUCAG as education_raw,
        _INCOMG1 as income_raw,
        PERSDOC3 as has_provider_raw,
        -- Clinical / behavioural predictors
        _BMI5CAT as bmi_cat_raw,
        _TOTINDA as phys_activity_raw,
        _SMOKER3 as smoking_raw,
        _RFDRHV9 as heavy_alcohol_raw,
        _MENT14D as mental_health_raw
    FROM brfss_raw
),

recoded as (
    select
        psu,
        strata,
        llcp_weight,
        case
            when diabetes_raw = 1 then 1
            when diabetes_raw in (3, 4) then 0
            else null -- 2, 7, 9 are various forms of missingness
        end as diabetes,
        case
            when age_group_raw in (1, 2) then '18-34'
            when age_group_raw in (3, 4) then '35-44'
            when age_group_raw in (5, 6) then '45-54'
            when age_group_raw in (7, 8) then '55-64'
            when age_group_raw in (9, 10) then '65-74'
            when age_group_raw in (11, 12, 13) then '75+'
            else null
        end as age_group,
        case
            when sex_raw = 1 then 'Male'
            when sex_raw = 2 then 'Female'
            else null
        end as sex,
        -- Race/ethnicity: CDC combines race and Hispanic ethnicity into a single
        -- computed variable. Hispanic is its own category regardless of race.
        -- NH (Non-Hispanic) prefix is dropped for clarity for non-US readers.
        case
            when race_raw = 1 then 'White'
            when race_raw = 2 then 'Black'
            when race_raw = 3 then 'American Indian/Alaskan Native'
            when race_raw = 4 then 'Asian'
            when race_raw = 5 then 'Pacific Islander'
            when race_raw = 6 then 'Other/Multiracial'
            when race_raw = 7 then 'Hispanic'
            when race_raw = 8 then 'Other'
            else null
        end as race_ethnicity,
        case
            when education_raw = 1 then 'Did not graduate high school'
            when education_raw = 2 then 'High school graduate'
            when education_raw = 3 then 'Some college'
            when education_raw = 4 then 'College graduate'
            else null
        end as education,
        case
            when income_raw = 1 then '<$15k'
            when income_raw = 2 then '$15k-$25k'
            when income_raw = 3 then '$25k-$35k'
            when income_raw = 4 then '$35k-$50k'
            when income_raw = 5 then '$50k-$100k'
            when income_raw = 6 then '>$100k'
            else null
        end as income_group,
        case
            when has_provider_raw in (1, 2) then 1
            when has_provider_raw = 3 then 0
            else null
        end as has_provider,
        case
            when bmi_cat_raw = 1 then 'Underweight'
            when bmi_cat_raw = 2 then 'Normal'
            when bmi_cat_raw = 3 then 'Overweight'
            when bmi_cat_raw = 4 then 'Obese'
            else null
        end as bmi_category,
        case
            when phys_activity_raw = 1 then 1
            when phys_activity_raw = 2 then 0
            else null
        end as physically_active,
        case
            when smoking_raw = 1 then 'Current (daily)'
            when smoking_raw = 2 then 'Current (some days)'
            when smoking_raw = 3 then 'Former'
            when smoking_raw = 4 then 'Never'
            else null
        end as smoking_status,
        case
            when heavy_alcohol_raw = 1 then 0
            when heavy_alcohol_raw = 2 then 1
            else null
        end as heavy_drinker,
        case
            when mental_health_raw = 1 then '0 days'
            when mental_health_raw = 2 then '1-13 days'
            when mental_health_raw = 3 then '14+ days'
            else null
        end as mental_health_days
    from base
)
select * from recoded
where diabetes is not null;
03 — Validation checks
-- 03_validation.sql
-- Row counts, outcome distribution, and missingness by predictor.
select count(*) from analytical_dataset;

select
    diabetes,
    count(*) as n,
    round(100.0 * count(*) / sum(count(*)) over (), 2) as pct
from analytical_dataset
group by diabetes
order by diabetes;

select
    count(*) filter (where age_group is null) as miss_age,
    count(*) filter (where sex is null) as miss_sex,
    count(*) filter (where race_ethnicity is null) as miss_race,
    count(*) filter (where education is null) as miss_education,
    count(*) filter (where income_group is null) as miss_income,
    count(*) filter (where bmi_category is null) as miss_bmi,
    count(*) filter (where physically_active is null) as miss_activity,
    count(*) filter (where smoking_status is null) as miss_smoking,
    count(*) filter (where heavy_drinker is null) as miss_alcohol,
    count(*) filter (where has_provider is null) as miss_provider,
    count(*) filter (where mental_health_days is null) as miss_mh
from analytical_dataset;

select bmi_category, diabetes, count(*) as n
from analytical_dataset
where bmi_category is not null
group by bmi_category, diabetes
order by bmi_category, diabetes;
04 — Export to Parquet
-- 04_export.sql
copy (select * from analytical_dataset)
to 'data/processed/analysis_dataset.parquet'
(format parquet);

The Parquet file is the mart for analysis — the raw data file stays untouched, and all downstream R code reads from data/processed/.


3 Data Import

Code
df <- read_parquet(here("data", "processed", "analysis_dataset.parquet"))
dim(df)
## [1] 453241     15
names(df)
##  [1] "psu"                "strata"             "llcp_weight"       
##  [4] "diabetes"           "age_group"          "sex"               
##  [7] "race_ethnicity"     "education"          "income_group"      
## [10] "has_provider"       "bmi_category"       "physically_active" 
## [13] "smoking_status"     "heavy_drinker"      "mental_health_days"

The analytical dataset contains 453241 respondents with a fully observed binary outcome; rows with missing or ambiguous diabetes responses (codes 2, 7, 9 in DIABETE4) were removed during the SQL build step. The columns include three survey design fields (psu, strata, llcp_weight) and 11 demographic, socioeconomic, and behavioural predictors. No further outcome filtering is needed, so any remaining issues lie in predictor encoding and missingness.


4 Sample Characteristics

4.1 Predictor distributions

Understanding the sample composition across predictors helps identify sparse categories, where estimates will be less stable and require cautious interpretation.

Code
df_dist <- df |>
  mutate(
    age_group          = factor(age_group,
                                levels = c("18-34", "35-44", "45-54", "55-64", "65-74", "75+")),
    education          = factor(education,
                                levels = c("Did not graduate high school", "High school graduate",
                                           "Some college", "College graduate")),
    income_group       = factor(income_group,
                                levels = c("<$15k", "$15k-$25k", "$25k-$35k",
                                           "$35k-$50k", "$50k-$100k", ">$100k")),
    bmi_category       = factor(bmi_category,
                                levels = c("Underweight", "Normal", "Overweight", "Obese")),
    smoking_status     = factor(smoking_status,
                                levels = c("Never", "Former",
                                           "Current (some days)", "Current (daily)")),
    mental_health_days = factor(mental_health_days,
                                levels = c("0 days", "1-13 days", "14+ days"))
  )

cat_vars <- c("age_group", "sex", "race_ethnicity", "education", "income_group",
              "bmi_category", "smoking_status", "mental_health_days")

dist_data <- purrr::map_dfr(cat_vars, function(v) {
  counts <- df_dist |>
    filter(!is.na(.data[[v]])) |>
    count(variable = v, level = as.character(.data[[v]])) |>
    mutate(pct = n / sum(n) * 100)

  if (is.factor(df_dist[[v]])) {
    counts |> mutate(level = factor(level, levels = levels(df_dist[[v]])))
  } else {
    counts |> arrange(desc(n)) |> mutate(level = factor(level, levels = unique(level)))
  }
}) |>
  mutate(level = fct_recode(level,
                             "AI/AN" = "American Indian/Alaskan Native"))

facet_labels <- c(
  age_group          = "Age group",
  sex                = "Sex",
  race_ethnicity     = "Race/ethnicity",
  education          = "Education",
  income_group       = "Household income",
  bmi_category       = "BMI category",
  smoking_status     = "Smoking status",
  mental_health_days = "Poor mental health days"
)

ggplot(dist_data, aes(x = pct, y = level)) +
  geom_col(fill = palette_epi[["sky_blue"]], show.legend = FALSE) +
  geom_text(aes(label = paste0(round(pct, 1), "%")), hjust = -0.1, size = 3) +
  facet_wrap(~variable, scales = "free_y", ncol = 2,
             labeller = as_labeller(facet_labels)) +
  labs(x = "% of sample", y = NULL,
       title = "Categorical predictor distributions") +
  xlim(0, max(dist_data$pct) * 1.2) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey92", linewidth = 0.3),
    plot.title         = element_text(hjust = 0.5, size = 14, face = "bold"),
    strip.text         = element_text(face = "bold", size = 10)
  )

Categorical predictor distributions

The sample skews older and more educated, with adults aged 65 and over accounting for 48% of respondents and college graduates forming the largest education category (42.1%). The BMI distribution is predominantly above the normal-weight threshold, with 69% of respondents falling in the overweight or obese categories, indicating that BMI-related contrasts are well represented in the data. Race and ethnicity show the most analytically consequential imbalance: White respondents account for 73.5% of the sample, while several groups — Pacific Islander, American Indian or Alaska Native, Other/Multiracial, Hispanic, and Asian — each fall below 3%. Estimates for these smaller groups will therefore be less precise and require cautious interpretation.

Code
binary_labels <- c(
  physically_active = "Physically active",
  heavy_drinker     = "Heavy drinker",
  has_provider      = "Has personal doctor"
)

binary_data <- tibble(
  variable = names(binary_labels),
  label    = binary_labels
) |>
  mutate(pct = map_dbl(variable, ~mean(df[[.x]], na.rm = TRUE) * 100)) |>
  arrange(pct) |>
  mutate(label = factor(label, levels = label))

ggplot(binary_data, aes(x = pct, y = label)) +
  geom_segment(aes(x = 0, xend = pct, yend = label),
               colour = palette_epi[["sky_blue"]]) +
  geom_point(colour = palette_epi[["sky_blue"]], size = 3) +
  geom_text(aes(label = paste0(round(pct, 1), "%")),
            hjust = -0.3, size = 3) +
  labs(x = "% of sample", y = NULL,
       title = "Binary predictor prevalence") +
  xlim(0, 105) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey92", linewidth = 0.3),
    plot.title         = element_text(hjust = 0.5, size = 14, face = "bold")
  )

Binary predictor prevalence

All three binary predictors are strongly imbalanced. Heavy drinking is rare (5.9%), so estimates of its association with diabetes rest on a small exposed group. Having a personal doctor (87.6%) and being physically active (76.7%) remain one-sided, limiting contrast and reducing precision in their effect estimates.

4.2 Outcome prevalence

Code
prev <- df |>
  count(diabetes) |>
  mutate(
    prop = n / sum(n),
    pct  = scales::percent(prop, accuracy = 0.1)
  )

prev

14.5% of respondents (65,809 of 453,241) reported a diabetes diagnosis. The outcome is moderately imbalanced rather than rare, providing sufficient cases for stable estimation while still requiring attention to this imbalance in interpretation.

4.3 Table 1 — sample characteristics by diabetes status

Code
df |>
  select(
    diabetes,
    age_group, sex, race_ethnicity, education, income_group,
    bmi_category, physically_active, smoking_status,
    heavy_drinker, has_provider, mental_health_days
  ) |>
  mutate(
    diabetes   = factor(diabetes, levels = c(0, 1), labels = c("No diabetes", "Diabetes")),
    age_group  = factor(age_group,
                        levels = c("18-34", "35-44", "45-54", "55-64", "65-74", "75+")),
    education  = factor(education,
                        levels = c("Did not graduate high school", "High school graduate",
                                   "Some college", "College graduate")),
    income_group = factor(income_group,
                          levels = c("<$15k", "$15k-$25k", "$25k-$35k",
                                     "$35k-$50k", "$50k-$100k", ">$100k")),
    bmi_category = factor(bmi_category,
                          levels = c("Underweight", "Normal", "Overweight", "Obese")),
    smoking_status = factor(smoking_status,
                            levels = c("Never", "Former", "Current (some days)", "Current (daily)"))
  ) |>
  tbl_summary(
    by      = diabetes,
    missing = "no",
    label   = list(
      age_group          ~ "Age group",
      sex                ~ "Sex",
      race_ethnicity     ~ "Race/ethnicity",
      education          ~ "Education",
      income_group       ~ "Household income",
      bmi_category       ~ "BMI category",
      physically_active  ~ "Physically active",
      smoking_status     ~ "Smoking status",
      heavy_drinker      ~ "Heavy drinker",
      has_provider       ~ "Has personal doctor",
      mental_health_days ~ "Poor mental health days"
    )
  ) |>
  add_p() |>
  add_overall() |>
  bold_p(t = 0.05) |>
  bold_labels() |>
  modify_caption("**Table 1.** Sample characteristics by diabetes status")
Table 1. Sample characteristics by diabetes status
Characteristic Overall
N = 453,2411
No diabetes
N = 387,4321
Diabetes
N = 65,8091
p-value2
Age group


<0.001
    18-34 52,912 (12%) 52,044 (14%) 868 (1.3%)
    35-44 54,067 (12%) 52,075 (14%) 1,992 (3.1%)
    45-54 58,953 (13%) 53,726 (14%) 5,227 (8.1%)
    55-64 65,995 (15%) 55,146 (14%) 10,849 (17%)
    65-74 90,478 (20%) 71,895 (19%) 18,583 (29%)
    75+ 122,745 (28%) 95,443 (25%) 27,302 (42%)
Sex


<0.001
    Female 236,324 (52%) 203,373 (52%) 32,951 (50%)
    Male 216,917 (48%) 184,059 (48%) 32,858 (50%)
Race/ethnicity


<0.001
    American Indian/Alaskan Native 6,361 (1.4%) 5,005 (1.3%) 1,356 (2.1%)
    Asian 12,471 (2.8%) 11,117 (2.9%) 1,354 (2.1%)
    Black 34,803 (7.8%) 27,651 (7.3%) 7,152 (11%)
    Hispanic 10,340 (2.3%) 8,935 (2.4%) 1,405 (2.2%)
    Other 47,877 (11%) 41,391 (11%) 6,486 (10%)
    Other/Multiracial 3,694 (0.8%) 3,117 (0.8%) 577 (0.9%)
    Pacific Islander 2,040 (0.5%) 1,647 (0.4%) 393 (0.6%)
    White 326,779 (74%) 281,040 (74%) 45,739 (71%)
Education


<0.001
    Did not graduate high school 26,740 (5.9%) 21,034 (5.5%) 5,706 (8.7%)
    High school graduate 114,805 (25%) 95,762 (25%) 19,043 (29%)
    Some college 119,613 (27%) 100,090 (26%) 19,523 (30%)
    College graduate 189,843 (42%) 168,591 (44%) 21,252 (32%)
Household income


<0.001
    <$15k 20,524 (6.1%) 16,099 (5.6%) 4,425 (8.6%)
    $15k-$25k 33,077 (9.8%) 25,591 (8.9%) 7,486 (15%)
    $25k-$35k 41,302 (12%) 33,456 (12%) 7,846 (15%)
    $35k-$50k 49,259 (15%) 40,917 (14%) 8,342 (16%)
    $50k-$100k 111,561 (33%) 96,279 (34%) 15,282 (30%)
    >$100k 81,748 (24%) 73,660 (26%) 8,088 (16%)
BMI category


<0.001
    Underweight 7,313 (1.8%) 6,870 (2.0%) 443 (0.7%)
    Normal 120,070 (29%) 110,978 (32%) 9,092 (15%)
    Overweight 145,411 (35%) 126,113 (36%) 19,298 (32%)
    Obese 138,199 (34%) 106,875 (30%) 31,324 (52%)
Physically active 346,848 (77%) 306,100 (79%) 40,748 (62%) <0.001
Smoking status


<0.001
    Never 256,434 (61%) 223,656 (62%) 32,778 (53%)
    Former 118,674 (28%) 97,155 (27%) 21,519 (35%)
    Current (some days) 13,832 (3.3%) 12,028 (3.3%) 1,804 (2.9%)
    Current (daily) 32,737 (7.8%) 27,454 (7.6%) 5,283 (8.6%)
Heavy drinker 23,954 (5.9%) 22,272 (6.4%) 1,682 (2.8%) <0.001
Has personal doctor 393,314 (88%) 330,328 (86%) 62,986 (96%) <0.001
Poor mental health days


<0.001
    0 days 267,874 (60%) 228,118 (60%) 39,756 (62%)
    1-13 days 115,947 (26%) 101,937 (27%) 14,010 (22%)
    14+ days 61,452 (14%) 50,831 (13%) 10,621 (16%)
1 n (%)
2 Pearson’s Chi-squared test

Age and BMI show the strongest gradients: the diabetes group skews substantially older, and obesity is markedly more common (52% vs 30%). Socioeconomic differences follow a similar pattern, with lower income and lower educational attainment more prevalent in the diabetes group. Behavioural and health indicators add further contrasts — lower physical activity, a higher proportion of former smokers, and more frequent poor mental health days — though these likely reflect a mix of upstream risk factors and downstream consequences of disease rather than independent effects. These overlapping relationships will be estimated more precisely in the adjusted model.


5 Outcome Prevalence by Subgroup

Code
df_prev <- df |>
  mutate(smoking_status = factor(smoking_status,
                                  levels = c("Never", "Former",
                                             "Current (some days)", "Current (daily)")))

cat_vars <- c("age_group", "sex", "race_ethnicity", "education",
              "income_group", "bmi_category", "smoking_status",
              "mental_health_days")

facet_labels <- c(
  age_group          = "Age group",
  sex                = "Sex",
  race_ethnicity     = "Race/ethnicity",
  education          = "Education",
  income_group       = "Household income",
  bmi_category       = "BMI category",
  smoking_status     = "Smoking status",
  mental_health_days = "Poor mental health days"
)

prev_data <- purrr::map_dfr(cat_vars, function(v) {
  counts <- df_prev |>
    filter(!is.na(.data[[v]])) |>
    group_by(variable = v, level = .data[[v]]) |>
    summarise(
      n        = n(),
      cases    = sum(diabetes),
      prev_pct = mean(diabetes) * 100,
      .groups  = "drop"
    )

  if (is.factor(df_prev[[v]])) {
    counts |> mutate(level = factor(as.character(level),
                                    levels = levels(df_prev[[v]])))
  } else {
    counts |> arrange(prev_pct) |>
      mutate(level = factor(as.character(level), levels = unique(as.character(level))))
  }
}) |>
  mutate(level = fct_recode(level, "AI/AN" = "American Indian/Alaskan Native"))

ggplot(prev_data, aes(x = prev_pct, y = level)) +
  geom_col(fill = palette_epi[["sky_blue"]], show.legend = FALSE) +
  geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
            hjust = -0.1, size = 3) +
  facet_wrap(~variable, scales = "free_y", ncol = 2,
             labeller = as_labeller(facet_labels)) +
  labs(x = "Diabetes prevalence (%)", y = NULL,
       title = "Crude diabetes prevalence by predictor category") +
  xlim(0, max(prev_data$prev_pct) * 1.2) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey92", linewidth = 0.3),
    plot.title         = element_text(hjust = 0.5, size = 14, face = "bold"),
    strip.text         = element_text(face = "bold", size = 10)
  )

Crude diabetes prevalence by key predictor categories

The age gradient climbs from 1.6% in those aged 18–34 to 22.2% in those aged 75+, while the BMI gradient is similarly pronounced, with the obese group at 22.7% compared with 7.6% in the normal-weight group. Socioeconomic gradients are also evident, with higher prevalence in lower income and lower education categories.

Behavioural and health indicators show additional differences, including higher prevalence among those reporting 14+ poor mental health days and among former smokers, although these patterns may reflect confounding or changes following diagnosis rather than independent effects. Variation across race and ethnicity is substantial but warrants cautious interpretation due to small sample sizes in several groups.


6 Unadjusted Associations

Each predictor is regressed against the diabetes outcome individually, producing a crude odds ratio (OR) that quantifies the association without controlling for any other variable.

Code
df |>
  mutate(
    diabetes          = factor(diabetes, levels = c(0, 1)),
    age_group         = factor(age_group,
                               levels = c("18-34", "35-44", "45-54", "55-64", "65-74", "75+")),
    sex               = factor(sex, levels = c("Male", "Female")),
    race_ethnicity    = factor(race_ethnicity, levels = c("White", "Black",
                                                          "American Indian/Alaskan Native",
                                                          "Asian", "Pacific Islander",
                                                          "Other/Multiracial",
                                                          "Hispanic", "Other")),
    education         = factor(education,
                               levels = c("College graduate", "Some college",
                                          "High school graduate", "Did not graduate high school")),
    income_group      = factor(income_group,
                               levels = c(">$100k", "$50k-$100k", "$35k-$50k",
                                          "$25k-$35k", "$15k-$25k", "<$15k")),
    bmi_category      = factor(bmi_category,
                               levels = c("Normal", "Underweight", "Overweight", "Obese")),
    smoking_status    = factor(smoking_status,
                               levels = c("Never", "Former", "Current (some days)", "Current (daily)")),
    mental_health_days = factor(mental_health_days,
                                levels = c("0 days", "1-13 days", "14+ days"))
  ) |>
  select(
    diabetes,
    age_group, sex, race_ethnicity, education, income_group,
    bmi_category, physically_active, smoking_status,
    heavy_drinker, has_provider, mental_health_days
  ) |>
  tbl_uvregression(
    method       = glm,
    y            = diabetes,
    method.args  = list(family = binomial),
    exponentiate = TRUE,
    estimate_fun = \(x) style_sigfig(x, digits = 3),
    label        = list(
      age_group          ~ "Age group",
      sex                ~ "Sex",
      race_ethnicity     ~ "Race/ethnicity",
      education          ~ "Education",
      income_group       ~ "Household income",
      bmi_category       ~ "BMI category",
      physically_active  ~ "Physically active",
      smoking_status     ~ "Smoking status",
      heavy_drinker      ~ "Heavy drinker",
      has_provider       ~ "Has personal doctor",
      mental_health_days ~ "Poor mental health days"
    )
  ) |>
  bold_p(t = 0.05) |>
  bold_labels() |>
  modify_caption("**Table 2.** Unadjusted odds ratios for diabetes by predictor")
Table 2. Unadjusted odds ratios for diabetes by predictor
Characteristic N OR 95% CI p-value
Age group 445,150


    18-34

    35-44
2.29 2.12, 2.49 <0.001
    45-54
5.83 5.43, 6.28 <0.001
    55-64
11.8 11.0, 12.7 <0.001
    65-74
15.5 14.5, 16.6 <0.001
    75+
17.2 16.0, 18.4 <0.001
Sex 453,241


    Male

    Female
0.908 0.893, 0.923 <0.001
Race/ethnicity 444,365


    White

    Black
1.59 1.55, 1.63 <0.001
    American Indian/Alaskan Native
1.66 1.57, 1.77 <0.001
    Asian
0.748 0.706, 0.792 <0.001
    Pacific Islander
1.47 1.31, 1.64 <0.001
    Other/Multiracial
1.14 1.04, 1.24 0.005
    Hispanic
0.966 0.912, 1.02 0.2
    Other
0.963 0.936, 0.990 0.008
Education 451,001


    College graduate

    Some college
1.55 1.52, 1.58 <0.001
    High school graduate
1.58 1.54, 1.61 <0.001
    Did not graduate high school
2.15 2.08, 2.22 <0.001
Household income 337,471


    >$100k

    $50k-$100k
1.45 1.40, 1.49 <0.001
    $35k-$50k
1.86 1.80, 1.92 <0.001
    $25k-$35k
2.14 2.07, 2.21 <0.001
    $15k-$25k
2.66 2.57, 2.76 <0.001
    <$15k
2.50 2.40, 2.61 <0.001
BMI category 410,993


    Normal

    Underweight
0.787 0.712, 0.867 <0.001
    Overweight
1.87 1.82, 1.92 <0.001
    Obese
3.58 3.49, 3.67 <0.001
Physically active 452,007 0.431 0.424, 0.439 <0.001
Smoking status 421,677


    Never

    Former
1.51 1.48, 1.54 <0.001
    Current (some days)
1.02 0.972, 1.08 0.4
    Current (daily)
1.31 1.27, 1.36 <0.001
Heavy drinker 407,142 0.424 0.403, 0.446 <0.001
Has personal doctor 448,757 4.29 4.11, 4.48 <0.001
Poor mental health days 445,273


    0 days

    1-13 days
0.789 0.772, 0.805 <0.001
    14+ days
1.20 1.17, 1.23 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

The age odds ratio rises monotonically from 2.29 in the 35–44 group to 17.2 in those aged 75+, while obesity is associated with 3.58 times the odds of the normal-weight group. Socioeconomic gradients are also evident, with progressively higher odds of diabetes in lower income and lower education categories.

Behavioural and health indicators show additional associations, including lower odds among physically active respondents and higher odds among former smokers, though these patterns likely reflect confounding and behavioural changes following diagnosis rather than independent effects. Several variables display clear signs of reverse causation, particularly heavy drinking (OR 0.42) and having a personal doctor (OR 4.29), where the direction of association is more consistent with disease status influencing behaviour or healthcare contact. These crude estimates provide context, but many relationships are likely to attenuate or change after adjustment.


7 Missingness

The SQL pipeline recoded “don’t know”, “refused”, and “not applicable” responses to NULL. Variables not shown have no missing values.

Code
miss_summary(df)

Income group stands out with 25.5% missingness — the only variable where nonresponse is large enough to meaningfully reduce the effective sample under complete-case analysis. Most remaining variables fall below 10%, with heavy drinking (10.2%) and BMI category (9.3%) contributing some additional row loss. Whether income nonresponse is random or related to diabetes risk is unclear, but its scale is important to consider when interpreting results from complete-case models.


8 Summary

Age and BMI are the most consistent signals, with strong, monotonic gradients across the descriptive analyses. Socioeconomic predictors show clear unadjusted associations, while income group stands out with substantial missingness (25.5%), reducing the effective sample under complete-case analysis. Several behavioural variables, particularly heavy drinking and having a personal doctor, show patterns more consistent with reverse causation than independent effects.

The adjusted model will help separate these overlapping relationships and identify which associations persist after controlling for confounding.


9 Session Info

R session info
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Australia/Sydney
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] janitor_2.2.1   gtsummary_2.5.0 arrow_23.0.1.2  lubridate_1.9.5
##  [5] forcats_1.0.1   stringr_1.6.0   dplyr_1.2.1     purrr_1.2.1    
##  [9] readr_2.2.0     tidyr_1.3.2     tibble_3.3.1    ggplot2_4.0.2  
## [13] tidyverse_2.0.0 here_1.0.2     
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.57            htmlwidgets_1.6.4   
##  [4] tzdb_0.5.0           vctrs_0.7.2          tools_4.5.0         
##  [7] generics_0.1.4       pkgconfig_2.0.3      RColorBrewer_1.1-3  
## [10] S7_0.2.1             assertthat_0.2.1     gt_1.3.0            
## [13] lifecycle_1.0.5      compiler_4.5.0       farver_2.1.2        
## [16] snakecase_0.11.1     litedown_0.9         htmltools_0.5.9     
## [19] sass_0.4.10          yaml_2.3.12          pillar_1.11.1       
## [22] broom.helpers_1.22.0 commonmark_2.0.0     tidyselect_1.2.1    
## [25] digest_0.6.39        stringi_1.8.7        labeling_0.4.3      
## [28] labelled_2.16.0      rprojroot_2.1.1      fastmap_1.2.0       
## [31] grid_4.5.0           cli_3.6.5            magrittr_2.0.5      
## [34] base64enc_0.1-6      cards_0.7.1          broom_1.0.12        
## [37] withr_3.0.2          scales_1.4.0         backports_1.5.1     
## [40] bit64_4.6.0-1        cardx_0.3.2          timechange_0.4.0    
## [43] rmarkdown_2.31       bit_4.6.0            hms_1.1.4           
## [46] evaluate_1.0.5       knitr_1.51           haven_2.5.5         
## [49] markdown_2.0         rlang_1.2.0          glue_1.8.0          
## [52] xml2_1.5.2           renv_1.2.0           rstudioapi_0.18.0   
## [55] jsonlite_2.0.0       R6_2.6.1             fs_2.0.1