Modelling Diabetes Risk: Social and Behavioural Factors in BRFSS Data

Exploratory Data Analysis

1 Setup

Load packages and helpers

library(here)
library(tidyverse)
library(arrow)
library(gtsummary)
library(janitor)

source(here("R", "utils.R"))       # miss_summary(), fmt_pval(), n_pct(), etc.
source(here("R", "plot_theme.R"))  # theme_epi(), scale_fill_epi(), save_fig()

tidyverse covers data wrangling and visualisation; arrow reads the Parquet file produced by the SQL pipeline; gtsummary generates publication-ready Table 1s and univariable regression summaries. Project-level helpers are sourced from R/.

2 Data Pipeline

The raw BRFSS 2024 data were downloaded as a SAS XPT file and converted to Parquet using R/xpt_to_parquet.R. The mart was built using DuckDB with four SQL scripts run in sequence: a view over the raw Parquet file, a recode and inclusion-criteria step to produce the analytical dataset, validation checks, and export to Parquet for use in R.

01: Create view over raw Parquet file

-- 01_load_raw.sql
-- Create a view over the raw Parquet file so DuckDB can query it by name.
create or replace view brfss_raw as
select * from read_parquet('data/raw/LLCP2024.parquet');

select count(*) as n_rows from brfss_raw;

02: Recode variables and apply inclusion criteria

-- 02_build_mart.sql
-- Select and recode all variables, then filter to respondents with a
-- non-missing diabetes outcome (DIABETE4 codes 7, 9, and 2 set to NULL).
create or replace table analytical_dataset as
with base as (
    select
        -- Survey design variables (retained for potential weighted analysis)
        _PSU as psu,
        _STSTR as strata,
        _LLCPWT as llcp_weight,
        -- Raw outcome variable
        DIABETE4 as diabetes_raw,
        -- Demographic predictors
        _AGEG5YR as age_group_raw,
        SEXVAR as sex_raw,
        _RACE as race_raw,
        -- Social determinants
        _EDUCAG as education_raw,
        _INCOMG1 as income_raw,
        PERSDOC3 as has_provider_raw,
        -- Clinical / behavioural predictors
        _BMI5CAT as bmi_cat_raw,
        _TOTINDA as phys_activity_raw,
        _SMOKER3 as smoking_raw,
        _RFDRHV9 as heavy_alcohol_raw,
        _MENT14D as mental_health_raw
    FROM brfss_raw
),

recoded as (
    select
        psu,
        strata,
        llcp_weight,
        case
            when diabetes_raw = 1 then 1
            when diabetes_raw in (3, 4) then 0
            else null -- 2, 7, 9 are various forms of missingness
        end as diabetes,
        case
            when age_group_raw in (1, 2) then '18-34'
            when age_group_raw in (3, 4) then '35-44'
            when age_group_raw in (5, 6) then '45-54'
            when age_group_raw in (7, 8) then '55-64'
            when age_group_raw in (9, 10) then '65-74'
            when age_group_raw in (11, 12, 13) then '75+'
            else null
        end as age_group,
        case
            when sex_raw = 1 then 'Male'
            when sex_raw = 2 then 'Female'
            else null
        end as sex,
        -- Race/ethnicity: CDC combines race and Hispanic ethnicity into a single
        -- computed variable. Hispanic is its own category regardless of race.
        -- NH (Non-Hispanic) prefix is dropped for clarity for non-US readers.
        case
            when race_raw = 1 then 'White'
            when race_raw = 2 then 'Black'
            when race_raw = 3 then 'American Indian/Alaskan Native'
            when race_raw = 4 then 'Asian'
            when race_raw = 5 then 'Pacific Islander'
            when race_raw = 6 then 'Other/Multiracial'
            when race_raw = 7 then 'Hispanic'
            when race_raw = 8 then 'Other'
            else null
        end as race_ethnicity,
        case
            when education_raw = 1 then 'Did not graduate high school'
            when education_raw = 2 then 'High school graduate'
            when education_raw = 3 then 'Some college'
            when education_raw = 4 then 'College graduate'
            else null
        end as education,
        case
            when income_raw = 1 then '<$15k'
            when income_raw = 2 then '$15k-$25k'
            when income_raw = 3 then '$25k-$35k'
            when income_raw = 4 then '$35k-$50k'
            when income_raw = 5 then '$50k-$100k'
            when income_raw = 6 then '>$100k'
            else null
        end as income_group,
        case
            when has_provider_raw in (1, 2) then 1
            when has_provider_raw = 3 then 0
            else null
        end as has_provider,
        case
            when bmi_cat_raw = 1 then 'Underweight'
            when bmi_cat_raw = 2 then 'Normal'
            when bmi_cat_raw = 3 then 'Overweight'
            when bmi_cat_raw = 4 then 'Obese'
            else null
        end as bmi_category,
        case
            when phys_activity_raw = 1 then 1
            when phys_activity_raw = 2 then 0
            else null
        end as physically_active,
        case
            when smoking_raw = 1 then 'Current (daily)'
            when smoking_raw = 2 then 'Current (some days)'
            when smoking_raw = 3 then 'Former'
            when smoking_raw = 4 then 'Never'
            else null
        end as smoking_status,
        case
            when heavy_alcohol_raw = 1 then 0
            when heavy_alcohol_raw = 2 then 1
            else null
        end as heavy_drinker,
        case
            when mental_health_raw = 1 then '0 days'
            when mental_health_raw = 2 then '1-13 days'
            when mental_health_raw = 3 then '14+ days'
            else null
        end as mental_health_days
    from base
)
select * from recoded
where diabetes is not null;

03: Validation checks

-- 03_validation.sql
-- Row counts, outcome distribution, and missingness by predictor.
select count(*) from analytical_dataset;

select
    diabetes,
    count(*) as n,
    round(100.0 * count(*) / sum(count(*)) over (), 2) as pct
from analytical_dataset
group by diabetes
order by diabetes;

select
    count(*) filter (where age_group is null) as miss_age,
    count(*) filter (where sex is null) as miss_sex,
    count(*) filter (where race_ethnicity is null) as miss_race,
    count(*) filter (where education is null) as miss_education,
    count(*) filter (where income_group is null) as miss_income,
    count(*) filter (where bmi_category is null) as miss_bmi,
    count(*) filter (where physically_active is null) as miss_activity,
    count(*) filter (where smoking_status is null) as miss_smoking,
    count(*) filter (where heavy_drinker is null) as miss_alcohol,
    count(*) filter (where has_provider is null) as miss_provider,
    count(*) filter (where mental_health_days is null) as miss_mh
from analytical_dataset;

select bmi_category, diabetes, count(*) as n
from analytical_dataset
where bmi_category is not null
group by bmi_category, diabetes
order by bmi_category, diabetes;

04: Export to Parquet

-- 04_export.sql
copy (select * from analytical_dataset)
to 'data/processed/analysis_dataset.parquet'
(format parquet);

The Parquet file is the mart for analysis: the raw data file stays untouched, and all downstream R code reads from data/processed/.

3 Data Import

Code

df <- read_parquet(here("data", "processed", "analysis_dataset.parquet"))
dim(df)
## [1] 453241     15
names(df)
##  [1] "psu"                "strata"             "llcp_weight"       
##  [4] "diabetes"           "age_group"          "sex"               
##  [7] "race_ethnicity"     "education"          "income_group"      
## [10] "has_provider"       "bmi_category"       "physically_active" 
## [13] "smoking_status"     "heavy_drinker"      "mental_health_days"

The analytical dataset contains 453241 respondents with a fully observed binary outcome; rows with missing or ambiguous diabetes responses (codes 2, 7, 9 in DIABETE4) were removed during the SQL build step. The columns include three survey design fields (psu, strata, llcp_weight) and 11 demographic, socioeconomic, and behavioural predictors. No further outcome filtering is needed, so any remaining issues lie in predictor encoding and missingness.

4 Sample Characteristics

4.1 Predictor distributions

Understanding the sample composition across predictors helps identify sparse categories, where estimates will be less stable.

Code

df_dist <- df |>
  mutate(
    age_group          = factor(age_group,
                                levels = c("18-34", "35-44", "45-54", "55-64", "65-74", "75+")),
    education          = factor(education,
                                levels = c("Did not graduate high school", "High school graduate",
                                           "Some college", "College graduate")),
    income_group       = factor(income_group,
                                levels = c("<$15k", "$15k-$25k", "$25k-$35k",
                                           "$35k-$50k", "$50k-$100k", ">$100k")),
    bmi_category       = factor(bmi_category,
                                levels = c("Underweight", "Normal", "Overweight", "Obese")),
    smoking_status     = factor(smoking_status,
                                levels = c("Never", "Former",
                                           "Current (some days)", "Current (daily)")),
    mental_health_days = factor(mental_health_days,
                                levels = c("0 days", "1-13 days", "14+ days"))
  )

cat_vars <- c("age_group", "sex", "race_ethnicity", "education", "income_group",
              "bmi_category", "smoking_status", "mental_health_days")

dist_data <- purrr::map_dfr(cat_vars, function(v) {
  counts <- df_dist |>
    filter(!is.na(.data[[v]])) |>
    count(variable = v, level = as.character(.data[[v]])) |>
    mutate(pct = n / sum(n) * 100)

  if (is.factor(df_dist[[v]])) {
    counts |> mutate(level = factor(level, levels = levels(df_dist[[v]])))
  } else {
    counts |> arrange(desc(n)) |> mutate(level = factor(level, levels = unique(level)))
  }
}) |>
  mutate(level = fct_recode(level,
                             "AI/AN" = "American Indian/Alaskan Native"))

facet_labels <- c(
  age_group          = "Age group",
  sex                = "Sex",
  race_ethnicity     = "Race/ethnicity",
  education          = "Education",
  income_group       = "Household income",
  bmi_category       = "BMI category",
  smoking_status     = "Smoking status",
  mental_health_days = "Poor mental health days"
)

ggplot(dist_data, aes(x = pct, y = level)) +
  geom_col(fill = palette_epi[["sky_blue"]], show.legend = FALSE) +
  geom_text(aes(label = paste0(round(pct, 1), "%")), hjust = -0.1, size = 3) +
  facet_wrap(~variable, scales = "free_y", ncol = 2,
             labeller = as_labeller(facet_labels)) +
  labs(x = "% of sample", y = NULL,
       title = "Categorical predictor distributions") +
  xlim(0, max(dist_data$pct) * 1.2) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey92", linewidth = 0.3),
    plot.title         = element_text(hjust = 0.5, size = 14, face = "bold"),
    strip.text         = element_text(face = "bold", size = 10)
  )

The sample skews older and more educated: adults aged 65 and over make up 48% of respondents, and college graduates are the largest education category (42.1%). Most respondents (69%) are overweight or obese, so BMI contrasts are well represented in the data. The most consequential imbalance is in race and ethnicity. White respondents account for 73.5% of the sample, while several groups (Pacific Islander, American Indian or Alaska Native, Other/Multiracial, Hispanic, and Asian) each fall below 3%, so their estimates will be less precise and need cautious interpretation.

Code

binary_labels <- c(
  physically_active = "Physically active",
  heavy_drinker     = "Heavy drinker",
  has_provider      = "Has personal doctor"
)

binary_data <- tibble(
  variable = names(binary_labels),
  label    = binary_labels
) |>
  mutate(pct = map_dbl(variable, ~mean(df[[.x]], na.rm = TRUE) * 100)) |>
  arrange(pct) |>
  mutate(label = factor(label, levels = label))

ggplot(binary_data, aes(x = pct, y = label)) +
  geom_segment(aes(x = 0, xend = pct, yend = label),
               colour = palette_epi[["sky_blue"]]) +
  geom_point(colour = palette_epi[["sky_blue"]], size = 3) +
  geom_text(aes(label = paste0(round(pct, 1), "%")),
            hjust = -0.3, size = 3) +
  labs(x = "% of sample", y = NULL,
       title = "Binary predictor prevalence") +
  xlim(0, 105) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey92", linewidth = 0.3),
    plot.title         = element_text(hjust = 0.5, size = 14, face = "bold")
  )

All three binary predictors are strongly imbalanced. Heavy drinking is rare (5.9%), so estimates of its association with diabetes rest on a small exposed group. Having a personal doctor (87.6%) and being physically active (76.7%) are also one-sided, which limits contrast and precision.

4.2 Outcome prevalence

Code

prev <- df |>
  count(diabetes) |>
  mutate(
    prop = n / sum(n),
    pct  = scales::percent(prop, accuracy = 0.1)
  )

prev

14.5% of respondents (65,809 of 453,241) reported a diabetes diagnosis. The outcome is moderately imbalanced rather than rare: there are enough cases for stable estimation, but the imbalance matters when interpreting results.

4.3 Table 1: Sample characteristics by diabetes status

Code

df |>
  select(
    diabetes,
    age_group, sex, race_ethnicity, education, income_group,
    bmi_category, physically_active, smoking_status,
    heavy_drinker, has_provider, mental_health_days
  ) |>
  mutate(
    diabetes   = factor(diabetes, levels = c(0, 1), labels = c("No diabetes", "Diabetes")),
    age_group  = factor(age_group,
                        levels = c("18-34", "35-44", "45-54", "55-64", "65-74", "75+")),
    education  = factor(education,
                        levels = c("Did not graduate high school", "High school graduate",
                                   "Some college", "College graduate")),
    income_group = factor(income_group,
                          levels = c("<$15k", "$15k-$25k", "$25k-$35k",
                                     "$35k-$50k", "$50k-$100k", ">$100k")),
    bmi_category = factor(bmi_category,
                          levels = c("Underweight", "Normal", "Overweight", "Obese")),
    smoking_status = factor(smoking_status,
                            levels = c("Never", "Former", "Current (some days)", "Current (daily)"))
  ) |>
  tbl_summary(
    by      = diabetes,
    missing = "no",
    label   = list(
      age_group          ~ "Age group",
      sex                ~ "Sex",
      race_ethnicity     ~ "Race/ethnicity",
      education          ~ "Education",
      income_group       ~ "Household income",
      bmi_category       ~ "BMI category",
      physically_active  ~ "Physically active",
      smoking_status     ~ "Smoking status",
      heavy_drinker      ~ "Heavy drinker",
      has_provider       ~ "Has personal doctor",
      mental_health_days ~ "Poor mental health days"
    )
  ) |>
  add_p() |>
  add_overall() |>
  bold_p(t = 0.05) |>
  bold_labels() |>
  modify_caption("**Table 1.** Sample characteristics by diabetes status")

**Table 1.** Sample characteristics by diabetes status
Characteristic	Overall N = 453,241¹	No diabetes N = 387,432¹	Diabetes N = 65,809¹	p-value²
Age group				<0.001
18-34	52,912 (12%)	52,044 (14%)	868 (1.3%)
35-44	54,067 (12%)	52,075 (14%)	1,992 (3.1%)
45-54	58,953 (13%)	53,726 (14%)	5,227 (8.1%)
55-64	65,995 (15%)	55,146 (14%)	10,849 (17%)
65-74	90,478 (20%)	71,895 (19%)	18,583 (29%)
75+	122,745 (28%)	95,443 (25%)	27,302 (42%)
Sex				<0.001
Female	236,324 (52%)	203,373 (52%)	32,951 (50%)
Male	216,917 (48%)	184,059 (48%)	32,858 (50%)
Race/ethnicity				<0.001
American Indian/Alaskan Native	6,361 (1.4%)	5,005 (1.3%)	1,356 (2.1%)
Asian	12,471 (2.8%)	11,117 (2.9%)	1,354 (2.1%)
Black	34,803 (7.8%)	27,651 (7.3%)	7,152 (11%)
Hispanic	10,340 (2.3%)	8,935 (2.4%)	1,405 (2.2%)
Other	47,877 (11%)	41,391 (11%)	6,486 (10%)
Other/Multiracial	3,694 (0.8%)	3,117 (0.8%)	577 (0.9%)
Pacific Islander	2,040 (0.5%)	1,647 (0.4%)	393 (0.6%)
White	326,779 (74%)	281,040 (74%)	45,739 (71%)
Education				<0.001
Did not graduate high school	26,740 (5.9%)	21,034 (5.5%)	5,706 (8.7%)
High school graduate	114,805 (25%)	95,762 (25%)	19,043 (29%)
Some college	119,613 (27%)	100,090 (26%)	19,523 (30%)
College graduate	189,843 (42%)	168,591 (44%)	21,252 (32%)
Household income				<0.001
<$15k	20,524 (6.1%)	16,099 (5.6%)	4,425 (8.6%)
$15k-$25k	33,077 (9.8%)	25,591 (8.9%)	7,486 (15%)
$25k-$35k	41,302 (12%)	33,456 (12%)	7,846 (15%)
$35k-$50k	49,259 (15%)	40,917 (14%)	8,342 (16%)
$50k-$100k	111,561 (33%)	96,279 (34%)	15,282 (30%)
>$100k	81,748 (24%)	73,660 (26%)	8,088 (16%)
BMI category				<0.001
Underweight	7,313 (1.8%)	6,870 (2.0%)	443 (0.7%)
Normal	120,070 (29%)	110,978 (32%)	9,092 (15%)
Overweight	145,411 (35%)	126,113 (36%)	19,298 (32%)
Obese	138,199 (34%)	106,875 (30%)	31,324 (52%)
Physically active	346,848 (77%)	306,100 (79%)	40,748 (62%)	<0.001
Smoking status				<0.001
Never	256,434 (61%)	223,656 (62%)	32,778 (53%)
Former	118,674 (28%)	97,155 (27%)	21,519 (35%)
Current (some days)	13,832 (3.3%)	12,028 (3.3%)	1,804 (2.9%)
Current (daily)	32,737 (7.8%)	27,454 (7.6%)	5,283 (8.6%)
Heavy drinker	23,954 (5.9%)	22,272 (6.4%)	1,682 (2.8%)	<0.001
Has personal doctor	393,314 (88%)	330,328 (86%)	62,986 (96%)	<0.001
Poor mental health days				<0.001
0 days	267,874 (60%)	228,118 (60%)	39,756 (62%)
1-13 days	115,947 (26%)	101,937 (27%)	14,010 (22%)
14+ days	61,452 (14%)	50,831 (13%)	10,621 (16%)
¹ n (%)
² Pearson’s Chi-squared test

Age and BMI show the strongest gradients: the diabetes group skews substantially older, and obesity is markedly more common (52% vs 30%). Socioeconomic differences follow a similar pattern, with lower income and lower education more prevalent in the diabetes group. Behavioural and health indicators add further contrasts (lower physical activity, more former smokers, and more frequent poor mental health days), though these likely reflect a mix of upstream risk factors and downstream consequences of disease. These overlapping relationships are revisited in the adjusted model.

5 Outcome Prevalence by Subgroup

Code

df_prev <- df |>
  mutate(smoking_status = factor(smoking_status,
                                  levels = c("Never", "Former",
                                             "Current (some days)", "Current (daily)")))

cat_vars <- c("age_group", "sex", "race_ethnicity", "education",
              "income_group", "bmi_category", "smoking_status",
              "mental_health_days")

facet_labels <- c(
  age_group          = "Age group",
  sex                = "Sex",
  race_ethnicity     = "Race/ethnicity",
  education          = "Education",
  income_group       = "Household income",
  bmi_category       = "BMI category",
  smoking_status     = "Smoking status",
  mental_health_days = "Poor mental health days"
)

prev_data <- purrr::map_dfr(cat_vars, function(v) {
  counts <- df_prev |>
    filter(!is.na(.data[[v]])) |>
    group_by(variable = v, level = .data[[v]]) |>
    summarise(
      n        = n(),
      cases    = sum(diabetes),
      prev_pct = mean(diabetes) * 100,
      .groups  = "drop"
    )

  if (is.factor(df_prev[[v]])) {
    counts |> mutate(level = factor(as.character(level),
                                    levels = levels(df_prev[[v]])))
  } else {
    counts |> arrange(prev_pct) |>
      mutate(level = factor(as.character(level), levels = unique(as.character(level))))
  }
}) |>
  mutate(level = fct_recode(level, "AI/AN" = "American Indian/Alaskan Native"))

ggplot(prev_data, aes(x = prev_pct, y = level)) +
  geom_col(fill = palette_epi[["sky_blue"]], show.legend = FALSE) +
  geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
            hjust = -0.1, size = 3) +
  facet_wrap(~variable, scales = "free_y", ncol = 2,
             labeller = as_labeller(facet_labels)) +
  labs(x = "Diabetes prevalence (%)", y = NULL,
       title = "Crude diabetes prevalence by predictor category") +
  xlim(0, max(prev_data$prev_pct) * 1.2) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey92", linewidth = 0.3),
    plot.title         = element_text(hjust = 0.5, size = 14, face = "bold"),
    strip.text         = element_text(face = "bold", size = 10)
  )

Crude diabetes prevalence by key predictor categories

Prevalence climbs from 1.6% at ages 18-34 to 22.2% at 75+, and the BMI gradient is nearly as steep: 22.7% in the obese group versus 7.6% at normal weight. Prevalence also rises as income and education fall.

Behavioural and health indicators show further differences, including higher prevalence among former smokers and among those reporting 14+ poor mental health days, though these may reflect confounding or changes following diagnosis rather than independent effects. Variation across race and ethnicity is substantial but rests on small samples in several groups.

6 Unadjusted Associations

Diabetes status is regressed on each predictor individually, producing a crude odds ratio (OR) for the association without adjustment for any other variable.

Code

df |>
  mutate(
    diabetes          = factor(diabetes, levels = c(0, 1)),
    age_group         = factor(age_group,
                               levels = c("18-34", "35-44", "45-54", "55-64", "65-74", "75+")),
    sex               = factor(sex, levels = c("Male", "Female")),
    race_ethnicity    = factor(race_ethnicity, levels = c("White", "Black",
                                                          "American Indian/Alaskan Native",
                                                          "Asian", "Pacific Islander",
                                                          "Other/Multiracial",
                                                          "Hispanic", "Other")),
    education         = factor(education,
                               levels = c("College graduate", "Some college",
                                          "High school graduate", "Did not graduate high school")),
    income_group      = factor(income_group,
                               levels = c(">$100k", "$50k-$100k", "$35k-$50k",
                                          "$25k-$35k", "$15k-$25k", "<$15k")),
    bmi_category      = factor(bmi_category,
                               levels = c("Normal", "Underweight", "Overweight", "Obese")),
    smoking_status    = factor(smoking_status,
                               levels = c("Never", "Former", "Current (some days)", "Current (daily)")),
    mental_health_days = factor(mental_health_days,
                                levels = c("0 days", "1-13 days", "14+ days"))
  ) |>
  select(
    diabetes,
    age_group, sex, race_ethnicity, education, income_group,
    bmi_category, physically_active, smoking_status,
    heavy_drinker, has_provider, mental_health_days
  ) |>
  tbl_uvregression(
    method       = glm,
    y            = diabetes,
    method.args  = list(family = binomial),
    exponentiate = TRUE,
    estimate_fun = \(x) style_sigfig(x, digits = 3),
    label        = list(
      age_group          ~ "Age group",
      sex                ~ "Sex",
      race_ethnicity     ~ "Race/ethnicity",
      education          ~ "Education",
      income_group       ~ "Household income",
      bmi_category       ~ "BMI category",
      physically_active  ~ "Physically active",
      smoking_status     ~ "Smoking status",
      heavy_drinker      ~ "Heavy drinker",
      has_provider       ~ "Has personal doctor",
      mental_health_days ~ "Poor mental health days"
    )
  ) |>
  bold_p(t = 0.05) |>
  bold_labels() |>
  modify_caption("**Table 2.** Unadjusted odds ratios for diabetes by predictor")

**Table 2.** Unadjusted odds ratios for diabetes by predictor
Characteristic	N	OR	95% CI	p-value
Age group	445,150
18-34		—	—
35-44		2.29	2.12, 2.49	<0.001
45-54		5.83	5.43, 6.28	<0.001
55-64		11.8	11.0, 12.7	<0.001
65-74		15.5	14.5, 16.6	<0.001
75+		17.2	16.0, 18.4	<0.001
Sex	453,241
Male		—	—
Female		0.908	0.893, 0.923	<0.001
Race/ethnicity	444,365
White		—	—
Black		1.59	1.55, 1.63	<0.001
American Indian/Alaskan Native		1.66	1.57, 1.77	<0.001
Asian		0.748	0.706, 0.792	<0.001
Pacific Islander		1.47	1.31, 1.64	<0.001
Other/Multiracial		1.14	1.04, 1.24	0.005
Hispanic		0.966	0.912, 1.02	0.2
Other		0.963	0.936, 0.990	0.008
Education	451,001
College graduate		—	—
Some college		1.55	1.52, 1.58	<0.001
High school graduate		1.58	1.54, 1.61	<0.001
Did not graduate high school		2.15	2.08, 2.22	<0.001
Household income	337,471
>$100k		—	—
$50k-$100k		1.45	1.40, 1.49	<0.001
$35k-$50k		1.86	1.80, 1.92	<0.001
$25k-$35k		2.14	2.07, 2.21	<0.001
$15k-$25k		2.66	2.57, 2.76	<0.001
<$15k		2.50	2.40, 2.61	<0.001
BMI category	410,993
Normal		—	—
Underweight		0.787	0.712, 0.867	<0.001
Overweight		1.87	1.82, 1.92	<0.001
Obese		3.58	3.49, 3.67	<0.001
Physically active	452,007	0.431	0.424, 0.439	<0.001
Smoking status	421,677
Never		—	—
Former		1.51	1.48, 1.54	<0.001
Current (some days)		1.02	0.972, 1.08	0.4
Current (daily)		1.31	1.27, 1.36	<0.001
Heavy drinker	407,142	0.424	0.403, 0.446	<0.001
Has personal doctor	448,757	4.29	4.11, 4.48	<0.001
Poor mental health days	445,273
0 days		—	—
1-13 days		0.789	0.772, 0.805	<0.001
14+ days		1.20	1.17, 1.23	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

The age odds ratio rises steadily from 2.29 in the 35-44 group to 17.2 in those aged 75+, and adults with obesity have 3.58 times the odds of those at normal weight. Odds also rise step by step as income and education fall.

As in the prevalence plots, the lower odds among physically active respondents and higher odds among former smokers likely reflect confounding and behaviour changes following diagnosis. Heavy drinking (OR 0.42) and having a personal doctor (OR 4.29) show clear signs of reverse causation: the direction of association fits disease status influencing behaviour or healthcare contact. These crude estimates provide context, but many will attenuate or change after adjustment.

7 Missingness

The SQL pipeline recoded “don’t know”, “refused”, and “not applicable” responses to NULL. Variables not shown have no missing values.

Code

miss_summary(df)

Income group stands out with 25.5% missingness, the only variable where nonresponse is large enough to meaningfully shrink the complete-case sample. Most other variables fall below 10%, with heavy drinking (10.2%) and BMI category (9.3%) adding some row loss. Whether income nonresponse is random or related to diabetes risk is unclear, but its scale matters when interpreting complete-case results.

8 Summary

Age and BMI are the most consistent signals, with strong, monotonic gradients across the descriptive analyses. Socioeconomic predictors show clear unadjusted associations, while income group carries substantial missingness (25.5%) that shrinks the complete-case sample. Several behavioural variables, particularly heavy drinking and having a personal doctor, show patterns more consistent with reverse causation than independent effects.

The adjusted model will help separate these overlapping relationships and show which associations persist.

9 Session Info

R session info

sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Australia/Sydney
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] janitor_2.2.1   gtsummary_2.5.0 arrow_23.0.1.2  lubridate_1.9.5
##  [5] forcats_1.0.1   stringr_1.6.0   dplyr_1.2.1     purrr_1.2.1    
##  [9] readr_2.2.0     tidyr_1.3.2     tibble_3.3.1    ggplot2_4.0.2  
## [13] tidyverse_2.0.0 here_1.0.2     
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.57            htmlwidgets_1.6.4   
##  [4] tzdb_0.5.0           vctrs_0.7.2          tools_4.5.0         
##  [7] generics_0.1.4       pkgconfig_2.0.3      RColorBrewer_1.1-3  
## [10] S7_0.2.1             assertthat_0.2.1     gt_1.3.0            
## [13] lifecycle_1.0.5      compiler_4.5.0       farver_2.1.2        
## [16] snakecase_0.11.1     litedown_0.9         htmltools_0.5.9     
## [19] sass_0.4.10          yaml_2.3.12          pillar_1.11.1       
## [22] broom.helpers_1.22.0 commonmark_2.0.0     tidyselect_1.2.1    
## [25] digest_0.6.39        stringi_1.8.7        labeling_0.4.3      
## [28] labelled_2.16.0      rprojroot_2.1.1      fastmap_1.2.0       
## [31] grid_4.5.0           cli_3.6.5            magrittr_2.0.5      
## [34] base64enc_0.1-6      cards_0.7.1          broom_1.0.12        
## [37] withr_3.0.2          scales_1.4.0         backports_1.5.1     
## [40] bit64_4.6.0-1        cardx_0.3.2          timechange_0.4.0    
## [43] rmarkdown_2.31       bit_4.6.0            hms_1.1.4           
## [46] evaluate_1.0.5       knitr_1.51           haven_2.5.5         
## [49] markdown_2.0         rlang_1.2.0          glue_1.8.0          
## [52] xml2_1.5.2           renv_1.2.0           rstudioapi_0.18.0   
## [55] jsonlite_2.0.0       R6_2.6.1             fs_2.0.1