1 Library

library(summarytools)  # descriptive table (descr, freq, stby)
library(gtsummary)     # descriptive table (tbl_summary)
library(ordinal)       # clm, fit proportional odds model
library(broom)         # tidy, to extract results from models
library(knitr)         # for nice display of results
library(gofcat)        # model fit

2 Dataset

ms = read.csv("ms.csv", stringsAsFactors = T)
str(ms)
'data.frame':   4083 obs. of  4 variables:
 $ fbs    : Factor w/ 3 levels "DM","IFG","Normal": 3 3 3 3 3 3 3 3 3 3 ...
 $ totchol: num  5.43 5.13 5.55 4.01 5.21 ...
 $ hpt    : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
 $ weight : num  40 54.6 37 48.4 44.5 ...
summary(ms)
     fbs          totchol        hpt           weight      
 DM    : 594   Min.   : 0.180   no :3601   Min.   : 30.00  
 IFG   : 364   1st Qu.: 4.970   yes: 482   1st Qu.: 53.80  
 Normal:3125   Median : 5.700              Median : 62.30  
               Mean   : 5.791              Mean   : 63.93  
               3rd Qu.: 6.525              3rd Qu.: 72.00  
               Max.   :23.140              Max.   :187.80  
ms

2.1 Categories of outcome

levels(ms$fbs)  # Levels: DM IFG Normal, DM as reference
[1] "DM"     "IFG"    "Normal"
ms$fbs = factor(ms$fbs, levels = c("Normal", "IFG", "DM"), ordered = T)  # ordinal, Normal as reference category
str(ms$fbs)
 Ord.factor w/ 3 levels "Normal"<"IFG"<..: 1 1 1 1 1 1 1 1 1 1 ...
levels(ms$fbs)  # Normal as reference
[1] "Normal" "IFG"    "DM"    

3 Descriptive

3.1 Numerical

descr(ms)
Non-numerical variable(s) ignored: fbs, hpt
Descriptive Statistics  
ms  
N: 4083  

                    totchol    weight
----------------- --------- ---------
             Mean      5.79     63.93
          Std.Dev      1.28     14.30
              Min      0.18     30.00
               Q1      4.97     53.80
           Median      5.70     62.30
               Q3      6.53     72.00
              Max     23.14    187.80
              MAD      1.14     13.49
              IQR      1.56     18.20
               CV      0.22      0.22
         Skewness      1.16      0.98
      SE.Skewness      0.04      0.04
         Kurtosis     10.20      2.97
          N.Valid   4083.00   4083.00
        Pct.Valid    100.00    100.00
with(ms, stby(totchol, fbs, descr))
Descriptive Statistics  
totchol by fbs  
Data Frame: ms  
N: 3125  

                     Normal      IFG       DM
----------------- --------- -------- --------
             Mean      5.69     6.08     6.16
          Std.Dev      1.24     1.37     1.31
              Min      0.18     2.33     2.32
               Q1      4.89     5.29     5.29
           Median      5.62     5.94     6.03
               Q3      6.41     6.72     6.96
              Max     23.14    14.91    12.29
              MAD      1.13     1.07     1.19
              IQR      1.52     1.43     1.67
               CV      0.22     0.22     0.21
         Skewness      1.27     1.40     0.55
      SE.Skewness      0.04     0.13     0.10
         Kurtosis     13.58     6.20     1.26
          N.Valid   3125.00   364.00   594.00
        Pct.Valid    100.00   100.00   100.00
with(ms, stby(weight, fbs, descr))
Descriptive Statistics  
weight by fbs  
Data Frame: ms  
N: 3125  

                     Normal      IFG       DM
----------------- --------- -------- --------
             Mean     62.74    67.71    67.86
          Std.Dev     13.96    14.50    14.85
              Min     30.00    38.90    30.80
               Q1     53.00    56.80    58.00
           Median     61.00    65.85    66.00
               Q3     71.00    76.75    75.70
              Max    187.80   121.00   147.50
              MAD     13.34    14.60    12.82
              IQR     18.00    19.77    17.60
               CV      0.22     0.21     0.22
         Skewness      1.03     0.58     1.04
      SE.Skewness      0.04     0.13     0.10
         Kurtosis      3.58     0.37     2.55
          N.Valid   3125.00   364.00   594.00
        Pct.Valid    100.00   100.00   100.00

3.2 Categorical

freq(ms$fbs); freq(ms$hpt)
Frequencies  
ms$fbs  
Type: Ordered Factor  

               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
------------ ------ --------- -------------- --------- --------------
      Normal   3125     76.54          76.54     76.54          76.54
         IFG    364      8.92          85.45      8.92          85.45
          DM    594     14.55         100.00     14.55         100.00
        <NA>      0                               0.00         100.00
       Total   4083    100.00         100.00    100.00         100.00
Frequencies  
ms$hpt  
Type: Factor  

              Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
----------- ------ --------- -------------- --------- --------------
         no   3601     88.19          88.19     88.19          88.19
        yes    482     11.81         100.00     11.81         100.00
       <NA>      0                               0.00         100.00
      Total   4083    100.00         100.00    100.00         100.00
with(ms, stby(hpt, fbs, freq))
Frequencies  
ms$hpt  
Type: Factor  
Group: fbs = Normal  

              Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
----------- ------ --------- -------------- --------- --------------
         no   2844     91.01          91.01     91.01          91.01
        yes    281      8.99         100.00      8.99         100.00
       <NA>      0                               0.00         100.00
      Total   3125    100.00         100.00    100.00         100.00

Group: fbs = IFG  

              Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
----------- ------ --------- -------------- --------- --------------
         no    289     79.40          79.40     79.40          79.40
        yes     75     20.60         100.00     20.60         100.00
       <NA>      0                               0.00         100.00
      Total    364    100.00         100.00    100.00         100.00

Group: fbs = DM  

              Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
----------- ------ --------- -------------- --------- --------------
         no    468     78.79          78.79     78.79          78.79
        yes    126     21.21         100.00     21.21         100.00
       <NA>      0                               0.00         100.00
      Total    594    100.00         100.00    100.00         100.00

3.3 Overall

tbl_summary(ms, by = fbs,
            statistic = list(all_continuous() ~ "{mean} ({sd})", 
                             all_categorical() ~ "{n} ({p}%)"),
            digits = list(all_continuous() ~ c(2,2), 
                          all_categorical() ~ c(0,1)))
Characteristic Normal, N = 3,1251 IFG, N = 3641 DM, N = 5941
totchol 5.69 (1.24) 6.08 (1.37) 6.16 (1.31)
hpt 281 (9.0%) 75 (20.6%) 126 (21.2%)
weight 62.74 (13.96) 67.71 (14.50) 67.86 (14.85)
1 Mean (SD); n (%)

4 Variable selection

4.1 Univariable

log_chol = clm(fbs ~ totchol, data = ms)
summary(log_chol)
formula: fbs ~ totchol
data:    ms

 link  threshold nobs logLik   AIC     niter max.grad cond.H 
 logit flexible  4083 -2817.64 5641.28 5(1)  5.09e-07 1.8e+03

Coefficients:
        Estimate Std. Error z value Pr(>|z|)    
totchol  0.26227    0.02875   9.124   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
           Estimate Std. Error z value
Normal|IFG   2.7288     0.1763   15.48
IFG|DM       3.3281     0.1794   18.55
log_hpt = clm(fbs ~ hpt, data = ms)
summary(log_hpt)
formula: fbs ~ hpt
data:    ms

 link  threshold nobs logLik   AIC     niter max.grad cond.H 
 logit flexible  4083 -2816.60 5639.19 5(1)  1.44e-08 2.5e+01

Coefficients:
       Estimate Std. Error z value Pr(>|z|)    
hptyes  0.95306    0.09802   9.723   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
           Estimate Std. Error z value
Normal|IFG  1.32036    0.04086   32.31
IFG|DM      1.92115    0.04836   39.73
log_wt = clm(fbs ~ weight, data = ms)
summary(log_wt)
formula: fbs ~ weight
data:    ms

 link  threshold nobs logLik   AIC     niter max.grad cond.H 
 logit flexible  4083 -2817.00 5640.01 6(1)  7.25e-13 2.0e+05

Coefficients:
       Estimate Std. Error z value Pr(>|z|)    
weight 0.023197   0.002486    9.33   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
           Estimate Std. Error z value
Normal|IFG   2.6944     0.1691   15.93
IFG|DM       3.2935     0.1722   19.12

4.2 Multivariable

mlog = clm(fbs ~ totchol + hpt + weight, data = ms)
summary(mlog)
formula: fbs ~ totchol + hpt + weight
data:    ms

 link  threshold nobs logLik   AIC     niter max.grad cond.H 
 logit flexible  4083 -2740.27 5490.55 6(1)  1.04e-11 4.2e+05

Coefficients:
        Estimate Std. Error z value Pr(>|z|)    
totchol 0.257688   0.029182   8.830   <2e-16 ***
hptyes  0.848765   0.099833   8.502   <2e-16 ***
weight  0.021852   0.002544   8.591   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
           Estimate Std. Error z value
Normal|IFG   4.2516     0.2503   16.98
IFG|DM       4.8722     0.2535   19.22
tidy(mlog, conf.int = T)

Let’s see p-values by LR test:

anova(clm(fbs ~ hpt + weight, data = ms), mlog)  # D0 - D1, model without - model with variable
anova(clm(fbs ~ totchol + weight, data = ms), mlog)
anova(clm(fbs ~ totchol + hpt, data = ms), mlog)

\(\rightarrow\) Preliminary main effects model

5 Variable assessment

  • Linearity in logit – numerical variable, from separate binary logistic models

  • Other numerical issues

    • Small cell counts
    • Multicollinearity

\(\rightarrow\) Main effects model

6 Interaction

Specify interaction based on clinical opinion, let’s say between hpt * weight:

mlogx = clm(fbs ~ totchol + hpt + weight + hpt * weight, data = ms)
summary(mlogx)  # Interaction not significant
formula: fbs ~ totchol + hpt + weight + hpt * weight
data:    ms

 link  threshold nobs logLik   AIC     niter max.grad cond.H 
 logit flexible  4083 -2739.90 5491.79 6(1)  2.58e-12 8.6e+05

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
totchol        0.256810   0.029201   8.794  < 2e-16 ***
hptyes         1.257361   0.479443   2.623  0.00873 ** 
weight         0.022821   0.002778   8.215  < 2e-16 ***
hptyes:weight -0.005987   0.006880  -0.870  0.38419    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
           Estimate Std. Error z value
Normal|IFG   4.3103     0.2596   16.60
IFG|DM       4.9309     0.2627   18.77

\(\rightarrow\) Preliminary final model

7 Model fit assessment

7.1 Proportional odds assumption

brant.test(mlog)

Brant Test:
           chi-sq   df   pr(>chi)
Omnibus     5.030    3       0.17
totchol     0.803    1       0.37
hptyes      2.105    1       0.15
weight      1.830    1       0.18

H0: Proportional odds assumption holds

Compare with mlogit model to see why proportional odds assumption holds, check ORs

tidy(nnet::multinom(fbs ~ totchol + hpt + weight, data = ms), exponentiate = T)
# weights:  15 (8 variable)
initial  value 4485.633975 
iter  10 value 2739.040075
final  value 2738.198554 
converged
tidy(mlog, exp = T)

7.2 Others

hosmerlem(mlog, tables = T)  # ordinal version of Hosmer-Lemeshow test

Hosmer-Lemeshow Test:

                    Chi-sq  df  pr(>chi)  
ordinal(Hosmerlem)  32.609  17   0.01263 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H0: No lack of fit dictated

rho: 100% 


Observed Freqs:

 group total Normal IFG  DM
     1   408    375  12  21
     2   408    357  19  32
     3   408    353  25  30
     4   408    313  38  57
     5   408    325  37  46
     6   408    318  33  57
     7   408    315  24  69
     8   408    286  51  71
     9   408    255  56  97
    10   411    228  69 114

Expected Freqs:

 group     OS   Normal     IFG       DM
     1 1.1711 363.4518 19.2985  25.2497
     2 1.2211 350.7204 24.3407  32.9389
     3 1.2559 341.9670 27.6564  38.3766
     4 1.2867 334.2806 30.4688  43.2506
     5 1.3183 326.4476 33.2387  48.3138
     6 1.3531 317.9045 36.1438  53.9517
     7 1.3948 307.7534 39.4339  60.8128
     8 1.4525 293.8628 43.6362  70.5010
     9 1.5461 271.8254 49.5247  86.6499
    10 1.7858 220.1254 58.7670 132.1076
lipsitz(mlog)  # Lipsitz Test for Categorical Response Models

Lipsitz Test:

                      LR   df   pr(>chi)   
ordinal(lipsitz)  23.311    9   0.005533 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H0: No lack of fit dictated
# Both show poor fit!
Rsquared(mlog, measure = "nagelkerke")

Nagelkerke's R2: 

0.07595 

+ Regression diagnostics – from separate binary logistic models

\(\rightarrow\) Final model

8 Presentation and interpretation

Present in table

tbl_regression(mlog, exp = T)
Characteristic OR1 95% CI1 p-value
totchol 1.29 1.22, 1.37 <0.001
hpt


    no
    yes 2.34 1.92, 2.84 <0.001
weight 1.02 1.02, 1.03 <0.001
1 OR = Odds Ratio, CI = Confidence Interval
# Details
tidy(mlog, conf.int = T) |> kable(digits = 3)
term estimate std.error statistic p.value conf.low conf.high coef.type
Normal|IFG 4.252 0.250 16.985 0 NA NA intercept
IFG|DM 4.872 0.253 19.222 0 NA NA intercept
totchol 0.258 0.029 8.830 0 0.201 0.315 location
hptyes 0.849 0.100 8.502 0 0.652 1.043 location
weight 0.022 0.003 8.591 0 0.017 0.027 location
tidy(mlog, conf.int = T, exponentiate = T) |> kable(digits = 3)
term estimate std.error statistic p.value conf.low conf.high coef.type
Normal|IFG 70.218 0.250 16.985 0 NA NA intercept
IFG|DM 130.608 0.253 19.222 0 NA NA intercept
totchol 1.294 0.029 8.830 0 1.222 1.370 location
hptyes 2.337 0.100 8.502 0 1.919 2.839 location
weight 1.022 0.003 8.591 0 1.017 1.027 location