Data: Kosinski & Barnhart (2003), coronary artery disease (CAD) data set:
cad = read.csv("https://wnarifin.github.io/data/2003kosinski_cad.csv")
str(cad)
## 'data.frame': 2688 obs. of 5 variables:
## $ X1: int 0 0 0 0 0 0 0 0 0 0 ...
## $ X2: int 0 0 0 0 0 0 0 0 0 0 ...
## $ X3: int 0 0 0 0 0 0 0 0 0 0 ...
## $ T : int 0 0 0 0 0 0 0 0 1 1 ...
## $ D : int 0 0 0 0 0 0 0 1 0 0 ...
# Verification status
cad$R = 1 # R = Verified: 1 = yes, 0 = no
cad[is.na(cad$D), "R"] = 0
# Incomplete data with NA recoding
cad_ = cad
cad_[is.na(cad_$D), "D"] = -1 # recode NA as -1: for unverified cases R = 0, so -1*0 = 0 in the sums below, whereas NA*0 would stay NA
library(prediction)
library(mice)
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
Complete-case (CC) analysis, which uses verified cases only:
\[ \widehat{Sensitivity}_{CC} = P(T=1|D=1) \]
\[ \widehat{Specificity}_{CC} = P(T=0|D=0) \]
tbl = table(cad[, c("T", "D")]); tbl
## D
## T 0 1
## 0 39 5
## 1 232 195
sn0 = tbl[2,2]/sum(tbl[,2]); sn0
## [1] 0.975
sp0 = tbl[1,1]/sum(tbl[,1]); sp0
## [1] 0.1439114
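As a quick gauge of precision, exact binomial confidence intervals for these complete-case estimates can be computed with base R's binom.test (a minimal sketch using tbl above):
binom.test(tbl[2,2], sum(tbl[,2]))$conf.int # 95% CI for sensitivity, 195/200
binom.test(tbl[1,1], sum(tbl[,1]))$conf.int # 95% CI for specificity, 39/271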
Begg & Greenes (BG) correction for verification bias:
\[ \begin{split} \widehat{Sensitivity}_{BG} & = P(T=1|D=1) \\ & = \frac{P(T=1)P(D=1|T=1,V=1)}{\sum_{T}{P(T)P(D=1|T,V=1)}} \end{split} \]
\[ \begin{split} \widehat{Specificity}_{BG} & = P(T=0|D=0) \\ & = \frac{P(T=0)P(D=0|T=0,V=1)}{\sum_{T}{P(T)P(D=0|T,V=1)}} \end{split} \]
tbl_ = table(cad_[, c("T", "D")]); tbl_ # -1 column -- unverified cases
## D
## T -1 0 1
## 0 1221 39 5
## 1 996 232 195
# BG sensitivity: upper = P(T=1)P(D=1|T=1,V=1), lower = P(T=0)P(D=1|T=0,V=1)
upper = (sum(cad_$T)/nrow(cad_)) * (sum(cad_$D*cad_$T*cad_$R)/sum(cad_$T*cad_$R))
lower = (sum(1-cad_$T)/nrow(cad_)) * (sum(cad_$D*(1-cad_$T)*cad_$R)/sum((1-cad_$T)*cad_$R))
sn1 = upper / (upper+lower); sn1
## [1] 0.8188629
# BG specificity: upper1 = P(T=0)P(D=0|T=0,V=1), lower1 = P(T=1)P(D=0|T=1,V=1)
upper1 = (sum(1-cad_$T)/nrow(cad_)) * (sum((1-cad_$D)*(1-cad_$T)*cad_$R)/sum((1-cad_$T)*cad_$R))
lower1 = (sum(cad_$T)/nrow(cad_)) * (sum((1-cad_$D)*cad_$T*cad_$R)/sum(cad_$T*cad_$R))
sp1 = upper1 / (upper1+lower1); sp1
## [1] 0.5918754
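As a sanity check, the same BG estimates can be recomputed directly from the cross-table tbl_ above; unverified cases enter only through the row totals for \(P(T)\):
pT = rowSums(tbl_)/sum(tbl_) # P(T), unverified cases included
pD1 = tbl_[, "1"]/(tbl_[, "0"] + tbl_[, "1"]) # P(D=1|T, V=1)
pD0 = tbl_[, "0"]/(tbl_[, "0"] + tbl_[, "1"]) # P(D=0|T, V=1)
pT["1"]*pD1["1"]/sum(pT*pD1) # sensitivity, should equal sn1
pT["0"]*pD0["0"]/sum(pT*pD0) # specificity, should equal sp1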
model1 = glm(D ~ T, data = cad, family = "binomial") # modelled on verified (complete) cases only
summary(model1)
##
## Call:
## glm(formula = D ~ T, family = "binomial", data = cad)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1046 -1.1046 -0.4912 1.2520 2.0856
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0541 0.4750 -4.324 1.53e-05 ***
## T 1.8804 0.4848 3.878 0.000105 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 642.20 on 470 degrees of freedom
## Residual deviance: 619.89 on 469 degrees of freedom
## (2217 observations deleted due to missingness)
## AIC: 623.89
##
## Number of Fisher Scoring iterations: 4
preds = prediction(model1, cad_)$fitted # Predicted on incomplete data
sn2 = sum(cad_$T*preds) / sum(preds); sn2 # P(T=1|D=1)
## [1] 0.8188629
sp2 = sum((1-cad_$T)*(1-preds)) / sum(1-preds); sp2 # P(T=0|D=0)
## [1] 0.5918754
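Here the prediction package serves as a thin convenience wrapper; the same fitted probabilities should come from base R's predict on the response scale (a quick equivalence check, assuming the package's default prediction type):
preds_base = predict(model1, newdata = cad_, type = "response")
all.equal(unname(preds), unname(preds_base)) # expected TRUE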
Multiple imputation (MI) by logistic regression. Also refer to the code by de Groot et al. (2008) for predictive mean matching.
m = 10 # number of imputed data sets
seednum = 12345
cad1 = cad[, c("T","D")] # keep only T & D, otherwise mice imputes using all variables
cad1$D = as.factor(cad1$D)
data_impute = mice(cad1, m = m, method = "logreg", seed = seednum) # imputation by logistic regression
# results depend on seed & method; pmm appears less variable than logreg
data_impute$method
## T D
## "" "logreg"
cad_mi = complete(data_impute, action = "repeated") # MI data
## New names:
## * T -> T...1
## * D -> D...2
## * T -> T...3
## * D -> D...4
## * T -> T...5
## * ...
names(cad_mi)
## [1] "T.1" "T.2" "T.3" "T.4" "T.5" "T.6" "T.7" "T.8" "T.9" "T.10"
## [11] "D.1" "D.2" "D.3" "D.4" "D.5" "D.6" "D.7" "D.8" "D.9" "D.10"
# loop to calculate Sn & Sp in each imputed data set
sn3m = sp3m = rep(NA, m)
for (i in 1:m) {
  tbl = table(T = cad_mi[, i], D = cad_mi[, m + i])
  sn3m[i] = tbl[2,2]/sum(tbl[,2])
  sp3m[i] = tbl[1,1]/sum(tbl[,1])
}
sn3 = mean(sn3m); sn3
## [1] 0.8454545
sp3 = mean(sp3m); sp3
## [1] 0.5734714
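The simple means above give pooled point estimates only. For standard errors one would combine within- and between-imputation variability by Rubin's rules; a minimal sketch for the pooled sensitivity, assuming the usual binomial within-imputation variance \(p(1-p)/n\):
nD = sapply(1:m, function(i) sum(cad_mi[, m + i] == "1")) # diseased per imputation
W = mean(sn3m*(1 - sn3m)/nD) # within-imputation variance
B = var(sn3m) # between-imputation variance
Tvar = W + (1 + 1/m)*B # total variance (Rubin's rules)
c(estimate = mean(sn3m), se = sqrt(Tvar))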
EM Algorithm, which runs multiple cycles of weighted logistic regression models on pseudodata (Kosinski & Barnhart, 2003). They considered a selection model for the joint probability of \(V\), \(T\) and \(D\) given covariates \(\mathbf{X}\): \[ P(V,T,D|\mathbf{X}) = P(D|\mathbf{X}) \times P(T|D,\mathbf{X}) \times P(V|T,D,\mathbf{X}) \]
They parameterize the components in the equation with three separate logistic regression models:
Disease component: \[logit\ P(D_i=1|\mathbf{x}_i)=\mathbf{z}_{0i}^{'}\mathbf{\alpha}\]
Diagnostic test component: \[logit\ P(T_i=1|D_i,\mathbf{x}_i)=\mathbf{z}_{1i}^{'}\mathbf{\beta}\]
Missing data mechanism component: \[logit\ P(R_i=1|D_i,T_i,\mathbf{x}_i)=\mathbf{z}_{2i}^{'}\mathbf{\gamma}\] The method is implemented on pseudodata in which each unverified case is duplicated, once with \(D=0\) and once with \(D=1\), and the following weights are updated at each EM iteration:
\[ w_{k}(\theta) = \begin{cases} 1 & \text{for}\ k=1,\ldots,n-u\\ p_{k}(\theta)/\{p_k(\theta) + p_{k+u}(\theta)\} & \text{for}\ k=n-u+1,\ldots,n\\ 1-w_{k-u}(\theta) & \text{for}\ k=n+1,\ldots,n+u \end{cases} \] where \(p_{k}(\theta) = \prod_{m=0}^2 \{p_{mk}(\theta)\}^{y_{mk}} \{1-p_{mk}(\theta)\}^{1-y_{mk}}\) and \(u\) is the number of unverified cases. Each \(p_{mk}\) is obtained at every iteration as the fitted probability of the respective outcome (\(D=1\), \(T=1\) or \(R=1\)) from the corresponding logistic regression model, so a standard logistic regression routine suffices.
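A direct transcription of this weight definition as a small helper (a hedged sketch; update_weights and its argument names are illustrative, and the EM loop below computes the same update inline):
# Sketch: verified cases keep weight 1; each unverified case and its
# pseudodata twin split weight pk/(pk + pk_twin) and its complement
update_weights = function(pk, index_u, index_2u) {
  w = rep(1, length(pk))
  w[index_u] = pk[index_u] / (pk[index_u] + pk[index_2u])
  w[index_2u] = 1 - w[index_u]
  w
}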
# pseudo-data
cad$R = 1 # R = Verified: 1 = yes, 0 = no
cad[is.na(cad$D), "R"] = 0
cad_pseudo = rbind(cad, cad[cad$R == 0, ]) # append a copy of the U unverified observations
str(cad_pseudo)
## 'data.frame': 4905 obs. of 6 variables:
## $ X1: int 0 0 0 0 0 0 0 0 0 0 ...
## $ X2: int 0 0 0 0 0 0 0 0 0 0 ...
## $ X3: int 0 0 0 0 0 0 0 0 0 0 ...
## $ T : int 0 0 0 0 0 0 0 0 1 1 ...
## $ D : int 0 0 0 0 0 0 0 1 0 0 ...
## $ R : num 1 1 1 1 1 1 1 1 1 1 ...
sum(is.na(cad_pseudo$D)) # 2217*2 = 4434
## [1] 4434
table(cad_pseudo$T)
##
## 0 1
## 2486 2419
# create 0, 1 for 2U rows
cad_pseudo[(sum(cad$R)+1):nrow(cad), "D"] = 0 # 1st U rows
cad_pseudo[(nrow(cad)+1):nrow(cad_pseudo), "D"] = 1 # 2nd U rows
# view
head(cad_pseudo)
## X1 X2 X3 T D R
## 1 0 0 0 0 0 1
## 2 0 0 0 0 0 1
## 3 0 0 0 0 0 1
## 4 0 0 0 0 0 1
## 5 0 0 0 0 0 1
## 6 0 0 0 0 0 1
head(cad_pseudo[(sum(cad$R)+1):nrow(cad),])
## X1 X2 X3 T D R
## 472 0 0 0 0 0 0
## 473 0 0 0 0 0 0
## 474 0 0 0 0 0 0
## 475 0 0 0 0 0 0
## 476 0 0 0 0 0 0
## 477 0 0 0 0 0 0
head(cad_pseudo[(nrow(cad)+1):nrow(cad_pseudo),])
## X1 X2 X3 T D R
## 4721 0 0 0 0 1 0
## 4731 0 0 0 0 1 0
## 4741 0 0 0 0 1 0
## 4751 0 0 0 0 1 0
## 4761 0 0 0 0 1 0
## 4771 0 0 0 0 1 0
n_u = nrow(cad_pseudo) # total cases + U
# index k
index_1 = which(cad$R == 1) # verified
index_2 = (sum(cad$R)+1):nrow(cad) # unverified U
index_3 = (nrow(cad)+1):nrow(cad_pseudo) # unverified 2U
Components: a = disease \(P(D)\), b = diagnostic test \(P(T|D)\), c = missing data mechanism \(P(R|T)\)
# initialize values
weight_k = rep(1, nrow(cad_pseudo)) # initial weight = 1 for all observations
coef_1 = vector("list", 0) # stores coefficients of all three models per iteration
max_t = 500 # fixed number of iterations, may increase
# EM Algorithm Iteration
# note: non-integer weights trigger a harmless "non-integer #successes" warning
for (t in 1:max_t) {
  # a -- P(D)
  model1a = glm(D ~ 1, data = cad_pseudo, family = "binomial", weights = weight_k)
  coef_1a = summary(model1a)$coefficients
  fitted_pa = model1a$fitted.values
  # b -- P(T|D)
  model1b = glm(T ~ D, data = cad_pseudo, family = "binomial", weights = weight_k)
  coef_1b = summary(model1b)$coefficients
  fitted_pb = model1b$fitted.values
  # c -- P(R|T)
  model1c = glm(R ~ T, data = cad_pseudo, family = "binomial", weights = weight_k)
  coef_1c = summary(model1c)$coefficients
  fitted_pc = model1c$fitted.values
  # store coefficients at this iteration
  coef_1 = append(coef_1, list(list(coef_1a, coef_1b, coef_1c)))
  # weight update: per-observation likelihood contributions p_mk
  p0 = (fitted_pa^model1a$y) * ((1 - fitted_pa)^(1 - model1a$y)) # P(D)
  p1 = (fitted_pb^model1b$y) * ((1 - fitted_pb)^(1 - model1b$y)) # P(T|D)
  p2 = (fitted_pc^model1c$y) * ((1 - fitted_pc)^(1 - model1c$y)) # P(R|T)
  pk = p0 * p1 * p2 # P(R,T,D|X)
  weight_k[index_2] = pk[index_2] / (pk[index_2] + pk[index_3])
  weight_k[index_3] = 1 - weight_k[index_2]
}
coef_1[[max_t]]
## [[1]]
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8700818 0.04228422 -20.57699 4.412645e-94
##
## [[2]]
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3717234 0.04674685 -7.951839 1.837626e-15
## D 1.8803858 0.10334682 18.194908 5.663396e-74
##
## [[3]]
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.323236 0.1534478 -21.65711 5.209001e-104
## T 2.476273 0.1639883 15.10030 1.611971e-51
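The loop runs a fixed max_t iterations rather than testing convergence; a simple post-hoc check on the stored coefficients of the diagnostic-test component (a sketch):
delta = sapply(2:max_t, function(t)
  max(abs(coef_1[[t]][[2]][, "Estimate"] - coef_1[[t - 1]][[2]][, "Estimate"])))
head(which(delta < 1e-8), 1) + 1 # first iteration where the change is negligible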
sn4 = predict(model1b, list(D=1), type="response"); sn4 # P(T=1|D=1)
## 1
## 0.8188629
sp4 = 1 - predict(model1b, list(D=0), type="response"); sp4 # P(T=0|D=0)
## 1
## 0.5918754
snsp = c(sn0,sp0,sn1,sp1,sn2,sp2,sn3,sp3,sn4,sp4)
snsp = matrix(snsp, ncol = 2, nrow = length(snsp)/2, byrow = T)
rownames(snsp) = c("Complete-case","B&G Count","B&G Regression","Multiple Imputation","EM Algorithm")
colnames(snsp) = c("Sensitivity","Specificity")
tbl_compare = round(snsp, 3)
knitr::kable(tbl_compare)
|                     | Sensitivity | Specificity |
|---------------------|-------------|-------------|
| Complete-case       | 0.975       | 0.144       |
| B&G Count           | 0.819       | 0.592       |
| B&G Regression      | 0.819       | 0.592       |
| Multiple Imputation | 0.845       | 0.573       |
| EM Algorithm        | 0.819       | 0.592       |