This code performs a cross-sectional analysis of the first wave of the simulated data created in the data creation code, using both naive regression and TMLE. The code works through a series of analyses:
First, we define the learner libraries to be passed to SuperLearner:
SLlib <- c("SL.glm")                                                  # minimal library: main-terms GLM only
SLlib2 <- c("SL.glm","SL.glm.interaction","SL.stepAIC","SL.ranger")   # richer library for the SuperLearner analysis
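These analyses assume the ‘SuperLearner’ and ‘tmle’ packages (plus the packages behind the extra wrappers) are already available; a minimal setup sketch, not part of the original code:
library(SuperLearner)   # provides the SL.* wrappers listed above
library(tmle)           # provides the tmle() function used below
library(ranger)         # needed by the SL.ranger wrapper
library(MASS)           # stepAIC(), needed by the SL.stepAIC wrapper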
Next, we create a simple cross-sectional dataset from the longitudinal dataset, keeping just the variables ba, bb, bc, l_0, a_0, and y_0.
csdata <- ldata[,c(2,3,4,6,7,8)]   # columns 1-4: the baseline covariates ba, bb, bc and l_0; column 5: a_0 (exposure); column 6: y_0 (outcome)
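Selecting the columns by name is a safer equivalent if the layout of ldata ever changes; a sketch, assuming the variable names used throughout this document correspond to those column positions:
# equivalent selection by name (baseline covariates and l_0 first, then exposure and outcome)
csdata <- ldata[, c("ba", "bb", "bc", "l_0", "a_0", "y_0")]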
Now, we run TMLE by manually estimating each of the component models (outcome and propensity) and then updating the outcome predictions based on those initial fits. It is worth noting that the initial outcome model is the same regression that underlies the ‘naive’ estimate reported later; that is, the estimate we would obtain by simply regressing the outcome on exposure and covariates, before any targeting step is applied.
# Correctly specified
# Initial outcome model (Q), with predictions under the observed exposure and under a_0 = 0 and a_0 = 1
Q0 <- glm(data=csdata,"y_0 ~ a_0 + l_0 + ba + bb + bc")
QAW <- data.frame(cbind(QA=predict(Q0,type="response"),
                        Q0=predict(Q0,type="response",newdata=cbind(csdata[,1:4],a_0=0)),
                        Q1=predict(Q0,type="response",newdata=cbind(csdata[,1:4],a_0=1))))
# Propensity score model (g) and the 'clever covariates'
G <- glm(data=csdata,"a_0 ~ l_0 + ba + bb + bc",family=binomial)
GAW <- predict(G,type="response")
HA1 <- csdata[,5]/GAW
HA0 <- -(1-csdata[,5])/(1-GAW)
H <- HA1+HA0
# Targeting step: fluctuate the initial predictions, using them as an offset
Q1 <- glm(data=data.frame(cbind(Y=csdata[,6],HA1=HA1,HA0=-HA0,QAW)),"Y ~ -1 + HA1 + HA0 + offset(QA)")
muA1 <- QAW$Q1 + coef(Q1)[1]/GAW
muA0 <- QAW$Q0 + coef(Q1)[2]/(1-GAW)
# ATE estimate and the estimated variance of the TMLE (note this is a variance, not a standard error)
TMLE <- c(coef=mean(muA1-muA0),
          var=var((HA1-HA0)*(csdata[,6]-QAW$QA) + QAW$Q1 - QAW$Q0 - (muA1-muA0))/length(csdata[,1]))
TMLE
coef var
1.10302092 0.01312571
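Because the three manual analyses in this section differ only in their model formulas, the same steps can be wrapped in a small helper function. This is only a sketch of the calculation above; the function name manual_tmle and its interface are not part of the original code:
manual_tmle <- function(data, qform, gform, a, y) {
  # a, y: names of the exposure and outcome columns
  # Initial outcome model and predictions under the observed exposure, A = 0 and A = 1
  Qfit <- glm(as.formula(qform), data = data)
  d0 <- data; d0[[a]] <- 0
  d1 <- data; d1[[a]] <- 1
  QA <- predict(Qfit, type = "response")
  Q0 <- predict(Qfit, newdata = d0, type = "response")
  Q1 <- predict(Qfit, newdata = d1, type = "response")
  # Propensity model and clever covariates, constructed exactly as above
  Gfit <- glm(as.formula(gform), data = data, family = binomial)
  g <- predict(Gfit, type = "response")
  A <- data[[a]]; Y <- data[[y]]
  HA1 <- A / g
  HA0 <- -(1 - A) / (1 - g)
  # Targeting step: fluctuation model with the initial predictions as offset
  flucdat <- data.frame(Y = Y, HA1 = HA1, HA0 = -HA0, QA = QA)
  eps <- coef(glm(Y ~ -1 + HA1 + HA0 + offset(QA), data = flucdat))
  mu1 <- Q1 + eps[1] / g
  mu0 <- Q0 + eps[2] / (1 - g)
  c(coef = mean(mu1 - mu0),
    var = var((HA1 - HA0) * (Y - QA) + Q1 - Q0 - (mu1 - mu0)) / nrow(data))
}
# e.g. the correctly specified analysis above:
manual_tmle(csdata, "y_0 ~ a_0 + l_0 + ba + bb + bc",
            "a_0 ~ l_0 + ba + bb + bc", a = "a_0", y = "y_0")
Called with the formulas used in this section it should reproduce the same numbers, since the computation is identical.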
# Outcome model misspecified (l_0 omitted)
Q0m1 <- glm(data=csdata,"y_0 ~ a_0 + ba + bb + bc")
QAWm1 <- data.frame(cbind(QA=predict(Q0m1,type="response"),
Q0=predict(Q0m1,type="response",newdata=cbind(csdata[,1:4],a_0=0)),
Q1=predict(Q0m1,type="response",newdata=cbind(csdata[,1:4],a_0=1))))
Gm1 <- glm(data=csdata,"a_0 ~ l_0 + ba + bb + bc",family=binomial)
GAWm1 <- predict(Gm1,type="response")
HA1m1 <- csdata[,5]/GAWm1
HA0m1 <- -(1-csdata[,5])/(1-GAWm1)
Hm1 <- HA1m1+HA0m1
Q1m1 <- glm(data=data.frame(cbind(Y=csdata[,6],HA1=HA1m1,HA0=-HA0m1,QAWm1)),"Y ~ -1 + HA1 + HA0 + offset(QA)")
muA1m1 <- QAWm1$Q1 + coef(Q1m1)[1]/GAWm1
muA0m1 <- QAWm1$Q0 + coef(Q1m1)[2]/(1-GAWm1)
TMLEm1 <- c(coef=mean(muA1m1-muA0m1),
            var=var((HA1m1-HA0m1)*(csdata[,6]-QAWm1$QA) + QAWm1$Q1 - QAWm1$Q0 - (muA1m1-muA0m1))/length(csdata[,1]))
TMLEm1
coef var
1.10768728 0.01554685
# Both models misspecified (l_0 omitted from both)
Q0m2 <- glm(data=csdata,"y_0 ~ a_0 + ba + bb + bc")
QAWm2 <- data.frame(cbind(QA=predict(Q0m2,type="response"),
Q0=predict(Q0m2,type="response",newdata=cbind(csdata[,1:4],a_0=0)),
Q1=predict(Q0m2,type="response",newdata=cbind(csdata[,1:4],a_0=1))))
Gm2 <- glm(data=csdata,"a_0 ~ ba + bb + bc",family=binomial)
GAWm2 <- predict(Gm2,type="response")
HA1m2 <- csdata[,5]/GAWm2
HA0m2 <- -(1-csdata[,5])/(1-GAWm2)
Hm2 <- HA1m2+HA0m2
Q1m2 <- glm(data=data.frame(cbind(Y=csdata[,6],HA1=HA1m2,HA0=-HA0m2,QAWm2)),"Y ~ -1 + HA1 + HA0 + offset(QA)")
muA1m2 <- QAWm2$Q1 + coef(Q1m2)[1]/GAWm2
muA0m2 <- QAWm2$Q0 + coef(Q1m2)[2]/(1-GAWm2)
TMLEm2 <- c(coef=mean(muA1m2-muA0m2),
            var=var((HA1m2-HA0m2)*(csdata[,6]-QAWm2$QA) + QAWm2$Q1 - QAWm2$Q0 - (muA1m2-muA0m2))/length(csdata[,1]))
TMLEm2
coef var
1.46015483 0.01291413
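Taken together, these three results illustrate the double robustness of TMLE: the estimate stays close to the correctly specified value (about 1.10) when only the outcome model is misspecified, and only drifts away (to about 1.46) when both component models are wrong. The remaining combination, a correctly specified outcome model with a misspecified propensity model, can be checked in the same way; a sketch using the manual_tmle helper sketched earlier (the object name TMLEm3 is not part of the original code):
# Propensity model misspecified (l_0 omitted), outcome model correctly specified
TMLEm3 <- manual_tmle(csdata, "y_0 ~ a_0 + l_0 + ba + bb + bc",
                      "a_0 ~ ba + bb + bc", a = "a_0", y = "y_0")
TMLEm3
By double robustness, the point estimate from this combination should again lie close to the correctly specified value.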
Next, we repeat the same analyses, this time using the ‘tmle’ package. Because we supply Qform and gform, the package fits these user-specified glm models rather than drawing on the SuperLearner libraries:
# Correctly specified
rtmle <- tmle(Y=csdata[,6],A=csdata[,5],W=csdata[,1:4],
Q.SL.library=SLlib,
g.SL.library=SLlib,
Qform="Y~A+l_0+ba+bb+bc",
gform="A~l_0+ba+bb+bc")
summary(rtmle)
Initial estimation of Q
Procedure: glm, user-supplied model
Model:
Y ~ (Intercept) + A + l_0 + ba + bb + bc
Coefficients:
(Intercept) 0.4305679
A 0.1276895
l_0 0.1125699
ba 0.02584993
bb -0.01445202
bc 0.03275401
Estimation of g (treatment mechanism)
Procedure: user-supplied regression formula
Model:
A ~ (Intercept) + l_0 + ba + bb + bc
Coefficients:
(Intercept) -2.098232
l_0 1.634153
ba 0.2559936
bb -0.3602987
bc 0.2255404
Estimation of g.Z (intermediate variable assignment mechanism)
Procedure: No intermediate variable
Estimation of g.Delta (missingness mechanism)
Procedure: No missingness
Bounds on g: ( 0.025 0.975 )
Additive Effect
Parameter Estimate: 1.103
Estimated Variance: 0.013117
p-value: <2e-16
95% Conf Interval: (0.87855, 1.3275)
Additive Effect among the Treated
Parameter Estimate: 1.1373
Estimated Variance: 0.011125
p-value: <2e-16
95% Conf Interval: (0.93059, 1.3441)
Additive Effect among the Controls
Parameter Estimate: 1.1009
Estimated Variance: 0.01473
p-value: <2e-16
95% Conf Interval: (0.86298, 1.3387)
# Outcome model misspecified
rtmlem1 <- tmle(Y=csdata[,6],A=csdata[,5],W=csdata[,1:4],
Q.SL.library=SLlib,
g.SL.library=SLlib,
Qform="Y~A+ba+bb+bc",
gform="A~l_0+ba+bb+bc")
summary(rtmlem1)
Initial estimation of Q
Procedure: glm, user-supplied model
Model:
Y ~ (Intercept) + A + ba + bb + bc
Coefficients:
(Intercept) 0.4474137
A 0.1635434
ba 0.02552904
bb -0.01169196
bc 0.03284896
Estimation of g (treatment mechanism)
Procedure: user-supplied regression formula
Model:
A ~ (Intercept) + l_0 + ba + bb + bc
Coefficients:
(Intercept) -2.098232
l_0 1.634153
ba 0.2559936
bb -0.3602987
bc 0.2255404
Estimation of g.Z (intermediate variable assignment mechanism)
Procedure: No intermediate variable
Estimation of g.Delta (missingness mechanism)
Procedure: No missingness
Bounds on g: ( 0.025 0.975 )
Additive Effect
Parameter Estimate: 1.1077
Estimated Variance: 0.013919
p-value: <2e-16
95% Conf Interval: (0.87647, 1.339)
Additive Effect among the Treated
Parameter Estimate: 1.1433
Estimated Variance: 0.013972
p-value: <2e-16
95% Conf Interval: (0.91158, 1.3749)
Additive Effect among the Controls
Parameter Estimate: 1.1084
Estimated Variance: 0.01538
p-value: <2e-16
95% Conf Interval: (0.86531, 1.3515)
# Both models misspecified
rtmlem2 <- tmle(Y=csdata[,6],A=csdata[,5],W=csdata[,1:4],
Q.SL.library=SLlib,
g.SL.library=SLlib,
Qform="Y~A+ba+bb+bc",
gform="A~ba+bb+bc")
summary(rtmlem2)
Initial estimation of Q
Procedure: glm, user-supplied model
Model:
Y ~ (Intercept) + A + ba + bb + bc
Coefficients:
(Intercept) 0.4474137
A 0.1635434
ba 0.02552904
bb -0.01169196
bc 0.03284896
Estimation of g (treatment mechanism)
Procedure: user-supplied regression formula
Model:
A ~ (Intercept) + ba + bb + bc
Coefficients:
(Intercept) -1.614376
ba 0.2488812
bb -0.3046403
bc 0.2158541
Estimation of g.Z (intermediate variable assignment mechanism)
Procedure: No intermediate variable
Estimation of g.Delta (missingness mechanism)
Procedure: No missingness
Bounds on g: ( 0.025 0.975 )
Additive Effect
Parameter Estimate: 1.4601
Estimated Variance: 0.012888
p-value: <2e-16
95% Conf Interval: (1.2376, 1.6826)
Additive Effect among the Treated
Parameter Estimate: 1.4533
Estimated Variance: 0.011659
p-value: <2e-16
95% Conf Interval: (1.2417, 1.665)
Additive Effect among the Controls
Parameter Estimate: 1.4604
Estimated Variance: 0.013641
p-value: <2e-16
95% Conf Interval: (1.2315, 1.6893)
Finally, we carry out the analysis with SuperLearner, omitting Qform and gform so that ‘tmle’ estimates the component models itself using the richer SLlib2 library:
sltmle <- tmle(Y=csdata[,6],A=csdata[,5],W=csdata[,1:4],
Q.SL.library=SLlib2,
g.SL.library=SLlib2)
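No Qform or gform is supplied here, so both component models are estimated by SuperLearner over SLlib2. The fitted object can be inspected in the same way as the earlier fits, and its ATE components are the ones extracted in the comparison code below:
summary(sltmle)                # full summary, as shown for the glm-based fits above
sltmle$estimates$ATE$psi       # point estimate of the additive effect
sltmle$estimates$ATE$var.psi   # influence-curve based variance of the estimate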
Let's see a summary of the results produced by each of the methods, so we can compare them:
Note that the coefficients and standard errors produced by the manual calculations and the ‘tmle’ package are almost identical. The small differences arise because ‘tmle’ rescales continuous outcomes to the (0,1) interval (and fits a logistic fluctuation on that scale) before conducting the analysis, and also bounds the propensity score (here at 0.025 and 0.975) to reduce variability. These differences can be more pronounced in other settings, but in this case they lead to only small discrepancies between the analyses.
csresults <- matrix(c(coef(summary(Q0))[2,1],coef(summary(Q0))[2,2],
coef(summary(Q0m1))[2,1],coef(summary(Q0m1))[2,2],
mean(muA1-muA0),sqrt(var((HA1-HA0)*(csdata[,6]-QAW$QA) + QAW$Q1 - QAW$Q0 - (muA1-muA0))/length(csdata[,1])),
mean(muA1m1-muA0m1),sqrt(var((HA1m1-HA0m1)*(csdata[,6]-QAWm1$QA) + QAWm1$Q1 - QAWm1$Q0 - (muA1m1-muA0m1))/length(csdata[,1])),
mean(muA1m2-muA0m2),sqrt(var((HA1m2-HA0m2)*(csdata[,6]-QAWm2$QA) + QAWm2$Q1 - QAWm2$Q0 - (muA1m2-muA0m2))/length(csdata[,1])),
rtmle$estimates$ATE$psi,sqrt(rtmle$estimates$ATE$var.psi),
rtmlem1$estimates$ATE$psi,sqrt(rtmlem1$estimates$ATE$var.psi),
rtmlem2$estimates$ATE$psi,sqrt(rtmlem2$estimates$ATE$var.psi),
sltmle$estimates$ATE$psi,sqrt(sltmle$estimates$ATE$var.psi)),nrow=9,ncol=2,byrow=TRUE)
rownames(csresults) <- c("GLM - correctly specified","GLM - incorrectly specified","Manual TMLE - correctly specified","Manual TMLE - outcome misspecified","Manual TMLE - doubly misspecified","'tmle' package - correctly specified","'tmle' package - outcome misspecified","'tmle' package - doubly misspecified","SuperLearner TMLE")
colnames(csresults) <- c("Coef","SE")
csresults
Coef SE
GLM - correctly specified 1.130330 0.09814239
GLM - incorrectly specified 1.447715 0.09919178
Manual TMLE - correctly specified 1.103021 0.11456747
Manual TMLE - outcome misspecified 1.107687 0.12468699
Manual TMLE - doubly misspecified 1.460155 0.11364035
'tmle' package - correctly specified 1.103027 0.11452810
'tmle' package - outcome misspecified 1.107713 0.11798018
'tmle' package - doubly misspecified 1.460102 0.11352737
SuperLearner TMLE 1.081280 0.07905581
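To make the source of those small discrepancies concrete, the rescaling and bounding described above can be sketched directly; this mirrors the idea rather than the package's exact internals, and the object names are illustrative only:
# Rescale the continuous outcome to the (0,1) interval, as 'tmle' does internally
# before fitting a logistic fluctuation (results are mapped back afterwards)
a_y <- min(csdata[,6]); b_y <- max(csdata[,6])
ystar <- (csdata[,6] - a_y)/(b_y - a_y)
# Bound the estimated propensity scores, matching the 'Bounds on g: ( 0.025 0.975 )'
# line in the summaries above
gbounded <- pmin(pmax(GAW, 0.025), 0.975)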