We use the following libraries:
library(mice)
First, we have to get the data. Make sure that the path is changed to the path you saved the datafile.
file <- path.expand("~/Project/060.19899 RECAP/Kluis/WP5 Statistical Methods/Workshop/Aurelie INSERM/data_July2017.txt")
mydata <- read.csv(file = file, na = "NA", stringsAsFactors=TRUE)
str(mydata, list.len = 999)
## 'data.frame': 5070 obs. of 6 variables:
## $ a4_weeks : int 24 31 30 29 31 31 29 28 31 29 ...
## $ gmi_vi_hi_parca_asq_ten2: int NA 0 0 NA 0 0 NA NA 0 0 ...
## $ follow : int 0 1 1 0 1 1 0 0 1 1 ...
## $ motherage : int 25 29 41 40 41 40 37 35 29 30 ...
## $ f10 : int 1 0 1 0 1 1 0 0 1 0 ...
## $ native2 : int 1 1 1 1 0 1 1 1 NA 1 ...
Categorize alle variables except motherage by the following chunk of code:
varfactor <- c("a4_weeks","gmi_vi_hi_parca_asq_ten2","native2","f10","follow")
mydata[,varfactor] <- lapply(mydata[,varfactor] , factor)
str(mydata, list.len = 999)
## 'data.frame': 5070 obs. of 6 variables:
## $ a4_weeks : Factor w/ 9 levels "23","24","25",..: 2 9 8 7 9 9 7 6 9 7 ...
## $ gmi_vi_hi_parca_asq_ten2: Factor w/ 2 levels "0","1": NA 1 1 NA 1 1 NA NA 1 1 ...
## $ follow : Factor w/ 2 levels "0","1": 1 2 2 1 2 2 1 1 2 2 ...
## $ motherage : int 25 29 41 40 41 40 37 35 29 30 ...
## $ f10 : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 1 2 1 ...
## $ native2 : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 2 NA 2 ...
dim(mydata) #the number of observations and the number of variables
## [1] 5070 6
summary(mydata)
## a4_weeks gmi_vi_hi_parca_asq_ten2 follow motherage
## 31 :1397 0 :2355 0:1757 Min. :14.00
## 30 :1085 1 : 719 1:3313 1st Qu.:26.00
## 29 : 741 NA's:1996 Median :31.00
## 28 : 620 Mean :30.59
## 27 : 485 3rd Qu.:35.00
## 26 : 358 Max. :53.00
## (Other): 384 NA's :18
## f10 native2
## 0 :2058 0 : 995
## 1 :2880 1 :3748
## NA's: 132 NA's: 327
##
##
##
##
To create the response indicator:
r <- mydata$follow == 1
neuro <- as.numeric(mydata[r, "gmi_vi_hi_parca_asq_ten2"])-1
#number of responders
nb_responders <- length(which(mydata$follow == 1))
#number of responders without missing values on outcome
nb_responders_wo_miss <- length(which(neuro != "NA")) - sum(is.na(neuro))
1-(nb_responders_wo_miss/nb_responders) # 14% of missing values for the outcome - a lot
## [1] 0.1442801
mean_crude <- mean(neuro,na.rm = TRUE)*100
mean_crude
## [1] 23.38972
## fit logistic regression model wihtout imputation
fit0 <- glm(follow == 1 ~ native2 + f10 + motherage + a4_weeks, family = binomial(),na.action = na.exclude,data=mydata)
prop0 <- predict(fit0, type = "response")
weight0 <- 1/prop0
new_data <- cbind(mydata,weight0)
new_data <- na.omit(new_data)
new_data <- subset(new_data,follow == 1)
summary(new_data)
## a4_weeks gmi_vi_hi_parca_asq_ten2 follow motherage f10
## 31 :761 0:2201 0: 0 Min. :15.0 0:1053
## 30 :648 1: 679 1:2880 1st Qu.:28.0 1:1827
## 29 :411 Median :31.0
## 28 :352 Mean :31.4
## 27 :294 3rd Qu.:35.0
## 26 :210 Max. :52.0
## (Other):204
## native2 weight0
## 0: 493 Min. :1.086
## 1:2387 1st Qu.:1.313
## Median :1.423
## Mean :1.499
## 3rd Qu.:1.609
## Max. :3.594
##
mean_weighted <- (weighted.mean(x = as.numeric(new_data[,"gmi_vi_hi_parca_asq_ten2"]), w = new_data[, "weight0"])-1)*100
mean_weighted
## [1] 23.98952
## To get the number of missing values in your dataset
1-(sum(complete.cases(mydata))/dim(mydata)[1])
## [1] 0.4319527
# md.pattern(mydata)
# fluxplot(mydata)
ini <- mice(mydata, maxit = 0, m = 43, seed = 12345)
imp <- mice.mids(ini, maxit = 10, print = FALSE)
plot(imp)
#library("lattice")
#bwplot(imp, ~ motherage)
long <- complete(imp, "long", include = TRUE)
long$neuro <- as.numeric(long$gmi_vi_hi_parca_asq_ten2) - 1
long2<-aggregate(long, by = list(long$.imp), FUN = mean, na.rm = TRUE)
mean_imp <- mean(subset(long2, Group.1 !="0")$neuro)*100
mean_imp
## [1] 24.03926
## fit logistic regression model
fit <- with(imp, glm(follow == 1 ~ native2 + f10 + motherage + a4_weeks, family = binomial()))
prop <- matrix(NA, nrow = length(fit$analyses[[1]]$weights),
ncol = length(fit$analyses))
for (i in 1:length(fit$analyses)) {
prop[, i] <- predict(fit$analyses[[i]], type = "response")
}
propensity <- rowMeans(prop)
# construct inverse weights
weight_all <- 1/propensity
summary(weight_all)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.096 1.360 1.496 1.579 1.723 3.636
hist(weight_all)
# select weight for followed-up respondents
weight <- weight_all[mydata$follow==1]
summary(weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.096 1.340 1.452 1.531 1.650 3.636
hist(weight)
# two histograms
hist(weight_all, col = "grey")
hist(weight, col = "blue", add = TRUE)
neuro_imp <- as.numeric(imp$data[r, "gmi_vi_hi_parca_asq_ten2"]) - 1
mean_imp_weighted <- weighted.mean(x = neuro_imp, w = weight, na.rm = TRUE)*100
#print results
cat("crude=",mean_crude," weigthed=",mean_weighted," imputed=",mean_imp," imputed and weighted=",mean_imp_weighted)
## crude= 23.38972 weigthed= 23.98952 imputed= 24.03926 imputed and weighted= 23.85348