We use the following libraries:

```{r}
library(mice)
```


First, we have to get the data. Make sure that the path is changed to the path you saved the datafile.
```{r}
file <- path.expand("~/Project/060.19899 RECAP/Kluis/WP5 Statistical Methods/Workshop/Aurelie INSERM/data_July2017.txt")
mydata <- read.csv(file = file, na = "NA", stringsAsFactors=TRUE)
str(mydata, list.len = 999)
```

Categorize alle variables except *motherage* by the following chunk of code:
```{r}
varfactor <- c("a4_weeks","gmi_vi_hi_parca_asq_ten2","native2","f10","follow")
mydata[,varfactor] <- lapply(mydata[,varfactor] , factor)
str(mydata, list.len = 999)

dim(mydata) #the number of observations and the number of variables

summary(mydata)
```

To create the response indicator:
```{r}
r <- mydata$follow == 1
```

##Method 1: Get the crude prevalence
```{r}
neuro <- as.numeric(mydata[r, "gmi_vi_hi_parca_asq_ten2"])-1

#number of responders
nb_responders <- length(which(mydata$follow == 1))
#number of responders without missing values on outcome
nb_responders_wo_miss <- length(which(neuro != "NA")) - sum(is.na(neuro))

1-(nb_responders_wo_miss/nb_responders) # 14% of missing values for the outcome - a lot

mean_crude <- mean(neuro,na.rm = TRUE)*100
mean_crude
```

##Method 2: Corrected the prevalence with taking into account non-responders - no correction on missing values.
```{r}
## fit logistic regression model wihtout imputation
fit0 <- glm(follow == 1 ~ native2 + f10 + motherage + a4_weeks, family = binomial(),na.action = na.exclude,data=mydata)

prop0 <- predict(fit0, type = "response")
weight0 <- 1/prop0

new_data <- cbind(mydata,weight0)
new_data <- na.omit(new_data)
new_data <- subset(new_data,follow == 1)
summary(new_data)

mean_weighted <- (weighted.mean(x = as.numeric(new_data[,"gmi_vi_hi_parca_asq_ten2"]), w = new_data[, "weight0"])-1)*100
mean_weighted
```

##Method 3: Corrected the prevalence without taking into account non-responders - correction on missing values
```{r warning = FALSE}
## To get the number of missing values in your dataset
1-(sum(complete.cases(mydata))/dim(mydata)[1])

# md.pattern(mydata)
# fluxplot(mydata)

ini <- mice(mydata, maxit = 0, m = 43, seed = 12345)
imp <- mice.mids(ini, maxit = 10, print = FALSE)

plot(imp)
#library("lattice")
#bwplot(imp, ~ motherage)


long <- complete(imp, "long", include = TRUE)
long$neuro <-  as.numeric(long$gmi_vi_hi_parca_asq_ten2) - 1

long2<-aggregate(long, by = list(long$.imp), FUN = mean, na.rm = TRUE)

mean_imp <- mean(subset(long2, Group.1 !="0")$neuro)*100
mean_imp
```

##Method 4: Corrected the prevalence with taking into account non-responders - correction on missing values
```{r}
## fit logistic regression model
fit <- with(imp, glm(follow == 1 ~ native2 + f10 + motherage + a4_weeks, family = binomial()))

prop <- matrix(NA, nrow = length(fit$analyses[[1]]$weights),
ncol = length(fit$analyses))

for (i in 1:length(fit$analyses)) {
prop[, i] <- predict(fit$analyses[[i]], type = "response")
}
propensity <- rowMeans(prop)

# construct inverse weights
weight_all <- 1/propensity
summary(weight_all)
hist(weight_all)

# select weight for followed-up respondents
weight <- weight_all[mydata$follow==1]
summary(weight)
hist(weight)

# two histograms
hist(weight_all, col = "grey")
hist(weight, col = "blue", add = TRUE)

neuro_imp <- as.numeric(imp$data[r, "gmi_vi_hi_parca_asq_ten2"]) - 1
mean_imp_weighted <- weighted.mean(x = neuro_imp, w = weight, na.rm = TRUE)*100

#print results
cat("crude=",mean_crude," weigthed=",mean_weighted," imputed=",mean_imp," imputed and weighted=",mean_imp_weighted)
```