###Impute qualification

##Setup
dat <- data.frame(y      = 4 - qualf4[l_ind],
                  coh    = coh[l_ind],
                  HDIc2a = as.factor(HDIc2a[l_ind]),
                  HDIc2b = as.factor(HDIc2b[l_ind]),
                  HDIc2c = as.factor(HDIc2c[l_ind]),
                  HDIc2d = as.factor(HDIc2d[l_ind]),
                  HDIc3a = as.factor(HDIc3a[l_ind]),
                  HDIc3b = as.factor(HDIc3b[l_ind]),
                  HDIc3c = as.factor(HDIc3c[l_ind]),
                  HDIc3d = as.factor(HDIc3d[l_ind]),
                  HDIc3e = as.factor(HDIc3e[l_ind]),
                  HDIc3f = as.factor(HDIc3f[l_ind]),
                  HDIc4a = as.factor(HDIc4a[l_ind]),
                  HDIc4b = as.factor(HDIc4b[l_ind]),
                  HDIc4c = as.factor(HDIc4c[l_ind]),
                  HDIc4d = as.factor(HDIc4d[l_ind]),
                  HDIc5  = as.factor(HDIc5[l_ind]),
                  weight = weights_st[l_ind])


##Model selection
if (mod_sel) {
  #Create data frame for BIC values
  cohrange <- c(1980:1984)
  BICs <- matrix(0,32,length(cohrange))
  rownames(BICs) <- paste0("mod",c(1, 2, "3.2a","3.2b","3.2c","3.2d",
                                   "3.3a","3.3b","3.3c","3.3d","3.3e","3.3f",
                                   "3.4a","3.4b","3.4c","3.4d","3.5",
                                   "4.2a","4.2b","4.2c","4.2d",
                                   "4.3a","4.3b","4.3c","4.3d","4.3e","4.3f",
                                   "4.4a","4.4b","4.4c","4.4d","4.5"))
  colnames(BICs) <- cohrange
  
  #Fit GAMs and extract BICs
  for (cutoff in cohrange) {
    print(cutoff)
    gam1    <- gam(list(y ~ 1, ~ 1, ~ 1), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam2    <- gam(list(y ~ 1 + coh, ~ 1 + coh, ~ 1 + coh), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.2a <- gam(list(y ~ 1 + coh + HDIc2a, ~ 1 + coh + HDIc2a, ~ 1 + coh + HDIc2a), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.2b <- gam(list(y ~ 1 + coh + HDIc2b, ~ 1 + coh + HDIc2b, ~ 1 + coh + HDIc2b), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.2c <- gam(list(y ~ 1 + coh + HDIc2c, ~ 1 + coh + HDIc2c, ~ 1 + coh + HDIc2c), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.2d <- gam(list(y ~ 1 + coh + HDIc2d, ~ 1 + coh + HDIc2d, ~ 1 + coh + HDIc2d), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.3a <- gam(list(y ~ 1 + coh + HDIc3a, ~ 1 + coh + HDIc3a, ~ 1 + coh + HDIc3a), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.3b <- gam(list(y ~ 1 + coh + HDIc3b, ~ 1 + coh + HDIc3b, ~ 1 + coh + HDIc3b), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.3c <- gam(list(y ~ 1 + coh + HDIc3c, ~ 1 + coh + HDIc3c, ~ 1 + coh + HDIc3c), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.3d <- gam(list(y ~ 1 + coh + HDIc3d, ~ 1 + coh + HDIc3d, ~ 1 + coh + HDIc3d), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.3e <- gam(list(y ~ 1 + coh + HDIc3e, ~ 1 + coh + HDIc3e, ~ 1 + coh + HDIc3e), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.3f <- gam(list(y ~ 1 + coh + HDIc3f, ~ 1 + coh + HDIc3f, ~ 1 + coh + HDIc3f), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.4a <- gam(list(y ~ 1 + coh + HDIc4a, ~ 1 + coh + HDIc4a, ~ 1 + coh + HDIc4a), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.4b <- gam(list(y ~ 1 + coh + HDIc4b, ~ 1 + coh + HDIc4b, ~ 1 + coh + HDIc4b), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.4c <- gam(list(y ~ 1 + coh + HDIc4c, ~ 1 + coh + HDIc4c, ~ 1 + coh + HDIc4c), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.4d <- gam(list(y ~ 1 + coh + HDIc4d, ~ 1 + coh + HDIc4d, ~ 1 + coh + HDIc4d), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam3.5  <- gam(list(y ~ 1 + coh + HDIc5, ~ 1 + coh + HDIc5, ~ 1 + coh + HDIc5), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.2a <- gam(list(y ~ 1 + coh + HDIc2a + coh:HDIc2a, ~ 1 + coh + HDIc2a + coh:HDIc2a, ~ 1 + coh + HDIc2a + coh:HDIc2a), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.2b <- gam(list(y ~ 1 + coh + HDIc2b + coh:HDIc2b, ~ 1 + coh + HDIc2b + coh:HDIc2b, ~ 1 + coh + HDIc2b + coh:HDIc2b), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.2c <- gam(list(y ~ 1 + coh + HDIc2c + coh:HDIc2c, ~ 1 + coh + HDIc2c + coh:HDIc2c, ~ 1 + coh + HDIc2c + coh:HDIc2c), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.2d <- gam(list(y ~ 1 + coh + HDIc2d + coh:HDIc2d, ~ 1 + coh + HDIc2d + coh:HDIc2d, ~ 1 + coh + HDIc2d + coh:HDIc2d), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.3a <- gam(list(y ~ 1 + coh + HDIc3a + coh:HDIc3a, ~ 1 + coh + HDIc3a + coh:HDIc3a, ~ 1 + coh + HDIc3a + coh:HDIc3a), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.3b <- gam(list(y ~ 1 + coh + HDIc3b + coh:HDIc3b, ~ 1 + coh + HDIc3b + coh:HDIc3b, ~ 1 + coh + HDIc3b + coh:HDIc3b), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.3c <- gam(list(y ~ 1 + coh + HDIc3c + coh:HDIc3c, ~ 1 + coh + HDIc3c + coh:HDIc3c, ~ 1 + coh + HDIc3c + coh:HDIc3c), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.3d <- gam(list(y ~ 1 + coh + HDIc3d + coh:HDIc3d, ~ 1 + coh + HDIc3d + coh:HDIc3d, ~ 1 + coh + HDIc3d + coh:HDIc3d), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.3e <- gam(list(y ~ 1 + coh + HDIc3e + coh:HDIc3e, ~ 1 + coh + HDIc3e + coh:HDIc3e, ~ 1 + coh + HDIc3e + coh:HDIc3e), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.3f <- gam(list(y ~ 1 + coh + HDIc3f + coh:HDIc3f, ~ 1 + coh + HDIc3f + coh:HDIc3f, ~ 1 + coh + HDIc3f + coh:HDIc3f), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.4a <- gam(list(y ~ 1 + coh + HDIc4a + coh:HDIc4a, ~ 1 + coh + HDIc4a + coh:HDIc4a, ~ 1 + coh + HDIc4a + coh:HDIc4a), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.4b <- gam(list(y ~ 1 + coh + HDIc4b + coh:HDIc4b, ~ 1 + coh + HDIc4b + coh:HDIc4b, ~ 1 + coh + HDIc4b + coh:HDIc4b), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.4c <- gam(list(y ~ 1 + coh + HDIc4c + coh:HDIc4c, ~ 1 + coh + HDIc4c + coh:HDIc4c, ~ 1 + coh + HDIc4c + coh:HDIc4c), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.4d <- gam(list(y ~ 1 + coh + HDIc4d + coh:HDIc4d, ~ 1 + coh + HDIc4d + coh:HDIc4d, ~ 1 + coh + HDIc4d + coh:HDIc4d), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    gam4.5 <- gam(list(y ~ 1 + coh + HDIc5+ coh:HDIc5, ~ 1 + coh + HDIc5+ coh:HDIc5, ~ 1 + coh + HDIc5+ coh:HDIc5), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
    
    BICs[,paste0(cutoff)] <- BIC(gam1,gam2,gam3.2a,gam3.2b,gam3.2c,gam3.2d,
                                 gam3.3a,gam3.3b,gam3.3c,gam3.3d,gam3.3e,gam3.3f,
                                 gam3.4a,gam3.4b,gam3.4c,gam3.4d,gam3.5,
                                 gam4.2a,gam4.2b,gam4.2c,gam4.2d,
                                 gam4.3a,gam4.3b,gam4.3c,gam4.3d,gam4.3e,gam4.3f,
                                 gam4.4a,gam4.4b,gam4.4c,gam4.4d,gam4.5)$BIC
  }
  
  #Identify model with lowest BIC for each cutoff
  data.frame(coh = cohrange, mod = rownames(BICs)[apply(BICs, 2, which.min)])
}


##Perform imputation with chosen cutoff and model
#Setup
cutoff <- 1982
gam4.3e <- gam(list(y ~ 1 + coh + HDIc3e + coh:HDIc3e, ~ 1 + coh + HDIc3e + coh:HDIc3e, ~ 1 + coh + HDIc3e + coh:HDIc3e), family = multinom(K = 3), data = dat, subset = coh <= cutoff)
countfunc <- function (x) c(sum(x==1), sum(x==2), sum(x==3), sum(x==4))
set.seed(1)
unic <- coh[l_ind]
unih3 <- HDIc3e[l_ind]
uids <- id[l_ind]
uniq <- qualf4[l_ind]
uniq2 <- uniq

#Imputation
for (cohort in c((cutoff + 1):1992)) {
  for (i in 1:3) {
    fit <- predict(gam4.3e, newdata = data.frame(coh = cohort, HDIc3e = i),type="response")[4:1]
    tot <- sum(unic == cohort & unih3 == i)
    obsd <- tot*fit
    obsr <- round(obsd)
    if (sum(obsr) - tot == 1) {
      obsr[which.min(abs(obsd %% 1 - 0.5))] <- obsr[which.min(abs(obsd %% 1 - 0.5))] - 1
    }
    if (sum(obsr)-tot==-1) {
      obsr[which.min(abs(obsd %% 1 - 0.5))] <- obsr[which.min(abs(obsd %% 1 - 0.5))] + 1
    }
    icoh <- which(unic == cohort & unih3 == i)
    cids <- uids[icoh]
    oriq <- uniq[icoh]
    newq <- uniq[icoh]
    
    #Move < O Level to O Level
    size <- countfunc(newq)[1] - obsr[1]
    if (size<0) {
      obsd <- c(countfunc(newq)[1], (sum(countfunc(newq)[2:4])*fit[2:4])/sum(fit[2:4]))
      obsr <- round(obsd)
    } 
    if (sum(obsr) - tot == 1) {
      obsr[which.min(abs(obsd %% 1 - 0.5))] <- obsr[which.min(abs(obsd %% 1 - 0.5))] - 1
    }
    if (sum(obsr) - tot == -1) {
      obsr[which.min(abs(obsd %% 1 - 0.5))] <- obsr[which.min(abs(obsd %% 1 - 0.5))] + 1
    }
    size <- countfunc(newq)[1] - obsr[1]
    if (size > 0) {
      x <- cids[newq == 1]
      if (length(x) == 1) ind1 <- x
      if (length(x) > 1)  ind1 <- sample(x, size)
      newq[cids %in% ind1] <- 2
    }
    
    #Move O Level to A Level
    size <- countfunc(newq)[2] - obsr[2]
    if (size < 0) {
      obsd <- c(countfunc(newq)[1:2], (sum(countfunc(newq)[3:4])*fit[3:4])/sum(fit[3:4]))
      obsr <- round(obsd)
    } 
    if (sum(obsr) - tot == 1) {
      obsr[which.min(abs(obsd %% 1 - 0.5))] <- obsr[which.min(abs(obsd %% 1 - 0.5))] - 1
    }
    if (sum(obsr) - tot == -1) {
      obsr[which.min(abs(obsd %% 1 - 0.5))] <- obsr[which.min(abs(obsd %% 1 - 0.5))] + 1
    }
    size <- countfunc(newq)[2] - obsr[2]
    if (size > 0) {
      x <- cids[newq == 2]
      if (length(x) == 1) ind2 <- x
      if (length(x) > 1)  ind2 <- sample(x, size)
      newq[cids %in% ind2] <- 3
    }
    
    #Move A Level to Degree
    size <- countfunc(newq)[3] - obsr[3]
    if (size < 0) {
      obsr <- countfunc(newq)
    }
    if (size > 0) {
      x <- cids[newq == 3]
      if (length(x) == 1) ind3 <- x
      if (length(x) > 1)  ind3 <- sample(x, size)
      newq[cids %in% ind3] <- 4  
    }
    uniq2[icoh] <- newq   
  }
}


##Update Q
idimp <- uids[which(uniq2 != uniq)]
iddat <- data.frame(id = 1:length(uids))
rownames(iddat) <- uids

qualf4imp <- qualf4
qualf4imp[id %in% idimp] <- uniq2[iddat[paste(id[id %in% idimp]),"id"]]
qualf2aimp <- ifelse(qualf4imp < 2, 1, 2)
qualf2bimp <- ifelse(qualf4imp < 3, 1, 2)
qualf2cimp <- ifelse(qualf4imp < 4, 1, 2)
qualf3aimp <- ifelse(qualf4imp > 3, qualf4imp-1, qualf4imp)
qualf3bimp <- ifelse(qualf4imp > 2, qualf4imp-1, qualf4imp)
qualf3cimp <- ifelse(qualf4imp > 1, qualf4imp-1, qualf4imp)

qualf2aimp_0 <- qualf2aimp[parityc == 0]
qualf2aimp_1 <- qualf2aimp[parityc == 1]
qualf2aimp_2 <- qualf2aimp[parityc == 2]
qualf2aimp_3 <- qualf2aimp[parityc == 3]

qualf2bimp_0 <- qualf2bimp[parityc == 0]
qualf2bimp_1 <- qualf2bimp[parityc == 1]
qualf2bimp_2 <- qualf2bimp[parityc == 2]
qualf2bimp_3 <- qualf2bimp[parityc == 3]

qualf2cimp_0 <- qualf2cimp[parityc == 0]
qualf2cimp_1 <- qualf2cimp[parityc == 1]
qualf2cimp_2 <- qualf2cimp[parityc == 2]
qualf2cimp_3 <- qualf2cimp[parityc == 3]

qualf3aimp_0 <- qualf3aimp[parityc == 0]
qualf3aimp_1 <- qualf3aimp[parityc == 1]
qualf3aimp_2 <- qualf3aimp[parityc == 2]
qualf3aimp_3 <- qualf3aimp[parityc == 3]

qualf3bimp_0 <- qualf3bimp[parityc == 0]
qualf3bimp_1 <- qualf3bimp[parityc == 1]
qualf3bimp_2 <- qualf3bimp[parityc == 2]
qualf3bimp_3 <- qualf3bimp[parityc == 3]

qualf3cimp_0 <- qualf3cimp[parityc == 0]
qualf3cimp_1 <- qualf3cimp[parityc == 1]
qualf3cimp_2 <- qualf3cimp[parityc == 2]
qualf3cimp_3 <- qualf3cimp[parityc == 3]

qualf4imp_0 <- qualf4imp[parityc == 0]
qualf4imp_1 <- qualf4imp[parityc == 1]
qualf4imp_2 <- qualf4imp[parityc == 2]
qualf4imp_3 <- qualf4imp[parityc == 3]
