###Process UKHLS Wave 1 data (continued)

##Initial setup
#Create 'negate' function
`%notin%` <- Negate(`%in%`)

#Create copy of a_indresp
dat <- a_indresp

#Create index column
index <- 1:nrow(dat)

#Subset to full respondents (exclude proxy respondents)
index <- intersect(index, which(dat$a_prsex == -8))

#Subset to women living in England and Wales
index <- intersect(index, which(dat$a_country %in% c(1, 2)))


##Dealing with missing mother month/year of birth
#Identify women with missing month or year of birth (or both), and create a reduced dataset
#for them with some selected variables (month and year of birth, age at last birthday, month
#and year of start of interview)
indbirth <- intersect(index, which(dat$a_birthm < 0 | dat$a_birthy < 0))
datbirth <- dat[indbirth, c("a_birthm", "a_birthy", "a_dvage", "a_istrtdatm", "a_istrtdaty")]

#All women with missing month and year of birth have provided month and year of start of interview,
#so can estimate their month and year of birth by assuming they are halfway through their current
#age at the time of survey
ind1 <- which(datbirth$a_birthm < 0 & datbirth$a_birthy < 0)
dobcalc <- 1900 + (12*(datbirth[ind1, "a_istrtdaty"] - 1900) + 
                     datbirth[ind1, "a_istrtdatm"] - (12*datbirth[ind1, "a_dvage"] + 5))/12
datbirth[ind1, "a_birthm"] <- 12*(dobcalc - floor(dobcalc))
datbirth[ind1, "a_birthm"] <- ifelse(datbirth[ind1, "a_birthm"] == 0, 12, datbirth[ind1, "a_birthm"])
datbirth[ind1, "a_birthy"] <- ifelse(datbirth[ind1, "a_birthm"] == 12, floor(dobcalc) - 1, floor(dobcalc))

#All women with missing month of birth only have provided month and year of start of interview,
#so can estimate their month of birth using their age at the time of survey
ind2 <- which(datbirth$a_birthm < 0 & datbirth$a_birthy > 0)
dobcalc <- datbirth[ind2, "a_birthy"] + datbirth[ind2, "a_dvage"] - datbirth[ind2, "a_istrtdaty"]
#If dobcalc = 0, we know their birthday must be in the months up to and including the month of survey (assume complete)
datbirth[ind2[dobcalc == 0], "a_birthm"] <- ceiling(datbirth[ind2[dobcalc == 0], "a_istrtdatm"]/2)
#If dobcalc = -1, we know their birthday must be in the remaining months of the year of survey (reverse of above)
datbirth[ind2[dobcalc == -1], "a_birthm"] <- ceiling((12 + datbirth[ind2[dobcalc == -1], "a_istrtdatm"])/2)
#If dobcalc doesn't equal 0 or -1, a_dvage is not useful and we impute their month at 6.5
datbirth[ind2[dobcalc %notin% c(-1, 0)], "a_birthm"] <- 6.5

#All women with missing year of birth only have provided month and year of start of interview,
#so can calculate their year of birth using their age at the time of survey and whether their
#interview occurred before or after their birth month.
ind3 <- which(datbirth$a_birthm > 0 & datbirth$a_birthy < 0)
ind4 <- ind3[which(datbirth[ind3, "a_birthm"] <= datbirth[ind3, "a_istrtdatm"])]
datbirth[ind4, "a_birthy"] <- datbirth[ind4, "a_istrtdaty"] - datbirth[ind4, "a_dvage"]
ind5 <- ind3[which(datbirth[ind3, "a_birthm"] > datbirth[ind3, "a_istrtdatm"])]
datbirth[ind5, "a_birthy"] <- datbirth[ind5, "a_istrtdaty"] - (datbirth[ind5, "a_dvage"] + 1)

#Replace month and year values in dat
dat[rownames(datbirth), c("a_birthm", "a_birthy")] <- datbirth[,c("a_birthm", "a_birthy")]
dat$a_birthm <- round(dat$a_birthm, 1)

#Subset to women born in 1945-1993
index <- intersect(index, which(dat$a_birthy %in% 1945:1993))


##Construct fertility histories
#Remove women who do not say they have no biological children but do not complete a fertility history
index <- setdiff(index, which(dat$a_lprnt != 2 & is.na(dat$pres)))

#Remove women with one or more missing child years of birth
index <- setdiff(index, which(dat$nmisdoby > 0))

#Ages at birth
dat$dob1900 <- 12*(dat$a_birthy - 1900) + dat$a_birthm
dob1900ch <- dat[, paste0("dob1900ch_", 1:20)]
agebth <- (dob1900ch - dat$dob1900)/12
agebth <- floor(agebth)
agebth <- as.matrix(agebth)
interval <- dob1900ch[, -c(1)] - dob1900ch[, -c(20)]

#Remove women with no ages at birth at 12 or above
index <- setdiff(index, intersect(index, which(apply(agebth, 1, max, na.rm = T) != -Inf &
                                                 apply(agebth, 1, max, na.rm = T) < 12)))

#Remove women with first age at birth after the ages below 12 being 12
flaglt12 <- intersect(index, which(apply(agebth, 1, function(x) length(which(x < 12)) > 0)))
index <- setdiff(index,
                 intersect(flaglt12,
                           which(apply(agebth, 1, function(x) min(x[x >= 12], na.rm = T)) == 12)))

#Create person-year records
birth.bin <- birth <- parity <- gap <- matrix(0, nrow(dat), 33)

for (i in setdiff(intersect(index, which(!is.na(dat$pres))), flaglt12)) {
  t <- table(agebth[i, ][agebth[i, ] < 45])
  birth.bin[i, as.numeric(names(t)) - 11] <- 1
  birth[i, as.numeric(names(t)) - 11] <- t
  parity[i, ] <- cumsum(c(0, birth[i, 1:32]))
  for (j in 1:max(parity[i, ])) if (sum(parity[i, ]) > 0)
    gap[i, which(parity[i, ] == j)] <- 1:length(which(parity[i, ] == j))
}

for (i in intersect(index, flaglt12)) {
  nmis <- length(which(agebth[i, ] < 12))
  t <- table(agebth[i, ][agebth[i, ] %in% 12:44])
  birth.bin[i, as.numeric(names(t)) - 11] <- 1
  birth[i, as.numeric(names(t)) - 11] <- t
  parity[i, ] <- cumsum(c(0, birth[i, 1:32])) + nmis
  for (j in 1:max(parity[i, ])) if (sum(parity[i, ]) > 0)
    gap[i, which(parity[i, ] == j)] <- 1:length(which(parity[i, ] == j))
  birth.bin[i, 1:(as.numeric(names(t))[1] - 11)] <- NA
  birth[i, 1:(as.numeric(names(t))[1] - 11)] <- NA
  parity[i, 1:(as.numeric(names(t))[1] - 11)] <- NA
  gap[i, 1:(as.numeric(names(t))[1] - 11)] <- NA  
}

#Remove ages 12-14 as we will model from 15-44 only
birth.bin <- birth.bin[, -c(1:3)]
birth <- birth[, -c(1:3)]
parity <- parity[, -c(1:3)]
gap <- gap[, -c(1:3)]

#Remove person-years not fully observed within observation period (January 1960 - November 2008)
py_beg <- py_end <- matrix(0, nrow(dat), 30)
for (i in index) {
  py_beg[i,] <- dat$dob1900[i] + seq(180, 528, by = 12)
  py_end[i,] <- dat$dob1900[i] + seq(191, 539, by = 12)
}
py_rng <- c(12*(1960 - 1900) + 1, 12*(2008 - 1900) + 11)
py_begind <- py_endind <- matrix(NA, nrow(dat), 30)
py_begind[py_beg >= py_rng[1] & py_beg <= py_rng[2]] <- 1
py_endind[py_end >= py_rng[1] & py_end <= py_rng[2]] <- 1
py_begind[is.na(parity)] <- NA
py_endind[is.na(parity)] <- NA
py_ind <- py_begind + py_endind

#Remove women with no person-years observed in the observation period
index <- setdiff(index, which(apply(py_ind, 1, function(x) sum(!is.na(x)) == 0)))

#Remove person-years where respondents were not living in the UK
for (i in intersect(index, which(dat$a_yr2uk4 > 0))) {
  age2uk <- dat$a_yr2uk4[i] - dat$a_birthy[i]
  if (age2uk %in% 15:43) py_ind[i, 1:(age2uk - 14)] <- NA
  if (age2uk > 43) py_ind[i, ] <- NA
}

#Remove women with no person-years lived in the UK
index <- setdiff(index, which(apply(py_ind, 1, function(x) sum(!is.na(x)) == 0)))


##Create HDI variables
#Process HDI data
source("r/process_HDI.r")

#Create HDI variables
dat$HDIr <- dat$HDIc2a <- dat$HDIc2b <- dat$HDIc2c <- dat$HDIc2d <- 
  dat$HDIc3a <- dat$HDIc3b <- dat$HDIc3c <- dat$HDIc3d <- dat$HDIc3e <- dat$HDIc3f <- 
  dat$HDIc4a <- dat$HDIc4b <- dat$HDIc4c <- dat$HDIc4d <- dat$HDIc5 <- NA
k <- 1
for (i in index) {
  j <- which(HDIsurv$a_plbornc_all == dat$a_plbornc_all[index[k]])
  dat$HDIr[i] <- HDIsurv$HDIr[j]
  dat$HDIc2a[i] <- HDIsurv$HDIc2a[j]
  dat$HDIc2b[i] <- HDIsurv$HDIc2b[j]
  dat$HDIc2c[i] <- HDIsurv$HDIc2c[j]
  dat$HDIc2d[i] <- HDIsurv$HDIc2d[j]
  dat$HDIc3a[i] <- HDIsurv$HDIc3a[j]
  dat$HDIc3b[i] <- HDIsurv$HDIc3b[j]
  dat$HDIc3c[i] <- HDIsurv$HDIc3c[j]
  dat$HDIc3d[i] <- HDIsurv$HDIc3d[j]
  dat$HDIc3e[i] <- HDIsurv$HDIc3e[j]
  dat$HDIc3f[i] <- HDIsurv$HDIc3f[j]
  dat$HDIc4a[i] <- HDIsurv$HDIc4a[j]
  dat$HDIc4b[i] <- HDIsurv$HDIc4b[j]
  dat$HDIc4c[i] <- HDIsurv$HDIc4c[j]
  dat$HDIc4d[i] <- HDIsurv$HDIc4d[j]
  dat$HDIc5[i] <- HDIsurv$HDIc5[j]
  k <- k+1
}


##Create highest educational qualification variables (impute missing responses with their
#weighted cohort-specific maxima)
#Identify cohorts for which imputation is required
cohimp <- sort(unique(dat$a_birthy[intersect(index, which(dat$qualfadj == "Not known"))]))

#Create function that computes weighted counts by qualification for a given cohort x
#and then picks out the category with the largest count
cohimpfunc <- function(x) {
  tabCQw <- aggregate(dat$a_indinus_xw[intersect(index, which(dat$qualfadj != "Not known" & dat$a_birthy == x))],
                      by=list(dat$qualfadj[intersect(index, which(dat$qualfadj != "Not known" & dat$a_birthy == x))]), sum)
  a <- tabCQw$Group.1[which.max(tabCQw$x)]
  a
}

#Apply function to all cohorts in cohimp
cohimpqw <- sapply(cohimp, cohimpfunc)

#Impute missing responses
dat$qualfadj[intersect(index, which(dat$qualfadj == "Not known"))] <- 
  cohimpqw[sapply(intersect(index, which(dat$qualfadj == "Not known")),
                  function(x) which(cohimp == dat$a_birthy[x]))]

#Convert to numeric variable
dat %<>% mutate(qualfadj_num = case_when(qualfadj == "Less than O Level" ~ 1,
                                         qualfadj == "O Level"           ~ 2,
                                         qualfadj == "A Level"           ~ 3,
                                         qualfadj == "Degree"            ~ 4))

#Create qualification variables
dat$qualfadj2a <- ifelse(dat$qualfadj_num < 2, 1, 2)
dat$qualfadj2b <- ifelse(dat$qualfadj_num < 3, 1, 2)
dat$qualfadj2c <- ifelse(dat$qualfadj_num < 4, 1, 2)
dat$qualfadj3a <- ifelse(dat$qualfadj_num > 3, dat$qualfadj_num - 1, dat$qualfadj_num)
dat$qualfadj3b <- ifelse(dat$qualfadj_num > 2, dat$qualfadj_num - 1, dat$qualfadj_num)
dat$qualfadj3c <- ifelse(dat$qualfadj_num > 1, dat$qualfadj_num - 1, dat$qualfadj_num)
dat$qualfadj4 <- dat$qualfadj_num


##Construct final variables
#All parities
age <- matrix(rep(15:44, each = nrow(dat)), nrow(dat), 30, byrow = F)
coh <- matrix(rep(dat$a_birthy, 30), nrow(dat), 30, byrow = F)
id <- matrix(rep(1:nrow(dat), 30), nrow(dat), 30, byrow = F)
weights <- matrix(rep(dat$a_indinus_xw, 30), nrow(dat), 30, byrow = F)
HDIr <- matrix(rep(dat$HDIr, 30), nrow(dat), 30, byrow = F)
HDIc2a <- matrix(rep(dat$HDIc2a, 30), nrow(dat), 30, byrow = F)
HDIc2b <- matrix(rep(dat$HDIc2b, 30), nrow(dat), 30, byrow = F)
HDIc2c <- matrix(rep(dat$HDIc2c, 30), nrow(dat), 30, byrow = F)
HDIc2d <- matrix(rep(dat$HDIc2d, 30), nrow(dat), 30, byrow = F)
HDIc3a <- matrix(rep(dat$HDIc3a, 30), nrow(dat), 30, byrow = F)
HDIc3b <- matrix(rep(dat$HDIc3b, 30), nrow(dat), 30, byrow = F)
HDIc3c <- matrix(rep(dat$HDIc3c, 30), nrow(dat), 30, byrow = F)
HDIc3d <- matrix(rep(dat$HDIc3d, 30), nrow(dat), 30, byrow = F)
HDIc3e <- matrix(rep(dat$HDIc3e, 30), nrow(dat), 30, byrow = F)
HDIc3f <- matrix(rep(dat$HDIc3f, 30), nrow(dat), 30, byrow = F)
HDIc4a <- matrix(rep(dat$HDIc4a, 30), nrow(dat), 30, byrow = F)
HDIc4b <- matrix(rep(dat$HDIc4b, 30), nrow(dat), 30, byrow = F)
HDIc4c <- matrix(rep(dat$HDIc4c, 30), nrow(dat), 30, byrow = F)
HDIc4d <- matrix(rep(dat$HDIc4d, 30), nrow(dat), 30, byrow = F)
HDIc5 <- matrix(rep(dat$HDIc5, 30), nrow(dat), 30, byrow = F)
qualf2a <- matrix(rep(dat$qualfadj2a, 30), nrow(dat), 30, byrow = F)
qualf2b <- matrix(rep(dat$qualfadj2b, 30), nrow(dat), 30, byrow = F)
qualf2c <- matrix(rep(dat$qualfadj2c, 30), nrow(dat), 30, byrow = F)
qualf3a <- matrix(rep(dat$qualfadj3a, 30), nrow(dat), 30, byrow = F)
qualf3b <- matrix(rep(dat$qualfadj3b, 30), nrow(dat), 30, byrow = F)
qualf3c <- matrix(rep(dat$qualfadj3c, 30), nrow(dat), 30, byrow = F)
qualf4 <- matrix(rep(dat$qualfadj4, 30), nrow(dat), 30, byrow = F)

birth.bin <- birth.bin[which(!is.na(py_ind))]
birth <- birth[which(!is.na(py_ind))]
parity <- parity[which(!is.na(py_ind))]
parityc <- ifelse(parity > 3, 3, parity)
gap <- gap[which(!is.na(py_ind))]
gapc <- ifelse(gap > 11, 11, gap)
age <- age[which(!is.na(py_ind))]
coh <- coh[which(!is.na(py_ind))]
id <- id[which(!is.na(py_ind))]
weights <- weights[which(!is.na(py_ind))]
HDIr <- HDIr[which(!is.na(py_ind))]
HDIc2a <- HDIc2a[which(!is.na(py_ind))]
HDIc2b <- HDIc2b[which(!is.na(py_ind))]
HDIc2c <- HDIc2c[which(!is.na(py_ind))]
HDIc2d <- HDIc2d[which(!is.na(py_ind))]
HDIc3a <- HDIc3a[which(!is.na(py_ind))]
HDIc3b <- HDIc3b[which(!is.na(py_ind))]
HDIc3c <- HDIc3c[which(!is.na(py_ind))]
HDIc3d <- HDIc3d[which(!is.na(py_ind))]
HDIc3e <- HDIc3e[which(!is.na(py_ind))]
HDIc3f <- HDIc3f[which(!is.na(py_ind))]
HDIc4a <- HDIc4a[which(!is.na(py_ind))]
HDIc4b <- HDIc4b[which(!is.na(py_ind))]
HDIc4c <- HDIc4c[which(!is.na(py_ind))]
HDIc4d <- HDIc4d[which(!is.na(py_ind))]
HDIc5 <- HDIc5[which(!is.na(py_ind))]
qualf2a <- qualf2a[which(!is.na(py_ind))]
qualf2b <- qualf2b[which(!is.na(py_ind))]
qualf2c <- qualf2c[which(!is.na(py_ind))]
qualf3a <- qualf3a[which(!is.na(py_ind))]
qualf3b <- qualf3b[which(!is.na(py_ind))]
qualf3c <- qualf3c[which(!is.na(py_ind))]
qualf4 <- qualf4[which(!is.na(py_ind))]

#Parity-specific variables
birth.bin_0 <- birth.bin[parityc == 0]
birth.bin_1 <- birth.bin[parityc == 1]
birth.bin_2 <- birth.bin[parityc == 2]
birth.bin_3 <- birth.bin[parityc == 3]

birth_0 <- birth[parityc == 0]
birth_1 <- birth[parityc == 1]
birth_2 <- birth[parityc == 2]
birth_3 <- birth[parityc == 3]

parity_3 <- parity[parityc == 3]

gap_1 <- gap[parityc == 1]
gap_2 <- gap[parityc == 2]
gap_3 <- gap[parityc == 3]

gapc_1 <- gapc[parityc == 1]
gapc_2 <- gapc[parityc == 2]
gapc_3 <- gapc[parityc == 3]

age_0 <- age[parityc == 0]
age_1 <- age[parityc == 1]
age_2 <- age[parityc == 2]
age_3 <- age[parityc == 3]

coh_0 <- coh[parityc == 0]
coh_1 <- coh[parityc == 1]
coh_2 <- coh[parityc == 2]
coh_3 <- coh[parityc == 3]

id_0 <- id[parityc == 0]
id_1 <- id[parityc == 1]
id_2 <- id[parityc == 2]
id_3 <- id[parityc == 3]

weights_0 <- weights[parityc == 0]
weights_1 <- weights[parityc == 1]
weights_2 <- weights[parityc == 2]
weights_3 <- weights[parityc == 3]

HDIr_0 <- HDIr[parityc == 0]
HDIr_1 <- HDIr[parityc == 1]
HDIr_2 <- HDIr[parityc == 2]
HDIr_3 <- HDIr[parityc == 3]

HDIc2a_0 <- HDIc2a[parityc == 0]
HDIc2a_1 <- HDIc2a[parityc == 1]
HDIc2a_2 <- HDIc2a[parityc == 2]
HDIc2a_3 <- HDIc2a[parityc == 3]

HDIc2b_0 <- HDIc2b[parityc == 0]
HDIc2b_1 <- HDIc2b[parityc == 1]
HDIc2b_2 <- HDIc2b[parityc == 2]
HDIc2b_3 <- HDIc2b[parityc == 3]

HDIc2c_0 <- HDIc2c[parityc == 0]
HDIc2c_1 <- HDIc2c[parityc == 1]
HDIc2c_2 <- HDIc2c[parityc == 2]
HDIc2c_3 <- HDIc2c[parityc == 3]

HDIc2d_0 <- HDIc2d[parityc == 0]
HDIc2d_1 <- HDIc2d[parityc == 1]
HDIc2d_2 <- HDIc2d[parityc == 2]
HDIc2d_3 <- HDIc2d[parityc == 3]

HDIc3a_0 <- HDIc3a[parityc == 0]
HDIc3a_1 <- HDIc3a[parityc == 1]
HDIc3a_2 <- HDIc3a[parityc == 2]
HDIc3a_3 <- HDIc3a[parityc == 3]

HDIc3b_0 <- HDIc3b[parityc == 0]
HDIc3b_1 <- HDIc3b[parityc == 1]
HDIc3b_2 <- HDIc3b[parityc == 2]
HDIc3b_3 <- HDIc3b[parityc == 3]

HDIc3c_0 <- HDIc3c[parityc == 0]
HDIc3c_1 <- HDIc3c[parityc == 1]
HDIc3c_2 <- HDIc3c[parityc == 2]
HDIc3c_3 <- HDIc3c[parityc == 3]

HDIc3d_0 <- HDIc3d[parityc == 0]
HDIc3d_1 <- HDIc3d[parityc == 1]
HDIc3d_2 <- HDIc3d[parityc == 2]
HDIc3d_3 <- HDIc3d[parityc == 3]

HDIc3e_0 <- HDIc3e[parityc == 0]
HDIc3e_1 <- HDIc3e[parityc == 1]
HDIc3e_2 <- HDIc3e[parityc == 2]
HDIc3e_3 <- HDIc3e[parityc == 3]

HDIc3f_0 <- HDIc3f[parityc == 0]
HDIc3f_1 <- HDIc3f[parityc == 1]
HDIc3f_2 <- HDIc3f[parityc == 2]
HDIc3f_3 <- HDIc3f[parityc == 3]

HDIc4a_0 <- HDIc4a[parityc == 0]
HDIc4a_1 <- HDIc4a[parityc == 1]
HDIc4a_2 <- HDIc4a[parityc == 2]
HDIc4a_3 <- HDIc4a[parityc == 3]

HDIc4b_0 <- HDIc4b[parityc == 0]
HDIc4b_1 <- HDIc4b[parityc == 1]
HDIc4b_2 <- HDIc4b[parityc == 2]
HDIc4b_3 <- HDIc4b[parityc == 3]

HDIc4c_0 <- HDIc4c[parityc == 0]
HDIc4c_1 <- HDIc4c[parityc == 1]
HDIc4c_2 <- HDIc4c[parityc == 2]
HDIc4c_3 <- HDIc4c[parityc == 3]

HDIc4d_0 <- HDIc4d[parityc == 0]
HDIc4d_1 <- HDIc4d[parityc == 1]
HDIc4d_2 <- HDIc4d[parityc == 2]
HDIc4d_3 <- HDIc4d[parityc == 3]

HDIc5_0 <- HDIc5[parityc == 0]
HDIc5_1 <- HDIc5[parityc == 1]
HDIc5_2 <- HDIc5[parityc == 2]
HDIc5_3 <- HDIc5[parityc == 3]

qualf2a_0 <- qualf2a[parityc == 0]
qualf2a_1 <- qualf2a[parityc == 1]
qualf2a_2 <- qualf2a[parityc == 2]
qualf2a_3 <- qualf2a[parityc == 3]

qualf2b_0 <- qualf2b[parityc == 0]
qualf2b_1 <- qualf2b[parityc == 1]
qualf2b_2 <- qualf2b[parityc == 2]
qualf2b_3 <- qualf2b[parityc == 3]

qualf2c_0 <- qualf2c[parityc == 0]
qualf2c_1 <- qualf2c[parityc == 1]
qualf2c_2 <- qualf2c[parityc == 2]
qualf2c_3 <- qualf2c[parityc == 3]

qualf3a_0 <- qualf3a[parityc == 0]
qualf3a_1 <- qualf3a[parityc == 1]
qualf3a_2 <- qualf3a[parityc == 2]
qualf3a_3 <- qualf3a[parityc == 3]

qualf3b_0 <- qualf3b[parityc == 0]
qualf3b_1 <- qualf3b[parityc == 1]
qualf3b_2 <- qualf3b[parityc == 2]
qualf3b_3 <- qualf3b[parityc == 3]

qualf3c_0 <- qualf3c[parityc == 0]
qualf3c_1 <- qualf3c[parityc == 1]
qualf3c_2 <- qualf3c[parityc == 2]
qualf3c_3 <- qualf3c[parityc == 3]

qualf4_0 <- qualf4[parityc == 0]
qualf4_1 <- qualf4[parityc == 1]
qualf4_2 <- qualf4[parityc == 2]
qualf4_3 <- qualf4[parityc == 3]

#Only retain relevant objects in workspace
a_plbornc_all <- dat$a_plbornc_all
rm(list=setdiff(ls(),c("index", "birth.bin", "birth", "parity", "parityc", "gap", "gapc", "age", "coh", "id", "weights",
     "a_plbornc_all", "HDIr", "HDIc2a", "HDIc2b", "HDIc2c", "HDIc2d",
     "HDIc3a", "HDIc3b", "HDIc3c", "HDIc3d", "HDIc3e", "HDIc3f",
     "HDIc4a", "HDIc4b", "HDIc4c", "HDIc4d", "HDIc5",
     "qualf2a", "qualf2b", "qualf2c", "qualf3a", "qualf3b", "qualf3c", "qualf4",
     "birth.bin_0", "birth.bin_1", "birth.bin_2", "birth.bin_3",
     "birth_0", "birth_1", "birth_2", "birth_3", "parity_3",
     "gap_1", "gap_2", "gap_3", "gapc_1", "gapc_2", "gapc_3",
     "age_0", "age_1", "age_2", "age_3",
     "coh_0", "coh_1", "coh_2", "coh_3",
     "id_0", "id_1", "id_2", "id_3",
     "weights_0", "weights_1", "weights_2", "weights_3",
     "HDIsurv","HDIr_0", "HDIr_1", "HDIr_2", "HDIr_3",
     "HDIc2a_0", "HDIc2a_1", "HDIc2a_2", "HDIc2a_3",
     "HDIc2b_0", "HDIc2b_1", "HDIc2b_2", "HDIc2b_3",
     "HDIc2c_0", "HDIc2c_1", "HDIc2c_2", "HDIc2c_3",
     "HDIc2d_0", "HDIc2d_1", "HDIc2d_2", "HDIc2d_3",
     "HDIc3a_0", "HDIc3a_1", "HDIc3a_2", "HDIc3a_3",
     "HDIc3b_0", "HDIc3b_1", "HDIc3b_2", "HDIc3b_3",
     "HDIc3c_0", "HDIc3c_1", "HDIc3c_2", "HDIc3c_3",
     "HDIc3d_0", "HDIc3d_1", "HDIc3d_2", "HDIc3d_3",
     "HDIc3e_0", "HDIc3e_1", "HDIc3e_2", "HDIc3e_3",
     "HDIc3f_0", "HDIc3f_1", "HDIc3f_2", "HDIc3f_3",
     "HDIc4a_0", "HDIc4a_1", "HDIc4a_2", "HDIc4a_3",
     "HDIc4b_0", "HDIc4b_1", "HDIc4b_2", "HDIc4b_3",
     "HDIc4c_0", "HDIc4c_1", "HDIc4c_2", "HDIc4c_3",
     "HDIc4d_0", "HDIc4d_1", "HDIc4d_2", "HDIc4d_3",
     "HDIc5_0", "HDIc5_1", "HDIc5_2", "HDIc5_3",
     "qualf2a_0", "qualf2a_1", "qualf2a_2", "qualf2a_3",
     "qualf2b_0", "qualf2b_1", "qualf2b_2", "qualf2b_3",
     "qualf2c_0", "qualf2c_1", "qualf2c_2", "qualf2c_3",
     "qualf3a_0", "qualf3a_1", "qualf3a_2", "qualf3a_3",
     "qualf3b_0", "qualf3b_1", "qualf3b_2", "qualf3b_3",
     "qualf3c_0", "qualf3c_1", "qualf3c_2", "qualf3c_3",
     "qualf4_0", "qualf4_1", "qualf4_2", "qualf4_3")))

#Create indicators
lastfunc <- function(x,y) tail(which(y==x), 1)
l_ind <- sapply(index, lastfunc, y=id)
p0_ind <- sapply(unique(id_0), lastfunc, y=id_0)
p1_ind <- sapply(unique(id_1), lastfunc, y=id_1)
p2_ind <- sapply(unique(id_2), lastfunc, y=id_2)
p3_ind <- sapply(unique(id_3), lastfunc, y=id_3)

#Standardise weights to the sample size
wtmult <- length(index)/sum(weights[l_ind])
weights_st <- weights*wtmult
weights_0st <- weights_0*wtmult
weights_1st <- weights_1*wtmult
weights_2st <- weights_2*wtmult
weights_3st <- weights_3*wtmult
