###Model Q|A,C

##Setup
supp.labsq4 <- c("< GCSE","GCSE","A Level","Degree")
names(supp.labsq4) <- c(1,2,3,4)
supp.labsq2b <- c("< A Level","At least A Level")
names(supp.labsq2b) <- c(1,2)
supp.labsq3b <- c("< GCSE","GCSE/A Level","Degree")
names(supp.labsq3b) <- c(1,2,3)
supp.labsp <- c("Parity 0","Parity 1","Parity 2")
names(supp.labsp) <- c(0,1,2)
agerange <- 15:44
cohrange <- 1945:2003

#Summary function
sumfunc <- function(files,newdata,qmax,yind) {
  propsa <- list()
  for (i in 1:length(files)) {
    load(files[i])
    theta <- rstan::extract(stanout,pars="theta",permuted=F)
    propsb <- newdata[,paste(1:qmax)]
    propsb[,paste(1:qmax)] <- matrix(apply(theta[,1,],2,mean),nrow=nrow(propsb),byrow=F)
    propsa[[i]] <- as.matrix(propsb)
  }
  propsa
}

#Chi-square function
chifunc <- function(res,newdata,qmax) {
  chi <- numeric()
  for (i in 1:length(res)) {
    pred <- res[[i]]
    predc <- pred * newdata[,"yw"]
    chires <- (newdata[,paste(1:qmax)] * newdata[,"wtmult"] - predc)/sqrt(predc)
    chi[i] <- sum(chires^2,na.rm=T)
  }
  chi
}

#BIC function
BICfunc <- function(res,newdata,qmax,p) {
  BIC <- numeric()
  ind <- which(!is.na(newdata$y))
  for (i in 1:length(res))
    BIC[i] <- -2*sum(newdata$wtmult[ind]*(lfactorial(newdata[ind,"y"])-
                                       apply(lfactorial(newdata[ind,paste(1:qmax)]),1,sum)+
                                       apply(newdata[ind,paste(1:qmax)]*
                                               log(res[[i]][ind,]),1,sum)))+
      p[i]*log(sum(newdata$wtmult[ind]*newdata[ind,"y"]))
  BIC
}


##Parity 0 modelling
#Prepare data for modelling
newdata0 <- aggregate(qualf4_0 ~ age_0 + coh_0, FUN = function(x) c(y=length(x), q1=length(x[x==1]), q2=length(x[x==2]), q3=length(x[x==3]), q4=length(x[x==4])), subset=coh_0 <= 1982)
newdata0 <- data.frame(newdata0$age_0,newdata0$coh_0,newdata0$qualf4_0)
colnames(newdata0) <- c("a","c","y","y1","y2","y3","y4")
newdata0$a <- newdata0$a - median(15:44)
newdata0$yw <- aggregate(weights_0st ~ age_0 + coh_0, FUN = sum, subset = coh_0 <= 1982)$weights_0st
newdata0$wtmult <- newdata0$yw/newdata0$y
colnames(newdata0) <- c("a","c","y",paste0(1:4),"yw","wtmult")
qmax0 <- 4

newdata02 <- expand.grid(a=agerange,c=cohrange)
newdata02$cc <- newdata02$c-median(1945:1982)
newdata02$ccind <- newdata02$c - 1944
newdata02$cind72 <- ifelse(newdata02$c >= 1972, 1972, newdata02$c)
newdata02$cind82 <- ifelse(newdata02$c >= 1982, 1982, newdata02$c)
newdata02$ccind72 <- newdata02$cind72-1944
newdata02$ccind82 <- newdata02$cind82-1944
newdata02$cc72 <- ifelse(newdata02$c<=1971,0,newdata02$cc)
newdata02$aind <- newdata02$a-14
newdata02$a <- newdata02$a-median(15:44)

newdata02 <- left_join(newdata02,newdata0,by=c("a","c"))
Nac <- nrow(newdata02)
y <- newdata02[,paste0(1:qmax0)]
yind <- which(!is.na(y[,1]))
Nobs <- length(yind)
y[is.na(y)] <- 0
cc <- newdata02$cc
ccind <- newdata02$ccind
cc72 <- newdata02$cc72
ccind72 <- newdata02$ccind72
ccind82 <- newdata02$ccind82
Nc <- max(ccind)
Nc72 <- max(ccind72)
Nc82 <- max(ccind82)
a <- newdata02$a
aind <- newdata02$aind
Na <- max(aind)
wt <- newdata02$wtmult
wt[is.na(wt)] <- 0

standata <- list(Nac=Nac, Na=Na, Nc72=Nc72, Nc82=Nc82, Nc=Nc, Nobs=Nobs, y=y, ccind=ccind, ccind72=ccind72, ccind82=ccind82, cc=cc, cc72=cc72, aind=aind, a=a, yind=yind, wt=wt)

#Fit models
for (i in c(1:7,"5.1","5.2")) {
  cat("Fitting Model",i)
  stanout <- stan(file=paste0("stan/p0_QAC_",i,".stan"),data=standata,chains=1,iter=2000)
  save(stanout,file=paste0("output/p0_QAC_",i,".RData"))
}

#Compare models
files0 <- paste0("output/p0_QAC_",c(1:7,"5.1","5.2"),".RData")
res0 <- sumfunc(files0,newdata02,qmax0)
chi0w2 <- round(chifunc(res0,newdata02,qmax0),2)
p0 <- c(3,6,114,117,145,173,201,118,119)
df0 <- (qmax0-1)*nrow(newdata0)-p0
qchi0 <- round(qchisq(0.95,df0),2)
BIC0 <- round(BICfunc(res0,newdata02,qmax0,p0),2)


##Parity 1 modelling
#Prepare data for modelling
newdata1 <- aggregate(qualf2b_1 ~ age_1 + coh_1, FUN = function(x) c(y=length(x), q1=length(x[x==1]), q2=length(x[x==2])), subset=coh_1 <= 1982)
newdata1 <- data.frame(newdata1$age_1,newdata1$coh_1,newdata1$qualf2b_1)
colnames(newdata1) <- c("a","c","y","y1","y2")
newdata1$a <- newdata1$a - median(15:44)
newdata1$yw <- aggregate(weights_1st ~ age_1 + coh_1, FUN = sum, subset = coh_1 <= 1982)$weights_1st
newdata1$wtmult <- newdata1$yw/newdata1$y
colnames(newdata1) <- c("a","c","y",paste0(1:2),"yw","wtmult")
qmax1 <- 2

newdata12 <- expand.grid(a=agerange,c=cohrange)
newdata12$cc <- newdata12$c-median(1945:1982)
newdata12$ccind <- newdata12$c - 1944
newdata12$cind72 <- ifelse(newdata12$c >= 1972, 1972, newdata12$c)
newdata12$cind82 <- ifelse(newdata12$c >= 1982, 1982, newdata12$c)
newdata12$ccind72 <- newdata12$cind72-1944
newdata12$ccind82 <- newdata12$cind82-1944
newdata12$cc72 <- ifelse(newdata12$c<=1971,0,newdata12$cc)
newdata12$aind <- newdata12$a-14
newdata12$a <- newdata12$a-median(15:44)

newdata12 <- left_join(newdata12,newdata1,by=c("a","c"))
Nac <- nrow(newdata12)
y <- newdata12[,paste0(1:qmax1)]
yind <- which(!is.na(y[,1]))
Nobs <- length(yind)
y[is.na(y)] <- 0
cc <- newdata12$cc
ccind <- newdata12$ccind
cc72 <- newdata12$cc72
ccind72 <- newdata12$ccind72
ccind82 <- newdata12$ccind82
Nc <- max(ccind)
Nc72 <- max(ccind72)
Nc82 <- max(ccind82)
a <- newdata12$a
aind <- newdata12$aind
Na <- max(aind)
wt <- newdata12$wtmult
wt[is.na(wt)] <- 0

standata <- list(Nac=Nac, Na=Na, Nc72=Nc72, Nc82=Nc82, Nc=Nc, Nobs=Nobs, y=y, ccind=ccind, ccind72=ccind72, ccind82=ccind82, cc=cc, cc72=cc72, aind=aind, a=a, yind=yind, wt=wt)

#Fit models
for (i in c(1:4,7,"7.1","7.2")[7]) {
  cat("Fitting Model",i)
  stanout <- stan(file=paste0("stan/p1_QAC_",i,".stan"),data=standata,chains=1,iter=2000)
  save(stanout,file=paste0("output/p1_QAC_",i,".RData"))
}

#Compare models
files1 <- paste0("output/p1_QAC_",c(1:4,7,"7.1","7.2"),".RData")
res1 <- sumfunc(files1,newdata12,qmax1)
chi1w2 <- round(chifunc(res1,newdata12,qmax1),2)
p1 <- c(1,2,38,39,67,58,59)
df1 <- (qmax1-1)*nrow(newdata1)-p1
qchi1 <- round(qchisq(0.95,df1),2)
BIC1 <- round(BICfunc(res1,newdata12,qmax1,p1),2)


##Parity 2 modelling
newdata2 <- aggregate(qualf3b_2 ~ age_2 + coh_2, FUN = function(x) c(y=length(x), q1=length(x[x==1]), q2=length(x[x==2]), q3=length(x[x==3])), subset=coh_2 <= 1982)
newdata2 <- data.frame(newdata2$age_2,newdata2$coh_2,newdata2$qualf3b_2)
colnames(newdata2) <- c("a","c","y","y1","y2","y3")
newdata2$a <- newdata2$a - median(17:44)
newdata2$yw <- aggregate(weights_2st ~ age_2 + coh_2, FUN = sum, subset = coh_2 <= 1982)$weights_2st
newdata2$wtmult <- newdata2$yw/newdata2$y
colnames(newdata2) <- c("a","c","y",paste0(1:3),"yw","wtmult")
qmax2 <- 3

newdata22 <- expand.grid(a=agerange,c=cohrange)
newdata22$cc <- newdata22$c-median(1945:1982)
newdata22$ccind <- newdata22$c - 1944
newdata22$cind72 <- ifelse(newdata22$c >= 1972, 1972, newdata22$c)
newdata22$cind82 <- ifelse(newdata22$c >= 1982, 1982, newdata22$c)
newdata22$ccind72 <- newdata22$cind72-1944
newdata22$ccind82 <- newdata22$cind82-1944
newdata22$cc72 <- ifelse(newdata22$c<=1971,0,newdata22$cc)
newdata22$aind <- newdata22$a-14
newdata22$a <- newdata22$a-median(17:44)

newdata22 <- left_join(newdata22,newdata2,by=c("a","c"))
Nac <- nrow(newdata22)
y <- newdata22[,paste0(1:qmax2)]
yind <- which(!is.na(y[,1]))
Nobs <- length(yind)
y[is.na(y)] <- 0
cc <- newdata22$cc
ccind <- newdata22$ccind
cc72 <- newdata22$cc72
ccind72 <- newdata22$ccind72
ccind82 <- newdata22$ccind82
Nc <- max(ccind)
Nc72 <- max(ccind72)
Nc82 <- max(ccind82)
a <- newdata22$a
aind <- newdata22$aind
Na <- max(aind)
wt <- newdata22$wtmult
wt[is.na(wt)] <- 0

standata <- list(Nac=Nac, Na=Na, Nc72=Nc72, Nc82=Nc82, Nc=Nc, Nobs=Nobs, y=y, ccind=ccind, ccind72=ccind72, ccind82=ccind82, cc=cc, cc72=cc72, aind=aind, a=a, yind=yind, wt=wt)

#Fit models
for (i in c(1:5,7,"5.1","5.2")) {
  cat("Fitting Model",i)
  stanout <- stan(file=paste0("stan/p2_QAC_",i,".stan"),data=standata,chains=1,iter=2000)
  save(stanout,file=paste0("output/p2_QAC_",i,".RData"))
}

#Compare models
files2 <- paste0("output/p2_QAC_",c(1:5,7,"5.1","5.2"),".RData")
res2 <- sumfunc(files2,newdata22,qmax2)
chi2w2 <- round(chifunc(res2,newdata22,qmax2),2)
p2 <- c(2,4,76,78,106,134,88,89)
df2 <- (qmax2-1)*nrow(newdata2)-p2
qchi2 <- round(qchisq(0.95,df2),2)
BIC2 <- round(BICfunc(res2,newdata22,qmax2,p2),2)
