#############################################################################################
#                                                                                           #
# Code to replicate results in "Combining individual- and population-level data to develop  #
# a Bayesian parity-specific fertility projection model"                                    #
# Email: J.V.Ellison@soton.ac.uk                                                            #
#                                                                                           #    
#############################################################################################

## 1. Set-up
#To run this code you will need R version 3.4.0 or above and be able to install
#packages, in particular rstan, which is required to implement the Hamiltonian
#Monte Carlo methodology to fit the models. This will probably also require you to install
#a C++ toolchain (on Windows this is Rtools, which is a separate installer rather than an R
#package), as explained in the rstan installation instructions that
#can be found at "https://github.com/stan-dev/rstan/wiki".

#This code requires the user to obtain permission to download the Special Licence version
#of Understanding Society. We use the seventh edition:
#University of Essex. Institute for Social and Economic Research, NatCen Social Research,
#Kantar Public. (2017). Understanding Society: Waves 1-7, 2009-2016: Special Licence Access.
#[data collection]. 7th Edition. UK Data Service. SN: 6931, "http://doi.org/10.5255/UKDA-SN-6931-7".
#The code has been written for the 'TAB' format of the Special Licence data files.
#More recent editions contain minor updates to the data so results may differ slightly.

#This code also downloads HDI data from the UNDP Human Development Reports website
#("https://hdr.undp.org/sites/default/files/2021-22_HDR/HDR21-22_Composite_indices_complete_time_series.csv").
#Note that the 2018 values extracted in the code are not identical to those used in the analyses due
#to data updates, and therefore results may differ slightly.

#Please note that the steps are designed to be followed sequentially. Also note that
#although "source" commands have been used, it is advisable to look at each R script before
#sourcing it in case any modifications or actions are required.

#Set working directory to the location of the source file
#NB: Every script in this file is sourced through a relative path ("r/..."), so the working
#directory must be the folder that contains the "r" subfolder.
setwd("<path>")

#Set 'dir' to location of UKHLS Wave 1 data files
#NOTE(review): 'dir' is presumably read by the UKHLS processing scripts sourced in Step 2, so
#the variable name should be left unchanged - confirm against those scripts.
dir <- "<path>"

#Install required packages (only need to do this when running R code for first time)
source("r/install_packages.r")

#Load packages
library(tidyverse)
library(magrittr)
library(haven)
library(readxl)
library(curl)
library(mgcv)
library(rstan)
library(sp)
library(boot)
library(RColorBrewer)
library(zoo)
library(ggpattern)
#Register the Calibri font family with the Windows graphics devices.
#windowsFonts()/windowsFont() only exist in Windows builds of R, so the call is skipped on
#other platforms instead of stopping the script with a "could not find function" error.
#NOTE(review): on non-Windows systems any plot that requests the "Calibri" family presumably
#needs that font to be made available by other means - confirm in the plotting scripts.
if (.Platform$OS.type == "windows") {
  windowsFonts(Calibri = windowsFont("Calibri"))
}


## 2. Data processing (Section 2 and Appendix A)
#Processes UKHLS wave 1 data (the two scripts are sourced in order, part 1 then part 2)
#NOTE(review): presumably these scripts read the data files from 'dir' set in Step 1 - confirm
source("r/process_ukhls_1.r")
source("r/process_ukhls_2.r")

#Downloads and processes the ONS parity-specific fertility rates from
#"https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/livebirths/adhocs/11482fertilityratesbyparity1934to2018englandandwales".
#Saves rates, births, exposures and proportions for ages 15-44, cohorts 1945-2003 and maximum year 2018
#in "data" as "ONS2018_allc.RData", and for cohorts 1945-1992 and maximum year 2008 as "ONS2018_resc.RData".
#NB: As the script downloads the data, an internet connection is needed for this step.
source("r/process_ons.r")

#Generates and saves Figure 1 in "plots"
source("r/plot_fig_1.r")


## 3. Qualification imputation (Section 3.1 and Appendix B)
#Performs model selection process, fits chosen imputation model and performs mean imputation
#NB: 'mod_sel' is set immediately before "r/impute_Q.r" is sourced; presumably the script reads
#it to decide whether to run the model selection process - confirm in the script.
mod_sel <- FALSE #change to TRUE if you want to perform model selection 
source("r/impute_Q.r")

#Generates and saves Figures 11 and 13 in "plots"
source("r/plot_fig_11_13.r")


## 4. Model selection (Section 3.3)
#Performs model selection process for each parity and saves fitted GAMs in "output"
#NB: The user needs to change 'par_sel' on line 468 to perform model selection for parities 1/2/3+
#(the line number presumably refers to "r/mod_sel.r", as 'par_sel' is not set in this file - confirm)
source("r/mod_sel.r")


## 5. Covariate models
#NB: Each model script below is sourced before the scripts that plot its figures.
#NOTE(review): presumably the plotting scripts read the Stan output saved in "output" by the
#preceding model script - confirm before running them out of order.

#Q|A,C (Section 4.2)
#Performs model selection process described in Appendix E and saves Stan output in "output"
source("r/model_QAC.r")

#Generates and saves Figures 16 and 17 in "plots"
source("r/plot_fig_16_17.r")

#Generates and saves Figure 2 in "plots"
source("r/plot_fig_2.r")

#T|A,(Q) (Section 4.3)
#Performs model selection process described in Appendix F and saves Stan output in "output"
source("r/model_TAQ.r")

#Generates and saves Figure 18 in "plots"
source("r/plot_fig_18.r")

#Generates and saves Figure 3 in "plots"
source("r/plot_fig_3.r")


## 6. Integrated models - fitting (Section 4.5)
#Fits UKHLS-only (1:0) GAMs to all parities and saves Stan output in "output"
source("r/fit_UKHLS_only.r")

#Fits integrated models and saves Stan output in "output" for parities 0, 1, 2 and 3+.
#Note that the code fits the 1:1 model by default - the 9:1, 2:1, 1:2 and 1:9 models can be fitted
#by uncommenting the appropriate lines in the R scripts.
#The code uses the ONS data up to 2018 by default. To perform backtesting and only use the ONS data up to
#2013, change "back" from FALSE to TRUE in line 10 of the R scripts.
#NB: "back" is not set in this file, so the change has to be made inside the scripts themselves
#(presumably in each of the four fit_integrated_p*.r scripts below - confirm).
source("r/fit_integrated_p0.r")
source("r/fit_integrated_p1.r")
source("r/fit_integrated_p2.r")
source("r/fit_integrated_p3.r")


## 7. Integrated models - results (Sections 5.1-5.2)
#Processes integrated model output from Step 6 (for a given parity, the UKHLS-only model requires
#at least one integrated model to have been fitted in order to perform the marginalization). 
#The code also assumes that the same model(s) have been fitted for each parity. The user must
#indicate which models have been fitted on line 12 (presumably of "r/process_integrated.r" - confirm).
#Also note that the code to produce Figures 4-6 assumes that the models represented in these plots
#have been fitted.
#The code also extracts and saves the samples of the marginalized probabilities across the parities
#for each model separately in "output", as these are required for Step 9.
source("r/process_integrated.r")

#Generates and saves Figure 4 in "plots"
source("r/plot_fig_4.r")

#Generates and saves Figure 5 in "plots"
source("r/plot_fig_5.r")

#Generates and saves Figure 6 in "plots"
source("r/plot_fig_6.r")


## 8. Integrated models - backtesting results (Section 5.3)
#Processes backtesting model output from Step 6. The code requires the user to specify (in line 11,
#presumably of "r/process_backtesting.r" - confirm) an integrated model that has been fitted to each
#parity using the ONS data up to 2018 and up to 2013 (i.e. with back = FALSE and back = TRUE
#respectively in Step 6).
#The code also extracts and saves the samples of the marginalized probabilities across the parities
#for the backtesting models in "output", as these are required for Step 9.
source("r/process_backtesting.r")

#Generates and saves Figure 7 in "plots"
source("r/plot_fig_7.r")


## 9. Integrated models - aggregate forecasts (Section 5.4)
#Downloads and processes the ONS 2018-based NPP midyear population projections from
#"https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationprojections/datasets/z2zippedpopulationprojectionsdatafilesgbandenglandandwales/2018based/tablez2opendata18ewandgb.zip".
#Saves midyear population projections and midyear fertility rate projections for 2019-2050 for
#females aged 15-44 in "data" as "ew_npp_2018.RData".
#NB: This script cannot be sourced directly as it is necessary to save an XML file as an xlsx file.
#NOTE(review): Given the NB above, the source() call below is presumably not expected to run
#unattended from start to finish; open "r/process_npp.r" and work through it manually - confirm.
source("r/process_npp.r")

#Uses the marginalized probability samples saved in Steps 7 and 8 to generate ASFR and CFR forecasts
#for each model. Requires the user to specify which models have been fitted on line 16 (for back = FALSE)
#and line 21 (for back = TRUE). Saves forecasts in "output" as "agg_forecast.RData".
source("r/agg_forecast.r")

#Generates and saves Figure 8 in "plots"
source("r/plot_fig_8.r")

#Generates and saves Figure 9 in "plots"
source("r/plot_fig_9.r")
#NB: Only the principal NPP projection line is added because to obtain the low and high fertility
#variants requires combining the separate variant projections for England and Wales.
#NB: The credible intervals for the Schmertmann et al. (2014) model are not provided but can be
#obtained straightforwardly by the user by downloading the R code provided on the project website
#("https://schmert.net/cohort-fertility/"). Note the following minor modifications to the code:
#1. Instead of using the "VV" HFD files (period fertility rates by calendar year, age reached during
#the year and birth cohort), to be consistent with the ONS data setup we use the "RR" files (period
#fertility rates by calendar year and age in completed years) and so use approximate cohorts calculated
#as Cohort = Year - Age. We also replace the England & Wales rates with the ONS estimates for consistency
#with our integrated models, and do not include the "additional HFD-style data for 12 more countries".
#2. We set the last year of observation ("forecast.year") as 2018 rather than 2010, thereby fitting
#the model to the 1964-2003 cohorts instead of the 1956-1995 cohorts. Consistent with this, we
#extend the historical cohort range by 8 years, thereby using the 1900-1957 cohorts for training.
