# Script encoding: UTF-8
# R version: 3.6.3
# Author: Emmanuel Aramendia

# This script provides the code for Granger causality testing as performed in the paper
# Title: Moving from final to useful stage in energy-economy analysis: a critical assessment
# doi: 10.1016/j.apenergy.2020.116194

# The script is decomposed as follows:
# (1) Importing libraries
# (2) Importing data (see file ...)
# (3) Computing 1-year growth rates
# (4) Testing time series for stationarity
# (5) Running causality tests
#     i. Setting up the result data frame
#     ii. Initialising n
#     iii. Choosing energy and gdp variables
#     iv. Tests with grangertest function
#     v. Tests with causality function
#     vi. Tests with Toda-Yamamoto procedure
# (6) Exporting data to csv if needed.


#### (1) Importing needed libraries ####
# Add versions.

# Tidyverse packages
library(dplyr)# script working with version 1.0.2
library(readr)# script working with version 1.3.1

# Other packages
library(lmtest) # script working with version 0.9.37
library(tseries) # script working with version 0.10.47
library(vars) # script working with version 1.5.3
library(aod) # script working with version 1.3.1

#### (2) Importing data ####
raw_ES_data <- readr::read_csv(file = "",# Fill in the path
                    col_types = cols(
                      .default = readr::col_double()
                    ))


#### (3) Computing 1-year growth rates ####
ES_data <- raw_ES_data %>%
  dplyr::mutate(g_rgdpe = (rgdpe - dplyr::lag(rgdpe))/dplyr::lag(rgdpe) * 100,
         g_rgdpo = (rgdpo - dplyr::lag(rgdpo))/dplyr::lag(rgdpo) * 100,
         g_rgdpna = (rgdpna - dplyr::lag(rgdpna))/dplyr::lag(rgdpna) * 100,
         g_PE = (PE - dplyr::lag(PE))/dplyr::lag(PE) * 100,
         g_FEG = (FEG - dplyr::lag(FEG))/dplyr::lag(FEG) * 100,
         g_UEG = (UEG - dplyr::lag(UEG))/dplyr::lag(UEG) * 100,
         g_UXG = (UXG - dplyr::lag(UXG))/dplyr::lag(UXG) * 100)


#### (4) Testing for stationarity ####

test <- ES_data[-1,] # The data frame that is used subsequently for testing for stationarity. We get rid of the first row, because growth rates are full of NAs.

# Note on tests:
# - The Augmented Dickey-Fuller (ADF) test needs being rejected; so p-value needs to be < 0.1
# - The Phillips-Perron (PP) test needs being rejected; so p-value needs to be < 0.1
# - The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test needs being accepted; so p-value needs to be > 0.1

# (4.i) Testing final energy growth rate g_FEG
tseries::adf.test(test$g_FEG)
tseries::pp.test(test$g_FEG)
tseries::kpss.test(test$g_FEG)

tseries::adf.test(diff(test$g_FEG,1))
tseries::pp.test(diff(test$g_FEG,1))
tseries::kpss.test(diff(test$g_FEG,1))
# Once differentiated, all tests are passing.


# (4.ii) Testing useful energy growth rate g_UEG
tseries::adf.test(test$g_UEG)
tseries::pp.test(test$g_UEG)
tseries::kpss.test(test$g_UEG)

tseries::adf.test(diff(test$g_UEG,1))
tseries::pp.test(diff(test$g_UEG,1))
tseries::kpss.test(diff(test$g_UEG,1))
# Once differentiated, all tests are passing.


# (4.iii) Testing useful exergy growth rate g_UXG
tseries::adf.test(test$g_UXG)
tseries::pp.test(test$g_UXG)
tseries::kpss.test(test$g_UXG)

tseries::adf.test(diff(test$g_UXG,1))
tseries::pp.test(diff(test$g_UXG,1))
tseries::kpss.test(diff(test$g_UXG,1))
# Once differentiated, all tests are passing.


# (4.vi) Testing gdp (as rgdpe) growth rate g_rgdpe
tseries::adf.test(test$g_rgdpe)
tseries::pp.test(test$g_rgdpe)
tseries::kpss.test(test$g_rgdpe)

tseries::adf.test(diff(test$g_rgdpe,1))
tseries::pp.test(diff(test$g_rgdpe,1))
tseries::kpss.test(diff(test$g_rgdpe,1))
# Once differentiated, all tests are passing.


# (4.v) Testing gdp (as rgdpo) growth rate g_rgdpo
tseries::adf.test(test$g_rgdpo)
tseries::pp.test(test$g_rgdpo)
tseries::kpss.test(test$g_rgdpo)

tseries::adf.test(diff(test$g_rgdpo,1))
tseries::pp.test(diff(test$g_rgdpo,1))
tseries::kpss.test(diff(test$g_rgdpo,1))
# Once differentiated, all tests are passing.


# (4.vi) Testing gdp (as rgdpna) growth rate g_rgdpna
tseries::adf.test(test$g_rgdpna)
tseries::pp.test(test$g_rgdpna)
tseries::kpss.test(test$g_rgdpna)

tseries::adf.test(diff(test$g_rgdpna,1))
tseries::pp.test(diff(test$g_rgdpna,1))
tseries::kpss.test(diff(test$g_rgdpna,1))
# Once differentiated, all tests are passing.


#### (5) Running causality tests ####

# (5.i) Setting up results data frame

results <- data.frame(
  test_id = integer(0),
  energy = character(0),# what energy variable is used
  gdp = character(0),# what gdp variable is used
  granger_test = character(0),# what causality test procedure is used
  lags = integer(0),# how many lags are included in the VAR model
  ncriteria = integer(0),# how many criteria support the use of the number of lags (minimum = 1; maximum = 4)
  year_1 = integer(0),# beginning year for the test
  year_2 = integer(0),# end year for the test
  time_length = integer(0),# number of years included in the test
  validity = integer(0),
  p_e_to_gdp = numeric(0),# p-value for causality running from energy consumption growth to gdp growth
  p_gdp_to_e = numeric(0) # p-value for causality running from gdp growth to energy consumption growth
)


# (5.ii) Initialising n, which is the ID of the test
n <- 1


# (5.iii) Choosing the energy and gdp variables for which we want to run tests
energy.list <- list("g_FEG", "g_UEG", "g_UXG")
gdp.list <- list("g_rgdpe", "g_rgdpo", "g_rgdpna")


# 5.(iv) Running the tests using the grangertest (lmtest package) function
for (ENERGY in energy.list){
  for (GDP in gdp.list){
    
    dy <- data.frame(
      denergy = diff(ES_data[-1,][[ENERGY]],1), # Taking first - empty - row out and differentiating
      dgdp = diff(ES_data[-1,][[GDP]],1)
    )
    
    # The two following for loops ensure that we are covering every single time period of length >= 30 years, so every time period with >= 30 data points.
    for (l in 30:55){# length of time period
      for (i in 1:(55-l+1)){# varying beginning year within the given length
        
        var_selection <- vars::VARselect(dy[i:(i+l-1),],lag=8,type="both") # Good indexes
        
        # Each of the four criterion returns a (potentially) different lag number
        lag_values <- list(var_selection$selection[[1]], 
                           var_selection$selection[[2]],
                           var_selection$selection[[3]],
                           var_selection$selection[[4]])
        
        # Now we test that for a given number of lags returned by a criterion, the model is stable and non serially correlated
        for (lag in unique(lag_values)){# Unique to avoid testing twice the same model
          
          number_criteria <- (lag_values[1]==lag)*1 + (lag_values[2]==lag)*1 + (lag_values[3]==lag)*1 + (lag_values[4]==lag)*1
          
          var <- vars::VAR(dy[i:(i+l-1),], p = lag, type = "const")
          validity_check <- 1# by default, the model is deemed valid
          
          ser_corr_check <- vars::serial.test(var)[2][[1]][3]$p.value
          stab_check <- max(abs(vars::roots(var)))
          
          # If one of the two conditions (stability and non serial correlation) is not verified, then the model is classified as non valid
          if ((ser_corr_check < 0.1) || (stab_check > 1)){
            validity_check <- 0
          }
          
          # Granger tests + stocking results
          gdp_to_e <- lmtest::grangertest(dy$denergy[i:(i+l-1)] ~ dy$dgdp[i:(i+l-1)], order = lag)
          e_to_gdp <- lmtest::grangertest(dy$dgdp[i:(i+l-1)] ~ dy$denergy[i:(i+l-1)], order = lag)
          
          to_add <- data.frame(
            test_id = n,
            energy = ENERGY,
            gdp = GDP,
            granger_test = "grangertest",
            lags = lag,
            ncriteria = number_criteria,
            year_1 = 1960+i,
            year_2 = 1960+i+l-1,
            time_length = l,
            validity = validity_check,
            p_e_to_gdp = e_to_gdp[4][[1]][2],
            p_gdp_to_e = gdp_to_e[4][[1]][2]
          )
          
          results <- rbind(results, to_add)
          
          n <- n+1
        }
      }
    }
    
  }
}


# 5.(v) Running the tests using the causality (from vars package) function
for (ENERGY in energy.list){
  for (GDP in gdp.list){
    
    dy <- data.frame(
      denergy = diff(ES_data[-1,][[ENERGY]],1), # Taking first - empty - row out and differentiating
      dgdp = diff(ES_data[-1,][[GDP]],1)
    )
    
    # The two following for loops ensure that we are covering every single time period of length >= 30 years, so every time period with >= 30 data points.
    for (l in 30:55){# length of time period
      for (i in 1:(55-l+1)){ # varying beginning year within the given length
        
        var_selection <- vars::VARselect(dy[i:(i+l-1),],lag=8,type="both") # Good indexes
        
        # Each of the four criterion returns a (potentially) different lag number
        lag_values <- list(var_selection$selection[[1]], 
                           var_selection$selection[[2]],
                           var_selection$selection[[3]],
                           var_selection$selection[[4]])
        
        # Now we test that for a given number of lags returned by a criterion, the model is stable and non serially correlated
        for (lag in unique(lag_values)){# Unique to avoid testing twice the same model
          
          number_criteria <- (lag_values[1]==lag)*1 + (lag_values[2]==lag)*1 + (lag_values[3]==lag)*1 + (lag_values[4]==lag)*1
          
          var <- vars::VAR(dy[i:(i+l-1),], p = lag, type = "const")
          validity_check <- 1# by default, the model is deemed valid
          
          ser_corr_check <- vars::serial.test(var)[2][[1]][3]$p.value
          stab_check <- max(abs(vars::roots(var)))
          
          # If one of the two conditions (stability and non serial correlation) is not verified, then the model is classified as non valid
          if ((ser_corr_check < 0.1) || (stab_check > 1)){
            validity_check <- 0
          }
          
          # Granger tests + stocking results
          p_gdp_causing_energy <- vars::causality(var, cause = "dgdp")$Granger
          p_energy_causing_gdp <- vars::causality(var, cause = "denergy")$Granger
          
          to_add <- data.frame(
            test_id = n,
            energy = ENERGY,
            gdp = GDP,
            granger_test = "causality",
            lags = lag,
            ncriteria = number_criteria,
            year_1 = 1960+i,
            year_2 = 1960+i+l-1,
            time_length = l,
            validity = validity_check,
            p_e_to_gdp = p_energy_causing_gdp$p.value,
            p_gdp_to_e = p_gdp_causing_energy$p.value
          )
          
          results <- rbind(results, to_add)
          
          n <- n+1
        }
      }
    }
    
  }
}


# 5.(vi) Running the tests using the Toda-Yamamoto (TY) procedure
# We follow the TY procedure implementation as described in https://datazen.info/toda-yamamoto-implementation-in-r/
for (ENERGY in energy.list){
  for (GDP in gdp.list){
    
    y <- data.frame(
      energy = ES_data[-1,][[ENERGY]],# Taking first - empty - row out
      gdp = ES_data[-1,][[GDP]]
    )
    
    # The two following for loops ensure that we are covering every single time period of length >= 30 years, so every time period with >= 30 data points.
    for (l in 30:56){# length of time period
      for (i in 1:(56-l+1)){# varying beginning year within the given length
        
        var_selection <- vars::VARselect(y[i:(i+l-1),],lag=8,type="both") # Good indexes
        
        # Each of the four criterion returns a (potentially) different lag number
        lag_values <- list(var_selection$selection[[1]], 
                           var_selection$selection[[2]],
                           var_selection$selection[[3]],
                           var_selection$selection[[4]])
        
        # Now we test that for a given number of lags returned by a criterion, the model is stable and non serially correlated
        for (lag in unique(lag_values)){# Unique to avoid testing twice the same model
          
          number_criteria <- (lag_values[1]==lag)*1 + (lag_values[2]==lag)*1 + (lag_values[3]==lag)*1 + (lag_values[4]==lag)*1
          
          var_prelim <- vars::VAR(y[i:(i+l-1),], p = lag, type = "const")
          
          validity_check <- 1# by default, the model is deemed valid
          
          ser_corr_check <- vars::serial.test(var_prelim)$serial$p.value
          stab_check <- max(abs(vars::roots(var_prelim)))
          
          # If one of the two conditions (stability and non serial correlation) is not verified, then the model is classified as non valid
          if (ser_corr_check < 0.1 || stab_check > 1){
            validity_check <- 0
          }
          
          var <- vars::VAR(y[i:(i+l-1),], p = (lag+1), type = "const") # As all these time series are I(1)
          
          # Granger tests + stocking results
          p_gdp_causing_energy <- aod::wald.test(b=coef(var$varresult[[1]]), Sigma=vcov(var$varresult[[1]]), Terms = seq(2,2*lag,2))$result$chi2[3]
          p_energy_causing_gdp <- aod::wald.test(b=coef(var$varresult[[2]]), Sigma=vcov(var$varresult[[2]]), Terms= seq(1,2*lag-1,2))$result$chi2[3]
          
          to_add <- data.frame(
            test_id = n,
            energy = ENERGY,
            gdp = GDP,
            granger_test = "toda-yamamoto",
            lags = lag,
            ncriteria = number_criteria,
            year_1 = 1960+i,
            year_2 = 1960+i+l-1,
            time_length = l,
            validity = validity_check,
            p_e_to_gdp = p_energy_causing_gdp,
            p_gdp_to_e = p_gdp_causing_energy
          )
          
          results <- rbind(results, to_add)
          
          n <- n+1
        }
      }
    }
    
  }
}


#### (6) Exporting results to csv if needed ####
readr::write_csv(x = results,
                 path = "",# fill in path.
                 col_names = TRUE)