#-----------------------------------------------#
# Title: 01_risk
#-----------------------------------------------#
# Project: Ordered Logit Empirical Applications
# Author: Carlos Gonzalez
# Date: November 2025
#-----------------------------------------------#
# Description:
# - Creates simulated data for the risk application
# - Includes all functions related to the analysis of
# the risk application 
#----------------------------------------------#

#--------------------------------------#
# SIMULATED DATA
#--------------------------------------#

# Step 1: Generate and Order Menu data #
#--------------------------------------#

# Menu data
data_gen = function(J = 20, 
                    pi_vector = c(1/100, 1/10), 
                    q_vector = c(1/5, 4/5), 
                    seed = 12345){
  
  # J: Number of menus
  # pi_vector: min and max in interval where prices are uniformly sampled from
  # q_vector: min and max in interval where the probabilities of state 1 are uniformly sampled from
  # seed for replicability
  
  # Set seed
  set.seed(seed)
  
  # Prices
  pi = replicate(2, runif(J, min = min(pi_vector), 
                          max = max(pi_vector)))
  
  # Probabilities
  q_1 = runif(J, min = min(q_vector), max = max(q_vector))
  q = cbind(q_1, 1 - q_1)
  
  # cbind data
  menu_data = cbind(pi, q) |> as_tibble()
  colnames(menu_data) = c("pi_1_orig", "pi_2_orig",
                          "q_1_orig", "q_2_orig")
  
  # Compute phi_ratios
  menu_data = menu_data |> mutate(phi_1_orig = pi_1_orig / q_1_orig,
                                  phi_2_orig = pi_2_orig / q_2_orig)
  
  menu_data
  
}

# Order choices such that \phi_1 < \phi_2
# Order menus such that menu 1 : \argmin_j \phi_j = \phi_2 / \phi_1
order_data = function(df){
  
  # df: An object created via data_gen()
  
  # Intuition: If \phi_1 < \phi_2, then keep it as it is; else reverse
  # Create additional metrics
  # Order menus according to \phi_j
  df |> mutate(pi_1 = ifelse(phi_1_orig < phi_2_orig, pi_1_orig, pi_2_orig),
               pi_2 = ifelse(phi_1_orig < phi_2_orig, pi_2_orig, pi_1_orig),
               q_1 = ifelse(phi_1_orig < phi_2_orig, q_1_orig, q_2_orig),
               q_2 = ifelse(phi_1_orig < phi_2_orig, q_2_orig, q_1_orig),
               phi_1 = ifelse(phi_1_orig < phi_2_orig, phi_1_orig, phi_2_orig),
               phi_2 = ifelse(phi_1_orig < phi_2_orig, phi_2_orig, phi_1_orig),
               log_phi_1 = log(phi_1),
               log_phi_2 = log(phi_2),
               phi_j = phi_2 / phi_1)|>
    arrange(phi_j) |>
    mutate(menu = 1:nrow(df)) |>
    select(-ends_with("orig"))
}

# LaTeX export function TABLE 1A
export_table_1a = function(df){
  
  # df: An object created via order_data()
  
  df |> select(menu, phi_j, pi_1, pi_2, q_1, phi_1, phi_2) |>
        kbl(caption = "Risk: Menus indexed by \\phi_j (Table 1A Appendix D)",
            format="latex",
            col.names = c("Menu","\\phi_j", "\\pi_1", "\\pi_2", "q_1", "\\phi_1", "\\phi_2"),
            align="c",
            digits = 3) |>
        kable_minimal(full_width = F,  html_font = "Source Sans Pro")
}

# Step 2: Generate Choice Data
#-----------------------------#
# (CRRA utility - Logistic distribution)

# Generate data for a single menu j
choice_data_j = function(j, df, N,
                         tau, sigma){
  
  # j index for menu
  # df: an object created via order_data()
  # N: number of observations per menu (paper considers {20, 80, 160, 480})
  # tau: location parameter in Logit (main specification sets \tau = 0.6)
  # sigma: scale parameter in Logit (main specification sets \sigma = 0.3)
  
  # Recover data for menu j \in J
  pi_1 = df$pi_1[j]
  pi_2 = df$pi_2[j]
  phi_1 = df$phi_1[j]
  phi_2 = df$phi_2[j]
  
  # Sample N risk_aversion coefficients from logistic distribution
  t = rlogis(N, location = tau, scale = sigma)
  
  # Select optimally under CRRA utility considerations
  # Noting that: If risk aversion <= 0 \implies c_2 = 0
  r = (phi_2 / phi_1)^(-1/t)
  c_2 = ifelse(t <= 0, 0, r / (pi_1 + r * pi_2))
  c_1 = (1 - pi_2 * c_2) / pi_1
  
  # Bind data
  cbind(menu = rep(j, N),
        c_1 = c_1,
        c_2 = c_2,
        t_real = t)
}

# Generate data for all menus
choice_data_gen = function(df, N = 160, tau = 0.6, sigma = 0.3, seed = 1234){
  
  # df: an object created via order_data()
  # N: number of observations per menu (paper considers {20, 80, 160, 480})
  # tau: location parameter in Logit (main specification sets \tau = 0.6)
  # sigma: scale parameter in Logit (main specification sets \sigma = 0.3)
  
  # Set seed
  set.seed(seed)
  
  # Number of menus
  J = nrow(df)
  
  # lapply over choice_data_j
  cons_data = do.call(rbind, 
                      lapply(1:J, function(j) 
                             choice_data_j(j, df, N, tau, sigma))) |> as_tibble()
  colnames(cons_data) = c("menu", "c_1", "c_2", "t_real")
  
  # Merge in menu data and recover quantile info based on consumption ratio r
  cons_data |> left_join(df, by = "menu") |>
               mutate(n_obs = N,
                      r = c_2 / c_1) |>
               group_by(menu) |>
               mutate(quantile = signif(row_number(r) / N * 100)) |>
               ungroup()
  
}

# Auxiliary function for menu selection
menu_selection_verif = function(df, menu_selection, n_menus_random){ 
  
  # df: An object created via choice_data_gen()
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  
  # Select menus according to user specification
  if (is.character(menu_selection)){
    if(menu_selection == "all"){
    } else if(menu_selection == "random"){
      sel_menus = sample(unique(df$menu), n_menus_random)
      df = df |> filter(menu %in% sel_menus)
    }
  } else{
    df = df |> filter(menu %in% menu_selection)
  }
  
  # Selected menus
  print(paste("The following menus are considered"))
  print(unique(df$menu))
  
  # Return df
  df
  
}

# FIGURE 1A
# Plot CDF of consumption ratio for a selection of menus
plot_cdf = function(df_list, menu_selection, n_menus_random = 10, seed = 12345){
  
  # df_list: A list of one or more datasets created via choice_data_gen()
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be plotted i.e. c(1, 7, 14, 15)
  # seed for replicability (only relevant when menus are sampled at random)
  
  # Bind datasets together
  df = do.call(rbind, df_list)
  
  # Select menus according to user specification
  df = menu_selection_verif(df, menu_selection, n_menus_random)
  
  # Plot
  cdf_plot =
    df |> mutate(menu = as.factor(menu)) |>
    rename(Menu = menu) |>
    ggplot() +
    geom_line(aes(x = r, y = quantile, 
                  color = Menu,
                  linetype = Menu,
                  group = interaction(Menu))) +
    labs(group = "Menu", x = TeX(r"(Consumption Ratio $r = c^2_j / c^1_j$)"), 
         y = "Percentile") +
    theme_minimal() +
    #ggtitle("Empirical CDF for each Menu") +
    facet_wrap(~n_obs, nrow = 1) +
    theme(legend.position = "bottom",           
          legend.title = element_text(size = 16),
          legend.text = element_text(size = 16),
          axis.title.x = element_text(size = 16, 
                                      margin = margin(t = 10, r = 0, b = 0, l = 0)),
          axis.title.y = element_text(size = 16),
          strip.text = element_text(size = 16)) +
    guides(color = guide_legend(nrow = 1))
  
  plot(cdf_plot)
  ggsave("output_final/figure1a_risk.png", cdf_plot, height = 5 , width = 10)
}


#--------------------------------------#
# NON-PARAMETRIC ANALYSIS
#--------------------------------------#

# Step 3: Directed Graph
#-----------------------#

# Auxiliary function for quantile_selection
quant_selection_verif = function(df, quant_selection){ 
  
  # df: an object created via choice_data_gen()
  # quant_selection: one of c("percentile", "decile", "num_vector")
  #   - percentile: selects quantiles 1:100. quantiles need to be matched exactly
  #   - decile: selects quantiles 10, 20, ..., 100. quantiles need to be matched exactly
  #   - num_vector: a vector of quantiles to be selected i.e. c(19, 31, 42, 89)
  
  # Select quantiles according to user specification
  if (is.character(quant_selection)){
    if(quant_selection == "percentile"){
      df = df |> filter(quantile %in% 1:100)
      print(paste(length(unique(df$quantile)), "percentiles are considered for analysis"))
    } else if(quant_selection == "decile"){
      df = df |> filter(quantile %in% seq(10, 100, by = 10))
      print(paste(length(unique(df$quantile)), "deciles are considered for analysis"))
    }
  } else{
    df = df |> filter(quantile %in% quant_selection) # For numeric vectors
    print(paste(length(unique(df$quantile)), "quantiles are considered for analysis"))
  }
  
  # Return df
  df
  
}


# Generate Weight Matrix

# Matrix function for single coordinate (j,k) in matrix
matrix_function = function(j, k, df){
  
  # j: row coordinate
  # k: column coordinate (referred to j' in paper. adopt k in code for clarity)
  # df: an object created via order_data()
  
  # Initialize
  c_1_j = df$c_1[j]
  c_2_j = df$c_2[j]
  c_1_k = df$c_1[k]
  c_2_k = df$c_2[k]
  
  log_phi_1_j = df$log_phi_1[j]
  log_phi_1_k = df$log_phi_1[k]
  log_phi_2_j = df$log_phi_2[j]
  log_phi_2_k = df$log_phi_2[k]
  
  # When sampled at random from a prob distribution with continuous density
  # The only tie with positive probability is c_2_j = c_2_k = 0
  # We break ties in favor of the "cheapest" c_2
  # (i.e. if c_2_j = c_2_k then assign c_2_j > c_2_k \iff phi_j < phi_k)
  # Note that Prob(phi_j = phi_k) = 0
  
  if(c_2_j == c_2_k){ # Break ties
    
    # Compute phi_ratios
    phi_j = df$phi_2[j] / df$phi_1[j]
    phi_k = df$phi_2[k] / df$phi_1[k]
    
    # Break in favor of "cheapest" c_2
    if (phi_j < phi_k){c_2_j = c_2_j + 0.0001} else{c_2_k = c_2_k + 0.0001}
  } 
  
  # Initialize
  int_jk = -Inf
  int_kj = -Inf
  
  # Cases as defined in Table 2 (Appendix D)
  if(c_1_j > c_2_j & c_2_j > c_1_k & c_1_k > c_2_k){ # Case 1
    int_jk = log_phi_1_k - log_phi_1_j
    int_kj = Inf
  } else if(c_1_j > c_1_k & c_1_k > c_2_j & c_2_j >= c_2_k){ # Case 2
    int_jk = max(log_phi_1_k - log_phi_1_j, log_phi_2_k - log_phi_2_j)
    int_kj = log_phi_2_j - log_phi_1_k
  } else if(c_1_j > c_1_k & c_1_k > c_2_k & c_2_k >= c_2_j){ # Case 3
    int_jk = log_phi_1_k - log_phi_1_j
    int_kj = log_phi_2_j - log_phi_2_k
  } else if(c_1_k > c_1_j & c_1_j > c_2_j & c_2_j >= c_2_k){ # Case 4
    int_jk = log_phi_2_k - log_phi_2_j
    int_kj = log_phi_1_j - log_phi_1_k
  } else if(c_1_k > c_1_j & c_1_j > c_2_k & c_2_k >= c_2_j){ # Case 5
    int_jk = log_phi_2_k - log_phi_1_j
    int_kj = max(log_phi_1_j - log_phi_1_k, log_phi_2_j - log_phi_2_k)
  } else if(c_1_k > c_2_k & c_2_k > c_1_j & c_1_j > c_2_j){ # Case 6
    int_jk = Inf
    int_kj = log_phi_1_j - log_phi_1_k
  }
  
  # Export coordinates and results
  rbind(c(j, k , int_jk),
        c(k, j, int_kj))
}

# Weight-matrix generating function (for single quantile) and selection of menus
# Plot weight matrix for median quantile - FIGURE 2
# It mapplys matrix function for all menu pairs
gen_weight_matrix = function(df, p = 50,
                             menu_selection = "all", n_menus_random = 10,
                             plot = F, weights = F, seed = 12345){
  
  # df: an object created via choice_data_gen()
  # p: quantile to be considered
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  # plot: logical indicating if plotting weighted matrix
  # weights: logical indicating whether edge width in plot should be proportional to weights
  # seed for replicability (only relevant when menu_selection = random)
  
  # Set seed
  set.seed(seed)
  
  # Select quantile
  df = df |> filter(quantile == p)
  
  # Select menus
  df = menu_selection_verif(df, menu_selection, n_menus_random)
  
  # Generate coords for fast matrix inputation
  coords = expand.grid(j = 1:nrow(df), k = 1:nrow(df))
  coords = coords[coords$j > coords$k,] # we input a value of 0 for w[j, j]
  
  # Mapply matrix function
  matrix_list = mapply(matrix_function, 
                       j = coords$j,
                       k = coords$k,
                       MoreArgs = list(df), 
                       SIMPLIFY = FALSE)
  
  # Recover matrix from list input
  matrix_tibble = do.call(rbind, matrix_list) |> as_tibble(.name_repair = "minimal")
  colnames(matrix_tibble) = c("row", "column", "value")
  mat =  matrix_tibble |>
         pivot_wider(names_from = column, values_from = value) |>
         arrange(row) |>
         column_to_rownames(loc = "row") |>
         as.matrix()
  
  # Set diagonal = 0 [not relevant for computation]
  diag(mat) = 0
  
  # Plotting weighted matrix
  if (plot){
    # Plotting software does not allow for infinite weights
    # Replace infinite weights with expression below
    max_mat = max(mat[mat<Inf]) * 1.1
    mat = ifelse(mat == Inf, max_mat, mat)
    
    # Create graph from matrix
    g = graph_from_adjacency_matrix(mat, mode = "directed", 
                                    weighted = T, diag = F)
    
    # Assign names if relevant
    if (is.numeric(menu_selection)){
      V(g)$name = menu_selection
    } else{V(g)$name = 1:nrow(mat)}
    
    # Plot aesthetics
    
    # Vertex Color
    V(g)$color = "white"
    
    # Edge attributes
    E(g)$color = ifelse(E(g)$weight >= 0, "lightgrey", "black")
    
    # Arrow size
    E(g)$arrow.size = 0.01
    
    # If weights = T, transform weights appropriately
    if (weights){
      E(g)$weight = 5*abs(E(g)$weight) # Thicken weights proportionally for display considerations
    } else{
      E(g)$weight = 1
    }
    
    # Saving plot
    filename = paste0("output_final/figure2_risk", unique(df$n_obs),
                      "w_", weights,  ".pdf")
    pdf(filename)
    
    plot(g,layout = layout_with_fr,  # Fruchterman-Reingold layout for display
         vertex.size = 15,            # Adjust vertex size
         vertex.label.color = "black",# Vertex label color
         vertex.label.cex = 1,      # Vertex label size
         edge.curved = 0.2,           # Add slight curvature to edges
         edge.width = E(g)$weight)    # Use weight as width
    
    dev.off()
    
    # Plotting plot
    plot(g,layout = layout_with_fr,  # Fruchterman-Reingold layout for display
         vertex.size = 15,            # Adjust vertex size
         vertex.label.color = "black",# Vertex label color
         vertex.label.cex = 1,      # Vertex label size
         edge.curved = 0.2,           # Add slight curvature to edges
         edge.width = E(g)$weight)    # Use weight as width
  }
  
  # Export matrix
  mat
}

# Step 4. Bellman Ford
#----------------------#

# Note: To avoid overwriting temporal objects while vectorizing
# this program creates external files in your device
# These files are deleted upon completion of the task

# BF for single quantile
bellman = function(mat, p){
  
  # mat: a matrix object created via gen_weight_matrix
  # p: quantile to be considered
  
  # Create graph
  g = graph_from_adjacency_matrix(mat, mode = "directed", 
                                  weighted = T, diag = F)
  
  # Run Bellman-Ford Algorithm
  # It is a fully connected graph hence it suffices to look for paths between any two vertices
  suppressWarnings(b <<- try(
    bellmanFord(g, from = V(g)[1], to = V(g)[2]), silent = T))
  
  # Stores error in outside folder to avoid in-console displays
  if(inherits(b, "try-error")){
    export = c("quantile" = p) |> t()
    write.table(export, file = "output_final/errors.csv", sep = ",",
                col.names = F, row.names = F, append = T)
  }
}

# Full BF routine for a single quantile
full_rout_bell = function(df, p, menu_selection,
                          n_menus_random, seed){
  
  # df: an object created via choice_data_gen()
  # p: quantile to be considered
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  # seed for replicability (only relevant when menu_selection = random)
  
  # Generate matrix (plotting is not allowed within full_rout_bell)
  mat = gen_weight_matrix(df, p = p, 
                          menu_selection = menu_selection, n_menus_random,
                          seed = seed)
  # Run BF
  bellman(mat, p)
}

# Workers function for multiprocessing
# Number of workers is set in 00_main
n_workers_function = function(workers = n_workers){return(workers)}

# Parallel process over quantiles
# Produces results in TABLE 3 (Appendix D)
lapply_routine_bell = function(df, quant_selection = "percentile",
                               menu_selection = "all", n_menus_random = 10, seed = 12345){
  
  # df: an object created via choice_data_gen()
  # quant_selection: one of c("percentile", "decile", "num_vector")
  #   - percentile: selects quantiles 1:100. quantiles need to be matched exactly
  #   - decile: selects quantiles 10, 20, ..., 100. quantiles need to be matched exactly
  #   - num_vector: a vector of quantiles to be selected i.e. c(19, 31, 42, 89)
  
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  # seed for replicability (only relevant for menu_selection = "random")
  
  # Select quantiles according to user specification
  df = quant_selection_verif(df, quant_selection)
  
  # Initialize directory for storing errors while vectorizing
  if (file.exists("output_final/errors.csv")){unlink("output_final/errors.csv")}
  
  # Plan multi-session
  plan(multisession(workers = n_workers_function()))
  
  # Vectorize full_routine bell
  future_lapply(sort(unique(df$quantile)), 
                full_rout_bell, df = df,
                menu_selection = menu_selection,
                n_menus_random = n_menus_random,
                seed = seed,
                future.seed = T)
  
  if (file.exists("output_final/errors.csv")){
    
    # Recover errors for analysis and display
    summary_tab = read.csv("output_final/errors.csv", header = FALSE) |> as_tibble()
    colnames(summary_tab) = "quantile"
    
    # Delete external file
    unlink("output_final/errors.csv")
    
    # Display
    return(summary_tab |> arrange(quantile))
  } else{print("Your dataset satisfies SAREU for all considered quantiles")}
}


#--------------------------------------#
# SEMI-PARAMETRIC ANALAYSIS
#--------------------------------------#

# Step 5. Induced parameters
#---------------------------#

# Recover induced parameters based on consumption and menu data
risk_aversion_params = function(df){
  
  # df: an object created via choice_data_gen()
  
  # Number of menus
  J = length(unique(df$menu))
  
  # Recover t and compute quantile 
  df |> mutate(t = ifelse(c_2 == 0, 0,
                          -log(phi_j) / log(r))) |>
        group_by(menu, n_obs) |>
        arrange(t) |>
        mutate(quantile_t = signif(row_number(t) / n_obs * 100) ) |>
        ungroup()
}

# Plot induced parameters ECDF
# FIGURE 3A
plot_induced_param = function(df_list, menu_selection = "random", n_menus_random = 10,
                              recover_params, seed = 12345){
  
  # df_list: a list of one or more objects created via choice_data_gen()
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  # seed for replicability (only relevant for menu_selection = random)
  
  # Set seed
  set.seed(seed)
  
  # Bind datasets together
  df = do.call(rbind, df_list)
  
  # Select menus according to user specification
  df = menu_selection_verif(df, menu_selection, n_menus_random)
  
  # Recover t-parameter via recover_params
  df = recover_params(df)
  
  # Plot
  plot_cdf_induced_param =
    df |> filter(t < 2) |> # For plotting considerations only
    mutate(Menu = as.factor(menu)) |>
    ggplot() +
    geom_line(aes(x = t, y = quantile_t, 
                  color = Menu,
                  linetype = Menu,
                  group = interaction(Menu))) +
    labs(group = "Menu", x = "Induced Risk Aversion Parameter", 
         y = "Percentile") +
    theme_minimal() +
    #ggtitle("Empirical Induced Parameter CDF for each Menu") +
    facet_wrap(~n_obs, nrow = 1) +
    theme(legend.position = "bottom",           
          legend.title = element_text(size = 16),
          legend.text = element_text(size = 16),
          axis.title.x = element_text(size = 16, 
                                      margin = margin(t = 10, r = 0, b = 0, l = 0)),
          axis.title.y = element_text(size = 16),
          strip.text = element_text(size = 16)) +
    guides(color = guide_legend(nrow = 1))
  
  plot(plot_cdf_induced_param)
  ggsave("output_final/figure3a_risk.png", plot_cdf_induced_param, 
         height = 5 , width = 10)
  
} 

# Step 6. Equality of distributions #
#-----------------------------------#

# Non-parametric testing AD (adjusted for J samples)
# Documentation: https://cran.r-project.org/web/packages/kSamples/kSamples.pdf

similarity_testing = function(df, recover_params,
                              menu_selection = "all", n_menus_random = 10,
                              quant_selection = "percentile"){
  
  # df: an object created via choice_data_gen()
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  
  # quant_selection: one of c("percentile", "decile", "num_vector")
  #   - percentile: selects quantiles 1:100. quantiles need to be matched exactly
  #   - decile: selects quantiles 10, 20, ..., 100. quantiles need to be matched exactly
  #   - num_vector: a vector of quantiles to be selected i.e. c(19, 31, 42, 89)
  
  # Check and Select relevant menus according to user specification
  df = menu_selection_verif(df, menu_selection, n_menus_random)
  
  # Select quantiles according to user specification
  df = quant_selection_verif(df, quant_selection)
  
  # Recover t-parameter via recover_params
  df = recover_params(df)
  
  # Testing #
  #---------#
  
  # Drop param = Inf values to compute integral
  if(max(df$t) == Inf){
    print("One or more induced parameters have been inputted to be Infinity")
    df = df |> filter(t < Inf)
  }
  
  test = ad.test(t ~ menu, data = df)
  return(test$ad[1,]) # Version 1 of AD test given continuous data
}

# TABLE 6
similarity_testing_wrap = function(df_list, recover_params,
                                   menu_selection = "all", n_menus_random = 10,
                                   quant_selection = "percentile"){
  
  # df_list: a list of one or more objects created via choice_data_gen()
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  # menu_selection: one of c("all", "random", "num_vector")
  #   - all: selects all menus in unique(df$menu)
  #   - random: selects n_menus_random at random from unique(df$menu)
  #             if menu_selection != random, n_menus_random is ignored
  #   - num_vector: a vector of menus to be selected i.e. c(1, 7, 14, 15)
  
  # quant_selection: one of c("percentile", "decile", "num_vector")
  #   - percentile: selects quantiles 1:100. quantiles need to be matched exactly
  #   - decile: selects quantiles 10, 20, ..., 100. quantiles need to be matched exactly
  #   - num_vector: a vector of quantiles to be selected i.e. c(19, 31, 42, 89)
  
  test_table =
    sapply(X = df_list, FUN = similarity_testing,
           recover_params = recover_params,
           menu_selection = menu_selection,
           n_menus_random = n_menus_random,
           quant_selection = quant_selection) |> t() |> as_tibble()
  
  cbind(c("20 obs", "80 obs", "160 obs", "480 obs"), test_table) |>
    kbl(format="latex",
        col.names = c("N obs", "AD Statistic","Stand. AD Stat.", "Asymp p-value"),
        align="c",
        digits = 3) |>
    kable_minimal(full_width = F,  html_font = "Source Sans Pro")
}

#--------------------------------------#
# PARAMETRIC ANALAYSIS
#--------------------------------------#

# Step 7. Mass analysis
#----------------------#
mass_analysis = function(df_list){
  
  # df_list: a list of one or more objects created via choice_data_gen()
  
  # Bind datasets together
  df = do.call(rbind, df_list)
  
  # Mass analysis
  mass_analysis_df =
    df |> mutate(larger_than_0 = ifelse(c_2 == 0, 1, 0)) |>
          group_by(menu, n_obs) |>
          summarize(mass_at_0 = mean(larger_than_0) * 100,
                    largest_r = max(r),
                    phi_j = mean(phi_2 / phi_1)) |>
          pivot_wider(names_from = n_obs, 
                      values_from = c("largest_r", "mass_at_0"))
  
  # Output as list  
  list("largest_r_df" = mass_analysis_df |> select(menu, phi_j, starts_with("largest_r")),
       "mass_at_0_df" = mass_analysis_df |> select(menu, phi_j, starts_with("mass_at")))
}

# LaTeX output for mass analysis table
# TABLE 7 and 8
mass_output = function(df){
  
  # df: a tibble created via mass_analysis()
  
  df |> select(menu, phi_j, ends_with("0")) |>
        kbl(format="latex",
            col.names = c("Menu","\\Phi_j", "20 obs", "80 obs", "160 obs", "480 obs"),
            align="c",
            digits = 3) |>
        kable_minimal(full_width = F,  html_font = "Source Sans Pro")
}

# Step 8. CLA Property
#---------------------#

# Recover CLA property relevant data (sum of t and sum of log odds)
cla_data = function(df, recover_params, 
                    choices_per_menu = 5, seed = 1234){
  
  # df: an object created via choice_data_gen()
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  # choices_per_menu: number of random choices to be selected by menu
  # seed for replicability
  
  # Set seed
  set.seed(seed)
  
  # Recover t-parameter via recover_params
  df = recover_params(df)
  
  # Initialize df
  cla_df = tibble(menu = NA, t = NA, t_prime = NA, 
                  log_odds_t = NA, log_odds_t_prime = NA,
                  sum_t = NA, sum_log_odds = NA)
  
  # Recover n observations per menu and merge together
  for (j in 1:length(unique(df$menu))){
    
    # Select menu
    df_menu = df |> filter(menu == j)
    
    # Compute ECDF
    ecdf_t = ecdf(df_menu |> pull(t)) # Including 0s
    
    # Sample interior observations
    index = sample(which(df_menu$t > 0 & df_menu$t < max(df_menu$t)), choices_per_menu)
    
    # Compute log_odds by using ecdf_t
    subset_menu_t = df_menu[index,] |> mutate(ecdf = ecdf_t(t),
                                              log_odds_t = log(ecdf / (1 - ecdf))) |>
      select(t, log_odds_t)
    
    # Create a mirror subset to sum across
    subset_menu_t_prime = subset_menu_t |> rename(t_prime = t,
                                                  log_odds_t_prime = log_odds_t)
    
    # Merge both data sets and sum across them 
    grid_t = expand_grid("t" = subset_menu_t$t, "t_prime" = subset_menu_t$t)
    df_t = grid_t |> left_join(subset_menu_t, by = "t") |>
                      left_join(subset_menu_t_prime, by = "t_prime") |>
                      mutate(sum_t = t + t_prime,
                             sum_log_odds = log_odds_t + log_odds_t_prime,
                             menu = j) |>
                      select(menu, t, t_prime, log_odds_t, 
                             log_odds_t_prime, sum_t, sum_log_odds)
    
    # Bind data
    cla_df = rbind(cla_df, df_t)
    
  }
  
  cla_df[-1,] # Exclude initial empty row
  cla_df$n_obs = unique(df$n_obs)
  cla_df
  
}

# FIGURE 4A
# Plot CLA property
plot_cla = function(df_list, true_params){
  
  # df_list: a list of objects created via cla_data()
  # true_params: a vector of true parameters to be compared to
  
  # Initialize true_params
  tau = true_params[1]
  sigma = true_params[2]
  
  # Bind datasets together
  df = do.call(rbind, df_list)
  
  # Create segment according to true values
  # y = 1 / sigma * x - 2 * tau / sigma
  
  x_start = 0
  x_end = 4
  y_start = 1/sigma *(x_start - 2 * tau)
  y_end = 1/sigma *(x_end - 2 * tau)
  
  # Auxiliary vector with true parameter information
  true_log_vector = paste0("Log(", tau,", ", sigma, ")")
  
  # Plot
  cla_plot =
    df |> ggplot() +
    geom_point(aes(x = sum_t, y = sum_log_odds)) +
    geom_segment(aes(x = x_start, y = y_start, xend = x_end, yend = y_end,
                     color = true_log_vector), linewidth = 1) +
    scale_color_manual(values = "red") +
    theme_minimal() +
    labs(x = "Sum Risk Aversion Parameter",
         y = TeX(r"(Sum $\log$ $\frac{F_j(t)}{1-F_j(t)}$)"),
         color = "") +
    #ggtitle("CLA Property") +
    facet_wrap(~n_obs, nrow = 1) +
    theme(legend.position = "bottom",           
          legend.title = element_text(size = 16),
          legend.text = element_text(size = 16),
          axis.title.x = element_text(size = 16, 
                                      margin = margin(t = 10, r = 0, b = 0, l = 0)),
          axis.title.y = element_text(size = 16),
          strip.text = element_text(size = 16))
  
  plot(cla_plot)
  ggsave("output_final/figure4a_risk.png", cla_plot, height = 5 , width = 10)
  
}

# Step 9. Censored logit parametric MLE
#--------------------------------------#

# Logistic density
likelihood_function_logit = function(logis_params, df_geq0, M_0, M){
  
  # logis_params: a vector of parameters to evaluate likelihood function at
  # df_geq0: an object created via recover_params() containing only non-0 observations
  # M_0: number of zero observations in original dataset
  # M: number of observations in original dataset
  
  # Initialize parameters
  tau = logis_params[1]
  sigma = logis_params[2]
  
  # Likelihood function as in Step 9 of Section 6.2
  - M_0*log(1 + exp(tau/sigma)) - sum((df_geq0$t - tau)/sigma + log(sigma) + 
                                        2*log(1 + exp(-(df_geq0$t - tau)/sigma)))
}

# Gradient of logistic density
gradient_function_logit = function(logis_params, df_geq0, M_0, M){
  
  # logis_params: a vector of parameters to evaluate likelihood function at
  # df_geq0: an object created via recover_params() containing only non-0 observations
  # M_0: number of zero observations in original dataset
  # M: number of observations in original dataset
  
  # Initialize parameters
  tau = logis_params[1]
  sigma = logis_params[2]
  
  # Gradient wrt to \tau and \sigma of likelihood function logit
  c(1 / sigma * (- M_0 * exp(tau/sigma) / (1 + exp(tau / sigma)) + 
                   (M - M_0) - sum( 2*exp(-(df_geq0$t - tau) / sigma) / (1 + exp(-(df_geq0$t - tau) / sigma)))),
    1 / sigma *(tau / sigma * M_0 * exp(tau/sigma) / (1 + exp(tau / sigma)) + 
                  - (M - M_0) - sum( 2*(df_geq0$t - tau) / sigma * (exp(-(df_geq0$t - tau) / sigma)) / (1 + exp(-(df_geq0$t - tau) / sigma)) +
                                       + (df_geq0$t - tau)/ sigma )))
  
}

# TABLE 11 A
# Parametric estimation
parametric_estimation = function(df, recover_params){
  
  # df: an object created via choice_data_gen()
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  
  # Recover t-parameter via recover_params
  if (class(recover_params) == "function"){
    df = recover_params(df)
  }
  
  # Auxiliary variables (see paper for reference and notation)
  M = nrow(df)
  M_0 = nrow(df[df$t == 0,])
  df_geq0 = df |> filter(t > 0) # Strictly positive observations
  
  # Start values in optimization
  start_values = c(mean(df_geq0$t), sd(df_geq0$t))
  
  # Optimization
  optim_results = optim(par = start_values, # Uses plug-in coefficients as starting values
                        fn = likelihood_function_logit,
                        gr = gradient_function_logit,
                        df_geq0 = df_geq0,
                        M_0 = M_0,
                        M = M,
                        method = 'Nelder-Mead',
                        control=list(fnscale = -1)) # To maximize (not minimize)

  c("tau" = round(optim_results$par[1], 3),
    "sigma" = round(optim_results$par[2], 3),
    "avg_ll" = round(optim_results$value / unique(df$n_obs), 3))
  
}

# GoF Test via Bootrstrap
#-------------------------#

# Compute AD statistic
ad_statistic = function(df, param_results) {
  
  # df: an object created via choice_data_gen()
  # param_results: a vector created via parametric_estimation()
  
  # Recover parameters given parametric_estimation() output
  tau = param_results["tau"]
  sigma = param_results["sigma"]
  
  # Compute AD test in t > 0 data
  df_geq0 = df |> filter(t > 0)
  
  # True logistic at x = df_geq0$t
  F_logis = plogis((df_geq0$t - tau) / sigma)
  
  # Empirical CDF
  F_emp = ecdf(df_geq0$t)(df_geq0$t)
  
  # AD statistic
  sum((F_emp - F_logis)^2 / (F_logis * (1 - F_logis)))
}

# Simulate data (Logistic) under the null according to parametric bootstrap
simulate_censored_logistic = function(n_draws, param_results) {
  
  # n_draws: number of draws (= nrow(df))
  # param_results: a vector created via parametric_estimation()
  
  tau = param_results["tau"]
  sigma = param_results["sigma"]
  
  t_sim = rlogis(n_draws, location = tau, scale = sigma)
  t_sim[t_sim < 0] = 0  # Apply censoring
  
  # Export data
  tibble(t = t_sim, n_obs = n_draws)
}

# Bootstrap wrap
bootstrap_wrap = function(n_draws, param_results, recover_params){
  
  # n_draws: number of draws (= nrow(df))
  # param_results: a vector created via parametric_estimation()
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  
  # Simulate data
  df_boot = simulate_censored_logistic(n_draws, param_results)
  
  # Parametric estimation of simulated data
  params_boot = parametric_estimation(df_boot, "no")
  
  # Compute AD statistic
  ad_statistic(df_boot, params_boot)
  
}

# TABLE 11 B
# Bootstrap AD test function
bootstrap_ad_test = function(df, recover_params, B = 1000, seed = 12345) {
  
  # df: an object created via choice_data_gen()
  # recover_params: a function which outputs an induced parameter t (like risk_aversion_params)
  # B: number of bootrstrap draws
  
  # Set seed for replicability
  set.seed(seed)
  
  # Step 1: Estimate parameters via parametric estimation
  params = parametric_estimation(df, recover_params)
  
  # Step 2: Compute AD test on original data
  df = recover_params(df)
  true_ad = ad_statistic(df, params)
  
  # Step 3: Bootstrap
  n_draws = nrow(df)
  
  ad_bootrstrap = future_replicate(B, 
                                   bootstrap_wrap(n_draws = n_draws,
                                                  param_results = params,
                                                  recover_params = recover_params))
  
  # Step 4: Compute p-value
  p_val = mean(ad_bootrstrap >= true_ad)
  
  c("true_ad" = true_ad, 
    "p_value" = p_val)
}