# ============================================================================
# ECONOMETRICS LAB 4: REGRESSION ANALYSIS & DIAGNOSTIC VISUALIZATION
# ============================================================================
# Gender Wage Gap Case Study
#
# In previous labs, you learned:
# - Simple regression
# - Data wrangling with dplyr
# - Visualization with ggplot2
#
# Today we bring it all together for COMPLETE regression analysis:
# - Data exploration and the gender wage gap
# - Log transformations and model specification
# - Regression diagnostics
# - Model comparison
# - Professional reporting

# ============================================================================
# SETUP
# ============================================================================

library(tidyverse)
library(broom) # For tidy model outputs
library(modelsummary) # For output tables
library(patchwork) # For combining plots

# Load data
df <- read_csv("data/qp_sample.csv") |>
  as_tibble()

# ============================================================================
# PART 1: DATA EXPLORATION
# ============================================================================

# Descriptive Statistics
datasummary_skim(df, output = "default")
datasummary_correlation(df, output = "default")

# Calculate the mean wage difference between male and female
df |>
  group_by(male) |>
  summarise(average_wage = mean(wage_lhr))

# Interpret log differences as percentages
# Example: exp(mean_male - mean_female) - 1

# --- Graph the Data: Log Wage Distribution ---

df |>
  ggplot(aes(x = wage_lhr)) +
  geom_histogram(
    bins = 100,
    color = "white",
    fill = "steelblue",
  ) +
  xlim(-1, 3) +
  labs(x = "Log of hourly real wage") +
  theme_minimal()

# --- Graph the Data: Wage Distribution ---

df |>
  ggplot(aes(x = exp(wage_lhr))) + # We can use the variable directly
  geom_histogram(
    bins = 100,
    color = "steelblue",
    fill = "steelblue",
    alpha = 0.5
  ) +
  xlim(0, 7) +
  labs(x = "Hourly real wage") +
  theme_minimal()

# --- Men and Women Comparison ---

df |>
  mutate(male = as.character(male)) |>
  ggplot(aes(x = wage_lhr, color = male, fill = male)) +
  geom_histogram(alpha = 0.5, position = "identity", bins = 50) +
  xlim(-1, 3) +
  scale_color_manual(
    values = c("#f66068", "#27a0cc"),
    labels = c("Women", "Men"),
    name = ""
  ) +
  scale_fill_manual(
    values = c("#f66068", "#27a0cc"),
    labels = c("Women", "Men"),
    name = ""
  ) +
  labs(
    x = "Log of hourly real wage",
    y = "Count"
  ) +
  theme_minimal()

# --- Raw Gender Wage Gap Over Time ---

df_wage_gap <- df |>
  group_by(male, year) |>
  summarise(average_wage = mean(wage_lhr), .groups = "drop") |>
  pivot_wider(
    names_from = male,
    values_from = average_wage,
    names_prefix = "wage_"
  ) |>
  mutate(diff = wage_1 - wage_0)

# Plot the Gender Wage Gap
df_wage_gap |>
  pivot_longer(!year, names_to = "series", values_to = "wage") |>
  ggplot(aes(x = year, y = wage, color = series)) +
  geom_line(linewidth = 1.5) +
  scale_color_manual(
    values = c("#206095", "#f66068", "#27a0cc"),
    labels = c("Wage Gap", "Women", "Men"),
    name = ""
  ) +
  labs(y = "Log of real hourly wage", x = "Year") +
  theme_minimal()


# ============================================================================
# PART 2: SIMPLE LINEAR MODELS & THE GENDER WAGE GAP
# ============================================================================

# --- OLS by hand ---

(beta1 <- cov(df$wage_lhr, df$male) / var(df$male))

(beta0 <- mean(df$wage_lhr) - beta1 * mean(df$male))

# --- Using the lm function ---

m1 <- lm(wage_lhr ~ male, data = df)
summary(m1)

# --- Multiple Linear Models ---

# Control for education
m2 <- lm(wage_lhr ~ male + educ, data = df)
summary(m2)


# ============================================================================
# PART 3: MODEL SPECIFICATION & LOG TRANSFORMATIONS
# ============================================================================

# The dependent variable (wage_lhr) is already in logs.
# Let's explore how education affects wages with different specifications.

# Question: How does education affect wages, controlling for gender?

# Model 1: Log-linear (wage_lhr is already log(wage))
model_educ <- lm(wage_lhr ~ educ + male, data = df)

# Model 2: Add age (proxy for experience)
model_educ_age <- lm(wage_lhr ~ educ + male + age, data = df)
summary(model_educ_age)

# Model 3: Add quadratic age (diminishing returns to experience)
model_educ_age_age2 <- lm(wage_lhr ~ educ + male + age + I(age^2), data = df)

# Compare R-squared
summary(model_educ)$r.squared
summary(model_educ_age)$r.squared
summary(model_educ_age_age2)$r.squared

# Interpretation:
# Since wage_lhr = log(hourly wage), coefficients are semi-elasticities.
# coef on educ: percentage change in wage per unit increase in education
# coef on male: percentage wage premium for being male (the gender gap)

coef(model_educ)["educ"] # % increase per education level
coef(model_educ)["male"] # raw gender gap in log points

# Compare models visually: predicted wage by education for men vs women
tibble(educ = rep(1:8, 2), male = rep(c(0, 1), each = 8)) |>
  mutate(
    pred_educ = predict(model_educ, newdata = pick(everything())),
    pred_educ_age = predict(
      model_educ_age,
      newdata = mutate(pick(everything()), age = 40)
    ),
    pred_educ_age_age2 = predict(
      model_educ_age_age2,
      newdata = mutate(pick(everything()), age = 40)
    ),
    gender = if_else(male == 1, "Men", "Women")
  ) |>
  ggplot(aes(x = educ, y = pred_educ_age_age2, color = gender)) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = c("Men" = "#27a0cc", "Women" = "#f66068")) +
  labs(
    title = "Predicted Log Wage by Education Level",
    subtitle = "Model: log(wage) ~ education + male",
    x = "Education Level",
    y = "Predicted Log(Hourly Wage)",
    color = ""
  ) +
  theme_minimal()

# EXERCISE 1:
# 1. Run a model with wage_lhr ~ educ + male + tenure
# 2. Run a model with wage_lhr ~ educ + male + tenure + age
# 3. Compare R-squared values
# 4. Interpret the coefficient on male in each model
# YOUR CODE HERE:

# ============================================================================
# PART 6: COMPARING MODELS & PROFESSIONAL REPORTING
# ============================================================================

# --- Model Comparison Table ---

models <- list(
  "Baseline (male)" = lm(wage_lhr ~ male, data = df),
  "+ Education" = lm(wage_lhr ~ male + educ, data = df),
  "+ Age" = lm(wage_lhr ~ male + educ + age, data = df),
  "+ Age\u00B2" = lm(wage_lhr ~ male + educ + age + I(age^2), data = df),
  "+ Tenure" = lm(wage_lhr ~ male + educ + age + I(age^2) + tenure, data = df),
  "+ Sector" = lm(
    wage_lhr ~ male + educ + age + I(age^2) + tenure + as.factor(sector),
    data = df
  )
)

# Extract key statistics
model_comparison <- map_df(models, glance, .id = "model") |>
  select(model, r.squared, adj.r.squared, nobs)

model_comparison

# Visualize model comparison
model_comparison |>
  ggplot(aes(x = reorder(model, adj.r.squared), y = adj.r.squared)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = round(adj.r.squared, 3)), vjust = -0.5) +
  coord_flip() +
  labs(
    title = "Model Comparison: Adjusted R-squared",
    x = NULL,
    y = "Adjusted R-squared"
  ) +
  theme_minimal()

# --- How does the gender gap change across models? ---

# Extract the coefficient on male from each model
gender_gap <- map_df(
  models,
  ~ tidy(.x) |> filter(term == "male"),
  .id = "model"
)

gender_gap |>
  ggplot(aes(x = reorder(model, -estimate), y = estimate)) +
  geom_point(size = 3, color = "steelblue") +
  geom_errorbar(
    aes(ymin = estimate - 1.96 * std.error, ymax = estimate + 1.96 * std.error),
    width = 0.2
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  coord_flip() +
  labs(
    title = "Gender Wage Gap Across Model Specifications",
    subtitle = "Coefficient on 'male' with 95% confidence intervals",
    x = NULL,
    y = "Male wage premium (log points)"
  ) +
  theme_minimal()

# --- Complete Analysis Report ---

# Best model
final_model <- lm(wage_lhr ~ male + educ + age + I(age^2) + tenure, data = df)
summary(final_model)

# Summary plot: actual vs predicted
df |>
  mutate(predicted = predict(final_model)) |>
  ggplot(aes(x = wage_lhr, y = predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Model Performance: Actual vs Predicted Log(Wage)",
    x = "Actual Log(Hourly Wage)",
    y = "Predicted Log(Hourly Wage)"
  ) +
  theme_minimal()

# ============================================================================
# TAKE-HOME CHALLENGE
# ============================================================================

# Complete regression analysis project:
# 1. Formulate a research question about the gender wage gap
# 2. Try multiple model specifications (adding controls progressively)
# 3. Track how the gender gap coefficient changes across specifications
# 4. Write 2-3 sentences interpreting your main findings

# ============================================================================
# KEY TAKEAWAYS
# ============================================================================
# MODEL SPECIFICATION:
# - Try multiple specifications (adding controls progressively)
# - Compare using R-squared and visual fit
# - Log models: coefficients are semi-elasticities (percentage effects)
# - Omitted variable bias: the gender gap changes as we add controls
#
# WORKFLOW:
# - Explore data with plots
# - Fit models progressively
# - Compare models
