# ============================================================================
# ECONOMETRICS CLASS 3: DATA WRANGLING & VISUALIZATION WITH {tidyverse}
# ============================================================================

# In Class 2, you learned R basics.
# Today we'll learn the MODERN way to work with data:
# - Wrangle data with {dplyr}
# - Visualize with {ggplot2}

# ============================================================================
# SETUP: INSTALLING AND LOADING PACKAGES
# ============================================================================

# Install packages (only need to do this once!)
# Uncomment the lines below if you haven't installed them yet:
# install.packages("tidyverse")
# install.packages("WDI")
# install.packages("wooldridge")

# Load packages
library(tidyverse) # Includes both dplyr AND ggplot2!
library(WDI) # World Bank Development Indicators
library(wooldridge) # Real economics datasets

# ============================================================================
# PART 1: EXPLORING DATA WITH VISUALIZATION
# ============================================================================

# Bring the wage1 dataset into the workspace and store it as a tibble
data("wage1")
wage_data <- wage1 |> tibble()

# Quick look: the first rows, then per-column summary statistics
wage_data |> head()
wage_data |> summary()

# --- THE GRAMMAR OF GRAPHICS (ggplot2) ---

# Basic structure: ggplot(data, aes(x, y)) + geom_*()

# Bare-bones scatter plot: education on x, hourly wage on y
wage_data |>
  ggplot(aes(educ, wage)) +
  geom_point()

# Same plot, now with semi-transparent coloured points and readable labels
wage_data |>
  ggplot(aes(educ, wage)) +
  geom_point(color = "steelblue", alpha = 0.5) +
  labs(
    x = "Years of Education",
    y = "Hourly Wage ($)",
    title = "Education and Wages"
  )

# Overlay a fitted linear model (method = "lm") without its confidence band
wage_data |>
  ggplot(aes(educ, wage)) +
  geom_point(color = "steelblue", alpha = 0.5) +
  geom_smooth(se = FALSE, color = "red", method = "lm") +
  theme_minimal() +
  labs(
    x = "Years of Education",
    y = "Hourly Wage ($)",
    title = "Education and Wages"
  )

# --- HISTOGRAMS: Understanding distributions ---

# Histogram of hourly wages using 30 bins
wage_data |>
  ggplot(aes(wage)) +
  geom_histogram(color = "white", fill = "steelblue", bins = 30) +
  theme_minimal() +
  labs(
    x = "Hourly Wage ($)",
    y = "Count",
    title = "Distribution of Wages"
  )

# What does this tell us about the wage distribution?

# --- BOX PLOTS: Comparing groups ---

# Turn the 0/1 `female` indicator into a labelled grouping column.
# This time the result IS assigned back, so `gender` stays in wage_data.
wage_data <- wage_data |>
  mutate(gender = ifelse(female == 1, "Female", "Male"))

# One box per gender, filled by gender
wage_data |>
  ggplot(aes(gender, wage, fill = gender)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    x = "Gender",
    y = "Hourly Wage ($)",
    title = "Wage Distribution by Gender"
  )

# What does this show about the gender wage gap?

# EXERCISE 1:
# 1. Create a scatter plot of wage vs exper (experience)
# 2. Add a regression line
# 3. Add proper labels and a title
# 4. Use theme_minimal()
# YOUR CODE HERE:

# ============================================================================
# PART 2: FILTER & VISUALIZE
# ============================================================================

# The PIPE operator: |> (read as "then")
# Let's combine filtering with visualization!

# Compare high vs low education workers
# The mutate() below is temporary: the new column exists only inside this
# pipeline. The result is not assigned to any object, so wage_data is unchanged.
wage_data |>
  # Label each worker by whether they have at least 16 years of education
  mutate(education_level = ifelse(educ < 16, "No College", "College+")) |>
  ggplot(aes(exper, wage, color = education_level)) +
  geom_point(alpha = 0.5) +
  # One fitted line per colour group, no confidence band
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    x = "Years of Experience",
    y = "Hourly Wage ($)",
    color = "Education",
    title = "Experience and Wages by Education Level"
  )

# --- FILTER: Keep only rows that meet conditions ---

# Keep workers with at least 16 years of education.
# The filtered rows are stored in a new object; wage_data is untouched.
high_ed_data <- wage_data |> filter(educ >= 16)

# Experience vs wage within that subset, with a fitted line
high_ed_data |>
  ggplot(aes(exper, wage)) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(title = "College Graduates: Experience vs Wage")

# Several conditions at once.
# Comma-separated conditions in filter() are combined with AND, which is the
# same as joining them with '&'. Use '|' between conditions for OR.
wage_data |>
  filter(educ >= 12, exper >= 5) |>
  ggplot(aes(wage)) +
  geom_histogram(color = "white", fill = "coral", bins = 25) +
  theme_minimal() +
  labs(
    x = "Hourly Wage ($)",
    title = "Wage Distribution: 12+ years education, 5+ years experience"
  )

# EXERCISE 2:
# Using the pipe |>:
# 1. Filter to workers with tenure >= 5 years
# 2. Create a scatter plot: educ vs wage
# 3. Color points by gender
# 4. Add appropriate labels
# YOUR CODE HERE:

# ============================================================================
# PART 3: MUTATE & VISUALIZE TRANSFORMATIONS
# ============================================================================

# --- MUTATE: Create new variables ---

# Plot the natural log of wage against education.
# geom_smooth() keeps its default confidence band here (se is not disabled).
wage_data |>
  mutate(log_wage = log(wage)) |>
  ggplot(aes(educ, log_wage)) +
  geom_point(color = "steelblue", alpha = 0.5) +
  geom_smooth(color = "red", method = "lm") +
  theme_minimal() +
  labs(
    x = "Years of Education",
    y = "Log(Hourly Wage)",
    title = "Education and Log(Wage)"
  )

# Bucket experience into three bands, then compare wages across the bands.
# case_when() checks the conditions top to bottom and uses the first match;
# the final TRUE line catches everything not matched above it.
wage_data |>
  mutate(
    experience_group = case_when(
      exper < 10 ~ "0-9 years",
      exper < 20 ~ "10-19 years",
      TRUE ~ "20+ years"
    )
  ) |>
  ggplot(aes(experience_group, wage, fill = experience_group)) +
  geom_boxplot() +
  # Apply the complete theme first, then switch off the redundant fill legend
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    x = "Experience Group",
    y = "Hourly Wage ($)",
    title = "Wages by Experience Level"
  )

# --- FACETING: Small multiples ---

# Build two two-level grouping variables, then draw one histogram per
# combination: education in the facet rows, experience in the facet columns
wage_data |>
  mutate(
    high_ed = ifelse(educ < 16, "No College", "College+"),
    high_exp = ifelse(exper < 10, "Entry Level", "Experienced")
  ) |>
  ggplot(aes(wage)) +
  geom_histogram(color = "white", fill = "steelblue", bins = 20) +
  facet_grid(rows = vars(high_ed), cols = vars(high_exp)) +
  theme_minimal() +
  labs(
    x = "Hourly Wage ($)",
    title = "Wage Distributions by Education and Experience"
  )

# EXERCISE 3:
# 1. Create a new variable: wage_category (low: <5, medium: 5-10, high: >10)
# 2. Count how many workers in each category (use geom_bar)
# 3. Fill bars by gender
# 4. Add labels and theme
# YOUR CODE HERE:

# ============================================================================
# PART 4: GROUP_BY, SUMMARISE & VISUALIZE SUMMARIES
# ============================================================================

# --- SUMMARISE: Calculate statistics ---

# Average wage by education level
#
# Bins are left-closed (right = FALSE) with an open-ended top:
#   [0, 12)   -> "< HS"        (fewer than 12 years)
#   [12, 16)  -> "HS-College"  (12 to 15 years)
#   [16, Inf) -> "College+"    (16 or more years, the same cut-off used for
#                               "College+" elsewhere in this script)
# With cut()'s default right-closed intervals and breaks c(0, 12, 16, 20),
# educ == 0 would fall outside every bin and become NA, and workers with
# exactly 12 or 16 years would be placed in the lower category.
wage_summary <- wage_data |>
  mutate(
    education_level = cut(
      educ,
      breaks = c(0, 12, 16, Inf),
      labels = c("< HS", "HS-College", "College+"),
      right = FALSE
    )
  ) |>
  group_by(education_level) |>
  summarise(
    mean_wage = mean(wage),
    median_wage = median(wage),
    n = n() # number of rows (workers) in each group
  )

# Print the summary table
wage_summary

# Visualize the summary: one bar per education level, labelled with its mean
ggplot(
  wage_summary,
  aes(x = education_level, y = mean_wage, fill = education_level)
) +
  geom_col() +
  # sprintf("%.2f") always prints two decimals (e.g. "$5.90"), whereas
  # paste0("$", round(x, 2)) drops trailing zeros (e.g. "$5.9").
  # vjust = -0.5 places the label just above the top of each bar.
  geom_text(aes(label = sprintf("$%.2f", mean_wage)), vjust = -0.5) +
  labs(
    title = "Average Wage by Education Level",
    x = "Education Level",
    y = "Mean Hourly Wage ($)"
  ) +
  # Apply the complete theme first, then hide the redundant fill legend
  theme_minimal() +
  theme(legend.position = "none")

# --- WORLD BANK DATA: Real international comparisons ---

# Download three World Bank indicators for every available entry in 2022.
# The names on the left of `=` become the column names in the result.
# extra = TRUE requests additional metadata columns; `region` and `income`
# are the ones used below.
country_data_raw <- WDI(
  country = "all",
  indicator = c(
    gdp_per_cap = "NY.GDP.PCAP.KD",
    life_exp = "SP.DYN.LE00.IN",
    population = "SP.POP.TOTL"
  ),
  extra = TRUE,
  start = 2022,
  end = 2022
)

# Keep rows that have both GDP and life expectancy, and whose region is
# known and is not the "Aggregates" pseudo-region.
# Comma-separated conditions in filter() are combined with AND.
country_data <- country_data_raw |>
  filter(
    !is.na(gdp_per_cap),
    !is.na(life_exp),
    !is.na(region),
    region != "Aggregates"
  )

# Plot right away: life expectancy against GDP per capita.
# The x-axis is drawn on a log10 scale with dollar-formatted tick labels.
country_data |>
  ggplot(aes(gdp_per_cap, life_exp)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_smooth(color = "red", method = "lm") +
  scale_x_log10(labels = scales::dollar) +
  theme_minimal() +
  labs(
    x = "GDP per Capita (log scale)",
    y = "Life Expectancy (years)",
    title = "GDP and Life Expectancy (2022)"
  )

# Per-region averages and country counts, highest life expectancy first
region_summary <- country_data |>
  group_by(region) |>
  summarise(
    mean_gdp = mean(gdp_per_cap),
    mean_life_exp = mean(life_exp),
    countries = n()
  ) |>
  arrange(desc(mean_life_exp))

# Horizontal bar chart; reorder() sorts the bars by mean life expectancy
region_summary |>
  ggplot(
    aes(x = reorder(region, mean_life_exp), y = mean_life_exp, fill = region)
  ) +
  geom_col() +
  coord_flip() +
  # Apply the complete theme first, then hide the redundant fill legend
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    x = NULL,
    y = "Life Expectancy (years)",
    title = "Average Life Expectancy by Region (2022)"
  )

# Colour points by income group and fit a separate line for each group.
# Here GDP is logged inside aes() (natural log) instead of via a log axis.
country_data |>
  ggplot(aes(x = log(gdp_per_cap), y = life_exp, color = income)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    x = "Log(GDP per Capita)",
    y = "Life Expectancy (years)",
    color = "Income Level",
    title = "GDP and Life Expectancy by Income Level"
  )

# EXERCISE 4:
# Using country_data:
# 1. Group by income level
# 2. Calculate mean GDP per capita and mean life expectancy
# 3. Create a scatter plot with income levels on x-axis,
#    mean_life_exp on y-axis
# 4. Make point size proportional to number of countries
# YOUR CODE HERE:

# ============================================================================
# TAKE-HOME CHALLENGE
# ============================================================================

# Create a complete visualization workflow:
# 1. Pick one of the datasets (wage_data or country_data)
# 2. Use filter, mutate, and group_by to prepare the data
# 3. Create at least 3 different types of plots (scatter, box, histogram, etc.)
# 4. Each plot should tell a different story about the data
# 5. Make them publication-ready with proper labels and themes

# ============================================================================
# KEY TAKEAWAYS
# ============================================================================
# INTEGRATED WORKFLOW:
# - Always visualize your data as you wrangle it
# - Use |> to chain operations: data |> filter() |> ggplot()
# - Visualization helps you understand your transformations
#
# CORE DPLYR VERBS:
# - filter(): Keep rows that meet conditions
# - mutate(): Create new variables
# - group_by() + summarise(): Calculate group statistics
#
# CORE GGPLOT GEOMS:
# - geom_point(): Scatter plots
# - geom_histogram(): Distributions
# - geom_boxplot(): Compare groups
# - geom_col()/geom_bar(): Bar charts
# - geom_smooth(): Add regression lines
#
# LAYERING:
# - ggplot builds plots in layers with '+'
# - Start with data and aesthetics: ggplot(data, aes(x, y))
# - Add geometries: + geom_point()
# - Add labels: + labs()
# - Add themes: + theme_minimal()