Answer the following questions and complete the exercises in
RMarkdown. Please embed all of your code and push your final work to
your repository. Your final lab report should be organized, clean, and
run free from errors. Remember, you must remove the # for
the included code chunks to run. Be sure to add your name to the author
header above. For any included plots, make sure they are clearly
labeled. You are free to use any plot type that you feel best
communicates the results of your analysis.
Make sure to use the formatting conventions of RMarkdown to make your report neat and clean!
library(tidyverse)
options(scipen = 999)
For this homework, we are going to use the Gapminder data on population
(pop), GDP per capita (gdp), and life expectancy (lex). These data were
downloaded and compiled using the gapminder_compile.Rmd
script in the data folder.
Let’s focus our attention on data from the past 100 years, between 1925 and 2025.
gapminder <- read_csv("data/gapminder.csv") %>%
filter(between(year, 1925, 2025))
dim(gapminder)
## [1] 19695 5
names(gapminder)
## [1] "country" "year" "pop" "gdp" "lex"
glimpse(gapminder)
## Rows: 19,695
## Columns: 5
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "A…
## $ year <dbl> 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 19…
## $ pop <dbl> 8748779, 8289705, 7854721, 7442562, 7121951, 6888085, 6736411,…
## $ gdp <dbl> 1113.793, 1127.866, 1142.117, 1156.548, 1171.161, 1185.959, 12…
## $ lex <dbl> 38.87, 39.01, 39.16, 39.30, 39.44, 39.59, 39.73, 39.88, 40.02,…
gapminder %>%
summarize(n_countries = n_distinct(country))
## # A tibble: 1 × 1
## n_countries
## <int>
## 1 195
pop_growth_top5 <- gapminder %>%
filter(year %in% c(1925, 2025)) %>%
select(country, year, pop) %>%
pivot_wider(names_from = year, values_from = pop, names_prefix = "year_") %>%
mutate(pop_growth = year_2025 - year_1925,
abs_pop_growth = abs(pop_growth)) %>%
arrange(desc(abs_pop_growth)) %>%
slice_head(n = 5)
pop_growth_top5
## # A tibble: 5 × 5
## country year_1925 year_2025 pop_growth abs_pop_growth
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 India 304406019 1463865525 1159459506 1159459506
## 2 China 473741473 1416096094 942354621 942354621
## 3 Indonesia 49685263 285721236 236035973 236035973
## 4 Pakistan 22624808 255219554 232594746 232594746
## 5 USA 116193761 347275807 231082046 231082046
top_country <- pop_growth_top5 %>%
slice_head(n = 1) %>%
pull(country)
gapminder %>%
filter(country == top_country) %>%
ggplot(aes(year, pop)) +
geom_line(color = "steelblue", linewidth = 1) +
labs(
title = paste("Population Growth:", top_country),
x = "Year",
y = "Population"
) +
theme_minimal()
gapminder %>%
filter(year %in% c(1925, 2025)) %>%
group_by(year) %>%
summarize(
min_lex = min(lex, na.rm = TRUE),
median_lex = median(lex, na.rm = TRUE),
mean_lex = mean(lex, na.rm = TRUE),
max_lex = max(lex, na.rm = TRUE),
.groups = "drop"
)
## # A tibble: 2 × 5
## year min_lex median_lex mean_lex max_lex
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1925 21.0 35.9 38.1 63.5
## 2 2025 52.8 74.8 73.9 85.6
geom_density). What
do you notice about the shift in life expectancy over time?gapminder %>%
filter(year %in% c(1925, 2025)) %>%
ggplot(aes(lex, fill = factor(year))) +
geom_density(alpha = 0.4) +
labs(
title = "Life Expectancy Distribution in 1925 vs 2025",
x = "Life Expectancy",
y = "Density",
fill = "Year"
) +
theme_minimal()
## Warning: Removed 13 rows containing non-finite outside the scale range
## (`stat_density()`).
gapminder %>%
filter(year == 2025, country != "Monaco") %>%
ggplot(aes(gdp, lex)) +
geom_point(alpha = 0.6, color = "darkgreen") +
geom_smooth(method = "lm", se = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "GDP per Capita vs Life Expectancy (2025)",
x = "GDP per Capita",
y = "Life Expectancy"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
gdp_growth_top5 <- gapminder %>%
filter(year %in% c(1975, 2025), country != "Monaco") %>%
select(country, year, gdp) %>%
pivot_wider(names_from = year, values_from = gdp, names_prefix = "year_") %>%
mutate(gdp_growth = year_2025 - year_1975,
abs_gdp_growth = abs(gdp_growth)) %>%
arrange(desc(abs_gdp_growth)) %>%
slice_head(n = 5)
gdp_growth_top5
## # A tibble: 5 × 5
## country year_1975 year_2025 gdp_growth abs_gdp_growth
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Singapore 12896. 137906. 125010. 125010.
## 2 Ireland 14336. 119406. 105071. 105071.
## 3 Luxembourg 30950. 131038. 100088. 100088.
## 4 Guyana 5071. 79371. 74300. 74300.
## 5 Norway 25513. 94896. 69382. 69382.
top5_gdp_countries <- gdp_growth_top5 %>%
pull(country)
gapminder %>%
filter(year >= 1975, country %in% top5_gdp_countries) %>%
ggplot(aes(year, gdp, color = country)) +
geom_line(linewidth = 1) +
labs(
title = "GDP per Capita Trends (Top 5 Absolute Growth Countries)",
x = "Year",
y = "GDP per Capita",
color = "Country"
) +
theme_minimal()
lex_growth_table <- gapminder %>%
filter(year %in% c(1975, 2025)) %>%
select(country, year, lex) %>%
pivot_wider(names_from = year, values_from = lex, names_prefix = "year_") %>%
mutate(lex_growth = year_2025 - year_1975) %>%
arrange(desc(lex_growth)) %>%
slice_head(n = 10)
lex_growth_table
## # A tibble: 10 × 4
## country year_1975 year_2025 lex_growth
## <chr> <dbl> <dbl> <dbl>
## 1 Cambodia 24.7 71.0 46.4
## 2 Maldives 51.7 80.5 28.8
## 3 Ethiopia 43.2 70.7 27.5
## 4 Bangladesh 50.6 76.3 25.7
## 5 Lao 45.5 70.6 25.1
## 6 Rwanda 45.0 70.0 25
## 7 Timor-Leste 47.2 72.2 25.0
## 8 Eritrea 40.7 65.4 24.8
## 9 Bhutan 51.0 74.8 23.8
## 10 Myanmar 47.0 70.8 23.7
lex_growth_table %>%
ggplot(aes(reorder(country, lex_growth), lex_growth)) +
geom_col(fill = "tomato") +
coord_flip() +
labs(
title = "Top 10 Countries by Life Expectancy Gain (1975-2025)",
x = "Country",
y = "Life Expectancy Gain (years)"
) +
theme_minimal()
The analyses below add common EDA checks that are usually done before modeling or interpretation.
quality_summary <- gapminder %>%
summarize(
n_rows = n(),
n_columns = ncol(gapminder),
n_countries = n_distinct(country),
min_year = min(year, na.rm = TRUE),
max_year = max(year, na.rm = TRUE)
)
missing_summary <- gapminder %>%
summarize(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "missing_n"
) %>%
mutate(missing_pct = 100 * missing_n / nrow(gapminder)) %>%
arrange(desc(missing_n))
duplicate_country_year <- gapminder %>%
count(country, year) %>%
filter(n > 1)
country_year_coverage <- gapminder %>%
group_by(country) %>%
summarize(n_years = n_distinct(year), .groups = "drop")
quality_summary
## # A tibble: 1 × 5
## n_rows n_columns n_countries min_year max_year
## <int> <int> <int> <dbl> <dbl>
## 1 19695 5 195 1925 2025
missing_summary
## # A tibble: 5 × 3
## variable missing_n missing_pct
## <chr> <int> <dbl>
## 1 lex 357 1.81
## 2 gdp 202 1.03
## 3 pop 25 0.127
## 4 country 0 0
## 5 year 0 0
duplicate_country_year
## # A tibble: 0 × 3
## # ℹ 3 variables: country <chr>, year <dbl>, n <int>
country_year_coverage %>% summarize(min_years = min(n_years), max_years = max(n_years))
## # A tibble: 1 × 2
## min_years max_years
## <int> <int>
## 1 101 101
missing_summary %>%
ggplot(aes(reorder(variable, missing_n), missing_n)) +
geom_col(fill = "gray50") +
coord_flip() +
labs(
title = "Missing Values by Variable",
x = "Variable",
y = "Number of Missing Values"
) +
theme_minimal()
numeric_summary <- gapminder %>%
summarize(
across(
c(pop, gdp, lex),
list(
min = ~min(., na.rm = TRUE),
q1 = ~quantile(., 0.25, na.rm = TRUE),
median = ~median(., na.rm = TRUE),
mean = ~mean(., na.rm = TRUE),
q3 = ~quantile(., 0.75, na.rm = TRUE),
max = ~max(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE)
)
)
) %>%
pivot_longer(
cols = everything(),
names_to = c("variable", ".value"),
names_sep = "_"
)
outliers_2025 <- gapminder %>%
filter(year == 2025) %>%
select(country, pop, gdp, lex) %>%
pivot_longer(cols = c(pop, gdp, lex), names_to = "variable", values_to = "value") %>%
group_by(variable) %>%
mutate(
q1 = quantile(value, 0.25, na.rm = TRUE),
q3 = quantile(value, 0.75, na.rm = TRUE),
iqr = q3 - q1,
lower = q1 - 1.5 * iqr,
upper = q3 + 1.5 * iqr,
is_outlier = value < lower | value > upper
) %>%
ungroup() %>%
filter(is_outlier) %>%
arrange(variable, desc(value)) %>%
select(variable, country, value, lower, upper)
numeric_summary
## # A tibble: 3 × 8
## variable min q1 median mean q3 max sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 pop 651 741871. 3915338. 22712317. 12044477 1463865525 9.54e7
## 2 gdp 316. 2124. 4661. 12054. 13082. 475911. 2.17e4
## 3 lex 4.18 48.1 61.9 59.2 70.9 85.6 1.44e1
outliers_2025
## # A tibble: 26 × 5
## variable country value lower upper
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 gdp Monaco 475911. -44812. 90281.
## 2 gdp Singapore 137906. -44812. 90281.
## 3 gdp Luxembourg 131038. -44812. 90281.
## 4 gdp Ireland 119406. -44812. 90281.
## 5 gdp Qatar 116057. -44812. 90281.
## 6 gdp Norway 94896. -44812. 90281.
## 7 lex Lesotho 52.8 52.9 94.8
## 8 pop India 1463865525 -43724636 78608214
## 9 pop China 1416096094 -43724636 78608214
## 10 pop USA 347275807 -43724636 78608214
## # ℹ 16 more rows
gapminder %>%
filter(year == 2025) %>%
pivot_longer(cols = c(pop, gdp, lex), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = variable, y = value)) +
geom_boxplot(fill = "lightblue") +
scale_y_log10() +
labs(
title = "2025 Distributions by Variable (Log Scale)",
x = "Variable",
y = "Value (log10 scale)"
) +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
global_yearly <- gapminder %>%
group_by(year) %>%
summarize(
global_population = sum(pop, na.rm = TRUE),
weighted_mean_lex = weighted.mean(lex, pop, na.rm = TRUE),
median_gdp = median(gdp, na.rm = TRUE),
.groups = "drop"
)
global_yearly
## # A tibble: 101 × 4
## year global_population weighted_mean_lex median_gdp
## <dbl> <dbl> <dbl> <dbl>
## 1 1925 1965415847 40.2 1999.
## 2 1926 1982851730 40.7 2071.
## 3 1927 2000538108 39.2 2129.
## 4 1928 2018498174 41.5 2191.
## 5 1929 2037115392 40.6 2284.
## 6 1930 2056439755 42.2 2315.
## 7 1931 2076473021 42.5 2320.
## 8 1932 2097208807 42.6 2254.
## 9 1933 2118603025 41.7 2282.
## 10 1934 2140276958 43.8 2309.
## # ℹ 91 more rows
global_yearly %>%
ggplot(aes(year, weighted_mean_lex)) +
geom_line(color = "firebrick", linewidth = 1) +
labs(
title = "Population-Weighted Global Life Expectancy Over Time",
x = "Year",
y = "Weighted Mean Life Expectancy"
) +
theme_minimal()
global_yearly %>%
ggplot(aes(year, median_gdp)) +
geom_line(color = "navy", linewidth = 1) +
scale_y_log10() +
labs(
title = "Global Median GDP per Capita Over Time",
x = "Year",
y = "Median GDP per Capita (log10 scale)"
) +
theme_minimal()
correlation_by_year <- gapminder %>%
filter(gdp > 0) %>%
group_by(year) %>%
summarize(
cor_log10gdp_lex = cor(log10(gdp), lex, use = "complete.obs"),
.groups = "drop"
)
correlation_by_year
## # A tibble: 101 × 2
## year cor_log10gdp_lex
## <dbl> <dbl>
## 1 1925 0.665
## 2 1926 0.669
## 3 1927 0.691
## 4 1928 0.686
## 5 1929 0.687
## 6 1930 0.688
## 7 1931 0.663
## 8 1932 0.646
## 9 1933 0.599
## 10 1934 0.710
## # ℹ 91 more rows
correlation_by_year %>%
ggplot(aes(year, cor_log10gdp_lex)) +
geom_line(color = "darkgreen", linewidth = 1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
labs(
title = "Correlation Between log10(GDP) and Life Expectancy Over Time",
x = "Year",
y = "Pearson Correlation"
) +
theme_minimal()
Please knit your work as an .html file and upload to Canvas. Homework is due before the start of the next lab. No late work is accepted. Make sure to use the formatting conventions of RMarkdown to make your report neat and clean!