# blob: 853b6296de82f78d3cb61b49aa14c8ef218fd0f9 [file] [log] [blame]
#!/usr/bin/env Rscript
#
# Plot corpus sizes by year for Stern magazine
#
library(RKorAPClient)
library(ggplot2)
library(scales) # For comma_format()
# Years covered by the analysis
years <- 1990:2024
# One virtual corpus definition per year, each limited to Stern
stern_vc_prefix <- "corpusTitle=Stern & pubDate in"
vcs <- paste(stern_vc_prefix, years)
# Open the KorAP connection
kco <- KorAPConnection(verbose = TRUE)
# Announce the request, then query the statistics for all virtual corpora
year_span <- range(years)
cat(
  "Retrieving corpus sizes for Stern magazine from", year_span[1],
  "to", year_span[2], "...\n"
)
corpus_data <- corpusStats(kco, vc = vcs, as.df = TRUE)
# Attach the year to each row for plotting
# (assumes one result row per entry of `vcs`, in the same order)
corpus_data[["year"]] <- years
# Create ggplot column plot of token counts per year
# The x-axis breaks (every 5 years) are derived from `years`, so the axis
# follows any change to the analyzed range instead of repeating its bounds
year_breaks <- seq(min(years), max(years), by = 5)
g <- ggplot(corpus_data, aes(x = year, y = tokens)) +
  geom_col(fill = "steelblue", alpha = 0.7) +
  scale_x_continuous(breaks = year_breaks) +
  # Show token counts with thousands separators rather than raw digits
  scale_y_continuous(labels = scales::comma_format()) +
  labs(
    title = "Corpus Size by Year - Stern Magazine",
    subtitle = "Number of tokens in DeReKo",
    x = "Year",
    y = "Number of Tokens"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12),
    # Rotate the year labels so neighbouring breaks do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Render the plot on the active graphics device
print(g)
# Print summary statistics
token_counts <- corpus_data$tokens
# Format a count with thousands separators; scientific = FALSE keeps round
# values such as 1e5 from being printed as "1e+05"
format_count <- function(x) {
  format(x, big.mark = ",", scientific = FALSE)
}

cat("\nSummary of corpus sizes:\n")
cat("Total years with data:", sum(token_counts > 0, na.rm = TRUE), "\n")
if (all(is.na(token_counts))) {
  # Without any usable count, which.max() yields integer(0) and max() yields
  # -Inf with a warning, so report the absence of data explicitly instead
  cat("Peak year: not available (no token counts were returned)\n")
} else {
  # which.max() skips NA values, so this index points at the largest count
  peak_index <- which.max(token_counts)
  cat(
    "Peak year:", corpus_data$year[peak_index],
    "with", format_count(token_counts[peak_index]), "tokens\n"
  )
}
cat(
  "Total tokens across all years:",
  format_count(sum(token_counts, na.rm = TRUE)), "\n"
)
# Return the data for further analysis
corpus_data