Add corpus size by year demo Change-Id: Ic5cfd91fbb3231c7ac933872b34df9a5fb0d13b4

commit: 16b9d0ce7449965dda0e481d86147a12fcf39aba [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jul 25 18:22:33 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jul 25 18:23:20 2025 +0200
tree: eb9ddfdee8d2001668a023792922ec65435f54ee
parent: 2d9bc3d51ae61146e8c7c0d21909bd61ed8b12c4 [diff]
diff --git a/demo/00Index b/demo/00Index
index 537de21..5a56500 100644
--- a/demo/00Index
+++ b/demo/00Index

@@ -17,3 +17,4 @@
 adjectiveCollocates             Shows adjective collocates of 'Gendern'.
 relativeTextpositionBoxplot     Plot the relative text positions of a some adverbs as highcharter boxplot.
 retrieveTitlesFromDeliko				Retrieves and shows all titles from novels of a chosen genre in the DeLiKo@DNB German fiction corpus.
+corpus_size_by_year             Plot corpus sizes by year for Stern magazine

diff --git a/demo/corpus_size_by_year.R b/demo/corpus_size_by_year.R
new file mode 100644
index 0000000..853b629
--- /dev/null
+++ b/demo/corpus_size_by_year.R

@@ -0,0 +1,69 @@
+#!/usr/bin/env Rscript
+#
+# Plot corpus sizes by year for Stern magazine
+#
+# Queries KorAP for the size of one virtual corpus per year, draws the token
+# counts as a column plot and prints a short summary to the console.
+#
+library(RKorAPClient)
+library(ggplot2)
+library(scales)  # For comma_format()
+
+# Define years to analyze. The queries, the axis breaks and the progress
+# message are all derived from this vector, so this is the only line to edit.
+years <- 1990:2024
+
+# Create virtual corpus definitions for each year, restricted to Stern
+vcs <- paste("corpusTitle=Stern & pubDate in", years)
+
+# Connect to KorAP
+kco <- KorAPConnection(verbose = TRUE)
+
+# Get corpus statistics for each year
+cat("Retrieving corpus sizes for Stern magazine from", min(years), "to", max(years), "...\n")
+corpus_data <- corpusStats(kco, vc = vcs, as.df = TRUE)
+
+# Add year column for plotting; this relies on corpusStats() returning one
+# row per entry of vcs, in the same order (assumed here -- verify against the
+# RKorAPClient documentation)
+corpus_data$year <- years
+
+# Create ggplot column plot
+g <- ggplot(corpus_data, aes(x = year, y = tokens)) +
+  geom_col(fill = "steelblue", alpha = 0.7) +
+  # Label every fifth year, counted from the first year analyzed
+  scale_x_continuous(breaks = seq(min(years), max(years), by = 5)) +
+  scale_y_continuous(labels = scales::comma_format()) +
+  labs(
+    title = "Corpus Size by Year - Stern Magazine",
+    subtitle = "Number of tokens in DeReKo",
+    x = "Year",
+    y = "Number of Tokens"
+  ) +
+  theme_minimal() +
+  theme(
+    plot.title = element_text(size = 14, face = "bold"),
+    plot.subtitle = element_text(size = 12),
+    axis.text.x = element_text(angle = 45, hjust = 1)
+  )
+
+print(g)
+
+# Format a count with thousands separators; scientific = FALSE keeps very
+# large totals from being printed in exponent notation
+format_count <- function(n) format(n, big.mark = ",", scientific = FALSE)
+
+# Print summary statistics
+peak <- which.max(corpus_data$tokens)  # integer(0) if no count is available
+cat("\nSummary of corpus sizes:\n")
+cat("Total years with data:", sum(corpus_data$tokens > 0, na.rm = TRUE), "\n")
+if (length(peak) > 0) {
+  cat("Peak year:", corpus_data$year[peak],
+      "with", format_count(corpus_data$tokens[peak]), "tokens\n")
+} else {
+  cat("Peak year: n/a (no token counts available)\n")
+}
+cat("Total tokens across all years:", format_count(sum(corpus_data$tokens, na.rm = TRUE)), "\n")
+
+# Show the data; corpus_data stays available for further analysis
+corpus_data
commit	16b9d0ce7449965dda0e481d86147a12fcf39aba	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jul 25 18:22:33 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jul 25 18:23:20 2025 +0200
tree	eb9ddfdee8d2001668a023792922ec65435f54ee
parent	2d9bc3d51ae61146e8c7c0d21909bd61ed8b12c4 [diff]