Merge "Add R scripts to plot performance charts"
diff --git a/.gitignore b/.gitignore
index b044dfd..87ecf80 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
*~
.*
!.gitignore
+.Rproj.user
diff --git a/charts/performance.tsv b/charts/performance.tsv
new file mode 100644
index 0000000..cebdfa8
--- /dev/null
+++ b/charts/performance.tsv
@@ -0,0 +1,28 @@
+Tool Version Sen Tok Model Tokens/ms effi1x - 100 runs Tokens/ms effi10x - 100 runs
+KorAP-Tokenizer 72.90 199.28
+Datok x x datok 837.89 2478.71
+ x x matok 1371.19 2976.80
+BlingFire 0.1.8 x wbd.bin 431.92 1697.73
+ x sbd.bin 417.10 1908.87
+Cutter 2.5 x x 0.38
+JTok 2.1.19 31.19 117.22
+OpenNLP x Simple 290.71 1330.23
+ x Tokenizer 74.65 145.08
+ x SentenceD 247.84 853.01
+SoMaJo x x P=1 8.15 8.41
+ x x P=8 27.32 39.91
+SpaCy x Tokenizer 19.73 44.40
+ x Sentencizer 16.94
+ x Statistical 4.90
+ x Dependency 2.24
+Stanford x 75.47 156.24
+ x x T,S,M 46.95 91.56
+Syntok x segmenter 59.66 61.07
+ x tokenizer 103.90 108.40
+Waste 2.0.20-1 x x 141.07 144.95
+Elephant x 8.57 8.68
+TreeTagger x 69.92 72.98
+Deep-EOS x bi-lstm-de 0.25
+ x cnn-de 0.27
+ x lstm-de 0.29
+NNsplit x 0.90
diff --git a/charts/performance_chart.R b/charts/performance_chart.R
new file mode 100755
index 0000000..10f2d36
--- /dev/null
+++ b/charts/performance_chart.R
@@ -0,0 +1,41 @@
+#!/usr/bin/env Rscript
+# Plot tokenizer throughput from performance.tsv (read from the current
+# working directory) as a horizontal dodged bar chart with one bar per test
+# corpus, and save it as tok_perf.png, tok_perf.pdf and tok_perf.svg.
+library(tidyverse)
+library(idsThemeR) # install_git("https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/idsThemeR")
+library(extrafont)
+
+df <- read_tsv("performance.tsv")
+
+# Long-format data, one row per tool/model and test corpus. Columns 6 and 7
+# are addressed by position; order_by is the larger of the two values and is
+# used below to sort the bars. The result is kept in the global df2.
+df2 <- df %>%
+  fill(Tool) %>%
+  mutate(order_by = pmax(.[[6]], .[[7]], na.rm = TRUE)) %>%
+  filter(Tool != "wc", !is.na(order_by)) %>%
+  pivot_longer(cols = c(7, 6)) %>%
+  mutate(name = str_replace_all(name, ".*[^0-9]([0-9]+)x.*", "\\1 × Effi")) %>%
+  mutate(tool = paste0(Tool, if_else(is.na(Model), "", paste0(" (", Model, ")"))))
+
+# Axis titles belong to the aesthetics, not to the flipped screen axes: the
+# throughput is mapped to y and the tool names to x, so the unit is the y
+# title even though coord_flip() draws it horizontally.
+p <- df2 %>%
+  mutate(Tool = factor(tool) %>% fct_reorder(order_by)) %>%
+  ggplot(aes(x = Tool, y = value, fill = name)) + # forcats::fct_rev(name) to reorder x1 and x10
+  geom_col(position = "dodge") +
+  labs(x = NULL, y = "Tokens / ms", fill = "test corpus") +
+  coord_flip() +
+  theme_ids(style = "light") +
+  theme(legend.position = c(0.8, 0.2)) +
+  scale_fill_ids(palette = "ids")
+
+# Pass the plot explicitly instead of relying on last_plot(), so the saved
+# files do not depend on which plot happened to be built last.
+# NOTE(review): width/height are multiplied by ggplot2's .pt constant while
+# units stays "mm" -- presumably a deliberate enlargement; confirm.
+ggsave("tok_perf.png", plot = p, width = 70 * .pt, height = 50 * .pt, units = "mm", dpi = 600)
+ggsave("tok_perf.pdf", plot = p, device = cairo_pdf, width = 70 * .pt, height = 50 * .pt, units = "mm", dpi = 600)
+ggsave("tok_perf.svg", plot = p, width = 70 * .pt, height = 50 * .pt, units = "mm", dpi = 600)
diff --git a/charts/single_bar_perf_chart.R b/charts/single_bar_perf_chart.R
new file mode 100644
index 0000000..c1e9757
--- /dev/null
+++ b/charts/single_bar_perf_chart.R
@@ -0,0 +1,22 @@
+# Plot the values in column 7 of performance.tsv (read from the current
+# working directory) as a horizontal bar chart, one bar per tool/model.
+# The tidyverse is loaded here so the script also runs on its own.
+library(tidyverse)
+
+df <- read_tsv("performance.tsv")
+# Original header of column 7, kept for reference (currently unused).
+ylabel <- colnames(df)[7]
+colnames(df)[7] <- "perf"
+df %>%
+  fill(Tool) %>%
+  filter(Tool != "wc", !is.na(perf)) %>%
+  arrange(desc(perf)) %>%
+  mutate(tool = paste0(Tool, if_else(is.na(Model), "", paste0(" (", Model, ")")))) %>%
+  mutate(Tool = factor(tool) %>% fct_reorder(perf)) %>%
+  ggplot(aes(x = Tool, y = perf)) +
+  geom_col() +
+  ylab("Tokens/ms") +
+  xlab(NULL) +
+  coord_flip()# +
+#  geom_text(aes(label=perf), position=position_stack(vjust=0.5), hjust=0.25)
+#ggsave("/tmp/tok_perf.png", width = 70 * .pt, height = 40 *.pt, units = "mm", dpi = 600)
diff --git a/charts/tok_performance_charts.Rproj b/charts/tok_performance_charts.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/charts/tok_performance_charts.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX