Show token counts in millions in the useful corpora overview
diff --git a/scripts/useful_deliko_subcorpora.R b/scripts/useful_deliko_subcorpora.R
index 132623e..3613548 100755
--- a/scripts/useful_deliko_subcorpora.R
+++ b/scripts/useful_deliko_subcorpora.R
@@ -199,38 +199,81 @@
make_table <- function(data, locale = c("de", "en")) {
locale <- match.arg(locale)
- short_si <- function(values) {
+ format_millions <- function(values) {
decimal_mark <- if (locale == "de") "," else "."
big_mark <- if (locale == "de") "." else ","
scales::label_number(
accuracy = 0.1,
decimal.mark = decimal_mark,
big.mark = big_mark,
- trim = TRUE,
- scale_cut = scales::cut_si("")
+ trim = TRUE
)(values)
}
if (locale == "de") {
link_text <- str_remove(data$de_description, "\\.$")
- tokens_short <- short_si(data$tokens)
+ tokens_million <- format_millions(data$tokens / 1e6)
tokens_full <- format_number_de(data$tokens)
- tokens <- map2_chr(tokens_short, tokens_full, ~ glue("<span title='{.y}'>{.x}</span>"))
- documents <- format_number_de(data$documents)
+ tokens_order <- format(
+ data$tokens,
+ scientific = FALSE,
+ trim = TRUE,
+ big.mark = "",
+ decimal.mark = "."
+ )
+ tokens <- pmap_chr(
+ list(tokens_million, tokens_full, tokens_order),
+ ~ glue("<span title='{..2}' data-order='{..3}'>{..1}</span>")
+ )
+ documents_display <- format_number_de(data$documents)
+ documents_order <- format(
+ data$documents,
+ scientific = FALSE,
+ trim = TRUE,
+ big.mark = "",
+ decimal.mark = "."
+ )
+ documents <- map2_chr(
+ documents_display,
+ documents_order,
+ ~ glue("<span data-order='{.y}'>{.x}</span>")
+ )
vc_header <- "VC-Definition (für Client-Bibliotheken)"
title_header <- "Titel / Link"
- tokens_header <- "Tokens"
- documents_header <- "Dokumente"
+ tokens_header <- "Millionen Token"
+ documents_header <- "Bücher"
language_url <- "//cdn.datatables.net/plug-ins/1.10.11/i18n/German.json"
} else {
link_text <- str_remove(data$en_description, "\\.$")
- tokens_short <- short_si(data$tokens)
+ tokens_million <- format_millions(data$tokens / 1e6)
tokens_full <- format_number_en(data$tokens)
- tokens <- map2_chr(tokens_short, tokens_full, ~ glue("<span title='{.y}'>{.x}</span>"))
- documents <- format_number_en(data$documents)
+ tokens_order <- format(
+ data$tokens,
+ scientific = FALSE,
+ trim = TRUE,
+ big.mark = "",
+ decimal.mark = "."
+ )
+ tokens <- pmap_chr(
+ list(tokens_million, tokens_full, tokens_order),
+ ~ glue("<span title='{..2}' data-order='{..3}'>{..1}</span>")
+ )
+ documents_display <- format_number_en(data$documents)
+ documents_order <- format(
+ data$documents,
+ scientific = FALSE,
+ trim = TRUE,
+ big.mark = "",
+ decimal.mark = "."
+ )
+ documents <- map2_chr(
+ documents_display,
+ documents_order,
+ ~ glue("<span data-order='{.y}'>{.x}</span>")
+ )
vc_header <- "VC definition (for client libraries)"
title_header <- "Title / Link"
- tokens_header <- "Tokens"
- documents_header <- "Documents"
+ tokens_header <- "Million Tokens"
+ documents_header <- "Books"
language_url <- "//cdn.datatables.net/plug-ins/1.10.11/i18n/English.json"
}
@@ -239,9 +282,13 @@
`{{vc_header}}` := paste0(data$corpus_query, make_copy_icon(data$corpus_query)),
`{{tokens_header}}` := tokens,
`{{documents_header}}` := documents
- )
+ ) %>%
+ mutate(
+ tokens_sort = as.numeric(data$tokens),
+ documents_sort = as.numeric(data$documents)
+ )
- names(table_data) <- c(title_header, vc_header, tokens_header, documents_header)
+ names(table_data) <- c(title_header, vc_header, tokens_header, documents_header, "tokens_sort", "documents_sort")
datatable(
table_data,
@@ -253,6 +300,18 @@
list(
className = "dt-head-right dt-body-right dt-foot-right",
targets = 2:3
+ ),
+ list(
+ visible = FALSE,
+ targets = 4:5
+ ),
+ list(
+ orderData = 4,
+ targets = 2
+ ),
+ list(
+ orderData = 5,
+ targets = 3
)
),
language = list(url = language_url),
@@ -267,6 +326,11 @@
rownames = FALSE
) %>%
formatStyle(colnames(table_data), fontFamily = "Fira Sans, Lato, sans-serif") %>%
+ formatStyle(
+ c(tokens_header, documents_header),
+ fontFamily = "Fira Sans, Lato, sans-serif",
+ fontVariantNumeric = "tabular-nums"
+ ) %>%
htmlwidgets::onRender(copy_js)
}