Support Wikipedia templates
Change-Id: Iad7777c54b30c2131e6bac09786807a62e1cac07
diff --git a/Changes b/Changes
index f01547d..cad6232 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
-0.3.1 2026-02-03
+0.3.1 2026-02-04
- Introduce hyphenated abbreviations in german tokenizer.
+ - Support Wikipedia templates.
0.2.2 2023-09-06
- Fix behaviour for end of text character positions
diff --git a/datok_test.go b/datok_test.go
index 4e2a44d..6c64260 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -806,21 +806,59 @@
assert.Equal(".", tokens[10])
assert.Equal(11, len(tokens));
+ // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/131
+ tokens = ttokenize(dat, w, "Donau\u00ADdampf\u00ADschiff")
+ assert.Equal("Donau\u00ADdampf\u00ADschiff", tokens[0])
+ assert.Equal(1, len(tokens));
+
+ // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/115
+ tokens = ttokenize(dat, w, "Die Serb*innen wie die Kosovo-Albaner*innen")
+ assert.Equal("Die", tokens[0]);
+ assert.Equal("Serb*innen", tokens[1]);
+ assert.Equal("wie", tokens[2]);
+ assert.Equal("die", tokens[3]);
+ assert.Equal("Kosovo-Albaner*innen", tokens[4]);
+ assert.Equal(5, len(tokens));
+
+	// Test Wikipedia emoji template ({{S|...}} smiley markup) inside a pragma marker
+ tokens = ttokenize(dat, w, "Ein Smiley [_EMOJI:{{S|;)}}_] hier")
+ assert.Equal("Ein", tokens[0]);
+ assert.Equal("Smiley", tokens[1]);
+ assert.Equal("[_EMOJI:{{S|;)}}_]", tokens[2]); // Should be one token
+ assert.Equal("hier", tokens[3]);
+ assert.Equal(4, len(tokens));
+
+ // Test simple pragma still works
+ tokens = ttokenize(dat, w, "Name: [_ANONYMIZED_] Ende")
+ assert.Equal("Name", tokens[0]);
+ assert.Equal(":", tokens[1]);
+ assert.Equal("[_ANONYMIZED_]", tokens[2]); // Should be one token
+ assert.Equal("Ende", tokens[3]);
+ assert.Equal(4, len(tokens));
+
+ /*
+ DeReKo-Behaviour
+ tokens = ttokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
+ assert.Equal("'ve", tokens[1]);
+ assert.Equal("'ll", tokens[3]);
+ assert.Equal("'d", tokens[5]);
+ assert.Equal("'m", tokens[7]);
+ assert.Equal("'re", tokens[9]);
+ assert.Equal("'s", tokens[11]);
+ assert.Equal("is", tokens[12]);
+ assert.Equal("n't", tokens[13]);
+ assert.Equal(14, len(tokens));
+
+
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "alte")
+ assert.Equal(tokens[2], "Mann")
+ assert.Equal(len(tokens), 3)
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
- tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
- assert.Equal("'ve", tokens[1]);
- assert.Equal("'ll", tokens[3]);
- assert.Equal("'d", tokens[5]);
- assert.Equal("'m", tokens[7]);
- assert.Equal("'re", tokens[9]);
- assert.Equal("'s", tokens[11]);
- assert.Equal("is", tokens[12]);
- assert.Equal("n't", tokens[13]);
- assert.Equal(14, len(tokens));
}
@Test
diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
index 32e7a90..2135910 100644
--- a/src/all/allpost.xfst
+++ b/src/all/allpost.xfst
@@ -16,6 +16,9 @@
! XML entities
source all/entities.xfst
+! Wikipedia templates
+source all/wiki.xfst
+
! Arrows
define Arrows [Alldash ">" | "<" Alldash];
diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
index 6d8f3bb..5c2e3c2 100644
--- a/src/all/allpref.xfst
+++ b/src/all/allpref.xfst
@@ -13,6 +13,7 @@
define Digit [%0|1|2|3|4|5|6|7|8|9];
define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
+define AsciiLetterCap [A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z];
define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
define EOT "\u0004";
diff --git a/src/all/wiki.xfst b/src/all/wiki.xfst
new file mode 100644
index 0000000..d05d28a
--- /dev/null
+++ b/src/all/wiki.xfst
@@ -0,0 +1 @@
+define Wikitemplate [ %[ "_" [ AsciiLetterCap | "-" ]+ ( %: [ ? - %[ - %] ]+ ) "_" %] ];
diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
index ce4bec3..aa2092b 100644
--- a/src/de/tokenizer.xfst
+++ b/src/de/tokenizer.xfst
@@ -75,7 +75,7 @@
Email @-> ... NLout,
File @-> ... NLout,
Domain @-> ... NLout,
- [Emoticons|Arrows] @-> ... NLout
+ [Emoticons|Arrows|Wikitemplate] @-> ... NLout
];
source all/allsentencesplit.xfst