Support Plusampersand words in compounds
Change-Id: I095681ece9c7e2e80fb2975eb6bf87463b17db7c
diff --git a/Changes b/Changes
index 9880f32..e3c52a1 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,4 @@
-0.1.2 2021-12-05
+0.1.2 2021-12-07
- Improve performance of rune to symbol conversion in transduction
method.
+ - Support words from the Plusampersand list in compounds.
\ No newline at end of file
diff --git a/matrix_test.go b/matrix_test.go
index b0d2698..7200608 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -769,7 +769,7 @@
assert.Equal(tokens[11], ".")
assert.Equal(12, len(tokens))
- // Plusampersand compounds
+ // Plusampersand compounds (1)
tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
assert.Equal(tokens[0], "Die")
assert.Equal(tokens[1], "2G+-Regel")
@@ -780,6 +780,13 @@
assert.Equal(tokens[6], ".")
assert.Equal(7, len(tokens))
+ // Plusampersand compounds (2)
+ tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "Neu-C++-Programmierer")
+ assert.Equal(tokens[2], ".")
+ assert.Equal(3, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index b59ee8a..ccc27c1 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -69,9 +69,11 @@
define Char \[WS|NL|Punct|Apos]; ! |¨;
-! source lexicon.xfst
-! define Word;
-define Word Char+ ([Dash|Apos|Asterisk] Char+)*;
+define Word Char+ ([Apos|Asterisk] Char+)*;
+
+define Plusampersand @txt"txt/plusampersand.txt";
+define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
+
define URLChar [Char|[Sym - ["<"|">"|%"]]];
!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
@@ -114,8 +116,6 @@
! Abbreviations and Initials
define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
-define Plusampersand @txt"txt/plusampersand.txt" (Dash Word);
-
! A solution to the "(author): problem" may be to add ) at the end of any
! string as a possible ending
@@ -219,7 +219,6 @@
URL @-> ... NLout,
Email @-> ... NLout,
File @-> ... NLout,
- Plusampersand @-> ... NLout,
Domain @-> ... NLout,
Emoji @-> ... NLout
] .o. [[WS|NL]+ @-> 0 || [ .#. | NLout ] _ ];