Split tokenizer rules into language-specific and language-independent parts
Change-Id: I6e5eeabe9a306ce9fc3e62340ba7b948ecc4fa0f
diff --git a/Makefile b/Makefile
index 297bfb8..7fb0073 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
buildfoma:
cd src && \
- foma -e "source tokenizer.xfst" \
+ foma -e "source de/tokenizer.xfst" \
-e "save stack ../testdata/tokenizer.fst" -q -s && \
cd ..
diff --git a/Readme.md b/Readme.md
index 6365186..39f342b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -9,6 +9,7 @@
The library contains sources for a german tokenizer
based on [KorAP-Tokenizer](https://github.com/KorAP/KorAP-Tokenizer).
+
## Tokenization
```
@@ -33,6 +34,7 @@
> *Caution*: When experimenting with STDIN and echo,
> you may need to disable history expansion.
+
## Conversion
```
@@ -107,13 +109,13 @@
```shell
$ cd src && \
- foma -e "source tokenizer.xfst" \
+ foma -e "source de/tokenizer.xfst" \
-e "save stack ../mytokenizer.fst" -q -s && \
cd ..
```
-This will load and compile `tokenizer.xfst` and will save
-the compiled FST as `mytokenizer.fst`
+This will load and compile the German `tokenizer.xfst`
+and will save the compiled FST as `mytokenizer.fst`
in the root directory.
To generate a Datok FSA (matrix representation) based on
@@ -135,6 +137,7 @@
* This may take quite some time depending on the number
of arcs in the FST and is therefore now deprecated.
+
## Technology
Internally the FSA is represented
@@ -150,6 +153,7 @@
in the FST is implemented as an extended DFA following Mizobuchi
et al. (2000) and implementation details following Kanda et al. (2018).
+
## License
Datok is published under the [Apache 2.0 License](LICENSE).
@@ -162,7 +166,7 @@
[Çağrı Çöltekin](https://github.com/coltekin/TRmorph/)
(published under the MIT License),
and [Marc Kupietz](https://github.com/KorAP/KorAP-Tokenizer)
- (published under the Apache License).
+(published under the Apache License).
The foma parser is based on
[*foma2js*](https://github.com/mhulden/foma),
diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
new file mode 100644
index 0000000..6ea6dcf
--- /dev/null
+++ b/src/all/allpost.xfst
@@ -0,0 +1,51 @@
+! General rules that require certain language-specific definitions (e.g. Letter)
+
+! A solution to the "(author): problem" may be to add ) at the end of any
+! string as a possible ending
+define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
+
+! 20:00 Uhr, 00:12:25,34
+define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
+
+! Emoticons
+source all/emoticons.xfst
+
+! XML sources
+source all/xml.xfst
+
+! XML entities
+source all/entities.xfst
+
+! Technical protocols
+source all/protocols.xfst
+
+! Also supports
+! 19.4.2015, 19/4/2015 etc.
+define DigitPunct ["_"|"-"|"."|","|Slash];
+define Num Digit+ [DigitPunct Digit+]* (Char+);
+
+! ordinals
+define Ord Digit ( Digit (Digit) ) %.;
+
+! TODO:
+! floating point, serial, model numbers, ip addresses, etc.
+! every other segment must have at least one digit
+
+! Omission words like "fu**ing!"
+define Omission Char+ Asterisk Asterisk+ Char*;
+
+! acronyms: U.S.A., I.B.M., etc.
+! use a post-filter to remove dots
+define AcronymDep Letter %. Letter %. [Letter %.]+;
+
+
+! TODO: Name words with ' and `
+
+! Support ASCII elements, like
+! +---------------+
+! <---->, -->, <--
+! +---------------+
+! <---> | Worker Node N |
+! +---------------+
+! |============= Core =============|
+
diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
new file mode 100644
index 0000000..bb5183f
--- /dev/null
+++ b/src/all/allpref.xfst
@@ -0,0 +1,70 @@
+! This tokenizer is based on work by
+! - StandardTokenizerImpl by the Lucene project
+! under the Apache License
+! - https://github.com/dlwh/epic by David Hall (2014)
+! under the Apache License
+! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
+! under the Apache License
+! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
+! under the MIT License
+
+define NLout "@_TOKEN_SYMBOL_@";
+! define NLout "\u000a";
+
+define Digit [%0|1|2|3|4|5|6|7|8|9];
+define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
+define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
+define EOT "\u0004";
+
+!!!!!!!!!!!!!!!!!
+! <from tmorph> !
+!!!!!!!!!!!!!!!!!
+define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
+ "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
+ "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
+ "\u202f"|"\u205f"|"\u3000"];
+
+define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
+
+! Punctuation that ends sentences
+! Differs!
+define SP [["."|"?"|"!"]+|"…"];
+
+! Left punctuation
+define LP ["("|"["|"{"|
+ "“"|"‘"|"‹"|"«"|
+ "'"|%"|
+ ! differs
+ ["'" "'"] |
+ "*"|"/"|"_"| ! Can be Markdown
+ ! from book
+ [%, %,]];
+
+! Right punctuation - excluding the characters that can be used as apostrophe
+define RP [SP|","|";"|":"|
+ ")"|"]"|"}"|
+ "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
+ "*"|"/"|"_"]; ! Can be Markdown
+
+define DQuotes ["”"|%"|"»"|"«"];
+
+define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
+define Apos %'|%’|%`;
+define Punct [LP|RP|Sym];
+!define nonSym \[WS|LP|RP|Sym];
+!!!!!!!!!!!!!!!!!!
+! </from tmorph> !
+!!!!!!!!!!!!!!!!!!
+
+define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
+define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
+define Slash ["⁄"|"∕"|"/"|"/"];
+define Asterisk ["*"];
+
+define Char \[WS|NL|Punct|Apos]; ! |¨;
+
+!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
+
+define SentenceEnd SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP);
+
+define NotSentenceExtension [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout];
\ No newline at end of file
diff --git a/src/all/allsentencesplit.xfst b/src/all/allsentencesplit.xfst
new file mode 100644
index 0000000..bef8cbd
--- /dev/null
+++ b/src/all/allsentencesplit.xfst
@@ -0,0 +1,19 @@
+echo - Introduce Sentence splitter
+! and compose it with the removal of whitespace between tokens
+
+read regex Token .o. [
+ ! Put a Token boundary behind the longest possible
+ ! sentence ending punctuation sequence,
+ ! that isn't followed by a comma
+ SentenceEnd @-> ... NLout \/ _ NLout \%,
+] .o. [
+ ! Put a Token boundary behind a punctuation
+ ! that is not a start of a punctuation sequence
+ SP @-> ... NLout \/ NLout _ NLout NotSentenceExtension
+] .o. [
+ ! Put a Token boundary behind ... if not followed by a lowercase letter
+ [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
+ ! Remove whitespace between Tokens
+ [WS|NL]+ @-> 0 || [ .#. | NLout ] _
+];
diff --git a/src/emoticons.xfst b/src/all/emoticons.xfst
similarity index 100%
rename from src/emoticons.xfst
rename to src/all/emoticons.xfst
diff --git a/src/entities.xfst b/src/all/entities.xfst
similarity index 100%
rename from src/entities.xfst
rename to src/all/entities.xfst
diff --git a/src/protocols.xfst b/src/all/protocols.xfst
similarity index 100%
rename from src/protocols.xfst
rename to src/all/protocols.xfst
diff --git a/src/xml.xfst b/src/all/xml.xfst
similarity index 100%
rename from src/xml.xfst
rename to src/all/xml.xfst
diff --git a/src/txt/abbrv.txt b/src/de/abbrv.txt
similarity index 100%
rename from src/txt/abbrv.txt
rename to src/de/abbrv.txt
diff --git a/src/txt/plusampersand.txt b/src/de/plusampersand.txt
similarity index 100%
rename from src/txt/plusampersand.txt
rename to src/de/plusampersand.txt
diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
new file mode 100644
index 0000000..407c482
--- /dev/null
+++ b/src/de/tokenizer.xfst
@@ -0,0 +1,81 @@
+source all/allpref.xfst
+
+define Caseinsensitive [
+a (->) A,
+b (->) B,
+c (->) C,
+d (->) D,
+e (->) E,
+f (->) F,
+g (->) G,
+h (->) H,
+i (->) I,
+j (->) J,
+k (->) K,
+l (->) L,
+m (->) M,
+n (->) N,
+o (->) O,
+p (->) P,
+q (->) Q,
+r (->) R,
+s (->) S,
+t (->) T,
+u (->) U,
+v (->) V,
+w (->) W,
+x (->) X,
+y (->) Y,
+z (->) Z,
+ö (->) Ö,
+ü (->) Ü,
+ä (->) Ä,
+è (->) È,
+é (->) É,
+ú (->) Ú,
+á (->) Á,
+â (->) Â,
+ê (->) Ê,
+î (->) Î,
+ô (->) Ô,
+û (->) Û,
+ß (->) {SS}
+];
+
+define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ];
+
+define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
+
+define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
+
+define Plusampersand @txt"de/plusampersand.txt";
+define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
+
+! Abbreviations and Initials
+! The abbreviation list is part of the sentence splitter tool
+! of the IDS.
+define Abbr [ @txt"de/abbrv.txt" | Letter ] %.;
+
+define Streetname Word {str} %.;
+
+source all/allpost.xfst
+
+echo - Compile Real Token
+
+define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
+
+echo - Introduce Token splitter
+
+define Token [
+ RealToken @-> ... NLout,
+ XML @-> ... NLout,
+ URL @-> ... NLout,
+ Email @-> ... NLout,
+ File @-> ... NLout,
+ Domain @-> ... NLout,
+ Emoticons @-> ... NLout
+];
+
+source all/allsentencesplit.xfst
+
+! foma -e "source de/tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
deleted file mode 100644
index f20d7e2..0000000
--- a/src/tokenizer.xfst
+++ /dev/null
@@ -1,197 +0,0 @@
-! This tokenizer is based on work by
-! - StandardTokenizerImpl by the Lucene project
-! under the Apache License
-! - https://github.com/dlwh/epic by David Hall (2014)
-! under the Apacahe License
-! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
-! under the Apache License
-! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
-! under the MIT License
-!
-! The abbreviation list is part of the sentence splitter tool
-! of the IDS.
-
-define NLout "@_TOKEN_SYMBOL_@";
-! define NLout "\u000a";
-
-define Digit [%0|1|2|3|4|5|6|7|8|9];
-define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
-define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
-define EOT "\u0004";
-
-!!!!!!!!!!!!!!!!!
-! <from tmorph> !
-!!!!!!!!!!!!!!!!!
-define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
- "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
- "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
- "\u202f"|"\u205f"|"\u3000"];
-
-define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
-
-! Punctuation that ends sentences
-! Differs!
-define SP [["."|"?"|"!"]+|"…"];
-
-! Left punctuation
-define LP ["("|"["|"{"|
- "“"|"‘"|"‹"|"«"|
- "'"|%"|
- ! differs
- ["'" "'"] |
- "*"|"/"|"_"| ! Can be Markdown
- ! from book
- [%, %,]];
-
-! Right punctuation - excluding the characters that can be used as apostrophe
-define RP [SP|","|";"|":"|
- ")"|"]"|"}"|
- "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
- "*"|"/"|"_"]; ! Can be Markdown
-
-define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
-define Apos %'|%’|%`;
-define Punct [LP|RP|Sym];
-!define nonSym \[WS|LP|RP|Sym];
-!!!!!!!!!!!!!!!!!!
-! </from tmorph> !
-!!!!!!!!!!!!!!!!!!
-
-define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
-define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
-define Slash ["⁄"|"∕"|"/"|"/"];
-define Asterisk ["*"];
-
-define Char \[WS|NL|Punct|Apos]; ! |¨;
-
-define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
-
-define Plusampersand @txt"txt/plusampersand.txt";
-define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
-
-!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
-
-define Caseinsensitive [
-a (->) A,
-b (->) B,
-c (->) C,
-d (->) D,
-e (->) E,
-f (->) F,
-g (->) G,
-h (->) H,
-i (->) I,
-j (->) J,
-k (->) K,
-l (->) L,
-m (->) M,
-n (->) N,
-o (->) O,
-p (->) P,
-q (->) Q,
-r (->) R,
-s (->) S,
-t (->) T,
-u (->) U,
-v (->) V,
-w (->) W,
-x (->) X,
-y (->) Y,
-z (->) Z,
-ö (->) Ö,
-ü (->) Ü,
-ä (->) Ä,
-ß (->) {SS}
-];
-
-define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
-
-! Abbreviations and Initials
-define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
-
-! A solution to the "(author): problem" may be to add ) at the end of any
-! string as a possible ending
-
-define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
-
-! 20:00 Uhr, 00:12:25,34 Minuten
-define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
-
-! Emoticons
-source emoticons.xfst
-
-! acronyms: U.S.A., I.B.M., etc.
-! use a post-filter to remove dots
-define AcronymDep Letter %. Letter %. [Letter %.]+;
-
-! XML sources
-source xml.xfst
-
-! XML entities
-source entities.xfst
-
-! Technical protocols
-source protocols.xfst
-
-define Streetname Word {str} %.;
-
-! Also supports
-! 19.4.2015, 19/4/2015 etc.
-define DigitPunct ["_"|"-"|"."|","|Slash];
-define Num Digit+ [DigitPunct Digit+]* (Char+);
-
-! ordinals
-define Ord Digit ( Digit (Digit) ) %.;
-
-! TODO:
-! floating point, serial, model numbers, ip addresses, etc.
-! every other segment must have at least one digit
-
-! Omission words like "fu**ing!"
-define Omission Char+ Asterisk Asterisk+ Char*;
-
-
-! TODO: Name words with ' and `
-
-! Support ASCII elements, like
-! +---------------+
-! <---->, -->, <--
-! +---------------+
-! <---> | Worker Node N |
-! +---------------+
-! |============= Core =============|
-
-
-echo - Compile Real Token
-
-define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
-
-echo - Introduce Token splitter
-
-define Token [
- RealToken @-> ... NLout,
- XML @-> ... NLout,
- URL @-> ... NLout,
- Email @-> ... NLout,
- File @-> ... NLout,
- Domain @-> ... NLout,
- Emoticons @-> ... NLout
-];
-
-echo - Introduce Sentence splitter
-! And compose Whitespace ignorance
-
-define DQuotes ["”"|%"|"»"|"«"];
-define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
-
-read regex Token .o. [
- SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
-] .o. [
- SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout]
-] .o. [
- [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
-] .o. [
- [WS|NL]+ @-> 0 || [ .#. | NLout ] _
-];
-
-! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file