Split tokenizer rules into language-specific and language-dependent Change-Id: I6e5eeabe9a306ce9fc3e62340ba7b948ecc4fa0f

commit: 78f6714529aea5209c4363edf94570a4f5686d19 [log] [tgz]
author: Akron <nils@diewald-online.de> Sat Apr 09 14:10:44 2022 +0200
committer: Akron <nils@diewald-online.de> Sat Apr 09 14:10:44 2022 +0200
tree: 907711b536979f8a39dd63a8f05364fd9f524331
parent: 61948ef87b5c0e556439fff72a270ac1f5ca9bc7 [diff]
diff --git a/Makefile b/Makefile
index 297bfb8..7fb0073 100644
--- a/Makefile
+++ b/Makefile

@@ -3,7 +3,7 @@
 
 buildfoma:
 	cd src && \
-	foma -e "source tokenizer.xfst" \
+	foma -e "source de/tokenizer.xfst" \
 	-e "save stack ../testdata/tokenizer.fst" -q -s && \
 	cd ..
 

diff --git a/Readme.md b/Readme.md
index 6365186..39f342b 100644
--- a/Readme.md
+++ b/Readme.md

@@ -9,6 +9,7 @@
 The library contains sources for a german tokenizer
 based on [KorAP-Tokenizer](https://github.com/KorAP/KorAP-Tokenizer).
 
+
 ## Tokenization
 
 ```
@@ -33,6 +34,7 @@
 > *Caution*: When experimenting with STDIN and echo,
 > you may need to disable history expansion.
 
+
 ## Conversion
 
 ```
@@ -107,13 +109,13 @@
 
 ```shell
 $ cd src && \
-  foma -e "source tokenizer.xfst" \
+  foma -e "source de/tokenizer.xfst" \
   -e "save stack ../mytokenizer.fst" -q -s && \
   cd ..
 ```
 
-This will load and compile `tokenizer.xfst` and will save
-the compiled FST as `mytokenizer.fst`
+This will load and compile the German `tokenizer.xfst`
+and will save the compiled FST as `mytokenizer.fst`
 in the root directory.
 
 To generate a Datok FSA (matrix representation) based on
@@ -135,6 +137,7 @@
 * This may take quite some time depending on the number
 of arcs in the FST and is therefore now deprecated.
 
+
 ## Technology
 
 Internally the FSA is represented
@@ -150,6 +153,7 @@
 in the FST is implemented as an extended DFA following Mizobuchi
 et al. (2000) and implementation details following Kanda et al. (2018).
 
+
 ## License
 
 Datok is published under the [Apache 2.0 License](LICENSE).
@@ -162,7 +166,7 @@
 [Çağrı Çöltekin](https://github.com/coltekin/TRmorph/)
 (published under the MIT License),
 and [Marc Kupietz](https://github.com/KorAP/KorAP-Tokenizer)
- (published under the Apache License).
+(published under the Apache License).
 
 The foma parser is based on
 [*foma2js*](https://github.com/mhulden/foma),

diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
new file mode 100644
index 0000000..6ea6dcf
--- /dev/null
+++ b/src/all/allpost.xfst

@@ -0,0 +1,51 @@
+! General rules that rely on language-specific definitions (e.g. Letter)
+
+! A solution to the "(author): problem" may be to add ) at the end of any
+! string as a possible ending
+define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
+
+! 20:00 Uhr, 00:12:25,34
+define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
+
+! Emoticons
+source all/emoticons.xfst
+
+! XML sources
+source all/xml.xfst
+
+! XML entities
+source all/entities.xfst
+
+! Technical protocols
+source all/protocols.xfst
+
+! Also supports
+!   19.4.2015, 19/4/2015 etc.
+define DigitPunct ["_"|"-"|"."|","|Slash];
+define Num Digit+ [DigitPunct Digit+]* (Char+);
+
+! ordinals
+define Ord Digit ( Digit (Digit) ) %.;
+
+! TODO:
+!   floating point, serial, model numbers, ip addresses, etc.
+!   every other segment must have at least one digit
+
+! Omission words like "fu**ing!"
+define Omission Char+ Asterisk Asterisk+ Char*;
+
+! acronyms: U.S.A., I.B.M., etc.
+! use a post-filter to remove dots
+define AcronymDep Letter %. Letter %. [Letter %.]+;
+
+
+! TODO: Name words with ' and `
+
+! Support ASCII elements, like
+! +---------------+
+! <---->, -->, <--
+!       +---------------+
+! <---> | Worker Node N |
+!       +---------------+
+! |============= Core =============|
+

diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
new file mode 100644
index 0000000..bb5183f
--- /dev/null
+++ b/src/all/allpref.xfst

@@ -0,0 +1,70 @@
+! This tokenizer is based on work by
+!  - StandardTokenizerImpl by the Lucene project
+!    under the Apache License
+!  - https://github.com/dlwh/epic by David Hall (2014)
+!    under the Apache License
+!  - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
+!    under the Apache License
+!  - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
+!    under the MIT License
+
+define NLout "@_TOKEN_SYMBOL_@";
+! define NLout "\u000a";
+
+define Digit [%0|1|2|3|4|5|6|7|8|9];
+define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
+define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
+define EOT "\u0004";
+
+!!!!!!!!!!!!!!!!!!
+! <from TRmorph> !
+!!!!!!!!!!!!!!!!!!
+define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
+           "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"| 
+           "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
+           "\u202f"|"\u205f"|"\u3000"];
+
+define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
+
+! Punctuation that ends sentences
+! Differs!
+define SP [["."|"?"|"!"]+|"…"];
+
+! Left punctuation
+define LP ["("|"["|"{"|
+           "“"|"‘"|"‹"|"«"|
+           "'"|%"|
+           ! differs
+           ["'" "'"] |
+           "*"|"/"|"_"| ! Can be Markdown
+           ! from book
+           [%, %,]];
+
+! Right punctuation - excluding the characters that can be used as apostrophe
+define RP [SP|","|";"|":"|
+              ")"|"]"|"}"|
+              "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
+              "*"|"/"|"_"]; ! Can be Markdown
+
+define DQuotes ["”"|%"|"»"|"«"];
+
+define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
+define Apos %'|%’|%`;
+define Punct [LP|RP|Sym];
+!define nonSym \[WS|LP|RP|Sym];
+!!!!!!!!!!!!!!!!!!!
+! </from TRmorph> !
+!!!!!!!!!!!!!!!!!!!
+
+define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
+define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
+define Slash ["⁄"|"∕"|"／"|"/"];
+define Asterisk ["*"];
+
+define Char \[WS|NL|Punct|Apos]; ! |¨;
+
+!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
+
+define SentenceEnd SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP);
+
+define NotSentenceExtension [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout];
\ No newline at end of file

diff --git a/src/all/allsentencesplit.xfst b/src/all/allsentencesplit.xfst
new file mode 100644
index 0000000..bef8cbd
--- /dev/null
+++ b/src/all/allsentencesplit.xfst

@@ -0,0 +1,19 @@
+echo - Introduce Sentence splitter
+! And compose Whitespace ignorance
+
+read regex Token .o. [
+  ! Put a Token boundary behind the longest possible
+  ! sentence ending punctuation sequence,
+  ! that isn't followed by a comma
+  SentenceEnd @-> ... NLout \/ _ NLout \%,
+] .o. [
+  ! Put a Token boundary behind a punctuation                      
+  ! that is not a start of a punctuation sequence
+  SP @-> ... NLout \/ NLout _ NLout NotSentenceExtension
+] .o. [
+  ! Put a Token boundary behind "..." if followed by whitespace and a non-lowercase character
+  [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
+  ! Remove whitespace between Tokens
+  [WS|NL]+ @-> 0 || [ .#. | NLout ] _
+];

diff --git a/src/emoticons.xfst b/src/all/emoticons.xfst
similarity index 100%
rename from src/emoticons.xfst
rename to src/all/emoticons.xfst


diff --git a/src/entities.xfst b/src/all/entities.xfst
similarity index 100%
rename from src/entities.xfst
rename to src/all/entities.xfst


diff --git a/src/protocols.xfst b/src/all/protocols.xfst
similarity index 100%
rename from src/protocols.xfst
rename to src/all/protocols.xfst


diff --git a/src/xml.xfst b/src/all/xml.xfst
similarity index 100%
rename from src/xml.xfst
rename to src/all/xml.xfst


diff --git a/src/txt/abbrv.txt b/src/de/abbrv.txt
similarity index 100%
rename from src/txt/abbrv.txt
rename to src/de/abbrv.txt


diff --git a/src/txt/plusampersand.txt b/src/de/plusampersand.txt
similarity index 100%
rename from src/txt/plusampersand.txt
rename to src/de/plusampersand.txt


diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
new file mode 100644
index 0000000..407c482
--- /dev/null
+++ b/src/de/tokenizer.xfst

@@ -0,0 +1,81 @@
+source all/allpref.xfst
+
+define Caseinsensitive [
+a (->) A,
+b (->) B,
+c (->) C,
+d (->) D,
+e (->) E,
+f (->) F,
+g (->) G,
+h (->) H,
+i (->) I,
+j (->) J,
+k (->) K,
+l (->) L,
+m (->) M,
+n (->) N,
+o (->) O,
+p (->) P,
+q (->) Q,
+r (->) R,
+s (->) S,
+t (->) T,
+u (->) U,
+v (->) V,
+w (->) W,
+x (->) X,
+y (->) Y,
+z (->) Z,
+ö (->) Ö,
+ü (->) Ü,
+ä (->) Ä,
+è (->) È,
+é (->) É,
+ú (->) Ú,
+á (->) Á,
+â (->) Â,
+ê (->) Ê,
+î (->) Î,
+ô (->) Ô,
+û (->) Û,
+ß (->) {SS}
+];
+
+define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ];
+
+define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
+
+define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
+
+define Plusampersand @txt"de/plusampersand.txt";
+define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
+
+! Abbreviations and Initials
+! The abbreviation list is part of the sentence splitter tool
+! of the IDS.
+define Abbr [ @txt"de/abbrv.txt" | Letter ] %.;
+
+define Streetname Word {str} %.;
+
+source all/allpost.xfst
+
+echo - Compile Real Token
+
+define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
+
+echo - Introduce Token splitter
+
+define Token [
+  RealToken @-> ... NLout,
+  XML @-> ... NLout,
+  URL @-> ... NLout,
+  Email @-> ... NLout,
+  File @-> ... NLout,
+  Domain @-> ... NLout,
+  Emoticons @-> ... NLout
+];
+
+source all/allsentencesplit.xfst
+
+! foma -e "source de/tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file

diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
deleted file mode 100644
index f20d7e2..0000000
--- a/src/tokenizer.xfst
+++ /dev/null

@@ -1,197 +0,0 @@
-! This tokenizer is based on work by
-!  - StandardTokenizerImpl by the Lucene project
-!    under the Apache License
-!  - https://github.com/dlwh/epic by David Hall (2014)
-!    under the Apacahe License
-!  - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
-!    under the Apache License
-!  - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
-!    under the MIT License
-!
-! The abbreviation list is part of the sentence splitter tool
-! of the IDS.
-
-define NLout "@_TOKEN_SYMBOL_@";
-! define NLout "\u000a";
-
-define Digit [%0|1|2|3|4|5|6|7|8|9];
-define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
-define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
-define EOT "\u0004";
-
-!!!!!!!!!!!!!!!!!
-! <from tmorph> !
-!!!!!!!!!!!!!!!!!
-define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
-           "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"| 
-           "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
-           "\u202f"|"\u205f"|"\u3000"];
-
-define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
-
-! Punctuation that ends sentences
-! Differs!
-define SP [["."|"?"|"!"]+|"…"];
-
-! Left punctuation
-define LP ["("|"["|"{"|
-           "“"|"‘"|"‹"|"«"|
-           "'"|%"|
-           ! differs
-           ["'" "'"] |
-           "*"|"/"|"_"| ! Can be Markdown
-           ! from book
-           [%, %,]];
-
-! Right punctuation - excluding the characters that can be used as apostrophe
-define RP [SP|","|";"|":"|
-              ")"|"]"|"}"|
-              "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
-              "*"|"/"|"_"]; ! Can be Markdown
-
-define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
-define Apos %'|%’|%`;
-define Punct [LP|RP|Sym];
-!define nonSym \[WS|LP|RP|Sym];
-!!!!!!!!!!!!!!!!!!
-! </from tmorph> !
-!!!!!!!!!!!!!!!!!!
-
-define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
-define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
-define Slash ["⁄"|"∕"|"／"|"/"];
-define Asterisk ["*"];
-
-define Char \[WS|NL|Punct|Apos]; ! |¨;
-
-define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
-
-define Plusampersand @txt"txt/plusampersand.txt";
-define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
-
-!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
-
-define Caseinsensitive [
-a (->) A,
-b (->) B,
-c (->) C,
-d (->) D,
-e (->) E,
-f (->) F,
-g (->) G,
-h (->) H,
-i (->) I,
-j (->) J,
-k (->) K,
-l (->) L,
-m (->) M,
-n (->) N,
-o (->) O,
-p (->) P,
-q (->) Q,
-r (->) R,
-s (->) S,
-t (->) T,
-u (->) U,
-v (->) V,
-w (->) W,
-x (->) X,
-y (->) Y,
-z (->) Z,
-ö (->) Ö,
-ü (->) Ü,
-ä (->) Ä,
-ß (->) {SS}
-];
-
-define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
-
-! Abbreviations and Initials
-define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
-
-! A solution to the "(author): problem" may be to add ) at the end of any
-! string as a possible ending
-
-define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
-
-! 20:00 Uhr, 00:12:25,34 Minuten
-define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
-
-! Emoticons
-source emoticons.xfst
-
-! acronyms: U.S.A., I.B.M., etc.
-! use a post-filter to remove dots
-define AcronymDep Letter %. Letter %. [Letter %.]+;
-
-! XML sources
-source xml.xfst
-
-! XML entities
-source entities.xfst
-
-! Technical protocols
-source protocols.xfst
-
-define Streetname Word {str} %.;
-
-! Also supports
-!   19.4.2015, 19/4/2015 etc.
-define DigitPunct ["_"|"-"|"."|","|Slash];
-define Num Digit+ [DigitPunct Digit+]* (Char+);
-
-! ordinals
-define Ord Digit ( Digit (Digit) ) %.;
-
-! TODO:
-!   floating point, serial, model numbers, ip addresses, etc.
-!   every other segment must have at least one digit
-
-! Omission words like "fu**ing!"
-define Omission Char+ Asterisk Asterisk+ Char*;
-
-
-! TODO: Name words with ' and `
-
-! Support ASCII elements, like
-! +---------------+
-! <---->, -->, <--
-!       +---------------+
-! <---> | Worker Node N |
-!       +---------------+
-! |============= Core =============|
-
-
-echo - Compile Real Token
-
-define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
-
-echo - Introduce Token splitter
-
-define Token [
-  RealToken @-> ... NLout,
-  XML @-> ... NLout,
-  URL @-> ... NLout,
-  Email @-> ... NLout,
-  File @-> ... NLout,
-  Domain @-> ... NLout,
-  Emoticons @-> ... NLout
-];
-
-echo - Introduce Sentence splitter
-! And compose Whitespace ignorance
-
-define DQuotes ["”"|%"|"»"|"«"];
-define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
-
-read regex Token .o. [
-  SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
-] .o. [
-  SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout]
-] .o. [
-  [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
-] .o. [
-  [WS|NL]+ @-> 0 || [ .#. | NLout ] _
-];
-
-! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file
commit	78f6714529aea5209c4363edf94570a4f5686d19	[log] [tgz]
author	Akron <nils@diewald-online.de>	Sat Apr 09 14:10:44 2022 +0200
committer	Akron <nils@diewald-online.de>	Sat Apr 09 14:10:44 2022 +0200
tree	907711b536979f8a39dd63a8f05364fd9f524331
parent	61948ef87b5c0e556439fff72a270ac1f5ca9bc7 [diff]