Introduce XML tests and rework the XML tokenization rule
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 3cece06..fd498cb 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -171,8 +171,8 @@
assert.Equal(dat.epsilon, 1)
assert.Equal(dat.unknown, 2)
assert.Equal(dat.identity, 3)
- assert.Equal(dat.final, 137)
- assert.Equal(len(dat.sigma), 132)
+ assert.Equal(dat.final, 145)
+ assert.Equal(len(dat.sigma), 140)
assert.True(len(dat.array) > 3600000)
assert.True(dat.maxSize > 3600000)
assert.True(tmatch(dat, "bau"))
@@ -847,6 +847,35 @@
*/
}
+// TestFullTokenizerXML checks that XML-style start and end tags are emitted
+// as single tokens while the surrounding text is tokenized alongside them.
+func TestFullTokenizerXML(t *testing.T) {
+ assert := assert.New(t)
+
+ dat := LoadDatokFile("testdata/tokenizer.datok")
+ assert.NotNil(dat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ // Every case compares the complete token slice, so a wrong token count is
+ // reported as a test failure instead of an index-out-of-range panic.
+
+ // Plain start and end tags.
+ tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
+ assert.Equal([]string{"Das", "<b>", "beste", "</b>", "Fußballspiel"}, tokens)
+
+ // Start tag with a quoted attribute value.
+ tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
+ assert.Equal([]string{"Das", "<b class=\"c\">", "beste", "</b>", "Fußballspiel"}, tokens)
+
+ // Tag glued to a word, whitespace inside an attribute value and an
+ // attribute without a value; "etc." stays whole while "et." is split.
+ tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
+ assert.Equal([]string{"der", "<x y=\"alte \">", "<x x>", "alte", "</x>", "etc.", "et", ".", "Mann", "."}, tokens)
+}
+
func BenchmarkTransduce(b *testing.B) {
bu := make([]byte, 0, 2048)
w := bytes.NewBuffer(bu)
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 6d21d8d..f7a089f 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -11,7 +11,6 @@
! The abbreviation list is part of the sentence splitter tool
! of the IDS.
-! define NLout "\u000a";
define NLout "@_TOKEN_SYMBOL_@";
! define NLout "\u000a";
@@ -55,7 +54,7 @@
define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@];
define Apos %'|%’|%`;
-define Punct LP|RP|Sym;
+define Punct [LP|RP|Sym];
!define nonSym \[WS|LP|RP|Sym];
!!!!!!!!!!!!!!!!!!
! </from tmorph> !
@@ -136,8 +135,24 @@
define Domain Char+ [Dash Char+]* Dot TldEnd;
-!define XML "<" Alpha URLChar* (">");
-define XML "<" URLChar+ (">");
+! XML rule: matches "<name attr="v" ...>" start tags (attribute values optional, single- or double-quoted) and "</name>" end tags
+define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
+define XML [
+ "<" [
+ [
+ XMLns
+ [WS+ XMLns WS*
+ (%= WS*
+ [[%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %']]
+ )
+ ]*
+ ]
+ |
+ [
+ "/" XMLns
+ ]
+ ] WS* ">"
+].u;
!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
define Email URLChar+ At URLChar+ [Dot URLChar+]+;
@@ -193,17 +208,24 @@
! |============= Core =============|
+echo - Compile Real Token
-define RealToken [XML|Email|URL|SNS|Abbr|Plusampersand|Streetname|Omission|Domain|AcronymDep|File|Emdash|Punct|Ord|Num|Years|Emoji|Word];
+define RealToken [Punct|Word|XML|Email|URL|SNS|Domain|AcronymDep|File|Ord|Num|Years];
echo - Introduce Token splitter
-define Token [RealToken @-> ... NLout]
-! .o. [NL -> 0]
-.o. [WS+ @-> 0]
+! Append NLout after every longest match of the token classes below, then delete whitespace that directly follows an NLout
+define Token [
+ RealToken @-> ... NLout,
+ Abbr @-> ... NLout,
+ Plusampersand @-> ... NLout,
+ Emoji @-> ... NLout,
+ [Streetname|Omission|Emdash] @-> ... NLout
+ ]
+.o. [WS+ @-> 0 || NLout _ ]
;
echo - Introduce Sentence splitter
-read regex Token .o. [[["."|"!"|"?"]+|"…"] @-> ... NLout \/ NLout _];
+read regex Token .o. [[["."|"!"|"?"]+|"…"] @-> ... NLout \/ NLout _ ];
! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index 496d564..e652e3c 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index 583b0e2..951c165 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ