Add XML entities
Change-Id: Ib22e51ec8427e0af23a9dcf83c01e6e16837c91e
diff --git a/matrix_test.go b/matrix_test.go
index fe96c62..a762f1d 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -753,6 +753,22 @@
assert.Equal(tokens[4], ".")
assert.Equal(5, len(tokens))
+ // XML entities
+ tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &gt;.")
+ assert.Equal(tokens[0], "Das")
+ assert.Equal(tokens[1], "ist")
+ assert.Equal(tokens[2], "&nbsp;")
+ assert.Equal(tokens[3], "1:30")
+ assert.Equal(tokens[4], "Stunden")
+ assert.Equal(tokens[5], "&")
+ assert.Equal(tokens[6], "20")
+ assert.Equal(tokens[7], "Minuten")
+ assert.Equal(tokens[8], "zu")
+ assert.Equal(tokens[9], "spät")
+ assert.Equal(tokens[10], "&gt;")
+ assert.Equal(tokens[11], ".")
+ assert.Equal(12, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/entities.xfst b/src/entities.xfst
new file mode 100644
index 0000000..92430ef
--- /dev/null
+++ b/src/entities.xfst
@@ -0,0 +1,5 @@
+define XMLentitiesDec ["#" Digit Digit+ ];
+define XMLentitiesHex ["#" ["x"|"X"] HexLetter HexLetter+ ];
+define XMLentitiesStr [[[AsciiLetter .o. Caseinsensitive].l | Digit | "_" | "-" ]+ ];
+
+read regex ["&" [XMLentitiesDec|XMLentitiesHex|XMLentitiesStr] ";"];
\ No newline at end of file
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 742f147..dca20c4 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -54,7 +54,7 @@
! from book
[%‘ %‘]|[%’ %’]];
-define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@];
+define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
define Apos %'|%’|%`;
define Punct [LP|RP|Sym];
!define nonSym \[WS|LP|RP|Sym];
@@ -163,7 +163,12 @@
] WS* ">"
].u;
-!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
+! XML entities
+source entities.xfst
+define XMLEntities;
+
+
+! Email addresses
define Email URLChar+ At URLChar+ [Dot URLChar+]+;
! Twitter user, hashtag, Google+
@@ -225,6 +230,7 @@
echo - Introduce Token splitter
define Token [
+ XMLEntities @-> ... NLout,
Abbr @-> ... NLout,
RealToken @-> ... NLout,
Plusampersand @-> ... NLout,