Merge "Add notification regarding load factor"
diff --git a/Changes b/Changes
index 5fca5a1..b9359ff 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.1.6 2022-04-14
+ - Rename @_TOKEN_SYMBOL_@ to @_TOKEN_BOUND_@ (the old symbol is still accepted, but deprecated).
+
0.1.5 2022-03-28
- Improve Emoticon-List.
diff --git a/Readme.md b/Readme.md
index f02e452..353bc36 100644
--- a/Readme.md
+++ b/Readme.md
@@ -59,14 +59,14 @@
- Character accepting arcs need to be translated
*only* to themselves or to ε (the empty symbol).
- Multi-character symbols are not allowed,
- except for the `@_TOKEN_SYMBOL_@`,
+ except for the `@_TOKEN_BOUND_@`,
that denotes the end of a token.
- ε accepting arcs (transitions not consuming
any character) need to be translated to
- the `@_TOKEN_SYMBOL_@`.
-- Two consecutive `@_TOKEN_SYMBOL_@`s mark a sentence end.
+ the `@_TOKEN_BOUND_@`.
+- Two consecutive `@_TOKEN_BOUND_@`s mark a sentence end.
- Flag diacritics are not supported.
-- Final states are ignored. The `@_TOKEN_SYMBOL_@` marks
+- Final states are ignored. The `@_TOKEN_BOUND_@` marks
the end of a token instead.
A minimal usable tokenizer written in XFST and following
@@ -74,7 +74,7 @@
and Beesley (2004) would look like this:
```xfst
-define TE "@_TOKEN_SYMBOL_@";
+define TB "@_TOKEN_BOUND_@";
define WS [" "|"\u000a"|"\u0009"];
@@ -84,17 +84,17 @@
define Word Char+;
-! Compose token ends
-define Tokenizer [[Word|PUNCT] @-> ... TE] .o.
+! Compose token bounds
+define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
! Compose Whitespace ignorance
[WS+ @-> 0] .o.
! Compose sentence ends
- [[PUNCT+] @-> ... TE \/ TE _ ];
+ [[PUNCT+] @-> ... TB \/ TB _ ];
read regex Tokenizer;
```
-> *Hint*: For development it's easier to replace `@_TOKEN_SYMBOL_@`
+> *Hint*: For development it's easier to replace `@_TOKEN_BOUND_@`
with a newline.
## Building
diff --git a/datok_test.go b/datok_test.go
index 9558661..8562a98 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -213,7 +213,7 @@
dat = LoadDatokFile("testdata/tokenizer.datok")
}
assert.NotNil(dat)
- assert.True(dat.LoadFactor() >= 70)
+ assert.True(dat.LoadFactor() >= 60)
assert.Equal(dat.epsilon, 1)
assert.Equal(dat.unknown, 2)
assert.Equal(dat.identity, 3)
diff --git a/fomafile.go b/fomafile.go
index 8f8f046..9d7106c 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -409,10 +409,16 @@
auto.identity = number
}
+ // Deprecated: former name of "@_TOKEN_BOUND_@", still accepted for older grammars.
case "@_TOKEN_SYMBOL_@":
{
auto.tokenend = number
}
+
+ case "@_TOKEN_BOUND_@":
+ {
+ auto.tokenend = number
+ }
default:
{
// MCS not supported
diff --git a/matrix_test.go b/matrix_test.go
index 6681b2e..d650b52 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -1116,6 +1116,14 @@
assert.Equal(tokens[7], "-_-;;;")
assert.Equal(tokens[8], "-_-^")
assert.Equal(len(tokens), 9)
+
+ tokens = ttokenize(mat, w, "das -> Lustig<-!")
+ assert.Equal("das", tokens[0])
+ assert.Equal("->", tokens[1])
+ assert.Equal("Lustig", tokens[2])
+ assert.Equal("<-", tokens[3])
+ assert.Equal("!", tokens[4])
+ assert.Equal(5, len(tokens))
}
func TestMatrixFullTokenizerXML(t *testing.T) {
@@ -1166,6 +1174,22 @@
assert.Equal("ging", tokens[2])
assert.Equal(".", tokens[3])
assert.Equal(4, len(tokens))
+
+ tokens = ttokenize(mat, w, "das <?robot xgh ?> <!-- hm hm --> <![CDATA[ cdata ]]> <br />")
+ assert.Equal("das", tokens[0])
+ assert.Equal("<?robot", tokens[1])
+ assert.Equal("xgh", tokens[2])
+ assert.Equal("?>", tokens[3])
+ assert.Equal("<!--", tokens[4])
+ assert.Equal("hm", tokens[5])
+ assert.Equal("hm", tokens[6])
+ assert.Equal("-->", tokens[7])
+ assert.Equal("<![CDATA[", tokens[8])
+ assert.Equal("cdata", tokens[9])
+ assert.Equal("]]>", tokens[10])
+ assert.Equal("<br />", tokens[11])
+ assert.Equal(12, len(tokens))
+
}
func TestMatokDatokEquivalence(t *testing.T) {
diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
index 6ea6dcf..32e7a90 100644
--- a/src/all/allpost.xfst
+++ b/src/all/allpost.xfst
@@ -16,6 +16,9 @@
! XML entities
source all/entities.xfst
+! Arrows
+define Arrows [Alldash ">" | "<" Alldash];
+
! Technical protocols
source all/protocols.xfst
diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
index bb5183f..6d8f3bb 100644
--- a/src/all/allpref.xfst
+++ b/src/all/allpref.xfst
@@ -8,7 +8,7 @@
! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
! under the MIT License
-define NLout "@_TOKEN_SYMBOL_@";
+define NLout "@_TOKEN_BOUND_@";
! define NLout "\u000a";
define Digit [%0|1|2|3|4|5|6|7|8|9];
@@ -58,6 +58,7 @@
define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
+define Alldash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"|"\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"];
define Slash ["⁄"|"∕"|"/"|"/"];
define Asterisk ["*"];
diff --git a/src/all/xml.xfst b/src/all/xml.xfst
index 06e247d..4526117 100644
--- a/src/all/xml.xfst
+++ b/src/all/xml.xfst
@@ -1,6 +1,11 @@
! XML rule
define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
-define XML [
+
+define XMLcomment [ %< %! %- %- | %- %- %> ];
+define XMLpi [ %< %? AsciiLetter [AsciiLetter | Digit | %- ]* | %? %> ];
+define CDATA [ %< %! %[ {CDATA} %[ | %] %] %> ];
+
+define XML [[
"<" [
[
XMLns
@@ -16,4 +21,4 @@
"/" XMLns
]
] WS* ">"
-].u;
+].u | XMLcomment | XMLpi | CDATA ];
\ No newline at end of file
diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
index 407c482..9670f9a 100644
--- a/src/de/tokenizer.xfst
+++ b/src/de/tokenizer.xfst
@@ -73,7 +73,7 @@
Email @-> ... NLout,
File @-> ... NLout,
Domain @-> ... NLout,
- Emoticons @-> ... NLout
+ [Emoticons|Arrows] @-> ... NLout
];
source all/allsentencesplit.xfst