Merge "Add notification regarding load factor"
diff --git a/Changes b/Changes
index 5fca5a1..b9359ff 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.1.6 2022-04-14
+    - Rename TOKEN_SYMBOL to TOKEN_BOUND.
+
 0.1.5 2022-03-28
     - Improve Emoticon-List.
 
diff --git a/Readme.md b/Readme.md
index f02e452..353bc36 100644
--- a/Readme.md
+++ b/Readme.md
@@ -59,14 +59,14 @@
 - Character accepting arcs need to be translated
   *only* to themselves or to ε (the empty symbol).
 - Multi-character symbols are not allowed,
-  except for the `@_TOKEN_SYMBOL_@`,
+  except for the `@_TOKEN_BOUND_@`,
   which denotes the end of a token.
 - ε accepting arcs (transitions not consuming
   any character) need to be translated to
-  the `@_TOKEN_SYMBOL_@`.
-- Two consecutive `@_TOKEN_SYMBOL_@`s mark a sentence end.
+  the `@_TOKEN_BOUND_@`.
+- Two consecutive `@_TOKEN_BOUND_@`s mark a sentence end.
 - Flag diacritics are not supported.
-- Final states are ignored. The `@_TOKEN_SYMBOL_@` marks
+- Final states are ignored. The `@_TOKEN_BOUND_@` marks
   the end of a token instead.
 
 A minimal usable tokenizer written in XFST and following
@@ -74,7 +74,7 @@
 and Beesley (2004) would look like this:
 
 ```xfst
-define TE "@_TOKEN_SYMBOL_@";
+define TB "@_TOKEN_BOUND_@";
 
 define WS [" "|"\u000a"|"\u0009"];
 
@@ -84,17 +84,17 @@
 
 define Word Char+;
 
-! Compose token ends
-define Tokenizer [[Word|PUNCT] @-> ... TE] .o.
+! Compose token bounds
+define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
 ! Compose whitespace removal
        [WS+ @-> 0] .o.
 ! Compose sentence ends
-       [[PUNCT+] @-> ... TE \/ TE _ ];
+       [[PUNCT+] @-> ... TB \/ TB _ ];
 
 read regex Tokenizer;
 ```
 
-> *Hint*: For development it's easier to replace `@_TOKEN_SYMBOL_@`
+> *Hint*: For development, it's easier to replace `@_TOKEN_BOUND_@`
 with a newline.
 
 ## Building
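
Downstream code only needs to watch for the bound marker to recover both tokens and sentence boundaries. Below is a minimal Go sketch, assuming the bound has been rendered as a newline as suggested in the hint above; the sample input and all names are illustrative and not part of the library:

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

func main() {
	// One token per line; an empty line corresponds to two consecutive
	// token bounds and therefore marks a sentence end.
	input := "Das\nist\nein\nSatz\n.\n\nNoch\neiner\n.\n\n"

	scanner := bufio.NewScanner(strings.NewReader(input))
	sentence := []string{}
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			fmt.Println(sentence) // e.g. [Das ist ein Satz .]
			sentence = []string{}
			continue
		}
		sentence = append(sentence, line)
	}
}
```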
diff --git a/datok_test.go b/datok_test.go
index 9558661..8562a98 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -213,7 +213,7 @@
 		dat = LoadDatokFile("testdata/tokenizer.datok")
 	}
 	assert.NotNil(dat)
-	assert.True(dat.LoadFactor() >= 70)
+	assert.True(dat.LoadFactor() >= 60)
 	assert.Equal(dat.epsilon, 1)
 	assert.Equal(dat.unknown, 2)
 	assert.Equal(dat.identity, 3)
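
The relaxed threshold matches the merge subject: the double array may be filled less densely, so a consumer should at least be notified about it. A minimal sketch of such a notification follows, assuming it lives inside the library's own package; LoadDatokFile and LoadFactor are used as in the test above, while the function name and log message are made up:

```go
package datok // sketch only: assumes the library's own package

import "log"

// warnOnLowLoadFactor is illustrative: it loads a double array
// tokenizer and logs a notification when the load factor falls
// below the 60% asserted in the test above.
func warnOnLowLoadFactor(path string) {
	dat := LoadDatokFile(path)
	if dat == nil {
		return
	}
	if lf := dat.LoadFactor(); lf < 60 {
		log.Printf("tokenizer %s has a low load factor: %v%%", path, lf)
	}
}
```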
diff --git a/fomafile.go b/fomafile.go
index 8f8f046..9d7106c 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -409,10 +409,16 @@
 							auto.identity = number
 						}
 
+						// Deprecated: @_TOKEN_SYMBOL_@ is kept as an alias of @_TOKEN_BOUND_@
 					case "@_TOKEN_SYMBOL_@":
 						{
 							auto.tokenend = number
 						}
+
+					case "@_TOKEN_BOUND_@":
+						{
+							auto.tokenend = number
+						}
 					default:
 						{
 							// MCS not supported
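
Since the deprecated symbol and its replacement set the same field, the two cases could also be collapsed into a single clause. The self-contained sketch below shows that alternative; the automaton struct and setSpecial function are illustrative stand-ins, and only the two marker strings and the tokenend field name come from the hunk above:

```go
package main

import "fmt"

type automaton struct {
	tokenend int
}

// setSpecial maps a multi-character symbol to its number, treating the
// deprecated @_TOKEN_SYMBOL_@ as an alias of @_TOKEN_BOUND_@.
func (auto *automaton) setSpecial(symbol string, number int) {
	switch symbol {
	case "@_TOKEN_SYMBOL_@", "@_TOKEN_BOUND_@":
		auto.tokenend = number
	default:
		// other multi-character symbols are not supported
	}
}

func main() {
	auto := &automaton{}
	auto.setSpecial("@_TOKEN_BOUND_@", 4)
	fmt.Println(auto.tokenend) // 4
}
```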
diff --git a/matrix_test.go b/matrix_test.go
index 6681b2e..d650b52 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -1116,6 +1116,14 @@
 	assert.Equal(tokens[7], "-_-;;;")
 	assert.Equal(tokens[8], "-_-^")
 	assert.Equal(len(tokens), 9)
+
+	tokens = ttokenize(mat, w, "das -> Lustig<-!")
+	assert.Equal("das", tokens[0])
+	assert.Equal("->", tokens[1])
+	assert.Equal("Lustig", tokens[2])
+	assert.Equal("<-", tokens[3])
+	assert.Equal("!", tokens[4])
+	assert.Equal(5, len(tokens))
 }
 
 func TestMatrixFullTokenizerXML(t *testing.T) {
@@ -1166,6 +1174,22 @@
 	assert.Equal("ging", tokens[2])
 	assert.Equal(".", tokens[3])
 	assert.Equal(4, len(tokens))
+
+	tokens = ttokenize(mat, w, "das  <?robot xgh ?>  <!-- hm hm -->   <![CDATA[ cdata ]]>  <br />")
+	assert.Equal("das", tokens[0])
+	assert.Equal("<?robot", tokens[1])
+	assert.Equal("xgh", tokens[2])
+	assert.Equal("?>", tokens[3])
+	assert.Equal("<!--", tokens[4])
+	assert.Equal("hm", tokens[5])
+	assert.Equal("hm", tokens[6])
+	assert.Equal("-->", tokens[7])
+	assert.Equal("<![CDATA[", tokens[8])
+	assert.Equal("cdata", tokens[9])
+	assert.Equal("]]>", tokens[10])
+	assert.Equal("<br />", tokens[11])
+	assert.Equal(12, len(tokens))
+
 }
 
 func TestMatokDatokEquivalence(t *testing.T) {
diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
index 6ea6dcf..32e7a90 100644
--- a/src/all/allpost.xfst
+++ b/src/all/allpost.xfst
@@ -16,6 +16,9 @@
 ! XML entities
 source all/entities.xfst
 
+! Arrows
+define Arrows [Alldash ">" | "<" Alldash];
+
 ! Technical protocols
 source all/protocols.xfst
 
diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
index bb5183f..6d8f3bb 100644
--- a/src/all/allpref.xfst
+++ b/src/all/allpref.xfst
@@ -8,7 +8,7 @@
 !  - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
 !    under the MIT License
 
-define NLout "@_TOKEN_SYMBOL_@";
+define NLout "@_TOKEN_BOUND_@";
 ! define NLout "\u000a";
 
 define Digit [%0|1|2|3|4|5|6|7|8|9];
@@ -58,6 +58,7 @@
 
 define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
 define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
+define Alldash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"|"\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"];
 define Slash ["⁄"|"∕"|"/"|"/"];
 define Asterisk ["*"];
 
diff --git a/src/all/xml.xfst b/src/all/xml.xfst
index 06e247d..4526117 100644
--- a/src/all/xml.xfst
+++ b/src/all/xml.xfst
@@ -1,6 +1,11 @@
 ! XML rule
 define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
-define XML [
+
+define XMLcomment [ %< %! %- %- | %- %- %> ];
+define XMLpi [ %< %? AsciiLetter [AsciiLetter | Digit | %- ]* | %? %> ];
+define CDATA [ %< %! %[ {CDATA} %[ | %] %] %> ];
+
+define XML [[
   "<" [
       [
         XMLns
@@ -16,4 +21,4 @@
         "/" XMLns
       ]
     ] WS* ">"
-].u;
+].u | XMLcomment | XMLpi | CDATA ];
\ No newline at end of file
diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
index 407c482..9670f9a 100644
--- a/src/de/tokenizer.xfst
+++ b/src/de/tokenizer.xfst
@@ -73,7 +73,7 @@
   Email @-> ... NLout,
   File @-> ... NLout,
   Domain @-> ... NLout,
-  Emoticons @-> ... NLout
+  [Emoticons|Arrows] @-> ... NLout
 ];
 
 source all/allsentencesplit.xfst