Improve Emoticons
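
Replace the emoji grammar (src/emoji.xfst) with a broader emoticon
grammar (src/emoticons.xfst): vertical emoticons like ^_^ and T_T are
factored into a reusable definition that allows repeated connectors
and optional trailing characters, and a regression test covers the
new patterns, e.g. "^^;" and "-_-;;;" are now single tokens.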

Change-Id: I0d72781b41381aa2c86e41287b8f824af4af95d1
diff --git a/Changes b/Changes
index 9612c12..5fca5a1 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.1.5 2022-03-28
+    - Improved emoticon list.
+
 0.1.4 2022-03-27
     - Improved handling of ellipsis.
     - Make algorithm more robust to never fail.
diff --git a/matrix_test.go b/matrix_test.go
index ac9b054..3509d15 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -376,7 +376,7 @@
 	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
 	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
 
-	// Check paranthesis at the end of sentences.
+	// Check parentheses at the end of the sentence.
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
 	sentences = strings.Split(w.String(), "\n\n")
@@ -1083,6 +1083,33 @@
 	*/
 }
 
+func TestMatrixEmoticons(t *testing.T) {
+	assert := assert.New(t)
+
+	if mat == nil {
+		mat = LoadMatrixFile("testdata/tokenizer.matok")
+	}
+
+	assert.NotNil(mat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+	var tokens []string
+
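+	// One input string with nine space-separated emoticons should yield nine tokens.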
+	tokens = ttokenize(mat, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
+	assert.Equal(":-*", tokens[0])
+	assert.Equal(";)", tokens[1])
+	assert.Equal(":))", tokens[2])
+	assert.Equal(":*(", tokens[3])
+	assert.Equal("^___^", tokens[4])
+	assert.Equal("T__T", tokens[5])
+	assert.Equal("^^;", tokens[6])
+	assert.Equal("-_-;;;", tokens[7])
+	assert.Equal("-_-^", tokens[8])
+	assert.Equal(9, len(tokens))
+}
+
 func TestMatrixFullTokenizerXML(t *testing.T) {
 	assert := assert.New(t)
 
diff --git a/src/emoji.xfst b/src/emoji.xfst
deleted file mode 100644
index d36ed38..0000000
--- a/src/emoji.xfst
+++ /dev/null
@@ -1,28 +0,0 @@
-read regex [
-["<" ("/") "3"+]|
-["ಠ" "_" "ಠ"]|
-["(" "T" ["_"|"."] "T" ")"]|
-["(" "♥" ["_"|"."] "♥" ")"]|
-["(" "-" ["_"|"."] "-" ")"]|
-
-! May be end of brackets as well, like
-!   Author (2018):
-[[")"|"("] ("'"|"-"|"o") [":"|"="|"x"]]|
-! May be end of xml, like
-!   <b class="emp">=</b>
-[["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ("'"|"-"|"o") ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]]|
-[["D"|">"] ("'") ":"]|
-
-! May be end of square bracket
-!   Author [2018]:
-["]" ":"]|
-["x" "("]|
-["^" (".") "^"]|
-["o" (".") "O"]|
-[%\ ["{" "o" "}"|"o"|"m"] "/"]|
-["*" "_" "*"]|
-["." "_" "."]|
-[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"]|
-[">" "_" "<"]|
-["*" "<" ":" "-" ")"]
-];
diff --git a/src/emoticons.xfst b/src/emoticons.xfst
new file mode 100644
index 0000000..74bffff
--- /dev/null
+++ b/src/emoticons.xfst
@@ -0,0 +1,49 @@
+! Partially based on Park, Barash, Fink & Cha (2013)
+
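+! Vertical emoticons such as ^_^ or T_T consist of two "eye"
+! characters joined by connector characters ("_", "." or "-").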
+define verticalemoticon [
+[ "ಠ" "_" "ಠ"]|
+[ "T" ["_"|"."|"-"]+ "T"] |
+[ "♥" ["_"|"."|"-"]+ "♥" ] |
+[ "@" ["_"|"."|"-"]* "@" ] |
+[ "*" ["_"|"."|"-"]+ "*" ] |
+[ "x" ["_"|"."|"-"]+ "x" ] |
+[ "X" ["_"|"."|"-"]+ "X" ] |
+[ "-" ["_"|"."]+ "-" ] |
+[ "." ["_"]+ "." ] |
+[ "^" ["_"|"."|"-"]* "^" ] |
+[ ">" ["_"|"."|"-"]* "<" ] |
+[ ["o"|"O"] ["_"|"."|"-"]+ ["o"|"O"] ] 
+];
+
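+! The emoticon transducer: hearts, vertical emoticons with optional
+! trailing ";" or "^", and various western-style emoticons.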
+read regex [
+["<" ("/") "3"+] |
+verticalemoticon (";"+|"^") |
+["(" verticalemoticon ")"] |
+
+! May be end of brackets as well, like
+!   Author (2018):
+[ [")"|"("] ["'"|"-"|"o"]* [":"|"="|"x"] ] |
+! May be end of xml, like
+!   <b class="emp">=</b>
+[ ["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ["'"|"-"|"o"]* ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]] |
+[ ["D"|">"] ("'") ":"] |
+
+! May be end of square bracket
+!   Author [2018]:
+["]" ":"] |
+[(">") [";"|":"] ["-"|"*"]* [ ")" | "(" | %] | %[ ]+ ] |
+[(">") [";"|":"] ["-"]* ["*"|"P"|"p"|"o"|"O"|"D"]] |
+["x" "("] |
+["^" (".") "^"] |
+[%\ ["{" "o" "}"|"o"|"m"] "/"] |
+[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"] |
+[">" "_" "<"] |
+["*" "<" ":" "-" ")"]
+];
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 5fe1aec..4372d4a 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -119,8 +119,8 @@
 ! 20:00 Uhr, 00:12:25,34 Minuten
 define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
 
-source emoji.xfst
-define Emoji;
+source emoticons.xfst
+define Emoticons;
 
 ! acronyms: U.S.A., I.B.M., etc.
 ! use a post-filter to remove dots
@@ -215,7 +215,7 @@
   Email @-> ... NLout,
   File @-> ... NLout,
   Domain @-> ... NLout,
-  Emoji @-> ... NLout
+  Emoticons @-> ... NLout
 ];
 
 echo - Introduce Sentence splitter
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index 9dddefa..6ebbaba 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index def2160..1c25aed 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index 3591617..41894f3 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ