Improve Emoticons
Change-Id: I0d72781b41381aa2c86e41287b8f824af4af95d1
diff --git a/Changes b/Changes
index 9612c12..5fca5a1 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.1.5 2022-03-28
+ - Improve Emoticon-List.
+
0.1.4 2022-03-27
- Improved handling of ellipsis.
- Make algorithm more robust to nevere fail.
diff --git a/matrix_test.go b/matrix_test.go
index ac9b054..3509d15 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -376,7 +376,7 @@
assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
- // Check paranthesis at the end of sentences.
+	// Check parentheses at the end of the sentence
w.Reset()
assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
sentences = strings.Split(w.String(), "\n\n")
@@ -1083,6 +1083,32 @@
*/
}
+func TestMatrixEmoticons(t *testing.T) {
+	assert := assert.New(t)
+
+	if mat == nil {
+		mat = LoadMatrixFile("testdata/tokenizer.matok")
+	}
+
+	assert.NotNil(mat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+	var tokens []string
+
+	tokens = ttokenize(mat, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
+	assert.Equal(":-*", tokens[0])
+	assert.Equal(";)", tokens[1])
+	assert.Equal(":))", tokens[2])
+	assert.Equal(":*(", tokens[3])
+	assert.Equal("^___^", tokens[4])
+	assert.Equal("T__T", tokens[5])
+	assert.Equal("^^;", tokens[6])
+	assert.Equal("-_-;;;", tokens[7])
+	assert.Equal("-_-^", tokens[8])
+	assert.Equal(9, len(tokens))
+}
+
func TestMatrixFullTokenizerXML(t *testing.T) {
assert := assert.New(t)
diff --git a/src/emoji.xfst b/src/emoji.xfst
deleted file mode 100644
index d36ed38..0000000
--- a/src/emoji.xfst
+++ /dev/null
@@ -1,28 +0,0 @@
-read regex [
-["<" ("/") "3"+]|
-["ಠ" "_" "ಠ"]|
-["(" "T" ["_"|"."] "T" ")"]|
-["(" "♥" ["_"|"."] "♥" ")"]|
-["(" "-" ["_"|"."] "-" ")"]|
-
-! May be end of brackets as well, like
-! Author (2018):
-[[")"|"("] ("'"|"-"|"o") [":"|"="|"x"]]|
-! May be end of xml, like
-! <b class="emp">=</b>
-[["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ("'"|"-"|"o") ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]]|
-[["D"|">"] ("'") ":"]|
-
-! May be end of square bracket
-! Author [2018]:
-["]" ":"]|
-["x" "("]|
-["^" (".") "^"]|
-["o" (".") "O"]|
-[%\ ["{" "o" "}"|"o"|"m"] "/"]|
-["*" "_" "*"]|
-["." "_" "."]|
-[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"]|
-[">" "_" "<"]|
-["*" "<" ":" "-" ")"]
-];
diff --git a/src/emoticons.xfst b/src/emoticons.xfst
new file mode 100644
index 0000000..74bffff
--- /dev/null
+++ b/src/emoticons.xfst
@@ -0,0 +1,42 @@
+! Partially by Park, Barash, Fink & Cha (2013)
+
+define verticalemoticon [
+[ "ಠ" "_" "ಠ"]|
+[ "T" ["_"|"."|"-"]+ "T"] |
+[ "♥" ["_"|"."|"-"]+ "♥" ] |
+[ "@" ["_"|"."|"-"]* "@" ] |
+[ "*" ["_"|"."|"-"]+ "*" ] |
+[ "x" ["_"|"."|"-"]+ "x" ] |
+[ "X" ["_"|"."|"-"]+ "X" ] |
+[ "-" ["_"|"."]+ "-" ] |
+[ "." ["_"]+ "." ] |
+[ "^" ["_"|"."|"-"]* "^" ] |
+[ ">" ["_"|"."|"-"]* "<" ] |
+[ ["o"|"O"] ["_"|"."|"-"]+ ["o"|"O"] ]
+];
+
+read regex [
+["<" ("/") "3"+] |
+verticalemoticon (";"+|"^") |
+["(" verticalemoticon ")"] |
+
+! May be end of brackets as well, like
+! Author (2018):
+[ [")"|"("] ["'"|"-"|"o"]* [":"|"="|"x"] ] |
+! May be end of xml, like
+! <b class="emp">=</b>
+[ ["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ["'"|"-"|"o"]* ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]] |
+[ ["D"|">"] ("'") ":"] |
+
+! May be end of square bracket
+! Author [2018]:
+["]" ":"] |
+[(">") [";"|":"] ["-"|"*"]* [ ")" | "(" | %] | %[ ]+ ] |
+[(">") [";"|":"] ["-"]* ["*"|"P"|"p"|"o"|"O"|"D"]] |
+["x" "("] |
+["^" (".") "^"] |
+[%\ ["{" "o" "}"|"o"|"m"] "/"] |
+[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"] |
+[">" "_" "<"] |
+["*" "<" ":" "-" ")"]
+];
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 5fe1aec..4372d4a 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -119,8 +119,8 @@
! 20:00 Uhr, 00:12:25,34 Minuten
define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
-source emoji.xfst
-define Emoji;
+source emoticons.xfst
+define Emoticons;
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
@@ -215,7 +215,7 @@
Email @-> ... NLout,
File @-> ... NLout,
Domain @-> ... NLout,
- Emoji @-> ... NLout
+ Emoticons @-> ... NLout
];
echo - Introduce Sentence splitter
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index 9dddefa..6ebbaba 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index def2160..1c25aed 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index 3591617..41894f3 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ