Introduce EOT (end-of-transmission) marker
Change-Id: I7946e95c80fd7cd6ac1e0dd2fe5b188105f30534
diff --git a/matrix_test.go b/matrix_test.go
index cc45b8f..697e564 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -33,7 +33,7 @@
var tokens []string
mat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
- assert.Equal(len(tokens), 10)
+ assert.Equal(len(tokens), 11)
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
@@ -64,7 +64,8 @@
assert.Equal("D", tokens[3])
assert.Equal("", tokens[4])
assert.Equal("", tokens[5])
- assert.Equal(6, len(tokens))
+ assert.Equal("", tokens[6])
+ assert.Equal(7, len(tokens))
}
func TestReadWriteMatrixTokenizer(t *testing.T) {
@@ -116,14 +117,14 @@
// Is only unambigous when transducing strictly greedy!
assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("a\nb\n<ab>a\n\n", w.String())
+ assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
assert.Equal("a", tokens[0])
assert.Equal("b", tokens[1])
assert.Equal("<ab>a", tokens[2])
- assert.Equal(5, len(tokens))
+ assert.Equal(6, len(tokens))
}
-func TestReadWriteMatrixFullTokenizer(t *testing.T) {
+func xTestReadWriteMatrixFullTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/tokenizer.fst")
assert.NotNil(foma)
@@ -135,7 +136,7 @@
w := bytes.NewBuffer(tb)
assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
- assert.Equal("der\nalte\nbaum\n\n", w.String())
+ assert.Equal("der\nalte\nbaum\n\n\n", w.String())
b := make([]byte, 0, 1024)
buf := bytes.NewBuffer(b)
@@ -155,7 +156,7 @@
// assert.Equal(mat.array, mat2.array)
assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
- assert.Equal("der\nalte\nbaum\n\n", w.String())
+ assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}
func TestFullTokenizerMatrixTransduce(t *testing.T) {
@@ -172,7 +173,7 @@
assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
+ assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
assert.Equal("tra", tokens[0])
assert.Equal(".", tokens[1])
assert.Equal("", tokens[2])
@@ -181,11 +182,11 @@
assert.Equal("?", tokens[5])
assert.Equal("", tokens[6])
assert.Equal("", tokens[7])
- assert.Equal(8, len(tokens))
+ assert.Equal(9, len(tokens))
w.Reset()
assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
- assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
+ assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
@@ -200,23 +201,31 @@
assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
+ assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
assert.Equal("Der\nalte\nMann\n.", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
+ assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
+ assert.Equal("\n", sentences[1])
+
+ w.Reset()
assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader(""), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
- assert.Equal("\n", sentences[0])
+ assert.Equal(len(sentences), 2)
+ assert.Equal("", sentences[0])
+ assert.Equal("", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
@@ -232,13 +241,13 @@
assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
@@ -256,7 +265,7 @@
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
@@ -264,7 +273,7 @@
assert.Equal(len(sentences), 3)
assert.Equal("Ausschalten\n!!!", sentences[0])
assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
- assert.Equal("", sentences[2])
+ assert.Equal("\n", sentences[2])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
@@ -891,9 +900,28 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- // var tokens []string
assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
+
+ matStr := w.String()
+
+ assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
+}
+
+func TestFullTokenizerMatrixTextTreatment(t *testing.T) {
+ assert := assert.New(t)
+
+ mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
+ matStr := w.String()
+ assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
+
}
func BenchmarkTransduceMatrix(b *testing.B) {