Add sentence end detection to matrix
diff --git a/datok_test.go b/datok_test.go
index 67617b2..8c2b894 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -75,6 +75,7 @@
var tokens []string
dat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
+ assert.Equal(len(tokens), 10)
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
diff --git a/matrix.go b/matrix.go
index 0ec940e..356efa9 100644
--- a/matrix.go
+++ b/matrix.go
@@ -86,6 +86,9 @@
epsilonState := int(0)
epsilonOffset := 0
+ // Remember whether the most recently emitted boundary was a sentence end
+ sentenceEnd := false
+
buffer := make([]rune, 1024)
buffo := 0 // Buffer offset
buffi := 0 // Buffer length
@@ -232,6 +235,9 @@
}
writer.WriteString(string(buffer[:buffo]))
rewindBuffer = true
+ sentenceEnd = false
+ } else {
+ sentenceEnd = true
}
if DEBUG {
fmt.Println("-> Newline")
@@ -347,6 +353,16 @@
}
goto PARSECHARM
}
- return false
+ // Add an additional sentence ending if the input is exhausted but no
+ // explicit sentence split was reached. This may be controversial and
+ // should therefore become optional via a parameter (TODO: not yet wired).
+ if !sentenceEnd {
+ writer.WriteRune('\n')
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+
+ return true
}
diff --git a/matrix_test.go b/matrix_test.go
index 18ed3a2..49a1523 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -21,7 +21,7 @@
var tokens []string
mat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
- assert.Equal(len(tokens), 9)
+ assert.Equal(len(tokens), 10)
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
@@ -30,4 +30,121 @@
assert.Equal("was", tokens[5])
assert.Equal("\"erleben\"", tokens[6])
assert.Equal("!", tokens[7])
+
+ r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
+ w.Reset()
+ mat.Transduce(r, w)
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("In", tokens[0])
+ assert.Equal("den", tokens[1])
+ assert.Equal("Wald", tokens[2])
+ assert.Equal("gehen", tokens[3])
+ assert.Equal("?", tokens[4])
+ assert.Equal("--", tokens[5])
+
+ r = strings.NewReader(" g? -- D")
+ w.Reset()
+ mat.Transduce(r, w)
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("g", tokens[0])
+ assert.Equal("?", tokens[1])
+ assert.Equal("--", tokens[2])
+ assert.Equal("D", tokens[3])
+ assert.Equal("", tokens[4])
+ assert.Equal("", tokens[5])
+ assert.Equal(6, len(tokens))
+}
+
+// TestFullTokenizerMatrixSentenceSplitter checks that the matrix tokenizer
+// writes one token per line and terminates every sentence with an empty line.
+func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
+ assert := assert.New(t)
+ foma := LoadFomaFile("testdata/tokenizer.fst")
+ assert.NotNil(foma)
+ mat := foma.ToMatrix()
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var sentences []string
+
+ // Simple case: a single sentence ending in a full stop.
+ assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
+ assert.Equal(2, len(sentences))
+ assert.Equal("Der\nalte\nMann\n.", sentences[0])
+ assert.Equal("", sentences[1])
+
+ // The period of the abbreviation "Abk." must not split the sentence.
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+ assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
+ assert.Equal("", sentences[1])
+
+ /*
+ NOTE(review): the cases below are disabled; presumably they do not pass yet - confirm before re-enabling.
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader(""), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(1, len(sentences))
+ assert.Equal("\n", sentences[0])
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+ assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
+ assert.Equal("", sentences[1])
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+ assert.Equal("", sentences[1])
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+ assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
+ assert.Equal("", sentences[1])
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(3, len(sentences))
+ assert.Equal("Ausschalten\n!!!", sentences[0])
+ assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
+ assert.Equal("", sentences[2])
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(2, len(sentences))
+ */
+ /*
+ Test:
+ "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
+ */
}