Fix sentence splitting tests
diff --git a/datokenizer.go b/datokenizer.go
index d862164..c37aaeb 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -1347,6 +1347,7 @@
fmt.Println("-> Flush buffer: [", string(data), "]")
}
writer.Write(data)
+
if dat.isTokenEnd(t) {
writer.WriteRune('\n')
if DEBUG {
@@ -1355,6 +1356,16 @@
}
}
+ // Add an additional sentence ending if the input is over but no explicit
+ // sentence split was reached. This may be controversial and should therefore
+ // be made optional via a parameter (NOTE(review): no such parameter is visible here).
+ if !dat.isTokenEnd(t0) {
+ writer.WriteRune('\n')
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+
// There may be a new line at the end, from an epsilon, so we go on!
return true
}
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 05847b2..720199f 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -81,7 +81,8 @@
assert.Equal("--", tokens[2])
assert.Equal("D", tokens[3])
assert.Equal("", tokens[4])
- assert.Equal(5, len(tokens))
+ assert.Equal("", tokens[5])
+ assert.Equal(6, len(tokens))
}
func TestReadWriteTokenizer(t *testing.T) {
@@ -201,7 +202,7 @@
assert.True(dat.Transduce(strings.NewReader(""), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 1)
- assert.Equal("", sentences[0])
+ assert.Equal("\n", sentences[0])
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
@@ -213,41 +214,43 @@
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
- /*
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum\n", sentences[0])
- assert.Equal(len(sentences), 1)
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
+ assert.Equal("", sentences[1])
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal("", sentences[1])
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
- assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen\n", sentences[0])
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
+ assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
+ assert.Equal("", sentences[1])
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
- assert.Equal("Ausschalten\n!!!", sentences[0])
- assert.Equal("Hast\nDu\nnicht\ngehört\n???\n", sentences[1])
- */
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 3)
+ assert.Equal("Ausschalten\n!!!", sentences[0])
+ assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
+ assert.Equal("", sentences[2])
/*
w.Reset()