Introduce matrix serialization and deserialization
diff --git a/datok_test.go b/datok_test.go
index 1ebb167..4b4bd1e 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -1008,3 +1008,8 @@
// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op
+// 2021-09-30 - Go 1.17.1
+// BenchmarkTransduce-4 47222 25962 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 69192 17355 ns/op 10704 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 16 65042885 ns/op 6357794 B/op 2576 allocs/op
+// BenchmarkTransduceMatrix-4 45404 25156 ns/op 8240 B/op 3 allocs/op
diff --git a/matrix.go b/matrix.go
index 4b439d0..ac53b9a 100644
--- a/matrix.go
+++ b/matrix.go
@@ -23,8 +23,6 @@
epsilon int
unknown int
identity int
- // final int
- // tokenend int
}
// ToMatrix turns the intermediate tokenizer into a
@@ -32,17 +30,14 @@
func (auto *Automaton) ToMatrix() *MatrixTokenizer {
mat := &MatrixTokenizer{
- sigma: make(map[rune]int),
- // final: auto.final,
- unknown: auto.unknown,
- identity: auto.identity,
- epsilon: auto.epsilon,
- // tokenend: auto.tokenend,
+ sigma: make(map[rune]int),
+ unknown: auto.unknown,
+ identity: auto.identity,
+ epsilon: auto.epsilon,
stateCount: auto.stateCount,
}
- mat.array = make([]uint32, (auto.stateCount+1)*(auto.sigmaCount))
-
+ max := 0
for num, sym := range auto.sigmaRev {
if int(sym) < 256 {
mat.sigmaASCII[int(sym)] = num
@@ -51,9 +46,17 @@
if num > auto.sigmaCount {
panic("sigmaCount is smaller")
}
+ if num > max {
+ max = num
+ }
}
+ // Add final entry to the list (maybe not necessary actually)
+
remember := make([]bool, auto.stateCount+2)
+ // lower sigmaCount, as no final value exists
+ mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
+
// Store all transitions in matrix
var toMatrix func([]uint32, int)
@@ -120,7 +123,8 @@
}
// Get sigma as a list
- sigmalist := make([]rune, len(mat.sigma)+12)
+ // In datok it's 16 - 4*4
+ sigmalist := make([]rune, len(mat.sigma)+16)
max := 0
for sym, num := range mat.sigma {
sigmalist[num] = sym
@@ -129,17 +133,17 @@
}
}
+ // Add final entry to the list (maybe not necessary actually)
sigmalist = sigmalist[:max+1]
- buf := make([]byte, 0, 12)
+ buf := make([]byte, 0, 14)
bo.PutUint16(buf[0:2], VERSION)
bo.PutUint16(buf[2:4], uint16(mat.epsilon))
bo.PutUint16(buf[4:6], uint16(mat.unknown))
bo.PutUint16(buf[6:8], uint16(mat.identity))
- bo.PutUint16(buf[8:10], uint16(mat.stateCount))
- bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
- // bo.PutUint32(buf[12:16], uint32(len(mat.array)*2)) // Legacy support
- more, err := wb.Write(buf[0:12])
+ bo.PutUint32(buf[8:12], uint32(mat.stateCount))
+ bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
+ more, err := wb.Write(buf[0:14])
if err != nil {
log.Println(err)
return int64(all), err
@@ -171,7 +175,6 @@
}
all += more
- // for x := 0; x < len(dat.array); x++ {
for _, x := range mat.array {
bo.PutUint32(buf[0:4], uint32(x))
more, err = wb.Write(buf[0:4])
@@ -184,19 +187,6 @@
log.Println("Can not write base uint32")
return int64(all), err
}
- /*
- bo.PutUint32(buf[0:4], bc.check)
- more, err = wb.Write(buf[0:4])
- if err != nil {
- log.Println(err)
- return int64(all), err
- }
- all += more
- if more != 4 {
- log.Println("Can not write check uint32")
- return int64(all), err
- }
- */
}
return int64(all), err
@@ -229,13 +219,11 @@
// Initialize tokenizer with default values
mat := &MatrixTokenizer{
- sigma: make(map[rune]int),
- epsilon: 0,
- unknown: 0,
- identity: 0,
- // final: 0,
+ sigma: make(map[rune]int),
+ epsilon: 0,
+ unknown: 0,
+ identity: 0,
stateCount: 0,
- // transCount: 0,
}
r := bufio.NewReader(ior)
@@ -255,13 +243,13 @@
return nil
}
- more, err := io.ReadFull(r, buf[0:12])
+ more, err := io.ReadFull(r, buf[0:14])
if err != nil {
log.Println(err)
return nil
}
- if more != 12 {
+ if more != 14 {
log.Println("Read bytes do not fit")
return nil
}
@@ -276,11 +264,9 @@
mat.epsilon = int(bo.Uint16(buf[2:4]))
mat.unknown = int(bo.Uint16(buf[4:6]))
mat.identity = int(bo.Uint16(buf[6:8]))
- mat.stateCount = int(bo.Uint16(buf[8:10]))
-
- sigmaCount := int(bo.Uint16(buf[10:12]))
- arraySize := (mat.stateCount + 1) * (sigmaCount + 1)
- // int(bo.Uint32(buf[12:16]))
+ mat.stateCount = int(bo.Uint32(buf[8:12]))
+ sigmaCount := int(bo.Uint16(buf[12:14]))
+ arraySize := (mat.stateCount + 1) * sigmaCount
// Shouldn't be relevant though
// mat.maxSize = arraySize - 1
@@ -318,12 +304,11 @@
}
if len(dataArray) < arraySize*4 {
- log.Println("Not enough bytes read", len(dataArray), arraySize)
+ log.Println("Not enough bytes read", len(dataArray), arraySize*4)
return nil
}
for x := 0; x < arraySize; x++ {
- // mat.array[x] = bo.Uint32(dataArray[x*8 : (x*8)+4])
mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
}
diff --git a/matrix_test.go b/matrix_test.go
index 37a61b2..71a4fb5 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -2,6 +2,8 @@
import (
"bytes"
+ "fmt"
+ "os"
"strings"
"testing"
@@ -61,7 +63,7 @@
assert.NotNil(foma)
mat := foma.ToMatrix()
- assert.NotNil(foma)
+ assert.NotNil(mat)
assert.True(tmatch(mat, "bau"))
assert.True(tmatch(mat, "bad"))
@@ -70,7 +72,7 @@
buf := bytes.NewBuffer(b)
n, err := mat.WriteTo(buf)
assert.Nil(err)
- assert.Equal(int64(248), n)
+ assert.Equal(int64(230), n)
mat2 := ParseMatrix(buf)
assert.NotNil(mat2)
assert.Equal(mat.sigma, mat2.sigma)
@@ -85,6 +87,74 @@
assert.True(tmatch(mat2, "wald gehen"))
}
+func TestReadWriteMatrixFullTokenizer(t *testing.T) {
+	assert := assert.New(t)
+	foma := LoadFomaFile("testdata/tokenizer.fst")
+	assert.NotNil(foma)
+	// Convert to the matrix form and check the matrix itself (not foma again).
+	mat := foma.ToMatrix()
+	assert.NotNil(mat)
+
+	tb := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(tb)
+	// The freshly built matrix must tokenize the sample input.
+	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
+	assert.Equal("der\nalte\nbaum\n\n", w.String())
+	// Serialize the matrix into an in-memory buffer.
+	b := make([]byte, 0, 1024)
+	buf := bytes.NewBuffer(b)
+	_, err := mat.WriteTo(buf)
+	assert.Nil(err)
+	w.Reset()
+	// assert.Equal(int64(248), n)
+	// Parse it back: sigma, header fields and array length must round-trip.
+	mat2 := ParseMatrix(buf)
+	assert.NotNil(mat2)
+	assert.Equal(mat.sigma, mat2.sigma)
+	assert.Equal(mat.epsilon, mat2.epsilon)
+	assert.Equal(mat.unknown, mat2.unknown)
+	assert.Equal(mat.identity, mat2.identity)
+	assert.Equal(mat.stateCount, mat2.stateCount)
+	assert.Equal(len(mat.array), len(mat2.array))
+	// assert.Equal(mat.array, mat2.array)
+	// The deserialized matrix must produce the same output as the original.
+	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
+	assert.Equal("der\nalte\nbaum\n\n", w.String())
+}
+
+func TestFullTokenizerMatrixTransduce(t *testing.T) {
+ assert := assert.New(t)
+
+ foma := LoadFomaFile("testdata/tokenizer.fst")
+ assert.NotNil(foma)
+
+ mat := foma.ToMatrix()
+
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))
+
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
+ assert.Equal("tra", tokens[0])
+ assert.Equal(".", tokens[1])
+ assert.Equal("", tokens[2])
+ assert.Equal("u", tokens[3])
+ assert.Equal("Du", tokens[4])
+ assert.Equal("?", tokens[5])
+ assert.Equal("", tokens[6])
+ assert.Equal("", tokens[7])
+ assert.Equal(8, len(tokens))
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
+ assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
+}
+
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/tokenizer.fst")
@@ -112,69 +182,682 @@
assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
assert.Equal("", sentences[1])
- /*
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader(""), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 1)
+ assert.Equal("\n", sentences[0])
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader(""), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
- assert.Equal("\n", sentences[0])
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
+ assert.Equal("", sentences[1])
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
- assert.Equal("", sentences[1])
- assert.Equal(len(sentences), 2)
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal("", sentences[1])
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("", sentences[1])
- assert.Equal(len(sentences), 2)
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
+ assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
+ assert.Equal("", sentences[1])
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
- assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
- assert.Equal("", sentences[1])
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 3)
+ assert.Equal("Ausschalten\n!!!", sentences[0])
+ assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
+ assert.Equal("", sentences[2])
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 3)
- assert.Equal("Ausschalten\n!!!", sentences[0])
- assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
- assert.Equal("", sentences[2])
-
- w.Reset()
- assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
- */
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
/*
Test:
"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
*/
}
+
+func TestFullTokenizerMatrixTokenSplitter(t *testing.T) {
+ assert := assert.New(t)
+
+ foma := LoadFomaFile("testdata/tokenizer.fst")
+ assert.NotNil(foma)
+
+ mat := foma.ToMatrix()
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ // testTokenizerSimple
+ tokens = ttokenize(mat, w, "Der alte Mann")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "alte")
+ assert.Equal(tokens[2], "Mann")
+ assert.Equal(len(tokens), 3)
+
+ tokens = ttokenize(mat, w, "Der alte Mann.")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "alte")
+ assert.Equal(tokens[2], "Mann")
+ assert.Equal(tokens[3], ".")
+ assert.Equal(len(tokens), 4)
+
+ // testTokenizerAbbr
+ tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "Vorsitzende")
+ assert.Equal(tokens[2], "der")
+ assert.Equal(tokens[3], "F.D.P.")
+ assert.Equal(tokens[4], "hat")
+ assert.Equal(tokens[5], "gewählt")
+ assert.Equal(len(tokens), 6)
+ // Ignored in KorAP-Tokenizer
+
+ // testTokenizerHost1
+ tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
+ assert.Equal(tokens[0], "Gefunden")
+ assert.Equal(tokens[1], "auf")
+ assert.Equal(tokens[2], "wikipedia.org")
+ assert.Equal(len(tokens), 3)
+
+ // testTokenizerWwwHost
+ tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
+ assert.Equal("Gefunden", tokens[0])
+ assert.Equal("auf", tokens[1])
+ assert.Equal("www.wikipedia.org", tokens[2])
+ assert.Equal(3, len(tokens))
+
+ // testTokenizerWwwUrl
+ tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
+ assert.Equal("www.info.biz/info", tokens[3])
+
+ // testTokenizerFtpHost
+ /*
+ tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
+ assert.Equal("Kann", tokens[0])
+ assert.Equal("von", tokens[1])
+ assert.Equal("ftp.download.org", tokens[2])
+ assert.Equal(5, len(tokens))
+ // Ignored in KorAP-Tokenizer
+ */
+
+ // testTokenizerDash
+ tokens = ttokenize(mat, w, "Das war -- spitze")
+ assert.Equal(tokens[0], "Das")
+ assert.Equal(tokens[1], "war")
+ assert.Equal(tokens[2], "--")
+ assert.Equal(tokens[3], "spitze")
+ assert.Equal(len(tokens), 4)
+
+ // testTokenizerEmail1
+ tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
+ assert.Equal(tokens[0], "Ich")
+ assert.Equal(tokens[1], "bin")
+ assert.Equal(tokens[2], "unter")
+ assert.Equal(tokens[3], "korap@ids-mannheim.de")
+ assert.Equal(tokens[4], "erreichbar")
+ assert.Equal(tokens[5], ".")
+ assert.Equal(len(tokens), 6)
+
+ // testTokenizerEmail2
+ tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
+ assert.Equal(tokens[0], "Oder")
+ assert.Equal(tokens[1], "unter")
+ assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
+ assert.Equal(tokens[3], ".")
+ assert.Equal(len(tokens), 4)
+
+ // testTokenizerEmail3
+ tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
+ assert.Equal(tokens[0], "Oder")
+ assert.Equal(tokens[1], "unter")
+ assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
+ assert.Equal(tokens[3], ".")
+ assert.Equal(len(tokens), 4)
+ // Ignored in KorAP-Tokenizer
+
+ // testTokenizerDoNotAcceptQuotedEmailNames
+ tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
+ assert.Equal("\"", tokens[0])
+ assert.Equal("John", tokens[1])
+ assert.Equal("Doe", tokens[2])
+ assert.Equal("\"", tokens[3])
+ assert.Equal("@xx", tokens[4])
+ assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
+ assert.Equal("com", tokens[6])
+ assert.Equal(7, len(tokens))
+
+ // testTokenizerTwitter
+ tokens = ttokenize(mat, w, "Folgt @korap und #korap")
+ assert.Equal(tokens[0], "Folgt")
+ assert.Equal(tokens[1], "@korap")
+ assert.Equal(tokens[2], "und")
+ assert.Equal(tokens[3], "#korap")
+ assert.Equal(len(tokens), 4)
+
+ // testTokenizerWeb1
+ tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
+ assert.Equal(tokens[0], "Unsere")
+ assert.Equal(tokens[1], "Website")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
+ assert.Equal(len(tokens), 4)
+
+ // testTokenizerWeb2
+ tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
+ assert.Equal(tokens[0], "Wir")
+ assert.Equal(tokens[1], "sind")
+ assert.Equal(tokens[2], "auch")
+ assert.Equal(tokens[3], "im")
+ assert.Equal(tokens[4], "Internet")
+ assert.Equal(tokens[5], "(")
+ assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
+ assert.Equal(tokens[7], ")")
+ assert.Equal(len(tokens), 8)
+ // Ignored in KorAP-Tokenizer
+
+ // testTokenizerWeb3
+ tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
+ assert.Equal(tokens[0], "Die")
+ assert.Equal(tokens[1], "Adresse")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
+ assert.Equal(tokens[4], ".")
+ assert.Equal(len(tokens), 5)
+ // Ignored in KorAP-Tokenizer
+
+ // testTokenizerServer
+ tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
+ assert.Equal(tokens[0], "Unser")
+ assert.Equal(tokens[1], "Server")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "10.0.10.51")
+ assert.Equal(tokens[4], ".")
+ assert.Equal(len(tokens), 5)
+
+ // testTokenizerNum
+ tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
+ assert.Equal(tokens[0], "Zu")
+ assert.Equal(tokens[1], "50,4%")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "es")
+ assert.Equal(tokens[4], "sicher")
+ assert.Equal(len(tokens), 5)
+ // Differs from KorAP-Tokenizer
+
+ // testTokenizerDate
+ tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "Termin")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "am")
+ assert.Equal(tokens[4], "5.9.2018")
+ assert.Equal(len(tokens), 5)
+
+ tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "Termin")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "am")
+ assert.Equal(tokens[4], "5/9/2018")
+ assert.Equal(len(tokens), 5)
+
+ // testTokenizerDateRange
+ /*
+ tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
+ assert.Equal(tokens[0], "Der")
+ assert.Equal(tokens[1], "Termin")
+ assert.Equal(tokens[2], "war")
+ assert.Equal(tokens[3], "vom")
+ assert.Equal(tokens[4], "4.")
+ assert.Equal(tokens[5], "-")
+ assert.Equal(tokens[6], "5.9.2018")
+ assert.Equal(len(tokens), 7)
+ // Ignored in KorAP-Tokenizer
+ */
+
+ // testTokenizerEmoji1
+ tokens = ttokenize(mat, w, "Das ist toll! ;)")
+ assert.Equal(tokens[0], "Das")
+ assert.Equal(tokens[1], "ist")
+ assert.Equal(tokens[2], "toll")
+ assert.Equal(tokens[3], "!")
+ assert.Equal(tokens[4], ";)")
+ assert.Equal(len(tokens), 5)
+
+ // testTokenizerRef1
+ tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
+ assert.Equal(tokens[0], "Kupietz")
+ assert.Equal(tokens[1], "und")
+ assert.Equal(tokens[2], "Schmidt")
+ assert.Equal(tokens[3], "(2018)")
+ assert.Equal(tokens[4], ":")
+ assert.Equal(tokens[5], "Korpuslinguistik")
+ assert.Equal(len(tokens), 6)
+ // Differs from KorAP-Tokenizer!
+
+ // testTokenizerRef2 () {
+ tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
+ assert.Equal(tokens[0], "Kupietz")
+ assert.Equal(tokens[1], "und")
+ assert.Equal(tokens[2], "Schmidt")
+ assert.Equal(tokens[3], "[2018]")
+ assert.Equal(tokens[4], ":")
+ assert.Equal(tokens[5], "Korpuslinguistik")
+ assert.Equal(len(tokens), 6)
+ // Differs from KorAP-Tokenizer!
+
+ // testTokenizerOmission1 () {
+ tokens = ttokenize(mat, w, "Er ist ein A****loch!")
+ assert.Equal(tokens[0], "Er")
+ assert.Equal(tokens[1], "ist")
+ assert.Equal(tokens[2], "ein")
+ assert.Equal(tokens[3], "A****loch")
+ assert.Equal(tokens[4], "!")
+ assert.Equal(len(tokens), 5)
+
+ // testTokenizerOmission2
+ tokens = ttokenize(mat, w, "F*ck!")
+ assert.Equal(tokens[0], "F*ck")
+ assert.Equal(tokens[1], "!")
+ assert.Equal(len(tokens), 2)
+
+ // testTokenizerOmission3 () {
+ tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
+ assert.Equal(tokens[0], "Dieses")
+ assert.Equal(tokens[1], "verf*****")
+ assert.Equal(tokens[2], "Kleid")
+ assert.Equal(tokens[3], "!")
+ assert.Equal(len(tokens), 4)
+
+ // Probably interpreted as HOST
+ // testTokenizerFileExtension1
+ tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
+ assert.Equal(tokens[0], "Ich")
+ assert.Equal(tokens[1], "habe")
+ assert.Equal(tokens[2], "die")
+ assert.Equal(tokens[3], "readme.txt")
+ assert.Equal(tokens[4], "heruntergeladen")
+ assert.Equal(len(tokens), 5)
+
+ // Probably interpreted as HOST
+ // testTokenizerFileExtension2
+ tokens = ttokenize(mat, w, "Nimm die README.TXT!")
+ assert.Equal(tokens[0], "Nimm")
+ assert.Equal(tokens[1], "die")
+ assert.Equal(tokens[2], "README.TXT")
+ assert.Equal(tokens[3], "!")
+ assert.Equal(len(tokens), 4)
+
+ // Probably interpreted as HOST
+ // testTokenizerFileExtension3
+ tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
+ assert.Equal(tokens[0], "Zeig")
+ assert.Equal(tokens[1], "mir")
+ assert.Equal(tokens[2], "profile.jpeg")
+ assert.Equal(len(tokens), 3)
+
+ // testTokenizerFile1
+
+ tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
+ assert.Equal(tokens[0], "Zeig")
+ assert.Equal(tokens[1], "mir")
+ assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
+ assert.Equal(len(tokens), 3)
+
+ // testTokenizerFile2
+ tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
+ assert.Equal(tokens[0], "Gehe")
+ assert.Equal(tokens[1], "zu")
+ assert.Equal(tokens[2], "/Dokumente/profile.docx")
+ assert.Equal(len(tokens), 3)
+
+ // testTokenizerFile3
+ tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
+ assert.Equal(tokens[0], "Zeig")
+ assert.Equal(tokens[1], "mir")
+ assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
+ assert.Equal(len(tokens), 3)
+ // Ignored in KorAP-Tokenizer
+
+ // testTokenizerPunct
+ tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
+ assert.Equal(tokens[0], "Er")
+ assert.Equal(tokens[1], "sagte")
+ assert.Equal(tokens[2], ":")
+ assert.Equal(tokens[3], "\"")
+ assert.Equal(tokens[4], "Es")
+ assert.Equal(tokens[5], "geht")
+ assert.Equal(tokens[6], "mir")
+ assert.Equal(tokens[7], "gut")
+ assert.Equal(tokens[8], "!")
+ assert.Equal(tokens[9], "\"")
+ assert.Equal(tokens[10], ",")
+ assert.Equal(tokens[11], "daraufhin")
+ assert.Equal(tokens[12], "ging")
+ assert.Equal(tokens[13], "er")
+ assert.Equal(tokens[14], ".")
+ assert.Equal(len(tokens), 15)
+
+ // testTokenizerPlusAmpersand
+ tokens = ttokenize(mat, w, "&quot;Das ist von C&amp;A!&quot;")
+ assert.Equal(tokens[0], "&quot;")
+ assert.Equal(tokens[1], "Das")
+ assert.Equal(tokens[2], "ist")
+ assert.Equal(tokens[3], "von")
+ assert.Equal(tokens[4], "C&amp;A")
+ assert.Equal(tokens[5], "!")
+ assert.Equal(tokens[6], "&quot;")
+ assert.Equal(len(tokens), 7)
+
+ // testTokenizerLongEnd
+ tokens = ttokenize(mat, w, "Siehst Du?!!?")
+ assert.Equal(tokens[0], "Siehst")
+ assert.Equal(tokens[1], "Du")
+ assert.Equal(tokens[2], "?!!?")
+ assert.Equal(len(tokens), 3)
+
+ // testTokenizerIrishO
+ tokens = ttokenize(mat, w, "Peter O'Toole")
+ assert.Equal(tokens[0], "Peter")
+ assert.Equal(tokens[1], "O'Toole")
+ assert.Equal(len(tokens), 2)
+
+ // testTokenizerAbr
+ tokens = ttokenize(mat, w, "Früher bzw. später ...")
+ assert.Equal(tokens[0], "Früher")
+ assert.Equal(tokens[1], "bzw.")
+ assert.Equal(tokens[2], "später")
+ assert.Equal(tokens[3], "...")
+ assert.Equal(len(tokens), 4)
+
+ // testTokenizerUppercaseRule
+ tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
+ assert.Equal(tokens[0], "Es")
+ assert.Equal(tokens[1], "war")
+ assert.Equal(tokens[2], "spät")
+ assert.Equal(tokens[3], ".")
+ assert.Equal(tokens[4], "Morgen")
+ assert.Equal(tokens[5], "ist")
+ assert.Equal(tokens[6], "es")
+ assert.Equal(tokens[7], "früh")
+ assert.Equal(tokens[8], ".")
+ assert.Equal(len(tokens), 9)
+ // Ignored in KorAP-Tokenizer
+
+ // testTokenizerOrd
+ tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
+ assert.Equal(tokens[0], "Sie")
+ assert.Equal(tokens[1], "erreichte")
+ assert.Equal(tokens[2], "den")
+ assert.Equal(tokens[3], "1.")
+ assert.Equal(tokens[4], "Platz")
+ assert.Equal(tokens[5], "!")
+ assert.Equal(len(tokens), 6)
+
+ // testNoZipOuputArchive
+ tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
+ assert.Equal(tokens[0], "Archive")
+ assert.Equal(tokens[1], ":")
+ assert.Equal(tokens[2], "Ich")
+ assert.Equal(tokens[3], "bin")
+ assert.Equal(tokens[4], "kein")
+ assert.Equal(tokens[5], "zip")
+ assert.Equal(6, len(tokens))
+
+ // testTokenizerStrasse
+ tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
+ assert.Equal(tokens[4], "Weststr.")
+ assert.Equal(8, len(tokens))
+
+ // germanTokenizerKnowsGermanOmissionWords
+ tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
+ assert.Equal("D'dorf", tokens[0])
+ assert.Equal("Ku'damm", tokens[1])
+ assert.Equal("Lu'hafen", tokens[2])
+ assert.Equal("M'gladbach", tokens[3])
+ assert.Equal("W'schaft", tokens[4])
+ assert.Equal(5, len(tokens))
+
+ // germanTokenizerDoesNOTSeparateGermanContractions
+ tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
+ assert.Equal("mach's", tokens[0])
+ assert.Equal("macht's", tokens[1])
+ assert.Equal("was'n", tokens[2])
+ assert.Equal("ist's", tokens[3])
+ assert.Equal("haste", tokens[4])
+ assert.Equal("willste", tokens[5])
+ assert.Equal("kannste", tokens[6])
+ assert.Equal("biste", tokens[7])
+ assert.Equal("kriegste", tokens[8])
+ assert.Equal(9, len(tokens))
+
+ /*
+ @Test
+ public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
+ DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+ tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
+ assert.Equal("'ve", tokens[1]);
+ assert.Equal("'ll", tokens[3]);
+ assert.Equal("'d", tokens[5]);
+ assert.Equal("'m", tokens[7]);
+ assert.Equal("'re", tokens[9]);
+ assert.Equal("'s", tokens[11]);
+ assert.Equal("is", tokens[12]);
+ assert.Equal("n't", tokens[13]);
+ assert.Equal(14, len(tokens));
+ }
+
+ @Test
+ public void frenchTokenizerKnowsFrenchAbbreviations () {
+ DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
+ tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
+ assert.Equal("Approx.", tokens[0]);
+ assert.Equal("juill.", tokens[2]);
+ assert.Equal("prof.", tokens[5]);
+ assert.Equal("exerc.", tokens[15]);
+ assert.Equal("no.", tokens[16]);
+ assert.Equal("pp.", tokens[21]);
+ }
+
+ @Test
+ public void frenchTokenizerKnowsFrenchContractions () {
+ DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
+ tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
+ assert.Equal("J'", tokens[0]);
+ assert.Equal("j'", tokens[2]);
+ assert.Equal("qu'", tokens[4]);
+ assert.Equal("d'", tokens[6]);
+ assert.Equal("jusqu'", tokens[8]);
+ assert.Equal("Aujourd'hui", tokens[10]);
+ assert.Equal("D'", tokens[11]); // ’
+ assert.Equal("Quelqu'un", tokens[13]); // ’
+ assert.Equal("Presqu'île", tokens[14]); // ’
+ }
+
+ @Test
+ public void frenchTokenizerKnowsFrenchClitics () {
+ DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
+ tokens = tokenize(dat, w, "suis-je sont-elles ")
+ assert.Equal("suis", tokens[0]);
+ assert.Equal("-je", tokens[1]);
+ assert.Equal("sont", tokens[2]);
+ assert.Equal("-elles", tokens[3]);
+ }
+
+ @Test
+ public void testEnglishTokenizerScienceAbbreviations () {
+ DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+ tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
+ assert.Equal("Approx.", tokens[0]);
+ assert.Equal("in", tokens[1]);
+ assert.Equal("Sept.", tokens[2]);
+ assert.Equal("1954", tokens[3]);
+ assert.Equal(",", tokens[4]);
+ assert.Equal("Assoc.", tokens[5]);
+ assert.Equal("Prof.", tokens[6]);
+ assert.Equal("Dr.", tokens[7]);
+ assert.Equal("R.", tokens[8]);
+ assert.Equal("J.", tokens[9]);
+ assert.Equal("Ewing", tokens[10]);
+ assert.Equal("reviewed", tokens[11]);
+ assert.Equal("articles", tokens[12]);
+ assert.Equal("on", tokens[13]);
+ assert.Equal("Enzymol.", tokens[14]);
+ assert.Equal("Bacteriol.", tokens[15]);
+ assert.Equal("effects", tokens[16]);
+ assert.Equal("later", tokens[17]);
+ assert.Equal("published", tokens[18]);
+ assert.Equal("in", tokens[19]);
+ assert.Equal("Nutr.", tokens[20]);
+ assert.Equal("Rheumatol.", tokens[21]);
+ assert.Equal("No.", tokens[22]);
+ assert.Equal("12", tokens[23]);
+ assert.Equal("and", tokens[24]);
+ assert.Equal("Nº.", tokens[25]);
+ assert.Equal("13.", tokens[26]);
+ assert.Equal(",", tokens[27]);
+ assert.Equal("pp.", tokens[28]);
+ assert.Equal("17-18", tokens[29]);
+ assert.Equal(".", tokens[30]);
+ }
+
+ @Test
+ public void englishTokenizerCanGuessWhetherIIsAbbrev () {
+ DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+ tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
+ assert.Equal("I.", tokens[1]);
+ assert.Equal("I", tokens[8]);
+ assert.Equal(".", tokens[9]);
+ assert.Equal("I", tokens[12]);
+ assert.Equal(".", tokens[13]);
+ }
+
+ @Test
+ public void testZipOuputArchive () {
+
+ final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(clearOut));
+ tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
+ assert.Equal(0, len(tokens));
+ }
+ */
+ /*
+
+ @Test
+ public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
+ DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
+ .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
+ .printOffsets(true)
+ .build();
+ Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
+ assert.Equal("Text1", tokens[0].getType());
+ assert.Equal(len(tokens), 9 );
+ }
+ */
+}
+
+func TestFullTokenizerMatrixXML(t *testing.T) {
+ assert := assert.New(t)
+
+ foma := LoadFomaFile("testdata/tokenizer.fst")
+ assert.NotNil(foma)
+
+ mat := foma.ToMatrix()
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
+ assert.Equal("Das", tokens[0])
+ assert.Equal("<b>", tokens[1])
+ assert.Equal("beste", tokens[2])
+ assert.Equal("</b>", tokens[3])
+ assert.Equal("Fußballspiel", tokens[4])
+ assert.Equal(5, len(tokens))
+
+ tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
+ assert.Equal("Das", tokens[0])
+ assert.Equal("<b class=\"c\">", tokens[1])
+ assert.Equal("beste", tokens[2])
+ assert.Equal("</b>", tokens[3])
+ assert.Equal("Fußballspiel", tokens[4])
+ assert.Equal(5, len(tokens))
+
+ tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
+ assert.Equal("der", tokens[0])
+ assert.Equal("<x y=\"alte \">", tokens[1])
+ assert.Equal("<x x>", tokens[2])
+ assert.Equal("alte", tokens[3])
+ assert.Equal("</x>", tokens[4])
+ assert.Equal("etc.", tokens[5])
+ assert.Equal("et", tokens[6])
+ assert.Equal(".", tokens[7])
+ assert.Equal("Mann", tokens[8])
+ assert.Equal(".", tokens[9])
+ assert.Equal(10, len(tokens))
+}
+
+func BenchmarkTransduceMatrix(b *testing.B) {
+	// Output buffer, reset and reused on every iteration.
+	w := bytes.NewBuffer(make([]byte, 0, 2048))
+
+	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
+ Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
+ Der Termin ist am 5.9.2018.
+ Ich habe die readme.txt heruntergeladen.
+ Ausschalten!!! Hast Du nicht gehört???
+ Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
+ Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
+ Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
+ Mach's macht's was'n ist's haste willste kannste biste kriegste.`
+	r := strings.NewReader(s)
+
+	foma := LoadFomaFile("testdata/tokenizer.fst")
+	mat := foma.ToMatrix()
+
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		w.Reset()
+		r.Reset(s)
+		if mat.Transduce(r, w) {
+			continue
+		}
+		fmt.Println("Fail!")
+		fmt.Println(w.String())
+		os.Exit(1)
+	}
+}
diff --git a/testdata/simpletok.matok b/testdata/simpletok.matok
index 51ee615..8cb1030 100644
--- a/testdata/simpletok.matok
+++ b/testdata/simpletok.matok
Binary files differ