Make tokenizer robust and never-failing
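
Instead of returning false when the automaton cannot consume a
character, the transducers now flush the pending data as a token and
restart at the root state, so Transduce() always tokenizes the whole
input. A minimal sketch of the new behavior, using the ttokenize()
test helper, with dat assumed to be the double-array tokenizer built
from testdata/bauamt.fst (accepting only "bau" and "bauamt"):

    b := make([]byte, 0, 2048)
    w := bytes.NewBuffer(b)

    // "baum" previously failed the whole transduction; now the
    // unconsumable rest is emitted as a separate token.
    tokens := ttokenize(dat, w, "baum")
    // tokens == []string{"bau", "m"}
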
Change-Id: I7f249434bc233b560c8d493f1f0c2abd4d69db91
diff --git a/datok.go b/datok.go
index 6fe4f3c..48bb757 100644
--- a/datok.go
+++ b/datok.go
@@ -868,6 +868,7 @@
// Check for epsilon transitions and remember
if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
+
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffc
@@ -906,7 +907,7 @@
}
a = dat.unknown
- } else if a != dat.epsilon {
+ } else if a != dat.epsilon && epsilonState != 0 {
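+ // Backtracking requires a remembered epsilon state (epsilonState != 0)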
// Try again with epsilon symbol, in case everything else failed
t0 = epsilonState
@@ -919,7 +920,51 @@
}
} else {
- break
+
+ if DEBUG {
+ log.Println("Fail!")
+ }
+
+ // w.Fail(bufft)
+
+ // The following procedure handles the case where the automaton fails to consume a certain character.
+ // In the tokenization scenario this means the tokenizer will emit the old or current data as a
+ // token and restart blank at the root node of the automaton for the remaining data.
+ // It may be beneficial to have something like a "drop()" event to capture these cases,
+ // as they are likely the result of a bad automaton design.
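+ //
+ // Example (with an automaton accepting only "bau" and "bauamt",
+ // cf. testdata/bauamt.fst): the input "baum" is no longer rejected
+ // but emitted as the two tokens "bau" and "m".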
+ if buffc-bufft == 0 {
+ buffc++
+ }
+
+ if DEBUG {
+ log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ w.Token(bufft, buffer[:buffc])
+
+ sentenceEnd = false
+ textEnd = false
+
+ if DEBUG {
+ log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
+ }
+
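+ // Move the not-yet-consumed rest of the data to the start of the buffer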
+ for x, i := range buffer[buffc:buffi] {
+ buffer[x] = i
+ }
+
+ buffi -= buffc
+ epsilonState = 0
+
+ buffc = 0
+ bufft = 0
+
+ a = dat.epsilon
+
+ // Restart from root state
+ t = uint32(1)
+ newchar = true
+ // goto PARSECHAR
+ continue
}
newchar = false
@@ -1009,7 +1054,8 @@
newchar = true
// TODO:
- // Prevent endless epsilon loops!
+ // Prevent endless epsilon loops by checking
+ // that the model has no epsilon loops
}
// Input reader is not yet finished
@@ -1017,6 +1063,7 @@
if DEBUG {
log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
}
+ // This should never happen
return false
}
@@ -1024,37 +1071,6 @@
log.Println("Entering final check")
}
- /*
- The following code is for deprecated automata relying on
- final states. Datok now requires final states to be marked
- with tokenends.
-
- // Automaton is in a final state, so flush the buffer and return
- x := dat.array[t].getBase() + uint32(dat.final)
-
- if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
-
- if buffi > 0 {
- if DEBUG {
- log.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
- }
- w.Token(0, buffer[:buffi])
- }
-
- // Add an additional sentence ending, if the file is over but no explicit
- // sentence split was reached. This may be controversial and therefore
- // optional via parameter.
- if !dat.array[t0].isTokenEnd() {
- w.SentenceEnd()
- }
-
- // TODO:
- // There may be a new line at the end, from an epsilon,
- // so we may need to go on!
- return true
- }
- */
-
// Check epsilon transitions as long as possible
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
@@ -1075,6 +1091,16 @@
goto PARSECHAR
}
+ // Flush anything left in the buffer
+ if buffc-bufft > 0 {
+ if DEBUG {
+ log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ w.Token(bufft, buffer[:buffc])
+ sentenceEnd = false
+ textEnd = false
+ }
+
// Add an additional sentence ending, if the file is over but no explicit
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
diff --git a/datok_test.go b/datok_test.go
index b4d0670..86dd428 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -13,10 +13,10 @@
var dat *DaTokenizer
-func tmatch(tok Tokenizer, s string) bool {
+func ttokenizeStr(tok Tokenizer, str string) string {
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- return tok.Transduce(strings.NewReader(s), w)
+ return strings.Join(ttokenize(tok, w, str), "\n")
}
func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
@@ -33,37 +33,75 @@
func TestDoubleArraySimpleString(t *testing.T) {
assert := assert.New(t)
-
// bau | bauamt
tok := LoadFomaFile("testdata/bauamt.fst")
dat := tok.ToDoubleArray()
- assert.True(tmatch(dat, "bau"))
- assert.True(tmatch(dat, "bauamt"))
- assert.False(tmatch(dat, "baum"))
- assert.True(tmatch(dat, "baua"))
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ tokens = ttokenize(dat, w, "ibauamt")
+ assert.Equal("i", tokens[0])
+ assert.Equal("bauamt", tokens[1])
+
+ tokens = ttokenize(dat, w, "ibbauamt")
+ assert.Equal("i", tokens[0])
+
+ assert.Equal("b", tokens[1])
+ assert.Equal("bauamt", tokens[2])
+
+ tokens = ttokenize(dat, w, "bau")
+ assert.Equal("bau", tokens[0])
+
+ tokens = ttokenize(dat, w, "baum")
+ assert.Equal("bau", tokens[0])
+ assert.Equal("m", tokens[1])
+
+ tokens = ttokenize(dat, w, "baudibauamt")
+ assert.Equal("bau", tokens[0])
+ assert.Equal("d", tokens[1])
+ assert.Equal("i", tokens[2])
+ assert.Equal("bauamt", tokens[3])
}
func TestDoubleArraySimpleBranches(t *testing.T) {
assert := assert.New(t)
-
// (bau | wahl) (amt | en)
tok := LoadFomaFile("testdata/wahlamt.fst")
dat := tok.ToDoubleArray()
- assert.True(tmatch(dat, "bau"))
- assert.True(tmatch(dat, "bauamt"))
- assert.True(tmatch(dat, "wahlamt"))
- assert.True(tmatch(dat, "bauen"))
- assert.True(tmatch(dat, "wahlen"))
- assert.False(tmatch(dat, "baum"))
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ tokens = ttokenize(dat, w, "bau")
+ assert.Equal("bau", tokens[0])
+
+ tokens = ttokenize(dat, w, "bauamt")
+ assert.Equal("bauamt", tokens[0])
+
+ tokens = ttokenize(dat, w, "wahlamt")
+ assert.Equal("wahlamt", tokens[0])
+
+ tokens = ttokenize(dat, w, "bauen")
+ assert.Equal("bauen", tokens[0])
+
+ tokens = ttokenize(dat, w, "wahlen")
+ assert.Equal("wahlen", tokens[0])
+
+ tokens = ttokenize(dat, w, "baum")
+ assert.Equal("bau", tokens[0])
+ assert.Equal("m", tokens[1])
}
-func TestSimpleTokenizer(t *testing.T) {
+func TestDoubleArraySimpleTokenizer(t *testing.T) {
assert := assert.New(t)
tok := LoadFomaFile("testdata/simpletok.fst")
dat := tok.ToDoubleArray()
- assert.True(tmatch(dat, "bau"))
- assert.True(tmatch(dat, "bad"))
- assert.True(tmatch(dat, "wald gehen"))
+ assert.Equal(ttokenizeStr(dat, "bau"), "bau")
+ assert.Equal(ttokenizeStr(dat, "bad"), "bad")
+ assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
}
func TestDoubleArraySimpleTokenizerTransduce(t *testing.T) {
@@ -114,9 +152,9 @@
assert := assert.New(t)
tok := LoadFomaFile("testdata/simpletok.fst")
dat := tok.ToDoubleArray()
- assert.True(tmatch(dat, "bau"))
- assert.True(tmatch(dat, "bad"))
- assert.True(tmatch(dat, "wald gehen"))
+ assert.Equal(ttokenizeStr(dat, "bau"), "bau")
+ assert.Equal(ttokenizeStr(dat, "bad"), "bad")
+ assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
b := make([]byte, 0, 1024)
buf := bytes.NewBuffer(b)
@@ -133,9 +171,9 @@
assert.Equal(dat.identity, dat2.identity)
assert.Equal(dat.final, dat2.final)
assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
- assert.True(tmatch(dat2, "bau"))
- assert.True(tmatch(dat2, "bad"))
- assert.True(tmatch(dat2, "wald gehen"))
+ assert.Equal(ttokenizeStr(dat2, "bau"), "bau")
+ assert.Equal(ttokenizeStr(dat2, "bad"), "bad")
+ assert.Equal(ttokenizeStr(dat2, "wald gehen"), "wald\ngehen")
assert.Equal(dat.TransCount(), 17)
assert.Equal(dat2.TransCount(), 17)
@@ -183,9 +221,9 @@
// assert.Equal(len(dat.sigma), 137)
// assert.True(len(dat.array) > 3000000)
// assert.True(dat.maxSize > 3000000)
- assert.True(tmatch(dat, "bau"))
- assert.True(tmatch(dat, "bad"))
- assert.True(tmatch(dat, "wald gehen"))
+ assert.Equal(ttokenizeStr(dat, "bau"), "bau")
+ assert.Equal(ttokenizeStr(dat, "bad"), "bad")
+ assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
}
func TestDoubleArrayTokenizerBranch(t *testing.T) {
@@ -878,6 +916,29 @@
*/
}
+func TestDoubleArrayFullTokenizerSentenceSplitterBug1(t *testing.T) {
+ assert := assert.New(t)
+
+ if dat == nil {
+ dat = LoadDatokFile("testdata/tokenizer.datok")
+ }
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var sentences []string
+
+ text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`
+
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader(text), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 5)
+ assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
+ assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
+ assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.\n \nDie\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[2])
+ assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[3])
+}
+
func TestDoubleArrayLoadFactor1(t *testing.T) {
assert := assert.New(t)
tok := LoadFomaFile("testdata/abbr_bench.fst")
diff --git a/matrix.go b/matrix.go
index 567430b..1861528 100644
--- a/matrix.go
+++ b/matrix.go
@@ -414,12 +414,16 @@
if int(char) == EOT {
eot = true
}
+
+ // mat.sigmaASCII[] is initialized with mat.identity
a = mat.sigmaASCII[int(char)]
} else {
a, ok = mat.sigma[char]
// Use identity symbol if character is not in sigma
if !ok && mat.identity != -1 {
+
+ // TODO: Maybe use mat.unknown instead?
a = mat.identity
}
}
@@ -434,7 +438,7 @@
// Maybe not necessary - and should be simpler!
// Just Remove
- t0 &= ^FIRSTBIT
+ // t0 &= ^FIRSTBIT
epsilonState = t0
epsilonOffset = buffc
@@ -444,8 +448,14 @@
}
}
- // Checks a transition based on t0, a and buffo
- t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
+ // a == 0 can happen when no identity symbol is defined.
+ // TODO: This check shouldn't be necessary in every loop iteration.
+ if a == 0 {
+ t = 0
+ } else {
+ // Checks a transition based on t0, a and buffo
+ t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
+ }
if DEBUG {
// Char is only relevant if set
@@ -468,7 +478,7 @@
}
a = mat.unknown
- } else if a != mat.epsilon {
+ } else if a != mat.epsilon && epsilonState != 0 {
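+ // Backtracking requires a remembered epsilon state (epsilonState != 0)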
// Try again with epsilon symbol, in case everything else failed
t0 = epsilonState
@@ -481,7 +491,51 @@
}
} else {
- break
+
+ if DEBUG {
+ log.Println("Fail!")
+ }
+
+ // w.Fail(bufft)
+
+ // The following procedure handles the case where the automaton fails to consume a certain character.
+ // In the tokenization scenario this means the tokenizer will emit the old or current data as a
+ // token and restart blank at the root node of the automaton for the remaining data.
+ // It may be beneficial to have something like a "drop()" event to capture these cases,
+ // as they are likely the result of a bad automaton design.
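+ //
+ // Example (with an automaton accepting only "bau" and "bauamt",
+ // cf. testdata/bauamt.fst): the input "baum" is no longer rejected
+ // but emitted as the two tokens "bau" and "m".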
+ if buffc-bufft == 0 {
+ buffc++
+ }
+
+ if DEBUG {
+ log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ w.Token(bufft, buffer[:buffc])
+
+ sentenceEnd = false
+ textEnd = false
+
+ if DEBUG {
+ log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
+ }
+
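+ // Move the not-yet-consumed rest of the data to the start of the buffer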
+ for x, i := range buffer[buffc:buffi] {
+ buffer[x] = i
+ }
+
+ buffi -= buffc
+ epsilonState = 0
+
+ buffc = 0
+ bufft = 0
+
+ a = mat.epsilon
+
+ // Restart from root state
+ t = uint32(1)
+ newchar = true
+ // goto PARSECHARM
+ continue
}
newchar = false
@@ -570,6 +624,7 @@
if DEBUG {
log.Println("Not at the end")
}
+ // This should never happen
return false
}
@@ -597,6 +652,16 @@
goto PARSECHARM
}
+ // Flush anything left in the buffer
+ if buffc-bufft > 0 {
+ if DEBUG {
+ log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ w.Token(bufft, buffer[:buffc])
+ sentenceEnd = false
+ textEnd = false
+ }
+
// Add an additional sentence ending, if the file is over but no explicit
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
diff --git a/matrix_test.go b/matrix_test.go
index 3b64d5c..1e5bdd1 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -70,6 +70,40 @@
assert.Equal(7, len(tokens))
}
+func TestMatrixSimpleString(t *testing.T) {
+ assert := assert.New(t)
+ // bau | bauamt
+ tok := LoadFomaFile("testdata/bauamt.fst")
+ mat := tok.ToMatrix()
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ tokens = ttokenize(mat, w, "ibauamt")
+ assert.Equal("i", tokens[0])
+ assert.Equal("bauamt", tokens[1])
+
+ tokens = ttokenize(mat, w, "ibbauamt")
+ assert.Equal("i", tokens[0])
+
+ assert.Equal("b", tokens[1])
+ assert.Equal("bauamt", tokens[2])
+
+ tokens = ttokenize(mat, w, "bau")
+ assert.Equal("bau", tokens[0])
+
+ tokens = ttokenize(mat, w, "baum")
+ assert.Equal("bau", tokens[0])
+ assert.Equal("m", tokens[1])
+
+ tokens = ttokenize(mat, w, "baudibauamt")
+ assert.Equal("bau", tokens[0])
+ assert.Equal("d", tokens[1])
+ assert.Equal("i", tokens[2])
+ assert.Equal("bauamt", tokens[3])
+}
+
func TestMatrixReadWriteTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/simpletok.fst")
@@ -78,9 +112,9 @@
mat := foma.ToMatrix()
assert.NotNil(mat)
- assert.True(tmatch(mat, "bau"))
- assert.True(tmatch(mat, "bad"))
- assert.True(tmatch(mat, "wald gehen"))
+ assert.Equal(ttokenizeStr(mat, "bau"), "bau")
+ assert.Equal(ttokenizeStr(mat, "bad"), "bad")
+ assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
b := make([]byte, 0, 1024)
buf := bytes.NewBuffer(b)
n, err := mat.WriteTo(buf)
@@ -95,9 +129,9 @@
assert.Equal(mat.stateCount, mat2.stateCount)
assert.Equal(len(mat.array), len(mat2.array))
assert.Equal(mat.array, mat2.array)
- assert.True(tmatch(mat2, "bau"))
- assert.True(tmatch(mat2, "bad"))
- assert.True(tmatch(mat2, "wald gehen"))
+ assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
+ assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
+ assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
}
func TestMatrixIgnorableMCS(t *testing.T) {
@@ -341,6 +375,30 @@
assert.Equal(len(sentences), 3)
assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
+
+}
+
+func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
+ assert := assert.New(t)
+
+ if mat == nil {
+ mat = LoadMatrixFile("testdata/tokenizer.matok")
+ }
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var sentences []string
+
+ text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader(text), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 5)
+ assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
+ assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
+ assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.\n \nDie\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[2])
+ assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[3])
}
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
diff --git a/token_writer.go b/token_writer.go
index ed580ef..f6dffa4 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -23,6 +23,7 @@
TextEnd func(int)
Flush func() error
Token func(int, []rune)
+ // Fail func(int)
}
// Create a new token writer based on the options
@@ -36,6 +37,8 @@
tw := &TokenWriter{}
+ // tw.Fail = func(_ int) {}
+
// Collect token positions and maybe tokens
if flags&(TOKEN_POS|SENTENCE_POS) != 0 {