Fix check on final states
diff --git a/datokenizer.go b/datokenizer.go
index af1e70f..7aea95a 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -1170,6 +1170,7 @@
eof := false
newchar := true
+PARSECHAR:
for {
if newchar {
@@ -1266,7 +1267,7 @@
buffo++
// Transition does not produce a character
- if dat.isNonToken(t) && buffo == 1 {
+ if buffo == 1 && dat.isNonToken(t) {
if DEBUG {
fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
}
@@ -1274,19 +1275,22 @@
}
}
- // Transition marks the end of a token
+ // Transition marks the end of a token - so flush the buffer
if dat.isTokenEnd(t) {
- data := []byte(string(buffer[:buffo]))
- if DEBUG {
- fmt.Println("-> Flush buffer:", string(data), showBuffer(buffer, buffo, buffi))
+ if buffi > 0 {
+ data := []byte(string(buffer[:buffo]))
+ if DEBUG {
+ fmt.Println("-> Flush buffer: [", string(data), "]", showBuffer(buffer, buffo, buffi))
+ fmt.Println("-> Newline")
+ }
+ writer.Write(data)
+ writer.WriteRune('\n')
+ rewindBuffer = true
}
- writer.Write(data)
- writer.WriteRune('\n')
- rewindBuffer = true
}
- // Rewind the buffer
+ // Rewind the buffer if necessary
if rewindBuffer {
// TODO: Better as a ring buffer
@@ -1311,9 +1315,14 @@
}
}
+ if eof {
+ break
+ }
+
+ newchar = true
+
// TODO:
// Prevent endless epsilon loops!
- newchar = true
}
// Input reader is not yet finished
@@ -1324,23 +1333,24 @@
return false
}
-FINALCHECK:
-
if DEBUG {
fmt.Println("Entering final check")
}
- // Automaton is in a final state
+ // Automaton is in a final state, so flush the buffer and return
if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
if buffi > 0 {
data := []byte(string(buffer[:buffi]))
if DEBUG {
- fmt.Println("-> Flush buffer:", string(data))
+ fmt.Println("-> Flush buffer: [", string(data), "]")
}
writer.Write(data)
if dat.isTokenEnd(t) {
writer.WriteRune('\n')
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
}
}
@@ -1348,45 +1358,24 @@
return true
}
- // Try again with epsilon symbol, in case everything else failed
- /*
+ // Check epsilon transitions until a final state is reached
+ t0 = t
+ t = dat.getBase(t0) + uint32(dat.epsilon)
+ if dat.getCheck(t) == t0 {
+ // Take the epsilon transition and re-enter the parse loop (newchar = false)
+ a = dat.epsilon
+ newchar = false
+ goto PARSECHAR
+ } else if epsilonState != 0 {
t0 = epsilonState
epsilonState = 0 // reset
buffo = epsilonOffset
a = dat.epsilon
- goto CHECK
- */
-
- // Check epsilon transitions until a final state is reached
- t0 = t
- t = dat.getBase(t0) + uint32(dat.epsilon)
-
- // Epsilon transition failed
- if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
if DEBUG {
- fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
+ fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
}
- return false
+ newchar = false
+ goto PARSECHAR
}
-
- if dat.isSeparate(t) {
- // Move to representative state
- t = dat.getBase(t)
- }
-
- if dat.isTokenEnd(t) {
- if buffi > 0 {
- data := []byte(string(buffer[:buffi]))
- if DEBUG {
- fmt.Println("-> Flush buffer:", string(data))
- }
- writer.Write(data)
- buffi = 0
- buffo = 0
- epsilonState = 0
- }
- writer.WriteRune('\n')
- }
-
- goto FINALCHECK
+ return false
}
diff --git a/datokenizer_test.go b/datokenizer_test.go
index e861b08..08ba132 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -151,7 +151,7 @@
}
assert.NotNil(dat)
- r := strings.NewReader("tra. und Du?")
+ r := strings.NewReader("tra. u Du?")
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
@@ -163,12 +163,12 @@
assert.Equal("tra", tokens[0])
assert.Equal(".", tokens[1])
assert.Equal("", tokens[2])
- assert.Equal("und", tokens[3])
+ assert.Equal("u", tokens[3])
assert.Equal("Du", tokens[4])
assert.Equal("?", tokens[5])
assert.Equal("", tokens[6])
- assert.Equal("", tokens[7])
- assert.Equal(8, len(tokens))
+ // assert.Equal("", tokens[7])
+ assert.Equal(7, len(tokens))
}
func TestFullTokenizerSentenceSplitter(t *testing.T) {