Introduce token_writer object
This change also removes final state
sensitivity from the tokenizer. Tokens
now require a tokenend transition to
be treated as complete.
diff --git a/datok.go b/datok.go
index fa6e44d..b70586c 100644
--- a/datok.go
+++ b/datok.go
@@ -724,14 +724,19 @@
return string(out)
}
-// Transduce an input string against the double array
-// FSA. The rules are always greedy. If the automaton fails,
-// it takes the last possible token ending branch.
+func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+ return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+}
+
+// TransduceTokenWriter transduces an input string against
+// the double array FSA. The rules are always greedy. If the
+// automaton fails, it takes the last possible token ending
+// branch.
//
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions and NONTOKEN and TOKENEND handling.
-func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
@@ -742,6 +747,9 @@
epsilonState := uint32(0)
epsilonOffset := 0
+ // Remember if the last transition was epsilon
+ sentenceEnd := false
+
// Implement a low level buffer for full control,
// however - it is probably better to introduce
// this on a higher level with a io.Reader interface
@@ -761,10 +769,10 @@
buffi := 0 // Buffer length
reader := bufio.NewReader(r)
- writer := bufio.NewWriter(w)
- defer writer.Flush()
+ defer w.Flush()
var char rune
+
var err error
eof := false
newchar := true
@@ -819,6 +827,10 @@
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffo
+
+ if DEBUG {
+ fmt.Println("epsilonOffset is set to", buffo)
+ }
}
}
@@ -885,27 +897,33 @@
}
rewindBuffer = true
}
- }
- // Transition marks the end of a token - so flush the buffer
- if ta.isTokenEnd() {
+ } else {
- if buffi > 0 {
+ // Transition marks the end of a token - so flush the buffer
+ if buffo > 0 {
if DEBUG {
fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
}
- writer.WriteString(string(buffer[:buffo]))
+ w.Token(0, buffer[:buffo])
rewindBuffer = true
+ sentenceEnd = false
+ } else {
+ sentenceEnd = true
+ w.SentenceEnd()
}
if DEBUG {
fmt.Println("-> Newline")
}
- writer.WriteRune('\n')
}
// Rewind the buffer if necessary
if rewindBuffer {
+ if DEBUG {
+ fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+ }
+
// TODO: Better as a ring buffer
for x, i := range buffer[buffo:buffi] {
buffer[x] = i
@@ -913,7 +931,9 @@
buffi -= buffo
// epsilonOffset -= buffo
- epsilonOffset = buffo
+ epsilonOffset = 0
+ epsilonState = 0
+
buffo = 0
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
@@ -948,46 +968,43 @@
fmt.Println("Entering final check")
}
- // Automaton is in a final state, so flush the buffer and return
- x := dat.array[t].getBase() + uint32(dat.final)
+ /*
+ The following code is for deprecated automata relying on
+ final states. Datok now requires final states to be marked
+ with tokenends.
- if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+ // Automaton is in a final state, so flush the buffer and return
+ x := dat.array[t].getBase() + uint32(dat.final)
- if buffi > 0 {
- if DEBUG {
- fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
- }
- writer.WriteString(string(buffer[:buffi]))
+ if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
- if dat.array[t].isTokenEnd() {
- writer.WriteRune('\n')
- if DEBUG {
- fmt.Println("-> Newline")
+ if buffi > 0 {
+ if DEBUG {
+ fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+ }
+ w.Token(0, buffer[:buffi])
}
- }
- }
- // Add an additional sentence ending, if the file is over but no explicit
- // sentence split was reached. This may be controversial and therefore
- // optional via parameter.
- if !dat.array[t0].isTokenEnd() {
- writer.WriteRune('\n')
- if DEBUG {
- fmt.Println("-> Newline")
- }
- }
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !dat.array[t0].isTokenEnd() {
+ w.SentenceEnd()
+ }
- // TODO:
- // There may be a new line at the end, from an epsilon,
- // so we may need to go on!
- return true
- }
+ // TODO:
+ // There may be a new line at the end, from an epsilon,
+ // so we may need to go on!
+ return true
+ }
+ */
// Check epsilon transitions until a final state is reached
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
a = dat.epsilon
newchar = false
+
if dat.array[t].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
goto PARSECHAR
@@ -1001,5 +1018,18 @@
}
goto PARSECHAR
}
- return false
+
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !sentenceEnd {
+ // writer.WriteRune('\n')
+ // ::Sentenceend
+ w.SentenceEnd()
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+
+ return true
}