Introduce token_writer object
This change also removes final state
sensitivity from the tokenizer. Tokens
now require a tokenend transition to
be treated as complete.
diff --git a/datok.go b/datok.go
index fa6e44d..b70586c 100644
--- a/datok.go
+++ b/datok.go
@@ -724,14 +724,19 @@
return string(out)
}
-// Transduce an input string against the double array
-// FSA. The rules are always greedy. If the automaton fails,
-// it takes the last possible token ending branch.
+func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+ return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+}
+
+// TransduceTokenWriter transduces an input string against
+// the double array FSA. The rules are always greedy. If the
+// automaton fails, it takes the last possible token ending
+// branch.
//
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions and NONTOKEN and TOKENEND handling.
-func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
@@ -742,6 +747,9 @@
epsilonState := uint32(0)
epsilonOffset := 0
+ // Remember if the last transition was epsilon
+ sentenceEnd := false
+
// Implement a low level buffer for full control,
// however - it is probably better to introduce
// this on a higher level with a io.Reader interface
@@ -761,10 +769,10 @@
buffi := 0 // Buffer length
reader := bufio.NewReader(r)
- writer := bufio.NewWriter(w)
- defer writer.Flush()
+ defer w.Flush()
var char rune
+
var err error
eof := false
newchar := true
@@ -819,6 +827,10 @@
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffo
+
+ if DEBUG {
+ fmt.Println("epsilonOffset is set to", buffo)
+ }
}
}
@@ -885,27 +897,33 @@
}
rewindBuffer = true
}
- }
- // Transition marks the end of a token - so flush the buffer
- if ta.isTokenEnd() {
+ } else {
- if buffi > 0 {
+ // Transition marks the end of a token - so flush the buffer
+ if buffo > 0 {
if DEBUG {
fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
}
- writer.WriteString(string(buffer[:buffo]))
+ w.Token(0, buffer[:buffo])
rewindBuffer = true
+ sentenceEnd = false
+ } else {
+ sentenceEnd = true
+ w.SentenceEnd()
}
if DEBUG {
fmt.Println("-> Newline")
}
- writer.WriteRune('\n')
}
// Rewind the buffer if necessary
if rewindBuffer {
+ if DEBUG {
+ fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+ }
+
// TODO: Better as a ring buffer
for x, i := range buffer[buffo:buffi] {
buffer[x] = i
@@ -913,7 +931,9 @@
buffi -= buffo
// epsilonOffset -= buffo
- epsilonOffset = buffo
+ epsilonOffset = 0
+ epsilonState = 0
+
buffo = 0
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
@@ -948,46 +968,43 @@
fmt.Println("Entering final check")
}
- // Automaton is in a final state, so flush the buffer and return
- x := dat.array[t].getBase() + uint32(dat.final)
+ /*
+ The following code is for deprecated automata relying on
+ final states. Datok now requires final states to be marked
+ with tokenends.
- if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+ // Automaton is in a final state, so flush the buffer and return
+ x := dat.array[t].getBase() + uint32(dat.final)
- if buffi > 0 {
- if DEBUG {
- fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
- }
- writer.WriteString(string(buffer[:buffi]))
+ if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
- if dat.array[t].isTokenEnd() {
- writer.WriteRune('\n')
- if DEBUG {
- fmt.Println("-> Newline")
+ if buffi > 0 {
+ if DEBUG {
+ fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+ }
+ w.Token(0, buffer[:buffi])
}
- }
- }
- // Add an additional sentence ending, if the file is over but no explicit
- // sentence split was reached. This may be controversial and therefore
- // optional via parameter.
- if !dat.array[t0].isTokenEnd() {
- writer.WriteRune('\n')
- if DEBUG {
- fmt.Println("-> Newline")
- }
- }
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !dat.array[t0].isTokenEnd() {
+ w.SentenceEnd()
+ }
- // TODO:
- // There may be a new line at the end, from an epsilon,
- // so we may need to go on!
- return true
- }
+ // TODO:
+ // There may be a new line at the end, from an epsilon,
+ // so we may need to go on!
+ return true
+ }
+ */
// Check epsilon transitions until a final state is reached
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
a = dat.epsilon
newchar = false
+
if dat.array[t].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
goto PARSECHAR
@@ -1001,5 +1018,18 @@
}
goto PARSECHAR
}
- return false
+
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !sentenceEnd {
+ // writer.WriteRune('\n')
+ // ::Sentenceend
+ w.SentenceEnd()
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+
+ return true
}