Make tokenizer robust so it never fails
Change-Id: I7f249434bc233b560c8d493f1f0c2abd4d69db91
diff --git a/datok.go b/datok.go
index 6fe4f3c..48bb757 100644
--- a/datok.go
+++ b/datok.go
@@ -868,6 +868,7 @@
// Check for epsilon transitions and remember
if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
+
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffc
@@ -906,7 +907,7 @@
}
a = dat.unknown
- } else if a != dat.epsilon {
+ } else if a != dat.epsilon && epsilonState != 0 {
// Try again with epsilon symbol, in case everything else failed
t0 = epsilonState
@@ -919,7 +920,51 @@
}
} else {
- break
+
+ if DEBUG {
+ log.Println("Fail!")
+ }
+
+ // w.Fail(bufft)
+
+ // The following procedure means the automaton fails to consume a certain character.
+	// In the tokenization scenario, this means the tokenizer will drop the old or current data as a
+ // token and start blank at the root node of the automaton for the remaining data.
+ // It may be beneficial to have something like a "drop()" event to capture these cases,
+ // as they are likely the result of a bad automaton design.
+ if buffc-bufft == 0 {
+ buffc++
+ }
+
+ if DEBUG {
+ log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ w.Token(bufft, buffer[:buffc])
+
+ sentenceEnd = false
+ textEnd = false
+
+ if DEBUG {
+ log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
+ }
+
+ for x, i := range buffer[buffc:buffi] {
+ buffer[x] = i
+ }
+
+ buffi -= buffc
+ epsilonState = 0
+
+ buffc = 0
+ bufft = 0
+
+ a = dat.epsilon
+
+ // Restart from root state
+ t = uint32(1)
+ newchar = true
+ // goto PARSECHARM
+ continue
}
newchar = false
@@ -1009,7 +1054,8 @@
newchar = true
// TODO:
- // Prevent endless epsilon loops!
+	// Prevent endless epsilon loops by checking
+	// that the model has no epsilon loops
}
// Input reader is not yet finished
@@ -1017,6 +1063,7 @@
if DEBUG {
log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
}
+ // This should never happen
return false
}
@@ -1024,37 +1071,6 @@
log.Println("Entering final check")
}
- /*
- The following code is for deprecated automata relying on
- final states. Datok now requires final states to be marked
- with tokenends.
-
- // Automaton is in a final state, so flush the buffer and return
- x := dat.array[t].getBase() + uint32(dat.final)
-
- if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
-
- if buffi > 0 {
- if DEBUG {
- log.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
- }
- w.Token(0, buffer[:buffi])
- }
-
- // Add an additional sentence ending, if the file is over but no explicit
- // sentence split was reached. This may be controversial and therefore
- // optional via parameter.
- if !dat.array[t0].isTokenEnd() {
- w.SentenceEnd()
- }
-
- // TODO:
- // There may be a new line at the end, from an epsilon,
- // so we may need to go on!
- return true
- }
- */
-
// Check epsilon transitions as long as possible
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
@@ -1075,6 +1091,16 @@
goto PARSECHAR
}
+ // something left in buffer
+ if buffc-bufft > 0 {
+ if DEBUG {
+ log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ w.Token(bufft, buffer[:buffc])
+ sentenceEnd = false
+ textEnd = false
+ }
+
// Add an additional sentence ending, if the file is over but no explicit
// sentence split was reached. This may be controversial and therefore
// optional via parameter.