Make tokenizer robust and never failing

Change-Id: I7f249434bc233b560c8d493f1f0c2abd4d69db91

commit: df27581811cd35c5adbe8031a3585e7d234c9732 [log] [tgz]
author: Akron <nils@diewald-online.de> Sun Mar 27 12:54:46 2022 +0200
committer: Akron <nils@diewald-online.de> Sun Mar 27 12:54:46 2022 +0200
tree: c6e05090ff8437934f13a51fc2eb18cd592b53d8
parent: 4222ac87aa5a497bb71ae0744c1cee342644364c [diff] [blame]
diff --git a/matrix.go b/matrix.go
index 567430b..1861528 100644
--- a/matrix.go
+++ b/matrix.go

@@ -414,12 +414,16 @@
 				if int(char) == EOT {
 					eot = true
 				}
+
+				// mat.sigmaASCII[] is initialized with mat.identity
 				a = mat.sigmaASCII[int(char)]
 			} else {
 				a, ok = mat.sigma[char]
 
 				// Use identity symbol if character is not in sigma
 				if !ok && mat.identity != -1 {
+
+					// TODO: Maybe use unknown?
 					a = mat.identity
 				}
 			}
@@ -434,7 +438,7 @@
 
 				// Maybe not necessary - and should be simpler!
 				// Just Remove
-				t0 &= ^FIRSTBIT
+				// t0 &= ^FIRSTBIT
 				epsilonState = t0
 				epsilonOffset = buffc
 
@@ -444,8 +448,14 @@
 			}
 		}
 
-		// Checks a transition based on t0, a and buffo
-		t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
+		// a == 0 can happen when no identity symbol is defined.
+		// This shouldn't need to be tested in every loop iteration.
+		if a == 0 {
+			t = 0
+		} else {
+			// Checks a transition based on t0, a and buffo
+			t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
+		}
 
 		if DEBUG {
 			// Char is only relevant if set
@@ -468,7 +478,7 @@
 				}
 				a = mat.unknown
 
-			} else if a != mat.epsilon {
+			} else if a != mat.epsilon && epsilonState != 0 {
 
 				// Try again with epsilon symbol, in case everything else failed
 				t0 = epsilonState
@@ -481,7 +491,51 @@
 				}
 
 			} else {
-				break
+
+				if DEBUG {
+					log.Println("Fail!")
+				}
+
+				// w.Fail(bufft)
+
+				// The following procedure is reached when the automaton fails to consume a certain character.
+				// In the tokenization scenario, this means that the tokenizer will drop the old or current data
+				// as a token and start afresh at the root node of the automaton for the remaining data.
+				// It may be beneficial to have something like a "drop()" event to capture these cases,
+				// as they are likely the result of a bad automaton design.
+				if buffc-bufft == 0 {
+					buffc++
+				}
+
+				if DEBUG {
+					log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+				}
+				w.Token(bufft, buffer[:buffc])
+
+				sentenceEnd = false
+				textEnd = false
+
+				if DEBUG {
+					log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
+				}
+
+				for x, i := range buffer[buffc:buffi] {
+					buffer[x] = i
+				}
+
+				buffi -= buffc
+				epsilonState = 0
+
+				buffc = 0
+				bufft = 0
+
+				a = mat.epsilon
+
+				// Restart from root state
+				t = uint32(1)
+				newchar = true
+				// goto PARSECHARM
+				continue
 			}
 
 			newchar = false
@@ -570,6 +624,7 @@
 		if DEBUG {
 			log.Println("Not at the end")
 		}
+		// This should never happen
 		return false
 	}
 
@@ -597,6 +652,16 @@
 		goto PARSECHARM
 	}
 
+	// something left in buffer
+	if buffc-bufft > 0 {
+		if DEBUG {
+			log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
+		}
+		w.Token(bufft, buffer[:buffc])
+		sentenceEnd = false
+		textEnd = false
+	}
+
 	// Add an additional sentence ending, if the file is over but no explicit
 	// sentence split was reached. This may be controversial and therefore
 	// optional via parameter.
commit	df27581811cd35c5adbe8031a3585e7d234c9732	[log] [tgz]
author	Akron <nils@diewald-online.de>	Sun Mar 27 12:54:46 2022 +0200
committer	Akron <nils@diewald-online.de>	Sun Mar 27 12:54:46 2022 +0200
tree	c6e05090ff8437934f13a51fc2eb18cd592b53d8
parent	4222ac87aa5a497bb71ae0744c1cee342644364c [diff] [blame]