Simplify transducer (single test broken)

commit: b7e1f133b92aa7cccbcab20b33132c3917a2ad2f [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 10 11:52:31 2021 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 10 11:52:31 2021 +0200
tree: 0c2f1e093e18b8199978e8a93102d595fddb935b
parent: df0a3ef3943832984515f19f84b0df2f0894047b [diff]
diff --git a/datokenizer.go b/datokenizer.go
index b89bccd..af1e70f 100644
--- a/datokenizer.go
+++ b/datokenizer.go

@@ -1135,7 +1135,8 @@
 func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
 	var a int
 	var t0 uint32
-	var ok, nontoken, tokenend bool
+	t := uint32(1) // Initial state
+	var ok, rewindBuffer bool
 
 	// Remember the last position of a possible tokenend,
 	// in case the automaton fails.
@@ -1152,6 +1153,10 @@
 	// epsilon transitions, to support tokenizations like:
 	// "this is an example|.| And it works." vs
 	// "this is an example.com| application."
+	//
+	// TODO:
+	//   Store a translation buffer as well, so characters don't
+	//   have to be translated multiple times!
 	buffer := make([]rune, 1024)
 	buffo := 0 // Buffer offset
 	buffi := 0 // Buffer length
@@ -1160,60 +1165,63 @@
 	writer := bufio.NewWriter(w)
 	defer writer.Flush()
 
-	t := uint32(1) // Initial state
-
 	var char rune
 	var err error
 	eof := false
+	newchar := true
 
 	for {
 
-		// Get from reader if buffer is empty
-		if buffo >= buffi {
-			char, _, err = reader.ReadRune()
+		if newchar {
+			// Get from reader if buffer is empty
+			if buffo >= buffi {
+				char, _, err = reader.ReadRune()
 
-			// No more runes to read
-			if err != nil {
-				eof = true
-				break
+				// No more runes to read
+				if err != nil {
+					eof = true
+					break
+				}
+				buffer[buffi] = char
+				buffi++
 			}
-			buffer[buffi] = char
-			buffi++
+
+			char = buffer[buffo]
+
+			if DEBUG {
+				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+			}
+
+			// TODO: Better not repeatedly check for a!
+			a, ok = dat.sigma[char]
+
+			// Use identity symbol if character is not in sigma
+			if !ok && dat.identity != -1 {
+				a = dat.identity
+			}
+
+			t0 = t
+
+			// Check for epsilon transitions and remember
+			if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
+				// Remember state for backtracking to last tokenend state
+				epsilonState = t0
+				epsilonOffset = buffo
+			}
 		}
 
-		char = buffer[buffo]
-
-		if DEBUG {
-			fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
-		}
-
-		a, ok = dat.sigma[char]
-
-		// Use identity symbol if character is not in sigma
-		if !ok && dat.identity != -1 {
-			a = dat.identity
-		}
-
-		t0 = t
-
-		// Check for epsilon transitions and remember
-		if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
-			// Remember state for backtracking to last tokenend state
-			epsilonState = t0
-			epsilonOffset = buffo
-		}
-
-	CHECK:
-		nontoken = false
-		tokenend = false
-
+		// Checks a transition based on t0, a and buffo
 		t = dat.getBase(t0) + uint32(a)
 
 		if DEBUG {
-			fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t, dat.outgoing(t0))
+			// Char is only relevant if set
+			fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
+			if false {
+				fmt.Println(dat.outgoing(t0))
+			}
 		}
 
-		// Check if the transition is valid according to the double array
+		// Check if the transition is invalid according to the double array
 		if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
 
 			if DEBUG {
@@ -1223,6 +1231,7 @@
 			if !ok && a == dat.identity {
 
 				// Try again with unknown symbol, in case identity failed
+				// Char is only relevant when set
 				if DEBUG {
 					fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
 				}
@@ -1244,24 +1253,12 @@
 				break
 			}
 
-			goto CHECK
-
+			newchar = false
+			continue
 		}
 
-		// Move to representative state
-		nontoken = dat.isNonToken(t)
-		tokenend = dat.isTokenEnd(t)
-
-		// Check for representative states
-		if dat.isSeparate(t) {
-			t = dat.getBase(t)
-
-			if DEBUG {
-				fmt.Println("Representative pointing to", t)
-			}
-		}
-
-		rewindBuffer := false
+		// Transition was successful
+		rewindBuffer = false
 
 		// Transition consumes a character
 		if a != dat.epsilon {
@@ -1269,7 +1266,7 @@
 			buffo++
 
 			// Transition does not produce a character
-			if nontoken && buffo == 1 {
+			if dat.isNonToken(t) && buffo == 1 {
 				if DEBUG {
 					fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
 				}
@@ -1277,7 +1274,8 @@
 			}
 		}
 
-		if tokenend { // Transition marks the end of a token
+		// Transition marks the end of a token
+		if dat.isTokenEnd(t) {
 
 			data := []byte(string(buffer[:buffo]))
 			if DEBUG {
@@ -1304,8 +1302,18 @@
 			}
 		}
 
+		// Move to representative state
+		if dat.isSeparate(t) {
+			t = dat.getBase(t)
+
+			if DEBUG {
+				fmt.Println("Representative pointing to", t)
+			}
+		}
+
 		// TODO:
 		//   Prevent endless epsilon loops!
+		newchar = true
 	}
 
 	// Input reader is not yet finished
@@ -1318,6 +1326,10 @@
 
 FINALCHECK:
 
+	if DEBUG {
+		fmt.Println("Entering final check")
+	}
+
 	// Automaton is in a final state
 	if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
 
@@ -1336,6 +1348,15 @@
 		return true
 	}
 
+	// Try again with epsilon symbol, in case everything else failed
+	/*
+		t0 = epsilonState
+		epsilonState = 0 // reset
+		buffo = epsilonOffset
+		a = dat.epsilon
+		goto CHECK
+	*/
+
 	// Check epsilon transitions until a final state is reached
 	t0 = t
 	t = dat.getBase(t0) + uint32(dat.epsilon)

diff --git a/datokenizer_test.go b/datokenizer_test.go
index 39aa775..e861b08 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go

@@ -170,3 +170,19 @@
 	assert.Equal("", tokens[7])
 	assert.Equal(8, len(tokens))
 }
+
+// TestFullTokenizerSentenceSplitter transduces "Mann." and expects the output to form a single "\n\n"-separated chunk.
+func TestFullTokenizerSentenceSplitter(t *testing.T) {
+	assert := assert.New(t)
+	dat := LoadDatokFile("testdata/tokenizer.datok")
+	assert.NotNil(dat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+
+	// testSentSplitterSimple: one short sentence in, one chunk out
+	r := strings.NewReader("Mann.")
+	assert.True(dat.Transduce(r, w))
+	sentences := strings.Split(w.String(), "\n\n")
+	assert.Equal(1, len(sentences)) // expected value first, then the actual one
+}
commit	b7e1f133b92aa7cccbcab20b33132c3917a2ad2f	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 10 11:52:31 2021 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 10 11:52:31 2021 +0200
tree	0c2f1e093e18b8199978e8a93102d595fddb935b
parent	df0a3ef3943832984515f19f84b0df2f0894047b [diff]