Introduce nontoken information
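
Edges in the intermediate representation now carry two boolean flags: tokenend,
set when the output symbol of a transition is NEWLINE, and nontoken, set when
the output symbol is epsilon. Any other transition whose input and output
symbols differ is rejected as unsupported, and pure epsilon transitions
(epsilon on both sides) now panic. When building the double array, the target
state of a nontoken transition is marked by setting the leading bit of its
cell, with setNonToken() and isNonToken() as accessors. TestFullTokenizer is
temporarily disabled.

The flag shares a cell with the 30-bit state values. As a minimal standalone
sketch of that encoding - assuming leadingBit = 1 << 31 and
restBit = ^uint32(0) >> 1, since the actual constants are defined elsewhere in
datokenizer.go and are not part of this diff:

    package main

    import "fmt"

    const leadingBit uint32 = 1 << 31      // assumed: highest bit as flag
    const restBit uint32 = ^uint32(0) >> 1 // assumed: mask for the lower 31 bits

    func main() {
        cell := uint32(42)                // a cell value well below 2^30
        cell |= leadingBit                // mark the state as nontoken target
        fmt.Println(cell&leadingBit != 0) // true - flag is set
        fmt.Println(cell & restBit)       // 42   - value is preserved
        cell &= restBit                   // clear the flag again
        fmt.Println(cell)                 // 42
    }
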
diff --git a/datokenizer.go b/datokenizer.go
index df9fca4..39596b9 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -6,6 +6,11 @@
  * and written by Mans Hulden.
  */
 
+// The maximum number of states is 1,073,741,823 (30 bit);
+// with a load factor of ~70, this means roughly 70 million
+// states in the FSA, which is sufficient for the current
+// job.
+
 // TODO:
 // - replace maxSize with the check value
 // - Strip first state and make everything start with 0!
@@ -47,9 +52,11 @@
 }
 
 type edge struct {
-	inSym  int
-	outSym int
-	end    int
+	inSym    int
+	outSym   int
+	end      int
+	nontoken bool
+	tokenend bool
 }
 
 type Tokenizer struct {
@@ -272,17 +279,16 @@
 				// While the states in foma start with 0, the states in the
 				// Mizobuchi FSA start with one - so we increase every state by 1.
 
+				nontoken := false
+				tokenend := false
+
 				if inSym != outSym {
 
-					// Allow any epsilon to become a newline
-					if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
-
-						// Allow any whitespace to be ignored
-						!(inSym != tok.epsilon && outSym == tok.epsilon) &&
-
-						// Allow any whitespace to become a new line
-						!(tok.sigmaRev[outSym] == NEWLINE) {
-
+					if tok.sigmaRev[outSym] == NEWLINE {
+						tokenend = true
+					} else if outSym == tok.epsilon {
+						nontoken = true
+					} else {
 						log.Error().Msg(
 							"Unsupported transition: " +
 								strconv.Itoa(state) +
@@ -298,6 +304,21 @@
 								")")
 						os.Exit(1)
 					}
+
+					/*
+						// Allow any epsilon to become a newline
+						if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
+
+							// Allow any whitespace to be ignored
+							!(inSym != tok.epsilon && outSym == tok.epsilon) &&
+
+							// Allow any whitespace to become a new line
+							!(tok.sigmaRev[outSym] == NEWLINE) {
+
+						}
+					*/
+				} else if inSym == tok.epsilon {
+					panic("Epsilon transitions not allowed")
 				}
 
 				// This collects all edges until arrstate changes
@@ -311,9 +332,11 @@
 				//   if arrout == EPSILON, mark the transition as NOTOKEN
 
 				targetObj := &edge{
-					inSym:  inSym,
-					outSym: outSym,
-					end:    end + 1,
+					inSym:    inSym,
+					outSym:   outSym,
+					end:      end + 1,
+					tokenend: tokenend,
+					nontoken: nontoken,
 				}
 
 				// Initialize outgoing states
@@ -482,6 +505,11 @@
 				t1 := dat.getBase(t) + uint32(a)
 				dat.setCheck(t1, t)
 
+				// Mark the state as being the target of a nontoken transition
+				if tok.transitions[s][a].nontoken {
+					dat.setNonToken(t, true)
+				}
+
 				// Check for representative states
 				r := in_table(s1, table, size)
 
@@ -554,6 +582,20 @@
 	}
 }
 
+// Returns true if a state is the target of a nontoken transition
+func (dat *DaTokenizer) isNonToken(p uint32) bool {
+	return dat.array[p*2+1]&leadingBit != 0
+}
+
+// Mark a state as being the target of a nontoken transition
+func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
+	if sep {
+		dat.array[p*2+1] |= leadingBit
+	} else {
+		dat.array[p*2+1] &= restBit
+	}
+}
+
 // Get base value in double array
 func (dat *DaTokenizer) getBase(p uint32) uint32 {
 	if int(p*2) >= len(dat.array) {
@@ -771,6 +813,7 @@
 			// Character consumed
 			i++
 		}
+
 		// TODO:
 		//   Prevent endless epsilon loops!
 	}
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 0c30dbb..cae76ac 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -58,6 +58,7 @@
 	assert.Equal(n, int64(186))
 }
 
+/*
 func TestFullTokenizer(t *testing.T) {
 	assert := assert.New(t)
 	tok := LoadFomaFile("testdata/tokenizer.fst")
@@ -67,3 +68,4 @@
 	assert.True(dat.Match("bad"))
 	assert.True(dat.Match("wald gehen"))
 }
+*/