Turn special sigma values into properties

commit: c17f1ca207cca2f620ce80deaba6265185bb4bc6 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 03 19:47:27 2021 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 03 19:47:27 2021 +0200
tree: e71e09b4d710278397b5301246fbe7a9cf36cbf1
parent: 6247a5d0b7fad37ee5d97109894b0135f505be1b [diff]
diff --git a/datokenizer.go b/datokenizer.go
index 1ebcd91..826ba87 100644
--- a/datokenizer.go
+++ b/datokenizer.go

@@ -9,9 +9,6 @@
 // TODO:
 // - replace maxSize with the check value
 // - Strip first state and make everything start with 0!
-// - Serialize!
-// - Split Tokenizer and DATokenizer
-// - Make epsilon etc. properties
 
 import (
 	"bufio"
@@ -40,12 +37,6 @@
 	VERSION = uint16(1)
 )
 
-// Special symbols in sigma
-var EPSILON = -1
-var UNKNOWN = -1
-var IDENTITY = -1
-var FINAL = -1
-
 var bo binary.ByteOrder = binary.LittleEndian
 
 type mapping struct {
@@ -66,6 +57,12 @@
 	stateCount  int
 	sigmaCount  int
 	transitions []map[int]*edge
+
+	// Special symbols in sigma
+	epsilon  int
+	unknown  int
+	identity int
+	final    int
 }
 
 type DaTokenizer struct {
@@ -74,9 +71,15 @@
 	maxSize   int
 	loadLevel float64
 	array     []int
+
+	// Special symbols in sigma
+	epsilon  int
+	unknown  int
+	identity int
+	final    int
 }
 
-func ParseFile(file string) *Tokenizer {
+func ParseFoma(file string) *Tokenizer {
 	f, err := os.Open(file)
 	if err != nil {
 		log.Error().Err(err)
@@ -98,8 +101,11 @@
 	r := bufio.NewReader(ior)
 
 	tok := &Tokenizer{
-		// sigma:    make(map[rune]int),
 		sigmaRev: make(map[int]rune),
+		epsilon:  -1,
+		unknown:  -1,
+		identity: -1,
+		final:    -1,
 	}
 
 	var state, inSym, outSym, end, final int
@@ -130,7 +136,7 @@
 			// Adds a final transition symbol to sigma
 			// written as '#' in Mizobuchi et al (2000)
 			tok.sigmaCount++
-			FINAL = tok.sigmaCount
+			tok.final = tok.sigmaCount
 			continue
 		}
 		if strings.HasPrefix(line, "##sigma##") {
@@ -267,10 +273,10 @@
 				if inSym != outSym {
 
 					// Allow any epsilon to become a newline
-					if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
+					if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
 
 						// Allow any whitespace to be ignored
-						!(inSym != EPSILON && outSym == EPSILON) &&
+						!(inSym != tok.epsilon && outSym == tok.epsilon) &&
 
 						// Allow any whitespace to become a new line
 						!(tok.sigmaRev[outSym] == NEWLINE) {
@@ -320,7 +326,7 @@
 
 				// Add final transition
 				if final == 1 {
-					tok.transitions[state+1][FINAL] = &edge{}
+					tok.transitions[state+1][tok.final] = &edge{}
 				}
 
 				if DEBUG {
@@ -364,18 +370,18 @@
 					switch elem[1] {
 					case "@_EPSILON_SYMBOL_@":
 						{
-							EPSILON = number
+							tok.epsilon = number
 							continue
 						}
 					case "@_UNKNOWN_SYMBOL_@":
 						{
-							UNKNOWN = number
+							tok.unknown = number
 							continue
 						}
 
 					case "@_IDENTITY_SYMBOL_@":
 						{
-							IDENTITY = number
+							tok.identity = number
 							continue
 						}
 					default:
@@ -412,6 +418,10 @@
 	dat := &DaTokenizer{
 		sigma:     make(map[rune]int),
 		loadLevel: -1,
+		final:     tok.final,
+		unknown:   tok.unknown,
+		identity:  tok.identity,
+		epsilon:   tok.epsilon,
 	}
 
 	for num, sym := range tok.sigmaRev {
@@ -450,7 +460,7 @@
 		// Iterate over all outgoing symbols
 		for _, a := range A {
 
-			if a != FINAL {
+			if a != tok.final {
 
 				// Aka g(s, a)
 				s1 := tok.transitions[s][a].end
@@ -472,7 +482,7 @@
 				}
 			} else {
 				// Store a final transition
-				dat.setCheck(dat.getBase(t)+FINAL, t)
+				dat.setCheck(dat.getBase(t)+dat.final, t)
 			}
 		}
 	}
@@ -576,7 +586,7 @@
 OVERLAP:
 
 	// Resize the array if necessary
-	dat.resize((base + FINAL) * 2)
+	dat.resize((base + dat.final) * 2)
 	for _, a := range symbols {
 		if dat.getCheck(base+a) != 0 {
 			base++
@@ -625,10 +635,10 @@
 
 	buf := make([]byte, 0, 12)
 	bo.PutUint16(buf[0:2], VERSION)
-	bo.PutUint16(buf[2:4], uint16(EPSILON))
-	bo.PutUint16(buf[4:6], uint16(UNKNOWN))
-	bo.PutUint16(buf[6:8], uint16(IDENTITY))
-	bo.PutUint16(buf[8:10], uint16(FINAL))
+	bo.PutUint16(buf[2:4], uint16(dat.epsilon))
+	bo.PutUint16(buf[4:6], uint16(dat.unknown))
+	bo.PutUint16(buf[6:8], uint16(dat.identity))
+	bo.PutUint16(buf[8:10], uint16(dat.final))
 	bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
 	more, err := w.Write(buf[0:12])
 	if err != nil {
@@ -695,11 +705,11 @@
 		a, ok = tok.sigma[chars[i]]
 
 		// Support identity symbol if character is not in sigma
-		if !ok && IDENTITY != -1 {
+		if !ok && tok.identity != -1 {
 			if DEBUG {
-				fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
+				fmt.Println("IDENTITY symbol", string(chars[i]), "->", tok.identity)
 			}
-			a = IDENTITY
+			a = tok.identity
 		} else if DEBUG {
 			fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
 		}
@@ -714,19 +724,19 @@
 				fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
 			}
 
-			if !ok && a == IDENTITY {
+			if !ok && a == tok.identity {
 				// Try again with unknown symbol, in case identity failed
 				if DEBUG {
-					fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
+					fmt.Println("UNKNOWN symbol", string(chars[i]), "->", tok.unknown)
 				}
-				a = UNKNOWN
+				a = tok.unknown
 
-			} else if a != EPSILON {
+			} else if a != tok.epsilon {
 				// Try again with epsilon symbol, in case everything else failed
 				if DEBUG {
-					fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
+					fmt.Println("EPSILON symbol", string(chars[i]), "->", tok.epsilon)
 				}
-				a = EPSILON
+				a = tok.epsilon
 			} else {
 				break
 			}
@@ -737,7 +747,7 @@
 		}
 
 		// Transition is fine
-		if a != EPSILON {
+		if a != tok.epsilon {
 			// Character consumed
 			i++
 		}
@@ -755,13 +765,13 @@
 FINALCHECK:
 
 	// Automaton is in a final state
-	if tok.getCheck(tok.getBase(t)+FINAL) == t {
+	if tok.getCheck(tok.getBase(t)+tok.final) == t {
 		return true
 	}
 
 	// Check epsilon transitions until a final state is reached
 	tu = t
-	t = tok.getBase(tu) + EPSILON
+	t = tok.getBase(tu) + tok.epsilon
 
 	// Epsilon transition failed
 	if t > tok.getCheck(1) || tok.getCheck(t) != tu {

diff --git a/datokenizer_test.go b/datokenizer_test.go
index 4cc5619..7f1d1d5 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go

@@ -11,7 +11,7 @@
 	assert := assert.New(t)
 
 	// bau | bauamt
-	tok := ParseFile("testdata/bauamt.fst")
+	tok := ParseFoma("testdata/bauamt.fst")
 	dat := tok.ToDoubleArray()
 	assert.True(dat.Match("bau"))
 	assert.True(dat.Match("bauamt"))
@@ -22,7 +22,7 @@
 	assert := assert.New(t)
 
 	// (bau | wahl) (amt | en)
-	tok := ParseFile("testdata/wahlamt.fst")
+	tok := ParseFoma("testdata/wahlamt.fst")
 	dat := tok.ToDoubleArray()
 	assert.False(dat.Match("bau"))
 	assert.True(dat.Match("bauamt"))
@@ -34,7 +34,7 @@
 
 func TestSimpleTokenizer(t *testing.T) {
 	assert := assert.New(t)
-	tok := ParseFile("testdata/simpletok.fst")
+	tok := ParseFoma("testdata/simpletok.fst")
 	dat := tok.ToDoubleArray()
 	assert.True(dat.Match("bau"))
 	assert.True(dat.Match("bad"))
@@ -43,7 +43,7 @@
 
 func TestWriteTokenizer(t *testing.T) {
 	assert := assert.New(t)
-	tok := ParseFile("testdata/simpletok.fst")
+	tok := ParseFoma("testdata/simpletok.fst")
 	dat := tok.ToDoubleArray()
 	assert.True(dat.Match("bau"))
 	assert.True(dat.Match("bad"))
commit	c17f1ca207cca2f620ce80deaba6265185bb4bc6	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 03 19:47:27 2021 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 03 19:47:27 2021 +0200
tree	e71e09b4d710278397b5301246fbe7a9cf36cbf1
parent	6247a5d0b7fad37ee5d97109894b0135f505be1b [diff]