Split Tokenizer into Tokenizer and DaTokenizer
diff --git a/datokenizer.go b/datokenizer.go
index b6e877b..4409d98 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -53,16 +53,21 @@
}
type Tokenizer struct {
- sigma map[rune]int
+ // sigma map[rune]int
sigmaRev map[int]rune
arcCount int
stateCount int
sigmaCount int
- maxSize int
- array []int
transitions []map[int]*edge
}
+type DaTokenizer struct {
+ sigma map[rune]int
+ // sigmaRev map[int]rune
+ maxSize int
+ array []int
+}
+
func ParseFile(file string) *Tokenizer {
f, err := os.Open(file)
if err != nil {
@@ -85,7 +90,7 @@
r := bufio.NewReader(ior)
tok := &Tokenizer{
- sigma: make(map[rune]int),
+ // sigma: make(map[rune]int),
sigmaRev: make(map[int]rune),
}
@@ -385,7 +390,6 @@
symbol = rune(NEWLINE)
}
- tok.sigma[symbol] = number
tok.sigmaRev[number] = symbol
}
}
@@ -395,7 +399,15 @@
}
// Implementation of Mizobuchi et al (2000), p.128
-func (tok *Tokenizer) ToDoubleArray() *Tokenizer {
+func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
+
+ dat := &DaTokenizer{
+ sigma: make(map[rune]int),
+ }
+
+ for num, sym := range tok.sigmaRev {
+ dat.sigma[sym] = num
+ }
mark := 0
size := 0
@@ -420,7 +432,7 @@
tok.get_set(s, &A)
// Set base to the first free slot in the double array
- tok.setBase(t, tok.xCheck(A))
+ dat.setBase(t, dat.xCheck(A))
// Iterate over all outgoing symbols
for _, a := range A {
@@ -431,8 +443,8 @@
s1 := tok.transitions[s][a].end
// Store the transition
- t1 := tok.getBase(t) + a
- tok.setCheck(t1, t)
+ t1 := dat.getBase(t) + a
+ dat.setCheck(t1, t)
// Check for representative states
r := in_table(s1, table, size)
@@ -443,24 +455,24 @@
size++
} else {
// Overwrite with the representative state
- tok.setBase(t1, -1*r)
+ dat.setBase(t1, -1*r)
}
} else {
// Store a final transition
- tok.setCheck(tok.getBase(t)+FINAL, t)
+ dat.setCheck(dat.getBase(t)+FINAL, t)
}
}
}
// Following Mizobuchi et al (2000) the size of the
// FSA should be stored in check(1).
- tok.setCheck(1, tok.maxSize+1)
- tok.array = tok.array[:tok.maxSize+1]
- return tok
+ dat.setCheck(1, dat.maxSize+1)
+ dat.array = dat.array[:dat.maxSize+1]
+ return dat
}
// Resize double array when necessary
-func (tok *Tokenizer) resize(l int) {
+func (tok *DaTokenizer) resize(l int) {
// TODO:
// This is a bit too aggressive atm and should be calmed down.
if len(tok.array) <= l {
@@ -469,7 +481,7 @@
}
// Set base value in double array
-func (tok *Tokenizer) setBase(p int, v int) {
+func (tok *DaTokenizer) setBase(p int, v int) {
l := p*2 + 1
tok.resize(l)
if tok.maxSize < l {
@@ -479,7 +491,7 @@
}
// Get base value in double array
-func (tok *Tokenizer) getBase(p int) int {
+func (tok *DaTokenizer) getBase(p int) int {
if p*2 >= len(tok.array) {
return 0
}
@@ -487,7 +499,7 @@
}
// Set check value in double array
-func (tok *Tokenizer) setCheck(p int, v int) {
+func (tok *DaTokenizer) setCheck(p int, v int) {
l := p*2 + 1
tok.resize(l)
if tok.maxSize < l {
@@ -497,7 +509,7 @@
}
// Get check value in double array
-func (tok *Tokenizer) getCheck(p int) int {
+func (tok *DaTokenizer) getCheck(p int) int {
if (p*2)+1 >= len(tok.array) {
return 0
}
@@ -505,12 +517,12 @@
}
// Set size of double array
-func (tok *Tokenizer) setSize(p, v int) {
+func (tok *DaTokenizer) setSize(p, v int) {
tok.setCheck(1, v)
}
// Get size of double array
-func (tok *Tokenizer) getSize(p int) int {
+func (tok *DaTokenizer) getSize(p int) int {
return tok.getCheck(1)
}
@@ -543,7 +555,7 @@
// structure until it finds a gap that fits all outgoing transitions
// of the state. This is extremely slow, but is only necessary in the
// construction phase of the tokenizer.
-func (tok *Tokenizer) xCheck(symbols []int) int {
+func (dat *DaTokenizer) xCheck(symbols []int) int {
// Start at the first entry of the double array list
base := 1
@@ -551,9 +563,9 @@
OVERLAP:
// Resize the array if necessary
- tok.resize((base + FINAL) * 2)
+ dat.resize((base + FINAL) * 2)
for _, a := range symbols {
- if tok.getCheck(base+a) != 0 {
+ if dat.getCheck(base+a) != 0 {
base++
goto OVERLAP
}
@@ -567,7 +579,7 @@
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions.
-func (tok *Tokenizer) Match(input string) bool {
+func (tok *DaTokenizer) Match(input string) bool {
var a int
var tu int
var ok bool
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 8cabb67..f9c10cc 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -12,10 +12,10 @@
// bau | bauamt
tok := ParseFile("testdata/bauamt.fst")
- tok.ToDoubleArray()
- assert.True(tok.Match("bau"))
- assert.True(tok.Match("bauamt"))
- assert.False(tok.Match("baum"))
+ dat := tok.ToDoubleArray()
+ assert.True(dat.Match("bau"))
+ assert.True(dat.Match("bauamt"))
+ assert.False(dat.Match("baum"))
}
func TestSimpleBranches(t *testing.T) {
@@ -23,30 +23,30 @@
// (bau | wahl) (amt | en)
tok := ParseFile("testdata/wahlamt.fst")
- tok.ToDoubleArray()
- assert.False(tok.Match("bau"))
- assert.True(tok.Match("bauamt"))
- assert.True(tok.Match("wahlamt"))
- assert.True(tok.Match("bauen"))
- assert.True(tok.Match("wahlen"))
- assert.False(tok.Match("baum"))
+ dat := tok.ToDoubleArray()
+ assert.False(dat.Match("bau"))
+ assert.True(dat.Match("bauamt"))
+ assert.True(dat.Match("wahlamt"))
+ assert.True(dat.Match("bauen"))
+ assert.True(dat.Match("wahlen"))
+ assert.False(dat.Match("baum"))
}
func TestSimpleTokenizer(t *testing.T) {
assert := assert.New(t)
tok := ParseFile("testdata/simpletok.fst")
- tok.ToDoubleArray()
- assert.True(tok.Match("bau"))
- assert.True(tok.Match("bad"))
- assert.True(tok.Match("wald gehen"))
+ dat := tok.ToDoubleArray()
+ assert.True(dat.Match("bau"))
+ assert.True(dat.Match("bad"))
+ assert.True(dat.Match("wald gehen"))
}
func TestFullTokenizer(t *testing.T) {
assert := assert.New(t)
tok := ParseFile("testdata/tokenizer.fst")
- tok.ToDoubleArray()
- fmt.Println("Size:", tok.maxSize)
- assert.True(tok.Match("bau"))
- assert.True(tok.Match("bad"))
- assert.True(tok.Match("wald gehen"))
+ dat := tok.ToDoubleArray()
+ fmt.Println("Size:", dat.maxSize)
+ assert.True(dat.Match("bau"))
+ assert.True(dat.Match("bad"))
+ assert.True(dat.Match("wald gehen"))
}