Cleanup code

commit: 740f3d746e27f3ac40d5afc21aa2b0170320ddcb [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 03 12:12:34 2021 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 03 15:29:34 2021 +0200
tree: d048543c33b8136a5ec11c43a10f017b668c365d
parent: 49d27ee2cc68204a33b2817c0b18ee0430339193 [diff] [blame]
diff --git a/datokenizer.go b/datokenizer.go
index e1f55dc..e128d49 100644
--- a/datokenizer.go
+++ b/datokenizer.go

@@ -6,6 +6,12 @@
  * and written by Mans Hulden.
  */
 
+// TODO:
+// - replace maxSize with the check value
+// - Strip first state and make everything start with 0!
+// - Serialize!
+// - Split Tokenizer and DATokenizer
+
 import (
 	"bufio"
 	"compress/gzip"
@@ -15,6 +21,8 @@
 	"strconv"
 	"strings"
 	"unicode/utf8"
+
+	"github.com/rs/zerolog/log"
 )
 
 const (
@@ -23,6 +31,7 @@
 	STATES  = 3
 	NONE    = 4
 	NEWLINE = '\u000a'
+	DEBUG   = false
 )
 
 // Special symbols in sigma
@@ -37,49 +46,49 @@
 }
 
 type edge struct {
-	in     int
-	out    int
-	target int
+	inSym  int
+	outSym int
+	end    int
 }
 
 type Tokenizer struct {
 	sigma       map[rune]int
-	sigma_rev   map[int]rune
-	arccount    int
-	statecount  int
-	sigmacount  int
-	maxsize     int
+	sigmaRev    map[int]rune
+	arcCount    int
+	stateCount  int
+	sigmaCount  int
+	maxSize     int
 	array       []int
 	transitions []map[int]*edge
 }
 
-func parse_file(file string) *Tokenizer {
+func ParseFile(file string) *Tokenizer {
 	f, err := os.Open(file)
 	if err != nil {
-		panic(err)
+		log.Error().Err(err)
+		os.Exit(0)
 	}
 	defer f.Close()
 
 	gz, err := gzip.NewReader(f)
 	if err != nil {
-		panic(err)
+		log.Error().Err(err)
+		os.Exit(0)
 	}
 	defer gz.Close()
 
-	return parse(gz)
+	return Parse(gz)
 }
 
-func parse(ior io.Reader) *Tokenizer {
+func Parse(ior io.Reader) *Tokenizer {
 	r := bufio.NewReader(ior)
 
 	tok := &Tokenizer{
-		sigma:     make(map[rune]int),
-		sigma_rev: make(map[int]rune),
+		sigma:    make(map[rune]int),
+		sigmaRev: make(map[int]rune),
 	}
 
-	final := false
-
-	var arrstate, arrin, arrout, arrtarget, arrfinal int
+	var state, inSym, outSym, end, final int
 
 	mode := 0
 	var elem []string
@@ -91,7 +100,8 @@
 			if err == io.EOF {
 				break
 			}
-			panic(err)
+			log.Error().Err(err)
+			os.Exit(0)
 		}
 		if strings.HasPrefix(line, "##foma-net") {
 			continue
@@ -105,8 +115,8 @@
 
 			// Adds a final transition symbol to sigma
 			// written as '#' in Mizobuchi et al (2000)
-			tok.sigmacount++
-			FINAL = tok.sigmacount
+			tok.sigmaCount++
+			FINAL = tok.sigmaCount
 			continue
 		}
 		if strings.HasPrefix(line, "##sigma##") {
@@ -138,26 +148,30 @@
 					fmt.Println("name:             " + elem[12])
 				*/
 				if elem[6] != "1" {
-					panic("The FST needs to be deterministic")
+					log.Error().Msg("The FST needs to be deterministic")
+					os.Exit(1)
 				}
 				if elem[9] != "1" {
-					panic("The FST needs to be epsilon free")
+					log.Error().Msg("The FST needs to be epsilon free")
+					os.Exit(1)
 				}
 
 				elemint[0], err = strconv.Atoi(elem[1])
 				if err != nil {
-					panic("Can't read arccount")
+					log.Error().Msg("Can't read arccount")
+					os.Exit(1)
 				}
-				tok.arccount = elemint[0]
+				tok.arcCount = elemint[0]
 
 				// States start at 1 in Mizobuchi et al (2000),
 				// as the state 0 is associated with a fail.
 				// Initialize states and transitions
 				elemint[0], err = strconv.Atoi(elem[2])
 				if err != nil {
-					panic("Can't read statecount")
+					log.Error().Msg("Can't read statecount")
+					os.Exit(1)
 				}
-				tok.statecount = elemint[0]
+				tok.stateCount = elemint[0]
 				tok.transitions = make([]map[int]*edge, elemint[0]+1)
 				continue
 			}
@@ -200,74 +214,72 @@
 				switch len(elem) {
 				case 5:
 					{
-						arrstate = elemint[0]
-						arrin = elemint[1]
-						arrout = elemint[2]
-						arrtarget = elemint[3]
-						arrfinal = elemint[4]
+						state = elemint[0]
+						inSym = elemint[1]
+						outSym = elemint[2]
+						end = elemint[3]
+						final = elemint[4]
 					}
 				case 4:
 					{
 						if elemint[1] == -1 {
-							arrstate = elemint[0]
-							arrfinal = elemint[3]
+							state = elemint[0]
+							final = elemint[3]
 						} else {
-							arrstate = elemint[0]
-							arrin = elemint[1]
-							arrtarget = elemint[2]
-							arrfinal = elemint[3]
-							arrout = arrin
+							state = elemint[0]
+							inSym = elemint[1]
+							end = elemint[2]
+							final = elemint[3]
+							outSym = inSym
 						}
 					}
 				case 3:
 					{
-						arrin = elemint[0]
-						arrout = elemint[1]
-						arrtarget = elemint[2]
+						inSym = elemint[0]
+						outSym = elemint[1]
+						end = elemint[2]
 					}
 				case 2:
 					{
-						arrin = elemint[0]
-						arrtarget = elemint[1]
-						arrout = arrin
+						inSym = elemint[0]
+						end = elemint[1]
+						outSym = inSym
 					}
 				}
 
-				// This collects all edges until arrstate changes
-				if arrfinal == 1 {
-					final = true
-				} else {
-					final = false
-				}
-
 				// While the states in foma start with 0, the states in the
 				// Mizobuchi FSA start with one - so we increase every state by 1.
 
-				/*
-					if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
-						panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
-					}
-				*/
-				if arrin != arrout {
-					if arrin == EPSILON && tok.sigma_rev[arrout] == NEWLINE {
-					} else if arrin != EPSILON && arrout == EPSILON {
-					} else {
-						panic(
-							"Problem: " +
-								strconv.Itoa(arrstate) +
-								" -> " + strconv.Itoa(arrtarget) +
+				if inSym != outSym {
+
+					// Allow any epsilon to become a newline
+					if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
+
+						// Allow any whitespace to be ignored
+						!(inSym != EPSILON && outSym == EPSILON) &&
+
+						// Allow any whitespace to become a new line
+						!(tok.sigmaRev[outSym] == NEWLINE) {
+
+						log.Error().Msg(
+							"Unsupported transition: " +
+								strconv.Itoa(state) +
+								" -> " + strconv.Itoa(end) +
 								" (" +
-								strconv.Itoa(arrin) +
+								strconv.Itoa(inSym) +
 								":" +
-								strconv.Itoa(arrout) +
+								strconv.Itoa(outSym) +
 								") (" +
-								string(tok.sigma_rev[arrin]) +
+								string(tok.sigmaRev[inSym]) +
 								":" +
-								string(tok.sigma_rev[arrout]) +
+								string(tok.sigmaRev[outSym]) +
 								")")
+						os.Exit(1)
 					}
 				}
 
+				// This collects all edges until arrstate changes
+
 				// TODO:
 				//   if arrin == EPSILON && arrout == TOKENEND, mark state as newline
 				//   if the next transition is the same, remove TOKENEND and add SENTENCEEND
@@ -277,35 +289,39 @@
 				//   if arrout == EPSILON, mark the transition as NOTOKEN
 
 				targetObj := &edge{
-					in:     arrin,
-					out:    arrout,
-					target: arrtarget + 1,
+					inSym:  inSym,
+					outSym: outSym,
+					end:    end + 1,
 				}
 
-				// Initialize outgoing state
-				if tok.transitions[arrstate+1] == nil {
-					tok.transitions[arrstate+1] = make(map[int]*edge)
+				// Initialize outgoing states
+				if tok.transitions[state+1] == nil {
+					tok.transitions[state+1] = make(map[int]*edge)
 				}
 
-				if arrin >= 0 {
-					tok.transitions[arrstate+1][arrin] = targetObj
+				// Ignore transitions with invalid symbols
+				if inSym >= 0 {
+					tok.transitions[state+1][inSym] = targetObj
 				}
 
-				if final {
-					tok.transitions[arrstate+1][FINAL] = &edge{}
+				// Add final transition
+				if final == 1 {
+					tok.transitions[state+1][FINAL] = &edge{}
 				}
 
-				fmt.Println("Add",
-					arrstate+1, "->", arrtarget+1,
-					"(",
-					arrin,
-					":",
-					arrout,
-					") (",
-					string(tok.sigma_rev[arrin]),
-					":",
-					string(tok.sigma_rev[arrout]),
-					")")
+				if DEBUG {
+					fmt.Println("Add",
+						state+1, "->", end+1,
+						"(",
+						inSym,
+						":",
+						outSym,
+						") (",
+						string(tok.sigmaRev[inSym]),
+						":",
+						string(tok.sigmaRev[outSym]),
+						")")
+				}
 
 				continue
 			}
@@ -317,10 +333,11 @@
 				number, err := strconv.Atoi(elem[0])
 
 				if err != nil {
-					panic(err)
+					log.Error().Err(err)
+					os.Exit(0)
 				}
 
-				tok.sigmacount = number
+				tok.sigmaCount = number
 
 				var symbol rune
 
@@ -348,23 +365,27 @@
 							continue
 						}
 					default:
-						panic("MCS not supported: " + line)
+						{
+							log.Error().Msg("MCS not supported: " + line)
+							os.Exit(1)
+						}
 					}
 
-					// Probably a new line symbol
-				} else {
+				} else { // Probably a new line symbol
 					line, err = r.ReadString('\n')
 					if err != nil {
-						panic(err)
+						log.Error().Err(err)
+						os.Exit(0)
 					}
 					if len(line) != 1 {
-						panic("MCS not supported:" + line)
+						log.Error().Msg("MCS not supported:" + line)
+						os.Exit(0)
 					}
-					symbol = rune('\n')
+					symbol = rune(NEWLINE)
 				}
 
 				tok.sigma[symbol] = number
-				tok.sigma_rev[number] = symbol
+				tok.sigmaRev[number] = symbol
 			}
 		}
 	}
@@ -373,110 +394,129 @@
 }
 
 // Implementation of Mizobuchi et al (2000), p.128
-func (tok *Tokenizer) buildDA() *Tokenizer {
+func (tok *Tokenizer) ToDoubleArray() *Tokenizer {
 
 	mark := 0
 	size := 0
 
 	// Create a mapping from s to t
-	table := make([]*mapping, tok.arccount+1)
+	table := make([]*mapping, tok.arcCount+1)
 
 	table[size] = &mapping{source: 1, target: 1}
 	size++
 
-	A := make([]int, 0, 256)
+	// Allocate space for the outgoing symbol range
+	A := make([]int, 0, tok.sigmaCount)
 
 	for mark < size {
 		s := table[mark].source // This is a state in Ms
 		t := table[mark].target // This is a state in Mt
 		mark++
-		//		fmt.Println("Increase mark", mark)
-		// St := append(St, t)
+
+		// Following the paper, here the state t can be remembered
+		// in the set of states St
 		A = A[:0]
 		tok.get_set(s, &A)
 
-		// fmt.Println("Outgoing arcs from t", t, A)
+		// Set base to the first free slot in the double array
+		tok.setBase(t, tok.xCheck(A))
 
-		// tok.array[t].base = tok.x_check(A)
-		tok.set_base(t, tok.x_check(A))
-
+		// Iterate over all outgoing symbols
 		for _, a := range A {
 
 			if a != FINAL {
-				s1 := tok.transitions[s][a].target // g(s, a)
 
-				// fmt.Println("Found", s, "to", s1, "via", a)
+				// Aka g(s, a)
+				s1 := tok.transitions[s][a].end
 
-				t1 := tok.get_base(t) + a
-				tok.set_check(t1, t)
+				// Store the transition
+				t1 := tok.getBase(t) + a
+				tok.setCheck(t1, t)
 
+				// Check for representative states
 				r := in_table(s1, table, size)
+
 				if r == 0 {
-					// fmt.Println("Increase size", t1)
+					// Remember the mapping
 					table[size] = &mapping{source: s1, target: t1}
 					size++
 				} else {
-					//fmt.Println("Rep is there", t1, r)
-					tok.set_base(t1, -1*r)
-					// tok.array[t1].base = -1 * r
+					// Overwrite with the representative state
+					tok.setBase(t1, -1*r)
 				}
 			} else {
-				fmt.Println("I set a final")
-				// t1 := tok.array[t].base + FINAL
-				t1 := tok.get_base(t) + FINAL
-				// tok.array[t1].check = t
-				tok.set_check(t1, t)
+				// Store a final transition
+				tok.setCheck(tok.getBase(t)+FINAL, t)
 			}
 		}
 	}
 
 	// Following Mizobuchi et al (2000) the size of the
 	// FSA should be stored in check(1).
-	tok.set_check(1, tok.maxsize+1)
-	tok.array = tok.array[:tok.maxsize+1]
+	tok.setCheck(1, tok.maxSize+1)
+	tok.array = tok.array[:tok.maxSize+1]
 	return tok
 }
 
+// Resize double array when necessary
 func (tok *Tokenizer) resize(l int) {
+	// TODO:
+	//   This is a bit too aggressive atm and should be calmed down.
 	if len(tok.array) <= l {
 		tok.array = append(tok.array, make([]int, l)...)
 	}
 }
 
-func (tok *Tokenizer) set_base(p int, v int) {
+// Set base value in double array
+func (tok *Tokenizer) setBase(p int, v int) {
 	l := p*2 + 1
 	tok.resize(l)
-	if tok.maxsize < l {
-		tok.maxsize = l
+	if tok.maxSize < l {
+		tok.maxSize = l
 	}
 	tok.array[p*2] = v
 }
 
-func (tok *Tokenizer) get_base(p int) int {
+// Get base value in double array
+func (tok *Tokenizer) getBase(p int) int {
 	if p*2 >= len(tok.array) {
 		return 0
 	}
 	return tok.array[p*2]
 }
 
-func (tok *Tokenizer) set_check(p int, v int) {
+// Set check value in double array
+func (tok *Tokenizer) setCheck(p int, v int) {
 	l := p*2 + 1
 	tok.resize(l)
-	if tok.maxsize < l {
-		tok.maxsize = l
+	if tok.maxSize < l {
+		tok.maxSize = l
 	}
 	tok.array[(p*2)+1] = v
 }
 
-func (tok *Tokenizer) get_check(p int) int {
+// Get check value in double array
+func (tok *Tokenizer) getCheck(p int) int {
 	if (p*2)+1 >= len(tok.array) {
 		return 0
 	}
 	return tok.array[(p*2)+1]
 }
 
+// Set size of double array
+func (tok *Tokenizer) setSize(p, v int) {
+	tok.setCheck(1, v)
+}
+
+// Get size of double array
+func (tok *Tokenizer) getSize(p int) int {
+	return tok.getCheck(1)
+}
+
 // Check the table if a mapping of s
-// exists and return this as a representative
+// exists and return this as a representative.
+// Currently iterates through the whole table
+// in a bruteforce manner.
 func in_table(s int, table []*mapping, size int) int {
 	for x := 0; x < size; x++ {
 		if table[x].source == s {
@@ -499,114 +539,123 @@
 // structure until it finds a gap that fits all outgoing transitions
 // of the state. This is extremely slow, but is only necessary in the
 // construction phase of the tokenizer.
-func (tok *Tokenizer) x_check(symbols []int) int {
-	// see https://github.com/bramstein/datrie/blob/master/lib/trie.js
+func (tok *Tokenizer) xCheck(symbols []int) int {
+
+	// Start at the first entry of the double array list
 	base := 1
 
-	// 	fmt.Println("Resize", len(tok.linarray), "<", ((base + FINAL + 1) * 2))
-
 OVERLAP:
+
+	// Resize the array if necessary
 	tok.resize((base + FINAL) * 2)
 	for _, a := range symbols {
-		// if tok.array[base+a].check != 0 {
-		if tok.get_check(base+a) != 0 {
+		if tok.getCheck(base+a) != 0 {
 			base++
 			goto OVERLAP
 		}
 	}
-	//	fmt.Println("Found a nice place at", base, "for", len(symbols))
 	return base
 }
 
-// Based on Mizobuchi et al (2000), p. 129
-// Added support for IDENTITY, UNKNOWN and EPSILON
-func (tok *Tokenizer) match(input string) bool {
-	t := 1 // Start position
-	chars := []rune(input)
-	i := 0
+// Match an input string against the double array
+// FSA.
+//
+// Based on Mizobuchi et al (2000), p. 129,
+// with additional support for IDENTITY, UNKNOWN
+// and EPSILON transitions.
+func (tok *Tokenizer) Match(input string) bool {
 	var a int
 	var tu int
 	var ok bool
 
-	//	fmt.Println("Length of string is", len(chars))
+	t := 1 // Initial state
+	chars := []rune(input)
+	i := 0
+
 	for i < len(chars) {
 		a, ok = tok.sigma[chars[i]]
 
-		// Support identity symbol if char not in sigma
+		// Support identity symbol if character is not in sigma
 		if !ok && IDENTITY != -1 {
-			fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
+			if DEBUG {
+				fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
+			}
 			a = IDENTITY
-		} else {
+		} else if DEBUG {
 			fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
 		}
 		tu = t
 	CHECK:
-		t = tok.get_base(tu) + a
-		if t > tok.get_check(1) || tok.get_check(t) != tu {
-			fmt.Println("Match is not fine!", t, "and", tok.get_check(t), "vs", tu)
+		t = tok.getBase(tu) + a
 
-			// Try again with unknown symbol, in case identity failed
-			if !ok {
-				if a == IDENTITY {
-					fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
-					a = UNKNOWN
-					goto CHECK
-				} else if a == UNKNOWN {
-					fmt.Println("aEPSILON symbol", string(chars[i]), "->", EPSILON)
-					a = EPSILON
-					// In the worst case, this checks epsilon twice at the same state -
-					// here and at the end
-					goto CHECK
-				}
-			} else if a != EPSILON {
-				fmt.Println("bEPSILON symbol", string(chars[i]), "->", EPSILON)
-				a = EPSILON
-				// In the worst case, this checks epsilon twice at the same state -
-				// here and at the end
-				goto CHECK
+		// Check if the transition is valid according to the double array
+		if t > tok.getCheck(1) || tok.getCheck(t) != tu {
+
+			if DEBUG {
+				fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
 			}
-			break
-		} else if tok.get_base(t) < 0 {
+
+			if !ok && a == IDENTITY {
+				// Try again with unknown symbol, in case identity failed
+				if DEBUG {
+					fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
+				}
+				a = UNKNOWN
+
+			} else if a != EPSILON {
+				// Try again with epsilon symbol, in case everything else failed
+				if DEBUG {
+					fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
+				}
+				a = EPSILON
+			} else {
+				break
+			}
+			goto CHECK
+		} else if tok.getBase(t) < 0 {
 			// Move to representative state
-			t = -1 * tok.get_base(t)
+			t = -1 * tok.getBase(t)
 		}
+
+		// Transition is fine
 		if a != EPSILON {
+			// Character consumed
 			i++
 		}
+		// TODO:
+		//   Prevent endless epsilon loops!
 	}
 
-	if i == len(chars) {
-		fmt.Println("At the end")
-	} else {
-		fmt.Println("Not at the end")
+	if i != len(chars) {
+		if DEBUG {
+			fmt.Println("Not at the end")
+		}
 		return false
 	}
 
-	// fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
-
 FINALCHECK:
-	if tok.get_check(tok.get_base(t)+FINAL) == t {
+
+	// Automaton is in a final state
+	if tok.getCheck(tok.getBase(t)+FINAL) == t {
 		return true
 	}
 
+	// Check epsilon transitions until a final state is reached
 	tu = t
 	a = EPSILON
+	t = tok.getBase(tu) + a
 
-	t = tok.get_base(tu) + a
-	if t > tok.get_check(1) || tok.get_check(t) != tu {
-		fmt.Println("xMatch is not fine!", t, "and", tok.get_check(t), "vs", tu)
+	// Epsilon transition failed
+	if t > tok.getCheck(1) || tok.getCheck(t) != tu {
+		if DEBUG {
+			fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
+		}
 		return false
-	} else if tok.get_base(t) < 0 {
+
+	} else if tok.getBase(t) < 0 {
 		// Move to representative state
-		t = -1 * tok.get_base(t)
-		goto FINALCHECK
+		t = -1 * tok.getBase(t)
 	}
+
 	goto FINALCHECK
 }
-
-// In the final realization, the states can only have 30 bits:
-// base[1] -> is final
-// base[2] -> is_separate
-// check[1] -> translates to epsilon
-// check[2] -> appends newine (Maybe)
-// If check[1] && check[2] is set, this translates to a sentence split (Maybe)
commit	740f3d746e27f3ac40d5afc21aa2b0170320ddcb	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 03 12:12:34 2021 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 03 15:29:34 2021 +0200
tree	d048543c33b8136a5ec11c43a10f017b668c365d
parent	49d27ee2cc68204a33b2817c0b18ee0430339193 [diff] [blame]