Initial commit

commit: 8ef408b0a3e1d8eddd1beeedab0c0f8b78f39c2c [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Aug 02 22:11:04 2021 +0200
committer: Akron <nils@diewald-online.de> Mon Aug 02 22:11:04 2021 +0200
tree: 495928cb054d90a74813953bf968c39b13597b21
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ecfe0bc
--- /dev/null
+++ b/.gitignore

@@ -0,0 +1 @@
+sandbox
\ No newline at end of file

diff --git a/datokenizer.go b/datokenizer.go
new file mode 100644
index 0000000..8a68753
--- /dev/null
+++ b/datokenizer.go

@@ -0,0 +1,521 @@
+package datokenizer
+
+/**
+ * The file reader is basically a port of foma2js,
+ * licensed under the Apache License, version 2,
+ * and written by Mans Hulden.
+ */
+
+import (
+	"bufio"
+	"compress/gzip"
+	"fmt"
+	"io"
+	"os"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
+const (
+	PROPS  = 1
+	SIGMA  = 2
+	STATES = 3
+	NONE   = 4
+)
+
+// Special symbols in sigma
+var EPSILON, UNKNOWN, IDENTITY, FINAL int
+
+type mapping struct {
+	source int
+	target int
+}
+
+type edge struct {
+	in     int
+	out    int
+	target int
+}
+
+type Tokenizer struct {
+	sigma       map[rune]int
+	sigma_rev   map[int]rune
+	arccount    int
+	statecount  int
+	sigmacount  int
+	maxsize     int
+	array       []int
+	transitions []map[int]*edge
+}
+
+func parse_file(file string) *Tokenizer {
+	f, err := os.Open(file)
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+
+	gz, err := gzip.NewReader(f)
+	if err != nil {
+		panic(err)
+	}
+	defer gz.Close()
+
+	return parse(gz)
+}
+
+func parse(ior io.Reader) *Tokenizer {
+	r := bufio.NewReader(ior)
+
+	tok := &Tokenizer{
+		sigma:     make(map[rune]int),
+		sigma_rev: make(map[int]rune),
+	}
+
+	final := false
+
+	var arrstate, arrin, arrout, arrtarget, arrfinal int
+
+	mode := 0
+	var elem []string
+	var elemint [5]int
+
+	for {
+		line, err := r.ReadString('\n')
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			panic(err)
+		}
+		if strings.HasPrefix(line, "##foma-net") {
+			continue
+		}
+		if strings.HasPrefix(line, "##props##") {
+			mode = PROPS
+			continue
+		}
+		if strings.HasPrefix(line, "##states##") {
+			mode = STATES
+
+			// Adds a final transition symbol to sigma
+			// written as '#' in Mizobuchi et al (2000)
+			tok.sigmacount++
+			FINAL = tok.sigmacount
+			continue
+		}
+		if strings.HasPrefix(line, "##sigma##") {
+			mode = SIGMA
+			continue
+		}
+		if strings.HasPrefix(line, "##end##") {
+			mode = NONE
+			continue
+		}
+
+		switch mode {
+		case PROPS:
+			{
+				elem = strings.Split(line, " ")
+				/*
+					fmt.Println("arity:            " + elem[0])
+					fmt.Println("arccount:         " + elem[1])
+					fmt.Println("statecount:       " + elem[2])
+					fmt.Println("linecount:        " + elem[3])
+					fmt.Println("finalcount:       " + elem[4])
+					fmt.Println("pathcount:        " + elem[5])
+					fmt.Println("is_deterministic: " + elem[6])
+					fmt.Println("is_pruned:        " + elem[7])
+					fmt.Println("is_minimized:     " + elem[8])
+					fmt.Println("is_epsilon_free:  " + elem[9])
+					fmt.Println("is_loop_free:     " + elem[10])
+					fmt.Println("extras:           " + elem[11])
+					fmt.Println("name:             " + elem[12])
+				*/
+				if elem[6] != "1" {
+					panic("The FST needs to be deterministic")
+				}
+				if elem[9] != "1" {
+					panic("The FST needs to be epsilon free")
+				}
+
+				elemint[0], err = strconv.Atoi(elem[1])
+				if err != nil {
+					panic("Can't read arccount")
+				}
+				tok.arccount = elemint[0]
+
+				// States start at 1 in Mizobuchi et al (2000),
+				// as the state 0 is associated with a fail.
+				// Initialize states and transitions
+				elemint[0], err = strconv.Atoi(elem[2])
+				if err != nil {
+					panic("Can't read statecount")
+				}
+				tok.statecount = elemint[0]
+				tok.transitions = make([]map[int]*edge, elemint[0]+1)
+				continue
+			}
+		case STATES:
+			{
+				elem = strings.Split(line[0:len(line)-1], " ")
+				if elem[0] == "-1" {
+					continue
+				}
+				elemint[0], err = strconv.Atoi(elem[0])
+
+				if len(elem) > 1 {
+					elemint[1], err = strconv.Atoi(elem[1])
+					if err != nil {
+						break
+					}
+					if len(elem) > 2 {
+						elemint[2], err = strconv.Atoi(elem[2])
+						if err != nil {
+							break
+						}
+						if len(elem) > 3 {
+							elemint[3], err = strconv.Atoi(elem[3])
+							if err != nil {
+								break
+							}
+							if len(elem) > 4 {
+								elemint[4], err = strconv.Atoi(elem[4])
+								if err != nil {
+									break
+								}
+							}
+						}
+					}
+				}
+
+				switch len(elem) {
+				case 5:
+					{
+						arrstate = elemint[0]
+						arrin = elemint[1]
+						arrout = elemint[2]
+						arrtarget = elemint[3]
+						arrfinal = elemint[4]
+					}
+				case 4:
+					{
+						if elemint[1] == -1 {
+							arrstate = elemint[0]
+							arrfinal = elemint[3]
+						} else {
+							arrstate = elemint[0]
+							arrin = elemint[1]
+							arrtarget = elemint[2]
+							arrfinal = elemint[3]
+							arrout = arrin
+						}
+					}
+				case 3:
+					{
+						arrin = elemint[0]
+						arrout = elemint[1]
+						arrtarget = elemint[2]
+					}
+				case 2:
+					{
+						arrin = elemint[0]
+						arrtarget = elemint[2]
+						arrout = arrin
+					}
+				}
+
+				// This collects all edges until arrstate changes
+				if arrfinal == 1 {
+					final = true
+				} else {
+					final = false
+				}
+
+				// While the states in foma start with 0, the states in the
+				// Mizobuchi FSA start with one - so we increase every state by 1.
+
+				if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
+					panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
+				}
+
+				// TODO:
+				//   if arrin == EPSILON && arrout == TOKENEND, mark state as newline
+				//   if the next transition is the same, remove TOKENEND and add SENTENCEEND
+				//   This requires to remove the transition alltogether and marks the state instead.
+
+				// TODO:
+				//   if arrout == EPSILON, mark the transition as NOTOKEN
+
+				targetObj := &edge{
+					in:     arrin,
+					out:    arrout,
+					target: arrtarget + 1,
+				}
+
+				// Initialize outgoing state
+				if tok.transitions[arrstate+1] == nil {
+					tok.transitions[arrstate+1] = make(map[int]*edge)
+				}
+
+				tok.transitions[arrstate+1][arrin] = targetObj
+
+				if final {
+					tok.transitions[arrstate+1][FINAL] = &edge{}
+				}
+
+				continue
+			}
+		case SIGMA:
+			{
+				elem = strings.SplitN(line[0:len(line)-1], " ", 2)
+
+				// Turn string into sigma id
+				number, err := strconv.Atoi(elem[0])
+
+				if err != nil {
+					panic(err)
+				}
+
+				tok.sigmacount = number
+
+				var symbol rune
+
+				// Read rune
+				if utf8.RuneCountInString(elem[1]) == 1 {
+					symbol = []rune(elem[1])[0]
+
+					// Probably a MCS
+				} else if utf8.RuneCountInString(elem[1]) > 1 {
+					switch elem[1] {
+					case "@_EPSILON_SYMBOL_@":
+						{
+							EPSILON = number
+							continue
+						}
+					case "@_UNKNOWN_SYMBOL_@":
+						{
+							UNKNOWN = number
+							continue
+						}
+
+					case "@_IDENTITY_SYMBOL_@":
+						{
+							IDENTITY = number
+							continue
+						}
+					default:
+						panic("MCS not supported: " + line)
+					}
+
+					// Probably a new line symbol
+				} else {
+					line, err = r.ReadString('\n')
+					if err != nil {
+						panic(err)
+					}
+					if len(line) != 1 {
+						panic("MCS not supported:" + line)
+					}
+					symbol = rune('\n')
+				}
+
+				tok.sigma[symbol] = number
+				tok.sigma_rev[number] = symbol
+			}
+		}
+	}
+
+	return tok
+}
+
+// Implementation of Mizobuchi et al (2000), p.128
+func (tok *Tokenizer) buildDA() *Tokenizer {
+
+	mark := 0
+	size := 0
+
+	// Create a mapping from s to t
+	table := make([]*mapping, tok.arccount+1)
+
+	table[size] = &mapping{source: 1, target: 1}
+	size++
+
+	A := make([]int, 0, 256)
+
+	for mark < size {
+		s := table[mark].source // This is a state in Ms
+		t := table[mark].target // This is a state in Mt
+		mark++
+		//		fmt.Println("Increase mark", mark)
+		// St := append(St, t)
+		A = A[:0]
+		tok.get_set(s, &A)
+
+		// fmt.Println("Outgoing arcs from t", t, A)
+
+		// tok.array[t].base = tok.x_check(A)
+		tok.set_base(t, tok.x_check(A))
+
+		for _, a := range A {
+
+			if a != FINAL {
+				s1 := tok.transitions[s][a].target // g(s, a)
+
+				// fmt.Println("Found", s, "to", s1, "via", a)
+
+				t1 := tok.get_base(t) + a
+				tok.set_check(t1, t)
+
+				r := in_table(s1, table, size)
+				if r == 0 {
+					// fmt.Println("Increase size", t1)
+					table[size] = &mapping{source: s1, target: t1}
+					size++
+				} else {
+					//fmt.Println("Rep is there", t1, r)
+					tok.set_base(t1, -1*r)
+					// tok.array[t1].base = -1 * r
+				}
+			} else {
+				fmt.Println("I set a final")
+				// t1 := tok.array[t].base + FINAL
+				t1 := tok.get_base(t) + FINAL
+				// tok.array[t1].check = t
+				tok.set_check(t1, t)
+			}
+		}
+	}
+
+	// Following Mizobuchi et al (2000) the size of the
+	// FSA should be stored in check(1).
+	tok.set_check(1, tok.maxsize+1)
+	tok.array = tok.array[:tok.maxsize+1]
+	return tok
+}
+
+func (tok *Tokenizer) resize(l int) {
+	if len(tok.array) < l {
+		tok.array = append(tok.array, make([]int, l)...)
+	}
+}
+
+func (tok *Tokenizer) set_base(p int, v int) {
+	l := p*2 + 1
+	tok.resize(l)
+	if tok.maxsize < l {
+		tok.maxsize = l
+	}
+	tok.array[p*2] = v
+}
+
+func (tok *Tokenizer) get_base(p int) int {
+	if p*2 >= len(tok.array) {
+		return 0
+	}
+	return tok.array[p*2]
+}
+
+func (tok *Tokenizer) set_check(p int, v int) {
+	l := p*2 + 1
+	tok.resize(l)
+	if tok.maxsize < l {
+		tok.maxsize = l
+	}
+	tok.array[(p*2)+1] = v
+}
+
+func (tok *Tokenizer) get_check(p int) int {
+	if (p*2)+1 >= len(tok.array) {
+		return 0
+	}
+	return tok.array[(p*2)+1]
+}
+
+// Check the table if a mapping of s
+// exists and return this as a representative
+func in_table(s int, table []*mapping, size int) int {
+	for x := 0; x < size; x++ {
+		if table[x].source == s {
+			return table[x].target
+		}
+	}
+	return 0
+}
+
+// Set alphabet A to the list of all symbols
+// outgoing from s
+func (tok *Tokenizer) get_set(s int, A *[]int) {
+	for a, _ := range tok.transitions[s] {
+		*A = append(*A, a)
+	}
+}
+
+// Based on Mizobuchi et al (2000), p. 124
+// This iterates for every state through the complete double array
+// structure until it finds a gap that fits all outgoing transitions
+// of the state. This is extremely slow, but is only necessary in the
+// construction phase of the tokenizer.
+func (tok *Tokenizer) x_check(symbols []int) int {
+	// see https://github.com/bramstein/datrie/blob/master/lib/trie.js
+	base := 1
+
+	// 	fmt.Println("Resize", len(tok.linarray), "<", ((base + FINAL + 1) * 2))
+
+OVERLAP:
+	tok.resize((base + FINAL) * 2)
+	for _, a := range symbols {
+		// if tok.array[base+a].check != 0 {
+		if tok.get_check(base+a) != 0 {
+			base++
+			goto OVERLAP
+		}
+	}
+	fmt.Println("Found a nice place at", base, "for", len(symbols))
+	return base
+}
+
+// Based on Mizobuchi et al (2000), p. 129
+func (tok *Tokenizer) match(input string) bool {
+	t := 1 // Start position
+	chars := []rune(input)
+	i := 0
+	for ; i < len(chars); i++ {
+		a := tok.sigma[chars[i]]
+		tu := t
+		t = tok.get_base(tu) + a
+		fmt.Println("Check", a, t, tok.get_check(1))
+		if t > tok.get_check(1) {
+			break
+		} else if tok.get_check(t) != tu {
+			break
+		} else if tok.get_base(t) < 0 {
+			t = -1 * tok.get_base(t)
+			// fmt.Println("Match is representative!")
+		} else {
+			// fmt.Println("Match is fine!")
+		}
+	}
+
+	if i == len(chars) {
+		fmt.Println("At the end")
+	} else {
+		return false
+	}
+
+	// fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
+
+	if tok.get_check(tok.get_base(t)+FINAL) == t {
+		fmt.Println("FINE")
+		return true
+	}
+	return false
+}
+
+// In the final realization, the states can only have 30 bits:
+// base[1] -> is final
+// base[2] -> is_separate
+// check[1] -> translates to epsilon
+// check[2] -> appends newine (Maybe)
+// If check[1] && check[2] is set, this translates to a sentence split (Maybe)

diff --git a/datokenizer_test.go b/datokenizer_test.go
new file mode 100644
index 0000000..8945f91
--- /dev/null
+++ b/datokenizer_test.go

@@ -0,0 +1,40 @@
+package datokenizer
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestSimpleString(t *testing.T) {
+	assert := assert.New(t)
+
+	// bau | bauamt
+	r := strings.NewReader(`##foma-net 1.0##
+##props##
+1 6 7 8 2 2 1 1 1 1 1 2 5B57D486
+##sigma##
+0 @_EPSILON_SYMBOL_@
+3 a
+4 b
+5 m
+6 t
+7 u
+##states##
+0 4 1 0
+1 3 2 0
+2 7 3 0
+3 3 4 1
+4 5 5 0
+5 6 6 0
+6 -1 -1 1
+-1 -1 -1 -1 -1
+##end##`)
+
+	tok := parse(r) // ("tokenizer.fst")
+	tok.buildDA()
+	assert.True(tok.match("bau"))
+	assert.True(tok.match("bauamt"))
+	assert.False(tok.match("baum"))
+}

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..53dc7d0
--- /dev/null
+++ b/go.mod

@@ -0,0 +1,5 @@
+module github.com/KorAP/datokenizer
+
+go 1.16
+
+require github.com/stretchr/testify v1.7.0

diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..acb88a4
--- /dev/null
+++ b/go.sum

@@ -0,0 +1,11 @@
+github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
commit	8ef408b0a3e1d8eddd1beeedab0c0f8b78f39c2c	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Aug 02 22:11:04 2021 +0200
committer	Akron <nils@diewald-online.de>	Mon Aug 02 22:11:04 2021 +0200
tree	495928cb054d90a74813953bf968c39b13597b21