| package datokenizer |
| |
| /** |
| * The file reader is basically a port of foma2js, |
| * licensed under the Apache License, version 2, |
| * and written by Mans Hulden. |
| */ |
| |
| // The maximum number of states is 1,073,741,823 (30 bit); |
| // with a load factor of ~70%, this means roughly 70 million |
| // states in the FSA, which is sufficient for the current |
| // job. |
| |
| // TODO: |
| // - replace maxSize with the check value |
| // - Strip first state and make everything start with 0! |
| // - Add checksum to serialization. |
| |
| import ( |
| "bufio" |
| "compress/gzip" |
| "encoding/binary" |
| "fmt" |
| "io" |
| "os" |
| "sort" |
| "strconv" |
| "strings" |
| "unicode/utf8" |
| |
| "github.com/rs/zerolog/log" |
| ) |
| |
| const ( |
| PROPS = 1 |
| SIGMA = 2 |
| STATES = 3 |
| NONE = 4 |
| NEWLINE = '\u000a' |
| DEBUG = false |
| MAGIC = "DATOK" |
| VERSION = uint16(1) |
| firstBit uint32 = 1 << 31 |
| secondBit uint32 = 1 << 30 |
| restBit uint32 = ^uint32(0) &^ (firstBit | secondBit) |
| ) |
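| |
| // The two most significant bits of each array cell serve |
| // as flags: in a base cell the first bit marks the state |
| // as separate, in a check cell the first bit marks the |
| // state as the target of a nontoken transition and the |
| // second bit as the target of a tokenend transition. |
| // restBit masks the flags away to retrieve the actual value. |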
| |
| var bo binary.ByteOrder = binary.LittleEndian |
| |
| type mapping struct { |
| source int |
| target uint32 |
| } |
| |
| type edge struct { |
| inSym int |
| outSym int |
| end int |
| nontoken bool |
| tokenend bool |
| } |
| |
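| // Tokenizer is an intermediate representation of the |
| // finite state automaton, storing the transition table |
| // as read from a foma file. |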
| type Tokenizer struct { |
| // sigma map[rune]int |
| sigmaRev map[int]rune |
| arcCount int |
| stateCount int |
| sigmaCount int |
| transitions []map[int]*edge |
| |
| // Special symbols in sigma |
| epsilon int |
| unknown int |
| identity int |
| final int |
| } |
| |
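| // DaTokenizer represents the tokenizer in double array |
| // form (cf. Mizobuchi et al (2000)). Base and check |
| // arrays are interleaved in a single slice, so that |
| // base(p) = array[p*2] and check(p) = array[p*2+1]. |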
| type DaTokenizer struct { |
| // sigmaRev map[int]rune |
| sigma map[rune]int |
| maxSize int |
| loadFactor float64 |
| array []uint32 |
| // lastFilledBase uint32 |
| |
| // Special symbols in sigma |
| epsilon int |
| unknown int |
| identity int |
| final int |
| } |
| |
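| // LoadFomaFile reads a gzipped foma file |
| // and parses it into an intermediate Tokenizer. |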
| func LoadFomaFile(file string) *Tokenizer { |
| f, err := os.Open(file) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to open file") |
| os.Exit(1) |
| } |
| defer f.Close() |
| |
| gz, err := gzip.NewReader(f) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to open gzipped file") |
| os.Exit(1) |
| } |
| defer gz.Close() |
| |
| return ParseFoma(gz) |
| } |
| |
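| // ParseFoma parses the plain text representation of a |
| // foma FST, expecting the sections ##props##, ##sigma## |
| // and ##states##. |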
| func ParseFoma(ior io.Reader) *Tokenizer { |
| r := bufio.NewReader(ior) |
| |
| tok := &Tokenizer{ |
| sigmaRev: make(map[int]rune), |
| epsilon: -1, |
| unknown: -1, |
| identity: -1, |
| final: -1, |
| } |
| |
| checkmap := make(map[string]bool) |
| |
| var state, inSym, outSym, end, final int |
| |
| mode := 0 |
| var elem []string |
| var elemint [5]int |
| |
| for { |
| line, err := r.ReadString('\n') |
| if err != nil { |
| if err == io.EOF { |
| break |
| } |
| log.Error().Err(err).Msg("Unable to read line") |
| os.Exit(1) |
| } |
| if strings.HasPrefix(line, "##foma-net") { |
| continue |
| } |
| if strings.HasPrefix(line, "##props##") { |
| mode = PROPS |
| continue |
| } |
| if strings.HasPrefix(line, "##states##") { |
| mode = STATES |
| |
| // Adds a final transition symbol to sigma |
| // written as '#' in Mizobuchi et al (2000) |
| tok.sigmaCount++ |
| tok.final = tok.sigmaCount |
| continue |
| } |
| if strings.HasPrefix(line, "##sigma##") { |
| mode = SIGMA |
| continue |
| } |
| if strings.HasPrefix(line, "##end##") { |
| mode = NONE |
| continue |
| } |
| |
| switch mode { |
| case PROPS: |
| { |
| elem = strings.Split(line, " ") |
| /* |
| fmt.Println("arity: " + elem[0]) |
| fmt.Println("arccount: " + elem[1]) |
| fmt.Println("statecount: " + elem[2]) |
| fmt.Println("linecount: " + elem[3]) |
| fmt.Println("finalcount: " + elem[4]) |
| fmt.Println("pathcount: " + elem[5]) |
| fmt.Println("is_deterministic: " + elem[6]) |
| fmt.Println("is_pruned: " + elem[7]) |
| fmt.Println("is_minimized: " + elem[8]) |
| fmt.Println("is_epsilon_free: " + elem[9]) |
| fmt.Println("is_loop_free: " + elem[10]) |
| fmt.Println("extras: " + elem[11]) |
| fmt.Println("name: " + elem[12]) |
| */ |
| if elem[6] != "1" { |
| log.Error().Msg("The FST needs to be deterministic") |
| os.Exit(1) |
| } |
| if elem[9] != "1" { |
| log.Error().Msg("The FST needs to be epsilon free") |
| os.Exit(1) |
| } |
| |
| elemint[0], err = strconv.Atoi(elem[1]) |
| if err != nil { |
| log.Error().Msg("Can't read arccount") |
| os.Exit(1) |
| } |
| tok.arcCount = elemint[0] |
| |
| // States start at 1 in Mizobuchi et al (2000), |
| // as state 0 is associated with failure. |
| // Initialize states and transitions |
| elemint[0], err = strconv.Atoi(elem[2]) |
| if err != nil { |
| log.Error().Msg("Can't read statecount") |
| os.Exit(1) |
| } |
| tok.stateCount = elemint[0] |
| tok.transitions = make([]map[int]*edge, elemint[0]+1) |
| continue |
| } |
| case STATES: |
| { |
| elem = strings.Split(line[0:len(line)-1], " ") |
| if elem[0] == "-1" { |
| if DEBUG { |
| fmt.Println("Skip", elem) |
| } |
| continue |
| } |
| // Read up to five numerical fields from the state line |
| for i := 0; i < len(elem) && i < 5; i++ { |
| elemint[i], err = strconv.Atoi(elem[i]) |
| if err != nil { |
| fmt.Println("Unable to translate", elem[i]) |
| break |
| } |
| } |
| |
| // Skip lines that could not be parsed |
| if err != nil { |
| continue |
| } |
| |
| switch len(elem) { |
| case 5: |
| { |
| state = elemint[0] |
| inSym = elemint[1] |
| outSym = elemint[2] |
| end = elemint[3] |
| final = elemint[4] |
| } |
| case 4: |
| { |
| if elemint[1] == -1 { |
| state = elemint[0] |
| final = elemint[3] |
| } else { |
| state = elemint[0] |
| inSym = elemint[1] |
| end = elemint[2] |
| final = elemint[3] |
| outSym = inSym |
| } |
| } |
| case 3: |
| { |
| inSym = elemint[0] |
| outSym = elemint[1] |
| end = elemint[2] |
| } |
| case 2: |
| { |
| inSym = elemint[0] |
| end = elemint[1] |
| outSym = inSym |
| } |
| } |
| |
| // While the states in foma start with 0, the states in the |
| // Mizobuchi FSA start with one - so we increase every state by 1. |
| |
| nontoken := false |
| tokenend := false |
| |
| // Shift the symbol IDs by one, as 0 is reserved |
| inSym++ |
| outSym++ |
| |
| if inSym != outSym { |
| |
| if tok.sigmaRev[outSym] == NEWLINE { |
| tokenend = true |
| } else if outSym == tok.epsilon { |
| nontoken = true |
| } else { |
| log.Error().Msg( |
| "Unsupported transition: " + |
| strconv.Itoa(state) + |
| " -> " + strconv.Itoa(end) + |
| " (" + |
| strconv.Itoa(inSym) + |
| ":" + |
| strconv.Itoa(outSym) + |
| ") (" + |
| string(tok.sigmaRev[inSym]) + |
| ":" + |
| string(tok.sigmaRev[outSym]) + |
| ")") |
| os.Exit(1) |
| } |
| |
| } else if inSym == tok.epsilon { |
| log.Error().Msg("Epsilon transitions not supported") |
| os.Exit(1) |
| } |
| |
| // Create the edge for this transition |
| |
| // TODO: |
| // if inSym == EPSILON && outSym == TOKENEND, mark the state as newline |
| // if the next transition is the same, remove TOKENEND and add SENTENCEEND |
| // This requires removing the transition altogether and marking the state instead. |
| |
| // TODO: |
| // if outSym == EPSILON, mark the transition as NOTOKEN |
| |
| targetObj := &edge{ |
| inSym: inSym, |
| outSym: outSym, |
| end: end + 1, |
| tokenend: tokenend, |
| nontoken: nontoken, |
| } |
| |
| // Initialize outgoing states |
| if tok.transitions[state+1] == nil { |
| tok.transitions[state+1] = make(map[int]*edge) |
| } |
| |
| // Ignore transitions with invalid symbols |
| if inSym >= 0 { |
| tok.transitions[state+1][inSym] = targetObj |
| } |
| |
| // Add final transition |
| if final == 1 { |
| // TODO: maybe this is irrelevant for tokenizers |
| tok.transitions[state+1][tok.final] = &edge{} |
| } |
| |
| test := fmt.Sprint(state+1) + ":" + fmt.Sprint(inSym) |
| if checkmap[test] { |
| fmt.Println("Path already defined!", test) |
| os.Exit(0) |
| } else { |
| checkmap[test] = true |
| } |
| |
| if DEBUG { |
| fmt.Println("Add", |
| state+1, "->", end+1, |
| "(", |
| inSym, |
| ":", |
| outSym, |
| ") (", |
| string(tok.sigmaRev[inSym]), |
| ":", |
| string(tok.sigmaRev[outSym]), |
| ")", |
| ";", |
| "TE:", tokenend, |
| "NT:", nontoken, |
| "FIN:", final) |
| } |
| |
| continue |
| } |
| case SIGMA: |
| { |
| elem = strings.SplitN(line[0:len(line)-1], " ", 2) |
| |
| // Turn the string into a sigma ID |
| number, err := strconv.Atoi(elem[0]) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to read sigma ID") |
| os.Exit(1) |
| } |
| |
| // Shift the ID by one, as 0 is reserved |
| number++ |
| |
| tok.sigmaCount = number |
| |
| var symbol rune |
| |
| // Read rune |
| if utf8.RuneCountInString(elem[1]) == 1 { |
| symbol = []rune(elem[1])[0] |
| |
| // Probably an MCS (multi-character symbol) |
| } else if utf8.RuneCountInString(elem[1]) > 1 { |
| switch elem[1] { |
| case "@_EPSILON_SYMBOL_@": |
| { |
| tok.epsilon = number |
| continue |
| } |
| case "@_UNKNOWN_SYMBOL_@": |
| { |
| tok.unknown = number |
| continue |
| } |
| |
| case "@_IDENTITY_SYMBOL_@": |
| { |
| tok.identity = number |
| continue |
| } |
| default: |
| { |
| log.Error().Msg("MCS not supported: " + line) |
| os.Exit(1) |
| } |
| } |
| |
| } else { // Probably a newline symbol |
| line, err = r.ReadString('\n') |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to read sigma symbol") |
| os.Exit(1) |
| } |
| if len(line) != 1 { |
| log.Error().Msg("MCS not supported: " + line) |
| os.Exit(1) |
| } |
| symbol = NEWLINE |
| } |
| |
| tok.sigmaRev[number] = symbol |
| } |
| } |
| } |
| |
| return tok |
| } |
| |
| // Set alphabet A to the list of all symbols |
| // outgoing from s |
| func (tok *Tokenizer) getSet(s int, A *[]int) { |
| for a := range tok.transitions[s] { |
| *A = append(*A, a) |
| } |
| |
| // Not required, but simplifies bug hunting |
| sort.Ints(*A) |
| } |
| |
| // Implementation of Mizobuchi et al (2000), p.128 |
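| // The transition table is traversed breadth-first: |
| // each visited state s of the source automaton is |
| // mapped to a slot t in the double array, xCheck |
| // searches a base value such that all outgoing |
| // transitions of s fit into free cells, and every |
| // newly reached target state is queued in the |
| // mapping table for later processing. |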
| func (tok *Tokenizer) ToDoubleArray() *DaTokenizer { |
| |
| dat := &DaTokenizer{ |
| sigma: make(map[rune]int), |
| loadFactor: -1, |
| final: tok.final, |
| unknown: tok.unknown, |
| identity: tok.identity, |
| epsilon: tok.epsilon, |
| // lastFilledBase: 1, |
| } |
| |
| for num, sym := range tok.sigmaRev { |
| dat.sigma[sym] = num |
| } |
| |
| mark := 0 |
| size := 0 |
| |
| // Create a mapping from s to t |
| table := make([]*mapping, tok.arcCount+1) |
| |
| table[size] = &mapping{source: 1, target: 1} |
| size++ |
| |
| // Allocate space for the outgoing symbol range |
| A := make([]int, 0, tok.sigmaCount) |
| |
| for mark < size { |
| s := table[mark].source // This is a state in Ms |
| t := table[mark].target // This is a state in Mt |
| mark++ |
| |
| // Following the paper, here the state t can be remembered |
| // in the set of states St |
| A = A[:0] |
| tok.getSet(s, &A) |
| |
| // Set base to the first free slot in the double array |
| dat.setBase(t, dat.xCheck(A)) |
| |
| // TODO: |
| // Sort the outgoing transitions based on the |
| // outdegree of .end |
| |
| // Iterate over all outgoing symbols |
| for _, a := range A { |
| |
| if a != tok.final { |
| |
| // Aka g(s, a) |
| s1 := tok.transitions[s][a].end |
| |
| // Store the transition |
| t1 := dat.getBase(t) + uint32(a) |
| dat.setCheck(t1, t) |
| |
| if DEBUG { |
| fmt.Println("Translate transition", |
| s, "->", s1, "(", a, ")", "to", t, "->", t1) |
| } |
| |
| // Mark the state as being the target of a nontoken transition |
| if tok.transitions[s][a].nontoken { |
| dat.setNonToken(t1, true) |
| if DEBUG { |
| fmt.Println("Set", t1, "to nontoken") |
| } |
| } |
| |
| // Mark the state as being the target of a tokenend transition |
| if tok.transitions[s][a].tokenend { |
| dat.setTokenEnd(t1, true) |
| if DEBUG { |
| fmt.Println("Set", t1, "to tokenend") |
| } |
| } |
| |
| // Check for representative states |
| r := inTable(s1, table, size) |
| |
| if r == 0 { |
| // Remember the mapping |
| table[size] = &mapping{source: s1, target: t1} |
| size++ |
| } else { |
| // Overwrite with the representative state |
| dat.setBase(t1, r) |
| dat.setSeparate(t1, true) |
| } |
| } else { |
| // Store a final transition |
| dat.setCheck(dat.getBase(t)+uint32(dat.final), t) |
| } |
| } |
| } |
| |
| // Following Mizobuchi et al (2000) the size of the |
| // FSA should be stored in check(1). |
| dat.setSize(dat.maxSize + 1) |
| dat.array = dat.array[:dat.maxSize+1] |
| return dat |
| } |
| |
| // Check the table for an existing mapping of s |
| // and return its target as a representative. |
| // Currently iterates through the whole table |
| // in a brute-force manner. |
| func inTable(s int, table []*mapping, size int) uint32 { |
| for x := 0; x < size; x++ { |
| if table[x].source == s { |
| return table[x].target |
| } |
| } |
| return 0 |
| } |
| |
| // Resize double array when necessary |
| func (dat *DaTokenizer) resize(l int) { |
| // TODO: |
| // This is a bit too aggressive atm and should be calmed down. |
| if len(dat.array) <= l { |
| dat.array = append(dat.array, make([]uint32, l)...) |
| } |
| } |
| |
| // Set base value in double array |
| func (dat *DaTokenizer) setBase(p uint32, v uint32) { |
| l := int(p*2 + 1) |
| dat.resize(l) |
| if dat.maxSize < l { |
| dat.maxSize = l |
| } |
| dat.array[p*2] = v |
| } |
| |
| // Returns true if a state is separate pointing to a representative |
| func (dat *DaTokenizer) isSeparate(p uint32) bool { |
| return dat.array[p*2]&firstBit != 0 |
| } |
| |
| // Mark a state as separate pointing to a representative |
| func (dat *DaTokenizer) setSeparate(p uint32, sep bool) { |
| if sep { |
| dat.array[p*2] |= firstBit |
| } else { |
| dat.array[p*2] &= (restBit | secondBit) |
| } |
| } |
| |
| // Returns true if a state is the target of a nontoken transition |
| func (dat *DaTokenizer) isNonToken(p uint32) bool { |
| return dat.array[p*2+1]&firstBit != 0 |
| } |
| |
| // Mark a state as being the target of a nontoken transition |
| func (dat *DaTokenizer) setNonToken(p uint32, sep bool) { |
| if sep { |
| dat.array[p*2+1] |= firstBit |
| } else { |
| dat.array[p*2+1] &= (restBit | secondBit) |
| } |
| } |
| |
| // Returns true if a state is the target of a tokenend transition |
| func (dat *DaTokenizer) isTokenEnd(p uint32) bool { |
| return dat.array[p*2+1]&secondBit != 0 |
| } |
| |
| // Mark a state as being the target of a tokenend transition |
| func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) { |
| if sep { |
| dat.array[p*2+1] |= secondBit |
| } else { |
| dat.array[p*2+1] &= (restBit | firstBit) |
| } |
| } |
| |
| // Get base value in double array |
| func (dat *DaTokenizer) getBase(p uint32) uint32 { |
| if int(p*2) >= len(dat.array) { |
| return 0 |
| } |
| return dat.array[p*2] & restBit |
| } |
| |
| // Set check value in double array |
| func (dat *DaTokenizer) setCheck(p uint32, v uint32) { |
| l := int(p*2 + 1) |
| dat.resize(l) |
| if dat.maxSize < l { |
| dat.maxSize = l |
| } |
| dat.array[(p*2)+1] = v |
| } |
| |
| // Get check value in double array |
| func (dat *DaTokenizer) getCheck(p uint32) uint32 { |
| if int((p*2)+1) >= len(dat.array) { |
| return 0 |
| } |
| return dat.array[(p*2)+1] & restBit |
| } |
| |
| // Set size of double array |
| func (dat *DaTokenizer) setSize(v int) { |
| dat.setCheck(1, uint32(v)) |
| } |
| |
| // Get size of double array |
| func (dat *DaTokenizer) GetSize() int { |
| return int(dat.getCheck(1)) |
| } |
| |
| // Based on Mizobuchi et al (2000), p. 124 |
| // This iterates for every state through the complete double array |
| // structure until it finds a gap that fits all outgoing transitions |
| // of the state. This is extremely slow, but is only necessary in the |
| // construction phase of the tokenizer. |
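| // E.g. for the outgoing symbols {2, 5} the returned |
| // base b is the smallest value such that |
| // check(b+2) == 0 and check(b+5) == 0. |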
| func (dat *DaTokenizer) xCheck(symbols []int) uint32 { |
| |
| // Start at the first entry of the double array list |
| base := uint32(1) // dat.lastFilledBase |
| // skip := false |
| OVERLAP: |
| |
| /* |
| if !skip { |
| if dat.getCheck(base) != 0 { |
| dat.lastFilledBase = base |
| } else { |
| skip = true |
| } |
| } |
| */ |
| |
| // Resize the array if necessary |
| dat.resize((int(base) + dat.final) * 2) |
| for _, a := range symbols { |
| if dat.getCheck(base+uint32(a)) != 0 { |
| base++ |
| goto OVERLAP |
| } |
| } |
| return base |
| } |
| |
| // List all outgoing transitions for a state |
| // for testing purposes |
| func (dat *DaTokenizer) outgoing(t uint32) []int { |
| |
| valid := make([]int, 0, len(dat.sigma)) |
| |
| for _, a := range dat.sigma { |
| t1 := dat.getBase(t) + uint32(a) |
| if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t { |
| valid = append(valid, a) |
| } |
| } |
| |
| for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} { |
| t1 := dat.getBase(t) + uint32(a) |
| if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t { |
| valid = append(valid, -1*a) |
| } |
| } |
| |
| sort.Ints(valid) |
| |
| return valid |
| } |
| |
| // LoadFactor as defined in Kanda et al (2018), |
| // i.e. the proportion of non-empty elements to all |
| // elements, returned as a percentage. |
| func (dat *DaTokenizer) LoadFactor() float64 { |
| |
| // Cache the loadfactor |
| if dat.loadFactor > 0 { |
| return dat.loadFactor |
| } |
| nonEmpty := 0 |
| all := len(dat.array) / 2 |
| for x := 1; x <= len(dat.array); x = x + 2 { |
| if dat.array[x] != 0 { |
| nonEmpty++ |
| } |
| } |
| dat.loadFactor = float64(nonEmpty) / float64(all) * 100 |
| return dat.loadFactor |
| } |
| |
| // Save stores the double array data in a file. |
| func (dat *DaTokenizer) Save(file string) (n int64, err error) { |
| f, err := os.Create(file) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to create file") |
| return 0, err |
| } |
| defer f.Close() |
| gz := gzip.NewWriter(f) |
| defer gz.Close() |
| n, err = dat.WriteTo(gz) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to write double array") |
| return n, err |
| } |
| gz.Flush() |
| return n, nil |
| } |
| |
| // WriteTo stores the double array data in an io.Writer. |
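| // The binary layout is, in little endian order: |
| // the magic marker "DATOK", a 16 byte header with |
| // uint16 values for version, epsilon, unknown, |
| // identity, final and the sigma count, followed by |
| // a uint32 array length, the sigma symbols encoded |
| // as UTF-8 runes, a test marker "T", and the array |
| // cells as uint32 values. |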
| func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) { |
| |
| wb := bufio.NewWriter(w) |
| defer wb.Flush() |
| |
| // Store magical header |
| all, err := wb.Write([]byte(MAGIC)) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to write magic marker") |
| return int64(all), err |
| } |
| |
| // Get sigma as a list |
| sigmalist := make([]rune, len(dat.sigma)+16) |
| max := 0 |
| for sym, num := range dat.sigma { |
| sigmalist[num] = sym |
| if num > max { |
| max = num |
| } |
| } |
| |
| sigmalist = sigmalist[:max+1] |
| |
| buf := make([]byte, 16) |
| bo.PutUint16(buf[0:2], VERSION) |
| bo.PutUint16(buf[2:4], uint16(dat.epsilon)) |
| bo.PutUint16(buf[4:6], uint16(dat.unknown)) |
| bo.PutUint16(buf[6:8], uint16(dat.identity)) |
| bo.PutUint16(buf[8:10], uint16(dat.final)) |
| bo.PutUint16(buf[10:12], uint16(len(sigmalist))) |
| bo.PutUint32(buf[12:16], uint32(len(dat.array))) |
| more, err := wb.Write(buf[0:16]) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to write header") |
| return int64(all), err |
| } |
| |
| all += more |
| |
| // Write sigma |
| for _, sym := range sigmalist { |
| |
| more, err = wb.WriteRune(sym) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to write sigma symbol") |
| return int64(all), err |
| } |
| all += more |
| } |
| |
| // Test marker - could be checksum |
| more, err = wb.Write([]byte("T")) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to write test marker") |
| return int64(all), err |
| } |
| all += more |
| |
| for x := 0; x < len(dat.array); x++ { |
| bo.PutUint32(buf[0:4], dat.array[x]) |
| more, err := wb.Write(buf[0:4]) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to write double array") |
| return int64(all), err |
| } |
| if more != 4 { |
| log.Error().Msg("Unable to write uint32") |
| return int64(all), fmt.Errorf("short write of uint32") |
| } |
| all += more |
| } |
| |
| return int64(all), err |
| } |
| |
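| // LoadDatokFile reads a gzipped double array |
| // representation of the tokenizer from a file. |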
| func LoadDatokFile(file string) *DaTokenizer { |
| f, err := os.Open(file) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to open file") |
| os.Exit(1) |
| } |
| defer f.Close() |
| |
| gz, err := gzip.NewReader(f) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to open gzipped file") |
| os.Exit(1) |
| } |
| defer gz.Close() |
| |
| // TODO: Read the whole file! |
| return ParseDatok(gz) |
| } |
| |
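| // ParseDatok reads a double array tokenizer, |
| // as serialized by WriteTo, from an io.Reader. |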
| func ParseDatok(ior io.Reader) *DaTokenizer { |
| |
| dat := &DaTokenizer{ |
| sigma: make(map[rune]int), |
| epsilon: 0, |
| unknown: 0, |
| identity: 0, |
| final: 0, |
| loadFactor: 0, |
| } |
| |
| r := bufio.NewReader(ior) |
| |
| all := 0 |
| |
| buf := make([]byte, 1024) |
| buf = buf[0:len(MAGIC)] |
| |
| more, err := io.ReadFull(r, buf) |
| |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to read magic marker") |
| return nil |
| } |
| |
| all += more |
| |
| if MAGIC != string(buf) { |
| log.Error().Msg("Not a datok file") |
| return nil |
| } |
| |
| more, err = io.ReadFull(r, buf[0:16]) |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to read header") |
| return nil |
| } |
| |
| all += more |
| |
| version := bo.Uint16(buf[0:2]) |
| |
| if version != VERSION { |
| log.Error().Msg("Version not compatible") |
| return nil |
| } |
| |
| dat.epsilon = int(bo.Uint16(buf[2:4])) |
| dat.unknown = int(bo.Uint16(buf[4:6])) |
| dat.identity = int(bo.Uint16(buf[6:8])) |
| dat.final = int(bo.Uint16(buf[8:10])) |
| |
| sigmaCount := int(bo.Uint16(buf[10:12])) |
| arraySize := int(bo.Uint32(buf[12:16])) |
| |
| // Shouldn't be relevant though |
| dat.maxSize = arraySize - 1 |
| |
| for x := 0; x < sigmaCount; x++ { |
| sym, more, err := r.ReadRune() |
| if err == nil && sym != 0 { |
| dat.sigma[sym] = x |
| } |
| all += more |
| } |
| |
| more, err = io.ReadFull(r, buf[0:1]) |
| |
| if err != nil { |
| log.Error().Err(err).Msg("Unable to read test marker") |
| return nil |
| } |
| |
| all += more |
| |
| if string("T") != string(buf[0:1]) { |
| log.Error().Msg("Not a datok file") |
| return nil |
| } |
| |
| // Read based on length |
| dat.array = make([]uint32, arraySize) |
| |
| for x := 0; x < arraySize; x++ { |
| more, err = io.ReadFull(r, buf[0:4]) |
| if err != nil { |
| if err == io.EOF { |
| log.Warn().Msg("Unexpected EOF while reading the double array") |
| break |
| } |
| log.Error().Err(err).Msg("Unable to read double array") |
| return nil |
| } |
| all += more |
| dat.array[x] = bo.Uint32(buf[0:4]) |
| } |
| |
| return dat |
| } |
| |
| // Match an input string against the double array |
| // FSA. |
| // |
| // Based on Mizobuchi et al (2000), p. 129, |
| // with additional support for IDENTITY, UNKNOWN |
| // and EPSILON transitions. |
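| // |
| // A minimal usage sketch (the file name is illustrative): |
| // |
| //   dat := LoadFomaFile("mytokenizer.fst").ToDoubleArray() |
| //   dat.Match("example") |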
| func (dat *DaTokenizer) Match(input string) bool { |
| var a int |
| var tu uint32 |
| var ok bool |
| |
| t := uint32(1) // Initial state |
| chars := []rune(input) |
| i := 0 |
| |
| for i < len(chars) { |
| a, ok = dat.sigma[chars[i]] |
| |
| // Support identity symbol if character is not in sigma |
| if !ok && dat.identity != -1 { |
| if DEBUG { |
| fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity) |
| } |
| a = dat.identity |
| } else if DEBUG { |
| fmt.Println("Sigma transition is okay for [", string(chars[i]), "]") |
| } |
| tu = t |
| CHECK: |
| t = dat.getBase(tu) + uint32(a) |
| |
| // Check if the transition is valid according to the double array |
| if t > dat.getCheck(1) || dat.getCheck(t) != tu { |
| |
| if DEBUG { |
| fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu) |
| } |
| |
| if !ok && a == dat.identity { |
| // Try again with unknown symbol, in case identity failed |
| if DEBUG { |
| fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown) |
| } |
| a = dat.unknown |
| |
| } else if a != dat.epsilon { |
| // Try again with epsilon symbol, in case everything else failed |
| if DEBUG { |
| fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon) |
| } |
| a = dat.epsilon |
| } else { |
| break |
| } |
| goto CHECK |
| } else if dat.isSeparate(t) { |
| // Move to representative state |
| t = dat.getBase(t) |
| } |
| |
| // Transition is fine |
| if a != dat.epsilon { |
| // Character consumed |
| i++ |
| } |
| |
| // TODO: |
| // Prevent endless epsilon loops! |
| } |
| |
| if i != len(chars) { |
| if DEBUG { |
| fmt.Println("Not at the end") |
| } |
| return false |
| } |
| |
| FINALCHECK: |
| |
| // Automaton is in a final state |
| if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t { |
| return true |
| } |
| |
| // Check epsilon transitions until a final state is reached |
| tu = t |
| t = dat.getBase(tu) + uint32(dat.epsilon) |
| |
| // Epsilon transition failed |
| if t > dat.getCheck(1) || dat.getCheck(t) != tu { |
| if DEBUG { |
| fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu) |
| } |
| return false |
| |
| } else if dat.isSeparate(t) { |
| // Move to representative state |
| t = dat.getBase(t) |
| } |
| |
| goto FINALCHECK |
| } |
| |
| func showBuffer(buffer []rune, buffo int, buffi int) string { |
| out := make([]rune, 0, 1024) |
| for x := 0; x < len(buffer); x++ { |
| if buffi == x { |
| out = append(out, '^') |
| } |
| if buffo == x { |
| out = append(out, '[', buffer[x], ']') |
| } else { |
| out = append(out, buffer[x]) |
| } |
| } |
| return string(out) |
| } |
| |
| // Transduce an input string against the double array |
| // FSA. The rules are always greedy. If the automaton fails, |
| // it takes the last possible token ending branch. |
| // |
| // Based on Match with additional support |
| // for NONTOKEN and TOKENEND handling |
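| // |
| // A minimal usage sketch (the file name is illustrative): |
| // |
| //   dat := LoadDatokFile("mytokenizer.datok") |
| //   dat.Transduce(os.Stdin, os.Stdout) |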
| func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool { |
| var a int |
| var tu uint32 |
| var ok, nontoken, tokenend bool |
| |
| // Remember the last position of a possible tokenend, |
| // in case the automaton fails. |
| epsilonState := uint32(0) |
| epsilonOffset := 0 |
| |
| // Implement a low level buffer for full control, |
| // however, it is probably better to introduce |
| // this on a higher level with an io.Reader interface. |
| // The buffer stores a single word and may have white |
| // space at the end (but not at the beginning). |
| // |
| // This is the only backtracking requirement because of |
| // epsilon transitions, to support tokenizations like: |
| // "this is an example|.| And it works." vs |
| // "this is an example.com| application." |
| buffer := make([]rune, 1024) |
| buffo := 0 // Buffer offset |
| buffi := 0 // Buffer length |
| |
| reader := bufio.NewReader(r) |
| writer := bufio.NewWriter(w) |
| defer writer.Flush() |
| |
| t := uint32(1) // Initial state |
| // skip := false |
| |
| var char rune |
| var err error |
| eof := false |
| |
| // TODO: |
| // Write all characters first into a buffer |
| // and flush when necessary |
| // TODO: |
| // Create an epsilon stack |
| for { |
| |
| // if !skip { |
| if buffo >= buffi { |
| char, _, err = reader.ReadRune() |
| if err != nil { |
| eof = true |
| break |
| } |
| buffer[buffi] = char |
| buffi++ |
| } |
| |
| /* |
| if buffo < buffi { |
| char = buffer[buffo] |
| fmt.Println("Read from buffer", string(char), "=", buffo, ':', buffi) |
| buffo++ |
| } else { |
| // Possible read from buffer |
| char, _, err = reader.ReadRune() |
| fmt.Println("Read from reader", string(char)) |
| |
| if err != nil { |
| eof = true |
| break |
| } |
| } |
| */ |
| // } |
| char = buffer[buffo] |
| // skip = false |
| |
| if DEBUG { |
| fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi)) |
| } |
| |
| a, ok = dat.sigma[char] |
| |
| // Support identity symbol if character is not in sigma |
| if !ok && dat.identity != -1 { |
| if DEBUG { |
| fmt.Println("IDENTITY symbol", string(char), "->", dat.identity) |
| } |
| a = dat.identity |
| } else if DEBUG { |
| fmt.Println("Sigma transition is okay for [", string(char), "]") |
| } |
| tu = t |
| |
| if dat.getCheck(dat.getBase(tu)+uint32(dat.epsilon)) == tu { |
| if DEBUG { |
| fmt.Println("Remember for epsilon tu:charcount", tu, buffo) |
| } |
| epsilonState = tu |
| epsilonOffset = buffo |
| } |
| |
| CHECK: |
| nontoken = false |
| tokenend = false |
| |
| t = dat.getBase(tu) + uint32(a) |
| |
| if DEBUG { |
| fmt.Println("Check", tu, "-", a, "(", string(char), ")", "->", t) |
| fmt.Println("Valid:", dat.outgoing(tu)) |
| } |
| |
| // Check if the transition is valid according to the double array |
| if t > dat.getCheck(1) || dat.getCheck(t) != tu { |
| |
| if DEBUG { |
| fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu) |
| } |
| |
| if !ok && a == dat.identity { |
| // Try again with unknown symbol, in case identity failed |
| if DEBUG { |
| fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown) |
| } |
| a = dat.unknown |
| |
| } else if a != dat.epsilon { |
| // Try again with epsilon symbol, in case everything else failed |
| if DEBUG { |
| fmt.Println("EPSILON symbol", string(char), "->", dat.epsilon) |
| } |
| tu = epsilonState |
| a = dat.epsilon |
| epsilonState = 0 // reset |
| buffo = epsilonOffset |
| if DEBUG { |
| fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi)) |
| } |
| } else { |
| break |
| } |
| goto CHECK |
| } else if dat.isSeparate(t) { |
| // Move to representative state |
| nontoken = dat.isNonToken(t) |
| tokenend = dat.isTokenEnd(t) |
| |
| t = dat.getBase(t) |
| |
| if DEBUG { |
| fmt.Println("Representative pointing to", t) |
| } |
| |
| } else { |
| nontoken = dat.isNonToken(t) |
| tokenend = dat.isTokenEnd(t) |
| } |
| |
| // Transition is fine |
| if a == dat.epsilon { |
| // skip = true |
| if DEBUG { |
| fmt.Println("Input ignored because of epsilon", showBuffer(buffer, buffo, buffi)) |
| } |
| } else { |
| // Character consumed |
| buffo++ |
| if !nontoken { |
| // buffer[buffo] = char |
| if DEBUG { |
| fmt.Println("Move forward in buffer", showBuffer(buffer, buffo, buffi)) |
| } |
| // buffi++ |
| // writer.WriteRune(char) |
| } else { |
| if DEBUG { |
| fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi)) |
| } |
| // If only the nontoken character was consumed, drop it from the buffer |
| if buffo == 1 { |
| // Better as a ring buffer |
| for x, i := range buffer[buffo:buffi] { |
| buffer[x] = i |
| } |
| // writer.WriteRune('\n') |
| buffi -= buffo |
| epsilonOffset -= buffo |
| buffo = 0 |
| } |
| } |
| } |
| |
| if DEBUG { |
| fmt.Println(" --> ok!") |
| } |
| |
| /* |
| if nontoken { |
| writer.WriteString("<|>") |
| } |
| */ |
| |
| if tokenend { |
| data := []byte(string(buffer[:buffo])) |
| if DEBUG { |
| fmt.Println("-> Flush buffer:", string(data)) |
| } |
| writer.Write(data) |
| writer.WriteRune('\n') |
| if DEBUG { |
| fmt.Println("Remaining (1):", showBuffer(buffer, buffo, buffi)) |
| } |
| |
| // Better as a ring buffer |
| for x, i := range buffer[buffo:buffi] { |
| buffer[x] = i |
| } |
| // writer.WriteRune('\n') |
| buffi -= buffo |
| epsilonOffset -= buffo |
| buffo = 0 |
| if DEBUG { |
| fmt.Println("Remaining (2):", showBuffer(buffer, buffo, buffi)) |
| } |
| } |
| |
| // TODO: |
| // Prevent endless epsilon loops! |
| } |
| |
| if !eof { |
| if DEBUG { |
| fmt.Println("Not at the end") |
| fmt.Println("Problem", tu, ":", dat.outgoing(tu)) |
| } |
| return false |
| } |
| |
| FINALCHECK: |
| |
| // Automaton is in a final state |
| if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t { |
| /* |
| if dat.isNonToken(t) { |
| fmt.Print("<|>") |
| } |
| */ |
| if dat.isTokenEnd(t) { |
| if buffi > 0 { |
| data := []byte(string(buffer[:buffi])) |
| if DEBUG { |
| fmt.Println("-> Flush buffer:", string(data)) |
| } |
| writer.Write(data) |
| // states are irrelevant here |
| } |
| writer.WriteRune('\n') |
| } |
| |
| // There may be a new line at the end, from an epsilon, so we go on! |
| return true |
| } |
| |
| // Check epsilon transitions until a final state is reached |
| tu = t |
| t = dat.getBase(tu) + uint32(dat.epsilon) |
| |
| // Epsilon transition failed |
| if t > dat.getCheck(1) || dat.getCheck(t) != tu { |
| if DEBUG { |
| fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu) |
| } |
| return false |
| |
| } else if dat.isSeparate(t) { |
| // nontoken = dat.isNonToken(t) |
| tokenend = dat.isTokenEnd(t) |
| |
| // Move to representative state |
| t = dat.getBase(t) |
| } else { |
| tokenend = dat.isTokenEnd(t) |
| // nontoken = dat.isNonToken(t) |
| } |
| |
| /* |
| if nontoken { |
| fmt.Print("<|>") |
| } |
| */ |
| |
| if tokenend { |
| |
| if buffi > 0 { |
| data := []byte(string(buffer[:buffi])) |
| if DEBUG { |
| fmt.Println("-> Flush buffer:", string(data)) |
| } |
| writer.Write(data) |
| buffi = 0 |
| buffo = 0 |
| epsilonState = 0 |
| } |
| writer.WriteRune('\n') |
| } |
| |
| goto FINALCHECK |
| } |