blob: 781298180c396e50c0de75c3ce5a6a01d1fe7702 [file] [log] [blame]
package datok
/**
* The file reader is basically a port of foma2js,
* licensed under the Apache License, version 2,
* and written by Mans Hulden.
*/
// The maximum number of states is 1.073.741.823 (30bit),
// with a loadfactor of ~70, this means roughly 70 million
// states in the FSA, which is sufficient for the current
// job.
//
// Serialization is little endian.
// TODO:
// - replace maxSize with the check value
// - Check if final states can be optional.
// - Introduce ELM (Morita et al. 2001) to speed
// up construction. Can be ignored on serialization
// to improve compression.
// - Add checksum to serialization.
// - Replace/Enhance table with a map
// - Provide a bufio.Scanner compatible interface.
// - Mark epsilon transitions in bytes
import (
"bufio"
"compress/gzip"
"encoding/binary"
"fmt"
"io"
"math"
"os"
"sort"
"log"
)
const (
DEBUG = false
DAMAGIC = "DATOK"
VERSION = uint16(1)
FIRSTBIT uint32 = 1 << 31
SECONDBIT uint32 = 1 << 30
RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
)
// Serialization is always little endian
var bo binary.ByteOrder = binary.LittleEndian
type mapping struct {
source int
target uint32
}
type bc struct {
base uint32
check uint32
}
// DaTokenizer represents a tokenizer implemented as a
// Double Array FSA.
type DaTokenizer struct {
sigma map[rune]int
sigmaASCII [256]int
maxSize int
transCount int
array []bc
// Special symbols in sigma
epsilon int
unknown int
identity int
final int
tokenend int
}
// ToDoubleArray turns the intermediate tokenizer representation
// into a double array representation.
//
// This is based on Mizobuchi et al (2000), p.128
func (auto *Automaton) ToDoubleArray() *DaTokenizer {
dat := &DaTokenizer{
sigma: make(map[rune]int),
transCount: -1,
final: auto.final,
unknown: auto.unknown,
identity: auto.identity,
epsilon: auto.epsilon,
tokenend: auto.tokenend,
}
dat.resize(dat.final)
for num, sym := range auto.sigmaRev {
if int(sym) < 256 {
dat.sigmaASCII[int(sym)] = num
}
dat.sigma[sym] = num
}
mark := 0
size := 0
var base uint32
var atrans *edge
var s, s1 int
var t, t1 uint32
// Create a mapping from s (in Ms aka Intermediate FSA)
// to t (in Mt aka Double Array FSA)
table := make([]*mapping, auto.arcCount+1)
// tableQueue := make([]int, tok.arcCount+1)
// Initialize with the start state
table[size] = &mapping{source: 1, target: 1}
// tableQueue[size] = 1
size++
// Allocate space for the outgoing symbol range
A := make([]int, 0, auto.sigmaCount)
// tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
// tableLookup[1] = 1
// block_begin_pos := uint32(1)
for mark < size {
s = table[mark].source // This is a state in Ms
t = table[mark].target // This is a state in Mt
// s = tableQueue[mark]
// t = tableLookup[s]
mark++
// Following the paper, here the state t can be remembered
// in the set of states St
A = A[:0]
auto.getSet(s, &A)
// Set base to the first free slot in the double array
// base = dat.xCheck(A)
// base = dat.xCheckSkip(A)
// base = dat.xCheckNiu(A, &block_begin_pos)
base = dat.xCheckSkipNiu(A)
dat.array[t].setBase(base)
// TODO:
// Sort the outgoing transitions based on the
// outdegree of .end
// Iterate over all outgoing symbols
for _, a := range A {
if a != auto.final {
atrans = auto.transitions[s][a]
// Aka g(s, a)
s1 = atrans.end
// Store the transition
t1 = base + uint32(a)
dat.array[t1].setCheck(t)
// Set maxSize
if dat.maxSize < int(t1) {
dat.maxSize = int(t1)
}
if DEBUG {
fmt.Println("Translate transition",
s, "->", s1, "(", a, ")", "to", t, "->", t1)
}
// Mark the state as being the target of a nontoken transition
if atrans.nontoken {
dat.array[t1].setNonToken(true)
if DEBUG {
fmt.Println("Set", t1, "to nontoken")
}
}
// Mark the state as being the target of a tokenend transition
if atrans.tokenend {
dat.array[t1].setTokenEnd(true)
if DEBUG {
fmt.Println("Set", t1, "to tokenend")
}
}
// Check for representative states
r := stateAlreadyInTable(s1, table, size)
// r := tableLookup[s1]
// No representative found
if r == 0 {
// Remember the mapping
table[size] = &mapping{source: s1, target: t1}
// tableQueue[size] = s1
// tableLookup[s1] = t1
size++
} else {
// Overwrite with the representative state
dat.array[t1].setBase(r)
dat.array[t1].setSeparate(true)
}
} else {
// Store a final transition
dat.array[base+uint32(dat.final)].setCheck(t)
if dat.maxSize < int(base)+dat.final {
dat.maxSize = int(base) + dat.final
}
}
}
}
// Following Mizobuchi et al (2000) the size of the
// FSA should be stored in check(1).
// We make the size a bit larger so we never have to check for boundaries.
dat.setSize(dat.maxSize + dat.final)
if len(dat.array) < dat.maxSize+dat.final {
dat.array = append(dat.array, make([]bc, dat.final)...)
}
dat.array = dat.array[:dat.maxSize+dat.final]
return dat
}
// Check the table if a mapping of s
// exists and return this as a representative.
// Currently iterates through the whole table
// in a bruteforce manner.
func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
for x := 0; x < size; x++ {
if table[x].source == s {
return table[x].target
}
}
return 0
}
// Resize double array when necessary
func (dat *DaTokenizer) resize(l int) {
// TODO:
// This is a bit too aggressive atm and should be calmed down.
if len(dat.array) <= l {
dat.array = append(dat.array, make([]bc, l)...)
}
}
// Set base value in double array
func (bc *bc) setBase(v uint32) {
bc.base = v
}
// Get base value in double array
func (bc *bc) getBase() uint32 {
return bc.base & RESTBIT
}
// Set check value in double array
func (bc *bc) setCheck(v uint32) {
bc.check = v
}
// Get check value in double array
func (bc *bc) getCheck() uint32 {
return bc.check & RESTBIT
}
// Returns true if a state is separate pointing to a representative
func (bc *bc) isSeparate() bool {
return bc.base&FIRSTBIT != 0
}
// Mark a state as separate pointing to a representative
func (bc *bc) setSeparate(sep bool) {
if sep {
bc.base |= FIRSTBIT
} else {
bc.base &= (RESTBIT | SECONDBIT)
}
}
// Returns true if a state is the target of a nontoken transition
func (bc *bc) isNonToken() bool {
return bc.check&FIRSTBIT != 0
}
// Mark a state as being the target of a nontoken transition
func (bc *bc) setNonToken(sep bool) {
if sep {
bc.check |= FIRSTBIT
} else {
bc.check &= (RESTBIT | SECONDBIT)
}
}
// Returns true if a state is the target of a tokenend transition
func (bc *bc) isTokenEnd() bool {
return bc.check&SECONDBIT != 0
}
// Mark a state as being the target of a tokenend transition
func (bc *bc) setTokenEnd(sep bool) {
if sep {
bc.check |= SECONDBIT
} else {
bc.check &= (RESTBIT | FIRSTBIT)
}
}
// Set size of double array
func (dat *DaTokenizer) setSize(v int) {
dat.array[1].setCheck(uint32(v))
}
// Get size of double array
func (dat *DaTokenizer) GetSize() int {
return int(dat.array[1].getCheck())
}
// Based on Mizobuchi et al (2000), p. 124
// This iterates for every state through the complete double array
// structure until it finds a gap that fits all outgoing transitions
// of the state. This is extremely slow, but is only necessary in the
// construction phase of the tokenizer.
func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
// Start at the first entry of the double array list
base := uint32(1)
OVERLAP:
// Resize the array if necessary
dat.resize(int(base) + dat.final)
for _, a := range symbols {
if dat.array[int(base)+a].getCheck() != 0 {
base++
goto OVERLAP
}
}
return base
}
// This is an implementation of xCheck with the skip-improvement
// proposed by Morita et al. (2001)
func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
// Start at the first entry of the double array list
base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
OVERLAP:
// Resize the array if necessary
dat.resize(int(base) + dat.final)
for _, a := range symbols {
if dat.array[int(base)+a].getCheck() != 0 {
base++
goto OVERLAP
}
}
return base
}
// This is an implementation of xCheck with the skip-improvement
// proposed by Morita et al. (2001) for higher outdegrees as
// proposed by Niu et al. (2013)
func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
// Start at the first entry of the double array list
base := uint32(1)
// Or skip the first few entries
if len(symbols) >= 3 {
base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
}
OVERLAP:
// Resize the array if necessary
dat.resize(int(base) + dat.final + 1)
for _, a := range symbols {
if dat.array[int(base)+a].getCheck() != 0 {
base++
goto OVERLAP
}
}
return base
}
// This is an implementation of xCheck wit an improvement
// proposed by Niu et al. (2013)
func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
// Start at the first entry of the double array list
base := uint32(1)
if len(symbols) > 3 {
sort.Ints(symbols)
if *block_begin_pos > uint32(symbols[0]) {
dat.resize(int(*block_begin_pos) + dat.final)
*block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
return *block_begin_pos - uint32(symbols[0])
}
}
OVERLAP:
// Resize the array if necessary
dat.resize(int(base) + dat.final)
for _, a := range symbols {
if dat.array[int(base)+a].getCheck() != 0 {
base++
goto OVERLAP
}
}
return base
}
// List all outgoing transitions for a state
// for testing purposes
func (dat *DaTokenizer) outgoing(t uint32) []int {
valid := make([]int, 0, len(dat.sigma))
for _, a := range dat.sigma {
t1 := dat.array[t].getBase() + uint32(a)
if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
valid = append(valid, a)
}
}
for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
t1 := dat.array[t].getBase() + uint32(a)
if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
valid = append(valid, -1*a)
}
}
sort.Ints(valid)
return valid
}
// TransCount as the number of transitions aka arcs in the
// finite state automaton
func (dat *DaTokenizer) TransCount() int {
// Cache the transCount
if dat.transCount > 0 {
return dat.transCount
}
dat.transCount = 0
for x := 1; x < len(dat.array); x++ {
if dat.array[x].getBase() != 0 {
dat.transCount++
}
}
return dat.transCount
}
// LoadFactor as defined in Kanda et al (2018),
// i.e. the proportion of non-empty elements to all elements.
func (dat *DaTokenizer) LoadFactor() float64 {
return float64(dat.TransCount()) / float64(len(dat.array)) * 100
}
// Save stores the double array data in a file
func (dat *DaTokenizer) Save(file string) (n int64, err error) {
f, err := os.Create(file)
if err != nil {
log.Println(err)
return 0, err
}
defer f.Close()
gz := gzip.NewWriter(f)
defer gz.Close()
n, err = dat.WriteTo(gz)
if err != nil {
log.Println(err)
return n, err
}
gz.Flush()
return n, nil
}
// WriteTo stores the double array data in an io.Writer.
func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
wb := bufio.NewWriter(w)
defer wb.Flush()
// Store magical header
all, err := wb.Write([]byte(DAMAGIC))
if err != nil {
log.Println(err)
return int64(all), err
}
// Get sigma as a list
sigmalist := make([]rune, len(dat.sigma)+16)
max := 0
for sym, num := range dat.sigma {
sigmalist[num] = sym
if num > max {
max = num
}
}
sigmalist = sigmalist[:max+1]
buf := make([]byte, 0, 16)
bo.PutUint16(buf[0:2], VERSION)
bo.PutUint16(buf[2:4], uint16(dat.epsilon))
bo.PutUint16(buf[4:6], uint16(dat.unknown))
bo.PutUint16(buf[6:8], uint16(dat.identity))
bo.PutUint16(buf[8:10], uint16(dat.final))
bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
more, err := wb.Write(buf[0:16])
if err != nil {
log.Println(err)
return int64(all), err
}
all += more
// Write sigma
for _, sym := range sigmalist {
more, err = wb.WriteRune(sym)
if err != nil {
log.Println(err)
return int64(all), err
}
all += more
}
if err != nil {
log.Println(err)
return int64(all), err
}
// Test marker - could be checksum
more, err = wb.Write([]byte("T"))
if err != nil {
log.Println(err)
return int64(all), err
}
all += more
// for x := 0; x < len(dat.array); x++ {
for _, bc := range dat.array {
bo.PutUint32(buf[0:4], bc.base)
more, err = wb.Write(buf[0:4])
if err != nil {
log.Println(err)
return int64(all), err
}
all += more
if more != 4 {
log.Println("Can not write base uint32")
return int64(all), err
}
bo.PutUint32(buf[0:4], bc.check)
more, err = wb.Write(buf[0:4])
if err != nil {
log.Println(err)
return int64(all), err
}
all += more
if more != 4 {
log.Println("Can not write check uint32")
return int64(all), err
}
}
return int64(all), err
}
// LoadDatokFile reads a double array represented tokenizer
// from a file.
func LoadDatokFile(file string) *DaTokenizer {
f, err := os.Open(file)
if err != nil {
log.Println(err)
return nil
}
defer f.Close()
gz, err := gzip.NewReader(f)
if err != nil {
log.Println(err)
return nil
}
defer gz.Close()
// Todo: Read the whole file!
return ParseDatok(gz)
}
// LoadDatokFile reads a double array represented tokenizer
// from an io.Reader
func ParseDatok(ior io.Reader) *DaTokenizer {
// Initialize tokenizer with default values
dat := &DaTokenizer{
sigma: make(map[rune]int),
epsilon: 0,
unknown: 0,
identity: 0,
final: 0,
transCount: 0,
}
r := bufio.NewReader(ior)
buf := make([]byte, 1024)
buf = buf[0:len(DAMAGIC)]
_, err := r.Read(buf)
if err != nil {
log.Println(err)
return nil
}
if string(DAMAGIC) != string(buf) {
log.Println("Not a datok file")
return nil
}
more, err := io.ReadFull(r, buf[0:16])
if err != nil {
log.Println(err)
return nil
}
if more != 16 {
log.Println("Read bytes do not fit")
return nil
}
version := bo.Uint16(buf[0:2])
if version != VERSION {
log.Println("Version not compatible")
return nil
}
dat.epsilon = int(bo.Uint16(buf[2:4]))
dat.unknown = int(bo.Uint16(buf[4:6]))
dat.identity = int(bo.Uint16(buf[6:8]))
dat.final = int(bo.Uint16(buf[8:10]))
sigmaCount := int(bo.Uint16(buf[10:12]))
arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
// Shouldn't be relevant though
dat.maxSize = arraySize - 1
for x := 0; x < sigmaCount; x++ {
sym, _, err := r.ReadRune()
if err == nil && sym != 0 {
if int(sym) < 256 {
dat.sigmaASCII[int(sym)] = x
}
dat.sigma[sym] = x
}
}
_, err = io.ReadFull(r, buf[0:1])
if err != nil {
log.Print(err)
return nil
}
if string("T") != string(buf[0:1]) {
log.Println("Not a datok file")
return nil
}
// Read based on length
dat.array = make([]bc, arraySize)
dataArray, err := io.ReadAll(r)
if err == io.EOF {
log.Println(err)
return nil
}
if len(dataArray) < arraySize*8 {
log.Println("Not enough bytes read")
return nil
}
for x := 0; x < arraySize; x++ {
dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
}
return dat
}
// Show the current state of the buffer,
// for testing puroses
func showBuffer(buffer []rune, buffo int, buffi int) string {
out := make([]rune, 0, 1024)
for x := 0; x < len(buffer); x++ {
if buffi == x {
out = append(out, '^')
}
if buffo == x {
out = append(out, '[', buffer[x], ']')
} else {
out = append(out, buffer[x])
}
}
return string(out)
}
// Transduce an input string against the double array
// FSA. The rules are always greedy. If the automaton fails,
// it takes the last possible token ending branch.
//
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions and NONTOKEN and TOKENEND handling.
func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
var ok, rewindBuffer bool
// Remember the last position of a possible tokenend,
// in case the automaton fails.
epsilonState := uint32(0)
epsilonOffset := 0
// Implement a low level buffer for full control,
// however - it is probably better to introduce
// this on a higher level with a io.Reader interface
// The buffer stores a single word and may have white
// space at the end (but not at the beginning).
//
// This is the only backtracking requirement because of
// epsilon transitions, to support tokenizations like:
// "this is an example|.| And it works." vs
// "this is an example.com| application."
//
// TODO:
// Store a translation buffer as well, so characters don't
// have to be translated multiple times!
buffer := make([]rune, 1024)
buffo := 0 // Buffer offset
buffi := 0 // Buffer length
reader := bufio.NewReader(r)
writer := bufio.NewWriter(w)
defer writer.Flush()
var char rune
var err error
eof := false
newchar := true
PARSECHAR:
for {
if newchar {
// Get from reader if buffer is empty
if buffo >= buffi {
if eof {
break
}
char, _, err = reader.ReadRune()
// No more runes to read
if err != nil {
eof = true
break
}
buffer[buffi] = char
buffi++
}
char = buffer[buffo]
if DEBUG {
fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
}
// TODO:
// Better not repeatedly check for a!
// Possibly keep a buffer with a.
if int(char) < 256 {
a = dat.sigmaASCII[int(char)]
} else {
a, ok = dat.sigma[char]
if !ok {
a = 0
}
}
// Use identity symbol if character is not in sigma
if a == 0 && dat.identity != -1 {
a = dat.identity
}
t0 = t
// Check for epsilon transitions and remember
if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffo
}
}
// Checks a transition based on t0, a and buffo
t = dat.array[t0].getBase() + uint32(a)
ta := dat.array[t]
if DEBUG {
// Char is only relevant if set
fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
if false {
fmt.Println(dat.outgoing(t0))
}
}
// Check if the transition is invalid according to the double array
if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
if DEBUG {
fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
}
if !ok && a == dat.identity {
// Try again with unknown symbol, in case identity failed
// Char is only relevant when set
if DEBUG {
fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
}
a = dat.unknown
} else if a != dat.epsilon {
// Try again with epsilon symbol, in case everything else failed
t0 = epsilonState
epsilonState = 0 // reset
buffo = epsilonOffset
a = dat.epsilon
if DEBUG {
fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
}
} else {
break
}
newchar = false
continue
}
// Transition was successful
rewindBuffer = false
// Transition consumes a character
if a != dat.epsilon {
buffo++
// Transition does not produce a character
if buffo == 1 && ta.isNonToken() {
if DEBUG {
fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
}
rewindBuffer = true
}
}
// Transition marks the end of a token - so flush the buffer
if ta.isTokenEnd() {
if buffi > 0 {
if DEBUG {
fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
}
writer.WriteString(string(buffer[:buffo]))
rewindBuffer = true
}
if DEBUG {
fmt.Println("-> Newline")
}
writer.WriteRune('\n')
}
// Rewind the buffer if necessary
if rewindBuffer {
// TODO: Better as a ring buffer
for x, i := range buffer[buffo:buffi] {
buffer[x] = i
}
buffi -= buffo
// epsilonOffset -= buffo
epsilonOffset = buffo
buffo = 0
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
}
}
// Move to representative state
if ta.isSeparate() {
t = ta.getBase()
ta = dat.array[t]
if DEBUG {
fmt.Println("Representative pointing to", t)
}
}
newchar = true
// TODO:
// Prevent endless epsilon loops!
}
// Input reader is not yet finished
if !eof {
if DEBUG {
fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
}
return false
}
if DEBUG {
fmt.Println("Entering final check")
}
// Automaton is in a final state, so flush the buffer and return
x := dat.array[t].getBase() + uint32(dat.final)
if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
if buffi > 0 {
if DEBUG {
fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
}
writer.WriteString(string(buffer[:buffi]))
if dat.array[t].isTokenEnd() {
writer.WriteRune('\n')
if DEBUG {
fmt.Println("-> Newline")
}
}
}
// Add an additional sentence ending, if the file is over but no explicit
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
if !dat.array[t0].isTokenEnd() {
writer.WriteRune('\n')
if DEBUG {
fmt.Println("-> Newline")
}
}
// TODO:
// There may be a new line at the end, from an epsilon,
// so we may need to go on!
return true
}
// Check epsilon transitions until a final state is reached
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
a = dat.epsilon
newchar = false
if dat.array[t].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
goto PARSECHAR
} else if epsilonState != 0 {
t0 = epsilonState
epsilonState = 0 // reset
buffo = epsilonOffset
if DEBUG {
fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
}
goto PARSECHAR
}
return false
}