blob: b6e877bf82f30d5cde7a6b7fb16efe0515164287 [file] [log] [blame]
package datokenizer
/**
* The file reader is basically a port of foma2js,
* licensed under the Apache License, version 2,
* and written by Mans Hulden.
*/
// TODO:
// - replace maxSize with the check value
// - Strip first state and make everything start with 0!
// - Serialize!
// - Split Tokenizer and DATokenizer
import (
"bufio"
"compress/gzip"
"fmt"
"io"
"os"
"sort"
"strconv"
"strings"
"unicode/utf8"
"github.com/rs/zerolog/log"
)
// Parser section markers and shared constants.
const (
PROPS = 1 // currently reading the ##props## section
SIGMA = 2 // currently reading the ##sigma## section
STATES = 3 // currently reading the ##states## section
NONE = 4 // after ##end##; no section is being read
NEWLINE = '\u000a' // line feed; canonical newline symbol in sigma
DEBUG = false // enables verbose construction/matching output
)
// Special symbols in sigma.
// All four start as -1 (unset) and are overwritten with their
// actual sigma ids while parsing (EPSILON/UNKNOWN/IDENTITY in the
// ##sigma## section, FINAL when ##states## starts). They are
// package-global mutable state shared by Parse and Match.
var EPSILON = -1
var UNKNOWN = -1
var IDENTITY = -1
var FINAL = -1
// mapping relates a state in the source automaton (Ms) to its
// counterpart in the double-array automaton (Mt), used while
// building the double array in ToDoubleArray.
type mapping struct {
source int // state in the source (transition table) automaton
target int // corresponding slot in the double array
}
// edge represents a single FST transition: input symbol,
// output symbol and the target state.
type edge struct {
inSym int // input symbol (sigma id)
outSym int // output symbol (sigma id)
end int // target state (already shifted by +1 when stored)
}
// Tokenizer holds the symbol alphabet (sigma) plus both the
// intermediate per-state transition table (filled by Parse) and
// the double-array representation (filled by ToDoubleArray).
type Tokenizer struct {
sigma map[rune]int // rune -> sigma id
sigmaRev map[int]rune // sigma id -> rune
arcCount int // arc count as declared in ##props##
stateCount int // state count as declared in ##props##
sigmaCount int // highest sigma id seen so far
maxSize int // highest index written to the double array
array []int // interleaved base/check cells: base(p)=array[2p], check(p)=array[2p+1]
transitions []map[int]*edge // per-state map from input symbol to outgoing edge
}
// ParseFile opens a gzip-compressed foma file and parses it
// into a Tokenizer. On any I/O error the process terminates
// with a non-zero exit code.
func ParseFile(file string) *Tokenizer {
	f, err := os.Open(file)
	if err != nil {
		// zerolog only emits an event after a terminal call
		// such as Msg(); a bare Err(err) would log nothing.
		log.Error().Err(err).Msg("Unable to open file")
		// Exit with a failure code - 0 would signal success.
		os.Exit(1)
	}
	defer f.Close()
	gz, err := gzip.NewReader(f)
	if err != nil {
		log.Error().Err(err).Msg("Unable to open gzip stream")
		os.Exit(1)
	}
	defer gz.Close()
	return Parse(gz)
}
// Parse reads a foma text dump from ior and builds the
// intermediate representation of the tokenizer: the sigma maps
// and the per-state transition table. On any unrecoverable
// format error the process exits.
func Parse(ior io.Reader) *Tokenizer {
r := bufio.NewReader(ior)
tok := &Tokenizer{
sigma: make(map[rune]int),
sigmaRev: make(map[int]rune),
}
// These carry over between lines: short ##states## lines reuse
// the previous values (see the switch over len(elem) below).
var state, inSym, outSym, end, final int
// mode tracks which file section (##props##, ##sigma##,
// ##states##) is currently being read.
mode := 0
var elem []string
var elemint [5]int
for {
line, err := r.ReadString('\n')
if err != nil {
if err == io.EOF {
break
}
// NOTE(review): zerolog needs a terminal .Msg()/.Send() to
// actually emit the event - this likely logs nothing; also
// os.Exit(0) reports success on failure. Verify intent.
log.Error().Err(err)
os.Exit(0)
}
// Section markers switch the parser mode.
if strings.HasPrefix(line, "##foma-net") {
continue
}
if strings.HasPrefix(line, "##props##") {
mode = PROPS
continue
}
if strings.HasPrefix(line, "##states##") {
mode = STATES
// Adds a final transition symbol to sigma
// written as '#' in Mizobuchi et al (2000)
tok.sigmaCount++
FINAL = tok.sigmaCount
continue
}
if strings.HasPrefix(line, "##sigma##") {
mode = SIGMA
continue
}
if strings.HasPrefix(line, "##end##") {
mode = NONE
continue
}
switch mode {
case PROPS:
{
// The props line is a space separated list of automaton
// properties; only arccount, statecount, is_deterministic
// and is_epsilon_free are used here.
elem = strings.Split(line, " ")
/*
fmt.Println("arity: " + elem[0])
fmt.Println("arccount: " + elem[1])
fmt.Println("statecount: " + elem[2])
fmt.Println("linecount: " + elem[3])
fmt.Println("finalcount: " + elem[4])
fmt.Println("pathcount: " + elem[5])
fmt.Println("is_deterministic: " + elem[6])
fmt.Println("is_pruned: " + elem[7])
fmt.Println("is_minimized: " + elem[8])
fmt.Println("is_epsilon_free: " + elem[9])
fmt.Println("is_loop_free: " + elem[10])
fmt.Println("extras: " + elem[11])
fmt.Println("name: " + elem[12])
*/
if elem[6] != "1" {
log.Error().Msg("The FST needs to be deterministic")
os.Exit(1)
}
if elem[9] != "1" {
log.Error().Msg("The FST needs to be epsilon free")
os.Exit(1)
}
elemint[0], err = strconv.Atoi(elem[1])
if err != nil {
log.Error().Msg("Can't read arccount")
os.Exit(1)
}
tok.arcCount = elemint[0]
// States start at 1 in Mizobuchi et al (2000),
// as the state 0 is associated with a fail.
// Initialize states and transitions
elemint[0], err = strconv.Atoi(elem[2])
if err != nil {
log.Error().Msg("Can't read statecount")
os.Exit(1)
}
tok.stateCount = elemint[0]
tok.transitions = make([]map[int]*edge, elemint[0]+1)
continue
}
case STATES:
{
// A ##states## line has 2-5 integer fields; strip the
// trailing newline before splitting.
elem = strings.Split(line[0:len(line)-1], " ")
// "-1" marks the sentinel/terminator line.
if elem[0] == "-1" {
continue
}
// Parse up to five integer fields into elemint; a parse
// failure aborts the field cascade (break leaves the
// enclosing block, not the for loop).
elemint[0], err = strconv.Atoi(elem[0])
if err != nil {
break
}
if len(elem) > 1 {
elemint[1], err = strconv.Atoi(elem[1])
if err != nil {
break
}
if len(elem) > 2 {
elemint[2], err = strconv.Atoi(elem[2])
if err != nil {
break
}
if len(elem) > 3 {
elemint[3], err = strconv.Atoi(elem[3])
if err != nil {
break
}
if len(elem) > 4 {
elemint[4], err = strconv.Atoi(elem[4])
if err != nil {
break
}
}
}
}
}
// The field count decides which values are given on this
// line; 3- and 2-field lines keep the previous state (and
// final flag) from the preceding longer line.
switch len(elem) {
case 5:
{
state = elemint[0]
inSym = elemint[1]
outSym = elemint[2]
end = elemint[3]
final = elemint[4]
}
case 4:
{
if elemint[1] == -1 {
// A 4-field line with -1 input defines a state
// without a transition (only its final flag).
state = elemint[0]
final = elemint[3]
} else {
state = elemint[0]
inSym = elemint[1]
end = elemint[2]
final = elemint[3]
outSym = inSym
}
}
case 3:
{
inSym = elemint[0]
outSym = elemint[1]
end = elemint[2]
}
case 2:
{
inSym = elemint[0]
end = elemint[1]
outSym = inSym
}
}
// While the states in foma start with 0, the states in the
// Mizobuchi FSA start with one - so we increase every state by 1.
// Transitions with differing input/output are only supported
// in the whitelisted tokenization patterns below.
if inSym != outSym {
// Allow any epsilon to become a newline
if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
// Allow any whitespace to be ignored
!(inSym != EPSILON && outSym == EPSILON) &&
// Allow any whitespace to become a new line
!(tok.sigmaRev[outSym] == NEWLINE) {
log.Error().Msg(
"Unsupported transition: " +
strconv.Itoa(state) +
" -> " + strconv.Itoa(end) +
" (" +
strconv.Itoa(inSym) +
":" +
strconv.Itoa(outSym) +
") (" +
string(tok.sigmaRev[inSym]) +
":" +
string(tok.sigmaRev[outSym]) +
")")
os.Exit(1)
}
}
// This collects all edges until arrstate changes
// TODO:
// if arrin == EPSILON && arrout == TOKENEND, mark state as newline
// if the next transition is the same, remove TOKENEND and add SENTENCEEND
// This requires to remove the transition alltogether and marks the state instead.
// TODO:
// if arrout == EPSILON, mark the transition as NOTOKEN
targetObj := &edge{
inSym: inSym,
outSym: outSym,
end: end + 1,
}
// Initialize outgoing states
if tok.transitions[state+1] == nil {
tok.transitions[state+1] = make(map[int]*edge)
}
// Ignore transitions with invalid symbols
if inSym >= 0 {
tok.transitions[state+1][inSym] = targetObj
}
// Add final transition
if final == 1 {
tok.transitions[state+1][FINAL] = &edge{}
}
if DEBUG {
fmt.Println("Add",
state+1, "->", end+1,
"(",
inSym,
":",
outSym,
") (",
string(tok.sigmaRev[inSym]),
":",
string(tok.sigmaRev[outSym]),
")")
}
continue
}
case SIGMA:
{
// A sigma line is "<id> <symbol>"; split once so the
// symbol may itself contain spaces.
elem = strings.SplitN(line[0:len(line)-1], " ", 2)
// Turn string into sigma id
number, err := strconv.Atoi(elem[0])
if err != nil {
log.Error().Err(err)
os.Exit(0)
}
tok.sigmaCount = number
// NOTE(review): elem[1] panics if the line has no symbol
// field - TODO confirm the input is always well-formed.
var symbol rune
// Read rune
if utf8.RuneCountInString(elem[1]) == 1 {
symbol = []rune(elem[1])[0]
// Probably a MCS
} else if utf8.RuneCountInString(elem[1]) > 1 {
// Multi-character symbols: only foma's reserved special
// symbols are supported; anything else is fatal.
switch elem[1] {
case "@_EPSILON_SYMBOL_@":
{
EPSILON = number
continue
}
case "@_UNKNOWN_SYMBOL_@":
{
UNKNOWN = number
continue
}
case "@_IDENTITY_SYMBOL_@":
{
IDENTITY = number
continue
}
default:
{
log.Error().Msg("MCS not supported: " + line)
os.Exit(1)
}
}
} else { // Probably a new line symbol
// An empty symbol field means the symbol itself was the
// newline; consume the continuation line.
line, err = r.ReadString('\n')
if err != nil {
log.Error().Err(err)
os.Exit(0)
}
if len(line) != 1 {
log.Error().Msg("MCS not supported:" + line)
os.Exit(0)
}
symbol = rune(NEWLINE)
}
tok.sigma[symbol] = number
tok.sigmaRev[number] = symbol
}
}
}
return tok
}
// Implementation of Mizobuchi et al (2000), p.128
//
// ToDoubleArray converts the intermediate transition table into
// the compact double-array representation stored in tok.array
// (base/check pairs). Returns the receiver for chaining.
func (tok *Tokenizer) ToDoubleArray() *Tokenizer {
mark := 0
size := 0
// Create a mapping from s to t
// NOTE(review): table is sized arcCount+1; this assumes the
// number of mapped states never exceeds that - TODO confirm.
table := make([]*mapping, tok.arcCount+1)
// Seed with the initial state (state 1 in both automata).
table[size] = &mapping{source: 1, target: 1}
size++
// Allocate space for the outgoing symbol range
A := make([]int, 0, tok.sigmaCount)
// Worklist loop: mark is the read cursor, size the write cursor.
for mark < size {
s := table[mark].source // This is a state in Ms
t := table[mark].target // This is a state in Mt
mark++
// Following the paper, here the state t can be remembered
// in the set of states St
A = A[:0]
tok.get_set(s, &A)
// Set base to the first free slot in the double array
tok.setBase(t, tok.xCheck(A))
// Iterate over all outgoing symbols
for _, a := range A {
if a != FINAL {
// Aka g(s, a)
s1 := tok.transitions[s][a].end
// Store the transition
t1 := tok.getBase(t) + a
tok.setCheck(t1, t)
// Check for representative states
r := in_table(s1, table, size)
if r == 0 {
// Remember the mapping
table[size] = &mapping{source: s1, target: t1}
size++
} else {
// Overwrite with the representative state
// (negative base marks an indirection to r).
tok.setBase(t1, -1*r)
}
} else {
// Store a final transition
tok.setCheck(tok.getBase(t)+FINAL, t)
}
}
}
// Following Mizobuchi et al (2000) the size of the
// FSA should be stored in check(1).
tok.setCheck(1, tok.maxSize+1)
tok.array = tok.array[:tok.maxSize+1]
return tok
}
// Resize double array when necessary.
//
// Grows tok.array so that index l is addressable (len > l).
// When growing, the array is extended to at least double its
// current length, which keeps the repeated small extensions
// issued by xCheck amortized instead of the previous
// unconditional over-allocation (see old TODO about the growth
// being too aggressive). Newly added cells are zero, which
// getBase/getCheck already treat as "empty", so behavior of
// all readers is unchanged.
func (tok *Tokenizer) resize(l int) {
	if len(tok.array) > l {
		return
	}
	grow := len(tok.array) * 2
	if grow <= l {
		grow = l + 1
	}
	tok.array = append(tok.array, make([]int, grow-len(tok.array))...)
}
// Set base value in double array.
// Stores v as base(p), growing the array and tracking the
// highest written index in maxSize.
func (tok *Tokenizer) setBase(p int, v int) {
	idx := p * 2
	last := idx + 1
	tok.resize(last)
	if last > tok.maxSize {
		tok.maxSize = last
	}
	tok.array[idx] = v
}
// Get base value in double array.
// Positions outside the allocated array read as 0 ("empty").
func (tok *Tokenizer) getBase(p int) int {
	idx := p * 2
	if idx < len(tok.array) {
		return tok.array[idx]
	}
	return 0
}
// Set check value in double array.
// Stores v as check(p), growing the array and tracking the
// highest written index in maxSize.
func (tok *Tokenizer) setCheck(p int, v int) {
	last := p*2 + 1
	tok.resize(last)
	if last > tok.maxSize {
		tok.maxSize = last
	}
	tok.array[last] = v
}
// Get check value in double array.
// Positions outside the allocated array read as 0 ("empty").
func (tok *Tokenizer) getCheck(p int) int {
	idx := p*2 + 1
	if idx < len(tok.array) {
		return tok.array[idx]
	}
	return 0
}
// Set size of double array
// Following Mizobuchi et al (2000) the size lives in check(1).
// The parameter p is unused.
func (tok *Tokenizer) setSize(p, v int) {
tok.setCheck(1, v)
}
// Get size of double array
// Reads the size stored in check(1). The parameter p is unused.
func (tok *Tokenizer) getSize(p int) int {
return tok.getCheck(1)
}
// Check the table if a mapping of s
// exists and return this as a representative.
// Returns 0 when no mapping is found.
// Currently iterates through the whole table
// in a bruteforce manner.
func in_table(s int, table []*mapping, size int) int {
	for _, m := range table[:size] {
		if m.source == s {
			return m.target
		}
	}
	return 0
}
// Set alphabet A to the list of all symbols
// outgoing from s, appending into the caller-provided slice.
func (tok *Tokenizer) get_set(s int, A *[]int) {
	for sym := range tok.transitions[s] {
		*A = append(*A, sym)
	}
	// Map iteration order is random; sorting keeps the result
	// deterministic. Not required, but simplifies bug hunting.
	sort.Ints(*A)
}
// Based on Mizobuchi et al (2000), p. 124
// Finds the lowest base value at which none of the given
// outgoing symbols collides with an already-occupied check
// slot in the double array. This brute-force scan is extremely
// slow, but it only runs during construction of the tokenizer.
func (tok *Tokenizer) xCheck(symbols []int) int {
	// Start at the first entry of the double array list
	for base := 1; ; base++ {
		// Resize the array if necessary
		tok.resize((base + FINAL) * 2)
		collision := false
		for _, sym := range symbols {
			if tok.getCheck(base+sym) != 0 {
				collision = true
				break
			}
		}
		if !collision {
			return base
		}
	}
}
// Match an input string against the double array
// FSA.
//
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions.
//
// Returns true when the automaton consumes the whole input and
// reaches a final state (possibly via trailing epsilons).
func (tok *Tokenizer) Match(input string) bool {
var a int
var tu int
var ok bool
t := 1 // Initial state
chars := []rune(input)
i := 0
for i < len(chars) {
// Look up the sigma id of the current character.
a, ok = tok.sigma[chars[i]]
// Support identity symbol if character is not in sigma
if !ok && IDENTITY != -1 {
if DEBUG {
fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
}
a = IDENTITY
} else if DEBUG {
fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
}
tu = t
CHECK:
t = tok.getBase(tu) + a
// Check if the transition is valid according to the double array
if t > tok.getCheck(1) || tok.getCheck(t) != tu {
if DEBUG {
fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
}
// Fallback cascade: IDENTITY -> UNKNOWN -> EPSILON -> fail.
if !ok && a == IDENTITY {
// Try again with unknown symbol, in case identity failed
if DEBUG {
fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
}
a = UNKNOWN
} else if a != EPSILON {
// Try again with epsilon symbol, in case everything else failed
if DEBUG {
fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
}
a = EPSILON
} else {
// All fallbacks exhausted; stop consuming input.
break
}
goto CHECK
} else if tok.getBase(t) < 0 {
// Move to representative state
// (negative base is an indirection set by ToDoubleArray).
t = -1 * tok.getBase(t)
}
// Transition is fine
if a != EPSILON {
// Character consumed
i++
}
// TODO:
// Prevent endless epsilon loops!
}
// If the loop broke early, not all input was consumed.
if i != len(chars) {
if DEBUG {
fmt.Println("Not at the end")
}
return false
}
FINALCHECK:
// Automaton is in a final state
if tok.getCheck(tok.getBase(t)+FINAL) == t {
return true
}
// Check epsilon transitions until a final state is reached
// NOTE(review): an epsilon cycle among non-final states would
// loop here forever - TODO confirm inputs exclude such cycles
// (cf. the epsilon-loop TODO in the main loop above).
tu = t
a = EPSILON
t = tok.getBase(tu) + a
// Epsilon transition failed
if t > tok.getCheck(1) || tok.getCheck(t) != tu {
if DEBUG {
fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
}
return false
} else if tok.getBase(t) < 0 {
// Move to representative state
t = -1 * tok.getBase(t)
}
goto FINALCHECK
}
}