blob: 88975ea58b5afa2491b756864e4eece50cb4bad7 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "io"
Akron679b4862021-09-02 16:59:26 +020032 "math"
Akron8ef408b2021-08-02 22:11:04 +020033 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron740f3d72021-08-03 12:12:34 +020035
Akron527c10c2021-08-13 01:45:18 +020036 "log"
Akron8ef408b2021-08-02 22:11:04 +020037)
38
39const (
Akron2a4b9292021-08-04 15:35:22 +020040 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020041 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020042 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020043 FIRSTBIT uint32 = 1 << 31
44 SECONDBIT uint32 = 1 << 30
45 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron03c92fe2021-08-09 14:07:57 +020048// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020049var bo binary.ByteOrder = binary.LittleEndian
50
Akron8ef408b2021-08-02 22:11:04 +020051type mapping struct {
52 source int
Akron3fdfec62021-08-04 11:40:10 +020053 target uint32
Akron8ef408b2021-08-02 22:11:04 +020054}
55
Akronf1a16502021-08-16 15:24:38 +020056type bc struct {
57 base uint32
58 check uint32
59}
60
Akron03c92fe2021-08-09 14:07:57 +020061// DaTokenizer represents a tokenizer implemented as a
62// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020063type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020064 sigma map[rune]int
65 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020066 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020067 transCount int
Akronf1a16502021-08-16 15:24:38 +020068 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron03c92fe2021-08-09 14:07:57 +020075 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020076}
77
Akron439f4ec2021-08-09 15:45:38 +020078// ToDoubleArray turns the intermediate tokenizer representation
79// into a double array representation.
80//
81// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020082func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020083
84 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020085 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020086 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020087 final: auto.final,
88 unknown: auto.unknown,
89 identity: auto.identity,
90 epsilon: auto.epsilon,
91 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020092 }
93
Akronf1a16502021-08-16 15:24:38 +020094 dat.resize(dat.final)
95
Akron00cecd12021-12-05 13:14:03 +010096 // Init with identity
97 if dat.identity != -1 {
98 for i := 0; i < 256; i++ {
99 dat.sigmaASCII[i] = dat.identity
100 }
101 }
102
Akron1c34ce62021-09-23 23:27:39 +0200103 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200104 if int(sym) < 256 {
105 dat.sigmaASCII[int(sym)] = num
106 }
Akronf2120ca2021-08-03 16:26:41 +0200107 dat.sigma[sym] = num
108 }
Akron8ef408b2021-08-02 22:11:04 +0200109
110 mark := 0
111 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200112 var base uint32
Akronde18e902021-08-27 09:34:12 +0200113 var atrans *edge
114 var s, s1 int
115 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200116
Akron439f4ec2021-08-09 15:45:38 +0200117 // Create a mapping from s (in Ms aka Intermediate FSA)
118 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200119 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200120 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200121
Akron439f4ec2021-08-09 15:45:38 +0200122 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200123 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200124 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200125 size++
126
Akron740f3d72021-08-03 12:12:34 +0200127 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200128 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200129 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
130 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200131
Akron7b1faa62021-09-02 16:10:21 +0200132 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200133
Akron8ef408b2021-08-02 22:11:04 +0200134 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200135 s = table[mark].source // This is a state in Ms
136 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200137 // s = tableQueue[mark]
138 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200139 mark++
Akron740f3d72021-08-03 12:12:34 +0200140
141 // Following the paper, here the state t can be remembered
142 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200143 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200144 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200145
Akron740f3d72021-08-03 12:12:34 +0200146 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200147 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200148 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200149 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200150 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200151 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200152
Akron773b1ef2021-08-03 17:37:20 +0200153 // TODO:
Akron068874c2021-08-04 15:19:56 +0200154 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200155 // outdegree of .end
156
Akron740f3d72021-08-03 12:12:34 +0200157 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200158 for _, a := range A {
159
Akron1c34ce62021-09-23 23:27:39 +0200160 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200161
Akron1c34ce62021-09-23 23:27:39 +0200162 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200163
Akron740f3d72021-08-03 12:12:34 +0200164 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200165 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200166
Akron740f3d72021-08-03 12:12:34 +0200167 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200168 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200169 dat.array[t1].setCheck(t)
170
171 // Set maxSize
Akron72a64222023-04-26 17:00:45 +0200172 // New: dat.maxSize = max(dat.maxSize, int(t1))
Akronf1a16502021-08-16 15:24:38 +0200173 if dat.maxSize < int(t1) {
174 dat.maxSize = int(t1)
175 }
Akron8ef408b2021-08-02 22:11:04 +0200176
Akron439f4ec2021-08-09 15:45:38 +0200177 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100178 log.Println("Translate transition",
Akron524c5432021-08-05 14:14:27 +0200179 s, "->", s1, "(", a, ")", "to", t, "->", t1)
180 }
181
Akron83e75a22021-08-04 13:14:06 +0200182 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200183 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200184 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200185 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100186 log.Println("Set", t1, "to nontoken")
Akron524c5432021-08-05 14:14:27 +0200187 }
Akron83e75a22021-08-04 13:14:06 +0200188 }
189
Akron84d68e62021-08-04 17:06:52 +0200190 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200191 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200192 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200193 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100194 log.Println("Set", t1, "to tokenend")
Akron524c5432021-08-05 14:14:27 +0200195 }
Akron84d68e62021-08-04 17:06:52 +0200196 }
197
Akron740f3d72021-08-03 12:12:34 +0200198 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200199 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200200 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200201
Akron439f4ec2021-08-09 15:45:38 +0200202 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200203 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200204 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200205 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200206 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200207 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200208 size++
209 } else {
Akron740f3d72021-08-03 12:12:34 +0200210 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200211 dat.array[t1].setBase(r)
212 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200213 }
214 } else {
Akron740f3d72021-08-03 12:12:34 +0200215 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200216 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200217
Akronde18e902021-08-27 09:34:12 +0200218 if dat.maxSize < int(base)+dat.final {
219 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200220 }
Akron8ef408b2021-08-02 22:11:04 +0200221 }
222 }
223 }
224
225 // Following Mizobuchi et al (2000) the size of the
226 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200227 // We make the size a bit larger so we never have to check for boundaries.
228 dat.setSize(dat.maxSize + dat.final)
229 if len(dat.array) < dat.maxSize+dat.final {
230 dat.array = append(dat.array, make([]bc, dat.final)...)
231 }
232 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200233 return dat
Akron8ef408b2021-08-02 22:11:04 +0200234}
235
Akron8ef408b2021-08-02 22:11:04 +0200236// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200237// exists and return this as a representative.
238// Currently iterates through the whole table
239// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200240func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200241 for x := 0; x < size; x++ {
242 if table[x].source == s {
243 return table[x].target
244 }
245 }
246 return 0
247}
248
Akron941f2152021-09-26 15:14:25 +0200249// Type of tokenizer
250func (DaTokenizer) Type() string {
251 return DAMAGIC
252}
253
Akron64ffd9a2021-08-03 19:55:21 +0200254// Resize double array when necessary
255func (dat *DaTokenizer) resize(l int) {
256 // TODO:
257 // This is a bit too aggressive atm and should be calmed down.
258 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200259 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200260 }
Akron64ffd9a2021-08-03 19:55:21 +0200261}
Akronc9d84a62021-08-03 15:56:03 +0200262
Akron64ffd9a2021-08-03 19:55:21 +0200263// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200264func (bc *bc) setBase(v uint32) {
265 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200266}
267
268// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200269func (bc *bc) getBase() uint32 {
270 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200271}
272
273// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200274func (bc *bc) setCheck(v uint32) {
275 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200276}
277
278// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200279func (bc *bc) getCheck() uint32 {
280 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200281}
282
Akron3fdfec62021-08-04 11:40:10 +0200283// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200284func (bc *bc) isSeparate() bool {
285 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200286}
287
288// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200289func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200290 if sep {
Akronf1a16502021-08-16 15:24:38 +0200291 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200292 } else {
Akronf1a16502021-08-16 15:24:38 +0200293 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200294 }
295}
296
Akron83e75a22021-08-04 13:14:06 +0200297// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200298func (bc *bc) isNonToken() bool {
299 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200300}
301
302// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200303func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200304 if sep {
Akronf1a16502021-08-16 15:24:38 +0200305 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200306 } else {
Akronf1a16502021-08-16 15:24:38 +0200307 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200308 }
309}
310
Akron84d68e62021-08-04 17:06:52 +0200311// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200312func (bc *bc) isTokenEnd() bool {
313 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200314}
315
316// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200317func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200318 if sep {
Akronf1a16502021-08-16 15:24:38 +0200319 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200320 } else {
Akronf1a16502021-08-16 15:24:38 +0200321 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200322 }
323}
324
Akron64ffd9a2021-08-03 19:55:21 +0200325// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200326func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200327 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200328}
329
330// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200331func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200332 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200333}
334
335// Based on Mizobuchi et al (2000), p. 124
336// This iterates for every state through the complete double array
337// structure until it finds a gap that fits all outgoing transitions
338// of the state. This is extremely slow, but is only necessary in the
339// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200340func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200341
342 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200343 base := uint32(1)
344
Akron8ef408b2021-08-02 22:11:04 +0200345OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200346 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200347 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200348 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200349 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200350 base++
351 goto OVERLAP
352 }
353 }
Akron8ef408b2021-08-02 22:11:04 +0200354 return base
355}
356
Akron679b4862021-09-02 16:59:26 +0200357// This is an implementation of xCheck with the skip-improvement
358// proposed by Morita et al. (2001)
359func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
360
361 // Start at the first entry of the double array list
362 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
363
364OVERLAP:
365 // Resize the array if necessary
366 dat.resize(int(base) + dat.final)
367 for _, a := range symbols {
368 if dat.array[int(base)+a].getCheck() != 0 {
369 base++
370 goto OVERLAP
371 }
372 }
373 return base
374}
375
Akron29e306f2021-09-02 18:29:56 +0200376// This is an implementation of xCheck with the skip-improvement
377// proposed by Morita et al. (2001) for higher outdegrees as
378// proposed by Niu et al. (2013)
379func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
380
381 // Start at the first entry of the double array list
382 base := uint32(1)
383
384 // Or skip the first few entries
385 if len(symbols) >= 3 {
386 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
387 }
388
389OVERLAP:
390 // Resize the array if necessary
391 dat.resize(int(base) + dat.final + 1)
392 for _, a := range symbols {
393 if dat.array[int(base)+a].getCheck() != 0 {
394 base++
395 goto OVERLAP
396 }
397 }
398 return base
399}
400
Akron7b1faa62021-09-02 16:10:21 +0200401// This is an implementation of xCheck wit an improvement
402// proposed by Niu et al. (2013)
403func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
404
405 // Start at the first entry of the double array list
406 base := uint32(1)
407
408 if len(symbols) > 3 {
409 sort.Ints(symbols)
410 if *block_begin_pos > uint32(symbols[0]) {
411 dat.resize(int(*block_begin_pos) + dat.final)
412 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
413 return *block_begin_pos - uint32(symbols[0])
414 }
415 }
416
417OVERLAP:
418 // Resize the array if necessary
419 dat.resize(int(base) + dat.final)
420 for _, a := range symbols {
421 if dat.array[int(base)+a].getCheck() != 0 {
422 base++
423 goto OVERLAP
424 }
425 }
426 return base
427}
428
Akron3610f102021-08-08 14:13:25 +0200429// List all outgoing transitions for a state
430// for testing purposes
431func (dat *DaTokenizer) outgoing(t uint32) []int {
432
433 valid := make([]int, 0, len(dat.sigma))
434
435 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200436 t1 := dat.array[t].getBase() + uint32(a)
437 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200438 valid = append(valid, a)
439 }
440 }
441
442 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200443 t1 := dat.array[t].getBase() + uint32(a)
444 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200445 valid = append(valid, -1*a)
446 }
447 }
448
449 sort.Ints(valid)
450
451 return valid
452}
453
Akron4fa28b32021-08-27 10:55:41 +0200454// TransCount as the number of transitions aka arcs in the
455// finite state automaton
456func (dat *DaTokenizer) TransCount() int {
457 // Cache the transCount
458 if dat.transCount > 0 {
459 return dat.transCount
460 }
461
462 dat.transCount = 0
463 for x := 1; x < len(dat.array); x++ {
464 if dat.array[x].getBase() != 0 {
465 dat.transCount++
466 }
467 }
468
469 return dat.transCount
470}
471
Akron03a3c612021-08-04 11:51:27 +0200472// LoadFactor as defined in Kanda et al (2018),
473// i.e. the proportion of non-empty elements to all elements.
474func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200475 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200476}
477
Akron439f4ec2021-08-09 15:45:38 +0200478// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200479func (dat *DaTokenizer) Save(file string) (n int64, err error) {
480 f, err := os.Create(file)
481 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200482 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200483 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200484 }
485 defer f.Close()
486 gz := gzip.NewWriter(f)
487 defer gz.Close()
488 n, err = dat.WriteTo(gz)
489 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200490 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200491 return n, err
492 }
493 gz.Flush()
494 return n, nil
495}
496
497// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200498func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
499
Akron3a063ef2021-08-05 19:36:35 +0200500 wb := bufio.NewWriter(w)
501 defer wb.Flush()
502
Akron6247a5d2021-08-03 19:18:28 +0200503 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200504 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200505 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200506 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200507 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200508 }
509
510 // Get sigma as a list
511 sigmalist := make([]rune, len(dat.sigma)+16)
512 max := 0
513 for sym, num := range dat.sigma {
514 sigmalist[num] = sym
515 if num > max {
516 max = num
517 }
518 }
519
520 sigmalist = sigmalist[:max+1]
521
Akron3f8571a2021-08-05 11:18:10 +0200522 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200523 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200524 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
525 bo.PutUint16(buf[4:6], uint16(dat.unknown))
526 bo.PutUint16(buf[6:8], uint16(dat.identity))
527 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200528 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200529 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200530 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200531 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200532 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200533 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200534 }
535
536 all += more
537
Akron6247a5d2021-08-03 19:18:28 +0200538 // Write sigma
539 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200540
Akron3a063ef2021-08-05 19:36:35 +0200541 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200542 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200543 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200544 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200545 }
546 all += more
547 }
Akron439f4ec2021-08-09 15:45:38 +0200548
Akron6247a5d2021-08-03 19:18:28 +0200549 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200550 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200551 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200552 }
Akron6247a5d2021-08-03 19:18:28 +0200553
554 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200555 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200556 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200557 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200558 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200559 }
560 all += more
561
Akronf1a16502021-08-16 15:24:38 +0200562 // for x := 0; x < len(dat.array); x++ {
563 for _, bc := range dat.array {
564 bo.PutUint32(buf[0:4], bc.base)
565 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200566 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200567 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200568 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200569 }
Akron439f4ec2021-08-09 15:45:38 +0200570 all += more
Akron3a063ef2021-08-05 19:36:35 +0200571 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200572 log.Println("Can not write base uint32")
573 return int64(all), err
574 }
575 bo.PutUint32(buf[0:4], bc.check)
576 more, err = wb.Write(buf[0:4])
577 if err != nil {
578 log.Println(err)
579 return int64(all), err
580 }
581 all += more
582 if more != 4 {
583 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200584 return int64(all), err
585 }
Akron6247a5d2021-08-03 19:18:28 +0200586 }
587
588 return int64(all), err
589}
590
Akron439f4ec2021-08-09 15:45:38 +0200591// LoadDatokFile reads a double array represented tokenizer
592// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200593func LoadDatokFile(file string) *DaTokenizer {
594 f, err := os.Open(file)
595 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200596 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200597 return nil
Akron3f8571a2021-08-05 11:18:10 +0200598 }
599 defer f.Close()
600
601 gz, err := gzip.NewReader(f)
602 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200603 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200604 return nil
Akron3f8571a2021-08-05 11:18:10 +0200605 }
606 defer gz.Close()
607
Akron3a063ef2021-08-05 19:36:35 +0200608 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200609 return ParseDatok(gz)
610}
611
Akron439f4ec2021-08-09 15:45:38 +0200612// LoadDatokFile reads a double array represented tokenizer
613// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200614func ParseDatok(ior io.Reader) *DaTokenizer {
615
Akron439f4ec2021-08-09 15:45:38 +0200616 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200617 dat := &DaTokenizer{
618 sigma: make(map[rune]int),
619 epsilon: 0,
620 unknown: 0,
621 identity: 0,
622 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200623 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200624 }
625
626 r := bufio.NewReader(ior)
627
Akron3f8571a2021-08-05 11:18:10 +0200628 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200629 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200630
Akron439f4ec2021-08-09 15:45:38 +0200631 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200632
633 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200634 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200635 return nil
636 }
637
Akron16c312e2021-09-26 13:11:12 +0200638 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200639 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200640 return nil
641 }
642
Akron439f4ec2021-08-09 15:45:38 +0200643 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200644 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200645 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200646 return nil
647 }
648
Akron439f4ec2021-08-09 15:45:38 +0200649 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200650 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200651 return nil
652 }
Akron3f8571a2021-08-05 11:18:10 +0200653
Akron3a063ef2021-08-05 19:36:35 +0200654 version := bo.Uint16(buf[0:2])
655
656 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200657 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200658 return nil
659 }
660
Akron3f8571a2021-08-05 11:18:10 +0200661 dat.epsilon = int(bo.Uint16(buf[2:4]))
662 dat.unknown = int(bo.Uint16(buf[4:6]))
663 dat.identity = int(bo.Uint16(buf[6:8]))
664 dat.final = int(bo.Uint16(buf[8:10]))
665
666 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200667 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200668
Akron3a063ef2021-08-05 19:36:35 +0200669 // Shouldn't be relevant though
670 dat.maxSize = arraySize - 1
671
Akron00cecd12021-12-05 13:14:03 +0100672 // Init with identity
673 if dat.identity != -1 {
674 for i := 0; i < 256; i++ {
675 dat.sigmaASCII[i] = dat.identity
676 }
677 }
678
Akron3f8571a2021-08-05 11:18:10 +0200679 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200680 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200681 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200682 if int(sym) < 256 {
683 dat.sigmaASCII[int(sym)] = x
684 }
Akron3f8571a2021-08-05 11:18:10 +0200685 dat.sigma[sym] = x
686 }
Akron3f8571a2021-08-05 11:18:10 +0200687 }
688
Akron439f4ec2021-08-09 15:45:38 +0200689 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200690
691 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200692 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200693 return nil
694 }
695
Akron3f8571a2021-08-05 11:18:10 +0200696 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200697 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200698 return nil
699 }
700
701 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200702 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200703
Akronbb4aac52021-08-13 00:52:27 +0200704 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200705
Akronbb4aac52021-08-13 00:52:27 +0200706 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200707 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200708 return nil
709 }
710
Akronf1a16502021-08-16 15:24:38 +0200711 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200712 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200713 return nil
714 }
715
716 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200717 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
718 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200719 }
720
721 return dat
722}
723
Akron439f4ec2021-08-09 15:45:38 +0200724// Show the current state of the buffer,
725// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200726func showBuffer(buffer []rune, buffo int, buffi int) string {
727 out := make([]rune, 0, 1024)
728 for x := 0; x < len(buffer); x++ {
729 if buffi == x {
730 out = append(out, '^')
731 }
732 if buffo == x {
733 out = append(out, '[', buffer[x], ']')
734 } else {
735 out = append(out, buffer[x])
736 }
737 }
738 return string(out)
739}
740
Akron98fbfef2021-10-23 17:02:11 +0200741// Show the current state of the buffer,
742// for testing puroses
743func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
744 out := make([]rune, 0, 1024)
745 for x := 0; x < len(buffer); x++ {
746 if buffi == x {
747 out = append(out, '^')
748 }
749 if bufft == x {
750 out = append(out, '|')
751 }
752 if buffc == x {
753 out = append(out, '[', buffer[x], ']')
754 } else {
755 out = append(out, buffer[x])
756 }
757 }
758 return string(out)
759}
760
761// Transduce input to ouutput
Akrone396a932021-10-19 01:06:13 +0200762func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200763 return dat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200764}
765
766// TransduceTokenWriter transduces an input string against
767// the double array FSA. The rules are always greedy. If the
768// automaton fails, it takes the last possible token ending
769// branch.
Akron068874c2021-08-04 15:19:56 +0200770//
Akron4db3ecf2021-08-11 18:49:03 +0200771// Based on Mizobuchi et al (2000), p. 129,
772// with additional support for IDENTITY, UNKNOWN
773// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron4f6b28c2021-10-25 00:52:03 +0200774func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron068874c2021-08-04 15:19:56 +0200775 var a int
Akronb4bbb472021-08-09 11:49:38 +0200776 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200777 t := uint32(1) // Initial state
778 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200779
Akron3610f102021-08-08 14:13:25 +0200780 // Remember the last position of a possible tokenend,
781 // in case the automaton fails.
782 epsilonState := uint32(0)
783 epsilonOffset := 0
784
Akrone396a932021-10-19 01:06:13 +0200785 // Remember if the last transition was epsilon
786 sentenceEnd := false
787
Akrona854faa2021-10-22 19:31:08 +0200788 // Remember if a text end was already set
789 textEnd := false
790
Akron3610f102021-08-08 14:13:25 +0200791 // Implement a low level buffer for full control,
792 // however - it is probably better to introduce
793 // this on a higher level with a io.Reader interface
794 // The buffer stores a single word and may have white
795 // space at the end (but not at the beginning).
796 //
797 // This is the only backtracking requirement because of
798 // epsilon transitions, to support tokenizations like:
799 // "this is an example|.| And it works." vs
800 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200801 //
802 // TODO:
803 // Store a translation buffer as well, so characters don't
804 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200805 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200806 bufft := 0 // Buffer token offset
807 buffc := 0 // Buffer current symbol
Akron3610f102021-08-08 14:13:25 +0200808 buffi := 0 // Buffer length
809
Akron98fbfef2021-10-23 17:02:11 +0200810 // The buffer is organized as follows:
811 // [ t[....c..]..i]
812
Akron3f8571a2021-08-05 11:18:10 +0200813 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200814 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200815
Akron3f8571a2021-08-05 11:18:10 +0200816 var char rune
Akrone396a932021-10-19 01:06:13 +0200817
Akron3f8571a2021-08-05 11:18:10 +0200818 var err error
819 eof := false
Akrona854faa2021-10-22 19:31:08 +0200820 eot := false
Akronb7e1f132021-08-10 11:52:31 +0200821 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200822
Akronc5d8d432021-08-10 16:48:44 +0200823PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200824 for {
825
Akronb7e1f132021-08-10 11:52:31 +0200826 if newchar {
827 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200828 if buffc >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200829 if eof {
830 break
831 }
Akronb7e1f132021-08-10 11:52:31 +0200832 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200833
Akronb7e1f132021-08-10 11:52:31 +0200834 // No more runes to read
835 if err != nil {
836 eof = true
837 break
838 }
839 buffer[buffi] = char
840 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200841 }
Akronb7e1f132021-08-10 11:52:31 +0200842
Akron98fbfef2021-10-23 17:02:11 +0200843 char = buffer[buffc]
Akronb7e1f132021-08-10 11:52:31 +0200844
845 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100846 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akronb7e1f132021-08-10 11:52:31 +0200847 }
848
Akrona854faa2021-10-22 19:31:08 +0200849 eot = false
850
Akron6f1c16c2021-08-17 10:45:42 +0200851 // TODO:
852 // Better not repeatedly check for a!
853 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200854 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200855 if int(char) == EOT {
856 eot = true
857 }
Akronea46e8a2021-08-17 00:36:31 +0200858 a = dat.sigmaASCII[int(char)]
859 } else {
860 a, ok = dat.sigma[char]
Akronb7e1f132021-08-10 11:52:31 +0200861
Akron4880fb62021-12-05 12:03:05 +0100862 // Use identity symbol if character is not in sigma
863 if !ok && dat.identity != -1 {
864 a = dat.identity
865 }
Akronb7e1f132021-08-10 11:52:31 +0200866 }
867
868 t0 = t
869
870 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200871 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akrondf275812022-03-27 12:54:46 +0200872
Akronb7e1f132021-08-10 11:52:31 +0200873 // Remember state for backtracking to last tokenend state
874 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200875 epsilonOffset = buffc
Akrone396a932021-10-19 01:06:13 +0200876
877 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100878 log.Println("epsilonOffset is set to", buffc)
Akrone396a932021-10-19 01:06:13 +0200879 }
Akronb7e1f132021-08-10 11:52:31 +0200880 }
Akron3f8571a2021-08-05 11:18:10 +0200881 }
Akron3610f102021-08-08 14:13:25 +0200882
Akronb7e1f132021-08-10 11:52:31 +0200883 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200884 t = dat.array[t0].getBase() + uint32(a)
885 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200886
Akron524c5432021-08-05 14:14:27 +0200887 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200888 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100889 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akronb7e1f132021-08-10 11:52:31 +0200890 if false {
Akron9c3bf7f2021-11-03 19:52:12 +0100891 log.Println(dat.outgoing(t0))
Akronb7e1f132021-08-10 11:52:31 +0200892 }
Akron524c5432021-08-05 14:14:27 +0200893 }
894
Akronb7e1f132021-08-10 11:52:31 +0200895 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200896 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200897
898 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100899 log.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200900 }
901
902 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200903
Akron068874c2021-08-04 15:19:56 +0200904 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200905 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200906 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100907 log.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200908 }
909 a = dat.unknown
910
Akrondf275812022-03-27 12:54:46 +0200911 } else if a != dat.epsilon && epsilonState != 0 {
Akronb4bbb472021-08-09 11:49:38 +0200912
Akron068874c2021-08-04 15:19:56 +0200913 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200914 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200915 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200916 buffc = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200917 a = dat.epsilon
918
Akron3610f102021-08-08 14:13:25 +0200919 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100920 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200921 }
Akronb4bbb472021-08-09 11:49:38 +0200922
Akron068874c2021-08-04 15:19:56 +0200923 } else {
Akrondf275812022-03-27 12:54:46 +0200924
925 if DEBUG {
926 log.Println("Fail!")
927 }
928
929 // w.Fail(bufft)
930
931 // The following procedure means the automaton fails to consume a certain character.
932 // In the tokenization scenario, this means, the tokenizer will drop the old or current data as a
933 // token and start blank at the root node of the automaton for the remaining data.
934 // It may be beneficial to have something like a "drop()" event to capture these cases,
935 // as they are likely the result of a bad automaton design.
Akroncae39112023-04-26 19:43:16 +0200936 if buffc-bufft <= 0 {
Akrondf275812022-03-27 12:54:46 +0200937 buffc++
Akroncae39112023-04-26 19:43:16 +0200938 if buffc == 0 {
939 eof = true
940 break
941 }
Akrondf275812022-03-27 12:54:46 +0200942 }
943
944 if DEBUG {
945 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
946 }
947 w.Token(bufft, buffer[:buffc])
948
949 sentenceEnd = false
950 textEnd = false
951
952 if DEBUG {
953 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
954 }
955
956 for x, i := range buffer[buffc:buffi] {
957 buffer[x] = i
958 }
959
960 buffi -= buffc
961 epsilonState = 0
962
963 buffc = 0
964 bufft = 0
965
966 a = dat.epsilon
967
968 // Restart from root state
969 t = uint32(1)
970 newchar = true
971 // goto PARSECHARM
972 continue
Akron068874c2021-08-04 15:19:56 +0200973 }
Akron068874c2021-08-04 15:19:56 +0200974
Akronb7e1f132021-08-10 11:52:31 +0200975 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200976 eot = false
Akronb7e1f132021-08-10 11:52:31 +0200977 continue
Akronb4bbb472021-08-09 11:49:38 +0200978 }
979
Akronb7e1f132021-08-10 11:52:31 +0200980 // Transition was successful
981 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200982
983 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200984 if a != dat.epsilon {
985
Akron98fbfef2021-10-23 17:02:11 +0200986 buffc++
Akronb4bbb472021-08-09 11:49:38 +0200987
Akron439f4ec2021-08-09 15:45:38 +0200988 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200989 if buffc-bufft == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200990 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100991 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200992 }
Akron98fbfef2021-10-23 17:02:11 +0200993 bufft++
994 // rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200995 }
Akron068874c2021-08-04 15:19:56 +0200996
Akrone396a932021-10-19 01:06:13 +0200997 } else {
Akron524c5432021-08-05 14:14:27 +0200998
Akrone396a932021-10-19 01:06:13 +0200999 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +02001000 if buffc-bufft > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001001 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001002 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001003 }
Akron32416ce2021-10-23 17:09:41 +02001004 w.Token(bufft, buffer[:buffc])
Akronc5d8d432021-08-10 16:48:44 +02001005 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +02001006 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +02001007 textEnd = false
Akrone396a932021-10-19 01:06:13 +02001008 } else {
1009 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +02001010 w.SentenceEnd(0)
Akron1594cb82021-08-11 11:14:56 +02001011 }
Akron439f4ec2021-08-09 15:45:38 +02001012 }
Akron3610f102021-08-08 14:13:25 +02001013
Akron8cc2dd92021-10-25 19:49:41 +02001014 if eot {
1015 eot = false
1016 textEnd = true
1017 w.TextEnd(0)
1018 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001019 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +02001020 }
1021 }
1022
Akronc5d8d432021-08-10 16:48:44 +02001023 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001024 if rewindBuffer {
1025
Akrone396a932021-10-19 01:06:13 +02001026 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001027 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akrone396a932021-10-19 01:06:13 +02001028 }
1029
Akron439f4ec2021-08-09 15:45:38 +02001030 // TODO: Better as a ring buffer
Akron04335c62021-10-28 11:56:00 +02001031 // buffer = buffer[buffc:] !slower
Akron98fbfef2021-10-23 17:02:11 +02001032 for x, i := range buffer[buffc:buffi] {
Akron3610f102021-08-08 14:13:25 +02001033 buffer[x] = i
1034 }
Akronb4bbb472021-08-09 11:49:38 +02001035
Akron98fbfef2021-10-23 17:02:11 +02001036 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +02001037 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +02001038 epsilonOffset = 0
1039 epsilonState = 0
1040
Akron98fbfef2021-10-23 17:02:11 +02001041 buffc = 0
1042 bufft = 0
Akrona854faa2021-10-22 19:31:08 +02001043
Akron98fbfef2021-10-23 17:02:11 +02001044 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001045 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akron98fbfef2021-10-23 17:02:11 +02001046 }
1047 }
1048
Akronb7e1f132021-08-10 11:52:31 +02001049 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001050 if ta.isSeparate() {
1051 t = ta.getBase()
1052 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001053
1054 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001055 log.Println("Representative pointing to", t)
Akronb7e1f132021-08-10 11:52:31 +02001056 }
1057 }
1058
Akronc5d8d432021-08-10 16:48:44 +02001059 newchar = true
1060
Akron068874c2021-08-04 15:19:56 +02001061 // TODO:
Akrondf275812022-03-27 12:54:46 +02001062 // Prevent endless epsilon loops by checking
1063 // the model has no epsilon loops1
Akron068874c2021-08-04 15:19:56 +02001064 }
1065
Akron439f4ec2021-08-09 15:45:38 +02001066 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001067 if !eof {
Akron068874c2021-08-04 15:19:56 +02001068 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001069 log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001070 }
Akrondf275812022-03-27 12:54:46 +02001071 // This should never happen
Akron068874c2021-08-04 15:19:56 +02001072 return false
1073 }
1074
Akronb7e1f132021-08-10 11:52:31 +02001075 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001076 log.Println("Entering final check")
Akronb7e1f132021-08-10 11:52:31 +02001077 }
1078
Akrona854faa2021-10-22 19:31:08 +02001079 // Check epsilon transitions as long as possible
Akronc5d8d432021-08-10 16:48:44 +02001080 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001081 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001082 a = dat.epsilon
1083 newchar = false
Akrone396a932021-10-19 01:06:13 +02001084
Akronf1a16502021-08-16 15:24:38 +02001085 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001086 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001087 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001088
Akronc5d8d432021-08-10 16:48:44 +02001089 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001090 t0 = epsilonState
1091 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +02001092 buffc = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001093 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001094 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron068874c2021-08-04 15:19:56 +02001095 }
Akronc5d8d432021-08-10 16:48:44 +02001096 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001097 }
Akrone396a932021-10-19 01:06:13 +02001098
Akrondf275812022-03-27 12:54:46 +02001099 // something left in buffer
1100 if buffc-bufft > 0 {
1101 if DEBUG {
1102 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
1103 }
1104 w.Token(bufft, buffer[:buffc])
1105 sentenceEnd = false
1106 textEnd = false
1107 }
1108
Akrone396a932021-10-19 01:06:13 +02001109 // Add an additional sentence ending, if the file is over but no explicit
1110 // sentence split was reached. This may be controversial and therefore
1111 // optional via parameter.
1112 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +02001113 w.SentenceEnd(0)
1114
Akrone396a932021-10-19 01:06:13 +02001115 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001116 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +02001117 }
1118 }
1119
1120 if !textEnd {
1121 w.TextEnd(0)
1122
1123 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001124 log.Println("Text end")
Akrone396a932021-10-19 01:06:13 +02001125 }
1126 }
1127
1128 return true
Akron068874c2021-08-04 15:19:56 +02001129}