blob: f19ce5a39279022c8887ddeab7fd59fd0729cb48 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "io"
Akron679b4862021-09-02 16:59:26 +020032 "math"
Akron8ef408b2021-08-02 22:11:04 +020033 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron740f3d72021-08-03 12:12:34 +020035
Akron527c10c2021-08-13 01:45:18 +020036 "log"
Akron8ef408b2021-08-02 22:11:04 +020037)
38
39const (
Akron2a4b9292021-08-04 15:35:22 +020040 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020041 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020042 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020043 FIRSTBIT uint32 = 1 << 31
44 SECONDBIT uint32 = 1 << 30
45 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron03c92fe2021-08-09 14:07:57 +020048// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020049var bo binary.ByteOrder = binary.LittleEndian
50
Akron8ef408b2021-08-02 22:11:04 +020051type mapping struct {
52 source int
Akron3fdfec62021-08-04 11:40:10 +020053 target uint32
Akron8ef408b2021-08-02 22:11:04 +020054}
55
Akronf1a16502021-08-16 15:24:38 +020056type bc struct {
57 base uint32
58 check uint32
59}
60
Akron03c92fe2021-08-09 14:07:57 +020061// DaTokenizer represents a tokenizer implemented as a
62// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020063type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020064 sigma map[rune]int
65 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020066 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020067 transCount int
Akronf1a16502021-08-16 15:24:38 +020068 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron03c92fe2021-08-09 14:07:57 +020075 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020076}
77
Akron439f4ec2021-08-09 15:45:38 +020078// ToDoubleArray turns the intermediate tokenizer representation
79// into a double array representation.
80//
81// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020082func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020083
84 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020085 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020086 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020087 final: auto.final,
88 unknown: auto.unknown,
89 identity: auto.identity,
90 epsilon: auto.epsilon,
91 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020092 }
93
Akronf1a16502021-08-16 15:24:38 +020094 dat.resize(dat.final)
95
Akron00cecd12021-12-05 13:14:03 +010096 // Init with identity
97 if dat.identity != -1 {
98 for i := 0; i < 256; i++ {
99 dat.sigmaASCII[i] = dat.identity
100 }
101 }
102
Akron1c34ce62021-09-23 23:27:39 +0200103 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200104 if int(sym) < 256 {
105 dat.sigmaASCII[int(sym)] = num
106 }
Akronf2120ca2021-08-03 16:26:41 +0200107 dat.sigma[sym] = num
108 }
Akron8ef408b2021-08-02 22:11:04 +0200109
110 mark := 0
111 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200112 var base uint32
Akronde18e902021-08-27 09:34:12 +0200113 var atrans *edge
114 var s, s1 int
115 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200116
Akron439f4ec2021-08-09 15:45:38 +0200117 // Create a mapping from s (in Ms aka Intermediate FSA)
118 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200119 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200120 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200121
Akron439f4ec2021-08-09 15:45:38 +0200122 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200123 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200124 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200125 size++
126
Akron740f3d72021-08-03 12:12:34 +0200127 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200128 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200129 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
130 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200131
Akron7b1faa62021-09-02 16:10:21 +0200132 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200133
Akron8ef408b2021-08-02 22:11:04 +0200134 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200135 s = table[mark].source // This is a state in Ms
136 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200137 // s = tableQueue[mark]
138 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200139 mark++
Akron740f3d72021-08-03 12:12:34 +0200140
141 // Following the paper, here the state t can be remembered
142 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200143 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200144 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200145
Akron740f3d72021-08-03 12:12:34 +0200146 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200147 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200148 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200149 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200150 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200151 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200152
Akron773b1ef2021-08-03 17:37:20 +0200153 // TODO:
Akron068874c2021-08-04 15:19:56 +0200154 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200155 // outdegree of .end
156
Akron740f3d72021-08-03 12:12:34 +0200157 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200158 for _, a := range A {
159
Akron1c34ce62021-09-23 23:27:39 +0200160 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200161
Akron1c34ce62021-09-23 23:27:39 +0200162 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200163
Akron740f3d72021-08-03 12:12:34 +0200164 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200165 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200166
Akron740f3d72021-08-03 12:12:34 +0200167 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200168 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200169 dat.array[t1].setCheck(t)
170
171 // Set maxSize
172 if dat.maxSize < int(t1) {
173 dat.maxSize = int(t1)
174 }
Akron8ef408b2021-08-02 22:11:04 +0200175
Akron439f4ec2021-08-09 15:45:38 +0200176 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100177 log.Println("Translate transition",
Akron524c5432021-08-05 14:14:27 +0200178 s, "->", s1, "(", a, ")", "to", t, "->", t1)
179 }
180
Akron83e75a22021-08-04 13:14:06 +0200181 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200182 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200183 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200184 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100185 log.Println("Set", t1, "to nontoken")
Akron524c5432021-08-05 14:14:27 +0200186 }
Akron83e75a22021-08-04 13:14:06 +0200187 }
188
Akron84d68e62021-08-04 17:06:52 +0200189 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200190 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200191 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200192 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100193 log.Println("Set", t1, "to tokenend")
Akron524c5432021-08-05 14:14:27 +0200194 }
Akron84d68e62021-08-04 17:06:52 +0200195 }
196
Akron740f3d72021-08-03 12:12:34 +0200197 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200198 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200199 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200200
Akron439f4ec2021-08-09 15:45:38 +0200201 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200202 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200203 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200204 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200205 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200206 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200207 size++
208 } else {
Akron740f3d72021-08-03 12:12:34 +0200209 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200210 dat.array[t1].setBase(r)
211 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200212 }
213 } else {
Akron740f3d72021-08-03 12:12:34 +0200214 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200215 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200216
Akronde18e902021-08-27 09:34:12 +0200217 if dat.maxSize < int(base)+dat.final {
218 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200219 }
Akron8ef408b2021-08-02 22:11:04 +0200220 }
221 }
222 }
223
224 // Following Mizobuchi et al (2000) the size of the
225 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200226 // We make the size a bit larger so we never have to check for boundaries.
227 dat.setSize(dat.maxSize + dat.final)
228 if len(dat.array) < dat.maxSize+dat.final {
229 dat.array = append(dat.array, make([]bc, dat.final)...)
230 }
231 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200232 return dat
Akron8ef408b2021-08-02 22:11:04 +0200233}
234
Akron8ef408b2021-08-02 22:11:04 +0200235// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200236// exists and return this as a representative.
237// Currently iterates through the whole table
238// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200239func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200240 for x := 0; x < size; x++ {
241 if table[x].source == s {
242 return table[x].target
243 }
244 }
245 return 0
246}
247
Akron941f2152021-09-26 15:14:25 +0200248// Type of tokenizer
249func (DaTokenizer) Type() string {
250 return DAMAGIC
251}
252
Akron64ffd9a2021-08-03 19:55:21 +0200253// Resize double array when necessary
254func (dat *DaTokenizer) resize(l int) {
255 // TODO:
256 // This is a bit too aggressive atm and should be calmed down.
257 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200258 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200259 }
Akron64ffd9a2021-08-03 19:55:21 +0200260}
Akronc9d84a62021-08-03 15:56:03 +0200261
Akron64ffd9a2021-08-03 19:55:21 +0200262// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200263func (bc *bc) setBase(v uint32) {
264 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200265}
266
267// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200268func (bc *bc) getBase() uint32 {
269 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200270}
271
272// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200273func (bc *bc) setCheck(v uint32) {
274 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200275}
276
277// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200278func (bc *bc) getCheck() uint32 {
279 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200280}
281
Akron3fdfec62021-08-04 11:40:10 +0200282// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200283func (bc *bc) isSeparate() bool {
284 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200285}
286
287// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200288func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200289 if sep {
Akronf1a16502021-08-16 15:24:38 +0200290 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200291 } else {
Akronf1a16502021-08-16 15:24:38 +0200292 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200293 }
294}
295
Akron83e75a22021-08-04 13:14:06 +0200296// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200297func (bc *bc) isNonToken() bool {
298 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200299}
300
301// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200302func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200303 if sep {
Akronf1a16502021-08-16 15:24:38 +0200304 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200305 } else {
Akronf1a16502021-08-16 15:24:38 +0200306 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200307 }
308}
309
Akron84d68e62021-08-04 17:06:52 +0200310// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200311func (bc *bc) isTokenEnd() bool {
312 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200313}
314
315// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200316func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200317 if sep {
Akronf1a16502021-08-16 15:24:38 +0200318 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200319 } else {
Akronf1a16502021-08-16 15:24:38 +0200320 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200321 }
322}
323
Akron64ffd9a2021-08-03 19:55:21 +0200324// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200325func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200326 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200327}
328
329// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200330func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200331 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200332}
333
334// Based on Mizobuchi et al (2000), p. 124
335// This iterates for every state through the complete double array
336// structure until it finds a gap that fits all outgoing transitions
337// of the state. This is extremely slow, but is only necessary in the
338// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200339func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200340
341 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200342 base := uint32(1)
343
Akron8ef408b2021-08-02 22:11:04 +0200344OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200345 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200346 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200347 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200348 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200349 base++
350 goto OVERLAP
351 }
352 }
Akron8ef408b2021-08-02 22:11:04 +0200353 return base
354}
355
Akron679b4862021-09-02 16:59:26 +0200356// This is an implementation of xCheck with the skip-improvement
357// proposed by Morita et al. (2001)
358func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
359
360 // Start at the first entry of the double array list
361 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
362
363OVERLAP:
364 // Resize the array if necessary
365 dat.resize(int(base) + dat.final)
366 for _, a := range symbols {
367 if dat.array[int(base)+a].getCheck() != 0 {
368 base++
369 goto OVERLAP
370 }
371 }
372 return base
373}
374
Akron29e306f2021-09-02 18:29:56 +0200375// This is an implementation of xCheck with the skip-improvement
376// proposed by Morita et al. (2001) for higher outdegrees as
377// proposed by Niu et al. (2013)
378func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
379
380 // Start at the first entry of the double array list
381 base := uint32(1)
382
383 // Or skip the first few entries
384 if len(symbols) >= 3 {
385 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
386 }
387
388OVERLAP:
389 // Resize the array if necessary
390 dat.resize(int(base) + dat.final + 1)
391 for _, a := range symbols {
392 if dat.array[int(base)+a].getCheck() != 0 {
393 base++
394 goto OVERLAP
395 }
396 }
397 return base
398}
399
Akron7b1faa62021-09-02 16:10:21 +0200400// This is an implementation of xCheck wit an improvement
401// proposed by Niu et al. (2013)
402func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
403
404 // Start at the first entry of the double array list
405 base := uint32(1)
406
407 if len(symbols) > 3 {
408 sort.Ints(symbols)
409 if *block_begin_pos > uint32(symbols[0]) {
410 dat.resize(int(*block_begin_pos) + dat.final)
411 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
412 return *block_begin_pos - uint32(symbols[0])
413 }
414 }
415
416OVERLAP:
417 // Resize the array if necessary
418 dat.resize(int(base) + dat.final)
419 for _, a := range symbols {
420 if dat.array[int(base)+a].getCheck() != 0 {
421 base++
422 goto OVERLAP
423 }
424 }
425 return base
426}
427
Akron3610f102021-08-08 14:13:25 +0200428// List all outgoing transitions for a state
429// for testing purposes
430func (dat *DaTokenizer) outgoing(t uint32) []int {
431
432 valid := make([]int, 0, len(dat.sigma))
433
434 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200435 t1 := dat.array[t].getBase() + uint32(a)
436 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200437 valid = append(valid, a)
438 }
439 }
440
441 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200442 t1 := dat.array[t].getBase() + uint32(a)
443 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200444 valid = append(valid, -1*a)
445 }
446 }
447
448 sort.Ints(valid)
449
450 return valid
451}
452
Akron4fa28b32021-08-27 10:55:41 +0200453// TransCount as the number of transitions aka arcs in the
454// finite state automaton
455func (dat *DaTokenizer) TransCount() int {
456 // Cache the transCount
457 if dat.transCount > 0 {
458 return dat.transCount
459 }
460
461 dat.transCount = 0
462 for x := 1; x < len(dat.array); x++ {
463 if dat.array[x].getBase() != 0 {
464 dat.transCount++
465 }
466 }
467
468 return dat.transCount
469}
470
Akron03a3c612021-08-04 11:51:27 +0200471// LoadFactor as defined in Kanda et al (2018),
472// i.e. the proportion of non-empty elements to all elements.
473func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200474 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200475}
476
Akron439f4ec2021-08-09 15:45:38 +0200477// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200478func (dat *DaTokenizer) Save(file string) (n int64, err error) {
479 f, err := os.Create(file)
480 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200481 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200482 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200483 }
484 defer f.Close()
485 gz := gzip.NewWriter(f)
486 defer gz.Close()
487 n, err = dat.WriteTo(gz)
488 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200489 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200490 return n, err
491 }
492 gz.Flush()
493 return n, nil
494}
495
496// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200497func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
498
Akron3a063ef2021-08-05 19:36:35 +0200499 wb := bufio.NewWriter(w)
500 defer wb.Flush()
501
Akron6247a5d2021-08-03 19:18:28 +0200502 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200503 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200504 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200505 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200506 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200507 }
508
509 // Get sigma as a list
510 sigmalist := make([]rune, len(dat.sigma)+16)
511 max := 0
512 for sym, num := range dat.sigma {
513 sigmalist[num] = sym
514 if num > max {
515 max = num
516 }
517 }
518
519 sigmalist = sigmalist[:max+1]
520
Akron3f8571a2021-08-05 11:18:10 +0200521 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200522 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200523 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
524 bo.PutUint16(buf[4:6], uint16(dat.unknown))
525 bo.PutUint16(buf[6:8], uint16(dat.identity))
526 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200527 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200528 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200529 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200530 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200531 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200532 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200533 }
534
535 all += more
536
Akron6247a5d2021-08-03 19:18:28 +0200537 // Write sigma
538 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200539
Akron3a063ef2021-08-05 19:36:35 +0200540 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200541 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200542 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200543 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200544 }
545 all += more
546 }
Akron439f4ec2021-08-09 15:45:38 +0200547
Akron6247a5d2021-08-03 19:18:28 +0200548 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200549 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200550 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200551 }
Akron6247a5d2021-08-03 19:18:28 +0200552
553 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200554 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200555 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200556 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200557 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200558 }
559 all += more
560
Akronf1a16502021-08-16 15:24:38 +0200561 // for x := 0; x < len(dat.array); x++ {
562 for _, bc := range dat.array {
563 bo.PutUint32(buf[0:4], bc.base)
564 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200565 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200566 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200567 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200568 }
Akron439f4ec2021-08-09 15:45:38 +0200569 all += more
Akron3a063ef2021-08-05 19:36:35 +0200570 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200571 log.Println("Can not write base uint32")
572 return int64(all), err
573 }
574 bo.PutUint32(buf[0:4], bc.check)
575 more, err = wb.Write(buf[0:4])
576 if err != nil {
577 log.Println(err)
578 return int64(all), err
579 }
580 all += more
581 if more != 4 {
582 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200583 return int64(all), err
584 }
Akron6247a5d2021-08-03 19:18:28 +0200585 }
586
587 return int64(all), err
588}
589
Akron439f4ec2021-08-09 15:45:38 +0200590// LoadDatokFile reads a double array represented tokenizer
591// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200592func LoadDatokFile(file string) *DaTokenizer {
593 f, err := os.Open(file)
594 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200595 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200596 return nil
Akron3f8571a2021-08-05 11:18:10 +0200597 }
598 defer f.Close()
599
600 gz, err := gzip.NewReader(f)
601 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200602 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200603 return nil
Akron3f8571a2021-08-05 11:18:10 +0200604 }
605 defer gz.Close()
606
Akron3a063ef2021-08-05 19:36:35 +0200607 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200608 return ParseDatok(gz)
609}
610
Akron439f4ec2021-08-09 15:45:38 +0200611// LoadDatokFile reads a double array represented tokenizer
612// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200613func ParseDatok(ior io.Reader) *DaTokenizer {
614
Akron439f4ec2021-08-09 15:45:38 +0200615 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200616 dat := &DaTokenizer{
617 sigma: make(map[rune]int),
618 epsilon: 0,
619 unknown: 0,
620 identity: 0,
621 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200622 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200623 }
624
625 r := bufio.NewReader(ior)
626
Akron3f8571a2021-08-05 11:18:10 +0200627 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200628 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200629
Akron439f4ec2021-08-09 15:45:38 +0200630 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200631
632 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200633 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200634 return nil
635 }
636
Akron16c312e2021-09-26 13:11:12 +0200637 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200638 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200639 return nil
640 }
641
Akron439f4ec2021-08-09 15:45:38 +0200642 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200643 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200644 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200645 return nil
646 }
647
Akron439f4ec2021-08-09 15:45:38 +0200648 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200649 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200650 return nil
651 }
Akron3f8571a2021-08-05 11:18:10 +0200652
Akron3a063ef2021-08-05 19:36:35 +0200653 version := bo.Uint16(buf[0:2])
654
655 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200656 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200657 return nil
658 }
659
Akron3f8571a2021-08-05 11:18:10 +0200660 dat.epsilon = int(bo.Uint16(buf[2:4]))
661 dat.unknown = int(bo.Uint16(buf[4:6]))
662 dat.identity = int(bo.Uint16(buf[6:8]))
663 dat.final = int(bo.Uint16(buf[8:10]))
664
665 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200666 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200667
Akron3a063ef2021-08-05 19:36:35 +0200668 // Shouldn't be relevant though
669 dat.maxSize = arraySize - 1
670
Akron00cecd12021-12-05 13:14:03 +0100671 // Init with identity
672 if dat.identity != -1 {
673 for i := 0; i < 256; i++ {
674 dat.sigmaASCII[i] = dat.identity
675 }
676 }
677
Akron3f8571a2021-08-05 11:18:10 +0200678 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200679 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200680 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200681 if int(sym) < 256 {
682 dat.sigmaASCII[int(sym)] = x
683 }
Akron3f8571a2021-08-05 11:18:10 +0200684 dat.sigma[sym] = x
685 }
Akron3f8571a2021-08-05 11:18:10 +0200686 }
687
Akron439f4ec2021-08-09 15:45:38 +0200688 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200689
690 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200691 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200692 return nil
693 }
694
Akron3f8571a2021-08-05 11:18:10 +0200695 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200696 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200697 return nil
698 }
699
700 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200701 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200702
Akronbb4aac52021-08-13 00:52:27 +0200703 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200704
Akronbb4aac52021-08-13 00:52:27 +0200705 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200706 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200707 return nil
708 }
709
Akronf1a16502021-08-16 15:24:38 +0200710 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200711 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200712 return nil
713 }
714
715 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200716 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
717 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200718 }
719
720 return dat
721}
722
Akron439f4ec2021-08-09 15:45:38 +0200723// Show the current state of the buffer,
724// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200725func showBuffer(buffer []rune, buffo int, buffi int) string {
726 out := make([]rune, 0, 1024)
727 for x := 0; x < len(buffer); x++ {
728 if buffi == x {
729 out = append(out, '^')
730 }
731 if buffo == x {
732 out = append(out, '[', buffer[x], ']')
733 } else {
734 out = append(out, buffer[x])
735 }
736 }
737 return string(out)
738}
739
Akron98fbfef2021-10-23 17:02:11 +0200740// Show the current state of the buffer,
741// for testing puroses
742func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
743 out := make([]rune, 0, 1024)
744 for x := 0; x < len(buffer); x++ {
745 if buffi == x {
746 out = append(out, '^')
747 }
748 if bufft == x {
749 out = append(out, '|')
750 }
751 if buffc == x {
752 out = append(out, '[', buffer[x], ']')
753 } else {
754 out = append(out, buffer[x])
755 }
756 }
757 return string(out)
758}
759
760// Transduce input to ouutput
Akrone396a932021-10-19 01:06:13 +0200761func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200762 return dat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200763}
764
765// TransduceTokenWriter transduces an input string against
766// the double array FSA. The rules are always greedy. If the
767// automaton fails, it takes the last possible token ending
768// branch.
Akron068874c2021-08-04 15:19:56 +0200769//
Akron4db3ecf2021-08-11 18:49:03 +0200770// Based on Mizobuchi et al (2000), p. 129,
771// with additional support for IDENTITY, UNKNOWN
772// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron4f6b28c2021-10-25 00:52:03 +0200773func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron068874c2021-08-04 15:19:56 +0200774 var a int
Akronb4bbb472021-08-09 11:49:38 +0200775 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200776 t := uint32(1) // Initial state
777 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200778
Akron3610f102021-08-08 14:13:25 +0200779 // Remember the last position of a possible tokenend,
780 // in case the automaton fails.
781 epsilonState := uint32(0)
782 epsilonOffset := 0
783
Akrone396a932021-10-19 01:06:13 +0200784 // Remember if the last transition was epsilon
785 sentenceEnd := false
786
Akrona854faa2021-10-22 19:31:08 +0200787 // Remember if a text end was already set
788 textEnd := false
789
Akron3610f102021-08-08 14:13:25 +0200790 // Implement a low level buffer for full control,
791 // however - it is probably better to introduce
792 // this on a higher level with a io.Reader interface
793 // The buffer stores a single word and may have white
794 // space at the end (but not at the beginning).
795 //
796 // This is the only backtracking requirement because of
797 // epsilon transitions, to support tokenizations like:
798 // "this is an example|.| And it works." vs
799 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200800 //
801 // TODO:
802 // Store a translation buffer as well, so characters don't
803 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200804 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200805 bufft := 0 // Buffer token offset
806 buffc := 0 // Buffer current symbol
Akron3610f102021-08-08 14:13:25 +0200807 buffi := 0 // Buffer length
808
Akron98fbfef2021-10-23 17:02:11 +0200809 // The buffer is organized as follows:
810 // [ t[....c..]..i]
811
Akron3f8571a2021-08-05 11:18:10 +0200812 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200813 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200814
Akron3f8571a2021-08-05 11:18:10 +0200815 var char rune
Akrone396a932021-10-19 01:06:13 +0200816
Akron3f8571a2021-08-05 11:18:10 +0200817 var err error
818 eof := false
Akrona854faa2021-10-22 19:31:08 +0200819 eot := false
Akronb7e1f132021-08-10 11:52:31 +0200820 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200821
Akronc5d8d432021-08-10 16:48:44 +0200822PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200823 for {
824
Akronb7e1f132021-08-10 11:52:31 +0200825 if newchar {
826 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200827 if buffc >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200828 if eof {
829 break
830 }
Akronb7e1f132021-08-10 11:52:31 +0200831 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200832
Akronb7e1f132021-08-10 11:52:31 +0200833 // No more runes to read
834 if err != nil {
835 eof = true
836 break
837 }
838 buffer[buffi] = char
839 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200840 }
Akronb7e1f132021-08-10 11:52:31 +0200841
Akron98fbfef2021-10-23 17:02:11 +0200842 char = buffer[buffc]
Akronb7e1f132021-08-10 11:52:31 +0200843
844 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100845 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akronb7e1f132021-08-10 11:52:31 +0200846 }
847
Akrona854faa2021-10-22 19:31:08 +0200848 eot = false
849
Akron6f1c16c2021-08-17 10:45:42 +0200850 // TODO:
851 // Better not repeatedly check for a!
852 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200853 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200854 if int(char) == EOT {
855 eot = true
856 }
Akronea46e8a2021-08-17 00:36:31 +0200857 a = dat.sigmaASCII[int(char)]
858 } else {
859 a, ok = dat.sigma[char]
Akronb7e1f132021-08-10 11:52:31 +0200860
Akron4880fb62021-12-05 12:03:05 +0100861 // Use identity symbol if character is not in sigma
862 if !ok && dat.identity != -1 {
863 a = dat.identity
864 }
Akronb7e1f132021-08-10 11:52:31 +0200865 }
866
867 t0 = t
868
869 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200870 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akrondf275812022-03-27 12:54:46 +0200871
Akronb7e1f132021-08-10 11:52:31 +0200872 // Remember state for backtracking to last tokenend state
873 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200874 epsilonOffset = buffc
Akrone396a932021-10-19 01:06:13 +0200875
876 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100877 log.Println("epsilonOffset is set to", buffc)
Akrone396a932021-10-19 01:06:13 +0200878 }
Akronb7e1f132021-08-10 11:52:31 +0200879 }
Akron3f8571a2021-08-05 11:18:10 +0200880 }
Akron3610f102021-08-08 14:13:25 +0200881
Akronb7e1f132021-08-10 11:52:31 +0200882 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200883 t = dat.array[t0].getBase() + uint32(a)
884 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200885
Akron524c5432021-08-05 14:14:27 +0200886 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200887 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100888 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akronb7e1f132021-08-10 11:52:31 +0200889 if false {
Akron9c3bf7f2021-11-03 19:52:12 +0100890 log.Println(dat.outgoing(t0))
Akronb7e1f132021-08-10 11:52:31 +0200891 }
Akron524c5432021-08-05 14:14:27 +0200892 }
893
Akronb7e1f132021-08-10 11:52:31 +0200894 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200895 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200896
897 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100898 log.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200899 }
900
901 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200902
Akron068874c2021-08-04 15:19:56 +0200903 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200904 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200905 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100906 log.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200907 }
908 a = dat.unknown
909
Akrondf275812022-03-27 12:54:46 +0200910 } else if a != dat.epsilon && epsilonState != 0 {
Akronb4bbb472021-08-09 11:49:38 +0200911
Akron068874c2021-08-04 15:19:56 +0200912 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200913 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200914 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200915 buffc = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200916 a = dat.epsilon
917
Akron3610f102021-08-08 14:13:25 +0200918 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100919 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200920 }
Akronb4bbb472021-08-09 11:49:38 +0200921
Akron068874c2021-08-04 15:19:56 +0200922 } else {
Akrondf275812022-03-27 12:54:46 +0200923
924 if DEBUG {
925 log.Println("Fail!")
926 }
927
928 // w.Fail(bufft)
929
930 // The following procedure means the automaton fails to consume a certain character.
931 // In the tokenization scenario, this means, the tokenizer will drop the old or current data as a
932 // token and start blank at the root node of the automaton for the remaining data.
933 // It may be beneficial to have something like a "drop()" event to capture these cases,
934 // as they are likely the result of a bad automaton design.
Akroncae39112023-04-26 19:43:16 +0200935 if buffc-bufft <= 0 {
Akrondf275812022-03-27 12:54:46 +0200936 buffc++
Akroncae39112023-04-26 19:43:16 +0200937 if buffc == 0 {
938 eof = true
939 break
940 }
Akrondf275812022-03-27 12:54:46 +0200941 }
942
943 if DEBUG {
944 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
945 }
946 w.Token(bufft, buffer[:buffc])
947
948 sentenceEnd = false
949 textEnd = false
950
951 if DEBUG {
952 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
953 }
954
955 for x, i := range buffer[buffc:buffi] {
956 buffer[x] = i
957 }
958
959 buffi -= buffc
960 epsilonState = 0
961
962 buffc = 0
963 bufft = 0
964
965 a = dat.epsilon
966
967 // Restart from root state
968 t = uint32(1)
969 newchar = true
970 // goto PARSECHARM
971 continue
Akron068874c2021-08-04 15:19:56 +0200972 }
Akron068874c2021-08-04 15:19:56 +0200973
Akronb7e1f132021-08-10 11:52:31 +0200974 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200975 eot = false
Akronb7e1f132021-08-10 11:52:31 +0200976 continue
Akronb4bbb472021-08-09 11:49:38 +0200977 }
978
Akronb7e1f132021-08-10 11:52:31 +0200979 // Transition was successful
980 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200981
982 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200983 if a != dat.epsilon {
984
Akron98fbfef2021-10-23 17:02:11 +0200985 buffc++
Akronb4bbb472021-08-09 11:49:38 +0200986
Akron439f4ec2021-08-09 15:45:38 +0200987 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200988 if buffc-bufft == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200989 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100990 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200991 }
Akron98fbfef2021-10-23 17:02:11 +0200992 bufft++
993 // rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200994 }
Akron068874c2021-08-04 15:19:56 +0200995
Akrone396a932021-10-19 01:06:13 +0200996 } else {
Akron524c5432021-08-05 14:14:27 +0200997
Akrone396a932021-10-19 01:06:13 +0200998 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200999 if buffc-bufft > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001000 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001001 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001002 }
Akron32416ce2021-10-23 17:09:41 +02001003 w.Token(bufft, buffer[:buffc])
Akronc5d8d432021-08-10 16:48:44 +02001004 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +02001005 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +02001006 textEnd = false
Akrone396a932021-10-19 01:06:13 +02001007 } else {
1008 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +02001009 w.SentenceEnd(0)
Akron1594cb82021-08-11 11:14:56 +02001010 }
Akron439f4ec2021-08-09 15:45:38 +02001011 }
Akron3610f102021-08-08 14:13:25 +02001012
Akron8cc2dd92021-10-25 19:49:41 +02001013 if eot {
1014 eot = false
1015 textEnd = true
1016 w.TextEnd(0)
1017 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001018 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +02001019 }
1020 }
1021
Akronc5d8d432021-08-10 16:48:44 +02001022 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001023 if rewindBuffer {
1024
Akrone396a932021-10-19 01:06:13 +02001025 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001026 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akrone396a932021-10-19 01:06:13 +02001027 }
1028
Akron439f4ec2021-08-09 15:45:38 +02001029 // TODO: Better as a ring buffer
Akron04335c62021-10-28 11:56:00 +02001030 // buffer = buffer[buffc:] !slower
Akron98fbfef2021-10-23 17:02:11 +02001031 for x, i := range buffer[buffc:buffi] {
Akron3610f102021-08-08 14:13:25 +02001032 buffer[x] = i
1033 }
Akronb4bbb472021-08-09 11:49:38 +02001034
Akron98fbfef2021-10-23 17:02:11 +02001035 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +02001036 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +02001037 epsilonOffset = 0
1038 epsilonState = 0
1039
Akron98fbfef2021-10-23 17:02:11 +02001040 buffc = 0
1041 bufft = 0
Akrona854faa2021-10-22 19:31:08 +02001042
Akron98fbfef2021-10-23 17:02:11 +02001043 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001044 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akron98fbfef2021-10-23 17:02:11 +02001045 }
1046 }
1047
Akronb7e1f132021-08-10 11:52:31 +02001048 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001049 if ta.isSeparate() {
1050 t = ta.getBase()
1051 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001052
1053 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001054 log.Println("Representative pointing to", t)
Akronb7e1f132021-08-10 11:52:31 +02001055 }
1056 }
1057
Akronc5d8d432021-08-10 16:48:44 +02001058 newchar = true
1059
Akron068874c2021-08-04 15:19:56 +02001060 // TODO:
Akrondf275812022-03-27 12:54:46 +02001061 // Prevent endless epsilon loops by checking
1062 // the model has no epsilon loops1
Akron068874c2021-08-04 15:19:56 +02001063 }
1064
Akron439f4ec2021-08-09 15:45:38 +02001065 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001066 if !eof {
Akron068874c2021-08-04 15:19:56 +02001067 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001068 log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001069 }
Akrondf275812022-03-27 12:54:46 +02001070 // This should never happen
Akron068874c2021-08-04 15:19:56 +02001071 return false
1072 }
1073
Akronb7e1f132021-08-10 11:52:31 +02001074 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001075 log.Println("Entering final check")
Akronb7e1f132021-08-10 11:52:31 +02001076 }
1077
Akrona854faa2021-10-22 19:31:08 +02001078 // Check epsilon transitions as long as possible
Akronc5d8d432021-08-10 16:48:44 +02001079 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001080 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001081 a = dat.epsilon
1082 newchar = false
Akrone396a932021-10-19 01:06:13 +02001083
Akronf1a16502021-08-16 15:24:38 +02001084 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001085 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001086 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001087
Akronc5d8d432021-08-10 16:48:44 +02001088 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001089 t0 = epsilonState
1090 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +02001091 buffc = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001092 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001093 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron068874c2021-08-04 15:19:56 +02001094 }
Akronc5d8d432021-08-10 16:48:44 +02001095 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001096 }
Akrone396a932021-10-19 01:06:13 +02001097
Akrondf275812022-03-27 12:54:46 +02001098 // something left in buffer
1099 if buffc-bufft > 0 {
1100 if DEBUG {
1101 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
1102 }
1103 w.Token(bufft, buffer[:buffc])
1104 sentenceEnd = false
1105 textEnd = false
1106 }
1107
Akrone396a932021-10-19 01:06:13 +02001108 // Add an additional sentence ending, if the file is over but no explicit
1109 // sentence split was reached. This may be controversial and therefore
1110 // optional via parameter.
1111 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +02001112 w.SentenceEnd(0)
1113
Akrone396a932021-10-19 01:06:13 +02001114 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001115 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +02001116 }
1117 }
1118
1119 if !textEnd {
1120 w.TextEnd(0)
1121
1122 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001123 log.Println("Text end")
Akrone396a932021-10-19 01:06:13 +02001124 }
1125 }
1126
1127 return true
Akron068874c2021-08-04 15:19:56 +02001128}