blob: fba655eecbc7f1fdd972d17153fba456c7db6050 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "io"
Akron679b4862021-09-02 16:59:26 +020032 "math"
Akron8ef408b2021-08-02 22:11:04 +020033 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron740f3d72021-08-03 12:12:34 +020035
Akron527c10c2021-08-13 01:45:18 +020036 "log"
Akron8ef408b2021-08-02 22:11:04 +020037)
38
39const (
Akron2a4b9292021-08-04 15:35:22 +020040 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020041 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020042 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020043 FIRSTBIT uint32 = 1 << 31
44 SECONDBIT uint32 = 1 << 30
45 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron03c92fe2021-08-09 14:07:57 +020048// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020049var bo binary.ByteOrder = binary.LittleEndian
50
Akron8ef408b2021-08-02 22:11:04 +020051type mapping struct {
52 source int
Akron3fdfec62021-08-04 11:40:10 +020053 target uint32
Akron8ef408b2021-08-02 22:11:04 +020054}
55
Akronf1a16502021-08-16 15:24:38 +020056type bc struct {
57 base uint32
58 check uint32
59}
60
Akron03c92fe2021-08-09 14:07:57 +020061// DaTokenizer represents a tokenizer implemented as a
62// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020063type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020064 sigma map[rune]int
65 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020066 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020067 transCount int
Akronf1a16502021-08-16 15:24:38 +020068 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron03c92fe2021-08-09 14:07:57 +020075 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020076}
77
Akron439f4ec2021-08-09 15:45:38 +020078// ToDoubleArray turns the intermediate tokenizer representation
79// into a double array representation.
80//
81// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020082func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020083
84 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020085 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020086 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020087 final: auto.final,
88 unknown: auto.unknown,
89 identity: auto.identity,
90 epsilon: auto.epsilon,
91 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020092 }
93
Akronf1a16502021-08-16 15:24:38 +020094 dat.resize(dat.final)
95
Akron00cecd12021-12-05 13:14:03 +010096 // Init with identity
97 if dat.identity != -1 {
98 for i := 0; i < 256; i++ {
99 dat.sigmaASCII[i] = dat.identity
100 }
101 }
102
Akron1c34ce62021-09-23 23:27:39 +0200103 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200104 if int(sym) < 256 {
105 dat.sigmaASCII[int(sym)] = num
106 }
Akronf2120ca2021-08-03 16:26:41 +0200107 dat.sigma[sym] = num
108 }
Akron8ef408b2021-08-02 22:11:04 +0200109
110 mark := 0
111 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200112 var base uint32
Akronde18e902021-08-27 09:34:12 +0200113 var atrans *edge
114 var s, s1 int
115 var t, t1 uint32
Akron8e803932023-04-18 10:19:19 +0200116 var diff int
Akron8ef408b2021-08-02 22:11:04 +0200117
Akron439f4ec2021-08-09 15:45:38 +0200118 // Create a mapping from s (in Ms aka Intermediate FSA)
119 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200120 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200121 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200122
Akron439f4ec2021-08-09 15:45:38 +0200123 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200124 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200125 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200126 size++
127
Akron740f3d72021-08-03 12:12:34 +0200128 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200129 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200130 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
131 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200132
Akron7b1faa62021-09-02 16:10:21 +0200133 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200134
Akron8ef408b2021-08-02 22:11:04 +0200135 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200136 s = table[mark].source // This is a state in Ms
137 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200138 // s = tableQueue[mark]
139 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200140 mark++
Akron740f3d72021-08-03 12:12:34 +0200141
142 // Following the paper, here the state t can be remembered
143 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200144 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200145 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200146
Akron740f3d72021-08-03 12:12:34 +0200147 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200148 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200149 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200150 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200151 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200152 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200153
Akron773b1ef2021-08-03 17:37:20 +0200154 // TODO:
Akron068874c2021-08-04 15:19:56 +0200155 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200156 // outdegree of .end
157
Akron740f3d72021-08-03 12:12:34 +0200158 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200159 for _, a := range A {
160
Akron1c34ce62021-09-23 23:27:39 +0200161 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200162
Akron1c34ce62021-09-23 23:27:39 +0200163 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200164
Akron740f3d72021-08-03 12:12:34 +0200165 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200166 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200167
Akron740f3d72021-08-03 12:12:34 +0200168 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200169 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200170 dat.array[t1].setCheck(t)
171
172 // Set maxSize
Akron72a64222023-04-26 17:00:45 +0200173 // New: dat.maxSize = max(dat.maxSize, int(t1))
Akronf1a16502021-08-16 15:24:38 +0200174 if dat.maxSize < int(t1) {
175 dat.maxSize = int(t1)
176 }
Akron8ef408b2021-08-02 22:11:04 +0200177
Akron439f4ec2021-08-09 15:45:38 +0200178 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100179 log.Println("Translate transition",
Akron524c5432021-08-05 14:14:27 +0200180 s, "->", s1, "(", a, ")", "to", t, "->", t1)
181 }
182
Akron83e75a22021-08-04 13:14:06 +0200183 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200184 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200185 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200186 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100187 log.Println("Set", t1, "to nontoken")
Akron524c5432021-08-05 14:14:27 +0200188 }
Akron83e75a22021-08-04 13:14:06 +0200189 }
190
Akron84d68e62021-08-04 17:06:52 +0200191 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200192 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200193 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200194 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100195 log.Println("Set", t1, "to tokenend")
Akron524c5432021-08-05 14:14:27 +0200196 }
Akron84d68e62021-08-04 17:06:52 +0200197 }
198
Akron740f3d72021-08-03 12:12:34 +0200199 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200200 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200201 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200202
Akron439f4ec2021-08-09 15:45:38 +0200203 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200204 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200205 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200206 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200207 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200208 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200209 size++
210 } else {
Akron740f3d72021-08-03 12:12:34 +0200211 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200212 dat.array[t1].setBase(r)
213 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200214 }
215 } else {
Akron740f3d72021-08-03 12:12:34 +0200216 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200217 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200218
Akron8e803932023-04-18 10:19:19 +0200219 // Find max
220 // see https://dev.to/jobinrjohnson/branchless-programming-does-it-really-matter-20j4
221 diff = dat.maxSize - (int(base) + dat.final)
222 dat.maxSize -= (diff & (diff >> 31))
Akron8ef408b2021-08-02 22:11:04 +0200223 }
224 }
225 }
226
227 // Following Mizobuchi et al (2000) the size of the
228 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200229 // We make the size a bit larger so we never have to check for boundaries.
230 dat.setSize(dat.maxSize + dat.final)
231 if len(dat.array) < dat.maxSize+dat.final {
232 dat.array = append(dat.array, make([]bc, dat.final)...)
233 }
234 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200235 return dat
Akron8ef408b2021-08-02 22:11:04 +0200236}
237
Akron8ef408b2021-08-02 22:11:04 +0200238// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200239// exists and return this as a representative.
240// Currently iterates through the whole table
241// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200242func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200243 for x := 0; x < size; x++ {
244 if table[x].source == s {
245 return table[x].target
246 }
247 }
248 return 0
249}
250
Akron941f2152021-09-26 15:14:25 +0200251// Type of tokenizer
252func (DaTokenizer) Type() string {
253 return DAMAGIC
254}
255
Akron64ffd9a2021-08-03 19:55:21 +0200256// Resize double array when necessary
257func (dat *DaTokenizer) resize(l int) {
258 // TODO:
259 // This is a bit too aggressive atm and should be calmed down.
260 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200261 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200262 }
Akron64ffd9a2021-08-03 19:55:21 +0200263}
Akronc9d84a62021-08-03 15:56:03 +0200264
Akron64ffd9a2021-08-03 19:55:21 +0200265// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200266func (bc *bc) setBase(v uint32) {
267 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200268}
269
270// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200271func (bc *bc) getBase() uint32 {
272 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200273}
274
275// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200276func (bc *bc) setCheck(v uint32) {
277 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200278}
279
280// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200281func (bc *bc) getCheck() uint32 {
282 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200283}
284
Akron3fdfec62021-08-04 11:40:10 +0200285// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200286func (bc *bc) isSeparate() bool {
287 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200288}
289
290// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200291func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200292 if sep {
Akronf1a16502021-08-16 15:24:38 +0200293 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200294 } else {
Akronf1a16502021-08-16 15:24:38 +0200295 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200296 }
297}
298
Akron83e75a22021-08-04 13:14:06 +0200299// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200300func (bc *bc) isNonToken() bool {
301 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200302}
303
304// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200305func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200306 if sep {
Akronf1a16502021-08-16 15:24:38 +0200307 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200308 } else {
Akronf1a16502021-08-16 15:24:38 +0200309 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200310 }
311}
312
Akron84d68e62021-08-04 17:06:52 +0200313// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200314func (bc *bc) isTokenEnd() bool {
315 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200316}
317
318// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200319func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200320 if sep {
Akronf1a16502021-08-16 15:24:38 +0200321 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200322 } else {
Akronf1a16502021-08-16 15:24:38 +0200323 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200324 }
325}
326
Akron64ffd9a2021-08-03 19:55:21 +0200327// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200328func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200329 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200330}
331
332// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200333func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200334 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200335}
336
337// Based on Mizobuchi et al (2000), p. 124
338// This iterates for every state through the complete double array
339// structure until it finds a gap that fits all outgoing transitions
340// of the state. This is extremely slow, but is only necessary in the
341// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200342func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200343
344 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200345 base := uint32(1)
346
Akron8ef408b2021-08-02 22:11:04 +0200347OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200348 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200349 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200350 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200351 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200352 base++
353 goto OVERLAP
354 }
355 }
Akron8ef408b2021-08-02 22:11:04 +0200356 return base
357}
358
Akron679b4862021-09-02 16:59:26 +0200359// This is an implementation of xCheck with the skip-improvement
360// proposed by Morita et al. (2001)
361func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
362
363 // Start at the first entry of the double array list
364 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
365
366OVERLAP:
367 // Resize the array if necessary
368 dat.resize(int(base) + dat.final)
369 for _, a := range symbols {
370 if dat.array[int(base)+a].getCheck() != 0 {
371 base++
372 goto OVERLAP
373 }
374 }
375 return base
376}
377
Akron29e306f2021-09-02 18:29:56 +0200378// This is an implementation of xCheck with the skip-improvement
379// proposed by Morita et al. (2001) for higher outdegrees as
380// proposed by Niu et al. (2013)
381func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
382
383 // Start at the first entry of the double array list
384 base := uint32(1)
385
386 // Or skip the first few entries
387 if len(symbols) >= 3 {
388 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
389 }
390
391OVERLAP:
392 // Resize the array if necessary
393 dat.resize(int(base) + dat.final + 1)
394 for _, a := range symbols {
395 if dat.array[int(base)+a].getCheck() != 0 {
396 base++
397 goto OVERLAP
398 }
399 }
400 return base
401}
402
Akron7b1faa62021-09-02 16:10:21 +0200403// This is an implementation of xCheck wit an improvement
404// proposed by Niu et al. (2013)
405func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
406
407 // Start at the first entry of the double array list
408 base := uint32(1)
409
410 if len(symbols) > 3 {
411 sort.Ints(symbols)
412 if *block_begin_pos > uint32(symbols[0]) {
413 dat.resize(int(*block_begin_pos) + dat.final)
414 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
415 return *block_begin_pos - uint32(symbols[0])
416 }
417 }
418
419OVERLAP:
420 // Resize the array if necessary
421 dat.resize(int(base) + dat.final)
422 for _, a := range symbols {
423 if dat.array[int(base)+a].getCheck() != 0 {
424 base++
425 goto OVERLAP
426 }
427 }
428 return base
429}
430
Akron3610f102021-08-08 14:13:25 +0200431// List all outgoing transitions for a state
432// for testing purposes
433func (dat *DaTokenizer) outgoing(t uint32) []int {
434
435 valid := make([]int, 0, len(dat.sigma))
436
437 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200438 t1 := dat.array[t].getBase() + uint32(a)
439 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200440 valid = append(valid, a)
441 }
442 }
443
444 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200445 t1 := dat.array[t].getBase() + uint32(a)
446 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200447 valid = append(valid, -1*a)
448 }
449 }
450
451 sort.Ints(valid)
452
453 return valid
454}
455
Akron4fa28b32021-08-27 10:55:41 +0200456// TransCount as the number of transitions aka arcs in the
457// finite state automaton
458func (dat *DaTokenizer) TransCount() int {
459 // Cache the transCount
460 if dat.transCount > 0 {
461 return dat.transCount
462 }
463
464 dat.transCount = 0
465 for x := 1; x < len(dat.array); x++ {
Akron8e803932023-04-18 10:19:19 +0200466
467 // Hopefully branchless
Akron4fa28b32021-08-27 10:55:41 +0200468 if dat.array[x].getBase() != 0 {
469 dat.transCount++
470 }
471 }
472
473 return dat.transCount
474}
475
Akron03a3c612021-08-04 11:51:27 +0200476// LoadFactor as defined in Kanda et al (2018),
477// i.e. the proportion of non-empty elements to all elements.
478func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200479 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200480}
481
Akron439f4ec2021-08-09 15:45:38 +0200482// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200483func (dat *DaTokenizer) Save(file string) (n int64, err error) {
484 f, err := os.Create(file)
485 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200486 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200487 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200488 }
489 defer f.Close()
490 gz := gzip.NewWriter(f)
491 defer gz.Close()
492 n, err = dat.WriteTo(gz)
493 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200494 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200495 return n, err
496 }
497 gz.Flush()
498 return n, nil
499}
500
501// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200502func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
503
Akron3a063ef2021-08-05 19:36:35 +0200504 wb := bufio.NewWriter(w)
505 defer wb.Flush()
506
Akron6247a5d2021-08-03 19:18:28 +0200507 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200508 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200509 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200510 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200511 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200512 }
513
514 // Get sigma as a list
515 sigmalist := make([]rune, len(dat.sigma)+16)
516 max := 0
517 for sym, num := range dat.sigma {
518 sigmalist[num] = sym
Akron8e803932023-04-18 10:19:19 +0200519
520 // Find max
521 max -= ((max - num) & ((max - num) >> 31))
522 // if num > max {
523 // max = num
524 // }
Akron6247a5d2021-08-03 19:18:28 +0200525 }
526
527 sigmalist = sigmalist[:max+1]
528
Akron3f8571a2021-08-05 11:18:10 +0200529 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200530 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200531 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
532 bo.PutUint16(buf[4:6], uint16(dat.unknown))
533 bo.PutUint16(buf[6:8], uint16(dat.identity))
534 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200535 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200536 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200537 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200538 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200539 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200540 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200541 }
542
543 all += more
544
Akron6247a5d2021-08-03 19:18:28 +0200545 // Write sigma
546 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200547
Akron3a063ef2021-08-05 19:36:35 +0200548 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200549 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200550 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200551 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200552 }
553 all += more
554 }
Akron439f4ec2021-08-09 15:45:38 +0200555
Akron6247a5d2021-08-03 19:18:28 +0200556 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200557 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200558 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200559 }
Akron6247a5d2021-08-03 19:18:28 +0200560
561 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200562 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200563 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200564 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200565 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200566 }
567 all += more
568
Akronf1a16502021-08-16 15:24:38 +0200569 // for x := 0; x < len(dat.array); x++ {
570 for _, bc := range dat.array {
571 bo.PutUint32(buf[0:4], bc.base)
572 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200573 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200574 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200575 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200576 }
Akron439f4ec2021-08-09 15:45:38 +0200577 all += more
Akron3a063ef2021-08-05 19:36:35 +0200578 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200579 log.Println("Can not write base uint32")
580 return int64(all), err
581 }
582 bo.PutUint32(buf[0:4], bc.check)
583 more, err = wb.Write(buf[0:4])
584 if err != nil {
585 log.Println(err)
586 return int64(all), err
587 }
588 all += more
589 if more != 4 {
590 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200591 return int64(all), err
592 }
Akron6247a5d2021-08-03 19:18:28 +0200593 }
594
595 return int64(all), err
596}
597
Akron439f4ec2021-08-09 15:45:38 +0200598// LoadDatokFile reads a double array represented tokenizer
599// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200600func LoadDatokFile(file string) *DaTokenizer {
601 f, err := os.Open(file)
602 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200603 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200604 return nil
Akron3f8571a2021-08-05 11:18:10 +0200605 }
606 defer f.Close()
607
608 gz, err := gzip.NewReader(f)
609 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200610 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200611 return nil
Akron3f8571a2021-08-05 11:18:10 +0200612 }
613 defer gz.Close()
614
Akron3a063ef2021-08-05 19:36:35 +0200615 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200616 return ParseDatok(gz)
617}
618
Akron439f4ec2021-08-09 15:45:38 +0200619// LoadDatokFile reads a double array represented tokenizer
620// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200621func ParseDatok(ior io.Reader) *DaTokenizer {
622
Akron439f4ec2021-08-09 15:45:38 +0200623 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200624 dat := &DaTokenizer{
625 sigma: make(map[rune]int),
626 epsilon: 0,
627 unknown: 0,
628 identity: 0,
629 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200630 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200631 }
632
633 r := bufio.NewReader(ior)
634
Akron3f8571a2021-08-05 11:18:10 +0200635 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200636 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200637
Akron439f4ec2021-08-09 15:45:38 +0200638 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200639
640 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200641 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200642 return nil
643 }
644
Akron16c312e2021-09-26 13:11:12 +0200645 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200646 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200647 return nil
648 }
649
Akron439f4ec2021-08-09 15:45:38 +0200650 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200651 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200652 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200653 return nil
654 }
655
Akron439f4ec2021-08-09 15:45:38 +0200656 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200657 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200658 return nil
659 }
Akron3f8571a2021-08-05 11:18:10 +0200660
Akron3a063ef2021-08-05 19:36:35 +0200661 version := bo.Uint16(buf[0:2])
662
663 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200664 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200665 return nil
666 }
667
Akron3f8571a2021-08-05 11:18:10 +0200668 dat.epsilon = int(bo.Uint16(buf[2:4]))
669 dat.unknown = int(bo.Uint16(buf[4:6]))
670 dat.identity = int(bo.Uint16(buf[6:8]))
671 dat.final = int(bo.Uint16(buf[8:10]))
672
673 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200674 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200675
Akron3a063ef2021-08-05 19:36:35 +0200676 // Shouldn't be relevant though
677 dat.maxSize = arraySize - 1
678
Akron00cecd12021-12-05 13:14:03 +0100679 // Init with identity
680 if dat.identity != -1 {
681 for i := 0; i < 256; i++ {
682 dat.sigmaASCII[i] = dat.identity
683 }
684 }
685
Akron3f8571a2021-08-05 11:18:10 +0200686 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200687 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200688 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200689 if int(sym) < 256 {
690 dat.sigmaASCII[int(sym)] = x
691 }
Akron3f8571a2021-08-05 11:18:10 +0200692 dat.sigma[sym] = x
693 }
Akron3f8571a2021-08-05 11:18:10 +0200694 }
695
Akron439f4ec2021-08-09 15:45:38 +0200696 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200697
698 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200699 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200700 return nil
701 }
702
Akron3f8571a2021-08-05 11:18:10 +0200703 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200704 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200705 return nil
706 }
707
708 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200709 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200710
Akronbb4aac52021-08-13 00:52:27 +0200711 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200712
Akronbb4aac52021-08-13 00:52:27 +0200713 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200714 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200715 return nil
716 }
717
Akronf1a16502021-08-16 15:24:38 +0200718 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200719 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200720 return nil
721 }
722
723 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200724 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
725 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200726 }
727
728 return dat
729}
730
Akron439f4ec2021-08-09 15:45:38 +0200731// Show the current state of the buffer,
732// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200733func showBuffer(buffer []rune, buffo int, buffi int) string {
734 out := make([]rune, 0, 1024)
735 for x := 0; x < len(buffer); x++ {
736 if buffi == x {
737 out = append(out, '^')
738 }
739 if buffo == x {
740 out = append(out, '[', buffer[x], ']')
741 } else {
742 out = append(out, buffer[x])
743 }
744 }
745 return string(out)
746}
747
Akron98fbfef2021-10-23 17:02:11 +0200748// Show the current state of the buffer,
749// for testing puroses
750func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
751 out := make([]rune, 0, 1024)
752 for x := 0; x < len(buffer); x++ {
753 if buffi == x {
754 out = append(out, '^')
755 }
756 if bufft == x {
757 out = append(out, '|')
758 }
759 if buffc == x {
760 out = append(out, '[', buffer[x], ']')
761 } else {
762 out = append(out, buffer[x])
763 }
764 }
765 return string(out)
766}
767
768// Transduce input to ouutput
Akrone396a932021-10-19 01:06:13 +0200769func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200770 return dat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200771}
772
773// TransduceTokenWriter transduces an input string against
774// the double array FSA. The rules are always greedy. If the
775// automaton fails, it takes the last possible token ending
776// branch.
Akron068874c2021-08-04 15:19:56 +0200777//
Akron4db3ecf2021-08-11 18:49:03 +0200778// Based on Mizobuchi et al (2000), p. 129,
779// with additional support for IDENTITY, UNKNOWN
780// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron4f6b28c2021-10-25 00:52:03 +0200781func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron068874c2021-08-04 15:19:56 +0200782 var a int
Akronb4bbb472021-08-09 11:49:38 +0200783 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200784 t := uint32(1) // Initial state
785 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200786
Akron3610f102021-08-08 14:13:25 +0200787 // Remember the last position of a possible tokenend,
788 // in case the automaton fails.
789 epsilonState := uint32(0)
790 epsilonOffset := 0
791
Akrone396a932021-10-19 01:06:13 +0200792 // Remember if the last transition was epsilon
793 sentenceEnd := false
794
Akrona854faa2021-10-22 19:31:08 +0200795 // Remember if a text end was already set
796 textEnd := false
797
Akron3610f102021-08-08 14:13:25 +0200798 // Implement a low level buffer for full control,
799 // however - it is probably better to introduce
800 // this on a higher level with a io.Reader interface
801 // The buffer stores a single word and may have white
802 // space at the end (but not at the beginning).
803 //
804 // This is the only backtracking requirement because of
805 // epsilon transitions, to support tokenizations like:
806 // "this is an example|.| And it works." vs
807 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200808 //
809 // TODO:
810 // Store a translation buffer as well, so characters don't
811 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200812 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200813 bufft := 0 // Buffer token offset
814 buffc := 0 // Buffer current symbol
Akron3610f102021-08-08 14:13:25 +0200815 buffi := 0 // Buffer length
816
Akron98fbfef2021-10-23 17:02:11 +0200817 // The buffer is organized as follows:
818 // [ t[....c..]..i]
819
Akron3f8571a2021-08-05 11:18:10 +0200820 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200821 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200822
Akron3f8571a2021-08-05 11:18:10 +0200823 var char rune
Akrone396a932021-10-19 01:06:13 +0200824
Akron3f8571a2021-08-05 11:18:10 +0200825 var err error
826 eof := false
Akrona854faa2021-10-22 19:31:08 +0200827 eot := false
Akronb7e1f132021-08-10 11:52:31 +0200828 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200829
Akronc5d8d432021-08-10 16:48:44 +0200830PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200831 for {
832
Akronb7e1f132021-08-10 11:52:31 +0200833 if newchar {
834 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200835 if buffc >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200836 if eof {
837 break
838 }
Akronb7e1f132021-08-10 11:52:31 +0200839 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200840
Akronb7e1f132021-08-10 11:52:31 +0200841 // No more runes to read
842 if err != nil {
843 eof = true
844 break
845 }
846 buffer[buffi] = char
847 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200848 }
Akronb7e1f132021-08-10 11:52:31 +0200849
Akron98fbfef2021-10-23 17:02:11 +0200850 char = buffer[buffc]
Akronb7e1f132021-08-10 11:52:31 +0200851
852 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100853 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akronb7e1f132021-08-10 11:52:31 +0200854 }
855
Akrona854faa2021-10-22 19:31:08 +0200856 eot = false
857
Akron6f1c16c2021-08-17 10:45:42 +0200858 // TODO:
859 // Better not repeatedly check for a!
860 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200861 if int(char) < 256 {
Akron8e803932023-04-18 10:19:19 +0200862 eot = int(char) == EOT
Akronea46e8a2021-08-17 00:36:31 +0200863 a = dat.sigmaASCII[int(char)]
864 } else {
865 a, ok = dat.sigma[char]
Akronb7e1f132021-08-10 11:52:31 +0200866
Akron4880fb62021-12-05 12:03:05 +0100867 // Use identity symbol if character is not in sigma
868 if !ok && dat.identity != -1 {
869 a = dat.identity
870 }
Akronb7e1f132021-08-10 11:52:31 +0200871 }
872
873 t0 = t
874
875 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200876 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akrondf275812022-03-27 12:54:46 +0200877
Akronb7e1f132021-08-10 11:52:31 +0200878 // Remember state for backtracking to last tokenend state
879 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200880 epsilonOffset = buffc
Akrone396a932021-10-19 01:06:13 +0200881
882 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100883 log.Println("epsilonOffset is set to", buffc)
Akrone396a932021-10-19 01:06:13 +0200884 }
Akronb7e1f132021-08-10 11:52:31 +0200885 }
Akron3f8571a2021-08-05 11:18:10 +0200886 }
Akron3610f102021-08-08 14:13:25 +0200887
Akronb7e1f132021-08-10 11:52:31 +0200888 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200889 t = dat.array[t0].getBase() + uint32(a)
890 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200891
Akron524c5432021-08-05 14:14:27 +0200892 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200893 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100894 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akronb7e1f132021-08-10 11:52:31 +0200895 if false {
Akron9c3bf7f2021-11-03 19:52:12 +0100896 log.Println(dat.outgoing(t0))
Akronb7e1f132021-08-10 11:52:31 +0200897 }
Akron524c5432021-08-05 14:14:27 +0200898 }
899
Akronb7e1f132021-08-10 11:52:31 +0200900 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200901 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200902
903 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100904 log.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200905 }
906
907 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200908
Akron068874c2021-08-04 15:19:56 +0200909 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200910 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200911 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100912 log.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200913 }
914 a = dat.unknown
915
Akrondf275812022-03-27 12:54:46 +0200916 } else if a != dat.epsilon && epsilonState != 0 {
Akronb4bbb472021-08-09 11:49:38 +0200917
Akron068874c2021-08-04 15:19:56 +0200918 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200919 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200920 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200921 buffc = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200922 a = dat.epsilon
923
Akron3610f102021-08-08 14:13:25 +0200924 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100925 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200926 }
Akronb4bbb472021-08-09 11:49:38 +0200927
Akron068874c2021-08-04 15:19:56 +0200928 } else {
Akrondf275812022-03-27 12:54:46 +0200929
930 if DEBUG {
931 log.Println("Fail!")
932 }
933
934 // w.Fail(bufft)
935
936 // The following procedure means the automaton fails to consume a certain character.
937 // In the tokenization scenario, this means, the tokenizer will drop the old or current data as a
938 // token and start blank at the root node of the automaton for the remaining data.
939 // It may be beneficial to have something like a "drop()" event to capture these cases,
940 // as they are likely the result of a bad automaton design.
Akron8e803932023-04-18 10:19:19 +0200941 // Hopefully this is branchless code
Akroncae39112023-04-26 19:43:16 +0200942 if buffc-bufft <= 0 {
Akrondf275812022-03-27 12:54:46 +0200943 buffc++
Akroncae39112023-04-26 19:43:16 +0200944 if buffc == 0 {
945 eof = true
946 break
947 }
Akrondf275812022-03-27 12:54:46 +0200948 }
949
950 if DEBUG {
951 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
952 }
953 w.Token(bufft, buffer[:buffc])
954
955 sentenceEnd = false
956 textEnd = false
957
958 if DEBUG {
959 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
960 }
961
Akron8e803932023-04-18 10:19:19 +0200962 copy(buffer[0:], buffer[buffc:buffi])
Akrondf275812022-03-27 12:54:46 +0200963
964 buffi -= buffc
965 epsilonState = 0
966
967 buffc = 0
968 bufft = 0
969
970 a = dat.epsilon
971
972 // Restart from root state
973 t = uint32(1)
974 newchar = true
975 // goto PARSECHARM
976 continue
Akron068874c2021-08-04 15:19:56 +0200977 }
Akron068874c2021-08-04 15:19:56 +0200978
Akronb7e1f132021-08-10 11:52:31 +0200979 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200980 eot = false
Akronb7e1f132021-08-10 11:52:31 +0200981 continue
Akronb4bbb472021-08-09 11:49:38 +0200982 }
983
Akronb7e1f132021-08-10 11:52:31 +0200984 // Transition was successful
985 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200986
987 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200988 if a != dat.epsilon {
989
Akron98fbfef2021-10-23 17:02:11 +0200990 buffc++
Akronb4bbb472021-08-09 11:49:38 +0200991
Akron439f4ec2021-08-09 15:45:38 +0200992 // Transition does not produce a character
Akron8e803932023-04-18 10:19:19 +0200993 // Hopefully this is branchless
Akron98fbfef2021-10-23 17:02:11 +0200994 if buffc-bufft == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200995 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100996 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200997 }
Akron98fbfef2021-10-23 17:02:11 +0200998 bufft++
999 // rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001000 }
Akron068874c2021-08-04 15:19:56 +02001001
Akrone396a932021-10-19 01:06:13 +02001002 } else {
Akron524c5432021-08-05 14:14:27 +02001003
Akrone396a932021-10-19 01:06:13 +02001004 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +02001005 if buffc-bufft > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001006 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001007 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001008 }
Akron32416ce2021-10-23 17:09:41 +02001009 w.Token(bufft, buffer[:buffc])
Akronc5d8d432021-08-10 16:48:44 +02001010 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +02001011 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +02001012 textEnd = false
Akrone396a932021-10-19 01:06:13 +02001013 } else {
1014 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +02001015 w.SentenceEnd(0)
Akron1594cb82021-08-11 11:14:56 +02001016 }
Akron439f4ec2021-08-09 15:45:38 +02001017 }
Akron3610f102021-08-08 14:13:25 +02001018
Akron8cc2dd92021-10-25 19:49:41 +02001019 if eot {
1020 eot = false
1021 textEnd = true
1022 w.TextEnd(0)
1023 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001024 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +02001025 }
1026 }
1027
Akronc5d8d432021-08-10 16:48:44 +02001028 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001029 if rewindBuffer {
1030
Akrone396a932021-10-19 01:06:13 +02001031 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001032 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akrone396a932021-10-19 01:06:13 +02001033 }
1034
Akron439f4ec2021-08-09 15:45:38 +02001035 // TODO: Better as a ring buffer
Akron8e803932023-04-18 10:19:19 +02001036 copy(buffer[0:], buffer[buffc:buffi])
Akronb4bbb472021-08-09 11:49:38 +02001037
Akron98fbfef2021-10-23 17:02:11 +02001038 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +02001039 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +02001040 epsilonOffset = 0
1041 epsilonState = 0
1042
Akron98fbfef2021-10-23 17:02:11 +02001043 buffc = 0
1044 bufft = 0
Akrona854faa2021-10-22 19:31:08 +02001045
Akron98fbfef2021-10-23 17:02:11 +02001046 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001047 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akron98fbfef2021-10-23 17:02:11 +02001048 }
1049 }
1050
Akronb7e1f132021-08-10 11:52:31 +02001051 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001052 if ta.isSeparate() {
1053 t = ta.getBase()
1054 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001055
1056 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001057 log.Println("Representative pointing to", t)
Akronb7e1f132021-08-10 11:52:31 +02001058 }
1059 }
1060
Akronc5d8d432021-08-10 16:48:44 +02001061 newchar = true
1062
Akron068874c2021-08-04 15:19:56 +02001063 // TODO:
Akrondf275812022-03-27 12:54:46 +02001064 // Prevent endless epsilon loops by checking
1065 // the model has no epsilon loops1
Akron068874c2021-08-04 15:19:56 +02001066 }
1067
Akron439f4ec2021-08-09 15:45:38 +02001068 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001069 if !eof {
Akron068874c2021-08-04 15:19:56 +02001070 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001071 log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001072 }
Akrondf275812022-03-27 12:54:46 +02001073 // This should never happen
Akron068874c2021-08-04 15:19:56 +02001074 return false
1075 }
1076
Akronb7e1f132021-08-10 11:52:31 +02001077 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001078 log.Println("Entering final check")
Akronb7e1f132021-08-10 11:52:31 +02001079 }
1080
Akrona854faa2021-10-22 19:31:08 +02001081 // Check epsilon transitions as long as possible
Akronc5d8d432021-08-10 16:48:44 +02001082 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001083 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001084 a = dat.epsilon
1085 newchar = false
Akrone396a932021-10-19 01:06:13 +02001086
Akronf1a16502021-08-16 15:24:38 +02001087 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001088 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001089 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001090
Akronc5d8d432021-08-10 16:48:44 +02001091 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001092 t0 = epsilonState
1093 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +02001094 buffc = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001095 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001096 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron068874c2021-08-04 15:19:56 +02001097 }
Akronc5d8d432021-08-10 16:48:44 +02001098 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001099 }
Akrone396a932021-10-19 01:06:13 +02001100
Akrondf275812022-03-27 12:54:46 +02001101 // something left in buffer
1102 if buffc-bufft > 0 {
1103 if DEBUG {
1104 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
1105 }
1106 w.Token(bufft, buffer[:buffc])
1107 sentenceEnd = false
1108 textEnd = false
1109 }
1110
Akrone396a932021-10-19 01:06:13 +02001111 // Add an additional sentence ending, if the file is over but no explicit
1112 // sentence split was reached. This may be controversial and therefore
1113 // optional via parameter.
1114 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +02001115 w.SentenceEnd(0)
1116
Akrone396a932021-10-19 01:06:13 +02001117 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001118 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +02001119 }
1120 }
1121
1122 if !textEnd {
1123 w.TextEnd(0)
1124
1125 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001126 log.Println("Text end")
Akrone396a932021-10-19 01:06:13 +02001127 }
1128 }
1129
1130 return true
Akron068874c2021-08-04 15:19:56 +02001131}