blob: 45a9a195522e59a2f86f15f5865aa59b48f0e6fd [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
Akron679b4862021-09-02 16:59:26 +020033 "math"
Akron8ef408b2021-08-02 22:11:04 +020034 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron740f3d72021-08-03 12:12:34 +020036
Akron527c10c2021-08-13 01:45:18 +020037 "log"
Akron8ef408b2021-08-02 22:11:04 +020038)
39
40const (
Akron2a4b9292021-08-04 15:35:22 +020041 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020042 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020043 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020044 FIRSTBIT uint32 = 1 << 31
45 SECONDBIT uint32 = 1 << 30
46 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020047)
48
Akron03c92fe2021-08-09 14:07:57 +020049// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020050var bo binary.ByteOrder = binary.LittleEndian
51
Akron8ef408b2021-08-02 22:11:04 +020052type mapping struct {
53 source int
Akron3fdfec62021-08-04 11:40:10 +020054 target uint32
Akron8ef408b2021-08-02 22:11:04 +020055}
56
Akronf1a16502021-08-16 15:24:38 +020057type bc struct {
58 base uint32
59 check uint32
60}
61
Akron03c92fe2021-08-09 14:07:57 +020062// DaTokenizer represents a tokenizer implemented as a
63// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020064type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020065 sigma map[rune]int
66 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020067 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020068 transCount int
Akronf1a16502021-08-16 15:24:38 +020069 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron03c92fe2021-08-09 14:07:57 +020076 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020077}
78
Akron439f4ec2021-08-09 15:45:38 +020079// ToDoubleArray turns the intermediate tokenizer representation
80// into a double array representation.
81//
82// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020083func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020084
85 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020086 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020087 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020088 final: auto.final,
89 unknown: auto.unknown,
90 identity: auto.identity,
91 epsilon: auto.epsilon,
92 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020093 }
94
Akronf1a16502021-08-16 15:24:38 +020095 dat.resize(dat.final)
96
Akron1c34ce62021-09-23 23:27:39 +020097 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +020098 if int(sym) < 256 {
99 dat.sigmaASCII[int(sym)] = num
100 }
Akronf2120ca2021-08-03 16:26:41 +0200101 dat.sigma[sym] = num
102 }
Akron8ef408b2021-08-02 22:11:04 +0200103
104 mark := 0
105 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200106 var base uint32
Akronde18e902021-08-27 09:34:12 +0200107 var atrans *edge
108 var s, s1 int
109 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200110
Akron439f4ec2021-08-09 15:45:38 +0200111 // Create a mapping from s (in Ms aka Intermediate FSA)
112 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200113 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200114 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200115
Akron439f4ec2021-08-09 15:45:38 +0200116 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200117 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200118 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200119 size++
120
Akron740f3d72021-08-03 12:12:34 +0200121 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200122 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200123 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
124 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200125
Akron7b1faa62021-09-02 16:10:21 +0200126 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200127
Akron8ef408b2021-08-02 22:11:04 +0200128 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200129 s = table[mark].source // This is a state in Ms
130 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200131 // s = tableQueue[mark]
132 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200133 mark++
Akron740f3d72021-08-03 12:12:34 +0200134
135 // Following the paper, here the state t can be remembered
136 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200137 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200138 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200139
Akron740f3d72021-08-03 12:12:34 +0200140 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200141 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200142 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200143 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200144 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200145 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200146
Akron773b1ef2021-08-03 17:37:20 +0200147 // TODO:
Akron068874c2021-08-04 15:19:56 +0200148 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200149 // outdegree of .end
150
Akron740f3d72021-08-03 12:12:34 +0200151 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200152 for _, a := range A {
153
Akron1c34ce62021-09-23 23:27:39 +0200154 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200155
Akron1c34ce62021-09-23 23:27:39 +0200156 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200157
Akron740f3d72021-08-03 12:12:34 +0200158 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200159 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200160
Akron740f3d72021-08-03 12:12:34 +0200161 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200162 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200163 dat.array[t1].setCheck(t)
164
165 // Set maxSize
166 if dat.maxSize < int(t1) {
167 dat.maxSize = int(t1)
168 }
Akron8ef408b2021-08-02 22:11:04 +0200169
Akron439f4ec2021-08-09 15:45:38 +0200170 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200171 fmt.Println("Translate transition",
172 s, "->", s1, "(", a, ")", "to", t, "->", t1)
173 }
174
Akron83e75a22021-08-04 13:14:06 +0200175 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200176 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200177 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200178 if DEBUG {
179 fmt.Println("Set", t1, "to nontoken")
180 }
Akron83e75a22021-08-04 13:14:06 +0200181 }
182
Akron84d68e62021-08-04 17:06:52 +0200183 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200184 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200185 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200186 if DEBUG {
187 fmt.Println("Set", t1, "to tokenend")
188 }
Akron84d68e62021-08-04 17:06:52 +0200189 }
190
Akron740f3d72021-08-03 12:12:34 +0200191 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200192 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200193 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200194
Akron439f4ec2021-08-09 15:45:38 +0200195 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200196 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200197 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200198 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200199 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200200 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200201 size++
202 } else {
Akron740f3d72021-08-03 12:12:34 +0200203 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200204 dat.array[t1].setBase(r)
205 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200206 }
207 } else {
Akron740f3d72021-08-03 12:12:34 +0200208 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200209 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200210
Akronde18e902021-08-27 09:34:12 +0200211 if dat.maxSize < int(base)+dat.final {
212 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200213 }
Akron8ef408b2021-08-02 22:11:04 +0200214 }
215 }
216 }
217
218 // Following Mizobuchi et al (2000) the size of the
219 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200220 // We make the size a bit larger so we never have to check for boundaries.
221 dat.setSize(dat.maxSize + dat.final)
222 if len(dat.array) < dat.maxSize+dat.final {
223 dat.array = append(dat.array, make([]bc, dat.final)...)
224 }
225 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200226 return dat
Akron8ef408b2021-08-02 22:11:04 +0200227}
228
Akron8ef408b2021-08-02 22:11:04 +0200229// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200230// exists and return this as a representative.
231// Currently iterates through the whole table
232// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200233func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200234 for x := 0; x < size; x++ {
235 if table[x].source == s {
236 return table[x].target
237 }
238 }
239 return 0
240}
241
Akron941f2152021-09-26 15:14:25 +0200242// Type of tokenizer
243func (DaTokenizer) Type() string {
244 return DAMAGIC
245}
246
Akron64ffd9a2021-08-03 19:55:21 +0200247// Resize double array when necessary
248func (dat *DaTokenizer) resize(l int) {
249 // TODO:
250 // This is a bit too aggressive atm and should be calmed down.
251 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200252 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200253 }
Akron64ffd9a2021-08-03 19:55:21 +0200254}
Akronc9d84a62021-08-03 15:56:03 +0200255
Akron64ffd9a2021-08-03 19:55:21 +0200256// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200257func (bc *bc) setBase(v uint32) {
258 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200259}
260
261// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200262func (bc *bc) getBase() uint32 {
263 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200264}
265
266// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200267func (bc *bc) setCheck(v uint32) {
268 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200269}
270
271// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200272func (bc *bc) getCheck() uint32 {
273 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200274}
275
Akron3fdfec62021-08-04 11:40:10 +0200276// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200277func (bc *bc) isSeparate() bool {
278 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200279}
280
281// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200282func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200283 if sep {
Akronf1a16502021-08-16 15:24:38 +0200284 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200285 } else {
Akronf1a16502021-08-16 15:24:38 +0200286 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200287 }
288}
289
Akron83e75a22021-08-04 13:14:06 +0200290// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200291func (bc *bc) isNonToken() bool {
292 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200293}
294
295// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200296func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200297 if sep {
Akronf1a16502021-08-16 15:24:38 +0200298 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200299 } else {
Akronf1a16502021-08-16 15:24:38 +0200300 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200301 }
302}
303
Akron84d68e62021-08-04 17:06:52 +0200304// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200305func (bc *bc) isTokenEnd() bool {
306 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200307}
308
309// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200310func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200311 if sep {
Akronf1a16502021-08-16 15:24:38 +0200312 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200313 } else {
Akronf1a16502021-08-16 15:24:38 +0200314 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200315 }
316}
317
Akron64ffd9a2021-08-03 19:55:21 +0200318// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200319func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200320 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200321}
322
323// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200324func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200325 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200326}
327
328// Based on Mizobuchi et al (2000), p. 124
329// This iterates for every state through the complete double array
330// structure until it finds a gap that fits all outgoing transitions
331// of the state. This is extremely slow, but is only necessary in the
332// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200333func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200334
335 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200336 base := uint32(1)
337
Akron8ef408b2021-08-02 22:11:04 +0200338OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200339 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200340 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200341 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200342 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200343 base++
344 goto OVERLAP
345 }
346 }
Akron8ef408b2021-08-02 22:11:04 +0200347 return base
348}
349
Akron679b4862021-09-02 16:59:26 +0200350// This is an implementation of xCheck with the skip-improvement
351// proposed by Morita et al. (2001)
352func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
353
354 // Start at the first entry of the double array list
355 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
356
357OVERLAP:
358 // Resize the array if necessary
359 dat.resize(int(base) + dat.final)
360 for _, a := range symbols {
361 if dat.array[int(base)+a].getCheck() != 0 {
362 base++
363 goto OVERLAP
364 }
365 }
366 return base
367}
368
Akron29e306f2021-09-02 18:29:56 +0200369// This is an implementation of xCheck with the skip-improvement
370// proposed by Morita et al. (2001) for higher outdegrees as
371// proposed by Niu et al. (2013)
372func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
373
374 // Start at the first entry of the double array list
375 base := uint32(1)
376
377 // Or skip the first few entries
378 if len(symbols) >= 3 {
379 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
380 }
381
382OVERLAP:
383 // Resize the array if necessary
384 dat.resize(int(base) + dat.final + 1)
385 for _, a := range symbols {
386 if dat.array[int(base)+a].getCheck() != 0 {
387 base++
388 goto OVERLAP
389 }
390 }
391 return base
392}
393
Akron7b1faa62021-09-02 16:10:21 +0200394// This is an implementation of xCheck wit an improvement
395// proposed by Niu et al. (2013)
396func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
397
398 // Start at the first entry of the double array list
399 base := uint32(1)
400
401 if len(symbols) > 3 {
402 sort.Ints(symbols)
403 if *block_begin_pos > uint32(symbols[0]) {
404 dat.resize(int(*block_begin_pos) + dat.final)
405 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
406 return *block_begin_pos - uint32(symbols[0])
407 }
408 }
409
410OVERLAP:
411 // Resize the array if necessary
412 dat.resize(int(base) + dat.final)
413 for _, a := range symbols {
414 if dat.array[int(base)+a].getCheck() != 0 {
415 base++
416 goto OVERLAP
417 }
418 }
419 return base
420}
421
Akron3610f102021-08-08 14:13:25 +0200422// List all outgoing transitions for a state
423// for testing purposes
424func (dat *DaTokenizer) outgoing(t uint32) []int {
425
426 valid := make([]int, 0, len(dat.sigma))
427
428 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200429 t1 := dat.array[t].getBase() + uint32(a)
430 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200431 valid = append(valid, a)
432 }
433 }
434
435 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200436 t1 := dat.array[t].getBase() + uint32(a)
437 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200438 valid = append(valid, -1*a)
439 }
440 }
441
442 sort.Ints(valid)
443
444 return valid
445}
446
Akron4fa28b32021-08-27 10:55:41 +0200447// TransCount as the number of transitions aka arcs in the
448// finite state automaton
449func (dat *DaTokenizer) TransCount() int {
450 // Cache the transCount
451 if dat.transCount > 0 {
452 return dat.transCount
453 }
454
455 dat.transCount = 0
456 for x := 1; x < len(dat.array); x++ {
457 if dat.array[x].getBase() != 0 {
458 dat.transCount++
459 }
460 }
461
462 return dat.transCount
463}
464
Akron03a3c612021-08-04 11:51:27 +0200465// LoadFactor as defined in Kanda et al (2018),
466// i.e. the proportion of non-empty elements to all elements.
467func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200468 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200469}
470
Akron439f4ec2021-08-09 15:45:38 +0200471// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200472func (dat *DaTokenizer) Save(file string) (n int64, err error) {
473 f, err := os.Create(file)
474 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200475 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200476 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200477 }
478 defer f.Close()
479 gz := gzip.NewWriter(f)
480 defer gz.Close()
481 n, err = dat.WriteTo(gz)
482 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200483 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200484 return n, err
485 }
486 gz.Flush()
487 return n, nil
488}
489
490// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200491func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
492
Akron3a063ef2021-08-05 19:36:35 +0200493 wb := bufio.NewWriter(w)
494 defer wb.Flush()
495
Akron6247a5d2021-08-03 19:18:28 +0200496 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200497 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200498 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200499 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200500 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200501 }
502
503 // Get sigma as a list
504 sigmalist := make([]rune, len(dat.sigma)+16)
505 max := 0
506 for sym, num := range dat.sigma {
507 sigmalist[num] = sym
508 if num > max {
509 max = num
510 }
511 }
512
513 sigmalist = sigmalist[:max+1]
514
Akron3f8571a2021-08-05 11:18:10 +0200515 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200516 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200517 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
518 bo.PutUint16(buf[4:6], uint16(dat.unknown))
519 bo.PutUint16(buf[6:8], uint16(dat.identity))
520 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200521 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200522 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200523 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200524 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200525 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200526 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200527 }
528
529 all += more
530
Akron6247a5d2021-08-03 19:18:28 +0200531 // Write sigma
532 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200533
Akron3a063ef2021-08-05 19:36:35 +0200534 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200535 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200536 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200537 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200538 }
539 all += more
540 }
Akron439f4ec2021-08-09 15:45:38 +0200541
Akron6247a5d2021-08-03 19:18:28 +0200542 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200543 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200544 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200545 }
Akron6247a5d2021-08-03 19:18:28 +0200546
547 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200548 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200549 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200550 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200551 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200552 }
553 all += more
554
Akronf1a16502021-08-16 15:24:38 +0200555 // for x := 0; x < len(dat.array); x++ {
556 for _, bc := range dat.array {
557 bo.PutUint32(buf[0:4], bc.base)
558 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200559 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200560 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200561 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200562 }
Akron439f4ec2021-08-09 15:45:38 +0200563 all += more
Akron3a063ef2021-08-05 19:36:35 +0200564 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200565 log.Println("Can not write base uint32")
566 return int64(all), err
567 }
568 bo.PutUint32(buf[0:4], bc.check)
569 more, err = wb.Write(buf[0:4])
570 if err != nil {
571 log.Println(err)
572 return int64(all), err
573 }
574 all += more
575 if more != 4 {
576 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200577 return int64(all), err
578 }
Akron6247a5d2021-08-03 19:18:28 +0200579 }
580
581 return int64(all), err
582}
583
Akron439f4ec2021-08-09 15:45:38 +0200584// LoadDatokFile reads a double array represented tokenizer
585// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200586func LoadDatokFile(file string) *DaTokenizer {
587 f, err := os.Open(file)
588 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200589 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200590 return nil
Akron3f8571a2021-08-05 11:18:10 +0200591 }
592 defer f.Close()
593
594 gz, err := gzip.NewReader(f)
595 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200596 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200597 return nil
Akron3f8571a2021-08-05 11:18:10 +0200598 }
599 defer gz.Close()
600
Akron3a063ef2021-08-05 19:36:35 +0200601 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200602 return ParseDatok(gz)
603}
604
Akron439f4ec2021-08-09 15:45:38 +0200605// LoadDatokFile reads a double array represented tokenizer
606// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200607func ParseDatok(ior io.Reader) *DaTokenizer {
608
Akron439f4ec2021-08-09 15:45:38 +0200609 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200610 dat := &DaTokenizer{
611 sigma: make(map[rune]int),
612 epsilon: 0,
613 unknown: 0,
614 identity: 0,
615 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200616 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200617 }
618
619 r := bufio.NewReader(ior)
620
Akron3f8571a2021-08-05 11:18:10 +0200621 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200622 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200623
Akron439f4ec2021-08-09 15:45:38 +0200624 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200625
626 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200627 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200628 return nil
629 }
630
Akron16c312e2021-09-26 13:11:12 +0200631 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200632 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200633 return nil
634 }
635
Akron439f4ec2021-08-09 15:45:38 +0200636 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200637 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200638 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200639 return nil
640 }
641
Akron439f4ec2021-08-09 15:45:38 +0200642 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200643 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200644 return nil
645 }
Akron3f8571a2021-08-05 11:18:10 +0200646
Akron3a063ef2021-08-05 19:36:35 +0200647 version := bo.Uint16(buf[0:2])
648
649 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200650 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200651 return nil
652 }
653
Akron3f8571a2021-08-05 11:18:10 +0200654 dat.epsilon = int(bo.Uint16(buf[2:4]))
655 dat.unknown = int(bo.Uint16(buf[4:6]))
656 dat.identity = int(bo.Uint16(buf[6:8]))
657 dat.final = int(bo.Uint16(buf[8:10]))
658
659 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200660 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200661
Akron3a063ef2021-08-05 19:36:35 +0200662 // Shouldn't be relevant though
663 dat.maxSize = arraySize - 1
664
Akron3f8571a2021-08-05 11:18:10 +0200665 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200666 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200667 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200668 if int(sym) < 256 {
669 dat.sigmaASCII[int(sym)] = x
670 }
Akron3f8571a2021-08-05 11:18:10 +0200671 dat.sigma[sym] = x
672 }
Akron3f8571a2021-08-05 11:18:10 +0200673 }
674
Akron439f4ec2021-08-09 15:45:38 +0200675 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200676
677 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200678 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200679 return nil
680 }
681
Akron3f8571a2021-08-05 11:18:10 +0200682 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200683 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200684 return nil
685 }
686
687 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200688 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200689
Akronbb4aac52021-08-13 00:52:27 +0200690 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200691
Akronbb4aac52021-08-13 00:52:27 +0200692 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200693 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200694 return nil
695 }
696
Akronf1a16502021-08-16 15:24:38 +0200697 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200698 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200699 return nil
700 }
701
702 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200703 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
704 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200705 }
706
707 return dat
708}
709
Akron439f4ec2021-08-09 15:45:38 +0200710// Show the current state of the buffer,
711// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200712func showBuffer(buffer []rune, buffo int, buffi int) string {
713 out := make([]rune, 0, 1024)
714 for x := 0; x < len(buffer); x++ {
715 if buffi == x {
716 out = append(out, '^')
717 }
718 if buffo == x {
719 out = append(out, '[', buffer[x], ']')
720 } else {
721 out = append(out, buffer[x])
722 }
723 }
724 return string(out)
725}
726
Akron98fbfef2021-10-23 17:02:11 +0200727// Show the current state of the buffer,
728// for testing puroses
729func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
730 out := make([]rune, 0, 1024)
731 for x := 0; x < len(buffer); x++ {
732 if buffi == x {
733 out = append(out, '^')
734 }
735 if bufft == x {
736 out = append(out, '|')
737 }
738 if buffc == x {
739 out = append(out, '[', buffer[x], ']')
740 } else {
741 out = append(out, buffer[x])
742 }
743 }
744 return string(out)
745}
746
747// Transduce input to ouutput
Akrone396a932021-10-19 01:06:13 +0200748func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
749 return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
750}
751
752// TransduceTokenWriter transduces an input string against
753// the double array FSA. The rules are always greedy. If the
754// automaton fails, it takes the last possible token ending
755// branch.
Akron068874c2021-08-04 15:19:56 +0200756//
Akron4db3ecf2021-08-11 18:49:03 +0200757// Based on Mizobuchi et al (2000), p. 129,
758// with additional support for IDENTITY, UNKNOWN
759// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akrone396a932021-10-19 01:06:13 +0200760func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
Akron068874c2021-08-04 15:19:56 +0200761 var a int
Akronb4bbb472021-08-09 11:49:38 +0200762 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200763 t := uint32(1) // Initial state
764 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200765
Akron3610f102021-08-08 14:13:25 +0200766 // Remember the last position of a possible tokenend,
767 // in case the automaton fails.
768 epsilonState := uint32(0)
769 epsilonOffset := 0
770
Akrone396a932021-10-19 01:06:13 +0200771 // Remember if the last transition was epsilon
772 sentenceEnd := false
773
Akrona854faa2021-10-22 19:31:08 +0200774 // Remember if a text end was already set
775 textEnd := false
776
Akron3610f102021-08-08 14:13:25 +0200777 // Implement a low level buffer for full control,
778 // however - it is probably better to introduce
779 // this on a higher level with a io.Reader interface
780 // The buffer stores a single word and may have white
781 // space at the end (but not at the beginning).
782 //
783 // This is the only backtracking requirement because of
784 // epsilon transitions, to support tokenizations like:
785 // "this is an example|.| And it works." vs
786 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200787 //
788 // TODO:
789 // Store a translation buffer as well, so characters don't
790 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200791 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200792 bufft := 0 // Buffer token offset
793 buffc := 0 // Buffer current symbol
Akron3610f102021-08-08 14:13:25 +0200794 buffi := 0 // Buffer length
795
Akron98fbfef2021-10-23 17:02:11 +0200796 // The buffer is organized as follows:
797 // [ t[....c..]..i]
798
Akron3f8571a2021-08-05 11:18:10 +0200799 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200800 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200801
Akron3f8571a2021-08-05 11:18:10 +0200802 var char rune
Akrone396a932021-10-19 01:06:13 +0200803
Akron3f8571a2021-08-05 11:18:10 +0200804 var err error
805 eof := false
Akrona854faa2021-10-22 19:31:08 +0200806 eot := false
Akronb7e1f132021-08-10 11:52:31 +0200807 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200808
Akronc5d8d432021-08-10 16:48:44 +0200809PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200810 for {
811
Akronb7e1f132021-08-10 11:52:31 +0200812 if newchar {
813 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200814 if buffc >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200815 if eof {
816 break
817 }
Akronb7e1f132021-08-10 11:52:31 +0200818 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200819
Akronb7e1f132021-08-10 11:52:31 +0200820 // No more runes to read
821 if err != nil {
822 eof = true
823 break
824 }
825 buffer[buffi] = char
826 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200827 }
Akronb7e1f132021-08-10 11:52:31 +0200828
Akron98fbfef2021-10-23 17:02:11 +0200829 char = buffer[buffc]
Akronb7e1f132021-08-10 11:52:31 +0200830
831 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200832 fmt.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akronb7e1f132021-08-10 11:52:31 +0200833 }
834
Akrona854faa2021-10-22 19:31:08 +0200835 eot = false
836
Akron6f1c16c2021-08-17 10:45:42 +0200837 // TODO:
838 // Better not repeatedly check for a!
839 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200840 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200841 if int(char) == EOT {
842 eot = true
843 }
Akronea46e8a2021-08-17 00:36:31 +0200844 a = dat.sigmaASCII[int(char)]
845 } else {
846 a, ok = dat.sigma[char]
847 if !ok {
848 a = 0
849 }
850 }
Akronb7e1f132021-08-10 11:52:31 +0200851
852 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +0200853 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +0200854 a = dat.identity
855 }
856
857 t0 = t
858
859 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200860 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +0200861 // Remember state for backtracking to last tokenend state
862 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200863 epsilonOffset = buffc
Akrone396a932021-10-19 01:06:13 +0200864
865 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200866 fmt.Println("epsilonOffset is set to", buffc)
Akrone396a932021-10-19 01:06:13 +0200867 }
Akronb7e1f132021-08-10 11:52:31 +0200868 }
Akron3f8571a2021-08-05 11:18:10 +0200869 }
Akron3610f102021-08-08 14:13:25 +0200870
Akronb7e1f132021-08-10 11:52:31 +0200871 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200872 t = dat.array[t0].getBase() + uint32(a)
873 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200874
Akron524c5432021-08-05 14:14:27 +0200875 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200876 // Char is only relevant if set
877 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
878 if false {
879 fmt.Println(dat.outgoing(t0))
880 }
Akron524c5432021-08-05 14:14:27 +0200881 }
882
Akronb7e1f132021-08-10 11:52:31 +0200883 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200884 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200885
886 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +0200887 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200888 }
889
890 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200891
Akron068874c2021-08-04 15:19:56 +0200892 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200893 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200894 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +0200895 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200896 }
897 a = dat.unknown
898
899 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +0200900
Akron068874c2021-08-04 15:19:56 +0200901 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200902 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200903 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200904 buffc = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200905 a = dat.epsilon
906
Akron3610f102021-08-08 14:13:25 +0200907 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200908 fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200909 }
Akronb4bbb472021-08-09 11:49:38 +0200910
Akron068874c2021-08-04 15:19:56 +0200911 } else {
912 break
913 }
Akron068874c2021-08-04 15:19:56 +0200914
Akronb7e1f132021-08-10 11:52:31 +0200915 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200916 eot = false
Akronb7e1f132021-08-10 11:52:31 +0200917 continue
Akronb4bbb472021-08-09 11:49:38 +0200918 }
919
Akronb7e1f132021-08-10 11:52:31 +0200920 // Transition was successful
921 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200922
923 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200924 if a != dat.epsilon {
925
Akron98fbfef2021-10-23 17:02:11 +0200926 buffc++
Akronb4bbb472021-08-09 11:49:38 +0200927
Akron439f4ec2021-08-09 15:45:38 +0200928 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200929 if buffc-bufft == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200930 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200931 fmt.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200932 }
Akron98fbfef2021-10-23 17:02:11 +0200933 bufft++
934 // rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200935 }
Akron068874c2021-08-04 15:19:56 +0200936
Akrone396a932021-10-19 01:06:13 +0200937 } else {
Akron524c5432021-08-05 14:14:27 +0200938
Akrone396a932021-10-19 01:06:13 +0200939 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200940 if buffc-bufft > 0 {
Akronc5d8d432021-08-10 16:48:44 +0200941 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200942 fmt.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
Akronc5d8d432021-08-10 16:48:44 +0200943 }
Akron98fbfef2021-10-23 17:02:11 +0200944 w.Token(0, buffer[bufft:buffc])
Akronc5d8d432021-08-10 16:48:44 +0200945 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +0200946 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200947 textEnd = false
Akrone396a932021-10-19 01:06:13 +0200948 } else {
949 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +0200950 w.SentenceEnd(0)
Akron1594cb82021-08-11 11:14:56 +0200951 }
Akron439f4ec2021-08-09 15:45:38 +0200952 }
Akron3610f102021-08-08 14:13:25 +0200953
Akronc5d8d432021-08-10 16:48:44 +0200954 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +0200955 if rewindBuffer {
956
Akrone396a932021-10-19 01:06:13 +0200957 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200958 fmt.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akrone396a932021-10-19 01:06:13 +0200959 }
960
Akron439f4ec2021-08-09 15:45:38 +0200961 // TODO: Better as a ring buffer
Akron98fbfef2021-10-23 17:02:11 +0200962 for x, i := range buffer[buffc:buffi] {
Akron3610f102021-08-08 14:13:25 +0200963 buffer[x] = i
964 }
Akronb4bbb472021-08-09 11:49:38 +0200965
Akron98fbfef2021-10-23 17:02:11 +0200966 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200967 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +0200968 epsilonOffset = 0
969 epsilonState = 0
970
Akron98fbfef2021-10-23 17:02:11 +0200971 buffc = 0
972 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200973
Akron98fbfef2021-10-23 17:02:11 +0200974 if DEBUG {
975 fmt.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
976 }
977 }
978
979 if eot {
980 eot = false
981 textEnd = true
982 w.TextEnd(0)
983 if DEBUG {
984 fmt.Println("END OF TEXT")
Akrona854faa2021-10-22 19:31:08 +0200985 }
Akron84d68e62021-08-04 17:06:52 +0200986 }
987
Akronb7e1f132021-08-10 11:52:31 +0200988 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +0200989 if ta.isSeparate() {
990 t = ta.getBase()
991 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +0200992
993 if DEBUG {
994 fmt.Println("Representative pointing to", t)
995 }
996 }
997
Akronc5d8d432021-08-10 16:48:44 +0200998 newchar = true
999
Akron068874c2021-08-04 15:19:56 +02001000 // TODO:
1001 // Prevent endless epsilon loops!
1002 }
1003
Akron439f4ec2021-08-09 15:45:38 +02001004 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001005 if !eof {
Akron068874c2021-08-04 15:19:56 +02001006 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001007 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001008 }
1009 return false
1010 }
1011
Akronb7e1f132021-08-10 11:52:31 +02001012 if DEBUG {
1013 fmt.Println("Entering final check")
1014 }
1015
Akrone396a932021-10-19 01:06:13 +02001016 /*
1017 The following code is for deprecated automata relying on
1018 final states. Datok now requires final states to be marked
1019 with tokenends.
Akronf1a16502021-08-16 15:24:38 +02001020
Akrone396a932021-10-19 01:06:13 +02001021 // Automaton is in a final state, so flush the buffer and return
1022 x := dat.array[t].getBase() + uint32(dat.final)
Akronb4bbb472021-08-09 11:49:38 +02001023
Akrone396a932021-10-19 01:06:13 +02001024 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akron6e70dc82021-08-11 11:33:18 +02001025
Akrone396a932021-10-19 01:06:13 +02001026 if buffi > 0 {
1027 if DEBUG {
1028 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
1029 }
1030 w.Token(0, buffer[:buffi])
Akronc5d8d432021-08-10 16:48:44 +02001031 }
Akron84d68e62021-08-04 17:06:52 +02001032
Akrone396a932021-10-19 01:06:13 +02001033 // Add an additional sentence ending, if the file is over but no explicit
1034 // sentence split was reached. This may be controversial and therefore
1035 // optional via parameter.
1036 if !dat.array[t0].isTokenEnd() {
1037 w.SentenceEnd()
1038 }
Akron6e70dc82021-08-11 11:33:18 +02001039
Akrone396a932021-10-19 01:06:13 +02001040 // TODO:
1041 // There may be a new line at the end, from an epsilon,
1042 // so we may need to go on!
1043 return true
1044 }
1045 */
Akron068874c2021-08-04 15:19:56 +02001046
Akrona854faa2021-10-22 19:31:08 +02001047 // Check epsilon transitions as long as possible
Akronc5d8d432021-08-10 16:48:44 +02001048 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001049 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001050 a = dat.epsilon
1051 newchar = false
Akrone396a932021-10-19 01:06:13 +02001052
Akronf1a16502021-08-16 15:24:38 +02001053 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001054 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001055 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001056
Akronc5d8d432021-08-10 16:48:44 +02001057 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001058 t0 = epsilonState
1059 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +02001060 buffc = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001061 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +02001062 fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron068874c2021-08-04 15:19:56 +02001063 }
Akronc5d8d432021-08-10 16:48:44 +02001064 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001065 }
Akrone396a932021-10-19 01:06:13 +02001066
1067 // Add an additional sentence ending, if the file is over but no explicit
1068 // sentence split was reached. This may be controversial and therefore
1069 // optional via parameter.
1070 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +02001071 w.SentenceEnd(0)
1072
Akrone396a932021-10-19 01:06:13 +02001073 if DEBUG {
Akrona854faa2021-10-22 19:31:08 +02001074 fmt.Println("Sentence end")
1075 }
1076 }
1077
1078 if !textEnd {
1079 w.TextEnd(0)
1080
1081 if DEBUG {
1082 fmt.Println("Text end")
Akrone396a932021-10-19 01:06:13 +02001083 }
1084 }
1085
1086 return true
Akron068874c2021-08-04 15:19:56 +02001087}