blob: b70586c2c194367cdf6618f514bafb60a462e51d [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
Akron679b4862021-09-02 16:59:26 +020033 "math"
Akron8ef408b2021-08-02 22:11:04 +020034 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron740f3d72021-08-03 12:12:34 +020036
Akron527c10c2021-08-13 01:45:18 +020037 "log"
Akron8ef408b2021-08-02 22:11:04 +020038)
39
40const (
Akron2a4b9292021-08-04 15:35:22 +020041 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020042 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020043 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020044 FIRSTBIT uint32 = 1 << 31
45 SECONDBIT uint32 = 1 << 30
46 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020047)
48
Akron03c92fe2021-08-09 14:07:57 +020049// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020050var bo binary.ByteOrder = binary.LittleEndian
51
Akron8ef408b2021-08-02 22:11:04 +020052type mapping struct {
53 source int
Akron3fdfec62021-08-04 11:40:10 +020054 target uint32
Akron8ef408b2021-08-02 22:11:04 +020055}
56
Akronf1a16502021-08-16 15:24:38 +020057type bc struct {
58 base uint32
59 check uint32
60}
61
Akron03c92fe2021-08-09 14:07:57 +020062// DaTokenizer represents a tokenizer implemented as a
63// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020064type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020065 sigma map[rune]int
66 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020067 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020068 transCount int
Akronf1a16502021-08-16 15:24:38 +020069 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron03c92fe2021-08-09 14:07:57 +020076 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020077}
78
Akron439f4ec2021-08-09 15:45:38 +020079// ToDoubleArray turns the intermediate tokenizer representation
80// into a double array representation.
81//
82// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020083func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020084
85 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020086 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020087 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020088 final: auto.final,
89 unknown: auto.unknown,
90 identity: auto.identity,
91 epsilon: auto.epsilon,
92 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020093 }
94
Akronf1a16502021-08-16 15:24:38 +020095 dat.resize(dat.final)
96
Akron1c34ce62021-09-23 23:27:39 +020097 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +020098 if int(sym) < 256 {
99 dat.sigmaASCII[int(sym)] = num
100 }
Akronf2120ca2021-08-03 16:26:41 +0200101 dat.sigma[sym] = num
102 }
Akron8ef408b2021-08-02 22:11:04 +0200103
104 mark := 0
105 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200106 var base uint32
Akronde18e902021-08-27 09:34:12 +0200107 var atrans *edge
108 var s, s1 int
109 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200110
Akron439f4ec2021-08-09 15:45:38 +0200111 // Create a mapping from s (in Ms aka Intermediate FSA)
112 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200113 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200114 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200115
Akron439f4ec2021-08-09 15:45:38 +0200116 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200117 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200118 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200119 size++
120
Akron740f3d72021-08-03 12:12:34 +0200121 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200122 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200123 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
124 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200125
Akron7b1faa62021-09-02 16:10:21 +0200126 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200127
Akron8ef408b2021-08-02 22:11:04 +0200128 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200129 s = table[mark].source // This is a state in Ms
130 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200131 // s = tableQueue[mark]
132 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200133 mark++
Akron740f3d72021-08-03 12:12:34 +0200134
135 // Following the paper, here the state t can be remembered
136 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200137 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200138 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200139
Akron740f3d72021-08-03 12:12:34 +0200140 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200141 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200142 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200143 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200144 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200145 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200146
Akron773b1ef2021-08-03 17:37:20 +0200147 // TODO:
Akron068874c2021-08-04 15:19:56 +0200148 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200149 // outdegree of .end
150
Akron740f3d72021-08-03 12:12:34 +0200151 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200152 for _, a := range A {
153
Akron1c34ce62021-09-23 23:27:39 +0200154 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200155
Akron1c34ce62021-09-23 23:27:39 +0200156 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200157
Akron740f3d72021-08-03 12:12:34 +0200158 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200159 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200160
Akron740f3d72021-08-03 12:12:34 +0200161 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200162 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200163 dat.array[t1].setCheck(t)
164
165 // Set maxSize
166 if dat.maxSize < int(t1) {
167 dat.maxSize = int(t1)
168 }
Akron8ef408b2021-08-02 22:11:04 +0200169
Akron439f4ec2021-08-09 15:45:38 +0200170 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200171 fmt.Println("Translate transition",
172 s, "->", s1, "(", a, ")", "to", t, "->", t1)
173 }
174
Akron83e75a22021-08-04 13:14:06 +0200175 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200176 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200177 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200178 if DEBUG {
179 fmt.Println("Set", t1, "to nontoken")
180 }
Akron83e75a22021-08-04 13:14:06 +0200181 }
182
Akron84d68e62021-08-04 17:06:52 +0200183 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200184 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200185 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200186 if DEBUG {
187 fmt.Println("Set", t1, "to tokenend")
188 }
Akron84d68e62021-08-04 17:06:52 +0200189 }
190
Akron740f3d72021-08-03 12:12:34 +0200191 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200192 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200193 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200194
Akron439f4ec2021-08-09 15:45:38 +0200195 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200196 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200197 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200198 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200199 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200200 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200201 size++
202 } else {
Akron740f3d72021-08-03 12:12:34 +0200203 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200204 dat.array[t1].setBase(r)
205 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200206 }
207 } else {
Akron740f3d72021-08-03 12:12:34 +0200208 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200209 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200210
Akronde18e902021-08-27 09:34:12 +0200211 if dat.maxSize < int(base)+dat.final {
212 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200213 }
Akron8ef408b2021-08-02 22:11:04 +0200214 }
215 }
216 }
217
218 // Following Mizobuchi et al (2000) the size of the
219 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200220 // We make the size a bit larger so we never have to check for boundaries.
221 dat.setSize(dat.maxSize + dat.final)
222 if len(dat.array) < dat.maxSize+dat.final {
223 dat.array = append(dat.array, make([]bc, dat.final)...)
224 }
225 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200226 return dat
Akron8ef408b2021-08-02 22:11:04 +0200227}
228
Akron8ef408b2021-08-02 22:11:04 +0200229// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200230// exists and return this as a representative.
231// Currently iterates through the whole table
232// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200233func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200234 for x := 0; x < size; x++ {
235 if table[x].source == s {
236 return table[x].target
237 }
238 }
239 return 0
240}
241
Akron941f2152021-09-26 15:14:25 +0200242// Type of tokenizer
243func (DaTokenizer) Type() string {
244 return DAMAGIC
245}
246
Akron64ffd9a2021-08-03 19:55:21 +0200247// Resize double array when necessary
248func (dat *DaTokenizer) resize(l int) {
249 // TODO:
250 // This is a bit too aggressive atm and should be calmed down.
251 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200252 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200253 }
Akron64ffd9a2021-08-03 19:55:21 +0200254}
Akronc9d84a62021-08-03 15:56:03 +0200255
Akron64ffd9a2021-08-03 19:55:21 +0200256// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200257func (bc *bc) setBase(v uint32) {
258 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200259}
260
261// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200262func (bc *bc) getBase() uint32 {
263 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200264}
265
266// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200267func (bc *bc) setCheck(v uint32) {
268 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200269}
270
271// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200272func (bc *bc) getCheck() uint32 {
273 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200274}
275
Akron3fdfec62021-08-04 11:40:10 +0200276// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200277func (bc *bc) isSeparate() bool {
278 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200279}
280
281// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200282func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200283 if sep {
Akronf1a16502021-08-16 15:24:38 +0200284 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200285 } else {
Akronf1a16502021-08-16 15:24:38 +0200286 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200287 }
288}
289
Akron83e75a22021-08-04 13:14:06 +0200290// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200291func (bc *bc) isNonToken() bool {
292 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200293}
294
295// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200296func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200297 if sep {
Akronf1a16502021-08-16 15:24:38 +0200298 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200299 } else {
Akronf1a16502021-08-16 15:24:38 +0200300 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200301 }
302}
303
Akron84d68e62021-08-04 17:06:52 +0200304// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200305func (bc *bc) isTokenEnd() bool {
306 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200307}
308
309// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200310func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200311 if sep {
Akronf1a16502021-08-16 15:24:38 +0200312 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200313 } else {
Akronf1a16502021-08-16 15:24:38 +0200314 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200315 }
316}
317
Akron64ffd9a2021-08-03 19:55:21 +0200318// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200319func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200320 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200321}
322
323// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200324func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200325 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200326}
327
328// Based on Mizobuchi et al (2000), p. 124
329// This iterates for every state through the complete double array
330// structure until it finds a gap that fits all outgoing transitions
331// of the state. This is extremely slow, but is only necessary in the
332// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200333func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200334
335 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200336 base := uint32(1)
337
Akron8ef408b2021-08-02 22:11:04 +0200338OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200339 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200340 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200341 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200342 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200343 base++
344 goto OVERLAP
345 }
346 }
Akron8ef408b2021-08-02 22:11:04 +0200347 return base
348}
349
Akron679b4862021-09-02 16:59:26 +0200350// This is an implementation of xCheck with the skip-improvement
351// proposed by Morita et al. (2001)
352func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
353
354 // Start at the first entry of the double array list
355 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
356
357OVERLAP:
358 // Resize the array if necessary
359 dat.resize(int(base) + dat.final)
360 for _, a := range symbols {
361 if dat.array[int(base)+a].getCheck() != 0 {
362 base++
363 goto OVERLAP
364 }
365 }
366 return base
367}
368
Akron29e306f2021-09-02 18:29:56 +0200369// This is an implementation of xCheck with the skip-improvement
370// proposed by Morita et al. (2001) for higher outdegrees as
371// proposed by Niu et al. (2013)
372func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
373
374 // Start at the first entry of the double array list
375 base := uint32(1)
376
377 // Or skip the first few entries
378 if len(symbols) >= 3 {
379 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
380 }
381
382OVERLAP:
383 // Resize the array if necessary
384 dat.resize(int(base) + dat.final + 1)
385 for _, a := range symbols {
386 if dat.array[int(base)+a].getCheck() != 0 {
387 base++
388 goto OVERLAP
389 }
390 }
391 return base
392}
393
Akron7b1faa62021-09-02 16:10:21 +0200394// This is an implementation of xCheck wit an improvement
395// proposed by Niu et al. (2013)
396func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
397
398 // Start at the first entry of the double array list
399 base := uint32(1)
400
401 if len(symbols) > 3 {
402 sort.Ints(symbols)
403 if *block_begin_pos > uint32(symbols[0]) {
404 dat.resize(int(*block_begin_pos) + dat.final)
405 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
406 return *block_begin_pos - uint32(symbols[0])
407 }
408 }
409
410OVERLAP:
411 // Resize the array if necessary
412 dat.resize(int(base) + dat.final)
413 for _, a := range symbols {
414 if dat.array[int(base)+a].getCheck() != 0 {
415 base++
416 goto OVERLAP
417 }
418 }
419 return base
420}
421
Akron3610f102021-08-08 14:13:25 +0200422// List all outgoing transitions for a state
423// for testing purposes
424func (dat *DaTokenizer) outgoing(t uint32) []int {
425
426 valid := make([]int, 0, len(dat.sigma))
427
428 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200429 t1 := dat.array[t].getBase() + uint32(a)
430 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200431 valid = append(valid, a)
432 }
433 }
434
435 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200436 t1 := dat.array[t].getBase() + uint32(a)
437 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200438 valid = append(valid, -1*a)
439 }
440 }
441
442 sort.Ints(valid)
443
444 return valid
445}
446
Akron4fa28b32021-08-27 10:55:41 +0200447// TransCount as the number of transitions aka arcs in the
448// finite state automaton
449func (dat *DaTokenizer) TransCount() int {
450 // Cache the transCount
451 if dat.transCount > 0 {
452 return dat.transCount
453 }
454
455 dat.transCount = 0
456 for x := 1; x < len(dat.array); x++ {
457 if dat.array[x].getBase() != 0 {
458 dat.transCount++
459 }
460 }
461
462 return dat.transCount
463}
464
Akron03a3c612021-08-04 11:51:27 +0200465// LoadFactor as defined in Kanda et al (2018),
466// i.e. the proportion of non-empty elements to all elements.
467func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200468 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200469}
470
Akron439f4ec2021-08-09 15:45:38 +0200471// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200472func (dat *DaTokenizer) Save(file string) (n int64, err error) {
473 f, err := os.Create(file)
474 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200475 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200476 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200477 }
478 defer f.Close()
479 gz := gzip.NewWriter(f)
480 defer gz.Close()
481 n, err = dat.WriteTo(gz)
482 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200483 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200484 return n, err
485 }
486 gz.Flush()
487 return n, nil
488}
489
490// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200491func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
492
Akron3a063ef2021-08-05 19:36:35 +0200493 wb := bufio.NewWriter(w)
494 defer wb.Flush()
495
Akron6247a5d2021-08-03 19:18:28 +0200496 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200497 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200498 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200499 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200500 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200501 }
502
503 // Get sigma as a list
504 sigmalist := make([]rune, len(dat.sigma)+16)
505 max := 0
506 for sym, num := range dat.sigma {
507 sigmalist[num] = sym
508 if num > max {
509 max = num
510 }
511 }
512
513 sigmalist = sigmalist[:max+1]
514
Akron3f8571a2021-08-05 11:18:10 +0200515 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200516 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200517 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
518 bo.PutUint16(buf[4:6], uint16(dat.unknown))
519 bo.PutUint16(buf[6:8], uint16(dat.identity))
520 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200521 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200522 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200523 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200524 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200525 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200526 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200527 }
528
529 all += more
530
Akron6247a5d2021-08-03 19:18:28 +0200531 // Write sigma
532 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200533
Akron3a063ef2021-08-05 19:36:35 +0200534 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200535 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200536 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200537 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200538 }
539 all += more
540 }
Akron439f4ec2021-08-09 15:45:38 +0200541
Akron6247a5d2021-08-03 19:18:28 +0200542 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200543 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200544 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200545 }
Akron6247a5d2021-08-03 19:18:28 +0200546
547 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200548 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200549 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200550 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200551 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200552 }
553 all += more
554
Akronf1a16502021-08-16 15:24:38 +0200555 // for x := 0; x < len(dat.array); x++ {
556 for _, bc := range dat.array {
557 bo.PutUint32(buf[0:4], bc.base)
558 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200559 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200560 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200561 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200562 }
Akron439f4ec2021-08-09 15:45:38 +0200563 all += more
Akron3a063ef2021-08-05 19:36:35 +0200564 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200565 log.Println("Can not write base uint32")
566 return int64(all), err
567 }
568 bo.PutUint32(buf[0:4], bc.check)
569 more, err = wb.Write(buf[0:4])
570 if err != nil {
571 log.Println(err)
572 return int64(all), err
573 }
574 all += more
575 if more != 4 {
576 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200577 return int64(all), err
578 }
Akron6247a5d2021-08-03 19:18:28 +0200579 }
580
581 return int64(all), err
582}
583
Akron439f4ec2021-08-09 15:45:38 +0200584// LoadDatokFile reads a double array represented tokenizer
585// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200586func LoadDatokFile(file string) *DaTokenizer {
587 f, err := os.Open(file)
588 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200589 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200590 return nil
Akron3f8571a2021-08-05 11:18:10 +0200591 }
592 defer f.Close()
593
594 gz, err := gzip.NewReader(f)
595 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200596 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200597 return nil
Akron3f8571a2021-08-05 11:18:10 +0200598 }
599 defer gz.Close()
600
Akron3a063ef2021-08-05 19:36:35 +0200601 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200602 return ParseDatok(gz)
603}
604
Akron439f4ec2021-08-09 15:45:38 +0200605// LoadDatokFile reads a double array represented tokenizer
606// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200607func ParseDatok(ior io.Reader) *DaTokenizer {
608
Akron439f4ec2021-08-09 15:45:38 +0200609 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200610 dat := &DaTokenizer{
611 sigma: make(map[rune]int),
612 epsilon: 0,
613 unknown: 0,
614 identity: 0,
615 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200616 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200617 }
618
619 r := bufio.NewReader(ior)
620
Akron3f8571a2021-08-05 11:18:10 +0200621 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200622 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200623
Akron439f4ec2021-08-09 15:45:38 +0200624 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200625
626 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200627 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200628 return nil
629 }
630
Akron16c312e2021-09-26 13:11:12 +0200631 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200632 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200633 return nil
634 }
635
Akron439f4ec2021-08-09 15:45:38 +0200636 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200637 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200638 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200639 return nil
640 }
641
Akron439f4ec2021-08-09 15:45:38 +0200642 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200643 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200644 return nil
645 }
Akron3f8571a2021-08-05 11:18:10 +0200646
Akron3a063ef2021-08-05 19:36:35 +0200647 version := bo.Uint16(buf[0:2])
648
649 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200650 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200651 return nil
652 }
653
Akron3f8571a2021-08-05 11:18:10 +0200654 dat.epsilon = int(bo.Uint16(buf[2:4]))
655 dat.unknown = int(bo.Uint16(buf[4:6]))
656 dat.identity = int(bo.Uint16(buf[6:8]))
657 dat.final = int(bo.Uint16(buf[8:10]))
658
659 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200660 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200661
Akron3a063ef2021-08-05 19:36:35 +0200662 // Shouldn't be relevant though
663 dat.maxSize = arraySize - 1
664
Akron3f8571a2021-08-05 11:18:10 +0200665 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200666 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200667 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200668 if int(sym) < 256 {
669 dat.sigmaASCII[int(sym)] = x
670 }
Akron3f8571a2021-08-05 11:18:10 +0200671 dat.sigma[sym] = x
672 }
Akron3f8571a2021-08-05 11:18:10 +0200673 }
674
Akron439f4ec2021-08-09 15:45:38 +0200675 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200676
677 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200678 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200679 return nil
680 }
681
Akron3f8571a2021-08-05 11:18:10 +0200682 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200683 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200684 return nil
685 }
686
687 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200688 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200689
Akronbb4aac52021-08-13 00:52:27 +0200690 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200691
Akronbb4aac52021-08-13 00:52:27 +0200692 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200693 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200694 return nil
695 }
696
Akronf1a16502021-08-16 15:24:38 +0200697 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200698 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200699 return nil
700 }
701
702 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200703 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
704 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200705 }
706
707 return dat
708}
709
Akron439f4ec2021-08-09 15:45:38 +0200710// Show the current state of the buffer,
711// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200712func showBuffer(buffer []rune, buffo int, buffi int) string {
713 out := make([]rune, 0, 1024)
714 for x := 0; x < len(buffer); x++ {
715 if buffi == x {
716 out = append(out, '^')
717 }
718 if buffo == x {
719 out = append(out, '[', buffer[x], ']')
720 } else {
721 out = append(out, buffer[x])
722 }
723 }
724 return string(out)
725}
726
Akrone396a932021-10-19 01:06:13 +0200727func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
728 return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
729}
730
731// TransduceTokenWriter transduces an input string against
732// the double array FSA. The rules are always greedy. If the
733// automaton fails, it takes the last possible token ending
734// branch.
Akron068874c2021-08-04 15:19:56 +0200735//
Akron4db3ecf2021-08-11 18:49:03 +0200736// Based on Mizobuchi et al (2000), p. 129,
737// with additional support for IDENTITY, UNKNOWN
738// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akrone396a932021-10-19 01:06:13 +0200739func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
Akron068874c2021-08-04 15:19:56 +0200740 var a int
Akronb4bbb472021-08-09 11:49:38 +0200741 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200742 t := uint32(1) // Initial state
743 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200744
Akron3610f102021-08-08 14:13:25 +0200745 // Remember the last position of a possible tokenend,
746 // in case the automaton fails.
747 epsilonState := uint32(0)
748 epsilonOffset := 0
749
Akrone396a932021-10-19 01:06:13 +0200750 // Remember if the last transition was epsilon
751 sentenceEnd := false
752
Akron3610f102021-08-08 14:13:25 +0200753 // Implement a low level buffer for full control,
754 // however - it is probably better to introduce
755 // this on a higher level with a io.Reader interface
756 // The buffer stores a single word and may have white
757 // space at the end (but not at the beginning).
758 //
759 // This is the only backtracking requirement because of
760 // epsilon transitions, to support tokenizations like:
761 // "this is an example|.| And it works." vs
762 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200763 //
764 // TODO:
765 // Store a translation buffer as well, so characters don't
766 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200767 buffer := make([]rune, 1024)
768 buffo := 0 // Buffer offset
769 buffi := 0 // Buffer length
770
Akron3f8571a2021-08-05 11:18:10 +0200771 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200772 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200773
Akron3f8571a2021-08-05 11:18:10 +0200774 var char rune
Akrone396a932021-10-19 01:06:13 +0200775
Akron3f8571a2021-08-05 11:18:10 +0200776 var err error
777 eof := false
Akronb7e1f132021-08-10 11:52:31 +0200778 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200779
Akronc5d8d432021-08-10 16:48:44 +0200780PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200781 for {
782
Akronb7e1f132021-08-10 11:52:31 +0200783 if newchar {
784 // Get from reader if buffer is empty
785 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200786 if eof {
787 break
788 }
Akronb7e1f132021-08-10 11:52:31 +0200789 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200790
Akronb7e1f132021-08-10 11:52:31 +0200791 // No more runes to read
792 if err != nil {
793 eof = true
794 break
795 }
796 buffer[buffi] = char
797 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200798 }
Akronb7e1f132021-08-10 11:52:31 +0200799
800 char = buffer[buffo]
801
802 if DEBUG {
803 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
804 }
805
Akron6f1c16c2021-08-17 10:45:42 +0200806 // TODO:
807 // Better not repeatedly check for a!
808 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200809 if int(char) < 256 {
810 a = dat.sigmaASCII[int(char)]
811 } else {
812 a, ok = dat.sigma[char]
813 if !ok {
814 a = 0
815 }
816 }
Akronb7e1f132021-08-10 11:52:31 +0200817
818 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +0200819 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +0200820 a = dat.identity
821 }
822
823 t0 = t
824
825 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200826 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +0200827 // Remember state for backtracking to last tokenend state
828 epsilonState = t0
829 epsilonOffset = buffo
Akrone396a932021-10-19 01:06:13 +0200830
831 if DEBUG {
832 fmt.Println("epsilonOffset is set to", buffo)
833 }
Akronb7e1f132021-08-10 11:52:31 +0200834 }
Akron3f8571a2021-08-05 11:18:10 +0200835 }
Akron3610f102021-08-08 14:13:25 +0200836
Akronb7e1f132021-08-10 11:52:31 +0200837 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200838 t = dat.array[t0].getBase() + uint32(a)
839 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200840
Akron524c5432021-08-05 14:14:27 +0200841 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200842 // Char is only relevant if set
843 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
844 if false {
845 fmt.Println(dat.outgoing(t0))
846 }
Akron524c5432021-08-05 14:14:27 +0200847 }
848
Akronb7e1f132021-08-10 11:52:31 +0200849 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200850 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200851
852 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +0200853 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200854 }
855
856 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200857
Akron068874c2021-08-04 15:19:56 +0200858 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200859 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200860 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +0200861 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200862 }
863 a = dat.unknown
864
865 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +0200866
Akron068874c2021-08-04 15:19:56 +0200867 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200868 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200869 epsilonState = 0 // reset
870 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200871 a = dat.epsilon
872
Akron3610f102021-08-08 14:13:25 +0200873 if DEBUG {
874 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
875 }
Akronb4bbb472021-08-09 11:49:38 +0200876
Akron068874c2021-08-04 15:19:56 +0200877 } else {
878 break
879 }
Akron068874c2021-08-04 15:19:56 +0200880
Akronb7e1f132021-08-10 11:52:31 +0200881 newchar = false
882 continue
Akronb4bbb472021-08-09 11:49:38 +0200883 }
884
Akronb7e1f132021-08-10 11:52:31 +0200885 // Transition was successful
886 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200887
888 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200889 if a != dat.epsilon {
890
Akron3610f102021-08-08 14:13:25 +0200891 buffo++
Akronb4bbb472021-08-09 11:49:38 +0200892
Akron439f4ec2021-08-09 15:45:38 +0200893 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +0200894 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200895 if DEBUG {
896 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
897 }
Akron439f4ec2021-08-09 15:45:38 +0200898 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200899 }
Akron068874c2021-08-04 15:19:56 +0200900
Akrone396a932021-10-19 01:06:13 +0200901 } else {
Akron524c5432021-08-05 14:14:27 +0200902
Akrone396a932021-10-19 01:06:13 +0200903 // Transition marks the end of a token - so flush the buffer
904 if buffo > 0 {
Akronc5d8d432021-08-10 16:48:44 +0200905 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +0200906 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +0200907 }
Akrone396a932021-10-19 01:06:13 +0200908 w.Token(0, buffer[:buffo])
Akronc5d8d432021-08-10 16:48:44 +0200909 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +0200910 sentenceEnd = false
911 } else {
912 sentenceEnd = true
913 w.SentenceEnd()
Akron3610f102021-08-08 14:13:25 +0200914 }
Akron1594cb82021-08-11 11:14:56 +0200915 if DEBUG {
916 fmt.Println("-> Newline")
917 }
Akron439f4ec2021-08-09 15:45:38 +0200918 }
Akron3610f102021-08-08 14:13:25 +0200919
Akronc5d8d432021-08-10 16:48:44 +0200920 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +0200921 if rewindBuffer {
922
Akrone396a932021-10-19 01:06:13 +0200923 if DEBUG {
924 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
925 }
926
Akron439f4ec2021-08-09 15:45:38 +0200927 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +0200928 for x, i := range buffer[buffo:buffi] {
929 buffer[x] = i
930 }
Akronb4bbb472021-08-09 11:49:38 +0200931
Akron3610f102021-08-08 14:13:25 +0200932 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200933 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +0200934 epsilonOffset = 0
935 epsilonState = 0
936
Akron3610f102021-08-08 14:13:25 +0200937 buffo = 0
938 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +0200939 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +0200940 }
Akron84d68e62021-08-04 17:06:52 +0200941 }
942
Akronb7e1f132021-08-10 11:52:31 +0200943 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +0200944 if ta.isSeparate() {
945 t = ta.getBase()
946 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +0200947
948 if DEBUG {
949 fmt.Println("Representative pointing to", t)
950 }
951 }
952
Akronc5d8d432021-08-10 16:48:44 +0200953 newchar = true
954
Akron068874c2021-08-04 15:19:56 +0200955 // TODO:
956 // Prevent endless epsilon loops!
957 }
958
Akron439f4ec2021-08-09 15:45:38 +0200959 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +0200960 if !eof {
Akron068874c2021-08-04 15:19:56 +0200961 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +0200962 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +0200963 }
964 return false
965 }
966
Akronb7e1f132021-08-10 11:52:31 +0200967 if DEBUG {
968 fmt.Println("Entering final check")
969 }
970
Akrone396a932021-10-19 01:06:13 +0200971 /*
972 The following code is for deprecated automata relying on
973 final states. Datok now requires final states to be marked
974 with tokenends.
Akronf1a16502021-08-16 15:24:38 +0200975
Akrone396a932021-10-19 01:06:13 +0200976 // Automaton is in a final state, so flush the buffer and return
977 x := dat.array[t].getBase() + uint32(dat.final)
Akronb4bbb472021-08-09 11:49:38 +0200978
Akrone396a932021-10-19 01:06:13 +0200979 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akron6e70dc82021-08-11 11:33:18 +0200980
Akrone396a932021-10-19 01:06:13 +0200981 if buffi > 0 {
982 if DEBUG {
983 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
984 }
985 w.Token(0, buffer[:buffi])
Akronc5d8d432021-08-10 16:48:44 +0200986 }
Akron84d68e62021-08-04 17:06:52 +0200987
Akrone396a932021-10-19 01:06:13 +0200988 // Add an additional sentence ending, if the file is over but no explicit
989 // sentence split was reached. This may be controversial and therefore
990 // optional via parameter.
991 if !dat.array[t0].isTokenEnd() {
992 w.SentenceEnd()
993 }
Akron6e70dc82021-08-11 11:33:18 +0200994
Akrone396a932021-10-19 01:06:13 +0200995 // TODO:
996 // There may be a new line at the end, from an epsilon,
997 // so we may need to go on!
998 return true
999 }
1000 */
Akron068874c2021-08-04 15:19:56 +02001001
Akronc5d8d432021-08-10 16:48:44 +02001002 // Check epsilon transitions until a final state is reached
1003 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001004 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001005 a = dat.epsilon
1006 newchar = false
Akrone396a932021-10-19 01:06:13 +02001007
Akronf1a16502021-08-16 15:24:38 +02001008 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001009 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001010 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001011
Akronc5d8d432021-08-10 16:48:44 +02001012 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001013 t0 = epsilonState
1014 epsilonState = 0 // reset
1015 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001016 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001017 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001018 }
Akronc5d8d432021-08-10 16:48:44 +02001019 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001020 }
Akrone396a932021-10-19 01:06:13 +02001021
1022 // Add an additional sentence ending, if the file is over but no explicit
1023 // sentence split was reached. This may be controversial and therefore
1024 // optional via parameter.
1025 if !sentenceEnd {
1026 // writer.WriteRune('\n')
1027 // ::Sentenceend
1028 w.SentenceEnd()
1029 if DEBUG {
1030 fmt.Println("-> Newline")
1031 }
1032 }
1033
1034 return true
Akron068874c2021-08-04 15:19:56 +02001035}