blob: fa6e44d5ada6a4cf394343852ebec1caab391d2d [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
Akron679b4862021-09-02 16:59:26 +020033 "math"
Akron8ef408b2021-08-02 22:11:04 +020034 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron740f3d72021-08-03 12:12:34 +020036
Akron527c10c2021-08-13 01:45:18 +020037 "log"
Akron8ef408b2021-08-02 22:11:04 +020038)
39
40const (
Akron2a4b9292021-08-04 15:35:22 +020041 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020042 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020043 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020044 FIRSTBIT uint32 = 1 << 31
45 SECONDBIT uint32 = 1 << 30
46 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020047)
48
Akron03c92fe2021-08-09 14:07:57 +020049// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020050var bo binary.ByteOrder = binary.LittleEndian
51
Akron8ef408b2021-08-02 22:11:04 +020052type mapping struct {
53 source int
Akron3fdfec62021-08-04 11:40:10 +020054 target uint32
Akron8ef408b2021-08-02 22:11:04 +020055}
56
Akronf1a16502021-08-16 15:24:38 +020057type bc struct {
58 base uint32
59 check uint32
60}
61
Akron03c92fe2021-08-09 14:07:57 +020062// DaTokenizer represents a tokenizer implemented as a
63// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020064type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020065 sigma map[rune]int
66 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020067 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020068 transCount int
Akronf1a16502021-08-16 15:24:38 +020069 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron03c92fe2021-08-09 14:07:57 +020076 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020077}
78
Akron439f4ec2021-08-09 15:45:38 +020079// ToDoubleArray turns the intermediate tokenizer representation
80// into a double array representation.
81//
82// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020083func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020084
85 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020086 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020087 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020088 final: auto.final,
89 unknown: auto.unknown,
90 identity: auto.identity,
91 epsilon: auto.epsilon,
92 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020093 }
94
Akronf1a16502021-08-16 15:24:38 +020095 dat.resize(dat.final)
96
Akron1c34ce62021-09-23 23:27:39 +020097 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +020098 if int(sym) < 256 {
99 dat.sigmaASCII[int(sym)] = num
100 }
Akronf2120ca2021-08-03 16:26:41 +0200101 dat.sigma[sym] = num
102 }
Akron8ef408b2021-08-02 22:11:04 +0200103
104 mark := 0
105 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200106 var base uint32
Akronde18e902021-08-27 09:34:12 +0200107 var atrans *edge
108 var s, s1 int
109 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200110
Akron439f4ec2021-08-09 15:45:38 +0200111 // Create a mapping from s (in Ms aka Intermediate FSA)
112 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200113 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200114 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200115
Akron439f4ec2021-08-09 15:45:38 +0200116 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200117 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200118 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200119 size++
120
Akron740f3d72021-08-03 12:12:34 +0200121 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200122 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200123 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
124 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200125
Akron7b1faa62021-09-02 16:10:21 +0200126 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200127
Akron8ef408b2021-08-02 22:11:04 +0200128 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200129 s = table[mark].source // This is a state in Ms
130 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200131 // s = tableQueue[mark]
132 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200133 mark++
Akron740f3d72021-08-03 12:12:34 +0200134
135 // Following the paper, here the state t can be remembered
136 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200137 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200138 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200139
Akron740f3d72021-08-03 12:12:34 +0200140 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200141 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200142 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200143 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200144 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200145 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200146
Akron773b1ef2021-08-03 17:37:20 +0200147 // TODO:
Akron068874c2021-08-04 15:19:56 +0200148 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200149 // outdegree of .end
150
Akron740f3d72021-08-03 12:12:34 +0200151 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200152 for _, a := range A {
153
Akron1c34ce62021-09-23 23:27:39 +0200154 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200155
Akron1c34ce62021-09-23 23:27:39 +0200156 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200157
Akron740f3d72021-08-03 12:12:34 +0200158 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200159 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200160
Akron740f3d72021-08-03 12:12:34 +0200161 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200162 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200163 dat.array[t1].setCheck(t)
164
165 // Set maxSize
166 if dat.maxSize < int(t1) {
167 dat.maxSize = int(t1)
168 }
Akron8ef408b2021-08-02 22:11:04 +0200169
Akron439f4ec2021-08-09 15:45:38 +0200170 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200171 fmt.Println("Translate transition",
172 s, "->", s1, "(", a, ")", "to", t, "->", t1)
173 }
174
Akron83e75a22021-08-04 13:14:06 +0200175 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200176 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200177 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200178 if DEBUG {
179 fmt.Println("Set", t1, "to nontoken")
180 }
Akron83e75a22021-08-04 13:14:06 +0200181 }
182
Akron84d68e62021-08-04 17:06:52 +0200183 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200184 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200185 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200186 if DEBUG {
187 fmt.Println("Set", t1, "to tokenend")
188 }
Akron84d68e62021-08-04 17:06:52 +0200189 }
190
Akron740f3d72021-08-03 12:12:34 +0200191 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200192 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200193 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200194
Akron439f4ec2021-08-09 15:45:38 +0200195 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200196 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200197 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200198 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200199 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200200 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200201 size++
202 } else {
Akron740f3d72021-08-03 12:12:34 +0200203 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200204 dat.array[t1].setBase(r)
205 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200206 }
207 } else {
Akron740f3d72021-08-03 12:12:34 +0200208 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200209 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200210
Akronde18e902021-08-27 09:34:12 +0200211 if dat.maxSize < int(base)+dat.final {
212 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200213 }
Akron8ef408b2021-08-02 22:11:04 +0200214 }
215 }
216 }
217
218 // Following Mizobuchi et al (2000) the size of the
219 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200220 // We make the size a bit larger so we never have to check for boundaries.
221 dat.setSize(dat.maxSize + dat.final)
222 if len(dat.array) < dat.maxSize+dat.final {
223 dat.array = append(dat.array, make([]bc, dat.final)...)
224 }
225 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200226 return dat
Akron8ef408b2021-08-02 22:11:04 +0200227}
228
Akron8ef408b2021-08-02 22:11:04 +0200229// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200230// exists and return this as a representative.
231// Currently iterates through the whole table
232// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200233func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200234 for x := 0; x < size; x++ {
235 if table[x].source == s {
236 return table[x].target
237 }
238 }
239 return 0
240}
241
Akron941f2152021-09-26 15:14:25 +0200242// Type of tokenizer
243func (DaTokenizer) Type() string {
244 return DAMAGIC
245}
246
Akron64ffd9a2021-08-03 19:55:21 +0200247// Resize double array when necessary
248func (dat *DaTokenizer) resize(l int) {
249 // TODO:
250 // This is a bit too aggressive atm and should be calmed down.
251 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200252 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200253 }
Akron64ffd9a2021-08-03 19:55:21 +0200254}
Akronc9d84a62021-08-03 15:56:03 +0200255
Akron64ffd9a2021-08-03 19:55:21 +0200256// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200257func (bc *bc) setBase(v uint32) {
258 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200259}
260
261// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200262func (bc *bc) getBase() uint32 {
263 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200264}
265
266// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200267func (bc *bc) setCheck(v uint32) {
268 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200269}
270
271// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200272func (bc *bc) getCheck() uint32 {
273 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200274}
275
Akron3fdfec62021-08-04 11:40:10 +0200276// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200277func (bc *bc) isSeparate() bool {
278 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200279}
280
281// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200282func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200283 if sep {
Akronf1a16502021-08-16 15:24:38 +0200284 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200285 } else {
Akronf1a16502021-08-16 15:24:38 +0200286 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200287 }
288}
289
Akron83e75a22021-08-04 13:14:06 +0200290// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200291func (bc *bc) isNonToken() bool {
292 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200293}
294
295// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200296func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200297 if sep {
Akronf1a16502021-08-16 15:24:38 +0200298 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200299 } else {
Akronf1a16502021-08-16 15:24:38 +0200300 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200301 }
302}
303
Akron84d68e62021-08-04 17:06:52 +0200304// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200305func (bc *bc) isTokenEnd() bool {
306 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200307}
308
309// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200310func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200311 if sep {
Akronf1a16502021-08-16 15:24:38 +0200312 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200313 } else {
Akronf1a16502021-08-16 15:24:38 +0200314 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200315 }
316}
317
Akron64ffd9a2021-08-03 19:55:21 +0200318// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200319func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200320 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200321}
322
323// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200324func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200325 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200326}
327
328// Based on Mizobuchi et al (2000), p. 124
329// This iterates for every state through the complete double array
330// structure until it finds a gap that fits all outgoing transitions
331// of the state. This is extremely slow, but is only necessary in the
332// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200333func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200334
335 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200336 base := uint32(1)
337
Akron8ef408b2021-08-02 22:11:04 +0200338OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200339 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200340 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200341 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200342 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200343 base++
344 goto OVERLAP
345 }
346 }
Akron8ef408b2021-08-02 22:11:04 +0200347 return base
348}
349
Akron679b4862021-09-02 16:59:26 +0200350// This is an implementation of xCheck with the skip-improvement
351// proposed by Morita et al. (2001)
352func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
353
354 // Start at the first entry of the double array list
355 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
356
357OVERLAP:
358 // Resize the array if necessary
359 dat.resize(int(base) + dat.final)
360 for _, a := range symbols {
361 if dat.array[int(base)+a].getCheck() != 0 {
362 base++
363 goto OVERLAP
364 }
365 }
366 return base
367}
368
Akron29e306f2021-09-02 18:29:56 +0200369// This is an implementation of xCheck with the skip-improvement
370// proposed by Morita et al. (2001) for higher outdegrees as
371// proposed by Niu et al. (2013)
372func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
373
374 // Start at the first entry of the double array list
375 base := uint32(1)
376
377 // Or skip the first few entries
378 if len(symbols) >= 3 {
379 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
380 }
381
382OVERLAP:
383 // Resize the array if necessary
384 dat.resize(int(base) + dat.final + 1)
385 for _, a := range symbols {
386 if dat.array[int(base)+a].getCheck() != 0 {
387 base++
388 goto OVERLAP
389 }
390 }
391 return base
392}
393
Akron7b1faa62021-09-02 16:10:21 +0200394// This is an implementation of xCheck wit an improvement
395// proposed by Niu et al. (2013)
396func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
397
398 // Start at the first entry of the double array list
399 base := uint32(1)
400
401 if len(symbols) > 3 {
402 sort.Ints(symbols)
403 if *block_begin_pos > uint32(symbols[0]) {
404 dat.resize(int(*block_begin_pos) + dat.final)
405 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
406 return *block_begin_pos - uint32(symbols[0])
407 }
408 }
409
410OVERLAP:
411 // Resize the array if necessary
412 dat.resize(int(base) + dat.final)
413 for _, a := range symbols {
414 if dat.array[int(base)+a].getCheck() != 0 {
415 base++
416 goto OVERLAP
417 }
418 }
419 return base
420}
421
Akron3610f102021-08-08 14:13:25 +0200422// List all outgoing transitions for a state
423// for testing purposes
424func (dat *DaTokenizer) outgoing(t uint32) []int {
425
426 valid := make([]int, 0, len(dat.sigma))
427
428 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200429 t1 := dat.array[t].getBase() + uint32(a)
430 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200431 valid = append(valid, a)
432 }
433 }
434
435 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200436 t1 := dat.array[t].getBase() + uint32(a)
437 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200438 valid = append(valid, -1*a)
439 }
440 }
441
442 sort.Ints(valid)
443
444 return valid
445}
446
Akron4fa28b32021-08-27 10:55:41 +0200447// TransCount as the number of transitions aka arcs in the
448// finite state automaton
449func (dat *DaTokenizer) TransCount() int {
450 // Cache the transCount
451 if dat.transCount > 0 {
452 return dat.transCount
453 }
454
455 dat.transCount = 0
456 for x := 1; x < len(dat.array); x++ {
457 if dat.array[x].getBase() != 0 {
458 dat.transCount++
459 }
460 }
461
462 return dat.transCount
463}
464
Akron03a3c612021-08-04 11:51:27 +0200465// LoadFactor as defined in Kanda et al (2018),
466// i.e. the proportion of non-empty elements to all elements.
467func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200468 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200469}
470
Akron439f4ec2021-08-09 15:45:38 +0200471// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200472func (dat *DaTokenizer) Save(file string) (n int64, err error) {
473 f, err := os.Create(file)
474 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200475 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200476 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200477 }
478 defer f.Close()
479 gz := gzip.NewWriter(f)
480 defer gz.Close()
481 n, err = dat.WriteTo(gz)
482 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200483 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200484 return n, err
485 }
486 gz.Flush()
487 return n, nil
488}
489
490// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200491func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
492
Akron3a063ef2021-08-05 19:36:35 +0200493 wb := bufio.NewWriter(w)
494 defer wb.Flush()
495
Akron6247a5d2021-08-03 19:18:28 +0200496 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200497 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200498 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200499 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200500 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200501 }
502
503 // Get sigma as a list
504 sigmalist := make([]rune, len(dat.sigma)+16)
505 max := 0
506 for sym, num := range dat.sigma {
507 sigmalist[num] = sym
508 if num > max {
509 max = num
510 }
511 }
512
513 sigmalist = sigmalist[:max+1]
514
Akron3f8571a2021-08-05 11:18:10 +0200515 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200516 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200517 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
518 bo.PutUint16(buf[4:6], uint16(dat.unknown))
519 bo.PutUint16(buf[6:8], uint16(dat.identity))
520 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200521 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200522 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200523 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200524 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200525 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200526 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200527 }
528
529 all += more
530
Akron6247a5d2021-08-03 19:18:28 +0200531 // Write sigma
532 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200533
Akron3a063ef2021-08-05 19:36:35 +0200534 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200535 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200536 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200537 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200538 }
539 all += more
540 }
Akron439f4ec2021-08-09 15:45:38 +0200541
Akron6247a5d2021-08-03 19:18:28 +0200542 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200543 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200544 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200545 }
Akron6247a5d2021-08-03 19:18:28 +0200546
547 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200548 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200549 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200550 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200551 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200552 }
553 all += more
554
Akronf1a16502021-08-16 15:24:38 +0200555 // for x := 0; x < len(dat.array); x++ {
556 for _, bc := range dat.array {
557 bo.PutUint32(buf[0:4], bc.base)
558 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200559 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200560 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200561 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200562 }
Akron439f4ec2021-08-09 15:45:38 +0200563 all += more
Akron3a063ef2021-08-05 19:36:35 +0200564 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200565 log.Println("Can not write base uint32")
566 return int64(all), err
567 }
568 bo.PutUint32(buf[0:4], bc.check)
569 more, err = wb.Write(buf[0:4])
570 if err != nil {
571 log.Println(err)
572 return int64(all), err
573 }
574 all += more
575 if more != 4 {
576 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200577 return int64(all), err
578 }
Akron6247a5d2021-08-03 19:18:28 +0200579 }
580
581 return int64(all), err
582}
583
Akron439f4ec2021-08-09 15:45:38 +0200584// LoadDatokFile reads a double array represented tokenizer
585// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200586func LoadDatokFile(file string) *DaTokenizer {
587 f, err := os.Open(file)
588 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200589 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200590 return nil
Akron3f8571a2021-08-05 11:18:10 +0200591 }
592 defer f.Close()
593
594 gz, err := gzip.NewReader(f)
595 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200596 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200597 return nil
Akron3f8571a2021-08-05 11:18:10 +0200598 }
599 defer gz.Close()
600
Akron3a063ef2021-08-05 19:36:35 +0200601 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200602 return ParseDatok(gz)
603}
604
Akron439f4ec2021-08-09 15:45:38 +0200605// LoadDatokFile reads a double array represented tokenizer
606// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200607func ParseDatok(ior io.Reader) *DaTokenizer {
608
Akron439f4ec2021-08-09 15:45:38 +0200609 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200610 dat := &DaTokenizer{
611 sigma: make(map[rune]int),
612 epsilon: 0,
613 unknown: 0,
614 identity: 0,
615 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200616 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200617 }
618
619 r := bufio.NewReader(ior)
620
Akron3f8571a2021-08-05 11:18:10 +0200621 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200622 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200623
Akron439f4ec2021-08-09 15:45:38 +0200624 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200625
626 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200627 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200628 return nil
629 }
630
Akron16c312e2021-09-26 13:11:12 +0200631 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200632 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200633 return nil
634 }
635
Akron439f4ec2021-08-09 15:45:38 +0200636 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200637 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200638 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200639 return nil
640 }
641
Akron439f4ec2021-08-09 15:45:38 +0200642 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200643 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200644 return nil
645 }
Akron3f8571a2021-08-05 11:18:10 +0200646
Akron3a063ef2021-08-05 19:36:35 +0200647 version := bo.Uint16(buf[0:2])
648
649 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200650 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200651 return nil
652 }
653
Akron3f8571a2021-08-05 11:18:10 +0200654 dat.epsilon = int(bo.Uint16(buf[2:4]))
655 dat.unknown = int(bo.Uint16(buf[4:6]))
656 dat.identity = int(bo.Uint16(buf[6:8]))
657 dat.final = int(bo.Uint16(buf[8:10]))
658
659 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200660 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200661
Akron3a063ef2021-08-05 19:36:35 +0200662 // Shouldn't be relevant though
663 dat.maxSize = arraySize - 1
664
Akron3f8571a2021-08-05 11:18:10 +0200665 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200666 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200667 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200668 if int(sym) < 256 {
669 dat.sigmaASCII[int(sym)] = x
670 }
Akron3f8571a2021-08-05 11:18:10 +0200671 dat.sigma[sym] = x
672 }
Akron3f8571a2021-08-05 11:18:10 +0200673 }
674
Akron439f4ec2021-08-09 15:45:38 +0200675 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200676
677 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200678 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200679 return nil
680 }
681
Akron3f8571a2021-08-05 11:18:10 +0200682 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200683 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200684 return nil
685 }
686
687 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200688 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200689
Akronbb4aac52021-08-13 00:52:27 +0200690 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200691
Akronbb4aac52021-08-13 00:52:27 +0200692 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200693 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200694 return nil
695 }
696
Akronf1a16502021-08-16 15:24:38 +0200697 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200698 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200699 return nil
700 }
701
702 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200703 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
704 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200705 }
706
707 return dat
708}
709
Akron439f4ec2021-08-09 15:45:38 +0200710// Show the current state of the buffer,
711// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200712func showBuffer(buffer []rune, buffo int, buffi int) string {
713 out := make([]rune, 0, 1024)
714 for x := 0; x < len(buffer); x++ {
715 if buffi == x {
716 out = append(out, '^')
717 }
718 if buffo == x {
719 out = append(out, '[', buffer[x], ']')
720 } else {
721 out = append(out, buffer[x])
722 }
723 }
724 return string(out)
725}
726
Akron84d68e62021-08-04 17:06:52 +0200727// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +0200728// FSA. The rules are always greedy. If the automaton fails,
729// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +0200730//
Akron4db3ecf2021-08-11 18:49:03 +0200731// Based on Mizobuchi et al (2000), p. 129,
732// with additional support for IDENTITY, UNKNOWN
733// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +0200734func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +0200735 var a int
Akronb4bbb472021-08-09 11:49:38 +0200736 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200737 t := uint32(1) // Initial state
738 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200739
Akron3610f102021-08-08 14:13:25 +0200740 // Remember the last position of a possible tokenend,
741 // in case the automaton fails.
742 epsilonState := uint32(0)
743 epsilonOffset := 0
744
745 // Implement a low level buffer for full control,
746 // however - it is probably better to introduce
747 // this on a higher level with a io.Reader interface
748 // The buffer stores a single word and may have white
749 // space at the end (but not at the beginning).
750 //
751 // This is the only backtracking requirement because of
752 // epsilon transitions, to support tokenizations like:
753 // "this is an example|.| And it works." vs
754 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200755 //
756 // TODO:
757 // Store a translation buffer as well, so characters don't
758 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200759 buffer := make([]rune, 1024)
760 buffo := 0 // Buffer offset
761 buffi := 0 // Buffer length
762
Akron3f8571a2021-08-05 11:18:10 +0200763 reader := bufio.NewReader(r)
764 writer := bufio.NewWriter(w)
765 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +0200766
Akron3f8571a2021-08-05 11:18:10 +0200767 var char rune
768 var err error
769 eof := false
Akronb7e1f132021-08-10 11:52:31 +0200770 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200771
Akronc5d8d432021-08-10 16:48:44 +0200772PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200773 for {
774
Akronb7e1f132021-08-10 11:52:31 +0200775 if newchar {
776 // Get from reader if buffer is empty
777 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200778 if eof {
779 break
780 }
Akronb7e1f132021-08-10 11:52:31 +0200781 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200782
Akronb7e1f132021-08-10 11:52:31 +0200783 // No more runes to read
784 if err != nil {
785 eof = true
786 break
787 }
788 buffer[buffi] = char
789 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200790 }
Akronb7e1f132021-08-10 11:52:31 +0200791
792 char = buffer[buffo]
793
794 if DEBUG {
795 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
796 }
797
Akron6f1c16c2021-08-17 10:45:42 +0200798 // TODO:
799 // Better not repeatedly check for a!
800 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200801 if int(char) < 256 {
802 a = dat.sigmaASCII[int(char)]
803 } else {
804 a, ok = dat.sigma[char]
805 if !ok {
806 a = 0
807 }
808 }
Akronb7e1f132021-08-10 11:52:31 +0200809
810 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +0200811 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +0200812 a = dat.identity
813 }
814
815 t0 = t
816
817 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200818 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +0200819 // Remember state for backtracking to last tokenend state
820 epsilonState = t0
821 epsilonOffset = buffo
822 }
Akron3f8571a2021-08-05 11:18:10 +0200823 }
Akron3610f102021-08-08 14:13:25 +0200824
Akronb7e1f132021-08-10 11:52:31 +0200825 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200826 t = dat.array[t0].getBase() + uint32(a)
827 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200828
Akron524c5432021-08-05 14:14:27 +0200829 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200830 // Char is only relevant if set
831 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
832 if false {
833 fmt.Println(dat.outgoing(t0))
834 }
Akron524c5432021-08-05 14:14:27 +0200835 }
836
Akronb7e1f132021-08-10 11:52:31 +0200837 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200838 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200839
840 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +0200841 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200842 }
843
844 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200845
Akron068874c2021-08-04 15:19:56 +0200846 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200847 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200848 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +0200849 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200850 }
851 a = dat.unknown
852
853 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +0200854
Akron068874c2021-08-04 15:19:56 +0200855 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200856 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200857 epsilonState = 0 // reset
858 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200859 a = dat.epsilon
860
Akron3610f102021-08-08 14:13:25 +0200861 if DEBUG {
862 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
863 }
Akronb4bbb472021-08-09 11:49:38 +0200864
Akron068874c2021-08-04 15:19:56 +0200865 } else {
866 break
867 }
Akron068874c2021-08-04 15:19:56 +0200868
Akronb7e1f132021-08-10 11:52:31 +0200869 newchar = false
870 continue
Akronb4bbb472021-08-09 11:49:38 +0200871 }
872
Akronb7e1f132021-08-10 11:52:31 +0200873 // Transition was successful
874 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200875
876 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200877 if a != dat.epsilon {
878
Akron3610f102021-08-08 14:13:25 +0200879 buffo++
Akronb4bbb472021-08-09 11:49:38 +0200880
Akron439f4ec2021-08-09 15:45:38 +0200881 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +0200882 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200883 if DEBUG {
884 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
885 }
Akron439f4ec2021-08-09 15:45:38 +0200886 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200887 }
Akron3f8571a2021-08-05 11:18:10 +0200888 }
Akron068874c2021-08-04 15:19:56 +0200889
Akronc5d8d432021-08-10 16:48:44 +0200890 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +0200891 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +0200892
Akronc5d8d432021-08-10 16:48:44 +0200893 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +0200894 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +0200895 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +0200896 }
Akron01912fc2021-08-12 11:41:58 +0200897 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +0200898 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200899 }
Akron1594cb82021-08-11 11:14:56 +0200900 if DEBUG {
901 fmt.Println("-> Newline")
902 }
903 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +0200904 }
Akron3610f102021-08-08 14:13:25 +0200905
Akronc5d8d432021-08-10 16:48:44 +0200906 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +0200907 if rewindBuffer {
908
909 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +0200910 for x, i := range buffer[buffo:buffi] {
911 buffer[x] = i
912 }
Akronb4bbb472021-08-09 11:49:38 +0200913
Akron3610f102021-08-08 14:13:25 +0200914 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200915 // epsilonOffset -= buffo
916 epsilonOffset = buffo
Akron3610f102021-08-08 14:13:25 +0200917 buffo = 0
918 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +0200919 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +0200920 }
Akron84d68e62021-08-04 17:06:52 +0200921 }
922
Akronb7e1f132021-08-10 11:52:31 +0200923 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +0200924 if ta.isSeparate() {
925 t = ta.getBase()
926 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +0200927
928 if DEBUG {
929 fmt.Println("Representative pointing to", t)
930 }
931 }
932
Akronc5d8d432021-08-10 16:48:44 +0200933 newchar = true
934
Akron068874c2021-08-04 15:19:56 +0200935 // TODO:
936 // Prevent endless epsilon loops!
937 }
938
Akron439f4ec2021-08-09 15:45:38 +0200939 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +0200940 if !eof {
Akron068874c2021-08-04 15:19:56 +0200941 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +0200942 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +0200943 }
944 return false
945 }
946
Akronb7e1f132021-08-10 11:52:31 +0200947 if DEBUG {
948 fmt.Println("Entering final check")
949 }
950
Akronc5d8d432021-08-10 16:48:44 +0200951 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +0200952 x := dat.array[t].getBase() + uint32(dat.final)
953
954 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +0200955
956 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +0200957 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +0200958 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +0200959 }
Akron01912fc2021-08-12 11:41:58 +0200960 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +0200961
Akronf1a16502021-08-16 15:24:38 +0200962 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +0200963 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +0200964 if DEBUG {
965 fmt.Println("-> Newline")
966 }
Akrondf0a3ef2021-08-09 15:53:45 +0200967 }
Akron84d68e62021-08-04 17:06:52 +0200968 }
969
Akron6e70dc82021-08-11 11:33:18 +0200970 // Add an additional sentence ending, if the file is over but no explicit
971 // sentence split was reached. This may be controversial and therefore
972 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +0200973 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +0200974 writer.WriteRune('\n')
975 if DEBUG {
976 fmt.Println("-> Newline")
977 }
978 }
979
Akrone61380b2021-08-16 10:10:46 +0200980 // TODO:
981 // There may be a new line at the end, from an epsilon,
982 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +0200983 return true
984 }
985
Akronc5d8d432021-08-10 16:48:44 +0200986 // Check epsilon transitions until a final state is reached
987 t0 = t
Akronf1a16502021-08-16 15:24:38 +0200988 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +0200989 a = dat.epsilon
990 newchar = false
Akronf1a16502021-08-16 15:24:38 +0200991 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +0200992 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +0200993 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +0200994
Akronc5d8d432021-08-10 16:48:44 +0200995 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +0200996 t0 = epsilonState
997 epsilonState = 0 // reset
998 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +0200999 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001000 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001001 }
Akronc5d8d432021-08-10 16:48:44 +02001002 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001003 }
Akronc5d8d432021-08-10 16:48:44 +02001004 return false
Akron068874c2021-08-04 15:19:56 +02001005}