blob: 4f7ac6f32d2f00f62c2fa9bd6d3c8b8ecaf254c8 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
Akron679b4862021-09-02 16:59:26 +020033 "math"
Akron8ef408b2021-08-02 22:11:04 +020034 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron740f3d72021-08-03 12:12:34 +020036
Akron527c10c2021-08-13 01:45:18 +020037 "log"
Akron8ef408b2021-08-02 22:11:04 +020038)
39
40const (
Akron2a4b9292021-08-04 15:35:22 +020041 DEBUG = false
42 MAGIC = "DATOK"
43 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020044 FIRSTBIT uint32 = 1 << 31
45 SECONDBIT uint32 = 1 << 30
46 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020047)
48
Akron03c92fe2021-08-09 14:07:57 +020049// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020050var bo binary.ByteOrder = binary.LittleEndian
51
Akron8ef408b2021-08-02 22:11:04 +020052type mapping struct {
53 source int
Akron3fdfec62021-08-04 11:40:10 +020054 target uint32
Akron8ef408b2021-08-02 22:11:04 +020055}
56
Akronf1a16502021-08-16 15:24:38 +020057type bc struct {
58 base uint32
59 check uint32
60}
61
Akron03c92fe2021-08-09 14:07:57 +020062// DaTokenizer represents a tokenizer implemented as a
63// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020064type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020065 sigma map[rune]int
66 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020067 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020068 transCount int
Akronf1a16502021-08-16 15:24:38 +020069 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron03c92fe2021-08-09 14:07:57 +020076 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020077}
78
Akron439f4ec2021-08-09 15:45:38 +020079// ToDoubleArray turns the intermediate tokenizer representation
80// into a double array representation.
81//
82// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020083func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020084
85 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020086 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020087 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020088 final: auto.final,
89 unknown: auto.unknown,
90 identity: auto.identity,
91 epsilon: auto.epsilon,
92 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020093 }
94
Akronf1a16502021-08-16 15:24:38 +020095 dat.resize(dat.final)
96
Akron1c34ce62021-09-23 23:27:39 +020097 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +020098 if int(sym) < 256 {
99 dat.sigmaASCII[int(sym)] = num
100 }
Akronf2120ca2021-08-03 16:26:41 +0200101 dat.sigma[sym] = num
102 }
Akron8ef408b2021-08-02 22:11:04 +0200103
104 mark := 0
105 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200106 var base uint32
Akronde18e902021-08-27 09:34:12 +0200107 var atrans *edge
108 var s, s1 int
109 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200110
Akron439f4ec2021-08-09 15:45:38 +0200111 // Create a mapping from s (in Ms aka Intermediate FSA)
112 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200113 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200114 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200115
Akron439f4ec2021-08-09 15:45:38 +0200116 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200117 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200118 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200119 size++
120
Akron740f3d72021-08-03 12:12:34 +0200121 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200122 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200123 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
124 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200125
Akron7b1faa62021-09-02 16:10:21 +0200126 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200127
Akron8ef408b2021-08-02 22:11:04 +0200128 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200129 s = table[mark].source // This is a state in Ms
130 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200131 // s = tableQueue[mark]
132 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200133 mark++
Akron740f3d72021-08-03 12:12:34 +0200134
135 // Following the paper, here the state t can be remembered
136 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200137 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200138 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200139
Akron740f3d72021-08-03 12:12:34 +0200140 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200141 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200142 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200143 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200144 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200145 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200146
Akron773b1ef2021-08-03 17:37:20 +0200147 // TODO:
Akron068874c2021-08-04 15:19:56 +0200148 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200149 // outdegree of .end
150
Akron740f3d72021-08-03 12:12:34 +0200151 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200152 for _, a := range A {
153
Akron1c34ce62021-09-23 23:27:39 +0200154 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200155
Akron1c34ce62021-09-23 23:27:39 +0200156 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200157
Akron740f3d72021-08-03 12:12:34 +0200158 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200159 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200160
Akron740f3d72021-08-03 12:12:34 +0200161 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200162 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200163 dat.array[t1].setCheck(t)
164
165 // Set maxSize
166 if dat.maxSize < int(t1) {
167 dat.maxSize = int(t1)
168 }
Akron8ef408b2021-08-02 22:11:04 +0200169
Akron439f4ec2021-08-09 15:45:38 +0200170 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200171 fmt.Println("Translate transition",
172 s, "->", s1, "(", a, ")", "to", t, "->", t1)
173 }
174
Akron83e75a22021-08-04 13:14:06 +0200175 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200176 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200177 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200178 if DEBUG {
179 fmt.Println("Set", t1, "to nontoken")
180 }
Akron83e75a22021-08-04 13:14:06 +0200181 }
182
Akron84d68e62021-08-04 17:06:52 +0200183 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200184 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200185 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200186 if DEBUG {
187 fmt.Println("Set", t1, "to tokenend")
188 }
Akron84d68e62021-08-04 17:06:52 +0200189 }
190
Akron740f3d72021-08-03 12:12:34 +0200191 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200192 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200193 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200194
Akron439f4ec2021-08-09 15:45:38 +0200195 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200196 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200197 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200198 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200199 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200200 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200201 size++
202 } else {
Akron740f3d72021-08-03 12:12:34 +0200203 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200204 dat.array[t1].setBase(r)
205 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200206 }
207 } else {
Akron740f3d72021-08-03 12:12:34 +0200208 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200209 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200210
Akronde18e902021-08-27 09:34:12 +0200211 if dat.maxSize < int(base)+dat.final {
212 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200213 }
Akron8ef408b2021-08-02 22:11:04 +0200214 }
215 }
216 }
217
218 // Following Mizobuchi et al (2000) the size of the
219 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200220 // We make the size a bit larger so we never have to check for boundaries.
221 dat.setSize(dat.maxSize + dat.final)
222 if len(dat.array) < dat.maxSize+dat.final {
223 dat.array = append(dat.array, make([]bc, dat.final)...)
224 }
225 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200226 return dat
Akron8ef408b2021-08-02 22:11:04 +0200227}
228
Akron8ef408b2021-08-02 22:11:04 +0200229// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200230// exists and return this as a representative.
231// Currently iterates through the whole table
232// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200233func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200234 for x := 0; x < size; x++ {
235 if table[x].source == s {
236 return table[x].target
237 }
238 }
239 return 0
240}
241
Akron64ffd9a2021-08-03 19:55:21 +0200242// Resize double array when necessary
243func (dat *DaTokenizer) resize(l int) {
244 // TODO:
245 // This is a bit too aggressive atm and should be calmed down.
246 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200247 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200248 }
Akron64ffd9a2021-08-03 19:55:21 +0200249}
Akronc9d84a62021-08-03 15:56:03 +0200250
Akron64ffd9a2021-08-03 19:55:21 +0200251// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200252func (bc *bc) setBase(v uint32) {
253 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200254}
255
256// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200257func (bc *bc) getBase() uint32 {
258 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200259}
260
261// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200262func (bc *bc) setCheck(v uint32) {
263 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200264}
265
266// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200267func (bc *bc) getCheck() uint32 {
268 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200269}
270
Akron3fdfec62021-08-04 11:40:10 +0200271// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200272func (bc *bc) isSeparate() bool {
273 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200274}
275
276// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200277func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200278 if sep {
Akronf1a16502021-08-16 15:24:38 +0200279 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200280 } else {
Akronf1a16502021-08-16 15:24:38 +0200281 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200282 }
283}
284
Akron83e75a22021-08-04 13:14:06 +0200285// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200286func (bc *bc) isNonToken() bool {
287 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200288}
289
290// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200291func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200292 if sep {
Akronf1a16502021-08-16 15:24:38 +0200293 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200294 } else {
Akronf1a16502021-08-16 15:24:38 +0200295 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200296 }
297}
298
Akron84d68e62021-08-04 17:06:52 +0200299// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200300func (bc *bc) isTokenEnd() bool {
301 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200302}
303
304// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200305func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200306 if sep {
Akronf1a16502021-08-16 15:24:38 +0200307 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200308 } else {
Akronf1a16502021-08-16 15:24:38 +0200309 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200310 }
311}
312
Akron64ffd9a2021-08-03 19:55:21 +0200313// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200314func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200315 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200316}
317
318// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200319func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200320 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200321}
322
323// Based on Mizobuchi et al (2000), p. 124
324// This iterates for every state through the complete double array
325// structure until it finds a gap that fits all outgoing transitions
326// of the state. This is extremely slow, but is only necessary in the
327// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200328func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200329
330 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200331 base := uint32(1)
332
Akron8ef408b2021-08-02 22:11:04 +0200333OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200334 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200335 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200336 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200337 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200338 base++
339 goto OVERLAP
340 }
341 }
Akron8ef408b2021-08-02 22:11:04 +0200342 return base
343}
344
Akron679b4862021-09-02 16:59:26 +0200345// This is an implementation of xCheck with the skip-improvement
346// proposed by Morita et al. (2001)
347func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
348
349 // Start at the first entry of the double array list
350 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
351
352OVERLAP:
353 // Resize the array if necessary
354 dat.resize(int(base) + dat.final)
355 for _, a := range symbols {
356 if dat.array[int(base)+a].getCheck() != 0 {
357 base++
358 goto OVERLAP
359 }
360 }
361 return base
362}
363
Akron29e306f2021-09-02 18:29:56 +0200364// This is an implementation of xCheck with the skip-improvement
365// proposed by Morita et al. (2001) for higher outdegrees as
366// proposed by Niu et al. (2013)
367func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
368
369 // Start at the first entry of the double array list
370 base := uint32(1)
371
372 // Or skip the first few entries
373 if len(symbols) >= 3 {
374 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
375 }
376
377OVERLAP:
378 // Resize the array if necessary
379 dat.resize(int(base) + dat.final + 1)
380 for _, a := range symbols {
381 if dat.array[int(base)+a].getCheck() != 0 {
382 base++
383 goto OVERLAP
384 }
385 }
386 return base
387}
388
Akron7b1faa62021-09-02 16:10:21 +0200389// This is an implementation of xCheck wit an improvement
390// proposed by Niu et al. (2013)
391func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
392
393 // Start at the first entry of the double array list
394 base := uint32(1)
395
396 if len(symbols) > 3 {
397 sort.Ints(symbols)
398 if *block_begin_pos > uint32(symbols[0]) {
399 dat.resize(int(*block_begin_pos) + dat.final)
400 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
401 return *block_begin_pos - uint32(symbols[0])
402 }
403 }
404
405OVERLAP:
406 // Resize the array if necessary
407 dat.resize(int(base) + dat.final)
408 for _, a := range symbols {
409 if dat.array[int(base)+a].getCheck() != 0 {
410 base++
411 goto OVERLAP
412 }
413 }
414 return base
415}
416
Akron3610f102021-08-08 14:13:25 +0200417// List all outgoing transitions for a state
418// for testing purposes
419func (dat *DaTokenizer) outgoing(t uint32) []int {
420
421 valid := make([]int, 0, len(dat.sigma))
422
423 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200424 t1 := dat.array[t].getBase() + uint32(a)
425 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200426 valid = append(valid, a)
427 }
428 }
429
430 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200431 t1 := dat.array[t].getBase() + uint32(a)
432 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200433 valid = append(valid, -1*a)
434 }
435 }
436
437 sort.Ints(valid)
438
439 return valid
440}
441
Akron4fa28b32021-08-27 10:55:41 +0200442// TransCount as the number of transitions aka arcs in the
443// finite state automaton
444func (dat *DaTokenizer) TransCount() int {
445 // Cache the transCount
446 if dat.transCount > 0 {
447 return dat.transCount
448 }
449
450 dat.transCount = 0
451 for x := 1; x < len(dat.array); x++ {
452 if dat.array[x].getBase() != 0 {
453 dat.transCount++
454 }
455 }
456
457 return dat.transCount
458}
459
Akron03a3c612021-08-04 11:51:27 +0200460// LoadFactor as defined in Kanda et al (2018),
461// i.e. the proportion of non-empty elements to all elements.
462func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200463 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200464}
465
Akron439f4ec2021-08-09 15:45:38 +0200466// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200467func (dat *DaTokenizer) Save(file string) (n int64, err error) {
468 f, err := os.Create(file)
469 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200470 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200471 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200472 }
473 defer f.Close()
474 gz := gzip.NewWriter(f)
475 defer gz.Close()
476 n, err = dat.WriteTo(gz)
477 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200478 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200479 return n, err
480 }
481 gz.Flush()
482 return n, nil
483}
484
485// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200486func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
487
Akron3a063ef2021-08-05 19:36:35 +0200488 wb := bufio.NewWriter(w)
489 defer wb.Flush()
490
Akron6247a5d2021-08-03 19:18:28 +0200491 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200492 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200493 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200494 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200495 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200496 }
497
498 // Get sigma as a list
499 sigmalist := make([]rune, len(dat.sigma)+16)
500 max := 0
501 for sym, num := range dat.sigma {
502 sigmalist[num] = sym
503 if num > max {
504 max = num
505 }
506 }
507
508 sigmalist = sigmalist[:max+1]
509
Akron3f8571a2021-08-05 11:18:10 +0200510 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200511 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200512 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
513 bo.PutUint16(buf[4:6], uint16(dat.unknown))
514 bo.PutUint16(buf[6:8], uint16(dat.identity))
515 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200516 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200517 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200518 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200519 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200520 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200521 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200522 }
523
524 all += more
525
Akron6247a5d2021-08-03 19:18:28 +0200526 // Write sigma
527 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200528
Akron3a063ef2021-08-05 19:36:35 +0200529 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200530 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200531 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200532 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200533 }
534 all += more
535 }
Akron439f4ec2021-08-09 15:45:38 +0200536
Akron6247a5d2021-08-03 19:18:28 +0200537 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200538 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200539 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200540 }
Akron6247a5d2021-08-03 19:18:28 +0200541
542 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200543 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200544 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200545 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200546 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200547 }
548 all += more
549
Akronf1a16502021-08-16 15:24:38 +0200550 // for x := 0; x < len(dat.array); x++ {
551 for _, bc := range dat.array {
552 bo.PutUint32(buf[0:4], bc.base)
553 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200554 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200555 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200556 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200557 }
Akron439f4ec2021-08-09 15:45:38 +0200558 all += more
Akron3a063ef2021-08-05 19:36:35 +0200559 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200560 log.Println("Can not write base uint32")
561 return int64(all), err
562 }
563 bo.PutUint32(buf[0:4], bc.check)
564 more, err = wb.Write(buf[0:4])
565 if err != nil {
566 log.Println(err)
567 return int64(all), err
568 }
569 all += more
570 if more != 4 {
571 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200572 return int64(all), err
573 }
Akron6247a5d2021-08-03 19:18:28 +0200574 }
575
576 return int64(all), err
577}
578
Akron439f4ec2021-08-09 15:45:38 +0200579// LoadDatokFile reads a double array represented tokenizer
580// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200581func LoadDatokFile(file string) *DaTokenizer {
582 f, err := os.Open(file)
583 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200584 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200585 return nil
Akron3f8571a2021-08-05 11:18:10 +0200586 }
587 defer f.Close()
588
589 gz, err := gzip.NewReader(f)
590 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200591 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200592 return nil
Akron3f8571a2021-08-05 11:18:10 +0200593 }
594 defer gz.Close()
595
Akron3a063ef2021-08-05 19:36:35 +0200596 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200597 return ParseDatok(gz)
598}
599
Akron439f4ec2021-08-09 15:45:38 +0200600// LoadDatokFile reads a double array represented tokenizer
601// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200602func ParseDatok(ior io.Reader) *DaTokenizer {
603
Akron439f4ec2021-08-09 15:45:38 +0200604 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200605 dat := &DaTokenizer{
606 sigma: make(map[rune]int),
607 epsilon: 0,
608 unknown: 0,
609 identity: 0,
610 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200611 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200612 }
613
614 r := bufio.NewReader(ior)
615
Akron3f8571a2021-08-05 11:18:10 +0200616 buf := make([]byte, 1024)
617 buf = buf[0:len(MAGIC)]
618
Akron439f4ec2021-08-09 15:45:38 +0200619 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200620
621 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200622 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200623 return nil
624 }
625
Akron3f8571a2021-08-05 11:18:10 +0200626 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200627 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200628 return nil
629 }
630
Akron439f4ec2021-08-09 15:45:38 +0200631 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200632 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200633 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200634 return nil
635 }
636
Akron439f4ec2021-08-09 15:45:38 +0200637 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200638 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200639 return nil
640 }
Akron3f8571a2021-08-05 11:18:10 +0200641
Akron3a063ef2021-08-05 19:36:35 +0200642 version := bo.Uint16(buf[0:2])
643
644 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200645 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200646 return nil
647 }
648
Akron3f8571a2021-08-05 11:18:10 +0200649 dat.epsilon = int(bo.Uint16(buf[2:4]))
650 dat.unknown = int(bo.Uint16(buf[4:6]))
651 dat.identity = int(bo.Uint16(buf[6:8]))
652 dat.final = int(bo.Uint16(buf[8:10]))
653
654 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200655 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200656
Akron3a063ef2021-08-05 19:36:35 +0200657 // Shouldn't be relevant though
658 dat.maxSize = arraySize - 1
659
Akron3f8571a2021-08-05 11:18:10 +0200660 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200661 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200662 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200663 if int(sym) < 256 {
664 dat.sigmaASCII[int(sym)] = x
665 }
Akron3f8571a2021-08-05 11:18:10 +0200666 dat.sigma[sym] = x
667 }
Akron3f8571a2021-08-05 11:18:10 +0200668 }
669
Akron439f4ec2021-08-09 15:45:38 +0200670 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200671
672 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200673 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200674 return nil
675 }
676
Akron3f8571a2021-08-05 11:18:10 +0200677 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200678 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200679 return nil
680 }
681
682 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200683 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200684
Akronbb4aac52021-08-13 00:52:27 +0200685 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200686
Akronbb4aac52021-08-13 00:52:27 +0200687 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200688 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200689 return nil
690 }
691
Akronf1a16502021-08-16 15:24:38 +0200692 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200693 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200694 return nil
695 }
696
697 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200698 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
699 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200700 }
701
702 return dat
703}
704
Akron439f4ec2021-08-09 15:45:38 +0200705// Show the current state of the buffer,
706// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200707func showBuffer(buffer []rune, buffo int, buffi int) string {
708 out := make([]rune, 0, 1024)
709 for x := 0; x < len(buffer); x++ {
710 if buffi == x {
711 out = append(out, '^')
712 }
713 if buffo == x {
714 out = append(out, '[', buffer[x], ']')
715 } else {
716 out = append(out, buffer[x])
717 }
718 }
719 return string(out)
720}
721
Akron84d68e62021-08-04 17:06:52 +0200722// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +0200723// FSA. The rules are always greedy. If the automaton fails,
724// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +0200725//
Akron4db3ecf2021-08-11 18:49:03 +0200726// Based on Mizobuchi et al (2000), p. 129,
727// with additional support for IDENTITY, UNKNOWN
728// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +0200729func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +0200730 var a int
Akronb4bbb472021-08-09 11:49:38 +0200731 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200732 t := uint32(1) // Initial state
733 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200734
Akron3610f102021-08-08 14:13:25 +0200735 // Remember the last position of a possible tokenend,
736 // in case the automaton fails.
737 epsilonState := uint32(0)
738 epsilonOffset := 0
739
740 // Implement a low level buffer for full control,
741 // however - it is probably better to introduce
742 // this on a higher level with a io.Reader interface
743 // The buffer stores a single word and may have white
744 // space at the end (but not at the beginning).
745 //
746 // This is the only backtracking requirement because of
747 // epsilon transitions, to support tokenizations like:
748 // "this is an example|.| And it works." vs
749 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200750 //
751 // TODO:
752 // Store a translation buffer as well, so characters don't
753 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200754 buffer := make([]rune, 1024)
755 buffo := 0 // Buffer offset
756 buffi := 0 // Buffer length
757
Akron3f8571a2021-08-05 11:18:10 +0200758 reader := bufio.NewReader(r)
759 writer := bufio.NewWriter(w)
760 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +0200761
Akron3f8571a2021-08-05 11:18:10 +0200762 var char rune
763 var err error
764 eof := false
Akronb7e1f132021-08-10 11:52:31 +0200765 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200766
Akronc5d8d432021-08-10 16:48:44 +0200767PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200768 for {
769
Akronb7e1f132021-08-10 11:52:31 +0200770 if newchar {
771 // Get from reader if buffer is empty
772 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200773 if eof {
774 break
775 }
Akronb7e1f132021-08-10 11:52:31 +0200776 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200777
Akronb7e1f132021-08-10 11:52:31 +0200778 // No more runes to read
779 if err != nil {
780 eof = true
781 break
782 }
783 buffer[buffi] = char
784 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200785 }
Akronb7e1f132021-08-10 11:52:31 +0200786
787 char = buffer[buffo]
788
789 if DEBUG {
790 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
791 }
792
Akron6f1c16c2021-08-17 10:45:42 +0200793 // TODO:
794 // Better not repeatedly check for a!
795 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200796 if int(char) < 256 {
797 a = dat.sigmaASCII[int(char)]
798 } else {
799 a, ok = dat.sigma[char]
800 if !ok {
801 a = 0
802 }
803 }
Akronb7e1f132021-08-10 11:52:31 +0200804
805 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +0200806 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +0200807 a = dat.identity
808 }
809
810 t0 = t
811
812 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200813 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +0200814 // Remember state for backtracking to last tokenend state
815 epsilonState = t0
816 epsilonOffset = buffo
817 }
Akron3f8571a2021-08-05 11:18:10 +0200818 }
Akron3610f102021-08-08 14:13:25 +0200819
Akronb7e1f132021-08-10 11:52:31 +0200820 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200821 t = dat.array[t0].getBase() + uint32(a)
822 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200823
Akron524c5432021-08-05 14:14:27 +0200824 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200825 // Char is only relevant if set
826 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
827 if false {
828 fmt.Println(dat.outgoing(t0))
829 }
Akron524c5432021-08-05 14:14:27 +0200830 }
831
Akronb7e1f132021-08-10 11:52:31 +0200832 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200833 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200834
835 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +0200836 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200837 }
838
839 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200840
Akron068874c2021-08-04 15:19:56 +0200841 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200842 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200843 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +0200844 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200845 }
846 a = dat.unknown
847
848 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +0200849
Akron068874c2021-08-04 15:19:56 +0200850 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200851 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200852 epsilonState = 0 // reset
853 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200854 a = dat.epsilon
855
Akron3610f102021-08-08 14:13:25 +0200856 if DEBUG {
857 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
858 }
Akronb4bbb472021-08-09 11:49:38 +0200859
Akron068874c2021-08-04 15:19:56 +0200860 } else {
861 break
862 }
Akron068874c2021-08-04 15:19:56 +0200863
Akronb7e1f132021-08-10 11:52:31 +0200864 newchar = false
865 continue
Akronb4bbb472021-08-09 11:49:38 +0200866 }
867
Akronb7e1f132021-08-10 11:52:31 +0200868 // Transition was successful
869 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200870
871 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200872 if a != dat.epsilon {
873
Akron3610f102021-08-08 14:13:25 +0200874 buffo++
Akronb4bbb472021-08-09 11:49:38 +0200875
Akron439f4ec2021-08-09 15:45:38 +0200876 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +0200877 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200878 if DEBUG {
879 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
880 }
Akron439f4ec2021-08-09 15:45:38 +0200881 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200882 }
Akron3f8571a2021-08-05 11:18:10 +0200883 }
Akron068874c2021-08-04 15:19:56 +0200884
Akronc5d8d432021-08-10 16:48:44 +0200885 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +0200886 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +0200887
Akronc5d8d432021-08-10 16:48:44 +0200888 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +0200889 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +0200890 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +0200891 }
Akron01912fc2021-08-12 11:41:58 +0200892 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +0200893 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200894 }
Akron1594cb82021-08-11 11:14:56 +0200895 if DEBUG {
896 fmt.Println("-> Newline")
897 }
898 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +0200899 }
Akron3610f102021-08-08 14:13:25 +0200900
Akronc5d8d432021-08-10 16:48:44 +0200901 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +0200902 if rewindBuffer {
903
904 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +0200905 for x, i := range buffer[buffo:buffi] {
906 buffer[x] = i
907 }
Akronb4bbb472021-08-09 11:49:38 +0200908
Akron3610f102021-08-08 14:13:25 +0200909 buffi -= buffo
910 epsilonOffset -= buffo
911 buffo = 0
912 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +0200913 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +0200914 }
Akron84d68e62021-08-04 17:06:52 +0200915 }
916
Akronb7e1f132021-08-10 11:52:31 +0200917 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +0200918 if ta.isSeparate() {
919 t = ta.getBase()
920 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +0200921
922 if DEBUG {
923 fmt.Println("Representative pointing to", t)
924 }
925 }
926
Akronc5d8d432021-08-10 16:48:44 +0200927 newchar = true
928
Akron068874c2021-08-04 15:19:56 +0200929 // TODO:
930 // Prevent endless epsilon loops!
931 }
932
Akron439f4ec2021-08-09 15:45:38 +0200933 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +0200934 if !eof {
Akron068874c2021-08-04 15:19:56 +0200935 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +0200936 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +0200937 }
938 return false
939 }
940
Akronb7e1f132021-08-10 11:52:31 +0200941 if DEBUG {
942 fmt.Println("Entering final check")
943 }
944
Akronc5d8d432021-08-10 16:48:44 +0200945 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +0200946 x := dat.array[t].getBase() + uint32(dat.final)
947
948 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +0200949
950 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +0200951 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +0200952 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +0200953 }
Akron01912fc2021-08-12 11:41:58 +0200954 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +0200955
Akronf1a16502021-08-16 15:24:38 +0200956 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +0200957 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +0200958 if DEBUG {
959 fmt.Println("-> Newline")
960 }
Akrondf0a3ef2021-08-09 15:53:45 +0200961 }
Akron84d68e62021-08-04 17:06:52 +0200962 }
963
Akron6e70dc82021-08-11 11:33:18 +0200964 // Add an additional sentence ending, if the file is over but no explicit
965 // sentence split was reached. This may be controversial and therefore
966 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +0200967 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +0200968 writer.WriteRune('\n')
969 if DEBUG {
970 fmt.Println("-> Newline")
971 }
972 }
973
Akrone61380b2021-08-16 10:10:46 +0200974 // TODO:
975 // There may be a new line at the end, from an epsilon,
976 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +0200977 return true
978 }
979
Akronc5d8d432021-08-10 16:48:44 +0200980 // Check epsilon transitions until a final state is reached
981 t0 = t
Akronf1a16502021-08-16 15:24:38 +0200982 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +0200983 a = dat.epsilon
984 newchar = false
Akronf1a16502021-08-16 15:24:38 +0200985 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +0200986 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +0200987 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +0200988
Akronc5d8d432021-08-10 16:48:44 +0200989 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +0200990 t0 = epsilonState
991 epsilonState = 0 // reset
992 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +0200993 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +0200994 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +0200995 }
Akronc5d8d432021-08-10 16:48:44 +0200996 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +0200997 }
Akronc5d8d432021-08-10 16:48:44 +0200998 return false
Akron068874c2021-08-04 15:19:56 +0200999}