blob: 0ffaac0e141574052028d959aad30172ea1c60c6 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "io"
Akron679b4862021-09-02 16:59:26 +020032 "math"
Akron8ef408b2021-08-02 22:11:04 +020033 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron740f3d72021-08-03 12:12:34 +020035
Akron527c10c2021-08-13 01:45:18 +020036 "log"
Akron8ef408b2021-08-02 22:11:04 +020037)
38
39const (
Akron2a4b9292021-08-04 15:35:22 +020040 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020041 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020042 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020043 FIRSTBIT uint32 = 1 << 31
44 SECONDBIT uint32 = 1 << 30
45 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron03c92fe2021-08-09 14:07:57 +020048// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020049var bo binary.ByteOrder = binary.LittleEndian
50
Akron8ef408b2021-08-02 22:11:04 +020051type mapping struct {
52 source int
Akron3fdfec62021-08-04 11:40:10 +020053 target uint32
Akron8ef408b2021-08-02 22:11:04 +020054}
55
Akronf1a16502021-08-16 15:24:38 +020056type bc struct {
57 base uint32
58 check uint32
59}
60
Akron03c92fe2021-08-09 14:07:57 +020061// DaTokenizer represents a tokenizer implemented as a
62// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020063type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020064 sigma map[rune]int
65 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020066 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020067 transCount int
Akronf1a16502021-08-16 15:24:38 +020068 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron03c92fe2021-08-09 14:07:57 +020075 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020076}
77
Akron439f4ec2021-08-09 15:45:38 +020078// ToDoubleArray turns the intermediate tokenizer representation
79// into a double array representation.
80//
81// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020082func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020083
84 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020085 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020086 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020087 final: auto.final,
88 unknown: auto.unknown,
89 identity: auto.identity,
90 epsilon: auto.epsilon,
91 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020092 }
93
Akronf1a16502021-08-16 15:24:38 +020094 dat.resize(dat.final)
95
Akron1c34ce62021-09-23 23:27:39 +020096 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +020097 if int(sym) < 256 {
98 dat.sigmaASCII[int(sym)] = num
99 }
Akronf2120ca2021-08-03 16:26:41 +0200100 dat.sigma[sym] = num
101 }
Akron8ef408b2021-08-02 22:11:04 +0200102
103 mark := 0
104 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200105 var base uint32
Akronde18e902021-08-27 09:34:12 +0200106 var atrans *edge
107 var s, s1 int
108 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200109
Akron439f4ec2021-08-09 15:45:38 +0200110 // Create a mapping from s (in Ms aka Intermediate FSA)
111 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200112 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200113 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200114
Akron439f4ec2021-08-09 15:45:38 +0200115 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200116 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200117 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200118 size++
119
Akron740f3d72021-08-03 12:12:34 +0200120 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200121 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200122 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
123 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200124
Akron7b1faa62021-09-02 16:10:21 +0200125 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200126
Akron8ef408b2021-08-02 22:11:04 +0200127 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200128 s = table[mark].source // This is a state in Ms
129 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200130 // s = tableQueue[mark]
131 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200132 mark++
Akron740f3d72021-08-03 12:12:34 +0200133
134 // Following the paper, here the state t can be remembered
135 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200136 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200137 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200138
Akron740f3d72021-08-03 12:12:34 +0200139 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200140 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200141 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200142 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200143 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200144 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200145
Akron773b1ef2021-08-03 17:37:20 +0200146 // TODO:
Akron068874c2021-08-04 15:19:56 +0200147 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200148 // outdegree of .end
149
Akron740f3d72021-08-03 12:12:34 +0200150 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200151 for _, a := range A {
152
Akron1c34ce62021-09-23 23:27:39 +0200153 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200154
Akron1c34ce62021-09-23 23:27:39 +0200155 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200156
Akron740f3d72021-08-03 12:12:34 +0200157 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200158 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200159
Akron740f3d72021-08-03 12:12:34 +0200160 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200161 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200162 dat.array[t1].setCheck(t)
163
164 // Set maxSize
165 if dat.maxSize < int(t1) {
166 dat.maxSize = int(t1)
167 }
Akron8ef408b2021-08-02 22:11:04 +0200168
Akron439f4ec2021-08-09 15:45:38 +0200169 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100170 log.Println("Translate transition",
Akron524c5432021-08-05 14:14:27 +0200171 s, "->", s1, "(", a, ")", "to", t, "->", t1)
172 }
173
Akron83e75a22021-08-04 13:14:06 +0200174 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200175 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200176 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200177 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100178 log.Println("Set", t1, "to nontoken")
Akron524c5432021-08-05 14:14:27 +0200179 }
Akron83e75a22021-08-04 13:14:06 +0200180 }
181
Akron84d68e62021-08-04 17:06:52 +0200182 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200183 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200184 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200185 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100186 log.Println("Set", t1, "to tokenend")
Akron524c5432021-08-05 14:14:27 +0200187 }
Akron84d68e62021-08-04 17:06:52 +0200188 }
189
Akron740f3d72021-08-03 12:12:34 +0200190 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200191 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200192 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200193
Akron439f4ec2021-08-09 15:45:38 +0200194 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200195 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200196 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200197 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200198 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200199 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200200 size++
201 } else {
Akron740f3d72021-08-03 12:12:34 +0200202 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200203 dat.array[t1].setBase(r)
204 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200205 }
206 } else {
Akron740f3d72021-08-03 12:12:34 +0200207 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200208 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200209
Akronde18e902021-08-27 09:34:12 +0200210 if dat.maxSize < int(base)+dat.final {
211 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200212 }
Akron8ef408b2021-08-02 22:11:04 +0200213 }
214 }
215 }
216
217 // Following Mizobuchi et al (2000) the size of the
218 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200219 // We make the size a bit larger so we never have to check for boundaries.
220 dat.setSize(dat.maxSize + dat.final)
221 if len(dat.array) < dat.maxSize+dat.final {
222 dat.array = append(dat.array, make([]bc, dat.final)...)
223 }
224 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200225 return dat
Akron8ef408b2021-08-02 22:11:04 +0200226}
227
Akron8ef408b2021-08-02 22:11:04 +0200228// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200229// exists and return this as a representative.
230// Currently iterates through the whole table
231// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200232func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200233 for x := 0; x < size; x++ {
234 if table[x].source == s {
235 return table[x].target
236 }
237 }
238 return 0
239}
240
Akron941f2152021-09-26 15:14:25 +0200241// Type of tokenizer
242func (DaTokenizer) Type() string {
243 return DAMAGIC
244}
245
Akron64ffd9a2021-08-03 19:55:21 +0200246// Resize double array when necessary
247func (dat *DaTokenizer) resize(l int) {
248 // TODO:
249 // This is a bit too aggressive atm and should be calmed down.
250 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200251 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200252 }
Akron64ffd9a2021-08-03 19:55:21 +0200253}
Akronc9d84a62021-08-03 15:56:03 +0200254
Akron64ffd9a2021-08-03 19:55:21 +0200255// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200256func (bc *bc) setBase(v uint32) {
257 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200258}
259
260// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200261func (bc *bc) getBase() uint32 {
262 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200263}
264
265// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200266func (bc *bc) setCheck(v uint32) {
267 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200268}
269
270// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200271func (bc *bc) getCheck() uint32 {
272 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200273}
274
Akron3fdfec62021-08-04 11:40:10 +0200275// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200276func (bc *bc) isSeparate() bool {
277 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200278}
279
280// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200281func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200282 if sep {
Akronf1a16502021-08-16 15:24:38 +0200283 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200284 } else {
Akronf1a16502021-08-16 15:24:38 +0200285 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200286 }
287}
288
Akron83e75a22021-08-04 13:14:06 +0200289// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200290func (bc *bc) isNonToken() bool {
291 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200292}
293
294// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200295func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200296 if sep {
Akronf1a16502021-08-16 15:24:38 +0200297 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200298 } else {
Akronf1a16502021-08-16 15:24:38 +0200299 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200300 }
301}
302
Akron84d68e62021-08-04 17:06:52 +0200303// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200304func (bc *bc) isTokenEnd() bool {
305 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200306}
307
308// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200309func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200310 if sep {
Akronf1a16502021-08-16 15:24:38 +0200311 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200312 } else {
Akronf1a16502021-08-16 15:24:38 +0200313 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200314 }
315}
316
Akron64ffd9a2021-08-03 19:55:21 +0200317// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200318func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200319 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200320}
321
322// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200323func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200324 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200325}
326
327// Based on Mizobuchi et al (2000), p. 124
328// This iterates for every state through the complete double array
329// structure until it finds a gap that fits all outgoing transitions
330// of the state. This is extremely slow, but is only necessary in the
331// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200332func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200333
334 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200335 base := uint32(1)
336
Akron8ef408b2021-08-02 22:11:04 +0200337OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200338 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200339 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200340 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200341 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200342 base++
343 goto OVERLAP
344 }
345 }
Akron8ef408b2021-08-02 22:11:04 +0200346 return base
347}
348
Akron679b4862021-09-02 16:59:26 +0200349// This is an implementation of xCheck with the skip-improvement
350// proposed by Morita et al. (2001)
351func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
352
353 // Start at the first entry of the double array list
354 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
355
356OVERLAP:
357 // Resize the array if necessary
358 dat.resize(int(base) + dat.final)
359 for _, a := range symbols {
360 if dat.array[int(base)+a].getCheck() != 0 {
361 base++
362 goto OVERLAP
363 }
364 }
365 return base
366}
367
Akron29e306f2021-09-02 18:29:56 +0200368// This is an implementation of xCheck with the skip-improvement
369// proposed by Morita et al. (2001) for higher outdegrees as
370// proposed by Niu et al. (2013)
371func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
372
373 // Start at the first entry of the double array list
374 base := uint32(1)
375
376 // Or skip the first few entries
377 if len(symbols) >= 3 {
378 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
379 }
380
381OVERLAP:
382 // Resize the array if necessary
383 dat.resize(int(base) + dat.final + 1)
384 for _, a := range symbols {
385 if dat.array[int(base)+a].getCheck() != 0 {
386 base++
387 goto OVERLAP
388 }
389 }
390 return base
391}
392
Akron7b1faa62021-09-02 16:10:21 +0200393// This is an implementation of xCheck wit an improvement
394// proposed by Niu et al. (2013)
395func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
396
397 // Start at the first entry of the double array list
398 base := uint32(1)
399
400 if len(symbols) > 3 {
401 sort.Ints(symbols)
402 if *block_begin_pos > uint32(symbols[0]) {
403 dat.resize(int(*block_begin_pos) + dat.final)
404 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
405 return *block_begin_pos - uint32(symbols[0])
406 }
407 }
408
409OVERLAP:
410 // Resize the array if necessary
411 dat.resize(int(base) + dat.final)
412 for _, a := range symbols {
413 if dat.array[int(base)+a].getCheck() != 0 {
414 base++
415 goto OVERLAP
416 }
417 }
418 return base
419}
420
Akron3610f102021-08-08 14:13:25 +0200421// List all outgoing transitions for a state
422// for testing purposes
423func (dat *DaTokenizer) outgoing(t uint32) []int {
424
425 valid := make([]int, 0, len(dat.sigma))
426
427 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200428 t1 := dat.array[t].getBase() + uint32(a)
429 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200430 valid = append(valid, a)
431 }
432 }
433
434 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200435 t1 := dat.array[t].getBase() + uint32(a)
436 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200437 valid = append(valid, -1*a)
438 }
439 }
440
441 sort.Ints(valid)
442
443 return valid
444}
445
Akron4fa28b32021-08-27 10:55:41 +0200446// TransCount as the number of transitions aka arcs in the
447// finite state automaton
448func (dat *DaTokenizer) TransCount() int {
449 // Cache the transCount
450 if dat.transCount > 0 {
451 return dat.transCount
452 }
453
454 dat.transCount = 0
455 for x := 1; x < len(dat.array); x++ {
456 if dat.array[x].getBase() != 0 {
457 dat.transCount++
458 }
459 }
460
461 return dat.transCount
462}
463
Akron03a3c612021-08-04 11:51:27 +0200464// LoadFactor as defined in Kanda et al (2018),
465// i.e. the proportion of non-empty elements to all elements.
466func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200467 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200468}
469
Akron439f4ec2021-08-09 15:45:38 +0200470// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200471func (dat *DaTokenizer) Save(file string) (n int64, err error) {
472 f, err := os.Create(file)
473 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200474 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200475 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200476 }
477 defer f.Close()
478 gz := gzip.NewWriter(f)
479 defer gz.Close()
480 n, err = dat.WriteTo(gz)
481 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200482 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200483 return n, err
484 }
485 gz.Flush()
486 return n, nil
487}
488
489// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200490func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
491
Akron3a063ef2021-08-05 19:36:35 +0200492 wb := bufio.NewWriter(w)
493 defer wb.Flush()
494
Akron6247a5d2021-08-03 19:18:28 +0200495 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200496 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200497 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200498 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200499 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200500 }
501
502 // Get sigma as a list
503 sigmalist := make([]rune, len(dat.sigma)+16)
504 max := 0
505 for sym, num := range dat.sigma {
506 sigmalist[num] = sym
507 if num > max {
508 max = num
509 }
510 }
511
512 sigmalist = sigmalist[:max+1]
513
Akron3f8571a2021-08-05 11:18:10 +0200514 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200515 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200516 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
517 bo.PutUint16(buf[4:6], uint16(dat.unknown))
518 bo.PutUint16(buf[6:8], uint16(dat.identity))
519 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200520 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200521 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200522 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200523 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200524 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200525 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200526 }
527
528 all += more
529
Akron6247a5d2021-08-03 19:18:28 +0200530 // Write sigma
531 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200532
Akron3a063ef2021-08-05 19:36:35 +0200533 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200534 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200535 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200536 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200537 }
538 all += more
539 }
Akron439f4ec2021-08-09 15:45:38 +0200540
Akron6247a5d2021-08-03 19:18:28 +0200541 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200542 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200543 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200544 }
Akron6247a5d2021-08-03 19:18:28 +0200545
546 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200547 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200548 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200549 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200550 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200551 }
552 all += more
553
Akronf1a16502021-08-16 15:24:38 +0200554 // for x := 0; x < len(dat.array); x++ {
555 for _, bc := range dat.array {
556 bo.PutUint32(buf[0:4], bc.base)
557 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200558 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200559 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200560 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200561 }
Akron439f4ec2021-08-09 15:45:38 +0200562 all += more
Akron3a063ef2021-08-05 19:36:35 +0200563 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200564 log.Println("Can not write base uint32")
565 return int64(all), err
566 }
567 bo.PutUint32(buf[0:4], bc.check)
568 more, err = wb.Write(buf[0:4])
569 if err != nil {
570 log.Println(err)
571 return int64(all), err
572 }
573 all += more
574 if more != 4 {
575 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200576 return int64(all), err
577 }
Akron6247a5d2021-08-03 19:18:28 +0200578 }
579
580 return int64(all), err
581}
582
Akron439f4ec2021-08-09 15:45:38 +0200583// LoadDatokFile reads a double array represented tokenizer
584// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200585func LoadDatokFile(file string) *DaTokenizer {
586 f, err := os.Open(file)
587 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200588 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200589 return nil
Akron3f8571a2021-08-05 11:18:10 +0200590 }
591 defer f.Close()
592
593 gz, err := gzip.NewReader(f)
594 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200595 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200596 return nil
Akron3f8571a2021-08-05 11:18:10 +0200597 }
598 defer gz.Close()
599
Akron3a063ef2021-08-05 19:36:35 +0200600 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200601 return ParseDatok(gz)
602}
603
Akron439f4ec2021-08-09 15:45:38 +0200604// LoadDatokFile reads a double array represented tokenizer
605// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200606func ParseDatok(ior io.Reader) *DaTokenizer {
607
Akron439f4ec2021-08-09 15:45:38 +0200608 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200609 dat := &DaTokenizer{
610 sigma: make(map[rune]int),
611 epsilon: 0,
612 unknown: 0,
613 identity: 0,
614 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200615 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200616 }
617
618 r := bufio.NewReader(ior)
619
Akron3f8571a2021-08-05 11:18:10 +0200620 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200621 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200622
Akron439f4ec2021-08-09 15:45:38 +0200623 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200624
625 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200626 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200627 return nil
628 }
629
Akron16c312e2021-09-26 13:11:12 +0200630 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200631 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200632 return nil
633 }
634
Akron439f4ec2021-08-09 15:45:38 +0200635 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200636 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200637 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200638 return nil
639 }
640
Akron439f4ec2021-08-09 15:45:38 +0200641 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200642 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200643 return nil
644 }
Akron3f8571a2021-08-05 11:18:10 +0200645
Akron3a063ef2021-08-05 19:36:35 +0200646 version := bo.Uint16(buf[0:2])
647
648 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200649 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200650 return nil
651 }
652
Akron3f8571a2021-08-05 11:18:10 +0200653 dat.epsilon = int(bo.Uint16(buf[2:4]))
654 dat.unknown = int(bo.Uint16(buf[4:6]))
655 dat.identity = int(bo.Uint16(buf[6:8]))
656 dat.final = int(bo.Uint16(buf[8:10]))
657
658 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200659 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200660
Akron3a063ef2021-08-05 19:36:35 +0200661 // Shouldn't be relevant though
662 dat.maxSize = arraySize - 1
663
Akron3f8571a2021-08-05 11:18:10 +0200664 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200665 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200666 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200667 if int(sym) < 256 {
668 dat.sigmaASCII[int(sym)] = x
669 }
Akron3f8571a2021-08-05 11:18:10 +0200670 dat.sigma[sym] = x
671 }
Akron3f8571a2021-08-05 11:18:10 +0200672 }
673
Akron439f4ec2021-08-09 15:45:38 +0200674 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200675
676 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200677 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200678 return nil
679 }
680
Akron3f8571a2021-08-05 11:18:10 +0200681 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200682 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200683 return nil
684 }
685
686 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200687 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200688
Akronbb4aac52021-08-13 00:52:27 +0200689 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200690
Akronbb4aac52021-08-13 00:52:27 +0200691 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200692 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200693 return nil
694 }
695
Akronf1a16502021-08-16 15:24:38 +0200696 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200697 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200698 return nil
699 }
700
701 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200702 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
703 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200704 }
705
706 return dat
707}
708
Akron439f4ec2021-08-09 15:45:38 +0200709// Show the current state of the buffer,
710// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200711func showBuffer(buffer []rune, buffo int, buffi int) string {
712 out := make([]rune, 0, 1024)
713 for x := 0; x < len(buffer); x++ {
714 if buffi == x {
715 out = append(out, '^')
716 }
717 if buffo == x {
718 out = append(out, '[', buffer[x], ']')
719 } else {
720 out = append(out, buffer[x])
721 }
722 }
723 return string(out)
724}
725
Akron98fbfef2021-10-23 17:02:11 +0200726// Show the current state of the buffer,
727// for testing puroses
728func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
729 out := make([]rune, 0, 1024)
730 for x := 0; x < len(buffer); x++ {
731 if buffi == x {
732 out = append(out, '^')
733 }
734 if bufft == x {
735 out = append(out, '|')
736 }
737 if buffc == x {
738 out = append(out, '[', buffer[x], ']')
739 } else {
740 out = append(out, buffer[x])
741 }
742 }
743 return string(out)
744}
745
746// Transduce input to ouutput
Akrone396a932021-10-19 01:06:13 +0200747func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200748 return dat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200749}
750
751// TransduceTokenWriter transduces an input string against
752// the double array FSA. The rules are always greedy. If the
753// automaton fails, it takes the last possible token ending
754// branch.
Akron068874c2021-08-04 15:19:56 +0200755//
Akron4db3ecf2021-08-11 18:49:03 +0200756// Based on Mizobuchi et al (2000), p. 129,
757// with additional support for IDENTITY, UNKNOWN
758// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron4f6b28c2021-10-25 00:52:03 +0200759func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron068874c2021-08-04 15:19:56 +0200760 var a int
Akronb4bbb472021-08-09 11:49:38 +0200761 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200762 t := uint32(1) // Initial state
763 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200764
Akron3610f102021-08-08 14:13:25 +0200765 // Remember the last position of a possible tokenend,
766 // in case the automaton fails.
767 epsilonState := uint32(0)
768 epsilonOffset := 0
769
Akrone396a932021-10-19 01:06:13 +0200770 // Remember if the last transition was epsilon
771 sentenceEnd := false
772
Akrona854faa2021-10-22 19:31:08 +0200773 // Remember if a text end was already set
774 textEnd := false
775
Akron3610f102021-08-08 14:13:25 +0200776 // Implement a low level buffer for full control,
777 // however - it is probably better to introduce
778 // this on a higher level with a io.Reader interface
779 // The buffer stores a single word and may have white
780 // space at the end (but not at the beginning).
781 //
782 // This is the only backtracking requirement because of
783 // epsilon transitions, to support tokenizations like:
784 // "this is an example|.| And it works." vs
785 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200786 //
787 // TODO:
788 // Store a translation buffer as well, so characters don't
789 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200790 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200791 bufft := 0 // Buffer token offset
792 buffc := 0 // Buffer current symbol
Akron3610f102021-08-08 14:13:25 +0200793 buffi := 0 // Buffer length
794
Akron98fbfef2021-10-23 17:02:11 +0200795 // The buffer is organized as follows:
796 // [ t[....c..]..i]
797
Akron3f8571a2021-08-05 11:18:10 +0200798 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200799 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200800
Akron3f8571a2021-08-05 11:18:10 +0200801 var char rune
Akrone396a932021-10-19 01:06:13 +0200802
Akron3f8571a2021-08-05 11:18:10 +0200803 var err error
804 eof := false
Akrona854faa2021-10-22 19:31:08 +0200805 eot := false
Akronb7e1f132021-08-10 11:52:31 +0200806 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200807
Akronc5d8d432021-08-10 16:48:44 +0200808PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200809 for {
810
Akronb7e1f132021-08-10 11:52:31 +0200811 if newchar {
812 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200813 if buffc >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200814 if eof {
815 break
816 }
Akronb7e1f132021-08-10 11:52:31 +0200817 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200818
Akronb7e1f132021-08-10 11:52:31 +0200819 // No more runes to read
820 if err != nil {
821 eof = true
822 break
823 }
824 buffer[buffi] = char
825 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200826 }
Akronb7e1f132021-08-10 11:52:31 +0200827
Akron98fbfef2021-10-23 17:02:11 +0200828 char = buffer[buffc]
Akronb7e1f132021-08-10 11:52:31 +0200829
830 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100831 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akronb7e1f132021-08-10 11:52:31 +0200832 }
833
Akrona854faa2021-10-22 19:31:08 +0200834 eot = false
835
Akron6f1c16c2021-08-17 10:45:42 +0200836 // TODO:
837 // Better not repeatedly check for a!
838 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200839 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200840 if int(char) == EOT {
841 eot = true
842 }
Akronea46e8a2021-08-17 00:36:31 +0200843 a = dat.sigmaASCII[int(char)]
Akron4880fb62021-12-05 12:03:05 +0100844
845 // Use identity symbol if character is not in sigma
846 if a == 0 && dat.identity != -1 {
847 a = dat.identity
848 }
849
Akronea46e8a2021-08-17 00:36:31 +0200850 } else {
851 a, ok = dat.sigma[char]
Akronb7e1f132021-08-10 11:52:31 +0200852
Akron4880fb62021-12-05 12:03:05 +0100853 // Use identity symbol if character is not in sigma
854 if !ok && dat.identity != -1 {
855 a = dat.identity
856 }
Akronb7e1f132021-08-10 11:52:31 +0200857 }
858
859 t0 = t
860
861 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200862 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +0200863 // Remember state for backtracking to last tokenend state
864 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200865 epsilonOffset = buffc
Akrone396a932021-10-19 01:06:13 +0200866
867 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100868 log.Println("epsilonOffset is set to", buffc)
Akrone396a932021-10-19 01:06:13 +0200869 }
Akronb7e1f132021-08-10 11:52:31 +0200870 }
Akron3f8571a2021-08-05 11:18:10 +0200871 }
Akron3610f102021-08-08 14:13:25 +0200872
Akronb7e1f132021-08-10 11:52:31 +0200873 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200874 t = dat.array[t0].getBase() + uint32(a)
875 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200876
Akron524c5432021-08-05 14:14:27 +0200877 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200878 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100879 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akronb7e1f132021-08-10 11:52:31 +0200880 if false {
Akron9c3bf7f2021-11-03 19:52:12 +0100881 log.Println(dat.outgoing(t0))
Akronb7e1f132021-08-10 11:52:31 +0200882 }
Akron524c5432021-08-05 14:14:27 +0200883 }
884
Akronb7e1f132021-08-10 11:52:31 +0200885 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200886 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200887
888 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100889 log.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200890 }
891
892 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200893
Akron068874c2021-08-04 15:19:56 +0200894 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200895 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200896 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100897 log.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200898 }
899 a = dat.unknown
900
901 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +0200902
Akron068874c2021-08-04 15:19:56 +0200903 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200904 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200905 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200906 buffc = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200907 a = dat.epsilon
908
Akron3610f102021-08-08 14:13:25 +0200909 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100910 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200911 }
Akronb4bbb472021-08-09 11:49:38 +0200912
Akron068874c2021-08-04 15:19:56 +0200913 } else {
914 break
915 }
Akron068874c2021-08-04 15:19:56 +0200916
Akronb7e1f132021-08-10 11:52:31 +0200917 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200918 eot = false
Akronb7e1f132021-08-10 11:52:31 +0200919 continue
Akronb4bbb472021-08-09 11:49:38 +0200920 }
921
Akronb7e1f132021-08-10 11:52:31 +0200922 // Transition was successful
923 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200924
925 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200926 if a != dat.epsilon {
927
Akron98fbfef2021-10-23 17:02:11 +0200928 buffc++
Akronb4bbb472021-08-09 11:49:38 +0200929
Akron439f4ec2021-08-09 15:45:38 +0200930 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200931 if buffc-bufft == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200932 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100933 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200934 }
Akron98fbfef2021-10-23 17:02:11 +0200935 bufft++
936 // rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200937 }
Akron068874c2021-08-04 15:19:56 +0200938
Akrone396a932021-10-19 01:06:13 +0200939 } else {
Akron524c5432021-08-05 14:14:27 +0200940
Akrone396a932021-10-19 01:06:13 +0200941 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200942 if buffc-bufft > 0 {
Akronc5d8d432021-08-10 16:48:44 +0200943 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100944 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
Akronc5d8d432021-08-10 16:48:44 +0200945 }
Akron32416ce2021-10-23 17:09:41 +0200946 w.Token(bufft, buffer[:buffc])
Akronc5d8d432021-08-10 16:48:44 +0200947 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +0200948 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200949 textEnd = false
Akrone396a932021-10-19 01:06:13 +0200950 } else {
951 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +0200952 w.SentenceEnd(0)
Akron1594cb82021-08-11 11:14:56 +0200953 }
Akron439f4ec2021-08-09 15:45:38 +0200954 }
Akron3610f102021-08-08 14:13:25 +0200955
Akron8cc2dd92021-10-25 19:49:41 +0200956 if eot {
957 eot = false
958 textEnd = true
959 w.TextEnd(0)
960 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100961 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200962 }
963 }
964
Akronc5d8d432021-08-10 16:48:44 +0200965 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +0200966 if rewindBuffer {
967
Akrone396a932021-10-19 01:06:13 +0200968 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100969 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akrone396a932021-10-19 01:06:13 +0200970 }
971
Akron439f4ec2021-08-09 15:45:38 +0200972 // TODO: Better as a ring buffer
Akron04335c62021-10-28 11:56:00 +0200973 // buffer = buffer[buffc:] !slower
Akron98fbfef2021-10-23 17:02:11 +0200974 for x, i := range buffer[buffc:buffi] {
Akron3610f102021-08-08 14:13:25 +0200975 buffer[x] = i
976 }
Akronb4bbb472021-08-09 11:49:38 +0200977
Akron98fbfef2021-10-23 17:02:11 +0200978 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200979 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +0200980 epsilonOffset = 0
981 epsilonState = 0
982
Akron98fbfef2021-10-23 17:02:11 +0200983 buffc = 0
984 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200985
Akron98fbfef2021-10-23 17:02:11 +0200986 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100987 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akron98fbfef2021-10-23 17:02:11 +0200988 }
989 }
990
Akronb7e1f132021-08-10 11:52:31 +0200991 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +0200992 if ta.isSeparate() {
993 t = ta.getBase()
994 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +0200995
996 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100997 log.Println("Representative pointing to", t)
Akronb7e1f132021-08-10 11:52:31 +0200998 }
999 }
1000
Akronc5d8d432021-08-10 16:48:44 +02001001 newchar = true
1002
Akron068874c2021-08-04 15:19:56 +02001003 // TODO:
1004 // Prevent endless epsilon loops!
1005 }
1006
Akron439f4ec2021-08-09 15:45:38 +02001007 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001008 if !eof {
Akron068874c2021-08-04 15:19:56 +02001009 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001010 log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001011 }
1012 return false
1013 }
1014
Akronb7e1f132021-08-10 11:52:31 +02001015 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001016 log.Println("Entering final check")
Akronb7e1f132021-08-10 11:52:31 +02001017 }
1018
Akrone396a932021-10-19 01:06:13 +02001019 /*
1020 The following code is for deprecated automata relying on
1021 final states. Datok now requires final states to be marked
1022 with tokenends.
Akronf1a16502021-08-16 15:24:38 +02001023
Akrone396a932021-10-19 01:06:13 +02001024 // Automaton is in a final state, so flush the buffer and return
1025 x := dat.array[t].getBase() + uint32(dat.final)
Akronb4bbb472021-08-09 11:49:38 +02001026
Akrone396a932021-10-19 01:06:13 +02001027 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akron6e70dc82021-08-11 11:33:18 +02001028
Akrone396a932021-10-19 01:06:13 +02001029 if buffi > 0 {
1030 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001031 log.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akrone396a932021-10-19 01:06:13 +02001032 }
1033 w.Token(0, buffer[:buffi])
Akronc5d8d432021-08-10 16:48:44 +02001034 }
Akron84d68e62021-08-04 17:06:52 +02001035
Akrone396a932021-10-19 01:06:13 +02001036 // Add an additional sentence ending, if the file is over but no explicit
1037 // sentence split was reached. This may be controversial and therefore
1038 // optional via parameter.
1039 if !dat.array[t0].isTokenEnd() {
1040 w.SentenceEnd()
1041 }
Akron6e70dc82021-08-11 11:33:18 +02001042
Akrone396a932021-10-19 01:06:13 +02001043 // TODO:
1044 // There may be a new line at the end, from an epsilon,
1045 // so we may need to go on!
1046 return true
1047 }
1048 */
Akron068874c2021-08-04 15:19:56 +02001049
Akrona854faa2021-10-22 19:31:08 +02001050 // Check epsilon transitions as long as possible
Akronc5d8d432021-08-10 16:48:44 +02001051 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001052 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001053 a = dat.epsilon
1054 newchar = false
Akrone396a932021-10-19 01:06:13 +02001055
Akronf1a16502021-08-16 15:24:38 +02001056 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001057 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001058 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001059
Akronc5d8d432021-08-10 16:48:44 +02001060 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001061 t0 = epsilonState
1062 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +02001063 buffc = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001064 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001065 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron068874c2021-08-04 15:19:56 +02001066 }
Akronc5d8d432021-08-10 16:48:44 +02001067 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001068 }
Akrone396a932021-10-19 01:06:13 +02001069
1070 // Add an additional sentence ending, if the file is over but no explicit
1071 // sentence split was reached. This may be controversial and therefore
1072 // optional via parameter.
1073 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +02001074 w.SentenceEnd(0)
1075
Akrone396a932021-10-19 01:06:13 +02001076 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001077 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +02001078 }
1079 }
1080
1081 if !textEnd {
1082 w.TextEnd(0)
1083
1084 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001085 log.Println("Text end")
Akrone396a932021-10-19 01:06:13 +02001086 }
1087 }
1088
1089 return true
Akron068874c2021-08-04 15:19:56 +02001090}