blob: ef3667b1216f4737501671994be928d0cebcbc66 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "io"
Akron679b4862021-09-02 16:59:26 +020032 "math"
Akron8ef408b2021-08-02 22:11:04 +020033 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron740f3d72021-08-03 12:12:34 +020035
Akron527c10c2021-08-13 01:45:18 +020036 "log"
Akron8ef408b2021-08-02 22:11:04 +020037)
38
39const (
Akron2a4b9292021-08-04 15:35:22 +020040 DEBUG = false
Akron16c312e2021-09-26 13:11:12 +020041 DAMAGIC = "DATOK"
Akron2a4b9292021-08-04 15:35:22 +020042 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020043 FIRSTBIT uint32 = 1 << 31
44 SECONDBIT uint32 = 1 << 30
45 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron03c92fe2021-08-09 14:07:57 +020048// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020049var bo binary.ByteOrder = binary.LittleEndian
50
Akron8ef408b2021-08-02 22:11:04 +020051type mapping struct {
52 source int
Akron3fdfec62021-08-04 11:40:10 +020053 target uint32
Akron8ef408b2021-08-02 22:11:04 +020054}
55
Akronf1a16502021-08-16 15:24:38 +020056type bc struct {
57 base uint32
58 check uint32
59}
60
Akron03c92fe2021-08-09 14:07:57 +020061// DaTokenizer represents a tokenizer implemented as a
62// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020063type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020064 sigma map[rune]int
65 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020066 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020067 transCount int
Akronf1a16502021-08-16 15:24:38 +020068 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron03c92fe2021-08-09 14:07:57 +020075 tokenend int
Akronf2120ca2021-08-03 16:26:41 +020076}
77
Akron439f4ec2021-08-09 15:45:38 +020078// ToDoubleArray turns the intermediate tokenizer representation
79// into a double array representation.
80//
81// This is based on Mizobuchi et al (2000), p.128
Akron1c34ce62021-09-23 23:27:39 +020082func (auto *Automaton) ToDoubleArray() *DaTokenizer {
Akronf2120ca2021-08-03 16:26:41 +020083
84 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +020085 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +020086 transCount: -1,
Akron1c34ce62021-09-23 23:27:39 +020087 final: auto.final,
88 unknown: auto.unknown,
89 identity: auto.identity,
90 epsilon: auto.epsilon,
91 tokenend: auto.tokenend,
Akronf2120ca2021-08-03 16:26:41 +020092 }
93
Akronf1a16502021-08-16 15:24:38 +020094 dat.resize(dat.final)
95
Akron1c34ce62021-09-23 23:27:39 +020096 for num, sym := range auto.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +020097 if int(sym) < 256 {
98 dat.sigmaASCII[int(sym)] = num
99 }
Akronf2120ca2021-08-03 16:26:41 +0200100 dat.sigma[sym] = num
101 }
Akron8ef408b2021-08-02 22:11:04 +0200102
103 mark := 0
104 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200105 var base uint32
Akronde18e902021-08-27 09:34:12 +0200106 var atrans *edge
107 var s, s1 int
108 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200109
Akron439f4ec2021-08-09 15:45:38 +0200110 // Create a mapping from s (in Ms aka Intermediate FSA)
111 // to t (in Mt aka Double Array FSA)
Akron1c34ce62021-09-23 23:27:39 +0200112 table := make([]*mapping, auto.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200113 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200114
Akron439f4ec2021-08-09 15:45:38 +0200115 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200116 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200117 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200118 size++
119
Akron740f3d72021-08-03 12:12:34 +0200120 // Allocate space for the outgoing symbol range
Akron1c34ce62021-09-23 23:27:39 +0200121 A := make([]int, 0, auto.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200122 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
123 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200124
Akron7b1faa62021-09-02 16:10:21 +0200125 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200126
Akron8ef408b2021-08-02 22:11:04 +0200127 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200128 s = table[mark].source // This is a state in Ms
129 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200130 // s = tableQueue[mark]
131 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200132 mark++
Akron740f3d72021-08-03 12:12:34 +0200133
134 // Following the paper, here the state t can be remembered
135 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200136 A = A[:0]
Akron1c34ce62021-09-23 23:27:39 +0200137 auto.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200138
Akron740f3d72021-08-03 12:12:34 +0200139 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200140 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200141 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200142 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200143 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200144 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200145
Akron773b1ef2021-08-03 17:37:20 +0200146 // TODO:
Akron068874c2021-08-04 15:19:56 +0200147 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200148 // outdegree of .end
149
Akron740f3d72021-08-03 12:12:34 +0200150 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200151 for _, a := range A {
152
Akron1c34ce62021-09-23 23:27:39 +0200153 if a != auto.final {
Akron8ef408b2021-08-02 22:11:04 +0200154
Akron1c34ce62021-09-23 23:27:39 +0200155 atrans = auto.transitions[s][a]
Akronde18e902021-08-27 09:34:12 +0200156
Akron740f3d72021-08-03 12:12:34 +0200157 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200158 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200159
Akron740f3d72021-08-03 12:12:34 +0200160 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200161 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200162 dat.array[t1].setCheck(t)
163
164 // Set maxSize
165 if dat.maxSize < int(t1) {
166 dat.maxSize = int(t1)
167 }
Akron8ef408b2021-08-02 22:11:04 +0200168
Akron439f4ec2021-08-09 15:45:38 +0200169 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100170 log.Println("Translate transition",
Akron524c5432021-08-05 14:14:27 +0200171 s, "->", s1, "(", a, ")", "to", t, "->", t1)
172 }
173
Akron83e75a22021-08-04 13:14:06 +0200174 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200175 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200176 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200177 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100178 log.Println("Set", t1, "to nontoken")
Akron524c5432021-08-05 14:14:27 +0200179 }
Akron83e75a22021-08-04 13:14:06 +0200180 }
181
Akron84d68e62021-08-04 17:06:52 +0200182 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200183 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200184 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200185 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100186 log.Println("Set", t1, "to tokenend")
Akron524c5432021-08-05 14:14:27 +0200187 }
Akron84d68e62021-08-04 17:06:52 +0200188 }
189
Akron740f3d72021-08-03 12:12:34 +0200190 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200191 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200192 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200193
Akron439f4ec2021-08-09 15:45:38 +0200194 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200195 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200196 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200197 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200198 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200199 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200200 size++
201 } else {
Akron740f3d72021-08-03 12:12:34 +0200202 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200203 dat.array[t1].setBase(r)
204 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200205 }
206 } else {
Akron740f3d72021-08-03 12:12:34 +0200207 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200208 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200209
Akronde18e902021-08-27 09:34:12 +0200210 if dat.maxSize < int(base)+dat.final {
211 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200212 }
Akron8ef408b2021-08-02 22:11:04 +0200213 }
214 }
215 }
216
217 // Following Mizobuchi et al (2000) the size of the
218 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200219 // We make the size a bit larger so we never have to check for boundaries.
220 dat.setSize(dat.maxSize + dat.final)
221 if len(dat.array) < dat.maxSize+dat.final {
222 dat.array = append(dat.array, make([]bc, dat.final)...)
223 }
224 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200225 return dat
Akron8ef408b2021-08-02 22:11:04 +0200226}
227
Akron8ef408b2021-08-02 22:11:04 +0200228// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200229// exists and return this as a representative.
230// Currently iterates through the whole table
231// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200232func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200233 for x := 0; x < size; x++ {
234 if table[x].source == s {
235 return table[x].target
236 }
237 }
238 return 0
239}
240
Akron941f2152021-09-26 15:14:25 +0200241// Type of tokenizer
242func (DaTokenizer) Type() string {
243 return DAMAGIC
244}
245
Akron64ffd9a2021-08-03 19:55:21 +0200246// Resize double array when necessary
247func (dat *DaTokenizer) resize(l int) {
248 // TODO:
249 // This is a bit too aggressive atm and should be calmed down.
250 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200251 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200252 }
Akron64ffd9a2021-08-03 19:55:21 +0200253}
Akronc9d84a62021-08-03 15:56:03 +0200254
Akron64ffd9a2021-08-03 19:55:21 +0200255// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200256func (bc *bc) setBase(v uint32) {
257 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200258}
259
260// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200261func (bc *bc) getBase() uint32 {
262 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200263}
264
265// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200266func (bc *bc) setCheck(v uint32) {
267 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200268}
269
270// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200271func (bc *bc) getCheck() uint32 {
272 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200273}
274
Akron3fdfec62021-08-04 11:40:10 +0200275// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200276func (bc *bc) isSeparate() bool {
277 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200278}
279
280// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200281func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200282 if sep {
Akronf1a16502021-08-16 15:24:38 +0200283 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200284 } else {
Akronf1a16502021-08-16 15:24:38 +0200285 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200286 }
287}
288
Akron83e75a22021-08-04 13:14:06 +0200289// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200290func (bc *bc) isNonToken() bool {
291 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200292}
293
294// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200295func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200296 if sep {
Akronf1a16502021-08-16 15:24:38 +0200297 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200298 } else {
Akronf1a16502021-08-16 15:24:38 +0200299 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200300 }
301}
302
Akron84d68e62021-08-04 17:06:52 +0200303// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200304func (bc *bc) isTokenEnd() bool {
305 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200306}
307
308// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200309func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200310 if sep {
Akronf1a16502021-08-16 15:24:38 +0200311 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200312 } else {
Akronf1a16502021-08-16 15:24:38 +0200313 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200314 }
315}
316
Akron64ffd9a2021-08-03 19:55:21 +0200317// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200318func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200319 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200320}
321
322// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200323func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200324 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200325}
326
327// Based on Mizobuchi et al (2000), p. 124
328// This iterates for every state through the complete double array
329// structure until it finds a gap that fits all outgoing transitions
330// of the state. This is extremely slow, but is only necessary in the
331// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200332func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200333
334 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200335 base := uint32(1)
336
Akron8ef408b2021-08-02 22:11:04 +0200337OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200338 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200339 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200340 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200341 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200342 base++
343 goto OVERLAP
344 }
345 }
Akron8ef408b2021-08-02 22:11:04 +0200346 return base
347}
348
Akron679b4862021-09-02 16:59:26 +0200349// This is an implementation of xCheck with the skip-improvement
350// proposed by Morita et al. (2001)
351func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
352
353 // Start at the first entry of the double array list
354 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
355
356OVERLAP:
357 // Resize the array if necessary
358 dat.resize(int(base) + dat.final)
359 for _, a := range symbols {
360 if dat.array[int(base)+a].getCheck() != 0 {
361 base++
362 goto OVERLAP
363 }
364 }
365 return base
366}
367
Akron29e306f2021-09-02 18:29:56 +0200368// This is an implementation of xCheck with the skip-improvement
369// proposed by Morita et al. (2001) for higher outdegrees as
370// proposed by Niu et al. (2013)
371func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
372
373 // Start at the first entry of the double array list
374 base := uint32(1)
375
376 // Or skip the first few entries
377 if len(symbols) >= 3 {
378 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
379 }
380
381OVERLAP:
382 // Resize the array if necessary
383 dat.resize(int(base) + dat.final + 1)
384 for _, a := range symbols {
385 if dat.array[int(base)+a].getCheck() != 0 {
386 base++
387 goto OVERLAP
388 }
389 }
390 return base
391}
392
Akron7b1faa62021-09-02 16:10:21 +0200393// This is an implementation of xCheck wit an improvement
394// proposed by Niu et al. (2013)
395func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
396
397 // Start at the first entry of the double array list
398 base := uint32(1)
399
400 if len(symbols) > 3 {
401 sort.Ints(symbols)
402 if *block_begin_pos > uint32(symbols[0]) {
403 dat.resize(int(*block_begin_pos) + dat.final)
404 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
405 return *block_begin_pos - uint32(symbols[0])
406 }
407 }
408
409OVERLAP:
410 // Resize the array if necessary
411 dat.resize(int(base) + dat.final)
412 for _, a := range symbols {
413 if dat.array[int(base)+a].getCheck() != 0 {
414 base++
415 goto OVERLAP
416 }
417 }
418 return base
419}
420
Akron3610f102021-08-08 14:13:25 +0200421// List all outgoing transitions for a state
422// for testing purposes
423func (dat *DaTokenizer) outgoing(t uint32) []int {
424
425 valid := make([]int, 0, len(dat.sigma))
426
427 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200428 t1 := dat.array[t].getBase() + uint32(a)
429 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200430 valid = append(valid, a)
431 }
432 }
433
434 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200435 t1 := dat.array[t].getBase() + uint32(a)
436 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200437 valid = append(valid, -1*a)
438 }
439 }
440
441 sort.Ints(valid)
442
443 return valid
444}
445
Akron4fa28b32021-08-27 10:55:41 +0200446// TransCount as the number of transitions aka arcs in the
447// finite state automaton
448func (dat *DaTokenizer) TransCount() int {
449 // Cache the transCount
450 if dat.transCount > 0 {
451 return dat.transCount
452 }
453
454 dat.transCount = 0
455 for x := 1; x < len(dat.array); x++ {
456 if dat.array[x].getBase() != 0 {
457 dat.transCount++
458 }
459 }
460
461 return dat.transCount
462}
463
Akron03a3c612021-08-04 11:51:27 +0200464// LoadFactor as defined in Kanda et al (2018),
465// i.e. the proportion of non-empty elements to all elements.
466func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200467 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200468}
469
Akron439f4ec2021-08-09 15:45:38 +0200470// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200471func (dat *DaTokenizer) Save(file string) (n int64, err error) {
472 f, err := os.Create(file)
473 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200474 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200475 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200476 }
477 defer f.Close()
478 gz := gzip.NewWriter(f)
479 defer gz.Close()
480 n, err = dat.WriteTo(gz)
481 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200482 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200483 return n, err
484 }
485 gz.Flush()
486 return n, nil
487}
488
489// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200490func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
491
Akron3a063ef2021-08-05 19:36:35 +0200492 wb := bufio.NewWriter(w)
493 defer wb.Flush()
494
Akron6247a5d2021-08-03 19:18:28 +0200495 // Store magical header
Akron16c312e2021-09-26 13:11:12 +0200496 all, err := wb.Write([]byte(DAMAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200497 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200498 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200499 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200500 }
501
502 // Get sigma as a list
503 sigmalist := make([]rune, len(dat.sigma)+16)
504 max := 0
505 for sym, num := range dat.sigma {
506 sigmalist[num] = sym
507 if num > max {
508 max = num
509 }
510 }
511
512 sigmalist = sigmalist[:max+1]
513
Akron3f8571a2021-08-05 11:18:10 +0200514 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200515 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200516 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
517 bo.PutUint16(buf[4:6], uint16(dat.unknown))
518 bo.PutUint16(buf[6:8], uint16(dat.identity))
519 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200520 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200521 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200522 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200523 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200524 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200525 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200526 }
527
528 all += more
529
Akron6247a5d2021-08-03 19:18:28 +0200530 // Write sigma
531 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200532
Akron3a063ef2021-08-05 19:36:35 +0200533 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200534 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200535 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200536 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200537 }
538 all += more
539 }
Akron439f4ec2021-08-09 15:45:38 +0200540
Akron6247a5d2021-08-03 19:18:28 +0200541 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200542 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200543 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200544 }
Akron6247a5d2021-08-03 19:18:28 +0200545
546 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200547 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200548 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200549 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200550 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200551 }
552 all += more
553
Akronf1a16502021-08-16 15:24:38 +0200554 // for x := 0; x < len(dat.array); x++ {
555 for _, bc := range dat.array {
556 bo.PutUint32(buf[0:4], bc.base)
557 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200558 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200559 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200560 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200561 }
Akron439f4ec2021-08-09 15:45:38 +0200562 all += more
Akron3a063ef2021-08-05 19:36:35 +0200563 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200564 log.Println("Can not write base uint32")
565 return int64(all), err
566 }
567 bo.PutUint32(buf[0:4], bc.check)
568 more, err = wb.Write(buf[0:4])
569 if err != nil {
570 log.Println(err)
571 return int64(all), err
572 }
573 all += more
574 if more != 4 {
575 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200576 return int64(all), err
577 }
Akron6247a5d2021-08-03 19:18:28 +0200578 }
579
580 return int64(all), err
581}
582
Akron439f4ec2021-08-09 15:45:38 +0200583// LoadDatokFile reads a double array represented tokenizer
584// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200585func LoadDatokFile(file string) *DaTokenizer {
586 f, err := os.Open(file)
587 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200588 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200589 return nil
Akron3f8571a2021-08-05 11:18:10 +0200590 }
591 defer f.Close()
592
593 gz, err := gzip.NewReader(f)
594 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200595 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200596 return nil
Akron3f8571a2021-08-05 11:18:10 +0200597 }
598 defer gz.Close()
599
Akron3a063ef2021-08-05 19:36:35 +0200600 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200601 return ParseDatok(gz)
602}
603
Akron439f4ec2021-08-09 15:45:38 +0200604// LoadDatokFile reads a double array represented tokenizer
605// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200606func ParseDatok(ior io.Reader) *DaTokenizer {
607
Akron439f4ec2021-08-09 15:45:38 +0200608 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200609 dat := &DaTokenizer{
610 sigma: make(map[rune]int),
611 epsilon: 0,
612 unknown: 0,
613 identity: 0,
614 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200615 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200616 }
617
618 r := bufio.NewReader(ior)
619
Akron3f8571a2021-08-05 11:18:10 +0200620 buf := make([]byte, 1024)
Akron16c312e2021-09-26 13:11:12 +0200621 buf = buf[0:len(DAMAGIC)]
Akron3f8571a2021-08-05 11:18:10 +0200622
Akron439f4ec2021-08-09 15:45:38 +0200623 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200624
625 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200626 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200627 return nil
628 }
629
Akron16c312e2021-09-26 13:11:12 +0200630 if string(DAMAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200631 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200632 return nil
633 }
634
Akron439f4ec2021-08-09 15:45:38 +0200635 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200636 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200637 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200638 return nil
639 }
640
Akron439f4ec2021-08-09 15:45:38 +0200641 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200642 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200643 return nil
644 }
Akron3f8571a2021-08-05 11:18:10 +0200645
Akron3a063ef2021-08-05 19:36:35 +0200646 version := bo.Uint16(buf[0:2])
647
648 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200649 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200650 return nil
651 }
652
Akron3f8571a2021-08-05 11:18:10 +0200653 dat.epsilon = int(bo.Uint16(buf[2:4]))
654 dat.unknown = int(bo.Uint16(buf[4:6]))
655 dat.identity = int(bo.Uint16(buf[6:8]))
656 dat.final = int(bo.Uint16(buf[8:10]))
657
658 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200659 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200660
Akron3a063ef2021-08-05 19:36:35 +0200661 // Shouldn't be relevant though
662 dat.maxSize = arraySize - 1
663
Akron3f8571a2021-08-05 11:18:10 +0200664 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200665 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200666 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200667 if int(sym) < 256 {
668 dat.sigmaASCII[int(sym)] = x
669 }
Akron3f8571a2021-08-05 11:18:10 +0200670 dat.sigma[sym] = x
671 }
Akron3f8571a2021-08-05 11:18:10 +0200672 }
673
Akron439f4ec2021-08-09 15:45:38 +0200674 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200675
676 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200677 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200678 return nil
679 }
680
Akron3f8571a2021-08-05 11:18:10 +0200681 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200682 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200683 return nil
684 }
685
686 // Read based on length
Akronf1a16502021-08-16 15:24:38 +0200687 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +0200688
Akronbb4aac52021-08-13 00:52:27 +0200689 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200690
Akronbb4aac52021-08-13 00:52:27 +0200691 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200692 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200693 return nil
694 }
695
Akronf1a16502021-08-16 15:24:38 +0200696 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +0200697 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200698 return nil
699 }
700
701 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +0200702 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
703 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +0200704 }
705
706 return dat
707}
708
Akron439f4ec2021-08-09 15:45:38 +0200709// Show the current state of the buffer,
710// for testing puroses
Akron3610f102021-08-08 14:13:25 +0200711func showBuffer(buffer []rune, buffo int, buffi int) string {
712 out := make([]rune, 0, 1024)
713 for x := 0; x < len(buffer); x++ {
714 if buffi == x {
715 out = append(out, '^')
716 }
717 if buffo == x {
718 out = append(out, '[', buffer[x], ']')
719 } else {
720 out = append(out, buffer[x])
721 }
722 }
723 return string(out)
724}
725
Akron98fbfef2021-10-23 17:02:11 +0200726// Show the current state of the buffer,
727// for testing puroses
728func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
729 out := make([]rune, 0, 1024)
730 for x := 0; x < len(buffer); x++ {
731 if buffi == x {
732 out = append(out, '^')
733 }
734 if bufft == x {
735 out = append(out, '|')
736 }
737 if buffc == x {
738 out = append(out, '[', buffer[x], ']')
739 } else {
740 out = append(out, buffer[x])
741 }
742 }
743 return string(out)
744}
745
746// Transduce input to ouutput
Akrone396a932021-10-19 01:06:13 +0200747func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200748 return dat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200749}
750
751// TransduceTokenWriter transduces an input string against
752// the double array FSA. The rules are always greedy. If the
753// automaton fails, it takes the last possible token ending
754// branch.
Akron068874c2021-08-04 15:19:56 +0200755//
Akron4db3ecf2021-08-11 18:49:03 +0200756// Based on Mizobuchi et al (2000), p. 129,
757// with additional support for IDENTITY, UNKNOWN
758// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron4f6b28c2021-10-25 00:52:03 +0200759func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron068874c2021-08-04 15:19:56 +0200760 var a int
Akronb4bbb472021-08-09 11:49:38 +0200761 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +0200762 t := uint32(1) // Initial state
763 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +0200764
Akron3610f102021-08-08 14:13:25 +0200765 // Remember the last position of a possible tokenend,
766 // in case the automaton fails.
767 epsilonState := uint32(0)
768 epsilonOffset := 0
769
Akrone396a932021-10-19 01:06:13 +0200770 // Remember if the last transition was epsilon
771 sentenceEnd := false
772
Akrona854faa2021-10-22 19:31:08 +0200773 // Remember if a text end was already set
774 textEnd := false
775
Akron3610f102021-08-08 14:13:25 +0200776 // Implement a low level buffer for full control,
777 // however - it is probably better to introduce
778 // this on a higher level with a io.Reader interface
779 // The buffer stores a single word and may have white
780 // space at the end (but not at the beginning).
781 //
782 // This is the only backtracking requirement because of
783 // epsilon transitions, to support tokenizations like:
784 // "this is an example|.| And it works." vs
785 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +0200786 //
787 // TODO:
788 // Store a translation buffer as well, so characters don't
789 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +0200790 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200791 bufft := 0 // Buffer token offset
792 buffc := 0 // Buffer current symbol
Akron3610f102021-08-08 14:13:25 +0200793 buffi := 0 // Buffer length
794
Akron98fbfef2021-10-23 17:02:11 +0200795 // The buffer is organized as follows:
796 // [ t[....c..]..i]
797
Akron3f8571a2021-08-05 11:18:10 +0200798 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200799 defer w.Flush()
Akron068874c2021-08-04 15:19:56 +0200800
Akron3f8571a2021-08-05 11:18:10 +0200801 var char rune
Akrone396a932021-10-19 01:06:13 +0200802
Akron3f8571a2021-08-05 11:18:10 +0200803 var err error
804 eof := false
Akrona854faa2021-10-22 19:31:08 +0200805 eot := false
Akronb7e1f132021-08-10 11:52:31 +0200806 newchar := true
Akron3f8571a2021-08-05 11:18:10 +0200807
Akronc5d8d432021-08-10 16:48:44 +0200808PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +0200809 for {
810
Akronb7e1f132021-08-10 11:52:31 +0200811 if newchar {
812 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200813 if buffc >= buffi {
Akron1594cb82021-08-11 11:14:56 +0200814 if eof {
815 break
816 }
Akronb7e1f132021-08-10 11:52:31 +0200817 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +0200818
Akronb7e1f132021-08-10 11:52:31 +0200819 // No more runes to read
820 if err != nil {
821 eof = true
822 break
823 }
824 buffer[buffi] = char
825 buffi++
Akron3f8571a2021-08-05 11:18:10 +0200826 }
Akronb7e1f132021-08-10 11:52:31 +0200827
Akron98fbfef2021-10-23 17:02:11 +0200828 char = buffer[buffc]
Akronb7e1f132021-08-10 11:52:31 +0200829
830 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100831 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akronb7e1f132021-08-10 11:52:31 +0200832 }
833
Akrona854faa2021-10-22 19:31:08 +0200834 eot = false
835
Akron6f1c16c2021-08-17 10:45:42 +0200836 // TODO:
837 // Better not repeatedly check for a!
838 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +0200839 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200840 if int(char) == EOT {
841 eot = true
842 }
Akronea46e8a2021-08-17 00:36:31 +0200843 a = dat.sigmaASCII[int(char)]
844 } else {
845 a, ok = dat.sigma[char]
846 if !ok {
847 a = 0
848 }
849 }
Akronb7e1f132021-08-10 11:52:31 +0200850
851 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +0200852 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +0200853 a = dat.identity
854 }
855
856 t0 = t
857
858 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +0200859 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +0200860 // Remember state for backtracking to last tokenend state
861 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200862 epsilonOffset = buffc
Akrone396a932021-10-19 01:06:13 +0200863
864 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100865 log.Println("epsilonOffset is set to", buffc)
Akrone396a932021-10-19 01:06:13 +0200866 }
Akronb7e1f132021-08-10 11:52:31 +0200867 }
Akron3f8571a2021-08-05 11:18:10 +0200868 }
Akron3610f102021-08-08 14:13:25 +0200869
Akronb7e1f132021-08-10 11:52:31 +0200870 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +0200871 t = dat.array[t0].getBase() + uint32(a)
872 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +0200873
Akron524c5432021-08-05 14:14:27 +0200874 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +0200875 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100876 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akronb7e1f132021-08-10 11:52:31 +0200877 if false {
Akron9c3bf7f2021-11-03 19:52:12 +0100878 log.Println(dat.outgoing(t0))
Akronb7e1f132021-08-10 11:52:31 +0200879 }
Akron524c5432021-08-05 14:14:27 +0200880 }
881
Akronb7e1f132021-08-10 11:52:31 +0200882 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +0200883 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +0200884
885 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100886 log.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +0200887 }
888
889 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +0200890
Akron068874c2021-08-04 15:19:56 +0200891 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +0200892 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +0200893 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100894 log.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +0200895 }
896 a = dat.unknown
897
898 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +0200899
Akron068874c2021-08-04 15:19:56 +0200900 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +0200901 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +0200902 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200903 buffc = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +0200904 a = dat.epsilon
905
Akron3610f102021-08-08 14:13:25 +0200906 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100907 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200908 }
Akronb4bbb472021-08-09 11:49:38 +0200909
Akron068874c2021-08-04 15:19:56 +0200910 } else {
911 break
912 }
Akron068874c2021-08-04 15:19:56 +0200913
Akronb7e1f132021-08-10 11:52:31 +0200914 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200915 eot = false
Akronb7e1f132021-08-10 11:52:31 +0200916 continue
Akronb4bbb472021-08-09 11:49:38 +0200917 }
918
Akronb7e1f132021-08-10 11:52:31 +0200919 // Transition was successful
920 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +0200921
922 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +0200923 if a != dat.epsilon {
924
Akron98fbfef2021-10-23 17:02:11 +0200925 buffc++
Akronb4bbb472021-08-09 11:49:38 +0200926
Akron439f4ec2021-08-09 15:45:38 +0200927 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200928 if buffc-bufft == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +0200929 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100930 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron3610f102021-08-08 14:13:25 +0200931 }
Akron98fbfef2021-10-23 17:02:11 +0200932 bufft++
933 // rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +0200934 }
Akron068874c2021-08-04 15:19:56 +0200935
Akrone396a932021-10-19 01:06:13 +0200936 } else {
Akron524c5432021-08-05 14:14:27 +0200937
Akrone396a932021-10-19 01:06:13 +0200938 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200939 if buffc-bufft > 0 {
Akronc5d8d432021-08-10 16:48:44 +0200940 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100941 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
Akronc5d8d432021-08-10 16:48:44 +0200942 }
Akron32416ce2021-10-23 17:09:41 +0200943 w.Token(bufft, buffer[:buffc])
Akronc5d8d432021-08-10 16:48:44 +0200944 rewindBuffer = true
Akrone396a932021-10-19 01:06:13 +0200945 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200946 textEnd = false
Akrone396a932021-10-19 01:06:13 +0200947 } else {
948 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +0200949 w.SentenceEnd(0)
Akron1594cb82021-08-11 11:14:56 +0200950 }
Akron439f4ec2021-08-09 15:45:38 +0200951 }
Akron3610f102021-08-08 14:13:25 +0200952
Akron8cc2dd92021-10-25 19:49:41 +0200953 if eot {
954 eot = false
955 textEnd = true
956 w.TextEnd(0)
957 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100958 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200959 }
960 }
961
Akronc5d8d432021-08-10 16:48:44 +0200962 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +0200963 if rewindBuffer {
964
Akrone396a932021-10-19 01:06:13 +0200965 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100966 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akrone396a932021-10-19 01:06:13 +0200967 }
968
Akron439f4ec2021-08-09 15:45:38 +0200969 // TODO: Better as a ring buffer
Akron04335c62021-10-28 11:56:00 +0200970 // buffer = buffer[buffc:] !slower
Akron98fbfef2021-10-23 17:02:11 +0200971 for x, i := range buffer[buffc:buffi] {
Akron3610f102021-08-08 14:13:25 +0200972 buffer[x] = i
973 }
Akronb4bbb472021-08-09 11:49:38 +0200974
Akron98fbfef2021-10-23 17:02:11 +0200975 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200976 // epsilonOffset -= buffo
Akrone396a932021-10-19 01:06:13 +0200977 epsilonOffset = 0
978 epsilonState = 0
979
Akron98fbfef2021-10-23 17:02:11 +0200980 buffc = 0
981 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200982
Akron98fbfef2021-10-23 17:02:11 +0200983 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100984 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akron98fbfef2021-10-23 17:02:11 +0200985 }
986 }
987
Akronb7e1f132021-08-10 11:52:31 +0200988 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +0200989 if ta.isSeparate() {
990 t = ta.getBase()
991 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +0200992
993 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100994 log.Println("Representative pointing to", t)
Akronb7e1f132021-08-10 11:52:31 +0200995 }
996 }
997
Akronc5d8d432021-08-10 16:48:44 +0200998 newchar = true
999
Akron068874c2021-08-04 15:19:56 +02001000 // TODO:
1001 // Prevent endless epsilon loops!
1002 }
1003
Akron439f4ec2021-08-09 15:45:38 +02001004 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001005 if !eof {
Akron068874c2021-08-04 15:19:56 +02001006 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001007 log.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001008 }
1009 return false
1010 }
1011
Akronb7e1f132021-08-10 11:52:31 +02001012 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001013 log.Println("Entering final check")
Akronb7e1f132021-08-10 11:52:31 +02001014 }
1015
Akrone396a932021-10-19 01:06:13 +02001016 /*
1017 The following code is for deprecated automata relying on
1018 final states. Datok now requires final states to be marked
1019 with tokenends.
Akronf1a16502021-08-16 15:24:38 +02001020
Akrone396a932021-10-19 01:06:13 +02001021 // Automaton is in a final state, so flush the buffer and return
1022 x := dat.array[t].getBase() + uint32(dat.final)
Akronb4bbb472021-08-09 11:49:38 +02001023
Akrone396a932021-10-19 01:06:13 +02001024 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akron6e70dc82021-08-11 11:33:18 +02001025
Akrone396a932021-10-19 01:06:13 +02001026 if buffi > 0 {
1027 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001028 log.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akrone396a932021-10-19 01:06:13 +02001029 }
1030 w.Token(0, buffer[:buffi])
Akronc5d8d432021-08-10 16:48:44 +02001031 }
Akron84d68e62021-08-04 17:06:52 +02001032
Akrone396a932021-10-19 01:06:13 +02001033 // Add an additional sentence ending, if the file is over but no explicit
1034 // sentence split was reached. This may be controversial and therefore
1035 // optional via parameter.
1036 if !dat.array[t0].isTokenEnd() {
1037 w.SentenceEnd()
1038 }
Akron6e70dc82021-08-11 11:33:18 +02001039
Akrone396a932021-10-19 01:06:13 +02001040 // TODO:
1041 // There may be a new line at the end, from an epsilon,
1042 // so we may need to go on!
1043 return true
1044 }
1045 */
Akron068874c2021-08-04 15:19:56 +02001046
Akrona854faa2021-10-22 19:31:08 +02001047 // Check epsilon transitions as long as possible
Akronc5d8d432021-08-10 16:48:44 +02001048 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001049 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001050 a = dat.epsilon
1051 newchar = false
Akrone396a932021-10-19 01:06:13 +02001052
Akronf1a16502021-08-16 15:24:38 +02001053 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001054 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001055 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001056
Akronc5d8d432021-08-10 16:48:44 +02001057 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001058 t0 = epsilonState
1059 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +02001060 buffc = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001061 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001062 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron068874c2021-08-04 15:19:56 +02001063 }
Akronc5d8d432021-08-10 16:48:44 +02001064 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001065 }
Akrone396a932021-10-19 01:06:13 +02001066
1067 // Add an additional sentence ending, if the file is over but no explicit
1068 // sentence split was reached. This may be controversial and therefore
1069 // optional via parameter.
1070 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +02001071 w.SentenceEnd(0)
1072
Akrone396a932021-10-19 01:06:13 +02001073 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001074 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +02001075 }
1076 }
1077
1078 if !textEnd {
1079 w.TextEnd(0)
1080
1081 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +01001082 log.Println("Text end")
Akrone396a932021-10-19 01:06:13 +02001083 }
1084 }
1085
1086 return true
Akron068874c2021-08-04 15:19:56 +02001087}