| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 1 | package datokenizer | 
 | 2 |  | 
 | 3 | /** | 
 | 4 |  * The file reader is basically a port of foma2js, | 
 | 5 |  * licensed under the Apache License, version 2, | 
 | 6 |  * and written by Mans Hulden. | 
 | 7 |  */ | 
 | 8 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 9 | // TODO: | 
 | 10 | // - replace maxSize with the check value | 
 | 11 | // - Strip first state and make everything start with 0! | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 12 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 13 | import ( | 
 | 14 | 	"bufio" | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 15 | 	"bytes" | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 16 | 	"compress/gzip" | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 17 | 	"encoding/binary" | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 18 | 	"fmt" | 
 | 19 | 	"io" | 
 | 20 | 	"os" | 
| Akron | c9d84a6 | 2021-08-03 15:56:03 +0200 | [diff] [blame] | 21 | 	"sort" | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 22 | 	"strconv" | 
 | 23 | 	"strings" | 
 | 24 | 	"unicode/utf8" | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 25 |  | 
 | 26 | 	"github.com/rs/zerolog/log" | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 27 | ) | 
 | 28 |  | 
 | 29 | const ( | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 30 | 	PROPS   = 1 | 
 | 31 | 	SIGMA   = 2 | 
 | 32 | 	STATES  = 3 | 
 | 33 | 	NONE    = 4 | 
 | 34 | 	NEWLINE = '\u000a' | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 35 | 	DEBUG   = false | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 36 | 	MAGIC   = "DATOK" | 
 | 37 | 	VERSION = uint16(1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 38 | ) | 
 | 39 |  | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 40 | var bo binary.ByteOrder = binary.LittleEndian | 
 | 41 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 42 | type mapping struct { | 
 | 43 | 	source int | 
 | 44 | 	target int | 
 | 45 | } | 
 | 46 |  | 
 | 47 | type edge struct { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 48 | 	inSym  int | 
 | 49 | 	outSym int | 
 | 50 | 	end    int | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 51 | } | 
 | 52 |  | 
 | 53 | type Tokenizer struct { | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 54 | 	// sigma       map[rune]int | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 55 | 	sigmaRev    map[int]rune | 
 | 56 | 	arcCount    int | 
 | 57 | 	stateCount  int | 
 | 58 | 	sigmaCount  int | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 59 | 	transitions []map[int]*edge | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 60 |  | 
 | 61 | 	// Special symbols in sigma | 
 | 62 | 	epsilon  int | 
 | 63 | 	unknown  int | 
 | 64 | 	identity int | 
 | 65 | 	final    int | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 66 | } | 
 | 67 |  | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 68 | type DaTokenizer struct { | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 69 | 	// sigmaRev map[int]rune | 
| Akron | 773b1ef | 2021-08-03 17:37:20 +0200 | [diff] [blame] | 70 | 	sigma     map[rune]int | 
 | 71 | 	maxSize   int | 
 | 72 | 	loadLevel float64 | 
 | 73 | 	array     []int | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 74 |  | 
 | 75 | 	// Special symbols in sigma | 
 | 76 | 	epsilon  int | 
 | 77 | 	unknown  int | 
 | 78 | 	identity int | 
 | 79 | 	final    int | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 80 | } | 
 | 81 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 82 | func LoadFomaFile(file string) *Tokenizer { | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 83 | 	f, err := os.Open(file) | 
 | 84 | 	if err != nil { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 85 | 		log.Error().Err(err) | 
 | 86 | 		os.Exit(0) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 87 | 	} | 
 | 88 | 	defer f.Close() | 
 | 89 |  | 
 | 90 | 	gz, err := gzip.NewReader(f) | 
 | 91 | 	if err != nil { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 92 | 		log.Error().Err(err) | 
 | 93 | 		os.Exit(0) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 94 | 	} | 
 | 95 | 	defer gz.Close() | 
 | 96 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 97 | 	return ParseFome(gz) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 98 | } | 
 | 99 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 100 | func ParseFome(ior io.Reader) *Tokenizer { | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 101 | 	r := bufio.NewReader(ior) | 
 | 102 |  | 
 | 103 | 	tok := &Tokenizer{ | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 104 | 		sigmaRev: make(map[int]rune), | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 105 | 		epsilon:  -1, | 
 | 106 | 		unknown:  -1, | 
 | 107 | 		identity: -1, | 
 | 108 | 		final:    -1, | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 109 | 	} | 
 | 110 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 111 | 	var state, inSym, outSym, end, final int | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 112 |  | 
 | 113 | 	mode := 0 | 
 | 114 | 	var elem []string | 
 | 115 | 	var elemint [5]int | 
 | 116 |  | 
 | 117 | 	for { | 
 | 118 | 		line, err := r.ReadString('\n') | 
 | 119 | 		if err != nil { | 
 | 120 | 			if err == io.EOF { | 
 | 121 | 				break | 
 | 122 | 			} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 123 | 			log.Error().Err(err) | 
 | 124 | 			os.Exit(0) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 125 | 		} | 
 | 126 | 		if strings.HasPrefix(line, "##foma-net") { | 
 | 127 | 			continue | 
 | 128 | 		} | 
 | 129 | 		if strings.HasPrefix(line, "##props##") { | 
 | 130 | 			mode = PROPS | 
 | 131 | 			continue | 
 | 132 | 		} | 
 | 133 | 		if strings.HasPrefix(line, "##states##") { | 
 | 134 | 			mode = STATES | 
 | 135 |  | 
 | 136 | 			// Adds a final transition symbol to sigma | 
 | 137 | 			// written as '#' in Mizobuchi et al (2000) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 138 | 			tok.sigmaCount++ | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 139 | 			tok.final = tok.sigmaCount | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 140 | 			continue | 
 | 141 | 		} | 
 | 142 | 		if strings.HasPrefix(line, "##sigma##") { | 
 | 143 | 			mode = SIGMA | 
 | 144 | 			continue | 
 | 145 | 		} | 
 | 146 | 		if strings.HasPrefix(line, "##end##") { | 
 | 147 | 			mode = NONE | 
 | 148 | 			continue | 
 | 149 | 		} | 
 | 150 |  | 
 | 151 | 		switch mode { | 
 | 152 | 		case PROPS: | 
 | 153 | 			{ | 
 | 154 | 				elem = strings.Split(line, " ") | 
 | 155 | 				/* | 
 | 156 | 					fmt.Println("arity:            " + elem[0]) | 
 | 157 | 					fmt.Println("arccount:         " + elem[1]) | 
 | 158 | 					fmt.Println("statecount:       " + elem[2]) | 
 | 159 | 					fmt.Println("linecount:        " + elem[3]) | 
 | 160 | 					fmt.Println("finalcount:       " + elem[4]) | 
 | 161 | 					fmt.Println("pathcount:        " + elem[5]) | 
 | 162 | 					fmt.Println("is_deterministic: " + elem[6]) | 
 | 163 | 					fmt.Println("is_pruned:        " + elem[7]) | 
 | 164 | 					fmt.Println("is_minimized:     " + elem[8]) | 
 | 165 | 					fmt.Println("is_epsilon_free:  " + elem[9]) | 
 | 166 | 					fmt.Println("is_loop_free:     " + elem[10]) | 
 | 167 | 					fmt.Println("extras:           " + elem[11]) | 
 | 168 | 					fmt.Println("name:             " + elem[12]) | 
 | 169 | 				*/ | 
 | 170 | 				if elem[6] != "1" { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 171 | 					log.Error().Msg("The FST needs to be deterministic") | 
 | 172 | 					os.Exit(1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 173 | 				} | 
 | 174 | 				if elem[9] != "1" { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 175 | 					log.Error().Msg("The FST needs to be epsilon free") | 
 | 176 | 					os.Exit(1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 177 | 				} | 
 | 178 |  | 
 | 179 | 				elemint[0], err = strconv.Atoi(elem[1]) | 
 | 180 | 				if err != nil { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 181 | 					log.Error().Msg("Can't read arccount") | 
 | 182 | 					os.Exit(1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 183 | 				} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 184 | 				tok.arcCount = elemint[0] | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 185 |  | 
 | 186 | 				// States start at 1 in Mizobuchi et al (2000), | 
 | 187 | 				// as the state 0 is associated with a fail. | 
 | 188 | 				// Initialize states and transitions | 
 | 189 | 				elemint[0], err = strconv.Atoi(elem[2]) | 
 | 190 | 				if err != nil { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 191 | 					log.Error().Msg("Can't read statecount") | 
 | 192 | 					os.Exit(1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 193 | 				} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 194 | 				tok.stateCount = elemint[0] | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 195 | 				tok.transitions = make([]map[int]*edge, elemint[0]+1) | 
 | 196 | 				continue | 
 | 197 | 			} | 
 | 198 | 		case STATES: | 
 | 199 | 			{ | 
 | 200 | 				elem = strings.Split(line[0:len(line)-1], " ") | 
 | 201 | 				if elem[0] == "-1" { | 
 | 202 | 					continue | 
 | 203 | 				} | 
 | 204 | 				elemint[0], err = strconv.Atoi(elem[0]) | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 205 | 				if err != nil { | 
 | 206 | 					break | 
 | 207 | 				} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 208 |  | 
 | 209 | 				if len(elem) > 1 { | 
 | 210 | 					elemint[1], err = strconv.Atoi(elem[1]) | 
 | 211 | 					if err != nil { | 
 | 212 | 						break | 
 | 213 | 					} | 
 | 214 | 					if len(elem) > 2 { | 
 | 215 | 						elemint[2], err = strconv.Atoi(elem[2]) | 
 | 216 | 						if err != nil { | 
 | 217 | 							break | 
 | 218 | 						} | 
 | 219 | 						if len(elem) > 3 { | 
 | 220 | 							elemint[3], err = strconv.Atoi(elem[3]) | 
 | 221 | 							if err != nil { | 
 | 222 | 								break | 
 | 223 | 							} | 
 | 224 | 							if len(elem) > 4 { | 
 | 225 | 								elemint[4], err = strconv.Atoi(elem[4]) | 
 | 226 | 								if err != nil { | 
 | 227 | 									break | 
 | 228 | 								} | 
 | 229 | 							} | 
 | 230 | 						} | 
 | 231 | 					} | 
 | 232 | 				} | 
 | 233 |  | 
 | 234 | 				switch len(elem) { | 
 | 235 | 				case 5: | 
 | 236 | 					{ | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 237 | 						state = elemint[0] | 
 | 238 | 						inSym = elemint[1] | 
 | 239 | 						outSym = elemint[2] | 
 | 240 | 						end = elemint[3] | 
 | 241 | 						final = elemint[4] | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 242 | 					} | 
 | 243 | 				case 4: | 
 | 244 | 					{ | 
 | 245 | 						if elemint[1] == -1 { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 246 | 							state = elemint[0] | 
 | 247 | 							final = elemint[3] | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 248 | 						} else { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 249 | 							state = elemint[0] | 
 | 250 | 							inSym = elemint[1] | 
 | 251 | 							end = elemint[2] | 
 | 252 | 							final = elemint[3] | 
 | 253 | 							outSym = inSym | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 254 | 						} | 
 | 255 | 					} | 
 | 256 | 				case 3: | 
 | 257 | 					{ | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 258 | 						inSym = elemint[0] | 
 | 259 | 						outSym = elemint[1] | 
 | 260 | 						end = elemint[2] | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 261 | 					} | 
 | 262 | 				case 2: | 
 | 263 | 					{ | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 264 | 						inSym = elemint[0] | 
 | 265 | 						end = elemint[1] | 
 | 266 | 						outSym = inSym | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 267 | 					} | 
 | 268 | 				} | 
 | 269 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 270 | 				// While the states in foma start with 0, the states in the | 
 | 271 | 				// Mizobuchi FSA start with one - so we increase every state by 1. | 
 | 272 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 273 | 				if inSym != outSym { | 
 | 274 |  | 
 | 275 | 					// Allow any epsilon to become a newline | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 276 | 					if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) && | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 277 |  | 
 | 278 | 						// Allow any whitespace to be ignored | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 279 | 						!(inSym != tok.epsilon && outSym == tok.epsilon) && | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 280 |  | 
 | 281 | 						// Allow any whitespace to become a new line | 
 | 282 | 						!(tok.sigmaRev[outSym] == NEWLINE) { | 
 | 283 |  | 
 | 284 | 						log.Error().Msg( | 
 | 285 | 							"Unsupported transition: " + | 
 | 286 | 								strconv.Itoa(state) + | 
 | 287 | 								" -> " + strconv.Itoa(end) + | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 288 | 								" (" + | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 289 | 								strconv.Itoa(inSym) + | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 290 | 								":" + | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 291 | 								strconv.Itoa(outSym) + | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 292 | 								") (" + | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 293 | 								string(tok.sigmaRev[inSym]) + | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 294 | 								":" + | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 295 | 								string(tok.sigmaRev[outSym]) + | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 296 | 								")") | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 297 | 						os.Exit(1) | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 298 | 					} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 299 | 				} | 
 | 300 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 301 | 				// This collects all edges until arrstate changes | 
 | 302 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 303 | 				// TODO: | 
 | 304 | 				//   if arrin == EPSILON && arrout == TOKENEND, mark state as newline | 
 | 305 | 				//   if the next transition is the same, remove TOKENEND and add SENTENCEEND | 
 | 306 | 				//   This requires to remove the transition alltogether and marks the state instead. | 
 | 307 |  | 
 | 308 | 				// TODO: | 
 | 309 | 				//   if arrout == EPSILON, mark the transition as NOTOKEN | 
 | 310 |  | 
 | 311 | 				targetObj := &edge{ | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 312 | 					inSym:  inSym, | 
 | 313 | 					outSym: outSym, | 
 | 314 | 					end:    end + 1, | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 315 | 				} | 
 | 316 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 317 | 				// Initialize outgoing states | 
 | 318 | 				if tok.transitions[state+1] == nil { | 
 | 319 | 					tok.transitions[state+1] = make(map[int]*edge) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 320 | 				} | 
 | 321 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 322 | 				// Ignore transitions with invalid symbols | 
 | 323 | 				if inSym >= 0 { | 
 | 324 | 					tok.transitions[state+1][inSym] = targetObj | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 325 | 				} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 326 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 327 | 				// Add final transition | 
 | 328 | 				if final == 1 { | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 329 | 					tok.transitions[state+1][tok.final] = &edge{} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 330 | 				} | 
 | 331 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 332 | 				if DEBUG { | 
 | 333 | 					fmt.Println("Add", | 
 | 334 | 						state+1, "->", end+1, | 
 | 335 | 						"(", | 
 | 336 | 						inSym, | 
 | 337 | 						":", | 
 | 338 | 						outSym, | 
 | 339 | 						") (", | 
 | 340 | 						string(tok.sigmaRev[inSym]), | 
 | 341 | 						":", | 
 | 342 | 						string(tok.sigmaRev[outSym]), | 
 | 343 | 						")") | 
 | 344 | 				} | 
| Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 345 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 346 | 				continue | 
 | 347 | 			} | 
 | 348 | 		case SIGMA: | 
 | 349 | 			{ | 
 | 350 | 				elem = strings.SplitN(line[0:len(line)-1], " ", 2) | 
 | 351 |  | 
 | 352 | 				// Turn string into sigma id | 
 | 353 | 				number, err := strconv.Atoi(elem[0]) | 
 | 354 |  | 
 | 355 | 				if err != nil { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 356 | 					log.Error().Err(err) | 
 | 357 | 					os.Exit(0) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 358 | 				} | 
 | 359 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 360 | 				tok.sigmaCount = number | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 361 |  | 
 | 362 | 				var symbol rune | 
 | 363 |  | 
 | 364 | 				// Read rune | 
 | 365 | 				if utf8.RuneCountInString(elem[1]) == 1 { | 
 | 366 | 					symbol = []rune(elem[1])[0] | 
 | 367 |  | 
 | 368 | 					// Probably a MCS | 
 | 369 | 				} else if utf8.RuneCountInString(elem[1]) > 1 { | 
 | 370 | 					switch elem[1] { | 
 | 371 | 					case "@_EPSILON_SYMBOL_@": | 
 | 372 | 						{ | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 373 | 							tok.epsilon = number | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 374 | 							continue | 
 | 375 | 						} | 
 | 376 | 					case "@_UNKNOWN_SYMBOL_@": | 
 | 377 | 						{ | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 378 | 							tok.unknown = number | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 379 | 							continue | 
 | 380 | 						} | 
 | 381 |  | 
 | 382 | 					case "@_IDENTITY_SYMBOL_@": | 
 | 383 | 						{ | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 384 | 							tok.identity = number | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 385 | 							continue | 
 | 386 | 						} | 
 | 387 | 					default: | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 388 | 						{ | 
 | 389 | 							log.Error().Msg("MCS not supported: " + line) | 
 | 390 | 							os.Exit(1) | 
 | 391 | 						} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 392 | 					} | 
 | 393 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 394 | 				} else { // Probably a new line symbol | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 395 | 					line, err = r.ReadString('\n') | 
 | 396 | 					if err != nil { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 397 | 						log.Error().Err(err) | 
 | 398 | 						os.Exit(0) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 399 | 					} | 
 | 400 | 					if len(line) != 1 { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 401 | 						log.Error().Msg("MCS not supported:" + line) | 
 | 402 | 						os.Exit(0) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 403 | 					} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 404 | 					symbol = rune(NEWLINE) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 405 | 				} | 
 | 406 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 407 | 				tok.sigmaRev[number] = symbol | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 408 | 			} | 
 | 409 | 		} | 
 | 410 | 	} | 
 | 411 |  | 
 | 412 | 	return tok | 
 | 413 | } | 
 | 414 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 415 | // Set alphabet A to the list of all symbols | 
 | 416 | // outgoing from s | 
 | 417 | func (tok *Tokenizer) get_set(s int, A *[]int) { | 
 | 418 | 	for a := range tok.transitions[s] { | 
 | 419 | 		*A = append(*A, a) | 
 | 420 | 	} | 
 | 421 |  | 
 | 422 | 	// Not required, but simplifies bug hunting | 
 | 423 | 	sort.Ints(*A) | 
 | 424 | } | 
 | 425 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 426 | // Implementation of Mizobuchi et al (2000), p.128 | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 427 | func (tok *Tokenizer) ToDoubleArray() *DaTokenizer { | 
 | 428 |  | 
 | 429 | 	dat := &DaTokenizer{ | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 430 | 		sigma:     make(map[rune]int), | 
 | 431 | 		loadLevel: -1, | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 432 | 		final:     tok.final, | 
 | 433 | 		unknown:   tok.unknown, | 
 | 434 | 		identity:  tok.identity, | 
 | 435 | 		epsilon:   tok.epsilon, | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 436 | 	} | 
 | 437 |  | 
 | 438 | 	for num, sym := range tok.sigmaRev { | 
 | 439 | 		dat.sigma[sym] = num | 
 | 440 | 	} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 441 |  | 
 | 442 | 	mark := 0 | 
 | 443 | 	size := 0 | 
 | 444 |  | 
 | 445 | 	// Create a mapping from s to t | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 446 | 	table := make([]*mapping, tok.arcCount+1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 447 |  | 
 | 448 | 	table[size] = &mapping{source: 1, target: 1} | 
 | 449 | 	size++ | 
 | 450 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 451 | 	// Allocate space for the outgoing symbol range | 
 | 452 | 	A := make([]int, 0, tok.sigmaCount) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 453 |  | 
 | 454 | 	for mark < size { | 
 | 455 | 		s := table[mark].source // This is a state in Ms | 
 | 456 | 		t := table[mark].target // This is a state in Mt | 
 | 457 | 		mark++ | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 458 |  | 
 | 459 | 		// Following the paper, here the state t can be remembered | 
 | 460 | 		// in the set of states St | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 461 | 		A = A[:0] | 
 | 462 | 		tok.get_set(s, &A) | 
 | 463 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 464 | 		// Set base to the first free slot in the double array | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 465 | 		dat.setBase(t, dat.xCheck(A)) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 466 |  | 
| Akron | 773b1ef | 2021-08-03 17:37:20 +0200 | [diff] [blame] | 467 | 		// TODO: | 
 | 468 | 		//   Sort the outgoing transitions based onm the | 
 | 469 | 		//   outdegree of .end | 
 | 470 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 471 | 		// Iterate over all outgoing symbols | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 472 | 		for _, a := range A { | 
 | 473 |  | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 474 | 			if a != tok.final { | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 475 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 476 | 				// Aka g(s, a) | 
 | 477 | 				s1 := tok.transitions[s][a].end | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 478 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 479 | 				// Store the transition | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 480 | 				t1 := dat.getBase(t) + a | 
 | 481 | 				dat.setCheck(t1, t) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 482 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 483 | 				// Check for representative states | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 484 | 				r := in_table(s1, table, size) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 485 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 486 | 				if r == 0 { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 487 | 					// Remember the mapping | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 488 | 					table[size] = &mapping{source: s1, target: t1} | 
 | 489 | 					size++ | 
 | 490 | 				} else { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 491 | 					// Overwrite with the representative state | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 492 | 					dat.setBase(t1, -1*r) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 493 | 				} | 
 | 494 | 			} else { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 495 | 				// Store a final transition | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 496 | 				dat.setCheck(dat.getBase(t)+dat.final, t) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 497 | 			} | 
 | 498 | 		} | 
 | 499 | 	} | 
 | 500 |  | 
 | 501 | 	// Following Mizobuchi et al (2000) the size of the | 
 | 502 | 	// FSA should be stored in check(1). | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 503 | 	dat.setCheck(1, dat.maxSize+1) | 
 | 504 | 	dat.array = dat.array[:dat.maxSize+1] | 
 | 505 | 	return dat | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 506 | } | 
 | 507 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 508 | // Check the table if a mapping of s | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 509 | // exists and return this as a representative. | 
 | 510 | // Currently iterates through the whole table | 
 | 511 | // in a bruteforce manner. | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 512 | func in_table(s int, table []*mapping, size int) int { | 
 | 513 | 	for x := 0; x < size; x++ { | 
 | 514 | 		if table[x].source == s { | 
 | 515 | 			return table[x].target | 
 | 516 | 		} | 
 | 517 | 	} | 
 | 518 | 	return 0 | 
 | 519 | } | 
 | 520 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 521 | // Resize double array when necessary | 
 | 522 | func (dat *DaTokenizer) resize(l int) { | 
 | 523 | 	// TODO: | 
 | 524 | 	//   This is a bit too aggressive atm and should be calmed down. | 
 | 525 | 	if len(dat.array) <= l { | 
 | 526 | 		dat.array = append(dat.array, make([]int, l)...) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 527 | 	} | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 528 | } | 
| Akron | c9d84a6 | 2021-08-03 15:56:03 +0200 | [diff] [blame] | 529 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 530 | // Set base value in double array | 
 | 531 | func (dat *DaTokenizer) setBase(p int, v int) { | 
 | 532 | 	l := p*2 + 1 | 
 | 533 | 	dat.resize(l) | 
 | 534 | 	if dat.maxSize < l { | 
 | 535 | 		dat.maxSize = l | 
 | 536 | 	} | 
 | 537 | 	dat.array[p*2] = v | 
 | 538 | } | 
 | 539 |  | 
 | 540 | // Get base value in double array | 
 | 541 | func (dat *DaTokenizer) getBase(p int) int { | 
 | 542 | 	if p*2 >= len(dat.array) { | 
 | 543 | 		return 0 | 
 | 544 | 	} | 
 | 545 | 	return dat.array[p*2] | 
 | 546 | } | 
 | 547 |  | 
 | 548 | // Set check value in double array | 
 | 549 | func (dat *DaTokenizer) setCheck(p int, v int) { | 
 | 550 | 	l := p*2 + 1 | 
 | 551 | 	dat.resize(l) | 
 | 552 | 	if dat.maxSize < l { | 
 | 553 | 		dat.maxSize = l | 
 | 554 | 	} | 
 | 555 | 	dat.array[(p*2)+1] = v | 
 | 556 | } | 
 | 557 |  | 
 | 558 | // Get check value in double array | 
 | 559 | func (dat *DaTokenizer) getCheck(p int) int { | 
 | 560 | 	if (p*2)+1 >= len(dat.array) { | 
 | 561 | 		return 0 | 
 | 562 | 	} | 
 | 563 | 	return dat.array[(p*2)+1] | 
 | 564 | } | 
 | 565 |  | 
 | 566 | // Set size of double array | 
 | 567 | func (dat *DaTokenizer) setSize(p, v int) { | 
 | 568 | 	dat.setCheck(1, v) | 
 | 569 | } | 
 | 570 |  | 
 | 571 | // Get size of double array | 
 | 572 | func (dat *DaTokenizer) GetSize(p int) int { | 
 | 573 | 	return dat.getCheck(1) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 574 | } | 
 | 575 |  | 
 | 576 | // Based on Mizobuchi et al (2000), p. 124 | 
 | 577 | // This iterates for every state through the complete double array | 
 | 578 | // structure until it finds a gap that fits all outgoing transitions | 
 | 579 | // of the state. This is extremely slow, but is only necessary in the | 
 | 580 | // construction phase of the tokenizer. | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 581 | func (dat *DaTokenizer) xCheck(symbols []int) int { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 582 |  | 
 | 583 | 	// Start at the first entry of the double array list | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 584 | 	base := 1 | 
 | 585 |  | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 586 | OVERLAP: | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 587 |  | 
 | 588 | 	// Resize the array if necessary | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 589 | 	dat.resize((base + dat.final) * 2) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 590 | 	for _, a := range symbols { | 
| Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 591 | 		if dat.getCheck(base+a) != 0 { | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 592 | 			base++ | 
 | 593 | 			goto OVERLAP | 
 | 594 | 		} | 
 | 595 | 	} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 596 | 	return base | 
 | 597 | } | 
 | 598 |  | 
| Akron | 773b1ef | 2021-08-03 17:37:20 +0200 | [diff] [blame] | 599 | func (dat *DaTokenizer) LoadLevel() float64 { | 
| Akron | d66a926 | 2021-08-03 17:09:09 +0200 | [diff] [blame] | 600 |  | 
| Akron | 773b1ef | 2021-08-03 17:37:20 +0200 | [diff] [blame] | 601 | 	if dat.loadLevel >= 0 { | 
 | 602 | 		return dat.loadLevel | 
 | 603 | 	} | 
| Akron | d66a926 | 2021-08-03 17:09:09 +0200 | [diff] [blame] | 604 | 	nonEmpty := 0 | 
 | 605 | 	all := len(dat.array) / 2 | 
 | 606 | 	for x := 1; x <= len(dat.array); x = x + 2 { | 
 | 607 | 		if dat.array[x] != 0 { | 
 | 608 | 			nonEmpty++ | 
 | 609 | 		} | 
 | 610 | 	} | 
| Akron | 773b1ef | 2021-08-03 17:37:20 +0200 | [diff] [blame] | 611 | 	dat.loadLevel = float64(nonEmpty) / float64(all) * 100 | 
 | 612 | 	return dat.loadLevel | 
| Akron | d66a926 | 2021-08-03 17:09:09 +0200 | [diff] [blame] | 613 | } | 
 | 614 |  | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 615 | // WriteTo stores the double array data in an io.Writer. | 
 | 616 | func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) { | 
 | 617 |  | 
 | 618 | 	// Store magical header | 
 | 619 | 	all, err := w.Write([]byte(MAGIC)) | 
 | 620 | 	if err != nil { | 
 | 621 | 		log.Error().Msg("Unable to write data") | 
 | 622 | 	} | 
 | 623 |  | 
 | 624 | 	// Get sigma as a list | 
 | 625 | 	sigmalist := make([]rune, len(dat.sigma)+16) | 
 | 626 | 	max := 0 | 
 | 627 | 	for sym, num := range dat.sigma { | 
 | 628 | 		sigmalist[num] = sym | 
 | 629 | 		if num > max { | 
 | 630 | 			max = num | 
 | 631 | 		} | 
 | 632 | 	} | 
 | 633 |  | 
 | 634 | 	sigmalist = sigmalist[:max+1] | 
 | 635 |  | 
 | 636 | 	buf := make([]byte, 0, 12) | 
 | 637 | 	bo.PutUint16(buf[0:2], VERSION) | 
| Akron | c17f1ca | 2021-08-03 19:47:27 +0200 | [diff] [blame] | 638 | 	bo.PutUint16(buf[2:4], uint16(dat.epsilon)) | 
 | 639 | 	bo.PutUint16(buf[4:6], uint16(dat.unknown)) | 
 | 640 | 	bo.PutUint16(buf[6:8], uint16(dat.identity)) | 
 | 641 | 	bo.PutUint16(buf[8:10], uint16(dat.final)) | 
| Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 642 | 	bo.PutUint16(buf[10:12], uint16(len(sigmalist))) | 
 | 643 | 	more, err := w.Write(buf[0:12]) | 
 | 644 | 	if err != nil { | 
 | 645 | 		log.Error().Msg("Unable to write data") | 
 | 646 | 	} | 
 | 647 |  | 
 | 648 | 	all += more | 
 | 649 |  | 
 | 650 | 	wbuf := bytes.NewBuffer(nil) | 
 | 651 | 	wbufWrap := bufio.NewWriter(wbuf) | 
 | 652 |  | 
 | 653 | 	// Write sigma | 
 | 654 | 	for _, sym := range sigmalist { | 
 | 655 | 		more, err = wbufWrap.WriteRune(sym) | 
 | 656 | 		if err != nil { | 
 | 657 | 			log.Error().Msg("Unable to write data") | 
 | 658 | 		} | 
 | 659 | 		all += more | 
 | 660 | 	} | 
 | 661 | 	wbufWrap.Flush() | 
 | 662 | 	more, err = w.Write(wbuf.Bytes()) | 
 | 663 | 	if err != nil { | 
 | 664 | 		log.Error().Msg("Unable to write data") | 
 | 665 | 	} | 
 | 666 | 	all += more | 
 | 667 |  | 
 | 668 | 	// Test marker - could be checksum | 
 | 669 | 	more, err = w.Write([]byte("T")) | 
 | 670 | 	if err != nil { | 
 | 671 | 		log.Error().Msg("Unable to write data") | 
 | 672 | 	} | 
 | 673 | 	all += more | 
 | 674 |  | 
 | 675 | 	wbuf.Reset() | 
 | 676 |  | 
 | 677 | 	for _, d := range dat.array { | 
 | 678 | 		bo.PutUint32(buf[0:4], uint32(d)) | 
 | 679 | 		more, err := w.Write(buf[0:4]) | 
 | 680 | 		if err != nil { | 
 | 681 | 			log.Error().Msg("Unable to write data") | 
 | 682 | 		} | 
 | 683 | 		all += more | 
 | 684 | 	} | 
 | 685 |  | 
 | 686 | 	return int64(all), err | 
 | 687 | } | 
 | 688 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 689 | // Match an input string against the double array | 
 | 690 | // FSA. | 
 | 691 | // | 
 | 692 | // Based on Mizobuchi et al (2000), p. 129, | 
 | 693 | // with additional support for IDENTITY, UNKNOWN | 
 | 694 | // and EPSILON transitions. | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 695 | func (dat *DaTokenizer) Match(input string) bool { | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 696 | 	var a int | 
 | 697 | 	var tu int | 
 | 698 | 	var ok bool | 
 | 699 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 700 | 	t := 1 // Initial state | 
 | 701 | 	chars := []rune(input) | 
 | 702 | 	i := 0 | 
 | 703 |  | 
| Akron | 49d27ee | 2021-08-03 11:58:13 +0200 | [diff] [blame] | 704 | 	for i < len(chars) { | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 705 | 		a, ok = dat.sigma[chars[i]] | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 706 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 707 | 		// Support identity symbol if character is not in sigma | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 708 | 		if !ok && dat.identity != -1 { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 709 | 			if DEBUG { | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 710 | 				fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 711 | 			} | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 712 | 			a = dat.identity | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 713 | 		} else if DEBUG { | 
| Akron | 49d27ee | 2021-08-03 11:58:13 +0200 | [diff] [blame] | 714 | 			fmt.Println("Sigma transition is okay for [", string(chars[i]), "]") | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 715 | 		} | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 716 | 		tu = t | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 717 | 	CHECK: | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 718 | 		t = dat.getBase(tu) + a | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 719 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 720 | 		// Check if the transition is valid according to the double array | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 721 | 		if t > dat.getCheck(1) || dat.getCheck(t) != tu { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 722 |  | 
 | 723 | 			if DEBUG { | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 724 | 				fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu) | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 725 | 			} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 726 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 727 | 			if !ok && a == dat.identity { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 728 | 				// Try again with unknown symbol, in case identity failed | 
 | 729 | 				if DEBUG { | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 730 | 					fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 731 | 				} | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 732 | 				a = dat.unknown | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 733 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 734 | 			} else if a != dat.epsilon { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 735 | 				// Try again with epsilon symbol, in case everything else failed | 
 | 736 | 				if DEBUG { | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 737 | 					fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 738 | 				} | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 739 | 				a = dat.epsilon | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 740 | 			} else { | 
 | 741 | 				break | 
 | 742 | 			} | 
 | 743 | 			goto CHECK | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 744 | 		} else if dat.getBase(t) < 0 { | 
| Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 745 | 			// Move to representative state | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 746 | 			t = -1 * dat.getBase(t) | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 747 | 		} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 748 |  | 
 | 749 | 		// Transition is fine | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 750 | 		if a != dat.epsilon { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 751 | 			// Character consumed | 
| Akron | 49d27ee | 2021-08-03 11:58:13 +0200 | [diff] [blame] | 752 | 			i++ | 
 | 753 | 		} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 754 | 		// TODO: | 
 | 755 | 		//   Prevent endless epsilon loops! | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 756 | 	} | 
 | 757 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 758 | 	if i != len(chars) { | 
 | 759 | 		if DEBUG { | 
 | 760 | 			fmt.Println("Not at the end") | 
 | 761 | 		} | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 762 | 		return false | 
 | 763 | 	} | 
 | 764 |  | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 765 | FINALCHECK: | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 766 |  | 
 | 767 | 	// Automaton is in a final state | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 768 | 	if dat.getCheck(dat.getBase(t)+dat.final) == t { | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 769 | 		return true | 
 | 770 | 	} | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 771 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 772 | 	// Check epsilon transitions until a final state is reached | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 773 | 	tu = t | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 774 | 	t = dat.getBase(tu) + dat.epsilon | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 775 |  | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 776 | 	// Epsilon transition failed | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 777 | 	if t > dat.getCheck(1) || dat.getCheck(t) != tu { | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 778 | 		if DEBUG { | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 779 | 			fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu) | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 780 | 		} | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 781 | 		return false | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 782 |  | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 783 | 	} else if dat.getBase(t) < 0 { | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 784 | 		// Move to representative state | 
| Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame^] | 785 | 		t = -1 * dat.getBase(t) | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 786 | 	} | 
| Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 787 |  | 
| Akron | 465a099 | 2021-08-03 11:28:48 +0200 | [diff] [blame] | 788 | 	goto FINALCHECK | 
| Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 789 | } |