Blame - matrix.go - KorAP/Datok

blob: 356efa9de3c3db2de8c546dbe2bff52cfaae8789 [file] [log] [blame]

Akron	1c34ce6	2021-09-23 23:27:39 +0200	[diff] [blame]	1	package datok
				2
				3	import (
				4	"bufio"
				5	"fmt"
				6	"io"
				7	)
				8
				9	type MatrixTokenizer struct {
				10	sigma map[rune]int
				11	sigmaASCII [256]int
				12	array []int
				13	stateCount int
				14
				15	// Special symbols in sigma
				16	epsilon int
				17	unknown int
				18	identity int
				19	final int
				20	tokenend int
				21	}
				22
				23	// ToMatrix turns the intermediate tokenizer into a
				24	// matrix representation.
				25	func (auto Automaton) ToMatrix() MatrixTokenizer {
				26
				27	mat := &MatrixTokenizer{
				28	sigma: make(map[rune]int),
				29	final: auto.final,
				30	unknown: auto.unknown,
				31	identity: auto.identity,
				32	epsilon: auto.epsilon,
				33	tokenend: auto.tokenend,
				34	stateCount: auto.stateCount,
				35	}
				36
				37	mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
				38
				39	for num, sym := range auto.sigmaRev {
				40	if int(sym) < 256 {
				41	mat.sigmaASCII[int(sym)] = num
				42	}
				43	mat.sigma[sym] = num
				44	if num > auto.sigmaCount {
				45	panic("sigmaCount is smaller")
				46	}
				47	}
				48	remember := make([]bool, auto.stateCount+2)
				49
				50	// Store all transitions in matrix
				51	var toMatrix func([]int, int)
				52
				53	toMatrix = func(matrix []int, start int) {
				54	if start > auto.stateCount {
				55	panic("stateCount is smaller")
				56	}
				57	if remember[start] {
				58	return
				59	}
				60	remember[start] = true
				61	for alpha, t := range auto.transitions[start] {
				62	matrix[(alpha-1)*auto.stateCount+start] = t.end
				63
				64	// Mark nontoken transitions
				65	if t.nontoken {
				66	matrix[(alpha-1)auto.stateCount+start] = -1
				67	}
				68
				69	toMatrix(matrix, t.end)
				70	}
				71	}
				72
				73	toMatrix(mat.array, 1)
				74
				75	return mat
				76	}
				77
				78	func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
				79	var a int
				80	var t0 int
				81	t := int(1) // Initial state
				82	var ok, rewindBuffer bool
				83
				84	// Remember the last position of a possible tokenend,
				85	// in case the automaton fails.
				86	epsilonState := int(0)
				87	epsilonOffset := 0
				88
Akron	5c82a92	2021-09-24 19:11:29 +0200	[diff] [blame^]	89	// Remember if the last transition was epsilon
				90	sentenceEnd := false
				91
Akron	1c34ce6	2021-09-23 23:27:39 +0200	[diff] [blame]	92	buffer := make([]rune, 1024)
				93	buffo := 0 // Buffer offset
				94	buffi := 0 // Buffer length
				95
				96	reader := bufio.NewReader(r)
				97	writer := bufio.NewWriter(w)
				98	defer writer.Flush()
				99
				100	var char rune
				101
				102	var err error
				103	eof := false
				104	newchar := true
				105
				106	PARSECHARM:
				107	for {
				108
				109	if newchar {
				110	// Get from reader if buffer is empty
				111	if buffo >= buffi {
				112	if eof {
				113	break
				114	}
				115	char, _, err = reader.ReadRune()
				116
				117	// No more runes to read
				118	if err != nil {
				119	eof = true
				120	break
				121	}
				122	buffer[buffi] = char
				123	buffi++
				124	}
				125
				126	char = buffer[buffo]
				127
				128	if DEBUG {
				129	fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
				130	}
				131
				132	// TODO:
				133	// Better not repeatedly check for a!
				134	// Possibly keep a buffer with a.
				135	if int(char) < 256 {
				136	a = mat.sigmaASCII[int(char)]
				137	} else {
				138	a, ok = mat.sigma[char]
				139	if !ok {
				140	a = 0
				141	}
				142	}
				143
				144	// Use identity symbol if character is not in sigma
				145	if a == 0 && mat.identity != -1 {
				146	a = mat.identity
				147	}
				148
				149	t0 = t
				150
				151	// Check for epsilon transitions and remember
				152
				153	if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
				154	// Remember state for backtracking to last tokenend state
				155	epsilonState = t0
				156	epsilonOffset = buffo
				157	}
				158	}
				159
				160	// Checks a transition based on t0, a and buffo
				161	t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
				162	// t = mat.array[t0].getBase() + uint32(a)
				163	// ta := dat.array[t]
				164
				165	if DEBUG {
				166	// Char is only relevant if set
				167	fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
				168	/*
				169	if false {
				170	fmt.Println(dat.outgoing(t0))
				171	}
				172	*/
				173	}
				174
				175	// Check if the transition is invalid according to the double array
				176	// if t > dat.array[1].getCheck() \|\| ta.getCheck() != t0 {
				177	if t == 0 {
				178
				179	if DEBUG {
				180	fmt.Println("Match is not fine!")
				181	}
				182
				183	if !ok && a == mat.identity {
				184
				185	// Try again with unknown symbol, in case identity failed
				186	// Char is only relevant when set
				187	if DEBUG {
				188	fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
				189	}
				190	a = mat.unknown
				191
				192	} else if a != mat.epsilon {
				193
				194	// Try again with epsilon symbol, in case everything else failed
				195	t0 = epsilonState
				196	epsilonState = 0 // reset
				197	buffo = epsilonOffset
				198	a = mat.epsilon
				199
				200	if DEBUG {
				201	fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
				202	}
				203
				204	} else {
				205	break
				206	}
				207
				208	newchar = false
				209	continue
				210	}
				211
				212	// Transition was successful
				213	rewindBuffer = false
				214
				215	// Transition consumes a character
				216	if a != mat.epsilon {
				217
				218	buffo++
				219
				220	// Transition does not produce a character
				221	// if buffo == 1 && ta.isNonToken() {
				222	if buffo == 1 && t < 0 {
				223	if DEBUG {
				224	fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
				225	}
				226	rewindBuffer = true
				227	}
				228
				229	} else {
				230	// Transition marks the end of a token - so flush the buffer
				231
				232	if buffi > 0 {
				233	if DEBUG {
				234	fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
				235	}
				236	writer.WriteString(string(buffer[:buffo]))
				237	rewindBuffer = true
Akron	5c82a92	2021-09-24 19:11:29 +0200	[diff] [blame^]	238	sentenceEnd = false
				239	} else {
				240	sentenceEnd = true
Akron	1c34ce6	2021-09-23 23:27:39 +0200	[diff] [blame]	241	}
				242	if DEBUG {
				243	fmt.Println("-> Newline")
				244	}
				245	writer.WriteRune('\n')
				246	}
				247
				248	// Rewind the buffer if necessary
				249	if rewindBuffer {
				250
				251	// TODO: Better as a ring buffer
				252	for x, i := range buffer[buffo:buffi] {
				253	buffer[x] = i
				254	}
				255
				256	buffi -= buffo
				257	epsilonOffset -= buffo
				258	buffo = 0
				259	if DEBUG {
				260	fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
				261	}
				262	}
				263
				264	// Move to representative state
				265	/*
				266	if ta.isSeparate() {
				267	t = ta.getBase()
				268	ta = dat.array[t]
				269
				270	if DEBUG {
				271	fmt.Println("Representative pointing to", t)
				272	}
				273	}
				274	*/
				275
				276	// Ignore nontoken mark
				277	if t < 0 {
				278	t *= -1
				279	}
				280
				281	newchar = true
				282
				283	// TODO:
				284	// Prevent endless epsilon loops!
				285	}
				286
				287	// Input reader is not yet finished
				288	if !eof {
				289	if DEBUG {
				290	fmt.Println("Not at the end")
				291	}
				292	return false
				293	}
				294
				295	if DEBUG {
				296	fmt.Println("Entering final check")
				297	}
				298	/*
				299	// Automaton is in a final state, so flush the buffer and return
				300	x := dat.array[t].getBase() + uint32(dat.final)
				301
				302	if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
				303
				304	if buffi > 0 {
				305	if DEBUG {
				306	fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
				307	}
				308	writer.WriteString(string(buffer[:buffi]))
				309
				310	if dat.array[t].isTokenEnd() {
				311	writer.WriteRune('\n')
				312	if DEBUG {
				313	fmt.Println("-> Newline")
				314	}
				315	}
				316	}
				317
				318	// Add an additional sentence ending, if the file is over but no explicit
				319	// sentence split was reached. This may be controversial and therefore
				320	// optional via parameter.
				321	if !dat.array[t0].isTokenEnd() {
				322	writer.WriteRune('\n')
				323	if DEBUG {
				324	fmt.Println("-> Newline")
				325	}
				326	}
				327
				328	// TODO:
				329	// There may be a new line at the end, from an epsilon,
				330	// so we may need to go on!
				331	return true
				332	}
				333	*/
				334
				335	// Check epsilon transitions until a final state is reached
				336	t0 = t
				337	// t = dat.array[t0].getBase() + uint32(dat.epsilon)
				338	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
				339	a = mat.epsilon
				340	newchar = false
				341	// if dat.array[t].getCheck() == t0 {
				342	// t can't be < 0
				343	if t > 0 {
				344	// Remember state for backtracking to last tokenend state
				345	goto PARSECHARM
				346
				347	} else if epsilonState != 0 {
				348	t0 = epsilonState
				349	epsilonState = 0 // reset
				350	buffo = epsilonOffset
				351	if DEBUG {
				352	fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
				353	}
				354	goto PARSECHARM
				355	}
Akron	1c34ce6	2021-09-23 23:27:39 +0200	[diff] [blame]	356
Akron	5c82a92	2021-09-24 19:11:29 +0200	[diff] [blame^]	357	// Add an additional sentence ending, if the file is over but no explicit
				358	// sentence split was reached. This may be controversial and therefore
				359	// optional via parameter.
				360	if !sentenceEnd {
				361	writer.WriteRune('\n')
				362	if DEBUG {
				363	fmt.Println("-> Newline")
				364	}
				365	}
				366
				367	return true
Akron	1c34ce6	2021-09-23 23:27:39 +0200	[diff] [blame]	368	}