Blame - matrix.go - KorAP/Datok

blob: 0ec940e200344d9e62dfb55487f8b81748c8a652 [file] [log] [blame]

Akron	1c34ce6	2021-09-23 23:27:39 +0200	[diff] [blame^]	1	package datok
				2
				3	import (
				4	"bufio"
				5	"fmt"
				6	"io"
				7	)
				8
				9	type MatrixTokenizer struct {
				10	sigma map[rune]int
				11	sigmaASCII [256]int
				12	array []int
				13	stateCount int
				14
				15	// Special symbols in sigma
				16	epsilon int
				17	unknown int
				18	identity int
				19	final int
				20	tokenend int
				21	}
				22
				23	// ToMatrix turns the intermediate tokenizer into a
				24	// matrix representation.
				25	func (auto Automaton) ToMatrix() MatrixTokenizer {
				26
				27	mat := &MatrixTokenizer{
				28	sigma: make(map[rune]int),
				29	final: auto.final,
				30	unknown: auto.unknown,
				31	identity: auto.identity,
				32	epsilon: auto.epsilon,
				33	tokenend: auto.tokenend,
				34	stateCount: auto.stateCount,
				35	}
				36
				37	mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
				38
				39	for num, sym := range auto.sigmaRev {
				40	if int(sym) < 256 {
				41	mat.sigmaASCII[int(sym)] = num
				42	}
				43	mat.sigma[sym] = num
				44	if num > auto.sigmaCount {
				45	panic("sigmaCount is smaller")
				46	}
				47	}
				48	remember := make([]bool, auto.stateCount+2)
				49
				50	// Store all transitions in matrix
				51	var toMatrix func([]int, int)
				52
				53	toMatrix = func(matrix []int, start int) {
				54	if start > auto.stateCount {
				55	panic("stateCount is smaller")
				56	}
				57	if remember[start] {
				58	return
				59	}
				60	remember[start] = true
				61	for alpha, t := range auto.transitions[start] {
				62	matrix[(alpha-1)*auto.stateCount+start] = t.end
				63
				64	// Mark nontoken transitions
				65	if t.nontoken {
				66	matrix[(alpha-1)auto.stateCount+start] = -1
				67	}
				68
				69	toMatrix(matrix, t.end)
				70	}
				71	}
				72
				73	toMatrix(mat.array, 1)
				74
				75	return mat
				76	}
				77
				78	func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
				79	var a int
				80	var t0 int
				81	t := int(1) // Initial state
				82	var ok, rewindBuffer bool
				83
				84	// Remember the last position of a possible tokenend,
				85	// in case the automaton fails.
				86	epsilonState := int(0)
				87	epsilonOffset := 0
				88
				89	buffer := make([]rune, 1024)
				90	buffo := 0 // Buffer offset
				91	buffi := 0 // Buffer length
				92
				93	reader := bufio.NewReader(r)
				94	writer := bufio.NewWriter(w)
				95	defer writer.Flush()
				96
				97	var char rune
				98
				99	var err error
				100	eof := false
				101	newchar := true
				102
				103	PARSECHARM:
				104	for {
				105
				106	if newchar {
				107	// Get from reader if buffer is empty
				108	if buffo >= buffi {
				109	if eof {
				110	break
				111	}
				112	char, _, err = reader.ReadRune()
				113
				114	// No more runes to read
				115	if err != nil {
				116	eof = true
				117	break
				118	}
				119	buffer[buffi] = char
				120	buffi++
				121	}
				122
				123	char = buffer[buffo]
				124
				125	if DEBUG {
				126	fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
				127	}
				128
				129	// TODO:
				130	// Better not repeatedly check for a!
				131	// Possibly keep a buffer with a.
				132	if int(char) < 256 {
				133	a = mat.sigmaASCII[int(char)]
				134	} else {
				135	a, ok = mat.sigma[char]
				136	if !ok {
				137	a = 0
				138	}
				139	}
				140
				141	// Use identity symbol if character is not in sigma
				142	if a == 0 && mat.identity != -1 {
				143	a = mat.identity
				144	}
				145
				146	t0 = t
				147
				148	// Check for epsilon transitions and remember
				149
				150	if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
				151	// Remember state for backtracking to last tokenend state
				152	epsilonState = t0
				153	epsilonOffset = buffo
				154	}
				155	}
				156
				157	// Checks a transition based on t0, a and buffo
				158	t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
				159	// t = mat.array[t0].getBase() + uint32(a)
				160	// ta := dat.array[t]
				161
				162	if DEBUG {
				163	// Char is only relevant if set
				164	fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
				165	/*
				166	if false {
				167	fmt.Println(dat.outgoing(t0))
				168	}
				169	*/
				170	}
				171
				172	// Check if the transition is invalid according to the double array
				173	// if t > dat.array[1].getCheck() \|\| ta.getCheck() != t0 {
				174	if t == 0 {
				175
				176	if DEBUG {
				177	fmt.Println("Match is not fine!")
				178	}
				179
				180	if !ok && a == mat.identity {
				181
				182	// Try again with unknown symbol, in case identity failed
				183	// Char is only relevant when set
				184	if DEBUG {
				185	fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
				186	}
				187	a = mat.unknown
				188
				189	} else if a != mat.epsilon {
				190
				191	// Try again with epsilon symbol, in case everything else failed
				192	t0 = epsilonState
				193	epsilonState = 0 // reset
				194	buffo = epsilonOffset
				195	a = mat.epsilon
				196
				197	if DEBUG {
				198	fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
				199	}
				200
				201	} else {
				202	break
				203	}
				204
				205	newchar = false
				206	continue
				207	}
				208
				209	// Transition was successful
				210	rewindBuffer = false
				211
				212	// Transition consumes a character
				213	if a != mat.epsilon {
				214
				215	buffo++
				216
				217	// Transition does not produce a character
				218	// if buffo == 1 && ta.isNonToken() {
				219	if buffo == 1 && t < 0 {
				220	if DEBUG {
				221	fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
				222	}
				223	rewindBuffer = true
				224	}
				225
				226	} else {
				227	// Transition marks the end of a token - so flush the buffer
				228
				229	if buffi > 0 {
				230	if DEBUG {
				231	fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
				232	}
				233	writer.WriteString(string(buffer[:buffo]))
				234	rewindBuffer = true
				235	}
				236	if DEBUG {
				237	fmt.Println("-> Newline")
				238	}
				239	writer.WriteRune('\n')
				240	}
				241
				242	// Rewind the buffer if necessary
				243	if rewindBuffer {
				244
				245	// TODO: Better as a ring buffer
				246	for x, i := range buffer[buffo:buffi] {
				247	buffer[x] = i
				248	}
				249
				250	buffi -= buffo
				251	epsilonOffset -= buffo
				252	buffo = 0
				253	if DEBUG {
				254	fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
				255	}
				256	}
				257
				258	// Move to representative state
				259	/*
				260	if ta.isSeparate() {
				261	t = ta.getBase()
				262	ta = dat.array[t]
				263
				264	if DEBUG {
				265	fmt.Println("Representative pointing to", t)
				266	}
				267	}
				268	*/
				269
				270	// Ignore nontoken mark
				271	if t < 0 {
				272	t *= -1
				273	}
				274
				275	newchar = true
				276
				277	// TODO:
				278	// Prevent endless epsilon loops!
				279	}
				280
				281	// Input reader is not yet finished
				282	if !eof {
				283	if DEBUG {
				284	fmt.Println("Not at the end")
				285	}
				286	return false
				287	}
				288
				289	if DEBUG {
				290	fmt.Println("Entering final check")
				291	}
				292	/*
				293	// Automaton is in a final state, so flush the buffer and return
				294	x := dat.array[t].getBase() + uint32(dat.final)
				295
				296	if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
				297
				298	if buffi > 0 {
				299	if DEBUG {
				300	fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
				301	}
				302	writer.WriteString(string(buffer[:buffi]))
				303
				304	if dat.array[t].isTokenEnd() {
				305	writer.WriteRune('\n')
				306	if DEBUG {
				307	fmt.Println("-> Newline")
				308	}
				309	}
				310	}
				311
				312	// Add an additional sentence ending, if the file is over but no explicit
				313	// sentence split was reached. This may be controversial and therefore
				314	// optional via parameter.
				315	if !dat.array[t0].isTokenEnd() {
				316	writer.WriteRune('\n')
				317	if DEBUG {
				318	fmt.Println("-> Newline")
				319	}
				320	}
				321
				322	// TODO:
				323	// There may be a new line at the end, from an epsilon,
				324	// so we may need to go on!
				325	return true
				326	}
				327	*/
				328
				329	// Check epsilon transitions until a final state is reached
				330	t0 = t
				331	// t = dat.array[t0].getBase() + uint32(dat.epsilon)
				332	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
				333	a = mat.epsilon
				334	newchar = false
				335	// if dat.array[t].getCheck() == t0 {
				336	// t can't be < 0
				337	if t > 0 {
				338	// Remember state for backtracking to last tokenend state
				339	goto PARSECHARM
				340
				341	} else if epsilonState != 0 {
				342	t0 = epsilonState
				343	epsilonState = 0 // reset
				344	buffo = epsilonOffset
				345	if DEBUG {
				346	fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
				347	}
				348	goto PARSECHARM
				349	}
				350	return false
				351
				352	}