package datok

import (
	"bufio"
	"io"
	"strconv"
)
// Bits is a flag set that controls which outputs the TokenWriter
// produces and how positions are counted.
type Bits uint8

const (
	TOKENS Bits = 1 << iota
	SENTENCES
	TOKEN_POS
	SENTENCE_POS
	NEWLINE_AFTER_EOT

	// SIMPLE emits tokens and sentence boundaries only.
	SIMPLE = TOKENS | SENTENCES
)
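// A minimal sketch of how the flags are meant to be combined (the
// call site below is an assumption for illustration, not part of
// this package):
//
//	// Emit tokens, sentence boundaries, and both position lists:
//	tw := NewTokenWriter(os.Stdout, SIMPLE|TOKEN_POS|SENTENCE_POS)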
// TokenWriter bundles the callbacks the tokenizer invokes while
// scanning a text: Token for each token, SentenceEnd at sentence
// boundaries, TextEnd at the end of each text, and Flush to drain
// the underlying writer.
type TokenWriter struct {
	SentenceEnd func(int)
	TextEnd     func(int)
	Flush       func() error
	Token       func(int, []rune)
}
// NewTokenWriter creates a new TokenWriter based on the options
// set in flags.
func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
	writer := bufio.NewWriter(w)
	posC := 0                    // current rune position in the text
	pos := make([]int, 0, 1024)  // collected token start/end positions
	sentB := true                // the next token starts a sentence
	sent := make([]int, 0, 1024) // collected sentence start/end positions

	tw := &TokenWriter{}
	// Collect token positions and maybe tokens
	if flags&(TOKEN_POS|SENTENCE_POS) != 0 {

		// TODO:
		//   Split into
		//   - Token_pos+Tokens+Newline
		//   - Token_pos+Newline
		//   - Token_pos|Sentence_pos
		//   - Sentence_pos
		//   - Tokens

		tw.Token = func(offset int, buf []rune) {

			// TODO:
			//   Store in []uint16
			//   and write to string

			// Accept a newline after EOT: skip the leading newline
			// so it does not shift the subsequent positions.
			if flags&NEWLINE_AFTER_EOT != 0 && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
				posC--
			}

			posC += offset
			pos = append(pos, posC)

			// The token is the start of a sentence
			if sentB {
				sentB = false
				sent = append(sent, posC)
			}

			posC += len(buf) - offset
			pos = append(pos, posC)

			// Collect tokens as well
			if flags&TOKENS != 0 {
				writer.WriteString(string(buf[offset:]))
				writer.WriteByte('\n')
			}
		}

		// Collect tokens only
	} else if flags&TOKENS != 0 {
		tw.Token = func(offset int, buf []rune) {
			writer.WriteString(string(buf[offset:]))
			writer.WriteByte('\n')
		}

		// Ignore tokens
	} else {
		tw.Token = func(_ int, _ []rune) {}
	}
	// Collect sentence positions and maybe sentence boundaries
	if flags&SENTENCE_POS != 0 {
		tw.SentenceEnd = func(_ int) {

			// Add the end position of the last token as the
			// sentence boundary.
			// TODO: This only works if token positions are
			// taken into account.
			sent = append(sent, pos[len(pos)-1])
			sentB = true

			// Collect sentence boundaries as well
			if flags&SENTENCES != 0 {
				writer.WriteByte('\n')
			}
		}

		// Collect sentence boundaries only
	} else if flags&SENTENCES != 0 {
		tw.SentenceEnd = func(_ int) {
			writer.WriteByte('\n')
		}

		// Ignore sentence boundaries
	} else {
		tw.SentenceEnd = func(_ int) {}
	}
	// Write token or sentence positions
	if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
		tw.TextEnd = func(_ int) {
			writer.Flush()

			// Write token positions; skip empty texts to avoid
			// an out-of-range access on pos[0]
			if flags&TOKEN_POS != 0 && len(pos) > 0 {
				writer.WriteString(strconv.Itoa(pos[0]))
				for _, x := range pos[1:] {
					writer.WriteByte(' ')
					writer.WriteString(strconv.Itoa(x))
				}
				writer.WriteByte('\n')
			}

			// Write sentence positions, guarded the same way
			if flags&SENTENCE_POS != 0 && len(sent) > 0 {
				writer.WriteString(strconv.Itoa(sent[0]))
				for _, x := range sent[1:] {
					writer.WriteByte(' ')
					writer.WriteString(strconv.Itoa(x))
				}
				writer.WriteByte('\n')
				sent = sent[:0]
				sentB = true
			}

			posC = 0
			pos = pos[:0]
		}

		// Write text ends only
	} else {
		tw.TextEnd = func(_ int) {
			writer.WriteRune('\n')
			writer.Flush()
		}
	}
	// Flush the writer
	tw.Flush = func() error {
		return writer.Flush()
	}

	return tw
}
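// Example usage (a sketch: the tokenizer normally drives these
// callbacks, and the offset/buffer convention shown here is inferred
// from the Token implementation above, where buf[offset:] is the
// token and earlier runes are skipped material):
//
//	tw := NewTokenWriter(os.Stdout, TOKENS|TOKEN_POS)
//	tw.Token(0, []rune("Hello"))  // writes "Hello\n"; records positions 0 and 5
//	tw.Token(1, []rune(" world")) // skips the blank; records positions 6 and 11
//	tw.TextEnd(0)                 // flushes tokens, then buffers "0 5 6 11\n"
//	tw.Flush()                    // drains the position line to os.Stdout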