Add sentence flags (for printing and offsets)
Change-Id: Ia4aaf75cae509ba1a43d04e369e7d5b21867bc15
diff --git a/cmd/datok.go b/cmd/datok.go
index 95919e3..00c79eb 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -17,10 +17,12 @@
DoubleArray bool `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
Tokenize struct {
- Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
- Positions bool `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
- Tokens bool `kong:"optional,negatable,default=true,help='Print token surfaces'"`
- NewlineAfterEOT bool `kong:"optional,negatable,help='Ignore newline after EOT'"`
+ Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
+ Tokens bool `kong:"optional,negatable,default=true,help='Print token surfaces'"`
+ Sentences bool `kong:"optional,negatable,default=true,help='Print sentence boundaries'"`
+ TokenPositions bool `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
+ SentencePositions bool `kong:"optional,negatable,default=false,help='Print sentence offsets'"`
+ NewlineAfterEOT bool `kong:"optional,negatable,help='Ignore newline after EOT'"`
} `kong:"cmd, help='Tokenize a text'"`
}
@@ -73,8 +75,10 @@
// Create token writer based on the options defined
tw := datok.NewTokenWriterFromOptions(
os.Stdout,
- cli.Tokenize.Positions,
+ cli.Tokenize.TokenPositions,
cli.Tokenize.Tokens,
+ cli.Tokenize.Sentences,
+ cli.Tokenize.SentencePositions,
cli.Tokenize.NewlineAfterEOT,
)
diff --git a/token_writer.go b/token_writer.go
index 5e4b72d..dd528b5 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -35,10 +35,12 @@
}
// Create a new token writer based on the options
-func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, newlineAfterEot bool) *TokenWriter {
+func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, sentenceFlag bool, sentencePositionFlag bool, newlineAfterEot bool) *TokenWriter {
writer := bufio.NewWriter(w)
posC := 0
- pos := make([]int, 0, 200)
+ pos := make([]int, 0, 1024)
+ sentB := true
+ sent := make([]int, 0, 1024)
tw := &TokenWriter{}
@@ -56,6 +58,12 @@
posC += offset
pos = append(pos, posC)
+
+ // Token is the start of a sentence
+ if sentB {
+ sentB = false
+ sent = append(sent, posC)
+ }
posC += len(buf) - offset
pos = append(pos, posC)
@@ -64,6 +72,8 @@
writer.WriteRune('\n')
}
}
+
+ // Only print one token per line
} else {
tw.Token = func(offset int, buf []rune) {
writer.WriteString(string(buf[offset:]))
@@ -71,20 +81,53 @@
}
}
- tw.SentenceEnd = func(_ int) {
- writer.WriteRune('\n')
+ // Record sentence offsets (and optionally print boundaries)
+ if sentencePositionFlag {
+ tw.SentenceEnd = func(_ int) {
+
+ // Add end position of last token to sentence boundary
+ sent = append(sent, pos[len(pos)-1])
+ sentB = true
+
+ if sentenceFlag {
+ writer.WriteRune('\n')
+ }
+ }
+
+ // Print sentence boundaries as newlines
+ } else if sentenceFlag {
+ tw.SentenceEnd = func(_ int) {
+ writer.WriteRune('\n')
+ }
+
+ // Ignore sentence boundaries
+ } else {
+ tw.SentenceEnd = func(_ int) {}
}
- if positionFlag {
+ if positionFlag || sentencePositionFlag {
tw.TextEnd = func(_ int) {
writer.Flush()
- writer.WriteString(strconv.Itoa(pos[0]))
- for _, x := range pos[1:] {
- writer.WriteByte(' ')
- writer.WriteString(strconv.Itoa(x))
+ if positionFlag {
+ writer.WriteString(strconv.Itoa(pos[0]))
+ for _, x := range pos[1:] {
+ writer.WriteByte(' ')
+ writer.WriteString(strconv.Itoa(x))
+ }
+ writer.WriteRune('\n')
}
- writer.WriteRune('\n')
+
+ if sentencePositionFlag {
+ writer.WriteString(strconv.Itoa(sent[0]))
+ for _, x := range sent[1:] {
+ writer.WriteByte(' ')
+ writer.WriteString(strconv.Itoa(x))
+ }
+ writer.WriteRune('\n')
+ sent = sent[:0]
+ sentB = true
+ }
posC = 0
pos = pos[:0]
diff --git a/token_writer_test.go b/token_writer_test.go
index a75b3d9..601da0e 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -37,7 +37,7 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- tws := NewTokenWriterFromOptions(w, true, true, false)
+ tws := NewTokenWriterFromOptions(w, true, true, true, false, false)
mat := LoadMatrixFile("testdata/tokenizer.matok")
@@ -58,7 +58,7 @@
//
// Accept newline after EOT
- tws = NewTokenWriterFromOptions(w, true, true, true)
+ tws = NewTokenWriterFromOptions(w, true, true, true, false, true)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
@@ -68,11 +68,21 @@
//
// Write no tokens
- tws = NewTokenWriterFromOptions(w, true, false, true)
+ tws = NewTokenWriterFromOptions(w, true, false, true, false, true)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
matStr = w.String()
assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr)
+
+ //
+ // Write sentences
+ tws = NewTokenWriterFromOptions(w, true, false, false, true, true)
+
+ w.Reset()
+ mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
+
+ matStr = w.String()
+ assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)
}