Add a flag to optionally ignore a newline after EOT
Change-Id: Ia18cc0cbb1dda6311c6b2b8db1fae52c4b6335e0
diff --git a/cmd/datok.go b/cmd/datok.go
index 9314a93..e31745a 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -17,9 +17,10 @@
DoubleArray bool `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
Tokenize struct {
- Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
- Positions bool `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
- Tokens bool `kong:"optional,negatable,default=true,help="Print token surfaces""`
+ Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
+ Positions bool `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
+ Tokens bool `kong:"optional,negatable,default=true,help='Print token surfaces'"`
+ NewlineAfterEOT bool `kong:"optional,negatable,help='Ignore newline after EOT'"`
} `kong:"cmd, help='Tokenize a text'"`
}
@@ -70,7 +71,11 @@
}
// Create token writer based on the options defined
- tw := datok.NewTokenWriterFromOptions(os.Stdout, cli.Tokenize.Positions)
+ tw := datok.NewTokenWriterFromOptions(
+ os.Stdout,
+ cli.Tokenize.Positions,
+ cli.Tokenize.NewlineAfterEOT,
+ )
// Program is running in a pipe
fileInfo, _ := os.Stdin.Stat()
diff --git a/token_writer.go b/token_writer.go
index 9f4088a..32c5a99 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -35,7 +35,7 @@
}
// Create a new token writer based on the options
-func NewTokenWriterFromOptions(w io.Writer, positionFlag bool) *TokenWriter {
+func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, newlineAfterEot bool) *TokenWriter {
writer := bufio.NewWriter(w)
posC := 0
pos := make([]int, 0, 200)
@@ -49,11 +49,15 @@
// Store in []uint16
// and write to string
+ // Accept newline after EOT
+ if newlineAfterEot && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
+ posC--
+ }
+
posC += offset
pos = append(pos, posC)
posC += len(buf) - offset
pos = append(pos, posC)
- // pos = append(pos, offset, len(buf)-offset)
writer.WriteString(string(buf[offset:]))
writer.WriteRune('\n')
@@ -70,7 +74,7 @@
}
if positionFlag {
- tw.TextEnd = func(offset int) {
+ tw.TextEnd = func(_ int) {
writer.Flush()
writer.WriteString(strconv.Itoa(pos[0]))
diff --git a/token_writer_test.go b/token_writer_test.go
index 291c3b9..f7bd1f7 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -37,7 +37,7 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- tws := NewTokenWriterFromOptions(w, true)
+ tws := NewTokenWriterFromOptions(w, true, false)
mat := LoadMatrixFile("testdata/tokenizer.matok")
@@ -56,4 +56,12 @@
matStr = w.String()
assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n1 4 4 5\n", matStr)
+ // Accept newline after EOT
+ tws = NewTokenWriterFromOptions(w, true, true)
+
+ w.Reset()
+ mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
+
+ matStr = w.String()
+ assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n0 3 3 4\n", matStr)
}