Parse command line options as bit flags
Change-Id: I7d7730d9a0c790a4608652590d4e5183132c86a9
diff --git a/cmd/datok.go b/cmd/datok.go
index 00c79eb..08b4160 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -22,7 +22,7 @@
Sentences bool `kong:"optional,negatable,default=true,help='Print sentence boundaries'"`
TokenPositions bool `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
SentencePositions bool `kong:"optional,negatable,default=false,help='Print sentence offsets'"`
- NewlineAfterEOT bool `kong:"optional,negatable,help='Ignore newline after EOT'"`
+ NewlineAfterEOT bool `kong:"optional,negatable,default=false,help='Ignore newline after EOT'"`
} `kong:"cmd, help='Tokenize a text'"`
}
@@ -72,15 +72,30 @@
os.Exit(1)
}
+ // Create flags parameter based on command line parameters
+ var flags datok.Bits
+ if cli.Tokenize.Tokens {
+ flags |= datok.TOKENS
+ }
+
+ if cli.Tokenize.TokenPositions {
+ flags |= datok.TOKEN_POS
+ }
+
+ if cli.Tokenize.Sentences {
+ flags |= datok.SENTENCES
+ }
+
+ if cli.Tokenize.SentencePositions {
+ flags |= datok.SENTENCE_POS
+ }
+
+ if cli.Tokenize.NewlineAfterEOT {
+ flags |= datok.NEWLINE_AFTER_EOT
+ }
+
// Create token writer based on the options defined
- tw := datok.NewTokenWriterFromOptions(
- os.Stdout,
- cli.Tokenize.TokenPositions,
- cli.Tokenize.Tokens,
- cli.Tokenize.Sentences,
- cli.Tokenize.SentencePositions,
- cli.Tokenize.NewlineAfterEOT,
- )
+ tw := datok.NewTokenWriterFromOptions(os.Stdout, flags)
// Program is running in a pipe
fileInfo, _ := os.Stdin.Stat()
diff --git a/token_writer.go b/token_writer.go
index dd528b5..aa9c32d 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -6,6 +6,16 @@
"strconv"
)
+type Bits uint8
+
+const (
+ TOKENS Bits = 1 << iota
+ SENTENCES
+ TOKEN_POS
+ SENTENCE_POS
+ NEWLINE_AFTER_EOT
+)
+
type TokenWriter struct {
SentenceEnd func(int)
TextEnd func(int)
@@ -35,7 +45,7 @@
}
// Create a new token writer based on the options
-func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, sentenceFlag bool, sentencePositionFlag bool, newlineAfterEot bool) *TokenWriter {
+func NewTokenWriterFromOptions(w io.Writer, flags Bits) *TokenWriter {
writer := bufio.NewWriter(w)
posC := 0
pos := make([]int, 0, 1024)
@@ -44,7 +54,7 @@
tw := &TokenWriter{}
- if positionFlag {
+ if flags&TOKEN_POS != 0 {
tw.Token = func(offset int, buf []rune) {
// TODO:
@@ -52,7 +62,7 @@
// and write to string
// Accept newline after EOT
- if newlineAfterEot && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
+ if flags&NEWLINE_AFTER_EOT != 0 && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
posC--
}
@@ -67,7 +77,7 @@
posC += len(buf) - offset
pos = append(pos, posC)
- if tokenFlag {
+ if flags&TOKENS != 0 {
writer.WriteString(string(buf[offset:]))
writer.WriteRune('\n')
}
@@ -82,20 +92,21 @@
}
// Print sentence boundaries
- if sentenceFlag || sentencePositionFlag {
+ if flags&(SENTENCES|SENTENCE_POS) != 0 {
tw.SentenceEnd = func(offset int) {
// Add end position of last token to sentence boundary
+ // TODO: This only works if token positions are taken into account
sent = append(sent, pos[len(pos)-1])
sentB = true
- if sentenceFlag {
+ if flags&SENTENCES != 0 {
writer.WriteRune('\n')
}
}
// Print sentence boundaries as newlines
- } else if sentenceFlag {
+ } else if flags&SENTENCES != 0 {
tw.SentenceEnd = func(_ int) {
writer.WriteRune('\n')
}
@@ -105,11 +116,11 @@
tw.SentenceEnd = func(_ int) {}
}
- if positionFlag || sentencePositionFlag {
+ if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
tw.TextEnd = func(_ int) {
writer.Flush()
- if positionFlag {
+ if flags&TOKEN_POS != 0 {
writer.WriteString(strconv.Itoa(pos[0]))
for _, x := range pos[1:] {
writer.WriteByte(' ')
@@ -118,7 +129,7 @@
writer.WriteRune('\n')
}
- if sentencePositionFlag {
+ if flags&SENTENCE_POS != 0 {
writer.WriteString(strconv.Itoa(sent[0]))
for _, x := range sent[1:] {
writer.WriteByte(' ')
diff --git a/token_writer_test.go b/token_writer_test.go
index 601da0e..e724212 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -34,14 +34,13 @@
func TestTokenWriterFromOptions(t *testing.T) {
assert := assert.New(t)
+ mat := LoadMatrixFile("testdata/tokenizer.matok")
+ assert.NotNil(mat)
+
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- tws := NewTokenWriterFromOptions(w, true, true, true, false, false)
-
- mat := LoadMatrixFile("testdata/tokenizer.matok")
-
- assert.NotNil(mat)
+ tws := NewTokenWriterFromOptions(w, TOKENS|SENTENCES|TOKEN_POS)
assert.True(mat.TransduceTokenWriter(
strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
@@ -58,7 +57,7 @@
//
// Accept newline after EOT
- tws = NewTokenWriterFromOptions(w, true, true, true, false, true)
+ tws = NewTokenWriterFromOptions(w, TOKENS|SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
@@ -68,7 +67,7 @@
//
// Write no tokens
- tws = NewTokenWriterFromOptions(w, true, false, true, false, true)
+ tws = NewTokenWriterFromOptions(w, SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
@@ -77,8 +76,8 @@
assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr)
//
- // Write sentences
- tws = NewTokenWriterFromOptions(w, true, false, false, true, true)
+ // Write sentence offsets
+ tws = NewTokenWriterFromOptions(w, TOKEN_POS|SENTENCE_POS|NEWLINE_AFTER_EOT)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)