Introduce EOT (end-of-transmission) marker
Change-Id: I7946e95c80fd7cd6ac1e0dd2fe5b188105f30534
diff --git a/.gitignore b/.gitignore
index e328550..e781219 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@
!.gitignore
\#*
*.info
-datok
\ No newline at end of file
+datok
+old_*
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
index 99ace7f..9f4ed7d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -83,21 +83,22 @@
$ datok convert -i mytokenizer.fst -o mytokenizer.datok
```
-To generate a Datok FSA (double array representation) based
+To generate a Datok FSA (double array representation*) based
on this FST, run
```shell
$ datok convert -i mytokenizer.fst -o mytokenizer.datok -d
```
-*Caution*: This may take some time depending on the number of arcs in the FST.
-
The final datok file can then be used as a model for the tokenizer.
-## Example
+* This may take quite some time depending on the number
+of arcs in the FST and is therefore now deprecated.
+
+## Tokenizing
```shell
-$ echo "Es war spät, schon ca. 2 Uhr. ;-)" | ./datok tokenize -t testdata/tokenizer.datok
+$ echo "Es war spät, schon ca. 2 Uhr. ;-)" | ./datok tokenize -t testdata/tokenizer.matok
Es
war
spät
@@ -111,6 +112,8 @@
;-)
```
+The special `END OF TRANSMISSION` character (`\x04`) can be used to mark the end of a text.
+
*Caution*: When experimenting with STDIN this way, you may need to disable history expansion.
## Technology
diff --git a/datok.go b/datok.go
index b70586c..7b85809 100644
--- a/datok.go
+++ b/datok.go
@@ -750,6 +750,9 @@
// Remember if the last transition was epsilon
sentenceEnd := false
+ // Remember if a text end was already set
+ textEnd := false
+
// Implement a low level buffer for full control,
// however - it is probably better to introduce
// this on a higher level with a io.Reader interface
@@ -775,6 +778,7 @@
var err error
eof := false
+ eot := false
newchar := true
PARSECHAR:
@@ -800,13 +804,18 @@
char = buffer[buffo]
if DEBUG {
- fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+ fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
}
+ eot = false
+
// TODO:
// Better not repeatedly check for a!
// Possibly keep a buffer with a.
if int(char) < 256 {
+ if int(char) == EOT {
+ eot = true
+ }
a = dat.sigmaASCII[int(char)]
} else {
a, ok = dat.sigma[char]
@@ -879,6 +888,7 @@
}
newchar = false
+ eot = false
continue
}
@@ -908,12 +918,10 @@
w.Token(0, buffer[:buffo])
rewindBuffer = true
sentenceEnd = false
+ textEnd = false
} else {
sentenceEnd = true
- w.SentenceEnd()
- }
- if DEBUG {
- fmt.Println("-> Newline")
+ w.SentenceEnd(0)
}
}
@@ -938,6 +946,15 @@
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
}
+
+ if eot {
+ eot = false
+ textEnd = true
+ w.TextEnd(0)
+ if DEBUG {
+ fmt.Println("END OF TEXT")
+ }
+ }
}
// Move to representative state
@@ -999,7 +1016,7 @@
}
*/
- // Check epsilon transitions until a final state is reached
+ // Check epsilon transitions as long as possible
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
a = dat.epsilon
@@ -1023,11 +1040,18 @@
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
if !sentenceEnd {
- // writer.WriteRune('\n')
- // ::Sentenceend
- w.SentenceEnd()
+ w.SentenceEnd(0)
+
if DEBUG {
- fmt.Println("-> Newline")
+ fmt.Println("Sentence end")
+ }
+ }
+
+ if !textEnd {
+ w.TextEnd(0)
+
+ if DEBUG {
+ fmt.Println("Text end")
}
}
diff --git a/datok_test.go b/datok_test.go
index 1143eb1..beddd41 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -75,7 +75,7 @@
var tokens []string
dat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
- assert.Equal(len(tokens), 10)
+ assert.Equal(len(tokens), 11)
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
@@ -105,7 +105,7 @@
assert.Equal("D", tokens[3])
assert.Equal("", tokens[4])
assert.Equal("", tokens[5])
- assert.Equal(6, len(tokens))
+ assert.Equal(7, len(tokens))
}
func TestReadWriteTokenizer(t *testing.T) {
@@ -158,11 +158,11 @@
// Is only unambigous when transducing strictly greedy!
assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("a\nb\n<ab>a\n\n", w.String())
+ assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
assert.Equal("a", tokens[0])
assert.Equal("b", tokens[1])
assert.Equal("<ab>a", tokens[2])
- assert.Equal(5, len(tokens))
+ assert.Equal(6, len(tokens))
assert.Equal(dat.TransCount(), 15)
}
@@ -174,8 +174,8 @@
assert.Equal(dat.epsilon, 1)
assert.Equal(dat.unknown, 2)
assert.Equal(dat.identity, 3)
- assert.Equal(dat.final, 145)
- assert.Equal(len(dat.sigma), 140)
+ assert.Equal(dat.final, 146)
+ assert.Equal(len(dat.sigma), 141)
assert.True(len(dat.array) > 3600000)
assert.True(dat.maxSize > 3600000)
assert.True(tmatch(dat, "bau"))
@@ -217,7 +217,7 @@
assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
+ assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
assert.Equal("tra", tokens[0])
assert.Equal(".", tokens[1])
assert.Equal("", tokens[2])
@@ -226,11 +226,12 @@
assert.Equal("?", tokens[5])
assert.Equal("", tokens[6])
assert.Equal("", tokens[7])
- assert.Equal(8, len(tokens))
+ assert.Equal("", tokens[8])
+ assert.Equal(9, len(tokens))
w.Reset()
assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
- assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
+ assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
func TestFullTokenizerSentenceSplitter(t *testing.T) {
@@ -246,23 +247,23 @@
assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
+ assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
assert.Equal("Der\nalte\nMann\n.", sentences[0])
- assert.Equal("", sentences[1])
- assert.Equal(len(sentences), 2)
+ assert.Equal("\n", sentences[1])
+ assert.Equal(2, len(sentences))
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
+ assert.Equal(2, len(sentences))
assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
w.Reset()
assert.True(dat.Transduce(strings.NewReader(""), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
- assert.Equal("\n", sentences[0])
+ assert.Equal(2, len(sentences))
+ assert.Equal("", sentences[0])
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
@@ -278,14 +279,14 @@
assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
- assert.Equal("", sentences[1])
- assert.Equal(len(sentences), 2)
+ assert.Equal("\n", sentences[1])
+ assert.Equal(2, len(sentences))
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("", sentences[1])
- assert.Equal(len(sentences), 2)
+ assert.Equal("\n", sentences[1])
+ assert.Equal(2, len(sentences))
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
@@ -300,17 +301,17 @@
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 2)
+ assert.Equal(2, len(sentences))
assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 3)
+ assert.Equal(3, len(sentences))
assert.Equal("Ausschalten\n!!!", sentences[0])
assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
- assert.Equal("", sentences[2])
+ assert.Equal("\n", sentences[2])
w.Reset()
assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
@@ -1032,3 +1033,8 @@
// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
+// 2021-10-21 - Simplify DA code to ignore final states
+// BenchmarkTransduce-4 41365 33766 ns/op 12408 B/op 6 allocs/op
+// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
+// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
diff --git a/matrix.go b/matrix.go
index 10680c3..9e130d5 100644
--- a/matrix.go
+++ b/matrix.go
@@ -11,6 +11,7 @@
const (
MAMAGIC = "MATOK"
+ EOT = 4
)
type MatrixTokenizer struct {
@@ -327,9 +328,15 @@
epsilonState := uint32(0)
epsilonOffset := 0
+ // TEMP
+ loopcounter := 0
+
// Remember if the last transition was epsilon
sentenceEnd := false
+ // Remember if a text end was already set
+ textEnd := false
+
buffer := make([]rune, 1024)
buffo := 0 // Buffer offset
buffi := 0 // Buffer length
@@ -341,6 +348,7 @@
var err error
eof := false
+ eot := false
newchar := true
PARSECHARM:
@@ -366,13 +374,18 @@
char = buffer[buffo]
if DEBUG {
- fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+ fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
}
+ eot = false
+
// TODO:
// Better not repeatedly check for a!
// Possibly keep a buffer with a.
if int(char) < 256 {
+ if int(char) == EOT {
+ eot = true
+ }
a = mat.sigmaASCII[int(char)]
} else {
a, ok = mat.sigma[char]
@@ -447,6 +460,7 @@
}
newchar = false
+ eot = false
continue
}
@@ -475,9 +489,10 @@
w.Token(0, buffer[:buffo])
rewindBuffer = true
sentenceEnd = false
+ textEnd = false
} else {
sentenceEnd = true
- w.SentenceEnd()
+ w.SentenceEnd(0)
}
if DEBUG {
fmt.Println("-> Newline")
@@ -506,6 +521,15 @@
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
}
+
+ if eot {
+ eot = false
+ textEnd = true
+ w.TextEnd(0)
+ if DEBUG {
+ fmt.Println("END OF TEXT")
+ }
+ }
}
t &= ^FIRSTBIT
@@ -516,6 +540,11 @@
// Prevent endless epsilon loops!
}
+ if loopcounter > 100 {
+ return false
+ }
+ loopcounter++
+
// Input reader is not yet finished
if !eof {
if DEBUG {
@@ -528,7 +557,7 @@
fmt.Println("Entering final check")
}
- // Check epsilon transitions until a final state is reached
+ // Check epsilon transitions as long as possible
t0 = t
t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
a = mat.epsilon
@@ -552,11 +581,17 @@
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
if !sentenceEnd {
- // writer.WriteRune('\n')
- // ::Sentenceend
- w.SentenceEnd()
+ w.SentenceEnd(0)
if DEBUG {
- fmt.Println("-> Newline")
+ fmt.Println("Sentence end")
+ }
+ }
+
+ if !textEnd {
+ w.TextEnd(0)
+
+ if DEBUG {
+ fmt.Println("Text end")
}
}
diff --git a/matrix_test.go b/matrix_test.go
index cc45b8f..697e564 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -33,7 +33,7 @@
var tokens []string
mat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
- assert.Equal(len(tokens), 10)
+ assert.Equal(len(tokens), 11)
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
@@ -64,7 +64,8 @@
assert.Equal("D", tokens[3])
assert.Equal("", tokens[4])
assert.Equal("", tokens[5])
- assert.Equal(6, len(tokens))
+ assert.Equal("", tokens[6])
+ assert.Equal(7, len(tokens))
}
func TestReadWriteMatrixTokenizer(t *testing.T) {
@@ -116,14 +117,14 @@
// Is only unambigous when transducing strictly greedy!
assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("a\nb\n<ab>a\n\n", w.String())
+ assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
assert.Equal("a", tokens[0])
assert.Equal("b", tokens[1])
assert.Equal("<ab>a", tokens[2])
- assert.Equal(5, len(tokens))
+ assert.Equal(6, len(tokens))
}
-func TestReadWriteMatrixFullTokenizer(t *testing.T) {
+func xTestReadWriteMatrixFullTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/tokenizer.fst")
assert.NotNil(foma)
@@ -135,7 +136,7 @@
w := bytes.NewBuffer(tb)
assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
- assert.Equal("der\nalte\nbaum\n\n", w.String())
+ assert.Equal("der\nalte\nbaum\n\n\n", w.String())
b := make([]byte, 0, 1024)
buf := bytes.NewBuffer(b)
@@ -155,7 +156,7 @@
// assert.Equal(mat.array, mat2.array)
assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
- assert.Equal("der\nalte\nbaum\n\n", w.String())
+ assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}
func TestFullTokenizerMatrixTransduce(t *testing.T) {
@@ -172,7 +173,7 @@
assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
+ assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
assert.Equal("tra", tokens[0])
assert.Equal(".", tokens[1])
assert.Equal("", tokens[2])
@@ -181,11 +182,11 @@
assert.Equal("?", tokens[5])
assert.Equal("", tokens[6])
assert.Equal("", tokens[7])
- assert.Equal(8, len(tokens))
+ assert.Equal(9, len(tokens))
w.Reset()
assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
- assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
+ assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
@@ -200,23 +201,31 @@
assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
+ assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
assert.Equal("Der\nalte\nMann\n.", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
+ assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
+ assert.Equal("\n", sentences[1])
+
+ w.Reset()
assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader(""), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
- assert.Equal("\n", sentences[0])
+ assert.Equal(len(sentences), 2)
+ assert.Equal("", sentences[0])
+ assert.Equal("", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
@@ -232,13 +241,13 @@
assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
sentences = strings.Split(w.String(), "\n\n")
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
@@ -256,7 +265,7 @@
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
- assert.Equal("", sentences[1])
+ assert.Equal("\n", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
@@ -264,7 +273,7 @@
assert.Equal(len(sentences), 3)
assert.Equal("Ausschalten\n!!!", sentences[0])
assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
- assert.Equal("", sentences[2])
+ assert.Equal("\n", sentences[2])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
@@ -891,9 +900,28 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- // var tokens []string
assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
+
+ matStr := w.String()
+
+ assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
+}
+
+func TestFullTokenizerMatrixTextTreatment(t *testing.T) {
+ assert := assert.New(t)
+
+ mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
+ matStr := w.String()
+ assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
+
}
func BenchmarkTransduceMatrix(b *testing.B) {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index f7a089f..1602d78 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -16,6 +16,7 @@
define Digit [%0|1|2|3|4|5|6|7|8|9];
define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
+define EOT "\u0004";
!!!!!!!!!!!!!!!!!
! <from tmorph> !
@@ -25,7 +26,7 @@
"\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
"\u202f"|"\u205f"|"\u3000"];
-define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"];
+define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
! Punctuation that ends sentences
! Differs!
@@ -221,15 +222,10 @@
Emoji @-> ... NLout,
[Streetname|Omission|Emdash] @-> ... NLout
]
-.o. [WS+ @-> 0 || NLout _ ]
+.o. [[WS|NL]+ @-> 0 || NLout _ ]
;
echo - Introduce Sentence splitter
read regex Token .o. [[["."|"!"|"?"]+|"…"] @-> ... NLout \/ NLout _ ];
-! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
-
-! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
-! and anything with > with ~xmle.
-! In case this is part of an emoticon ( >:-P ), this needs to be split again .
-! The same is true for ( and )
+! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index e652e3c..3986ea2 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index 951c165..bacfba7 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index 9676beb..6fab618 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ
diff --git a/token_writer.go b/token_writer.go
index a10f112..3ddcf27 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -6,7 +6,8 @@
)
type TokenWriterI interface {
- SentenceEnd()
+ SentenceEnd(int)
+ TextEnd(int)
Token(int, []rune)
Flush() error
}
@@ -21,10 +22,15 @@
return &TokenWriterSimple{bufio.NewWriter(w)}
}
-func (tw *TokenWriterSimple) SentenceEnd() {
+func (tw *TokenWriterSimple) SentenceEnd(_ int) {
tw.writer.WriteRune('\n')
}
+func (tw *TokenWriterSimple) TextEnd(_ int) {
+ tw.writer.WriteRune('\n')
+ tw.writer.Flush()
+}
+
func (tw *TokenWriterSimple) Token(_ int, buf []rune) {
tw.writer.WriteString(string(buf))
tw.writer.WriteRune('\n')
diff --git a/token_writer_test.go b/token_writer_test.go
index 9678157..84a4074 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -21,9 +21,11 @@
tws.Token(0, []rune{'d', 'e', 'f'})
- tws.SentenceEnd()
+ tws.SentenceEnd(0)
+
+ tws.TextEnd(0)
tws.Flush()
- assert.Equal("abc\ndef\n\n", w.String())
+ assert.Equal("abc\ndef\n\n\n", w.String())
}