Introduce the english model as being on the same level as the german one
Change-Id: Ib63095af7a93c158fde7bca8271ea6129347de90
diff --git a/Makefile b/Makefile
index 05981c6..b6fb810 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@
buildfoma_de:
cd src && \
foma -e "source de/tokenizer.xfst" \
- -e "save stack ../testdata/tokenizer.fst" -q -s && \
+ -e "save stack ../testdata/tokenizer_de.fst" -q -s && \
cd ..
buildfoma_en:
@@ -17,13 +17,13 @@
cd ..
buildmatok_de: buildfoma_de build
- ./bin/datok convert -i ./testdata/tokenizer.fst -o ./testdata/tokenizer.matok
+ ./bin/datok convert -i ./testdata/tokenizer_de.fst -o ./testdata/tokenizer_de.matok
buildmatok_en: buildfoma_en build
./bin/datok convert -i ./testdata/tokenizer_en.fst -o ./testdata/tokenizer_en.matok
-builddatok: buildfoma_de build
- ./bin/datok convert -i ./testdata/tokenizer.fst -o ./testdata/tokenizer.datok -d
+builddatok_de: buildfoma_de build
+ ./bin/datok convert -i ./testdata/tokenizer_de.fst -o ./testdata/tokenizer_de.datok -d
builddatok_en: buildfoma_en build
./bin/datok convert -i ./testdata/tokenizer_en.fst -o ./testdata/tokenizer_en.datok -d
diff --git a/Readme.md b/Readme.md
index 3f9683b..c7b53cd 100644
--- a/Readme.md
+++ b/Readme.md
@@ -5,9 +5,14 @@
![Introduction to Datok](https://raw.githubusercontent.com/KorAP/Datok/master/misc/introducing-datok.gif)
Implementation of a finite state automaton for
-fast natural language tokenization, based on a finite state
+high-performance natural language tokenization, based on a finite state
transducer generated with [Foma](https://fomafst.github.io/).
+The library contains precompiled tokenizer models for
+
+- [german](testdata/tokenizer_de.matok)
+- [english](testdata/tokenizer_en.matok)
+
## Tokenization
```
@@ -30,8 +35,7 @@
The special `END OF TRANSMISSION` character (`\x04`) can be used to mark the end of a text.
> *Caution*: When experimenting with STDIN and echo,
-> you may need to disable history expansion.
-
+> you may need to disable [history expansion](https://www.gnu.org/software/bash/manual/html_node/History-Interaction.html).
## Conversion
@@ -49,11 +53,13 @@
## Conventions
-The FST generated by Foma must adhere to the following rules,
-to be converted by Datok:
+The FST generated by [Foma](https://fomafst.github.io/) must adhere to
+the following rules, to be convertible by Datok:
- Character accepting arcs need to be translated
*only* to themselves or to ε (the empty symbol).
+ I.e. they will either be an unchanged part of the
+ output or be ignored (e.g. whitespace characters).
- Multi-character symbols are not allowed,
except for the `@_TOKEN_BOUND_@`,
that denotes the end of a token.
@@ -90,8 +96,8 @@
read regex Tokenizer;
```
-> *Hint*: For development it's easier to replace `@_TOKEN_BOUND_@`
-with a newline.
+> *Hint*: For development in Foma it's easier to replace
+> `@_TOKEN_BOUND_@` with a newline symbol.
## Building
@@ -133,7 +139,7 @@
The final datok file can then be used as a model for the tokenizer.
* This may take quite some time depending on the number
-of arcs in the FST and is therefore now deprecated.
+of arcs in the FST and is therefore not recommended in most cases.
## Technology
@@ -180,7 +186,7 @@
Datok is published under the [Apache 2.0 License](LICENSE).
-The german tokenizer shipped is based on work done by the
+The german and english tokenizers shipped are based on work done by the
[Lucene project](https://github.com/apache/lucene-solr)
(published under the Apache License),
[David Hall](https://github.com/dlwh/epic)
diff --git a/datok_test.go b/datok_test.go
index 8562a98..8130690 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -210,7 +210,7 @@
assert := assert.New(t)
if dat == nil {
- dat = LoadDatokFile("testdata/tokenizer.datok")
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
}
assert.NotNil(dat)
assert.True(dat.LoadFactor() >= 60)
@@ -239,10 +239,10 @@
func XTestDoubleArrayFullTokenizerBuild(t *testing.T) {
assert := assert.New(t)
- tok := LoadFomaFile("testdata/tokenizer.fst")
+ tok := LoadFomaFile("testdata/tokenizer_de.fst")
dat := tok.ToDoubleArray()
assert.NotNil(dat)
- // n, err := dat.Save("testdata/tokenizer.datok")
+ // n, err := dat.Save("testdata/tokenizer_de.datok")
// assert.Nil(err)
// assert.True(n > 500)
}
@@ -251,7 +251,7 @@
assert := assert.New(t)
if dat == nil {
- dat = LoadDatokFile("testdata/tokenizer.datok")
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
}
assert.NotNil(dat)
@@ -284,7 +284,7 @@
assert := assert.New(t)
if dat == nil {
- dat = LoadDatokFile("testdata/tokenizer.datok")
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
}
assert.NotNil(dat)
@@ -378,7 +378,7 @@
assert := assert.New(t)
if dat == nil {
- dat = LoadDatokFile("testdata/tokenizer.datok")
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
}
assert.NotNil(dat)
@@ -920,7 +920,7 @@
assert := assert.New(t)
if dat == nil {
- dat = LoadDatokFile("testdata/tokenizer.datok")
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
}
b := make([]byte, 0, 2048)
@@ -951,7 +951,7 @@
assert := assert.New(t)
if dat == nil {
- dat = LoadDatokFile("testdata/tokenizer.datok")
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
}
assert.NotNil(dat)
@@ -1005,7 +1005,7 @@
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
r := strings.NewReader(s)
- dat := LoadDatokFile("testdata/tokenizer.datok")
+ dat := LoadDatokFile("testdata/tokenizer_de.datok")
b.ResetTimer()
@@ -1024,7 +1024,7 @@
// This test is deprecated as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
for i := 0; i < b.N; i++ {
- dat := LoadDatokFile("testdata/tokenizer.datok")
+ dat := LoadDatokFile("testdata/tokenizer_de.datok")
if dat == nil {
fmt.Println("Fail!")
os.Exit(1)
diff --git a/matrix_test.go b/matrix_test.go
index 9d2c674..e58005f 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -207,7 +207,7 @@
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
assert := assert.New(t)
- foma := LoadFomaFile("testdata/tokenizer.fst")
+ foma := LoadFomaFile("testdata/tokenizer_de.fst")
assert.NotNil(foma)
mat := foma.ToMatrix()
@@ -244,7 +244,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -276,7 +276,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
b := make([]byte, 0, 2048)
@@ -443,7 +443,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
b := make([]byte, 0, 2048)
@@ -467,7 +467,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
b := make([]byte, 0, 2048)
@@ -1149,7 +1149,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -1183,7 +1183,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -1249,9 +1249,9 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
- dat := LoadDatokFile("testdata/tokenizer.datok")
+ dat := LoadDatokFile("testdata/tokenizer_de.datok")
r := strings.NewReader(s)
@@ -1278,7 +1278,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -1297,7 +1297,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -1314,7 +1314,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -1362,7 +1362,7 @@
assert := assert.New(t)
if mat_de == nil {
- mat_de = LoadMatrixFile("testdata/tokenizer.matok")
+ mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
}
assert.NotNil(mat_de)
@@ -1381,7 +1381,7 @@
r := strings.NewReader(s)
- mat := LoadMatrixFile("testdata/tokenizer.matok")
+ mat := LoadMatrixFile("testdata/tokenizer_de.matok")
b.ResetTimer()
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer_de.datok
similarity index 100%
rename from testdata/tokenizer.datok
rename to testdata/tokenizer_de.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer_de.fst
similarity index 100%
rename from testdata/tokenizer.fst
rename to testdata/tokenizer_de.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer_de.matok
similarity index 100%
rename from testdata/tokenizer.matok
rename to testdata/tokenizer_de.matok
Binary files differ
diff --git a/token_writer_test.go b/token_writer_test.go
index a27ae1c..868e69d 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -34,7 +34,7 @@
func TestTokenWriterFromOptions(t *testing.T) {
assert := assert.New(t)
- mat := LoadMatrixFile("testdata/tokenizer.matok")
+ mat := LoadMatrixFile("testdata/tokenizer_de.matok")
assert.NotNil(mat)
b := make([]byte, 0, 2048)