Support both matrix and double array representations in the command
diff --git a/Readme.md b/Readme.md
index 6956d47..ce710d9 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,7 +1,8 @@
-# Datok - Double Array Tokenizer
+# Datok - Matrix or Double Array FSA based Tokenizer
-This is an implementation of a double array based
-finite state automaton (FSA) for natural language tokenization.
+This is an implementation of an FSA for natural language
+tokenization, either in the form of a matrix representation
+or as a double array.
The system accepts a finite state transducer (FST)
describing a tokenizer generated by
[Foma](https://fomafst.github.io/)
@@ -52,7 +53,7 @@
# Building
-To build the Double Array Tokenizer tool, run
+To build the tokenizer tool, run
```shell
$ go build ./cmd/datok.go
@@ -73,11 +74,17 @@
the generated FST as `mytokenizer.fst`
in the root directory.
+To generate a matrix representation of this FST, run
+
+```shell
+$ datok convert -i mytokenizer.fst -o mytokenizer.datok
+```
+
To generate a double array representation
of this FST, run
```shell
-$ datok convert -i mytokenizer.fst -o mytokenizer.datok
+$ datok convert -i mytokenizer.fst -o mytokenizer.datok -d
```
*Caution*: This may take some time depending on the number of arcs in the FST.
@@ -105,7 +112,8 @@
# Technology
-Datok is based on a double array representation (Aoe 1989) of all transitions in the FST,
+The double array representation (Aoe 1989) of all transitions
+in the FST is
implemented as an extended FSA following Mizobuchi et al. (2000)
and implementation details following Kanda et al. (2018).
diff --git a/cmd/datok.go b/cmd/datok.go
index 22f4875..adff996 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -12,11 +12,12 @@
var cli struct {
Convert struct {
- Foma string `kong:"required,short='i',help='The Foma file'"`
- Tokenizer string `kong:"required,short='o',help='The Double Array Tokenizer file'"`
- } `kong:"cmd, help='Convert a foma file to a double array tokenizer'"`
+ Foma string `kong:"required,short='i',help='The Foma file'"`
+ Tokenizer string `kong:"required,short='o',help='The Tokenizer file'"`
+ DoubleArray bool `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
+ } `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
Tokenize struct {
- Tokenizer string `kong:"required,short='t',help='The Double Array Tokenizer file'"`
+ Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
} `kong:"cmd, help='Tokenize a text'"`
}
@@ -27,7 +28,7 @@
parser := kong.Must(
&cli,
kong.Name("datok"),
- kong.Description("Double Array based tokenizer"),
+ kong.Description("FSA based tokenizer"),
kong.UsageOnError(),
)
@@ -40,27 +41,35 @@
if tok == nil {
log.Fatalln("Unable to load foma file")
}
- dat := tok.ToDoubleArray()
- _, err := dat.Save(cli.Convert.Tokenizer)
- if err != nil {
- log.Fatalln(err)
+ if cli.Convert.DoubleArray {
+ dat := tok.ToDoubleArray()
+ _, err := dat.Save(cli.Convert.Tokenizer)
+ if err != nil {
+ log.Fatalln(err)
+ }
+ } else {
+ mat := tok.ToMatrix()
+ _, err := mat.Save(cli.Convert.Tokenizer)
+ if err != nil {
+ log.Fatalln(err)
+ }
}
fmt.Println("File successfully converted.")
os.Exit(0)
}
- // Load the Datok file
- dat := datok.LoadDatokFile(cli.Tokenize.Tokenizer)
+ // Load the Datok or Matrix file
+ dat := datok.LoadTokenizerFile(cli.Tokenize.Tokenizer)
// Unable to load the datok file
if dat == nil {
+ log.Fatalln("Unable to load file")
os.Exit(1)
}
// Program is running in a pipe
fileInfo, _ := os.Stdin.Stat()
if fileInfo.Mode()&os.ModeCharDevice == 0 {
-
// Transduce from STDIN and write to STDOUT
dat.Transduce(os.Stdin, os.Stdout)
}
diff --git a/datok.go b/datok.go
index 7812981..fa6e44d 100644
--- a/datok.go
+++ b/datok.go
@@ -239,6 +239,11 @@
return 0
}
+// Type of tokenizer
+func (DaTokenizer) Type() string {
+ return DAMAGIC
+}
+
// Resize double array when necessary
func (dat *DaTokenizer) resize(l int) {
// TODO:
diff --git a/datok_test.go b/datok_test.go
index 8c2b894..1ebb167 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -179,6 +179,17 @@
assert.True(tmatch(dat, "wald gehen"))
}
+func TestTokenizerBranch(t *testing.T) {
+ assert := assert.New(t)
+ tok := LoadTokenizerFile("testdata/simpletok.datok")
+ assert.NotNil(tok)
+ assert.Equal(tok.Type(), "DATOK")
+
+ tok = LoadTokenizerFile("testdata/simpletok.matok")
+ assert.NotNil(tok)
+ assert.Equal(tok.Type(), "MATOK")
+}
+
func XTestFullTokenizerBuild(t *testing.T) {
assert := assert.New(t)
tok := LoadFomaFile("testdata/tokenizer.fst")
diff --git a/fomafile.go b/fomafile.go
index 1d46500..2b10736 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -29,6 +29,7 @@
type Tokenizer interface {
Transduce(r io.Reader, w io.Writer) bool
+ Type() string
}
// Automaton is the intermediate representation
@@ -442,6 +443,40 @@
return auto
}
+func LoadTokenizerFile(file string) Tokenizer {
+ f, err := os.Open(file)
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+ defer f.Close()
+
+ gz, err := gzip.NewReader(f)
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+ defer gz.Close()
+
+ r := bufio.NewReader(gz)
+
+ mstr, err := r.Peek(len(DAMAGIC))
+
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+
+ if string(mstr) == MAMAGIC {
+ return ParseMatrix(r)
+ } else if string(mstr) == DAMAGIC {
+ return ParseDatok(r)
+ }
+
+ log.Println("Neither a matrix nor a datok file")
+ return nil
+}
+
// Set alphabet A to the list of all symbols
// outgoing from s
func (auto *Automaton) getSet(s int, A *[]int) {
diff --git a/matrix.go b/matrix.go
index 4ce82c9..4b439d0 100644
--- a/matrix.go
+++ b/matrix.go
@@ -82,6 +82,11 @@
return mat
}
+// Type of tokenizer
+func (MatrixTokenizer) Type() string {
+ return MAMAGIC
+}
+
// Save stores the matrix data in a file
func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
f, err := os.Create(file)
diff --git a/testdata/simpletok.datok b/testdata/simpletok.datok
new file mode 100644
index 0000000..26517c4
--- /dev/null
+++ b/testdata/simpletok.datok
Binary files differ
diff --git a/testdata/simpletok.matok b/testdata/simpletok.matok
new file mode 100644
index 0000000..51ee615
--- /dev/null
+++ b/testdata/simpletok.matok
Binary files differ