Ignore MCS in sigma if not used in the transducer
diff --git a/.gitignore b/.gitignore
index a3d640f..edd4559 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@
.*
!.gitignore
\#*
-*.info
\ No newline at end of file
+*.info
+datok
\ No newline at end of file
diff --git a/datokenizer.go b/datokenizer.go
index 9f8079b..805b8df 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -74,6 +74,7 @@
// of the tokenizer.
type Tokenizer struct {
sigmaRev map[int]rune
+ sigmaMCS map[int]string
arcCount int
sigmaCount int
transitions []map[int]*edge
@@ -137,6 +138,7 @@
tok := &Tokenizer{
sigmaRev: make(map[int]rune),
+ sigmaMCS: make(map[int]string),
epsilon: -1,
unknown: -1,
identity: -1,
@@ -352,10 +354,13 @@
")")
return nil
}
-
} else if inSym == tok.epsilon {
log.Println("General epsilon transitions are not supported")
return nil
+ } else if tok.sigmaMCS[inSym] != "" {
+ log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
+ } else if tok.sigmaMCS[outSym] != "" {
+ log.Fatalln("Non supported character", tok.sigmaMCS[outSym])
}
// Create an edge based on the collected information
@@ -451,8 +456,8 @@
}
default:
{
- log.Println("MCS not supported: " + line)
- return nil
+ // MCS not supported
+ tok.sigmaMCS[number] = line
}
}
continue
@@ -464,8 +469,9 @@
return nil
}
if len(line) != 1 {
- log.Println("MCS not supported:" + line)
- return nil
+ // MCS not supported
+ tok.sigmaMCS[number] = line
+ continue
}
symbol = rune('\n')
}
@@ -474,7 +480,7 @@
}
}
}
-
+ tok.sigmaMCS = nil
return tok
}
diff --git a/datokenizer_test.go b/datokenizer_test.go
index ef6ccb4..ffee353 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -136,6 +136,28 @@
assert.True(tmatch(dat2, "wald gehen"))
}
+func TestIgnorableMCS(t *testing.T) {
+ assert := assert.New(t)
+ // File has MCS in sigma but not in net
+ tok := LoadFomaFile("testdata/ignorable_mcs.fst")
+ assert.NotNil(tok)
+ dat := tok.ToDoubleArray()
+ assert.NotNil(dat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ // Is only unambiguous when transducing strictly greedy!
+ assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("a\nb\n<ab>\n", w.String())
+ assert.Equal("a", tokens[0])
+ assert.Equal("b", tokens[1])
+ assert.Equal("<ab>", tokens[2])
+ assert.Equal(4, len(tokens))
+}
+
func TestFullTokenizer(t *testing.T) {
assert := assert.New(t)
dat := LoadDatokFile("testdata/tokenizer.datok")
diff --git a/testdata/ignorable_mcs.fst b/testdata/ignorable_mcs.fst
new file mode 100644
index 0000000..c114635
--- /dev/null
+++ b/testdata/ignorable_mcs.fst
Binary files differ