Fix sigma IDs to be greater than 1
diff --git a/datokenizer.go b/datokenizer.go
index 1d5291b..b4a3f36 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -284,6 +284,10 @@
nontoken := false
tokenend := false
+ // ID needs to be > 1
+ inSym++
+ outSym++
+
if inSym != outSym {
if tok.sigmaRev[outSym] == NEWLINE {
@@ -356,7 +360,10 @@
string(tok.sigmaRev[inSym]),
":",
string(tok.sigmaRev[outSym]),
- ")")
+ ")",
+ ";",
+ "TE:", tokenend,
+ "NT:", nontoken)
}
continue
@@ -368,6 +375,9 @@
// Turn string into sigma id
number, err := strconv.Atoi(elem[0])
+ // ID needs to be > 1
+ number++
+
if err != nil {
log.Error().Err(err)
os.Exit(0)
@@ -497,14 +507,25 @@
t1 := dat.getBase(t) + uint32(a)
dat.setCheck(t1, t)
+ if DEBUG {
+ fmt.Println("Translate transition",
+ s, "->", s1, "(", a, ")", "to", t, "->", t1)
+ }
+
// Mark the state as being the target of a nontoken transition
if tok.transitions[s][a].nontoken {
dat.setNonToken(t1, true)
+ if DEBUG {
+ fmt.Println("Set", t1, "to nontoken")
+ }
}
// Mark the state as being the target of a tokenend transition
if tok.transitions[s][a].tokenend {
dat.setTokenEnd(t1, true)
+ if DEBUG {
+ fmt.Println("Set", t1, "to tokenend")
+ }
}
// Check for representative states
@@ -1000,7 +1021,6 @@
defer writer.Flush()
t := uint32(1) // Initial state
- // chars := []rune(input)
skip := false
var char rune
@@ -1036,6 +1056,10 @@
t = dat.getBase(tu) + uint32(a)
+ if DEBUG {
+ fmt.Println("Check", tu, "-", a, "(", string(char), ")", "->", t)
+ }
+
// Check if the transition is valid according to the double array
if t > dat.getCheck(1) || dat.getCheck(t) != tu {
@@ -1067,6 +1091,10 @@
t = dat.getBase(t)
+ if DEBUG {
+ fmt.Println("Representative pointing to", t)
+ }
+
} else {
nontoken = dat.isNonToken(t)
tokenend = dat.isTokenEnd(t)
@@ -1080,6 +1108,10 @@
writer.WriteRune(char)
}
+ if DEBUG {
+ fmt.Println(" --> ok!")
+ }
+
/*
if nontoken {
writer.WriteRune(("<|>")
diff --git a/datokenizer_test.go b/datokenizer_test.go
index e8a32d9..efd1e7c 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -50,9 +50,10 @@
r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- dat.Transduce(r, w)
+ var tokens []string
- tokens := strings.Split(string(w.Bytes()), "\n")
+ dat.Transduce(r, w)
+ tokens = strings.Split(w.String(), "\n")
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
@@ -61,19 +62,28 @@
assert.Equal("was", tokens[5])
assert.Equal("\"erleben\"", tokens[6])
- /*
- r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
- w.Reset()
- dat.Transduce(r, w)
+ r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
+ w.Reset()
+ dat.Transduce(r, w)
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("In", tokens[0])
+ assert.Equal("den", tokens[1])
+ assert.Equal("Wald", tokens[2])
+ assert.Equal("gehen", tokens[3])
+ assert.Equal("?", tokens[4])
+ assert.Equal("--", tokens[5])
- tokens = strings.Split(string(w.Bytes()), "\n")
- assert.Equal("In", tokens[0])
- assert.Equal("den", tokens[1])
- assert.Equal("Wald", tokens[2])
- assert.Equal("gehen", tokens[3])
- assert.Equal("?", tokens[4])
- assert.Equal("--", tokens[5])
- */
+ r = strings.NewReader(" g? -- D")
+ w.Reset()
+ dat.Transduce(r, w)
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("g", tokens[0])
+ assert.Equal("?", tokens[1])
+ assert.Equal("--", tokens[2])
+ assert.Equal("D", tokens[3])
+ assert.Equal("", tokens[4])
+ assert.Equal("", tokens[5])
+ assert.Equal(6, len(tokens))
}
func TestReadWriteTokenizer(t *testing.T) {
@@ -90,7 +100,7 @@
buf := bytes.NewBuffer(b)
n, err := dat.WriteTo(buf)
assert.Nil(err)
- assert.Equal(int64(208), n)
+ assert.Equal(int64(218), n)
dat2 := ParseDatok(buf)
assert.NotNil(dat2)