Add support for abbreviated street names ending in "str." (e.g. "Weststr.")
diff --git a/.gitignore b/.gitignore
index c9d1bec..e4b40a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
sandbox
tokenizer.code-workspace
+~*
+.*
+!.gitignore
+\#*
*.info
\ No newline at end of file
diff --git a/datokenizer_test.go b/datokenizer_test.go
index a522159..2c6a525 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -131,8 +131,8 @@
assert.Equal(dat.identity, 3)
assert.Equal(dat.final, 136)
assert.Equal(len(dat.sigma), 131)
- assert.Equal(len(dat.array), 3806280)
- assert.Equal(dat.maxSize, 3806279)
+ assert.True(len(dat.array) > 3800000)
+ assert.True(dat.maxSize > 3800000)
assert.True(dat.Match("bau"))
assert.True(dat.Match("bad"))
@@ -147,7 +147,7 @@
if false {
tok := LoadFomaFile("testdata/tokenizer.fst")
dat = tok.ToDoubleArray()
- // dat.Save("testdata/tokenizer.datok")
+ dat.Save("testdata/tokenizer.datok")
} else {
dat = LoadDatokFile("testdata/tokenizer.datok")
}
@@ -255,12 +255,10 @@
assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
assert.Equal("", sentences[2])
- /*
- w.Reset()
- assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
- sentences = strings.Split(w.String(), "\n\n")
- assert.Equal(len(sentences), 1)
- */
+ w.Reset()
+ assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 2)
/*
Test:
@@ -675,11 +673,9 @@
assert.Equal(6, len(tokens))
// testTokenizerStrasse
- /*
- tokens = tokenize(dat, w, "Ich wohne in der Weststr. und Du?")
- assert.Equal(tokens[4], "Weststr.")
- assert.Equal(8, len(tokens))
- */
+ tokens = tokenize(dat, w, "Ich wohne in der Weststr. und Du?")
+ assert.Equal(tokens[4], "Weststr.")
+ assert.Equal(8, len(tokens))
// germanTokenizerKnowsGermanOmissionWords
tokens = tokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 20b07f9..37fc799 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -11,7 +11,8 @@
! The abbreviation list is part of the sentence splitter tool
! of the IDS.
-define NLout "@_TOKEN_SYMBOL_@"; !"\u000a";
+! define NLout "\u000a";
+define NLout "@_TOKEN_SYMBOL_@";
define NLin ("\u000d") "\u000a";
define Digit [%0|1|2|3|4|5|6|7|8|9];
@@ -155,6 +156,8 @@
] .o. Caseinsensitive;
define File [Char|"-"]+ "." FileEnd;
+define Streetname Word {str};
+
! Also supports
! 19.4.2015, 19/4/2015 etc.
define DigitPunct ["_"|"-"|"."|","|Slash];
@@ -184,7 +187,7 @@
-define RealToken [XML|Email|URL|SNS|[Abbr %.]|Omission|Domain|AcronymDep|File|Emdash|Punct|Num|Years|Emoji|Word];
+define RealToken [XML|Email|URL|SNS|[Abbr %.]|[Streetname %.]|Omission|Domain|AcronymDep|File|Emdash|Punct|Num|Years|Emoji|Word];
echo - Introduce Token splitter
define Token [RealToken @-> ... NLout]