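Add ordinals

Introduce an Ord token in the tokenizer grammar (one to three digits
followed by a period, e.g. "1.") and add it to RealToken ahead of Num.
Fold the trailing period into the Streetname definition itself.
Enable the previously commented-out ordinal tokenizer test, and move the
rebuild of testdata/tokenizer.datok out of the regular tests into
XTestFullTokenizerBuild, which "go test" does not run because its name
does not start with "Test".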
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 2c6a525..003006b 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -118,11 +118,6 @@
func TestFullTokenizer(t *testing.T) {
assert := assert.New(t)
- /*
- tok := LoadFomaFile("testdata/tokenizer.fst")
- dat := tok.ToDoubleArray()
- dat.Save("testdata/tokenizer.datok")
- */
dat := LoadDatokFile("testdata/tokenizer.datok")
assert.NotNil(dat)
assert.True(dat.LoadFactor() >= 70)
@@ -139,18 +134,19 @@
assert.True(dat.Match("wald gehen"))
}
+func XTestFullTokenizerBuild(t *testing.T) {
+ assert := assert.New(t)
+ tok := LoadFomaFile("testdata/tokenizer.fst")
+ dat := tok.ToDoubleArray()
+ n, err := dat.Save("testdata/tokenizer.datok")
+ assert.Nil(err)
+ assert.True(n > 500)
+}
+
func TestFullTokenizerTransduce(t *testing.T) {
assert := assert.New(t)
- var dat *DaTokenizer
-
- if false {
- tok := LoadFomaFile("testdata/tokenizer.fst")
- dat = tok.ToDoubleArray()
- dat.Save("testdata/tokenizer.datok")
- } else {
- dat = LoadDatokFile("testdata/tokenizer.datok")
- }
+ dat := LoadDatokFile("testdata/tokenizer.datok")
assert.NotNil(dat)
b := make([]byte, 0, 2048)
@@ -651,16 +647,14 @@
// Ignored in KorAP-Tokenizer
// testTokenizerOrd
- /*
- tokens = tokenize(dat, w, "Sie erreichte den 1. Platz!")
- assert.Equal(tokens[0], "Sie")
- assert.Equal(tokens[1], "erreichte")
- assert.Equal(tokens[2], "den")
- assert.Equal(tokens[3], "1.")
- assert.Equal(tokens[4], "Platz")
- assert.Equal(tokens[5], "!")
- assert.Equal(len(tokens), 6)
- */
+ tokens = tokenize(dat, w, "Sie erreichte den 1. Platz!")
+ assert.Equal(tokens[0], "Sie")
+ assert.Equal(tokens[1], "erreichte")
+ assert.Equal(tokens[2], "den")
+ assert.Equal(tokens[3], "1.")
+ assert.Equal(tokens[4], "Platz")
+ assert.Equal(tokens[5], "!")
+ assert.Equal(len(tokens), 6)
// testNoZipOuputArchive
tokens = tokenize(dat, w, "Archive: Ich bin kein zip\n")
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 37fc799..5777666 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -156,13 +156,16 @@
] .o. Caseinsensitive;
define File [Char|"-"]+ "." FileEnd;
-define Streetname Word {str};
+define Streetname Word {str} %.;
! Also supports
! 19.4.2015, 19/4/2015 etc.
define DigitPunct ["_"|"-"|"."|","|Slash];
define Num Digit+ [DigitPunct Digit+]* (Char+);
+! Ordinals: one to three digits followed by a period (e.g. 1., 23., 100.)
+define Ord Digit ( Digit (Digit) ) %.;
+
! TODO:
! floating point, serial, model numbers, ip addresses, etc.
! every other segment must have at least one digit
@@ -187,7 +190,7 @@
-define RealToken [XML|Email|URL|SNS|[Abbr %.]|[Streetname %.]|Omission|Domain|AcronymDep|File|Emdash|Punct|Num|Years|Emoji|Word];
+define RealToken [XML|Email|URL|SNS|[Abbr %.]|Streetname|Omission|Domain|AcronymDep|File|Emdash|Punct|Ord|Num|Years|Emoji|Word];
echo - Introduce Token splitter
define Token [RealToken @-> ... NLout]