Add file scheme: tokenize file names and paths (e.g. c:\Dokumente\profile.docx) as single tokens
diff --git a/datokenizer_test.go b/datokenizer_test.go
index d476316..815f926 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -553,29 +553,27 @@
assert.Equal(len(tokens), 3)
// testTokenizerFile1
- /*
- tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
- assert.Equal(tokens[0], "Zeig")
- assert.Equal(tokens[1], "mir")
- assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
- assert.Equal(len(tokens), 3)
+ tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
+ assert.Equal(tokens[0], "Zeig")
+ assert.Equal(tokens[1], "mir")
+ assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
+ assert.Equal(len(tokens), 3)
- // testTokenizerFile2
- tokens = tokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
- assert.Equal(tokens[0], "Gehe")
- assert.Equal(tokens[1], "zu")
- assert.Equal(tokens[2], "/Dokumente/profile.docx")
- assert.Equal(len(tokens), 3)
+ // testTokenizerFile2
+ tokens = tokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
+ assert.Equal(tokens[0], "Gehe")
+ assert.Equal(tokens[1], "zu")
+ assert.Equal(tokens[2], "/Dokumente/profile.docx")
+ assert.Equal(len(tokens), 3)
- // testTokenizerFile3
- tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
- assert.Equal(tokens[0], "Zeig")
- assert.Equal(tokens[1], "mir")
- assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
- assert.Equal(len(tokens), 3)
- // Ignored in KorAP-Tokenizer
- */
+ // testTokenizerFile3
+ tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
+ assert.Equal(tokens[0], "Zeig")
+ assert.Equal(tokens[1], "mir")
+ assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
+ assert.Equal(len(tokens), 3)
+ // Ignored in KorAP-Tokenizer
// testTokenizerPunct
tokens = tokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 977e144..780884a 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -16,6 +16,7 @@
define NLin ("\u000d") "\u000a";
define Digit [%0|1|2|3|4|5|6|7|8|9];
+define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
!!!!!!!!!!!!!!!!!
! <from tmorph> !
@@ -152,9 +153,11 @@
{xls}|
{xml}|
{aac}|
- {gif}
+ {gif}|
+ {exe}
] .o. Caseinsensitive;
-define File [Char|"-"]+ "." FileEnd;
+! File: optional prefix (AsciiLetter ":" "\" or a leading "/") and path segments, then a base name, "." and a FileEnd extension. NOTE(review): AsciiLetter is lowercase-only, so "C:\" is not covered by the drive prefix - confirm intended.
+define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;
define Streetname Word {str} %.;
@@ -176,10 +179,6 @@
! TODO: Name words with ' and `
-! TODO:
-! FNAME = (({LETTER}:[\\/])?|\/)?({LETTER}+|[\\_/-])+\.{EXTENSION}
-
-
! Support ASCII elements, like
! +---------------+
! <---->, -->, <--