Fix Genderstern and omission word token breaks after hyphens
Resolves #115
Change-Id: Iacf5667b508050a6dfd09ca9938f449d05582a95
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a41661..ba70c7b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
# Changelog
+
+## 2.2.6 [unreleased]
+
+* Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
+
## 2.2.5
* adds more ossrh sync data to maven pom
diff --git a/Readme.md b/Readme.md
index 8d419cc..ada404c 100644
--- a/Readme.md
+++ b/Readme.md
@@ -26,7 +26,7 @@
## Installation
```shell script
-mvn clean install
+mvn clean package
```
#### Note
Because of the large table of abbreviations, the conversion from the jflex source to java,
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 5579b30..d22a7dc 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -635,7 +635,7 @@
// normal stuff
// dashed words
-{WORD}({DASH}{NEWLINE}*{WORD})+ { return currentToken();}
+{WORD}({DASH}{NEWLINE}*({WORD}|{OMISSIONWORD}))+ { return currentToken();}
{WORD}{DASH} { return currentToken();}
{TWITTER_HANDLE} { return currentToken(); }
{TWITTER_HASHTAG} { return currentToken(); }
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index ab11412..f1eda03 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -333,6 +333,19 @@
assertEquals(tokens.length, 4);
}
+ // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/115
+ @Test
+ public void testTokenizerGendersternAfterHyphen () {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+ String[] tokens = tok.tokenize("Die Serb*innen wie die Kosovo-Albaner*innen");
+ assertEquals("Die", tokens[0]);
+ assertEquals("Serb*innen", tokens[1]);
+ assertEquals("wie", tokens[2]);
+ assertEquals("die", tokens[3]);
+ assertEquals("Kosovo-Albaner*innen", tokens[4]);
+ assertEquals(5, tokens.length);
+ }
+
@Test
// Probably interpreted as HOST
public void testTokenizerFileExtension1 () {