Let HST require at least one letter
(as in Mastodon)
Change-Id: I2be91a30a6803933dd64d6a789fc28501ae18632
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20d0894..e25a8dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
### Changed
- `ADR` is now emitted whenever a token matches the `@`-address pattern, regardless of existing POS values in the input.
+- Purely numeric tokens such as `#10` are no longer tagged as `HST`; hashtags must contain at least one letter.
- Documentation examples were revised and anonymized for public release.
## 1.0.0
diff --git a/Readme.md b/Readme.md
index e4cb61b..1fc5c46 100644
--- a/Readme.md
+++ b/Readme.md
@@ -26,11 +26,13 @@
| `EMOIMG` | Unicode emoji tokens | `😂`, `😇` | Writes `EMOIMG` to XPOS, normalizes LEMMA to the base emoji, and adds FEATS metadata |
| `AKW` | Action words / inflectives | `:grins:` | Writes `AKW` to XPOS |
| `EMOASC` | ASCII emoticons | `:)`, `<3` | Writes `EMOASC` to XPOS |
-| `HST` | Hashtags | `#KorAP`, `#10` | Writes `HST` to XPOS |
+| `HST` | Hashtags | `#KorAP`, `#3D` | Writes `HST` to XPOS when the hashtag contains at least one letter |
| `URL` | URLs | `https://korap.ids-mannheim.de` | Writes `URL` to XPOS |
| `EML` | Email addresses | `mail@example.org` | Writes `EML` to XPOS |
| `ADR` | `@`-names / addresses | `@markup` | Writes `ADR` to XPOS |
+Numeric-only forms such as `#10` are not tagged as `HST`.
+
## CoNLL-U Output Examples
The following example shows how the different tags appear in CoNLL-U output. In all cases, the annotation is written to XPOS; only `EMOIMG` additionally changes LEMMA and FEATS.
diff --git a/src/index.js b/src/index.js
index 148ff52..ce45a04 100755
--- a/src/index.js
+++ b/src/index.js
@@ -1,7 +1,7 @@
#!/usr/bin/env node
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
-const hashtagRegex = /^#[a-zA-Z0-9]+/;
+const hashtagRegex = /^#[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*$/;
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/;
const addressRegex = /^@[a-zA-Z0-9]+/;
diff --git a/test/test.js b/test/test.js
index aeaaf10..9eea567 100644
--- a/test/test.js
+++ b/test/test.js
@@ -12,7 +12,7 @@
var ascimg_count = (stdout.match(/EMOASC/g) || []).length;
expect(ascimg_count).toBe(30);
var hst_count = (stdout.match(/\tHST\t/g) || []).length;
- expect(hst_count).toBe(14);
+ expect(hst_count).toBe(12);
var url_count = (stdout.match(/\tURL\t/g) || []).length;
expect(url_count).toBe(4);
var adr_count = (stdout.match(/\tADR\t/g) || []).length;
@@ -22,7 +22,7 @@
var eof_count = (stdout.match(/\n# eof/g) || []).length;
expect(eof_count).toBe(1);
var lines_count = (stdout.split("\n")).length;
- expect(lines_count).toBe(810);
+ expect(lines_count).toBe(803);
done();
});
@@ -35,7 +35,7 @@
var ascimg_count = (stdout.match(/EMOASC/g) || []).length;
expect(ascimg_count).toBe(30);
var hst_count = (stdout.match(/\tHST\t/g) || []).length;
- expect(hst_count).toBe(14);
+ expect(hst_count).toBe(12);
var url_count = (stdout.match(/\tURL\t/g) || []).length;
expect(url_count).toBe(4);
var adr_count = (stdout.match(/\tADR\t/g) || []).length;
@@ -49,15 +49,17 @@
const testInput = [
'# foundry = base',
'# text_id = test-hashtag',
- '# text = #KorAP #10',
+ '# text = #KorAP #3D #10',
['1', '#KorAP', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
- ['2', '#10', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+ ['2', '#3D', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+ ['3', '#10', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
''
].join('\n');
const stdout = execSync('node src/index.js', { input: testInput }).toString();
expect(stdout).toContain('#KorAP\t_\t_\tHST');
- expect(stdout).toContain('#10\t_\t_\tHST');
+ expect(stdout).toContain('#3D\t_\t_\tHST');
+ expect(stdout).not.toContain('#10\t_\t_\tHST');
var hst_count = (stdout.match(/\tHST\t/g) || []).length;
expect(hst_count).toBe(2);