Hashtags may contain Unicode letters
Change-Id: Iddca4c676e0b76657ecebae9bab79197bf4804ca
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e6ef15..bf549a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,12 +13,13 @@
### Changed
- `ADR` is now emitted whenever a token matches the `@`-address pattern, regardless of existing POS values in the input.
-- Purely numeric tokens such as `#10` are no longer tagged as `HST`; hashtags must contain at least one letter.
+- Purely numeric tokens such as `#10` are no longer tagged as `HST`; hashtags must contain at least one Unicode letter.
- Documentation examples were revised and anonymized for public release.
### Fixed
- Sparse mode now respects stdout backpressure, avoiding Node.js heap exhaustion on very large corpora with many matches.
+- Hashtags containing non-ASCII Unicode letters, such as umlauts (e.g. `#schön`), are now tagged as `HST`.
- Emoji-name values in the `n` FEATS field no longer insert spurious underscores after separators such as `:` and `,`; examples now use forms like `thumbs_up:light_skin_tone` and `family:man,man,boy`.
## 1.0.0
diff --git a/src/index.js b/src/index.js
index 3d34cd2..f8e16f9 100755
--- a/src/index.js
+++ b/src/index.js
@@ -1,7 +1,7 @@
#!/usr/bin/env node
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
-const hashtagRegex = /^#[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*$/;
+const hashtagRegex = /^#(?=.*\p{L})[\p{L}\p{M}\p{N}]+$/u; // '#' followed only by Unicode letters, marks, or numbers; the lookahead requires at least one letter
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/;
const addressRegex = /^@[a-zA-Z0-9]+/;
diff --git a/test/test.js b/test/test.js
index 5177948..3014c90 100644
--- a/test/test.js
+++ b/test/test.js
@@ -66,6 +66,27 @@
done();
});
+ test('Regression test for hashtags with Unicode letters: emit HST', (done) => {
+ const testInput = [
+ '# foundry = base',
+ '# text_id = test-hashtag-unicode',
+ '# text = #okeichhörejetztauf #schön #10',
+ ['1', '#okeichhörejetztauf', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+ ['2', '#schön', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+ ['3', '#10', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+ ''
+ ].join('\n');
+ const stdout = execSync('node src/index.js', { input: testInput }).toString();
+
+ expect(stdout).toContain('#okeichhörejetztauf\t_\t_\tHST');
+ expect(stdout).toContain('#schön\t_\t_\tHST');
+ expect(stdout).not.toContain('#10\t_\t_\tHST');
+
+ var hst_count = (stdout.match(/\tHST\t/g) || []).length;
+ expect(hst_count).toBe(2);
+ done();
+ });
+
test('Regression test for addresses: emit ADR regardless of existing POS values', (done) => {
const testInput = [
'# foundry = base',