Hashtags may contain Unicode letters

Change-Id: Iddca4c676e0b76657ecebae9bab79197bf4804ca
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e6ef15..bf549a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,12 +13,13 @@
 ### Changed
 
 - `ADR` is now emitted whenever a token matches the `@`-address pattern, regardless of existing POS values in the input.
-- Purely numeric tokens such as `#10` are no longer tagged as `HST`; hashtags must contain at least one letter.
+- Purely numeric tokens such as `#10` are no longer tagged as `HST`; hashtags must contain at least one Unicode letter.
 - Documentation examples were revised and anonymized for public release.
 
 ### Fixed
 
 - Sparse mode now respects stdout backpressure, avoiding Node.js heap exhaustion on very large corpora with many matches.
+- Hashtags containing Unicode letters such as umlauts are now tagged as `HST`.
 - Emoji-name values in the `n` FEATS field no longer insert spurious underscores after separators such as `:` and `,`; examples now use forms like `thumbs_up:light_skin_tone` and `family:man,man,boy`.
 
 ## 1.0.0
diff --git a/src/index.js b/src/index.js
index 3d34cd2..f8e16f9 100755
--- a/src/index.js
+++ b/src/index.js
@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 
 const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
-const hashtagRegex = /^#[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*$/;
+const hashtagRegex = /^#(?=.*\p{L})[\p{L}\p{M}\p{N}]+$/u;
 const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
 const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/;
 const addressRegex = /^@[a-zA-Z0-9]+/;
diff --git a/test/test.js b/test/test.js
index 5177948..3014c90 100644
--- a/test/test.js
+++ b/test/test.js
@@ -66,6 +66,27 @@
     done();
   });
 
+  test('Regression test for hashtags with Unicode letters: emit HST', (done) => {
+    const testInput = [
+      '# foundry = base',
+      '# text_id = test-hashtag-unicode',
+      '# text = #okeichhörejetztauf #schön #10',
+      ['1', '#okeichhörejetztauf', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+      ['2', '#schön', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
+      ['3', '#10', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'), // digits only: must not become HST
+      ''
+    ].join('\n');
+    const stdout = execSync('node src/index.js', { input: testInput }).toString();
+
+    expect(stdout).toContain('#okeichhörejetztauf\t_\t_\tHST');
+    expect(stdout).toContain('#schön\t_\t_\tHST');
+    expect(stdout).not.toContain('#10\t_\t_\tHST');
+
+    const hstCount = (stdout.match(/\tHST\t/g) || []).length; // match() yields null when nothing matches
+    expect(hstCount).toBe(2);
+    done();
+  });
+
   test('Regression test for addresses: emit ADR regardless of existing POS values', (done) => {
     const testInput = [
       '# foundry = base',