blob: e5e3f68a8eebfbf5e06eeb8f03f3fabe64935e4c [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001const { execSync } = require('child_process');
2const exp = require('constants');
Marc Kupietz20f33d92026-04-12 14:27:57 +02003const packageVersion = require('../package.json').version;
Marc Kupietzb43a5182024-02-03 18:09:10 +01004
5describe('conllu2cmc', () => {
// Runs the CLI with -s over the ndy.conllu fixture and pins how often each
// tag and end marker occurs, plus the total number of output lines.
test('Test sparse mode', (done) => {
  // Adjust the command if the script location or its options change.
  const output = execSync('node src/index.js -s < test/data/ndy.conllu').toString();
  // Number of matches of a global regex in the output (0 when none match).
  const occurrences = (pattern) => (output.match(pattern) || []).length;

  expect(output).toContain('😂\t😂\t_\tEMOIMG');
  expect(occurrences(/EMOIMG/g)).toBe(191);
  expect(occurrences(/EMOASC/g)).toBe(30);
  expect(occurrences(/\tHST\t/g)).toBe(12);
  expect(occurrences(/\tURL\t/g)).toBe(4);
  expect(occurrences(/\tADR\t/g)).toBe(1);
  expect(occurrences(/\n# eot/g)).toBe(1);
  expect(occurrences(/\n# eof/g)).toBe(1);
  expect(output.split("\n").length).toBe(803);
  done();
});
29
// Runs the CLI without options over the ndy.conllu fixture and pins the tag
// counts and the total number of output lines.
test('Test full mode', (done) => {
  const output = execSync('node src/index.js < test/data/ndy.conllu').toString();
  // Number of matches of a global regex in the output (0 when none match).
  const occurrences = (pattern) => (output.match(pattern) || []).length;

  expect(output).toContain('😂\t😂\t_\tEMOIMG');
  expect(occurrences(/EMOIMG/g)).toBe(191);
  expect(occurrences(/EMOASC/g)).toBe(30);
  expect(occurrences(/\tHST\t/g)).toBe(12);
  expect(occurrences(/\tURL\t/g)).toBe(4);
  expect(occurrences(/\tADR\t/g)).toBe(1);
  expect(output.split("\n").length).toBe(6202);
  done();
});
48
// `-V` must print exactly the version string declared in package.json.
test('Test version flag', (done) => {
  const rawOutput = execSync('node src/index.js -V').toString();
  const reportedVersion = rawOutput.trim();

  expect(reportedVersion).toBe(packageVersion);
  done();
});
55
// Invokes the `cmc-tagger` bin name through `npm exec` and checks that the
// last line it prints is the package version.
test('Test cmc-tagger alias', (done) => {
  const output = execSync('npm exec --yes --package=. cmc-tagger -- -V').toString().trim();
  // Only the final line is compared; any lines printed before it are ignored.
  const outputLines = output.split('\n');
  const lastLine = outputLines[outputLines.length - 1].trim();

  expect(lastLine).toBe(packageVersion);
  done();
});
63
// '#KorAP' and '#3D' must be tagged HST, while the purely numeric '#10'
// must not.
test('Regression test for hashtags: emit HST', (done) => {
  // Ten tab-separated columns: ID, FORM and eight empty ('_') fields.
  const tokenRow = (id, form) => [id, form, ...Array(8).fill('_')].join('\t');
  const testInput = [
    '# foundry = base',
    '# text_id = test-hashtag',
    '# text = #KorAP #3D #10',
    tokenRow('1', '#KorAP'),
    tokenRow('2', '#3D'),
    tokenRow('3', '#10'),
    ''
  ].join('\n');

  const output = execSync('node src/index.js', { input: testInput }).toString();

  expect(output).toContain('#KorAP\t_\t_\tHST');
  expect(output).toContain('#3D\t_\t_\tHST');
  expect(output).not.toContain('#10\t_\t_\tHST');

  const hstMatches = output.match(/\tHST\t/g) || [];
  expect(hstMatches.length).toBe(2);
  done();
});
Marc Kupietzb5d80b32025-12-11 15:48:09 +010084
// Hashtags containing non-ASCII letters (ö) must be tagged HST, while the
// purely numeric '#10' must not.
test('Regression test for hashtags with Unicode letters: emit HST', (done) => {
  // Ten tab-separated columns: ID, FORM and eight empty ('_') fields.
  const tokenRow = (id, form) => [id, form, ...Array(8).fill('_')].join('\t');
  const testInput = [
    '# foundry = base',
    '# text_id = test-hashtag-unicode',
    '# text = #okeichhörejetztauf #schön #10',
    tokenRow('1', '#okeichhörejetztauf'),
    tokenRow('2', '#schön'),
    tokenRow('3', '#10'),
    ''
  ].join('\n');

  const output = execSync('node src/index.js', { input: testInput }).toString();

  expect(output).toContain('#okeichhörejetztauf\t_\t_\tHST');
  expect(output).toContain('#schön\t_\t_\tHST');
  expect(output).not.toContain('#10\t_\t_\tHST');

  const hstMatches = output.match(/\tHST\t/g) || [];
  expect(hstMatches.length).toBe(2);
  done();
});
105
// '@…' tokens must be tagged ADR even when the fourth column already holds
// a POS value (PROPN, NE), and that existing value must be kept.
test('Regression test for addresses: emit ADR regardless of existing POS values', (done) => {
  // Ten tab-separated columns; only ID, FORM and the fourth column are filled.
  const tokenRow = (id, form, pos) =>
    [id, form, '_', pos, ...Array(6).fill('_')].join('\t');
  const testInput = [
    '# foundry = base',
    '# text_id = test-address',
    '# text = @handle @markup',
    tokenRow('1', '@handle', 'PROPN'),
    tokenRow('2', '@markup', 'NE'),
    ''
  ].join('\n');

  const output = execSync('node src/index.js', { input: testInput }).toString();

  expect(output).toContain('@handle\t_\tPROPN\tADR');
  expect(output).toContain('@markup\t_\tNE\tADR');

  const adrMatches = output.match(/\tADR\t/g) || [];
  expect(adrMatches.length).toBe(2);
  done();
});
124
// Compound emojis (skin-tone modifier, ZWJ sequence) must each be recognized
// as a single EMOIMG token whose lemma is the base emoji.
test('Regression test for issue #113: emoji modifiers and ZWJ', (done) => {
  // CoNLL-U columns are tab-separated. Joining with an explicit '\t' keeps the
  // separators visible and safe from editors that rewrite whitespace inside a
  // multi-line template literal.
  const tokenRow = (...columns) => columns.join('\t');
  const testInput = [
    '# foundry = base',
    '# text_id = test-113',
    '# text = ✊🏿 and 👨‍👨‍👦',
    tokenRow('1', '✊🏿', '_', '_', '_', '_', '_', '_', '_', '_'),
    tokenRow('2', 'and', '_', 'CCONJ', '_', '_', '_', '_', '_', '_'),
    tokenRow('3', '👨‍👨‍👦', '_', '_', '_', '_', '_', '_', '_', '_'),
    '', // blank line after the last token
    ''  // the input ends with a trailing newline
  ].join('\n');
  // Uses the module-level execSync; no need to require child_process again.
  const stdout = execSync('node src/index.js', { input: testInput }).toString();

  // Compound emojis are tagged as EMOIMG and the lemma holds the base emoji.
  expect(stdout).toContain('✊🏿\t✊\t_\tEMOIMG');
  expect(stdout).toContain('👨‍👨‍👦\t👨\t_\tEMOIMG');

  // EMOIMG must appear once per emoji (XPOS column only): 2 emojis x 1 column.
  const emoimgCount = (stdout.match(/EMOIMG/g) || []).length;
  expect(emoimgCount).toBe(2);
  done();
});
147
// Wikipedia emoji templates ([_EMOJI:{{...}}_]) must be tagged EMOWIKI, and
// only in the XPOS column.
test('Regression test for issue #114: Wikipedia emoji templates', (done) => {
  // CoNLL-U columns are tab-separated. Joining with an explicit '\t' keeps the
  // separators visible and safe from editors that rewrite whitespace inside a
  // multi-line template literal.
  const tokenRow = (...columns) => columns.join('\t');
  const testInput = [
    '# foundry = base',
    '# text_id = test-114',
    '# text = [_EMOJI:{{S|;)}}_] and [_EMOJI:{{cool}}_]',
    tokenRow('1', '[_EMOJI:{{S|;)}}_]', '_', '_', '_', '_', '_', '_', '_', '_'),
    tokenRow('2', 'and', '_', 'CCONJ', '_', '_', '_', '_', '_', '_'),
    tokenRow('3', '[_EMOJI:{{cool}}_]', '_', '_', '_', '_', '_', '_', '_', '_'),
    '', // blank line after the last token
    ''  // the input ends with a trailing newline
  ].join('\n');
  // Uses the module-level execSync; no need to require child_process again.
  const stdout = execSync('node src/index.js', { input: testInput }).toString();

  // Templates are tagged as EMOWIKI in the XPOS column.
  expect(stdout).toContain('[_EMOJI:{{S|;)}}_]\t_\t_\tEMOWIKI');
  expect(stdout).toContain('[_EMOJI:{{cool}}_]\t_\t_\tEMOWIKI');

  // EMOWIKI must appear once per template: 2 templates x 1 column.
  const emowikiCount = (stdout.match(/EMOWIKI/g) || []).length;
  expect(emowikiCount).toBe(2);
  done();
});
Marc Kupietz30634ff2025-12-18 11:39:03 +0100170
// An EMOIMG token must come with emoji metadata (group, subgroup,
// qualification, version, name) in the output.
test('Test emoji metadata in FEATS column', (done) => {
  // CoNLL-U columns are tab-separated. Joining with an explicit '\t' keeps the
  // separators visible and safe from editors that rewrite whitespace inside a
  // multi-line template literal.
  const testInput = [
    '# foundry = base',
    '# text_id = test-feats',
    '# text = 😇',
    ['1', '😇', '_', '_', '_', '_', '_', '_', '_', '_'].join('\t'),
    '', // blank line after the last token
    ''  // the input ends with a trailing newline
  ].join('\n');
  // Uses the module-level execSync; no need to require child_process again.
  const stdout = execSync('node src/index.js', { input: testInput }).toString();

  // Expected metadata for 😇. Spaces in the emoji data are replaced by '_';
  // the '&' in the group name is expected to be kept as-is.
  expect(stdout).toContain('g=smileys_&_emotion|s=face_smiling|q=fully_qualified|v=E1.0|n=smiling_face_with_halo');

  // The base emoji is also the lemma, and the token is tagged EMOIMG.
  expect(stdout).toContain('😇\t😇\t_\tEMOIMG');

  done();
});
Marc Kupietz2de30e22026-04-11 12:49:52 +0200192
// Emoji names in the output must not carry a stray '_' after ':' or ','
// (e.g. 'thumbs_up:light_skin_tone', not 'thumbs_up:_light_skin_tone').
test('Test normalized emoji-name separators in FEATS', (done) => {
  // CoNLL-U columns are tab-separated. Joining with an explicit '\t' keeps the
  // separators visible and safe from editors that rewrite whitespace inside a
  // multi-line template literal.
  const tokenRow = (id, form) => [id, form, ...Array(8).fill('_')].join('\t');
  const testInput = [
    '# foundry = base',
    '# text_id = test-name-normalization',
    '# text = 👍🏻 👨‍👨‍👦',
    tokenRow('1', '👍🏻'),
    tokenRow('2', '👨‍👨‍👦'),
    '', // blank line after the last token
    ''  // the input ends with a trailing newline
  ].join('\n');
  const stdout = execSync('node src/index.js', { input: testInput }).toString();

  expect(stdout).toContain('n=thumbs_up:light_skin_tone');
  expect(stdout).toContain('n=family:man,man,boy');
  expect(stdout).not.toContain('n=thumbs_up:_light_skin_tone');
  expect(stdout).not.toContain('n=family:_man,_man,_boy');

  done();
});
Marc Kupietzb43a5182024-02-03 18:09:10 +0100210});