blob: fbf38e13019b495874e6dec09a115d86bc2b0cc6 [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001const { execSync } = require('child_process');
2const exp = require('constants');
Marc Kupietz20f33d92026-04-12 14:27:57 +02003const packageVersion = require('../package.json').version;
Marc Kupietzb43a5182024-02-03 18:09:10 +01004
describe('conllu2cmc', () => {
  // Every test shells out to the CLI entry point; the path is relative to the
  // working directory the test runner is started from.
  const CLI = 'node src/index.js';

  /**
   * Runs a shell command synchronously and returns its stdout as a string.
   *
   * @param {string} command - Shell command line to execute.
   * @param {object} [options] - Options forwarded to execSync (e.g. `input`).
   * @returns {string} Captured standard output.
   */
  const run = (command, options) => execSync(command, options).toString();

  /**
   * Counts how often a global regular expression matches in a string.
   *
   * @param {string} text - Text to search.
   * @param {RegExp} pattern - Pattern carrying the `g` flag.
   * @returns {number} Number of matches (0 when there is none).
   */
  const countMatches = (text, pattern) => (text.match(pattern) || []).length;

  /**
   * Builds one tab-separated token line from its column values, so that the
   * separators are explicit instead of invisible tabs inside a literal.
   *
   * @param {...string} columns - Column values in order.
   * @returns {string} The columns joined by tab characters.
   */
  const row = (...columns) => columns.join('\t');

  test('Test sparse mode', () => {
    // Modify the command based on your script's location and options
    const stdout = run(`${CLI} -s < test/data/ndy.conllu`);

    expect(stdout).toContain('😂\t😂\t_\tEMOIMG');
    expect(countMatches(stdout, /EMOIMG/g)).toBe(191);
    expect(countMatches(stdout, /EMOASC/g)).toBe(30);
    expect(countMatches(stdout, /\tHST\t/g)).toBe(12);
    expect(countMatches(stdout, /\tURL\t/g)).toBe(4);
    expect(countMatches(stdout, /\tADR\t/g)).toBe(1);
    expect(countMatches(stdout, /\n# eot/g)).toBe(1);
    expect(countMatches(stdout, /\n# eof/g)).toBe(1);
    expect(stdout.split('\n').length).toBe(803);
  });

  test('Test full mode', () => {
    const stdout = run(`${CLI} < test/data/ndy.conllu`);

    expect(stdout).toContain('😂\t😂\t_\tEMOIMG');
    expect(countMatches(stdout, /EMOIMG/g)).toBe(191);
    expect(countMatches(stdout, /EMOASC/g)).toBe(30);
    expect(countMatches(stdout, /\tHST\t/g)).toBe(12);
    expect(countMatches(stdout, /\tURL\t/g)).toBe(4);
    expect(countMatches(stdout, /\tADR\t/g)).toBe(1);
    expect(stdout.split('\n').length).toBe(6202);
  });

  test('Test version flag', () => {
    const stdout = run(`${CLI} -V`).trim();

    expect(stdout).toBe(packageVersion);
  });

  test('Regression test for hashtags: emit HST', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-hashtag',
      '# text = #KorAP #3D #10',
      row('1', '#KorAP', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('2', '#3D', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('3', '#10', '_', '_', '_', '_', '_', '_', '_', '_'),
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    expect(stdout).toContain('#KorAP\t_\t_\tHST');
    expect(stdout).toContain('#3D\t_\t_\tHST');
    // A purely numeric "#10" must not be tagged as a hashtag.
    expect(stdout).not.toContain('#10\t_\t_\tHST');

    expect(countMatches(stdout, /\tHST\t/g)).toBe(2);
  });

  test('Regression test for hashtags with Unicode letters: emit HST', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-hashtag-unicode',
      '# text = #okeichhörejetztauf #schön #10',
      row('1', '#okeichhörejetztauf', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('2', '#schön', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('3', '#10', '_', '_', '_', '_', '_', '_', '_', '_'),
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    expect(stdout).toContain('#okeichhörejetztauf\t_\t_\tHST');
    expect(stdout).toContain('#schön\t_\t_\tHST');
    expect(stdout).not.toContain('#10\t_\t_\tHST');

    expect(countMatches(stdout, /\tHST\t/g)).toBe(2);
  });

  test('Regression test for addresses: emit ADR regardless of existing POS values', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-address',
      '# text = @handle @markup',
      row('1', '@handle', '_', 'PROPN', '_', '_', '_', '_', '_', '_'),
      row('2', '@markup', '_', 'NE', '_', '_', '_', '_', '_', '_'),
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    // The pre-existing value in the fourth column is kept; ADR follows it.
    expect(stdout).toContain('@handle\t_\tPROPN\tADR');
    expect(stdout).toContain('@markup\t_\tNE\tADR');

    expect(countMatches(stdout, /\tADR\t/g)).toBe(2);
  });

  test('Regression test for issue #113: emoji modifiers and ZWJ', () => {
    // Test that compound emojis with modifiers and ZWJ are recognized as single EMOIMG tokens
    const testInput = [
      '# foundry = base',
      '# text_id = test-113',
      '# text = ✊🏿 and 👨‍👨‍👦',
      row('1', '✊🏿', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('2', 'and', '_', 'CCONJ', '_', '_', '_', '_', '_', '_'),
      row('3', '👨‍👨‍👦', '_', '_', '_', '_', '_', '_', '_', '_'),
      '',
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    // Check that compound emojis are tagged as EMOIMG and lemma has base emoji
    expect(stdout).toContain('✊🏿\t✊\t_\tEMOIMG');
    expect(stdout).toContain('👨‍👨‍👦\t👨\t_\tEMOIMG');

    // Count EMOIMG occurrences (should be 1 per emoji - only in XPOS column)
    expect(countMatches(stdout, /EMOIMG/g)).toBe(2); // 2 emojis × 1 column = 2
  });

  test('Regression test for issue #114: Wikipedia emoji templates', () => {
    // Test that Wikipedia emoji templates are recognized as EMOWIKI tokens
    const testInput = [
      '# foundry = base',
      '# text_id = test-114',
      '# text = [_EMOJI:{{S|;)}}_] and [_EMOJI:{{cool}}_]',
      row('1', '[_EMOJI:{{S|;)}}_]', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('2', 'and', '_', 'CCONJ', '_', '_', '_', '_', '_', '_'),
      row('3', '[_EMOJI:{{cool}}_]', '_', '_', '_', '_', '_', '_', '_', '_'),
      '',
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    // Check that Wikipedia emoji templates are tagged as EMOWIKI in XPOS column only
    expect(stdout).toContain('[_EMOJI:{{S|;)}}_]\t_\t_\tEMOWIKI');
    expect(stdout).toContain('[_EMOJI:{{cool}}_]\t_\t_\tEMOWIKI');

    // Count EMOWIKI occurrences (should be 1 per template - only in XPOS column)
    expect(countMatches(stdout, /EMOWIKI/g)).toBe(2); // 2 templates × 1 column = 2
  });

  test('Test emoji metadata in FEATS column', () => {
    // Test that EMOIMG tokens have populated FEATS column
    const testInput = [
      '# foundry = base',
      '# text_id = test-feats',
      '# text = 😇',
      row('1', '😇', '_', '_', '_', '_', '_', '_', '_', '_'),
      '',
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    // Check that 😇 has correct metadata. Spaces in the emoji data show up as
    // `_`; the `&` of the group name is expected unchanged (`smileys_&_emotion`).
    expect(stdout).toContain('g=smileys_&_emotion|s=face_smiling|q=fully_qualified|v=E1.0|n=smiling_face_with_halo');

    // Also check for the base emoji lemma and tags
    expect(stdout).toContain('😇\t😇\t_\tEMOIMG');
  });

  test('Test normalized emoji-name separators in FEATS', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-name-normalization',
      '# text = 👍🏻 👨‍👨‍👦',
      row('1', '👍🏻', '_', '_', '_', '_', '_', '_', '_', '_'),
      row('2', '👨‍👨‍👦', '_', '_', '_', '_', '_', '_', '_', '_'),
      '',
      '',
    ].join('\n');
    const stdout = run(CLI, { input: testInput });

    // Separators in the name must not be followed by a stray underscore.
    expect(stdout).toContain('n=thumbs_up:light_skin_tone');
    expect(stdout).toContain('n=family:man,man,boy');
    expect(stdout).not.toContain('n=thumbs_up:_light_skin_tone');
    expect(stdout).not.toContain('n=family:_man,_man,_boy');
  });
});