// Source blob: 3014c90ccc717c0996d5c11eaf369e5ded09c1ae (text captured from a repository blame view)
const { execSync } = require('child_process');
const exp = require('constants');

/**
 * End-to-end tests for the conllu2cmc command-line script.
 *
 * Every test spawns `node src/index.js` synchronously, feeds it CoNLL-U
 * (either from `test/data/ndy.conllu` via shell redirection or from an
 * inline fixture piped to stdin) and inspects what the script prints to
 * stdout. All work is synchronous, so no test needs a `done` callback.
 */
describe('conllu2cmc', () => {
  /**
   * Runs a shell command synchronously and returns its stdout as a string.
   *
   * @param {string} command - Command line handed to `execSync`.
   * @param {string} [input] - Text piped to the child's stdin; omitted when
   *   the command supplies its own input (e.g. via `<` redirection).
   * @returns {string} Everything the child wrote to stdout.
   * @throws {Error} Propagated from `execSync` when the child exits non-zero.
   */
  const run = (command, input) => {
    const options = input === undefined ? undefined : { input };
    return execSync(command, options).toString();
  };

  /**
   * Counts the matches of a global regular expression in a text.
   *
   * @param {string} text - Text to search.
   * @param {RegExp} pattern - Pattern carrying the `g` flag.
   * @returns {number} Number of matches, 0 when there are none.
   */
  const countMatches = (text, pattern) => (text.match(pattern) || []).length;

  /**
   * Builds one tab-separated, ten-column token line for an inline fixture.
   * Only the first, second and fourth columns vary across the fixtures in
   * this file; every other column is the `_` placeholder.
   *
   * @param {string} id - Value of the first column (token index).
   * @param {string} form - Value of the second column (the token itself).
   * @param {string} [pos='_'] - Value of the fourth column.
   * @returns {string} The token line, without a trailing newline.
   */
  const tokenLine = (id, form, pos = '_') =>
    [id, form, '_', pos, '_', '_', '_', '_', '_', '_'].join('\t');

  test('Test sparse mode', () => {
    const stdout = run('node src/index.js -s < test/data/ndy.conllu');

    expect(stdout).toContain('😂\t😂\t_\tEMOIMG');
    expect(countMatches(stdout, /EMOIMG/g)).toBe(191);
    expect(countMatches(stdout, /EMOASC/g)).toBe(30);
    expect(countMatches(stdout, /\tHST\t/g)).toBe(12);
    expect(countMatches(stdout, /\tURL\t/g)).toBe(4);
    expect(countMatches(stdout, /\tADR\t/g)).toBe(1);
    expect(countMatches(stdout, /\n# eot/g)).toBe(1);
    expect(countMatches(stdout, /\n# eof/g)).toBe(1);
    expect(stdout.split('\n').length).toBe(803);
  });

  test('Test full mode', () => {
    const stdout = run('node src/index.js < test/data/ndy.conllu');

    expect(stdout).toContain('😂\t😂\t_\tEMOIMG');
    expect(countMatches(stdout, /EMOIMG/g)).toBe(191);
    expect(countMatches(stdout, /EMOASC/g)).toBe(30);
    expect(countMatches(stdout, /\tHST\t/g)).toBe(12);
    expect(countMatches(stdout, /\tURL\t/g)).toBe(4);
    expect(countMatches(stdout, /\tADR\t/g)).toBe(1);
    expect(stdout.split('\n').length).toBe(6202);
  });

  test('Regression test for hashtags: emit HST', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-hashtag',
      '# text = #KorAP #3D #10',
      tokenLine('1', '#KorAP'),
      tokenLine('2', '#3D'),
      tokenLine('3', '#10'),
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    expect(stdout).toContain('#KorAP\t_\t_\tHST');
    expect(stdout).toContain('#3D\t_\t_\tHST');
    // A purely numeric "#10" must not be tagged as a hashtag.
    expect(stdout).not.toContain('#10\t_\t_\tHST');

    expect(countMatches(stdout, /\tHST\t/g)).toBe(2);
  });

  test('Regression test for hashtags with Unicode letters: emit HST', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-hashtag-unicode',
      '# text = #okeichhörejetztauf #schön #10',
      tokenLine('1', '#okeichhörejetztauf'),
      tokenLine('2', '#schön'),
      tokenLine('3', '#10'),
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    expect(stdout).toContain('#okeichhörejetztauf\t_\t_\tHST');
    expect(stdout).toContain('#schön\t_\t_\tHST');
    expect(stdout).not.toContain('#10\t_\t_\tHST');

    expect(countMatches(stdout, /\tHST\t/g)).toBe(2);
  });

  test('Regression test for addresses: emit ADR regardless of existing POS values', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-address',
      '# text = @handle @markup',
      tokenLine('1', '@handle', 'PROPN'),
      tokenLine('2', '@markup', 'NE'),
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    expect(stdout).toContain('@handle\t_\tPROPN\tADR');
    expect(stdout).toContain('@markup\t_\tNE\tADR');

    expect(countMatches(stdout, /\tADR\t/g)).toBe(2);
  });

  test('Regression test for issue #113: emoji modifiers and ZWJ', () => {
    // Compound emojis (skin-tone modifier, ZWJ sequence) must each be
    // recognized as a single EMOIMG token.
    const testInput = [
      '# foundry = base',
      '# text_id = test-113',
      '# text = ✊🏿 and 👨‍👨‍👦',
      tokenLine('1', '✊🏿'),
      tokenLine('2', 'and', 'CCONJ'),
      tokenLine('3', '👨‍👨‍👦'),
      '',
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    // The compound emoji is tagged EMOIMG and its lemma is the base emoji.
    expect(stdout).toContain('✊🏿\t✊\t_\tEMOIMG');
    expect(stdout).toContain('👨‍👨‍👦\t👨\t_\tEMOIMG');

    // EMOIMG appears once per emoji (XPOS column only): 2 emojis -> 2 hits.
    expect(countMatches(stdout, /EMOIMG/g)).toBe(2);
  });

  test('Regression test for issue #114: Wikipedia emoji templates', () => {
    // Wikipedia emoji templates must be recognized as EMOWIKI tokens.
    const testInput = [
      '# foundry = base',
      '# text_id = test-114',
      '# text = [_EMOJI:{{S|;)}}_] and [_EMOJI:{{cool}}_]',
      tokenLine('1', '[_EMOJI:{{S|;)}}_]'),
      tokenLine('2', 'and', 'CCONJ'),
      tokenLine('3', '[_EMOJI:{{cool}}_]'),
      '',
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    // The templates are tagged EMOWIKI in the XPOS column only.
    expect(stdout).toContain('[_EMOJI:{{S|;)}}_]\t_\t_\tEMOWIKI');
    expect(stdout).toContain('[_EMOJI:{{cool}}_]\t_\t_\tEMOWIKI');

    // EMOWIKI appears once per template: 2 templates -> 2 hits.
    expect(countMatches(stdout, /EMOWIKI/g)).toBe(2);
  });

  test('Test emoji metadata in FEATS column', () => {
    // EMOIMG tokens must come with a populated FEATS column.
    const testInput = [
      '# foundry = base',
      '# text_id = test-feats',
      '# text = 😇',
      tokenLine('1', '😇'),
      '',
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    // Expected metadata for 😇. Spaces in the emoji data are emitted as '_',
    // so the group reads "smileys_&_emotion" (the '&' is kept as is).
    expect(stdout).toContain('g=smileys_&_emotion|s=face_smiling|q=fully_qualified|v=E1.0|n=smiling_face_with_halo');

    // The lemma is the emoji itself and the tag is EMOIMG.
    expect(stdout).toContain('😇\t😇\t_\tEMOIMG');
  });

  test('Test normalized emoji-name separators in FEATS', () => {
    const testInput = [
      '# foundry = base',
      '# text_id = test-name-normalization',
      '# text = 👍🏻 👨‍👨‍👦',
      tokenLine('1', '👍🏻'),
      tokenLine('2', '👨‍👨‍👦'),
      '',
      ''
    ].join('\n');
    const stdout = run('node src/index.js', testInput);

    // No underscore may follow the ':' or ',' separators in emoji names.
    expect(stdout).toContain('n=thumbs_up:light_skin_tone');
    expect(stdout).toContain('n=family:man,man,boy');
    expect(stdout).not.toContain('n=thumbs_up:_light_skin_tone');
    expect(stdout).not.toContain('n=family:_man,_man,_boy');
  });
});