blob: 4a6be49c2b7bf922d96fa374d2ac90231610546d [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001const { execSync } = require('child_process');
2const exp = require('constants');
3
/**
 * Count how often `pattern` occurs in `text`.
 *
 * @param {string} text - Text to search.
 * @param {RegExp} pattern - Pattern to count; it must carry the `g` flag so
 *   that `String.prototype.match` returns every occurrence, not just the first.
 * @returns {number} Number of matches; 0 when `match` returns `null`.
 */
function countMatches(text, pattern) {
  return (text.match(pattern) || []).length;
}

/**
 * Build one 10-column CoNLL-U token line.
 *
 * The columns are joined with explicit tab characters so the fixture does not
 * depend on invisible literal tabs surviving editors or formatters.
 *
 * @param {number|string} id - Value of the ID column.
 * @param {string} form - Value of the FORM column.
 * @param {string} [upos='_'] - Value of the UPOS column.
 * @returns {string} The token line without a trailing newline; all columns
 *   other than ID, FORM and UPOS are `_`.
 */
function conlluToken(id, form, upos = '_') {
  return [id, form, '_', upos, '_', '_', '_', '_', '_', '_'].join('\t');
}

/**
 * Build a single-sentence CoNLL-U document to feed to the converter on stdin.
 *
 * @param {string} textId - Value written to the `# text_id` comment line.
 * @param {string} text - Value written to the `# text` comment line.
 * @param {Array<[string, string?]>} tokens - `[form, upos]` pairs in sentence
 *   order; IDs are assigned starting at 1 and a missing `upos` becomes `_`.
 * @returns {string} The document, terminated by an empty line.
 */
function conlluInput(textId, text, tokens) {
  return [
    '# foundry = base',
    `# text_id = ${textId}`,
    `# text = ${text}`,
    ...tokens.map(([form, upos], index) => conlluToken(index + 1, form, upos)),
    '',
    '',
  ].join('\n');
}

/**
 * Run the converter with `input` on stdin and return what it wrote to stdout.
 *
 * @param {string} input - CoNLL-U text passed to the child process.
 * @returns {string} The converter's stdout.
 * @throws {Error} Propagated from `execSync`, e.g. on a non-zero exit status.
 */
function convert(input) {
  return execSync('node src/index.js', { input }).toString();
}

describe('conllu2cmc', () => {
  /**
   * Assertions on test/data/ndy.conllu that sparse and full mode share.
   *
   * @param {string} stdout - Converter output for test/data/ndy.conllu.
   */
  const expectNdyAnnotations = (stdout) => {
    expect(stdout).toContain('😂\t😂\t_\tEMOIMG');
    expect(countMatches(stdout, /EMOIMG/g)).toBe(191);
    expect(countMatches(stdout, /EMOASC/g)).toBe(30);
    expect(countMatches(stdout, /\tURL\t/g)).toBe(4);
    expect(countMatches(stdout, /\tADR\t/g)).toBe(1);
  };

  test('Test sparse mode', () => {
    const stdout = execSync('node src/index.js -s < test/data/ndy.conllu').toString();

    expectNdyAnnotations(stdout);

    // Exactly one line starting with "# eot" and one starting with "# eof".
    expect(countMatches(stdout, /\n# eot/g)).toBe(1);
    expect(countMatches(stdout, /\n# eof/g)).toBe(1);

    expect(stdout.split('\n').length).toBe(746);
  });

  test('Test full mode', () => {
    const stdout = execSync('node src/index.js < test/data/ndy.conllu').toString();

    expectNdyAnnotations(stdout);

    expect(stdout.split('\n').length).toBe(6202);
  });

  test('Regression test for issue #113: emoji modifiers and ZWJ', () => {
    // Compound emojis (skin-tone modifier, ZWJ sequence) must each be
    // recognized as one single EMOIMG token.
    const testInput = conlluInput('test-113', '✊🏿 and 👨‍👨‍👦', [
      ['✊🏿'],
      ['and', 'CCONJ'],
      ['👨‍👨‍👦'],
    ]);
    const stdout = convert(testInput);

    // Compound emojis are tagged as EMOIMG and the lemma holds the base emoji.
    expect(stdout).toContain('✊🏿\t✊\t_\tEMOIMG');
    expect(stdout).toContain('👨‍👨‍👦\t👨\t_\tEMOIMG');

    // EMOIMG appears once per emoji (XPOS column only): 2 emojis × 1 column = 2.
    expect(countMatches(stdout, /EMOIMG/g)).toBe(2);
  });

  test('Regression test for issue #114: Wikipedia emoji templates', () => {
    // Wikipedia emoji templates must be recognized as EMOWIKI tokens.
    const testInput = conlluInput('test-114', '[_EMOJI:{{S|;)}}_] and [_EMOJI:{{cool}}_]', [
      ['[_EMOJI:{{S|;)}}_]'],
      ['and', 'CCONJ'],
      ['[_EMOJI:{{cool}}_]'],
    ]);
    const stdout = convert(testInput);

    // Templates are tagged as EMOWIKI in the XPOS column only.
    expect(stdout).toContain('[_EMOJI:{{S|;)}}_]\t_\t_\tEMOWIKI');
    expect(stdout).toContain('[_EMOJI:{{cool}}_]\t_\t_\tEMOWIKI');

    // EMOWIKI appears once per template: 2 templates × 1 column = 2.
    expect(countMatches(stdout, /EMOWIKI/g)).toBe(2);
  });

  test('Test emoji metadata in FEATS column', () => {
    // EMOIMG tokens must come with a populated FEATS column.
    const testInput = conlluInput('test-feats', '😇', [['😇']]);
    const stdout = convert(testInput);

    // Expected metadata for 😇:
    // g=smileys_&_emotion|s=face_smiling|q=fully_qualified|v=E1.0|n=smiling_face_with_halo
    // Note: spaces in the data are replaced by _ in our script.
    expect(stdout).toContain('g=smileys_&_emotion|s=face_smiling|q=fully_qualified|v=E1.0|n=smiling_face_with_halo');

    // Also check the base emoji lemma and tag.
    expect(stdout).toContain('😇\t😇\t_\tEMOIMG');
  });
});