blob: 1a823396b2ea1e193f6f6d396f7a60c0e99678a2 [file] [log] [blame]
Marc Kupietz30634ff2025-12-18 11:39:03 +01001const https = require('https');
2const fs = require('fs');
3const path = require('path');
4
// Upstream data file that is fetched and converted.
const url = 'https://www.unicode.org/Public/UCD/latest/emoji/emoji-test.txt';
// Destination of the generated JSON, resolved relative to this script's directory.
const outputPath = path.join(__dirname, '../src/emoji_data.json');

console.log(`Downloading ${url}...`);

// Download the file, parse it with parseEmojiData() and write the result as
// pretty-printed JSON. Any failure is reported on stderr and ends the process
// with exit code 1 without touching the output file.
https.get(url, (res) => {
  // Only a plain 200 carries the data file. https.get does not follow
  // redirects, and an error page must not be parsed and written out.
  if (res.statusCode !== 200) {
    console.error(`Error downloading file: unexpected HTTP status ${res.statusCode}`);
    process.exit(1);
  }

  // Decode as UTF-8 at the stream level. Without this each chunk is a Buffer
  // that gets stringified on its own, which corrupts any multi-byte character
  // that happens to straddle a chunk boundary.
  res.setEncoding('utf8');

  let data = '';

  res.on('data', (chunk) => {
    data += chunk;
  });

  // A failure while the body is being received is reported on the response
  // stream, not on the request object.
  res.on('error', (err) => {
    console.error(`Error downloading file: ${err.message}`);
    process.exit(1);
  });

  res.on('end', () => {
    console.log('Download complete. Parsing data...');
    const emojiData = parseEmojiData(data);
    const emojiCount = Object.keys(emojiData).length;

    // Refuse to replace the existing data file with an empty table.
    if (emojiCount === 0) {
      console.error(`Error parsing file: no emoji entries found in ${url}`);
      process.exit(1);
    }

    fs.writeFileSync(outputPath, JSON.stringify(emojiData, null, 2));
    console.log(`Wrote ${emojiCount} emojis to ${outputPath}`);
  });
}).on('error', (err) => {
  console.error(`Error downloading file: ${err.message}`);
  process.exit(1);
});
28
/**
 * Parse text in the `emoji-test.txt` layout into a lookup table keyed by the
 * emoji string.
 *
 * Recognised lines:
 *   - `# group: <name>` and `# subgroup: <name>` set the group/subgroup that
 *     is attached to every following data line;
 *   - any other line starting with `#`, and blank lines, are ignored;
 *   - data lines look like
 *     `1F607 ; fully-qualified # <emoji> E1.0 smiling face with halo`.
 *
 * Malformed data lines never throw: a line whose code points are not valid
 * hexadecimal Unicode scalar indexes is skipped with a warning, and a line
 * without a parsable `E<major>.<minor>` version is stored with empty `v`/`n`
 * (also with a warning).
 *
 * @param {string} text - Full contents of the data file (LF or CRLF line ends).
 * @returns {Object<string, {g: string, s: string, q: string, v: string, n: string}>}
 *   Map from emoji string to its normalized group (`g`), subgroup (`s`),
 *   qualification status (`q`), version such as `E1.0` (`v`) and normalized
 *   name (`n`). When the same emoji occurs twice the later line wins.
 */
function parseEmojiData(text) {
  const GROUP_PREFIX = '# group:';
  const SUBGROUP_PREFIX = '# subgroup:';
  const HEX_CODE_POINT = /^[0-9A-Fa-f]{1,6}$/;
  const MAX_CODE_POINT = 0x10ffff;

  const result = {};
  let currentGroup = '';
  let currentSubgroup = '';

  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();

    // Skip empty lines
    if (!line) continue;

    // Parse Group
    if (line.startsWith(GROUP_PREFIX)) {
      currentGroup = normalize(line.substring(GROUP_PREFIX.length).trim());
      continue;
    }

    // Parse Subgroup
    if (line.startsWith(SUBGROUP_PREFIX)) {
      currentSubgroup = normalize(line.substring(SUBGROUP_PREFIX.length).trim());
      continue;
    }

    // Every remaining line that starts with '#' is a plain comment.
    if (line.startsWith('#')) continue;

    // Data line: code_points ; status # emoji version name
    const semicolonIndex = line.indexOf(';');
    if (semicolonIndex === -1) continue;

    // The key is built from the code point column rather than from the
    // rendered emoji in the comment. Split on whitespace runs so repeated
    // spaces do not yield empty tokens.
    const codePointTokens = line.slice(0, semicolonIndex).trim().split(/\s+/);
    const codePoints = codePointTokens.map((token) =>
      (HEX_CODE_POINT.test(token) ? Number.parseInt(token, 16) : Number.NaN));
    if (codePoints.some((cp) => Number.isNaN(cp) || cp > MAX_CODE_POINT)) {
      // String.fromCodePoint would throw a RangeError on these values.
      console.warn(`Could not parse code points for line: ${line}`);
      continue;
    }
    const emojiKey = String.fromCodePoint(...codePoints);

    // Split at the FIRST '#' only: the comment itself may contain '#'
    // (the keycap-# emoji and its name "keycap: #").
    const afterSemicolon = line.slice(semicolonIndex + 1);
    const hashIndex = afterSemicolon.indexOf('#');
    const hasComment = hashIndex !== -1;
    const statusText = hasComment ? afterSemicolon.slice(0, hashIndex) : afterSemicolon;
    const commentPart = hasComment ? afterSemicolon.slice(hashIndex + 1).trim() : '';
    const status = normalize(statusText.trim()); // e.g., fully_qualified

    // Comment layout: [Emoji] E[Version] [Name]
    const versionMatch = commentPart.match(/E(\d+\.\d+)/);
    let version = '';
    let name = '';

    if (versionMatch) {
      version = versionMatch[0];
      // Name is everything after the version token.
      const nameStart = versionMatch.index + version.length;
      name = normalize(commentPart.slice(nameStart).trim());
    } else {
      // Keep the entry, but flag that version and name are unknown.
      console.warn(`Could not parse version for line: ${line}`);
    }

    result[emojiKey] = {
      g: currentGroup,
      s: currentSubgroup,
      q: status,
      v: version,
      n: name,
    };
  }

  return result;
}
100
/**
 * Turn a label into a lowercase identifier: the string is lowercased and each
 * run of spaces and/or hyphens is collapsed into a single underscore.
 *
 * @param {string} str - Label to convert.
 * @returns {string} The lowercased, underscore-separated form.
 */
function normalize(str) {
  const separatorRun = /[ \-]+/g;
  const lowered = str.toLowerCase();
  return lowered.replace(separatorRun, '_');
}