blob: 1a823396b2ea1e193f6f6d396f7a60c0e99678a2 [file] [log] [blame]
const https = require('https');
const fs = require('fs');
const path = require('path');
// Source of truth: the Unicode Consortium's emoji test file (one emoji per data line).
const url = 'https://www.unicode.org/Public/UCD/latest/emoji/emoji-test.txt';
// Generated JSON lookup table, written relative to this script's directory.
const outputPath = path.join(__dirname, '../src/emoji_data.json');

// Download the emoji test file, parse it, and write the result as pretty-printed JSON.
// Any download failure terminates the process with exit code 1.
console.log(`Downloading ${url}...`);
https.get(url, (res) => {
  // https.get neither follows redirects nor treats HTTP error statuses as
  // failures, so anything other than 200 would otherwise be parsed as if it
  // were emoji data and silently produce an empty output file.
  if (res.statusCode !== 200) {
    console.error(`Error downloading file: unexpected HTTP status ${res.statusCode}`);
    process.exit(1);
  }

  // Decode at the stream level: concatenating raw Buffer chunks onto a string
  // decodes each chunk separately and corrupts multi-byte UTF-8 sequences
  // (emoji, accented names) that happen to straddle a chunk boundary.
  res.setEncoding('utf8');

  let data = '';
  res.on('data', (chunk) => {
    data += chunk;
  });
  // A connection dropped mid-body surfaces on the response stream, not on the request.
  res.on('error', (err) => {
    console.error(`Error downloading file: ${err.message}`);
    process.exit(1);
  });
  res.on('end', () => {
    console.log('Download complete. Parsing data...');
    const emojiData = parseEmojiData(data);
    fs.writeFileSync(outputPath, JSON.stringify(emojiData, null, 2));
    console.log(`Wrote ${Object.keys(emojiData).length} emojis to ${outputPath}`);
  });
}).on('error', (err) => {
  console.error(`Error downloading file: ${err.message}`);
  process.exit(1);
});
/**
 * Parse the contents of Unicode's emoji-test.txt into a lookup table.
 *
 * Data lines have the form
 *   code_points ; status # emoji E<version> name
 * e.g.
 *   1F607 ; fully-qualified # 😇 E1.0 smiling face with halo
 * and are grouped by preceding "# group:" / "# subgroup:" header lines.
 *
 * @param {string} text - Full text of the file (LF or CRLF line endings).
 * @returns {Object<string, {g: string, s: string, q: string, v: string, n: string}>}
 *   Map from the emoji string (built from its code points) to its normalized
 *   group (g), subgroup (s), qualification status (q), version such as
 *   "E1.0" (v), and normalized name (n). v and n are empty strings when the
 *   comment field carries no version; a warning is logged in that case.
 */
function parseEmojiData(text) {
  const GROUP_PREFIX = '# group:';
  const SUBGROUP_PREFIX = '# subgroup:';
  const HEX_CODE_POINT = /^[0-9A-Fa-f]{1,6}$/;
  const MAX_CODE_POINT = 0x10ffff;
  const VERSION_PATTERN = /E(\d+\.\d+)/;

  const result = {};
  let currentGroup = '';
  let currentSubgroup = '';

  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();

    // Skip empty lines
    if (!line) continue;

    // Group / subgroup headers set the context for the data lines that follow.
    if (line.startsWith(GROUP_PREFIX)) {
      currentGroup = normalize(line.substring(GROUP_PREFIX.length).trim());
      continue;
    }
    if (line.startsWith(SUBGROUP_PREFIX)) {
      currentSubgroup = normalize(line.substring(SUBGROUP_PREFIX.length).trim());
      continue;
    }

    // Any other line starting with '#' is a plain comment.
    if (line.startsWith('#')) continue;

    const semicolonIndex = line.indexOf(';');
    if (semicolonIndex === -1) continue;

    // Only the FIRST '#' after the ';' opens the comment field. Splitting on
    // every '#' loses data for the keycap entry, whose emoji starts with '#'
    // and whose name ends with '#' ("#️⃣ E0.6 keycap: #").
    const hashIndex = line.indexOf('#', semicolonIndex + 1);
    const hasComment = hashIndex !== -1;
    const statusText = hasComment
      ? line.slice(semicolonIndex + 1, hashIndex)
      : line.slice(semicolonIndex + 1);
    const commentPart = hasComment ? line.slice(hashIndex + 1).trim() : '';
    const status = normalize(statusText.trim()); // e.g., fully_qualified

    // Build the key from the code points rather than from the emoji printed
    // in the comment. Tolerate runs of whitespace and reject malformed hex
    // instead of letting String.fromCodePoint throw on NaN.
    const hexTokens = line.slice(0, semicolonIndex).trim().split(/\s+/);
    const isValid = hexTokens.every(
      (token) => HEX_CODE_POINT.test(token) && Number.parseInt(token, 16) <= MAX_CODE_POINT
    );
    if (!isValid) {
      console.warn(`Could not parse code points for line: ${line}`);
      continue;
    }
    const emojiKey = String.fromCodePoint(
      ...hexTokens.map((token) => Number.parseInt(token, 16))
    );

    // Comment field format: [Emoji] E[Version] [Name]
    const versionMatch = commentPart.match(VERSION_PATTERN);
    let version = '';
    let name = '';
    if (versionMatch) {
      version = `E${versionMatch[1]}`;
      // Name is everything after the version token.
      const nameStart = versionMatch.index + versionMatch[0].length;
      name = normalize(commentPart.substring(nameStart).trim());
    } else {
      // Shouldn't happen with the standard file; keep the entry but flag it.
      console.warn(`Could not parse version for line: ${line}`);
    }

    result[emojiKey] = {
      g: currentGroup,
      s: currentSubgroup,
      q: status,
      v: version,
      n: name,
    };
  }

  return result;
}
/**
 * Turn a label into a lowercase, underscore-separated identifier.
 *
 * @param {string} str - Text such as "Smileys & Emotion" or "fully-qualified".
 * @returns {string} The lowercased text with each run of spaces and/or
 *   hyphens collapsed into a single underscore.
 */
function normalize(str) {
  const lowered = str.toLowerCase();
  // One or more consecutive spaces/hyphens count as a single separator.
  const separatorRun = /[ \-]+/g;
  return lowered.replace(separatorRun, '_');
}