Add check for min tokens in corpus
Resolves #61
Change-Id: I75dec5876f4728d948ea2b6e1be143c4ba655aaa
diff --git a/lib/korap_rc.js b/lib/korap_rc.js
index 016a7db..21c1eb8 100644
--- a/lib/korap_rc.js
+++ b/lib/korap_rc.js
@@ -155,7 +155,77 @@
if (glimpse_value) {
await page.click("#glimpse")
}
+ }
+ async check_corpus_statistics(page, minTokenThreshold = 1000) {
+ try {
+ // Navigate to the corpus view if not already there
+ await page.goto(this.korap_url, { waitUntil: 'networkidle2' });
+
+ // Click the vc-choose element to open corpus selection
+ await page.waitForSelector('#vc-choose', { visible: true, timeout: 10000 });
+ await page.click('#vc-choose');
+
+ // Wait a moment for the UI to respond
+ await new Promise(resolve => setTimeout(resolve, 1000));
+
+ // Click the statistic element
+ await page.waitForSelector('.statistic', { visible: true, timeout: 10000 });
+ await page.click('.statistic');
+
+ // Wait for statistics to load
+ await new Promise(resolve => setTimeout(resolve, 3000));
+
+ // Look for the tokens count in a dd element that follows an element with title "tokens"
+ const tokenCount = await page.evaluate((minThreshold) => {
+ // Find the element with title "tokens"
+ const tokenTitleElements = document.querySelectorAll('[title="tokens"], [title*="token"]');
+
+ for (const element of tokenTitleElements) {
+ // Look for the next dd element
+ let nextElement = element.nextElementSibling;
+ while (nextElement) {
+ if (nextElement.tagName.toLowerCase() === 'dd') {
+ const text = nextElement.textContent || nextElement.innerText || '';
+ // Remove number separators (commas and periods) and extract number
+ const cleanedText = text.replace(/[,\.]/g, '');
+ const numbers = cleanedText.match(/\d+/g);
+ if (numbers && numbers.length > 0) {
+ return parseInt(numbers[0], 10);
+ }
+ }
+ nextElement = nextElement.nextElementSibling;
+ }
+ }
+
+ // Alternative approach: look for dd elements that contain large numbers
+ const ddElements = document.querySelectorAll('dd');
+ for (const dd of ddElements) {
+ const text = dd.textContent || dd.innerText || '';
+ // Remove separators and check if it's a large number (likely token count)
+ const cleanedText = text.replace(/[,\.]/g, '');
+ const numbers = cleanedText.match(/\d+/g);
+ if (numbers && numbers.length > 0) {
+ const num = parseInt(numbers[0], 10);
+ // Use the provided threshold instead of hardcoded value
+ if (num > minThreshold) {
+ return num;
+ }
+ }
+ }
+
+ return null;
+ }, minTokenThreshold);
+
+ if (tokenCount === null) {
+ throw new Error("Could not find token count in corpus statistics");
+ }
+
+ return tokenCount;
+
+ } catch (error) {
+ throw new Error(`Failed to check corpus statistics: ${error.message}`);
+ }
}
}