Add debug logging and simplify token count wait in statistics check
Change-Id: I7702d270880ee61f4a0deb01a9b7589bb1017c44
diff --git a/lib/korap_rc.js b/lib/korap_rc.js
index 8e5a22f..a628dbf 100644
--- a/lib/korap_rc.js
+++ b/lib/korap_rc.js
@@ -168,105 +168,133 @@
async check_corpus_statistics(page, minTokenThreshold = 1000) {
try {
+ console.log(`Starting corpus statistics check with minTokenThreshold: ${minTokenThreshold}`);
+
// Navigate to the corpus view if not already there
+ console.log(`Navigating to: ${this.korap_url}`);
await page.goto(this.korap_url, { waitUntil: 'domcontentloaded' });
-
+ console.log("Navigation completed");
+
// Click the vc-choose element to open corpus selection
+ console.log("Waiting for #vc-choose selector...");
await page.waitForSelector('#vc-choose', { visible: true, timeout: 90000 });
+ console.log("Found #vc-choose, clicking...");
await page.click('#vc-choose');
-
+ console.log("Clicked #vc-choose");
+
// Wait a moment for the UI to respond
+ console.log("Waiting 1 second for UI to respond...");
await new Promise(resolve => setTimeout(resolve, 1000));
-
+
// Click the statistic element
+ console.log("Waiting for .statistic selector...");
await page.waitForSelector('.statistic', { visible: true, timeout: 90000 });
+ console.log("Found .statistic element, attempting to click...");
try {
await page.click('.statistic');
+ console.log("Successfully clicked .statistic element");
} catch (error) {
+ console.error(`Failed to click statistic element: ${error.message}`);
throw new Error(`Failed to click statistic element: ${error.message}`);
}
+
+ // Wait for statistics to load in two stages: the dd structure first, then a populated value
+ console.log("Waiting for token statistics to load...");
- // Wait for statistics to load and token count to appear
+ // First, wait for any dd elements to appear (basic structure)
+ await page.waitForSelector('dd', { visible: true, timeout: 30000 });
+
+ // Then wait for the specific token statistics with a simplified check
await page.waitForFunction(() => {
- const tokenTitleElements = document.querySelectorAll('[title="tokens"], [title*="token"]');
- for (const element of tokenTitleElements) {
- let nextElement = element.nextElementSibling;
- while (nextElement) {
- if (nextElement.tagName.toLowerCase() === 'dd') {
- const text = nextElement.textContent || nextElement.innerText || '';
- const cleanedText = text.replace(/[,\.]/g, '');
- const numbers = cleanedText.match(/\d+/g);
- if (numbers && numbers.length > 0) {
- return true;
- }
- }
- nextElement = nextElement.nextElementSibling;
- }
- }
+ // Simplified check - look for any dd element whose first number exceeds 1000 (fixed value, independent of minTokenThreshold)
const ddElements = document.querySelectorAll('dd');
- for (const dd of ddElements) {
- const text = dd.textContent || dd.innerText || '';
+ for (let i = 0; i < ddElements.length; i++) {
+ const text = ddElements[i].textContent || ddElements[i].innerText || '';
const cleanedText = text.replace(/[,\.]/g, '');
const numbers = cleanedText.match(/\d+/g);
if (numbers && numbers.length > 0) {
- return true;
+ const num = parseInt(numbers[0], 10);
+ if (num > 1000) { // Found a substantial number, likely loaded
+ return true;
+ }
}
}
return false;
- }, { timeout: 60000 });
+ }, { timeout: 90000, polling: 1000 }); // Poll every second instead of continuously
// Look for the tokens count in a dd element that follows an element with title "tokens"
+ console.log(`Starting token count extraction with minThreshold: ${minTokenThreshold}`);
const tokenCount = await page.evaluate((minThreshold) => {
- console.log("Attempting to find token count within page.evaluate...");
- // Find the element with title "tokens"
+ // Find the element with title "tokens" first
const tokenTitleElements = document.querySelectorAll('[title="tokens"], [title*="token"]');
- for (const element of tokenTitleElements) {
+ for (let i = 0; i < tokenTitleElements.length; i++) {
+ const element = tokenTitleElements[i];
+
// Look for the next dd element
let nextElement = element.nextElementSibling;
- while (nextElement) {
+ let siblingCount = 0;
+ while (nextElement && siblingCount < 10) {
+ siblingCount++;
+
if (nextElement.tagName.toLowerCase() === 'dd') {
const text = nextElement.textContent || nextElement.innerText || '';
// Remove number separators (commas and periods) and extract number
const cleanedText = text.replace(/[,\.]/g, '');
const numbers = cleanedText.match(/\d+/g);
if (numbers && numbers.length > 0) {
- console.log(`Found token count from title element: ${numbers[0]}`);
- return parseInt(numbers[0], 10);
+ const tokenValue = parseInt(numbers[0], 10);
+ return tokenValue;
}
}
nextElement = nextElement.nextElementSibling;
}
}
-
+
// Alternative approach: look for dd elements that contain large numbers
const ddElements = document.querySelectorAll('dd');
- for (const dd of ddElements) {
+ const candidateTokenCounts = [];
+
+ for (let i = 0; i < ddElements.length; i++) {
+ const dd = ddElements[i];
const text = dd.textContent || dd.innerText || '';
// Remove separators and check if it's a large number (likely token count)
const cleanedText = text.replace(/[,\.]/g, '');
const numbers = cleanedText.match(/\d+/g);
if (numbers && numbers.length > 0) {
const num = parseInt(numbers[0], 10);
- // Use the provided threshold instead of hardcoded value
+
+ // Use the provided threshold to filter candidates
if (num > minThreshold) {
- console.log(`Found token count from dd element: ${num}`);
- return num;
+ candidateTokenCounts.push({ value: num, text: text, index: i });
}
}
}
-
- console.log("Could not find token count using any method.");
+
+ if (candidateTokenCounts.length > 0) {
+ // Return the largest candidate (most likely to be the total token count)
+ const bestCandidate = candidateTokenCounts.reduce((max, current) =>
+ current.value > max.value ? current : max
+ );
+ return bestCandidate.value;
+ }
+
return null;
}, minTokenThreshold);
-
+
+ console.log(`Token count extraction completed. Result: ${tokenCount}`);
+
if (tokenCount === null) {
+ console.error("ERROR: Token count extraction returned null");
throw new Error("Could not find token count in corpus statistics");
}
-
+
+ console.log(`SUCCESS: Found token count: ${tokenCount}, threshold was: ${minTokenThreshold}`);
return tokenCount;
-
+
} catch (error) {
+ console.error(`ERROR in check_corpus_statistics: ${error.message}`);
+ console.error("Full error stack:", error.stack);
throw new Error(`Failed to check corpus statistics: ${error.message}`);
}
}