Add check for min tokens in corpus Resolves #61 Change-Id: I75dec5876f4728d948ea2b6e1be143c4ba655aaa

commit: c8ffb2b9bc3484a073299e85f1b3d930788d14d2 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Jun 12 16:44:23 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Jun 12 17:02:12 2025 +0200
tree: 6b55a46f2f54cc49da322dd7e765b6fb17250768
parent: 35287a4777df0c4d94fc70fb8f224c39ccf1d4ee [diff]
diff --git a/lib/korap_rc.js b/lib/korap_rc.js
index 016a7db..21c1eb8 100644
--- a/lib/korap_rc.js
+++ b/lib/korap_rc.js

@@ -155,7 +155,77 @@
         if (glimpse_value) {
             await page.click("#glimpse")
         }
+    }
 
+    /**
+     * Open the corpus statistics panel and return the token count it shows.
+     *
+     * @param {object} page - Puppeteer-style page handle; navigated to this.korap_url.
+     * @param {number} [minTokenThreshold=1000] - Used only by the fallback scan:
+     *     a <dd> without a matching title must exceed it to be accepted.
+     * @returns {Promise<number>} Token count parsed from the statistics view.
+     * @throws {Error} When navigation, a selector wait or the lookup fails; the
+     *     underlying error is attached as `cause`.
+     */
+    async check_corpus_statistics(page, minTokenThreshold = 1000) {
+        try {
+            await page.goto(this.korap_url, { waitUntil: 'networkidle2' });
+
+            // Click the vc-choose element to open corpus selection, then give the UI a moment.
+            await page.waitForSelector('#vc-choose', { visible: true, timeout: 10000 });
+            await page.click('#vc-choose');
+            await new Promise(resolve => setTimeout(resolve, 1000));
+
+            // Click the statistic element and wait for the statistics to load.
+            await page.waitForSelector('.statistic', { visible: true, timeout: 10000 });
+            await page.click('.statistic');
+            await new Promise(resolve => setTimeout(resolve, 3000));
+
+            const tokenCount = await page.evaluate((minThreshold) => {
+                // Parse the first integer in an element's text, or null if there is none.
+                // Commas and periods are dropped everywhere; whitespace and apostrophes
+                // only between digits, so "1 234 567" or "1'234'567" is not cut to 1.
+                const parseCount = (el) => {
+                    const text = el.textContent || el.innerText || '';
+                    const cleaned = text.replace(/[,.]|(?<=\d)[\s'\u2019](?=\d)/g, '');
+                    const digits = cleaned.match(/\d+/);
+                    return digits ? Number.parseInt(digits[0], 10) : null;
+                };
+
+                // Preferred: first numeric <dd> sibling after an element whose title mentions "token".
+                const labels = document.querySelectorAll('[title="tokens"], [title*="token"]');
+                for (const label of labels) {
+                    for (let sib = label.nextElementSibling; sib; sib = sib.nextElementSibling) {
+                        if (sib.tagName.toLowerCase() !== 'dd') {
+                            continue;
+                        }
+                        const count = parseCount(sib);
+                        if (count !== null) {
+                            return count;
+                        }
+                    }
+                }
+
+                // Fallback heuristic: the first <dd> anywhere in the document whose
+                // number exceeds the threshold is taken to be the token count.
+                for (const dd of document.querySelectorAll('dd')) {
+                    const count = parseCount(dd);
+                    if (count !== null && count > minThreshold) {
+                        return count;
+                    }
+                }
+                return null;
+            }, minTokenThreshold);
+
+            if (tokenCount === null) {
+                throw new Error("Could not find token count in corpus statistics");
+            }
+
+            return tokenCount;
+        } catch (error) {
+            // Keep the original error (and its stack) reachable through `cause`.
+            throw new Error(`Failed to check corpus statistics: ${error.message}`, { cause: error });
+        }
     }
 }
commit	c8ffb2b9bc3484a073299e85f1b3d930788d14d2	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Jun 12 16:44:23 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Jun 12 17:02:12 2025 +0200
tree	6b55a46f2f54cc49da322dd7e765b6fb17250768
parent	35287a4777df0c4d94fc70fb8f224c39ccf1d4ee [diff]