Add check for min tokens in corpus

Resolves #61

Change-Id: I75dec5876f4728d948ea2b6e1be143c4ba655aaa
diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml
index f9a39d7..a565077 100644
--- a/.github/workflows/ci_test.yml
+++ b/.github/workflows/ci_test.yml
@@ -9,6 +9,7 @@
       KORAP_URL: https://korap.ids-mannheim.de/
       KORAP_LOGIN: ""
       KORAP_QUERIES: 'geht, [orth=geht & tt/pos=VVFIN]'
+      KORAP_MIN_TOKENS_IN_CORPUS: "22000000000"
     steps:
       - name: Checkout KorAP-E2E-Tests
         uses: actions/checkout@v4
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ae9025a..6632f6b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,6 +8,7 @@
   # Default test configuration
   KORAP_URL: "https://korap.ids-mannheim.de/"
   KORAP_QUERIES: 'geht, [orth=geht & tt/p="VVFIN"]'
+  KORAP_MIN_TOKENS_IN_CORPUS: "22000000000"
   LC_ALL: "C"
   
   # Node.js configuration
diff --git a/GITLAB_CI_SETUP.md b/GITLAB_CI_SETUP.md
index 3a77789..347cf51 100644
--- a/GITLAB_CI_SETUP.md
+++ b/GITLAB_CI_SETUP.md
@@ -49,6 +49,10 @@
 - **Default**: `geht, [orth=geht & tt/p="VVFIN"]`
 - **Description**: Comma-separated list of queries to test
 
+#### KORAP_MIN_TOKENS_IN_CORPUS
+- **Default**: `100000` (test-suite fallback when the variable is not set)
+- **Description**: Minimum expected number of tokens in the corpus for the corpus statistics test
+
 ## Pipeline Triggers
 
 The CI pipeline will run:
@@ -131,6 +135,7 @@
 export KORAP_PASSWORD="your-password"
 export KORAP_URL="https://korap.ids-mannheim.de/"
 export KORAP_QUERIES='geht, [orth=geht & tt/p="VVFIN"]'
+export KORAP_MIN_TOKENS_IN_CORPUS="1000"
 export LC_ALL="C"
 
 npm test
diff --git a/Readme.md b/Readme.md
index 93bc538..fa8d79d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -18,7 +18,7 @@
 
 ```bash
 KORAP_URL="http://localhost:64543" KORAP_LOGIN="user2" KORAP_PWD="password2"\
- KORAP_QUERIES='geht, [orth=geht & cmc/pos=VVFIN]'\
+ KORAP_QUERIES='geht, [orth=geht & cmc/pos=VVFIN]' KORAP_MIN_TOKENS_IN_CORPUS="100000"\
  npm test
 ```
 
@@ -28,6 +28,7 @@
 ### Comments on Environment Variables
 
 - Use `KORAP_LOGIN="" npm test` to skip login and logout tests, e.g. to run tests against Kustvakt-lite.
+- Use `KORAP_MIN_TOKENS_IN_CORPUS` to set the minimum expected number of tokens in the corpus for the corpus statistics test (default: 100000).
 - The tests respect the current locale, consider e.g. `LC_ALL=C npm test`
 
 ## GitLab CI/CD
diff --git a/lib/korap_rc.js b/lib/korap_rc.js
index 016a7db..21c1eb8 100644
--- a/lib/korap_rc.js
+++ b/lib/korap_rc.js
@@ -155,7 +155,77 @@
         if (glimpse_value) {
             await page.click("#glimpse")
         }
+    }
 
+    /**
+     * Open the corpus statistics panel and read the token count it displays.
+     *
+     * Loads `this.korap_url`, clicks '#vc-choose' and then '.statistic', and parses
+     * the number found in the page's <dd> elements.
+     *
+     * @param {object} page - Browser page handle (goto/waitForSelector/click/evaluate).
+     * @param {number} [minTokenThreshold=1000] - Only used by the fallback scan: the
+     *     first <dd> number that reaches this value is taken to be the token count.
+     * @returns {Promise<number>} Token count parsed from the statistics panel.
+     * @throws {Error} If navigation, a selector wait, or the token lookup fails; the
+     *     underlying error is attached as `cause`.
+     */
+    async check_corpus_statistics(page, minTokenThreshold = 1000) {
+        try {
+            await page.goto(this.korap_url, { waitUntil: 'networkidle2' });
+
+            // Open the corpus chooser, then give the UI a moment to respond.
+            await page.waitForSelector('#vc-choose', { visible: true, timeout: 10000 });
+            await page.click('#vc-choose');
+            await new Promise(resolve => setTimeout(resolve, 1000));
+
+            // Request the statistics and wait for them to load.
+            await page.waitForSelector('.statistic', { visible: true, timeout: 10000 });
+            await page.click('.statistic');
+            await new Promise(resolve => setTimeout(resolve, 3000));
+
+            const tokenCount = await page.evaluate((minThreshold) => {
+                // Evaluated inside the page, so every helper has to be defined in here.
+                // A count may be rendered with locale-specific group separators (comma,
+                // period, apostrophe, plain or no-break space): take the first such
+                // digit run and drop everything that is not a digit before parsing.
+                const parseCount = (node) => {
+                    const match = (node.textContent || node.innerText || '').match(/\d[\d\s,.'\u2019]*/);
+                    return match ? Number.parseInt(match[0].replace(/\D/g, ''), 10) : null;
+                };
+
+                // Preferred: first numeric <dd> sibling after an element titled "...token...".
+                for (const titled of document.querySelectorAll('[title*="token"]')) {
+                    let sibling = titled.nextElementSibling;
+                    while (sibling) {
+                        const count = sibling.tagName.toLowerCase() === 'dd' ? parseCount(sibling) : null;
+                        if (count !== null) {
+                            return count;
+                        }
+                        sibling = sibling.nextElementSibling;
+                    }
+                }
+
+                // Fallback: first <dd> anywhere whose number reaches the threshold.
+                for (const dd of document.querySelectorAll('dd')) {
+                    const count = parseCount(dd);
+                    if (count !== null && count >= minThreshold) {
+                        return count;
+                    }
+                }
+
+                return null;
+            }, minTokenThreshold);
+
+            if (tokenCount === null) {
+                throw new Error("Could not find token count in corpus statistics");
+            }
+
+            return tokenCount;
+        } catch (error) {
+            // Keep the original error (and its stack) reachable for debugging.
+            throw new Error(`Failed to check corpus statistics: ${error.message}`, { cause: error });
+        }
     }
 }
 
diff --git a/test/korap-ui.js b/test/korap-ui.js
index 2c42858..708d012 100644
--- a/test/korap-ui.js
+++ b/test/korap-ui.js
@@ -17,6 +17,7 @@
 const KORAP_LOGIN = 'KORAP_USERNAME' in process.env ? process.env.KORAP_USERNAME : 'KORAP_LOGIN' in process.env ? process.env.KORAP_LOGIN : "user2"
 const KORAP_PWD = process.env.KORAP_PWD || process.env.KORAP_PASSWORD || "password2";
 const KORAP_QUERIES = process.env.KORAP_QUERIES || 'geht, [orth=geht & cmc/pos=VVFIN]'
+const KORAP_MIN_TOKENS_IN_CORPUS = parseInt(process.env.KORAP_MIN_TOKENS_IN_CORPUS || "100000", 10); // minimum token count passed to the corpus statistics test; falls back to 100000 when the variable is unset or empty
 const korap_rc = require('../lib/korap_rc.js').new(KORAP_URL)
 
 const slack_webhook = process.env.SLACK_WEBHOOK_URL;
@@ -96,6 +97,14 @@
             await korap_rc.assure_glimpse_off(page)
         }))
 
+    it('Corpus statistics show sufficient tokens', async () => {
+        // Read the token count from the corpus statistics view and compare it with the configured minimum.
+        const minTokens = KORAP_MIN_TOKENS_IN_CORPUS;
+        const tokenCount = await korap_rc.check_corpus_statistics(page, minTokens);
+        console.log(`Found ${tokenCount} tokens in corpus, minimum required: ${minTokens}`);
+        tokenCount.should.be.above(minTokens - 1, `Corpus should have at least ${minTokens} tokens, but found ${tokenCount}`);
+    }).timeout(25000)
+
     describe('Running searches that should have hits', () => {
 
         before(async () => { await korap_rc.login(page, KORAP_LOGIN, KORAP_PWD) })