Add check for min tokens in corpus
Resolves #61
Change-Id: I75dec5876f4728d948ea2b6e1be143c4ba655aaa
diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml
index f9a39d7..a565077 100644
--- a/.github/workflows/ci_test.yml
+++ b/.github/workflows/ci_test.yml
@@ -9,6 +9,7 @@
KORAP_URL: https://korap.ids-mannheim.de/
KORAP_LOGIN: ""
KORAP_QUERIES: 'geht, [orth=geht & tt/pos=VVFIN]'
+ KORAP_MIN_TOKENS_IN_CORPUS: "22000000000"
steps:
- name: Checkout KorAP-E2E-Tests
uses: actions/checkout@v4
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ae9025a..6632f6b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,6 +8,7 @@
# Default test configuration
KORAP_URL: "https://korap.ids-mannheim.de/"
KORAP_QUERIES: 'geht, [orth=geht & tt/p="VVFIN"]'
+ KORAP_MIN_TOKENS_IN_CORPUS: "22000000000"
LC_ALL: "C"
# Node.js configuration
diff --git a/GITLAB_CI_SETUP.md b/GITLAB_CI_SETUP.md
index 3a77789..347cf51 100644
--- a/GITLAB_CI_SETUP.md
+++ b/GITLAB_CI_SETUP.md
@@ -49,6 +49,10 @@
- **Default**: `geht, [orth=geht & tt/p="VVFIN"]`
- **Description**: Comma-separated list of queries to test
+#### KORAP_MIN_TOKENS_IN_CORPUS
+- **Default**: `100000` (fallback used by the test suite when the variable is unset)
+- **Description**: Minimum expected number of tokens in the corpus for the corpus statistics test
+
## Pipeline Triggers
The CI pipeline will run:
@@ -131,6 +135,7 @@
export KORAP_PASSWORD="your-password"
export KORAP_URL="https://korap.ids-mannheim.de/"
export KORAP_QUERIES='geht, [orth=geht & tt/p="VVFIN"]'
+export KORAP_MIN_TOKENS_IN_CORPUS="1000"
export LC_ALL="C"
npm test
diff --git a/Readme.md b/Readme.md
index 93bc538..fa8d79d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -18,7 +18,7 @@
```bash
KORAP_URL="http://localhost:64543" KORAP_LOGIN="user2" KORAP_PWD="password2"\
- KORAP_QUERIES='geht, [orth=geht & cmc/pos=VVFIN]'\
+ KORAP_QUERIES='geht, [orth=geht & cmc/pos=VVFIN]' KORAP_MIN_TOKENS_IN_CORPUS="100000"\
npm test
```
@@ -28,6 +28,7 @@
### Comments on Environment Variables
- Use `KORAP_LOGIN="" npm test` to skip login and logout tests, e.g. to run tests against Kustvakt-lite.
+- Use `KORAP_MIN_TOKENS_IN_CORPUS` to set the minimum expected number of tokens in the corpus for the corpus statistics test (default: 100000).
- The tests respect the current locale, consider e.g. `LC_ALL=C npm test`
## GitLab CI/CD
diff --git a/lib/korap_rc.js b/lib/korap_rc.js
index 016a7db..21c1eb8 100644
--- a/lib/korap_rc.js
+++ b/lib/korap_rc.js
@@ -155,7 +155,77 @@
if (glimpse_value) {
await page.click("#glimpse")
}
+ }
+ async check_corpus_statistics(page, minTokenThreshold = 1000) {
+ try {
+ // Reads the token count from the corpus statistics shown in the UI and returns it as an integer; minTokenThreshold (default 1000) only affects the fallback lookup further down
+ await page.goto(this.korap_url, { waitUntil: 'networkidle2' }); // always reloads this.korap_url, regardless of the current page
+
+ // Wait (max. 10 s) until #vc-choose is visible, then click it to open the corpus selection
+ await page.waitForSelector('#vc-choose', { visible: true, timeout: 10000 });
+ await page.click('#vc-choose');
+
+ // Fixed 1 s pause to let the UI react to the click; no explicit readiness check is made here
+ await new Promise(resolve => setTimeout(resolve, 1000));
+
+ // Wait (max. 10 s) until an element with class statistic is visible, then click it
+ await page.waitForSelector('.statistic', { visible: true, timeout: 10000 });
+ await page.click('.statistic');
+
+ // Fixed 3 s pause for the statistics to load; nothing verifies that they actually arrived before the lookup below
+ await new Promise(resolve => setTimeout(resolve, 3000));
+
+ // The callback runs in the page context; minTokenThreshold is handed in as minThreshold. It yields the parsed count, or null if nothing matched
+ const tokenCount = await page.evaluate((minThreshold) => {
+ // Primary lookup: elements whose title is exactly "tokens" or merely contains "token"
+ const tokenTitleElements = document.querySelectorAll('[title="tokens"], [title*="token"]');
+
+ for (const element of tokenTitleElements) {
+ // Walk the following siblings; the first dd element that contains digits supplies the count
+ let nextElement = element.nextElementSibling;
+ while (nextElement) {
+ if (nextElement.tagName.toLowerCase() === 'dd') {
+ const text = nextElement.textContent || nextElement.innerText || '';
+ // Strip "," and "." (thousands separators), e.g. "1.234.567" becomes "1234567"; the first digit run is parsed as the result
+ const cleanedText = text.replace(/[,\.]/g, '');
+ const numbers = cleanedText.match(/\d+/g);
+ if (numbers && numbers.length > 0) {
+ return parseInt(numbers[0], 10);
+ }
+ }
+ nextElement = nextElement.nextElementSibling;
+ }
+ }
+
+ // Fallback when no titled element yielded a number: take the first dd element anywhere in the document whose number exceeds minThreshold
+ const ddElements = document.querySelectorAll('dd');
+ for (const dd of ddElements) {
+ const text = dd.textContent || dd.innerText || '';
+ // Same separator stripping as above; only the first digit run of each dd element is considered
+ const cleanedText = text.replace(/[,\.]/g, '');
+ const numbers = cleanedText.match(/\d+/g);
+ if (numbers && numbers.length > 0) {
+ const num = parseInt(numbers[0], 10);
+ // NOTE(review): a value returned here is greater than minThreshold by construction, so a caller asserting count >= threshold always passes on this path
+ if (num > minThreshold) {
+ return num;
+ }
+ }
+ }
+
+ return null;
+ }, minTokenThreshold);
+
+ if (tokenCount === null) {
+ throw new Error("Could not find token count in corpus statistics");
+ }
+
+ return tokenCount;
+ // Every failure above (selector timeouts, missing count) is re-thrown with a common prefix; only error.message is carried over, the original error object is not attached
+ } catch (error) {
+ throw new Error(`Failed to check corpus statistics: ${error.message}`);
+ }
+ }
}
diff --git a/test/korap-ui.js b/test/korap-ui.js
index 2c42858..708d012 100644
--- a/test/korap-ui.js
+++ b/test/korap-ui.js
@@ -17,6 +17,7 @@
const KORAP_LOGIN = 'KORAP_USERNAME' in process.env ? process.env.KORAP_USERNAME : 'KORAP_LOGIN' in process.env ? process.env.KORAP_LOGIN : "user2"
const KORAP_PWD = process.env.KORAP_PWD || process.env.KORAP_PASSWORD || "password2";
const KORAP_QUERIES = process.env.KORAP_QUERIES || 'geht, [orth=geht & cmc/pos=VVFIN]'
+const KORAP_MIN_TOKENS_IN_CORPUS = parseInt(process.env.KORAP_MIN_TOKENS_IN_CORPUS || "100000", 10);
const korap_rc = require('../lib/korap_rc.js').new(KORAP_URL)
const slack_webhook = process.env.SLACK_WEBHOOK_URL;
@@ -96,6 +97,14 @@
await korap_rc.assure_glimpse_off(page)
}))
+ it('Corpus statistics show sufficient tokens',
+ (async () => {
+ const tokenCount = await korap_rc.check_corpus_statistics(page, KORAP_MIN_TOKENS_IN_CORPUS);
+ console.log(`Found ${tokenCount} tokens in corpus, minimum required: ${KORAP_MIN_TOKENS_IN_CORPUS}`);
+ tokenCount.should.be.above(KORAP_MIN_TOKENS_IN_CORPUS - 1, // above(min - 1) expresses "at least min" for integer counts
+ `Corpus should have at least ${KORAP_MIN_TOKENS_IN_CORPUS} tokens, but found ${tokenCount}`);
+ })).timeout(25000) // NOTE(review): the helper may wait up to 10 s + 1 s + 10 s + 3 s plus the page load, which is close to this limit
+
describe('Running searches that should have hits', () => {
before(async () => { await korap_rc.login(page, KORAP_LOGIN, KORAP_PWD) })