// blob: 5a650a4d3eb7316f7ebbaba8a709326b9894e9f6 [file] [log] [blame]
// Assertion helper (chai) used by KorAPRC.search to verify the query field exists.
const chai = require('chai');
const assert = chai.assert;

/**
 * Browser-automation helper for a KorAP web front end.
 *
 * Every method takes a `page` object and drives it through `goto`, `$`,
 * `click`, `waitForSelector`, `waitForNavigation`, `waitForFunction`,
 * `evaluate` and `keyboard` calls (the API shape matches a Puppeteer Page --
 * NOTE(review): confirm which driver the callers actually use).
 */
class KorAPRC {
    /** Base URL of the KorAP instance under test. */
    korap_url = "";

    /**
     * @param {string} korap_url - Base URL that `login`, `logout` and
     *   `check_corpus_statistics` navigate to.
     */
    constructor(korap_url) {
        this.korap_url = korap_url;
    }

    /**
     * Factory equivalent of `new KorAPRC(korap_url)`.
     *
     * @param {string} korap_url - Base URL of the instance.
     * @returns {KorAPRC} A new helper instance.
     */
    static new(korap_url) {
        return new KorAPRC(korap_url);
    }

    /**
     * Opens the base URL and logs in through the dropdown login form.
     *
     * The page is loaded even when the credentials are empty, so the caller
     * always ends up on the start page.
     *
     * @param {object} page - Browser page to drive.
     * @param {string} username - Value typed into `input[name=handle_or_email]`.
     * @param {string} password - Value typed into `input[name=pwd]`.
     * @returns {Promise<boolean>} `true` when a `.logout` element exists after
     *   the login navigation; `false` for empty credentials, missing form
     *   fields, or any error (errors are logged, never thrown).
     */
    async login(page, username, password) {
        try {
            await page.goto(this.korap_url, { waitUntil: 'domcontentloaded' });
            // Empty, null and undefined credentials are all treated as "cannot log in".
            if (!username || !password) return false;

            await page.waitForSelector('.dropdown-btn', { visible: true });
            await page.click('.dropdown-btn');
            await page.waitForSelector('input[name=handle_or_email]', { visible: true });

            const username_field = await page.$("input[name=handle_or_email]");
            if (username_field == null) return false;
            await username_field.focus();
            await username_field.type(username);

            const password_field = await page.$("input[name=pwd]");
            if (password_field == null) return false;
            await password_field.focus();
            await page.keyboard.type(password);

            // Register the navigation wait BEFORE pressing Enter; otherwise a fast
            // navigation can complete first and the wait would run into its timeout.
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
                page.keyboard.press("Enter"),
            ]);

            // The query field being visible confirms that the page has loaded.
            await page.waitForSelector("#q-field", { visible: true });
            const logout = await page.$(".logout");
            return logout != null;
        } catch (error) {
            console.error(`Login failed: ${error.message}`);
            return false;
        }
    }

    /**
     * Types `query` into `#q-field`, submits it and returns the number of hits
     * reported by the result page.
     *
     * The count is taken from the first of several candidate elements that
     * contains a number, then from the document title, and finally from the
     * number of `ol li` items. Digit groups joined by `,` or `.` (for example
     * "1,234,567") are read as one number.
     *
     * @param {object} page - Browser page to drive; must already show `#q-field`.
     * @param {string} query - Query text to submit.
     * @returns {Promise<number>} The largest number found in the chosen source.
     * @throws {Error} "Failed to perform search: ..." wrapping any failure,
     *   including the case where no count can be found.
     */
    async search(page, query) {
        try {
            await page.waitForSelector("#q-field", { visible: true });
            const query_field = await page.$("#q-field");
            assert.notEqual(query_field, null, "Query field not found");

            // Triple click selects the existing text so typing replaces it.
            await query_field.click({ clickCount: 3 });
            await page.keyboard.type(query);

            // Start waiting for the navigation before triggering it (avoids a race).
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
                page.keyboard.press("Enter"),
            ]);

            // Wait for search results to be fully loaded
            try {
                await page.waitForSelector('ol li, #resultinfo, .result-item', {
                    visible: true,
                    timeout: 15000
                });
                // Give additional time for the results count to be populated
                await new Promise(resolve => setTimeout(resolve, 2000));
            } catch (error) {
                // Deliberate best effort: on timeout the fallbacks below decide
                // whether a count can still be determined.
            }

            const resultsInfo = await page.evaluate(() => {
                // Runs in the page context, so the helper has to live in here.
                // A run of 1-3 digits followed by ",ddd" / ".ddd" groups is one
                // number with thousands separators; anything else is plain digits.
                const extractNumbers = (text) => {
                    const tokens = text.match(/\d{1,3}(?:[.,]\d{3})+(?!\d)|\d+/g);
                    return tokens ? tokens.map(token => token.replace(/[.,]/g, '')) : null;
                };

                // Check common selectors for result counts
                const selectors = [
                    '#total-results',
                    '#resultinfo',
                    '.result-count',
                    '.total-results',
                    '[data-results]',
                    '.found'
                ];

                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (element) {
                        const text = element.textContent || element.innerText || '';
                        const numbers = extractNumbers(text);
                        if (numbers && numbers.length > 0) {
                            return { selector: selector, numbers: numbers };
                        }
                    }
                }

                // Look in the page title for results count
                const title = document.title;
                if (title) {
                    const numbers = extractNumbers(title);
                    if (numbers && numbers.length > 0) {
                        return { selector: 'title', numbers: numbers };
                    }
                }

                // Count the actual result items as fallback
                const resultItems = document.querySelectorAll('ol li');
                if (resultItems.length > 0) {
                    return {
                        selector: 'counted-items',
                        numbers: [resultItems.length.toString()]
                    };
                }

                return null;
            });

            if (!resultsInfo || !resultsInfo.numbers || resultsInfo.numbers.length === 0) {
                // Final fallback: count the list items once more, in case they
                // were rendered after the first inspection.
                const itemCount = await page.evaluate(() => {
                    return document.querySelectorAll('ol li').length;
                });

                if (itemCount > 0) {
                    return itemCount;
                }

                throw new Error("Cannot find any results count on the page");
            }

            // Extract the largest number found (likely the total results)
            const hits = Math.max(...resultsInfo.numbers.map(n => parseInt(n, 10)));
            return hits;
        } catch (error) {
            throw new Error(`Failed to perform search: ${error.message}`, { cause: error });
        }
    }

    /**
     * Logs out by navigating to `<current path>/logout`, then returns to the
     * base URL so that subsequent tests start from a clean page.
     *
     * Query string and fragment of the current URL are dropped before
     * `/logout` is appended; after a search the URL carries `?q=...`, and
     * appending to the raw string would not produce a logout URL.
     *
     * @param {object} page - Browser page to drive.
     * @returns {Promise<boolean>} `true` when both navigations succeed,
     *   `false` otherwise (the error is logged, never thrown).
     */
    async logout(page) {
        try {
            const logoutUrl = new URL(await page.url());
            logoutUrl.search = '';
            logoutUrl.hash = '';
            logoutUrl.pathname = logoutUrl.pathname.replace(/\/$/, '') + '/logout';

            await page.goto(logoutUrl.href, { waitUntil: 'domcontentloaded', timeout: 10000 });

            // Navigate back to main page to ensure clean state for subsequent tests
            await page.goto(this.korap_url, { waitUntil: 'domcontentloaded', timeout: 10000 });

            return true;
        } catch (error) {
            console.error(`Logout failed: ${error.message}`);
            return false;
        }
    }

    /**
     * Makes sure the `input[name=cutoff]` checkbox ("glimpse") is unchecked.
     *
     * Does nothing when the checkbox is missing or already unchecked.
     * Otherwise it clicks the first displayed control among `.glimpse` and
     * `#glimpse`, and as a last resort clears the checkbox state directly.
     *
     * @param {object} page - Browser page to drive.
     * @returns {Promise<void>}
     */
    async assure_glimpse_off(page) {
        // Get the cutoff checkbox - works in both old and new Kalamar versions
        const glimpse = await page.$("input[name=cutoff]");
        if (!glimpse) {
            console.log("Glimpse checkbox not found, skipping");
            return;
        }

        const glimpse_value = await (await glimpse.getProperty('checked')).jsonValue();
        if (!glimpse_value) {
            return;
        }

        // Try new Kalamar version first (toggle button with class 'glimpse')
        const newGlimpseButton = await page.$(".glimpse");
        if (newGlimpseButton) {
            const isVisible = await page.evaluate(el => {
                const style = window.getComputedStyle(el);
                return style.display !== 'none' && style.visibility !== 'hidden';
            }, newGlimpseButton);
            if (isVisible) {
                await newGlimpseButton.click();
                return;
            }
        }

        // Fall back to old Kalamar version (label with id 'glimpse');
        // here the style of the parent node decides whether it is displayed.
        const oldGlimpseLabel = await page.$("#glimpse");
        if (oldGlimpseLabel) {
            const isVisible = await page.evaluate(el => {
                const style = window.getComputedStyle(el.parentNode || el);
                return style.display !== 'none' && style.visibility !== 'hidden';
            }, oldGlimpseLabel);
            if (isVisible) {
                await page.click("#glimpse");
                return;
            }
        }

        // Last resort: directly toggle the checkbox via JavaScript
        await page.evaluate(() => {
            const checkbox = document.querySelector("input[name=cutoff]");
            if (checkbox) checkbox.checked = false;
        });
    }

    /**
     * Opens the corpus statistics panel on the base URL and returns the
     * token count shown there.
     *
     * The count is read from the first `dd` following an element whose title
     * contains "token". If none is found, the largest `dd` number above
     * `minTokenThreshold` is returned.
     *
     * @param {object} page - Browser page to drive.
     * @param {number} [minTokenThreshold=1000] - Lower bound used when picking
     *   a `dd` value in the fallback search. Values below 1000 also lower the
     *   bound used to decide that the statistics have loaded.
     * @returns {Promise<number>} The extracted token count.
     * @throws {Error} "Failed to check corpus statistics: ..." wrapping any
     *   failure, including a missing token count.
     */
    async check_corpus_statistics(page, minTokenThreshold = 1000) {
        try {
            console.log(`Starting corpus statistics check with minTokenThreshold: ${minTokenThreshold}`);

            // Navigate to the corpus view if not already there
            console.log(`Navigating to: ${this.korap_url}`);
            await page.goto(this.korap_url, { waitUntil: 'domcontentloaded' });
            console.log("Navigation completed");

            // Click the vc-choose element to open corpus selection
            console.log("Waiting for #vc-choose selector...");
            await page.waitForSelector('#vc-choose', { visible: true, timeout: 90000 });
            console.log("Found #vc-choose, clicking...");
            await page.click('#vc-choose');
            console.log("Clicked #vc-choose");

            // Wait a moment for the UI to respond
            console.log("Waiting 1 second for UI to respond...");
            await new Promise(resolve => setTimeout(resolve, 1000));

            // Click the statistic element
            console.log("Waiting for .statistic selector...");
            await page.waitForSelector('.statistic', { visible: true, timeout: 90000 });
            console.log("Found .statistic element, attempting to click...");
            try {
                await page.click('.statistic');
                console.log("Successfully clicked .statistic element");
            } catch (error) {
                console.error(`Failed to click statistic element: ${error.message}`);
                throw new Error(`Failed to click statistic element: ${error.message}`, { cause: error });
            }

            // Wait for statistics to load with a more efficient approach
            console.log("Waiting for token statistics to load...");

            // First, wait for any dd elements to appear (basic structure)
            await page.waitForSelector('dd', { visible: true, timeout: 30000 });

            // The "loaded" heuristic looks for a dd value above 1000. If the caller
            // accepts smaller corpora, lower the bound accordingly; otherwise the
            // wait could never succeed for a corpus the caller considers valid.
            const loadedThreshold = Number.isFinite(minTokenThreshold)
                ? Math.min(1000, minTokenThreshold)
                : 1000;

            // Then wait for the specific token statistics with a simplified check
            await page.waitForFunction((threshold) => {
                // Simplified check - look for any dd element with a large number
                const ddElements = document.querySelectorAll('dd');
                for (let i = 0; i < ddElements.length; i++) {
                    const text = ddElements[i].textContent || ddElements[i].innerText || '';
                    const cleanedText = text.replace(/[,\.]/g, '');
                    const numbers = cleanedText.match(/\d+/g);
                    if (numbers && numbers.length > 0) {
                        const num = parseInt(numbers[0], 10);
                        if (num > threshold) { // Found a substantial number, likely loaded
                            return true;
                        }
                    }
                }
                return false;
            }, { timeout: 90000, polling: 1000 }, loadedThreshold); // Poll every second instead of continuously

            // Look for the tokens count in a dd element that follows an element with title "tokens"
            console.log(`Starting token count extraction with minThreshold: ${minTokenThreshold}`);
            const tokenCount = await page.evaluate((minThreshold) => {
                // Find the element with title "tokens" first
                const tokenTitleElements = document.querySelectorAll('[title="tokens"], [title*="token"]');

                for (let i = 0; i < tokenTitleElements.length; i++) {
                    // Inspect at most 10 following siblings for the next dd element
                    let nextElement = tokenTitleElements[i].nextElementSibling;
                    let siblingCount = 0;
                    while (nextElement && siblingCount < 10) {
                        siblingCount++;

                        if (nextElement.tagName.toLowerCase() === 'dd') {
                            const text = nextElement.textContent || nextElement.innerText || '';
                            // Remove number separators (commas and periods) and extract number
                            const cleanedText = text.replace(/[,\.]/g, '');
                            const numbers = cleanedText.match(/\d+/g);
                            if (numbers && numbers.length > 0) {
                                return parseInt(numbers[0], 10);
                            }
                        }
                        nextElement = nextElement.nextElementSibling;
                    }
                }

                // Alternative approach: take the largest dd number above the threshold
                // (most likely to be the total token count)
                const ddElements = document.querySelectorAll('dd');
                let best = null;

                for (let i = 0; i < ddElements.length; i++) {
                    const text = ddElements[i].textContent || ddElements[i].innerText || '';
                    const cleanedText = text.replace(/[,\.]/g, '');
                    const numbers = cleanedText.match(/\d+/g);
                    if (numbers && numbers.length > 0) {
                        const num = parseInt(numbers[0], 10);
                        if (num > minThreshold && (best === null || num > best)) {
                            best = num;
                        }
                    }
                }

                return best;
            }, minTokenThreshold);

            console.log(`Token count extraction completed. Result: ${tokenCount}`);

            if (tokenCount === null) {
                console.error("ERROR: Token count extraction returned null");
                throw new Error("Could not find token count in corpus statistics");
            }

            console.log(`SUCCESS: Found token count: ${tokenCount}, threshold was: ${minTokenThreshold}`);
            return tokenCount;

        } catch (error) {
            console.error(`ERROR in check_corpus_statistics: ${error.message}`);
            console.error("Full error stack:", error.stack);
            throw new Error(`Failed to check corpus statistics: ${error.message}`, { cause: error });
        }
    }
}

// Export the helper class for use by the test suites (CommonJS).
module.exports = KorAPRC;