blob: a628dbfabb479824611b48b8efc4dfdb199b9205 [file] [log] [blame]
Marc Kupietz55fc3162022-12-04 16:25:49 +01001const chai = require('chai');
2const assert = chai.assert;
Marc Kupietz5e45a2f2022-12-03 15:32:40 +01003
/**
 * Remote control for a KorAP web front end, driven through a browser `page`
 * object (the calls used here match Puppeteer's Page API).
 *
 * All methods take the page as their first argument; the instance itself only
 * remembers the base URL of the KorAP installation under test.
 */
class KorAPRC {
    korap_url = "";

    /**
     * @param {string} korap_url - Base URL of the KorAP instance.
     */
    constructor(korap_url) {
        this.korap_url = korap_url;
    }

    /**
     * Factory equivalent of `new KorAPRC(korap_url)`.
     *
     * @param {string} korap_url - Base URL of the KorAP instance.
     * @returns {KorAPRC}
     */
    static new(korap_url) {
        return new KorAPRC(korap_url);
    }

    /**
     * Opens the start page and logs in through the dropdown login form.
     *
     * The start page is loaded even when the credentials are empty, so the
     * page is at `korap_url` afterwards in either case.
     *
     * @param {object} page - Browser page to drive.
     * @param {string} username - Handle or e-mail address.
     * @param {string} password - Password.
     * @returns {Promise<boolean>} `true` when a `.logout` element is present
     *   after submitting; `false` on empty/missing credentials, a missing form
     *   field, or any error (errors are logged, never thrown).
     */
    async login(page, username, password) {
        try {
            await page.goto(this.korap_url, { waitUntil: 'domcontentloaded' });
            if (!username || !password) return false;

            await page.waitForSelector('.dropdown-btn', { visible: true });
            await page.click('.dropdown-btn');
            await page.waitForSelector('input[name=handle_or_email]', { visible: true });

            const username_field = await page.$("input[name=handle_or_email]");
            if (username_field == null) return false;
            await username_field.focus();
            await username_field.type(username);

            const password_field = await page.$("input[name=pwd]");
            if (password_field == null) return false;
            await password_field.focus();
            await page.keyboard.type(password);

            // Register the navigation wait BEFORE submitting: if the page
            // navigates faster than the wait is set up, the wait would
            // otherwise never see the navigation and run into its timeout.
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
                page.keyboard.press("Enter"),
            ]);

            // The query field confirms that the page after login is usable.
            await page.waitForSelector("#q-field", { visible: true });
            const logout = await page.$(".logout");
            return logout != null;
        } catch (error) {
            console.error(`Login failed: ${error.message}`);
            return false;
        }
    }

    /**
     * Types `query` into the query field, submits it and returns the number
     * of hits reported by the result page.
     *
     * The count is taken from the first of several known elements that
     * contains a number, then from the page title, then from the number of
     * `ol li` items. Of all numbers found in that text the largest is
     * returned. Thousands separators ("33,890", "33.890") are treated as part
     * of one number.
     *
     * @param {object} page - Browser page to drive.
     * @param {string} query - Query text to submit.
     * @returns {Promise<number>} Number of hits.
     * @throws {Error} "Failed to perform search: …" on any failure, including
     *   a result page without any detectable count; the original error is
     *   attached as `cause`.
     */
    async search(page, query) {
        try {
            await page.waitForSelector("#q-field", { visible: true });
            const query_field = await page.$("#q-field");
            assert.notEqual(query_field, null, "Query field not found");

            // A triple click selects the existing text so typing replaces it.
            await query_field.click({ clickCount: 3 });
            await page.keyboard.type(query);

            // Same ordering as in login(): wait first, then trigger.
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
                page.keyboard.press("Enter"),
            ]);

            try {
                await page.waitForSelector('ol li, #resultinfo, .result-item', {
                    visible: true,
                    timeout: 15000
                });
                // Give additional time for the results count to be populated.
                await new Promise(resolve => setTimeout(resolve, 2000));
            } catch {
                // Deliberately best effort: the extraction below has its own
                // fallbacks and reports a clear error if nothing is found.
            }

            const resultsInfo = await page.evaluate(() => {
                // Keeps a separator-formatted count together as one token and
                // otherwise matches plain digit runs.
                const numberPattern = /\d{1,3}(?:[.,]\d{3})+(?!\d)|\d+/g;
                const selectors = [
                    '#total-results',
                    '#resultinfo',
                    '.result-count',
                    '.total-results',
                    '[data-results]',
                    '.found'
                ];

                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (!element) continue;
                    const text = element.textContent || element.innerText || '';
                    const numbers = text.match(numberPattern);
                    if (numbers) return { selector, numbers };
                }

                const titleNumbers = (document.title || '').match(numberPattern);
                if (titleNumbers) return { selector: 'title', numbers: titleNumbers };

                const resultItems = document.querySelectorAll('ol li');
                if (resultItems.length > 0) {
                    return { selector: 'counted-items', numbers: [String(resultItems.length)] };
                }

                return null;
            });

            if (!resultsInfo?.numbers?.length) {
                // Second look at the list in case items appeared in the meantime.
                const itemCount = await page.evaluate(
                    () => document.querySelectorAll('ol li').length
                );
                if (itemCount > 0) return itemCount;

                throw new Error("Cannot find any results count on the page");
            }

            // The largest number in the matched text is taken as the total.
            const counts = resultsInfo.numbers.map(
                n => parseInt(n.replace(/[.,]/g, ''), 10)
            );
            return Math.max(...counts);
        } catch (error) {
            throw new Error(`Failed to perform search: ${error.message}`, { cause: error });
        }
    }

    /**
     * Logs out by navigating to `<current page>/logout` and then returns to
     * the start page so that following steps begin from a clean state.
     *
     * Query string and fragment of the current URL are dropped before
     * `/logout` is appended; otherwise the suffix would end up inside the
     * query string when called from a result page.
     *
     * @param {object} page - Browser page to drive.
     * @returns {Promise<boolean>} `true` when both navigations succeeded,
     *   `false` otherwise (the error is logged, never thrown).
     */
    async logout(page) {
        try {
            const logoutUrl = new URL(await page.url());
            logoutUrl.search = '';
            logoutUrl.hash = '';
            logoutUrl.pathname = `${logoutUrl.pathname.replace(/\/$/, '')}/logout`;

            await page.goto(logoutUrl.href, { waitUntil: 'domcontentloaded', timeout: 10000 });
            await page.goto(this.korap_url, { waitUntil: 'domcontentloaded', timeout: 10000 });

            return true;
        } catch (error) {
            console.error(`Logout failed: ${error.message}`);
            return false;
        }
    }

    /**
     * Unchecks the glimpse option if it is currently checked.
     *
     * The state is read from `input[name=cutoff]` and toggled by clicking
     * `#glimpse` (presumably the same control; verify against the page markup).
     *
     * @param {object} page - Browser page to drive.
     * @returns {Promise<void>}
     * @throws {Error} When the checkbox is not on the page.
     */
    async assure_glimpse_off(page) {
        const glimpse = await page.$("input[name=cutoff]");
        if (glimpse == null) {
            throw new Error('Glimpse checkbox "input[name=cutoff]" not found');
        }
        const checked = await (await glimpse.getProperty('checked')).jsonValue();
        if (checked) {
            await page.click("#glimpse");
        }
    }

    /**
     * Opens the corpus statistics view and returns the token count shown there.
     *
     * The count is read from the first `dd` that follows an element whose
     * `title` contains "token". If no such element yields a number, the
     * largest number above `minTokenThreshold` found in any `dd` is used.
     *
     * Note that the readiness check waits for a `dd` holding a number above a
     * fixed 1000, independent of `minTokenThreshold`.
     *
     * @param {object} page - Browser page to drive.
     * @param {number} [minTokenThreshold=1000] - Lower bound used by the
     *   fallback search; the primary lookup returns its value unfiltered.
     * @returns {Promise<number>} Token count.
     * @throws {Error} "Failed to check corpus statistics: …" on any failure;
     *   the original error is attached as `cause`.
     */
    async check_corpus_statistics(page, minTokenThreshold = 1000) {
        try {
            console.log(`Starting corpus statistics check with minTokenThreshold: ${minTokenThreshold}`);

            console.log(`Navigating to: ${this.korap_url}`);
            await page.goto(this.korap_url, { waitUntil: 'domcontentloaded' });
            console.log("Navigation completed");

            // Open the corpus selection.
            console.log("Waiting for #vc-choose selector...");
            await page.waitForSelector('#vc-choose', { visible: true, timeout: 90000 });
            console.log("Found #vc-choose, clicking...");
            await page.click('#vc-choose');
            console.log("Clicked #vc-choose");

            console.log("Waiting 1 second for UI to respond...");
            await new Promise(resolve => setTimeout(resolve, 1000));

            // Open the statistics panel.
            console.log("Waiting for .statistic selector...");
            await page.waitForSelector('.statistic', { visible: true, timeout: 90000 });
            console.log("Found .statistic element, attempting to click...");
            try {
                await page.click('.statistic');
                console.log("Successfully clicked .statistic element");
            } catch (error) {
                console.error(`Failed to click statistic element: ${error.message}`);
                throw new Error(`Failed to click statistic element: ${error.message}`, { cause: error });
            }

            console.log("Waiting for token statistics to load...");

            // First the basic structure, then an actual value.
            await page.waitForSelector('dd', { visible: true, timeout: 30000 });

            await page.waitForFunction(() => {
                const ddElements = document.querySelectorAll('dd');
                for (let i = 0; i < ddElements.length; i++) {
                    const text = ddElements[i].textContent || ddElements[i].innerText || '';
                    const numbers = text.replace(/[,\.]/g, '').match(/\d+/g);
                    if (numbers && parseInt(numbers[0], 10) > 1000) {
                        return true;
                    }
                }
                return false;
            }, { timeout: 90000, polling: 1000 }); // Poll once per second.

            console.log(`Starting token count extraction with minThreshold: ${minTokenThreshold}`);
            const tokenCount = await page.evaluate((minThreshold) => {
                // First integer in an element's text with "," and "." removed,
                // or null when the text holds no digits.
                const firstNumber = (node) => {
                    const text = node.textContent || node.innerText || '';
                    const numbers = text.replace(/[,\.]/g, '').match(/\d+/g);
                    return numbers ? parseInt(numbers[0], 10) : null;
                };

                // Primary: the dd following an element titled "tokens".
                const tokenTitleElements = document.querySelectorAll('[title="tokens"], [title*="token"]');
                for (const element of tokenTitleElements) {
                    let sibling = element.nextElementSibling;
                    // Look at no more than ten following siblings.
                    for (let seen = 0; sibling && seen < 10; seen++) {
                        if (sibling.tagName.toLowerCase() === 'dd') {
                            const value = firstNumber(sibling);
                            if (value !== null) return value;
                        }
                        sibling = sibling.nextElementSibling;
                    }
                }

                // Fallback: the largest dd value above the threshold.
                let best = null;
                for (const dd of document.querySelectorAll('dd')) {
                    const value = firstNumber(dd);
                    if (value !== null && value > minThreshold && (best === null || value > best)) {
                        best = value;
                    }
                }
                return best;
            }, minTokenThreshold);

            console.log(`Token count extraction completed. Result: ${tokenCount}`);

            if (tokenCount === null) {
                console.error("ERROR: Token count extraction returned null");
                throw new Error("Could not find token count in corpus statistics");
            }

            console.log(`SUCCESS: Found token count: ${tokenCount}, threshold was: ${minTokenThreshold}`);
            return tokenCount;
        } catch (error) {
            console.error(`ERROR in check_corpus_statistics: ${error.message}`);
            console.error("Full error stack:", error.stack);
            throw new Error(`Failed to check corpus statistics: ${error.message}`, { cause: error });
        }
    }
}
302
// CommonJS export: the class itself is the module's value.
module.exports = KorAPRC;