Initial import (translated from rderekovecs)
Change-Id: Ib4a4747f6474dfe67d79288be3f8bdaf66a513b8
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..154780c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,34 @@
+# Python virtual environments
+venv/
+env/
+.env/
+pyenv/
+.pyenv/
+.venv/
+ENV/
+
+# Python bytecode
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+
+# Coverage reports
+htmlcov/
+.coverage
+.coverage.*
+coverage.xml
+*.cover
+
+# IDE files
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# OS generated files
+.DS_Store
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b9cc498
--- /dev/null
+++ b/README.md
@@ -0,0 +1,78 @@
+# pyderekovecs
+
+A Python client package that makes the DeReKoVecs web service API accessible from Python.
+
+## Installation
+
+```bash
+pip install git+https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/pyderekovecs.git
+```
+
+Or clone the repository and install locally:
+
+```bash
+git clone https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/pyderekovecs.git
+cd pyderekovecs
+pip install -e .
+```
+
+## Usage
+
+```python
+import pyderekovecs as pdv  # not "pd": that alias conventionally means pandas
+
+# Get paradigmatic neighbors for a word
+neighbors = pdv.paradigmatic_neighbours("Haus")
+print(neighbors.head())
+
+# Get syntagmatic neighbors
+collocates = pdv.syntagmatic_neighbours("Haus")
+print(collocates.head())
+
+# Get word embedding
+embedding = pdv.word_embedding("Haus")
+print(len(embedding)) # Should be 200
+
+# Calculate cosine similarity between two words
+similarity = pdv.cosine_similarity("Haus", "Gebäude")
+print(f"Similarity: {similarity}")
+```
+
+## Accessing other DeReKoVecs instances
+
+### KoKoKom
+
+```python
+import os
+os.environ["DEREKOVECS_SERVER"] = "https://corpora.ids-mannheim.de/openlab/kokokomvecs"
+```
+
+### CoRoLa (Contemporary Reference Corpus of the Romanian Language)
+
+```python
+import os
+os.environ["DEREKOVECS_SERVER"] = "https://corpora.ids-mannheim.de/openlab/corolavecs"
+```
+
+## Available Functions
+
+- `syntagmatic_neighbours(word, **params)`: Get the syntagmatic neighbour predictions of a word
+- `countbased_collocates(w, **params)`: Get the collocates of a word in the count-based dereko model
+- `word_frequency(w, **params)`: Get the absolute frequency of a word in the corpus
+- `corpus_size(w, **params)`: Get the token size of the corpus used to train the model (`w` is only a probe word, default `"Test"`, required by old derekovecs servers)
+- `paradigmatic_neighbours(word, **params)`: Get the paradigmatic neighbours of a word
+- `word_embedding(word, **params)`: Get the normalized embedding vector of a word
+- `frequency_rank(word, **params)`: Get the frequency rank of a word in the training data
+- `server_version()`: Get the version of the derekovecs server
+- `vocab_size()`: Get the vocabulary size of the model
+- `model_name()`: Get the name of the model
+- `collocation_scores(w, c, **params)`: Calculate the association scores between a node and a collocate
+- `cosine_similarity(w1, w2, **params)`: Calculate the cosine similarity between two words
+
+## Development
+
+To run tests:
+
+```bash
+python -m unittest discover tests
+```
\ No newline at end of file
diff --git a/pyderekovecs/__init__.py b/pyderekovecs/__init__.py
new file mode 100644
index 0000000..828a875
--- /dev/null
+++ b/pyderekovecs/__init__.py
@@ -0,0 +1,41 @@
+"""
+pyderekovecs: DeReKoVecs API Client Package in Python
+
+A client package that makes the DeReKoVecs web service API accessible from Python.
+"""
+
+__version__ = "0.1.0"  # also hard-coded as version= in setup.py; keep both in sync
+
+from .derekovecs import (
+ syntagmatic_neighbours,
+ countbased_collocates,
+ word_frequency,
+ corpus_size,
+ paradigmatic_neighbours,
+ word_embedding,
+ frequency_rank,
+ server_version,
+ vocab_size,
+ model_name,
+ collocation_scores,
+ cosine_similarity,
+ derekovecs_server,
+ derekovecs_api_call
+)
+
+__all__ = [  # names exported by "from pyderekovecs import *"
+ "syntagmatic_neighbours",
+ "countbased_collocates",
+ "word_frequency",
+ "corpus_size",
+ "paradigmatic_neighbours",
+ "word_embedding",
+ "frequency_rank",
+ "server_version",
+ "vocab_size",
+ "model_name",
+ "collocation_scores",
+ "cosine_similarity",
+ "derekovecs_server",
+ "derekovecs_api_call"
+]
\ No newline at end of file
diff --git a/pyderekovecs/derekovecs.py b/pyderekovecs/derekovecs.py
new file mode 100644
index 0000000..bd56982
--- /dev/null
+++ b/pyderekovecs/derekovecs.py
@@ -0,0 +1,220 @@
+"""
+DeReKoVecs API client module.
+
+This module provides functions to access the DeReKoVecs web service API.
+"""
+import os
+import requests
+import pandas as pd
+from typing import Dict, List, Union, Any
+
+
+def derekovecs_server() -> str:
+    """Return the base URL of the DeReKoVecs API server.
+
+    The DEREKOVECS_SERVER environment variable takes precedence when it is
+    set to a non-empty value; otherwise the built-in default URL is used.
+
+    Returns:
+        str: The URL of the DeReKoVecs API server.
+    """
+    configured = os.getenv("DEREKOVECS_SERVER")
+    return configured or "https://corpora.ids-mannheim.de/openlab/derekovecs/"
+
+
+def derekovecs_api_call(method: str = "", **params) -> Dict:
+    """Call a DeReKoVecs API endpoint and return its decoded JSON response.
+
+    Args:
+        method: Endpoint path such as "/getVersion"; "" queries the server root.
+        **params: Query parameters for the GET request.
+
+    Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx response.
+    """
+    # Join with exactly one "/" even if the server URL ends with a slash.
+    base, path = derekovecs_server(), method.lstrip("/")
+    url = f"{base.rstrip('/')}/{path}" if path else base
+    response = requests.get(url, params=params)
+    response.raise_for_status()
+    return response.json()
+
+
+def syntagmatic_neighbours(word: str = "Test", **params) -> pd.DataFrame:
+    """Fetch the syntagmatic neighbours the DeReKoVecs model predicts for a word.
+
+    Args:
+        word: The node word to look up.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        pd.DataFrame: Built from the "collocators" entry of the JSON response.
+    """
+    # NOTE(review): json=1 presumably selects JSON output - confirm with server docs.
+    params.update(word=word, json=1)
+    response = derekovecs_api_call("", **params)
+    collocators = response["collocators"]
+    return pd.DataFrame(collocators)
+
+
+def countbased_collocates(w: str = "Test", **params) -> pd.DataFrame:
+    """Fetch the collocates of a word from the count-based dereko model.
+
+    Args:
+        w: The node word to look up.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        pd.DataFrame: Built from the "collocates" entry of the response of
+            "/getClassicCollocators" (collocates with association scores).
+    """
+    query = dict(params, w=w)
+    response = derekovecs_api_call("/getClassicCollocators", **query)
+    return pd.DataFrame(response["collocates"])
+
+
+def word_frequency(w: str = "Test", **params) -> int:
+    """Look up how often a word occurs in the corpus.
+
+    Args:
+        w: The word whose frequency is requested.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        int: The "f1" field of the "/getClassicCollocators" response, i.e.
+            the absolute frequency of the word.
+    """
+    query = {**params, "w": w}
+    response = derekovecs_api_call("/getClassicCollocators", **query)
+    return response["f1"]
+
+
+def corpus_size(w: str = "Test", **params) -> int:
+    """Report the number of tokens in the corpus the model was trained on.
+
+    Args:
+        w: Probe word (defaults to 'Test') required for old derekovecs servers.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        int: The "N" field of the "/getClassicCollocators" response, i.e.
+            the corpus size in tokens.
+    """
+    params.update(w=w)
+    payload = derekovecs_api_call("/getClassicCollocators", **params)
+    return payload["N"]
+
+
+def paradigmatic_neighbours(word: str = "Test", **params) -> pd.DataFrame:
+    """Fetch the paradigmatic neighbours of a word in the derekovecs model.
+
+    Args:
+        word: The word to look up.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        pd.DataFrame: Words with their similarity scores, built from the
+            first element of the "list" entry of the JSON response.
+    """
+    query = dict(params, word=word, json=1)
+    response = derekovecs_api_call("", **query)
+    neighbours = response["list"][0]
+    return pd.DataFrame(neighbours)
+
+
+def word_embedding(word: str = "Test", **params) -> List[float]:
+    """Fetch the normalized embedding vector of a word from the derekovecs model.
+
+    Args:
+        word: The word whose embedding is requested.
+        **params: Extra query parameters forwarded to the API; "n" and "json"
+            are always overridden with 1.
+
+    Returns:
+        List[float]: Normalized embedding vector of the given word.
+    """
+    params.update(word=word, n=1, json=1)
+    response = derekovecs_api_call("", **params)
+    first_entry = response["list"][0]
+    vectors = first_entry["vector"]
+    return vectors[0]
+
+
+def frequency_rank(word: str = "Test", **params) -> int:
+    """Look up the frequency rank of a word in the training data.
+
+    Args:
+        word: The word to look up; it is sent to the API as parameter "w".
+        **params: Extra query parameters forwarded to the API; a "w" given
+            here is overridden by ``word``.
+
+    Returns:
+        int: The "frequencyRank" field of the "/getWord" response.
+    """
+    params.update(w=word)
+    response = derekovecs_api_call("/getWord", **params)
+    return response["frequencyRank"]
+
+
+def server_version() -> str:
+    """Query the "/getVersion" endpoint of the derekovecs server.
+
+    Returns:
+        str: The version of the derekovecs server.
+    """
+    version = derekovecs_api_call("/getVersion")
+    return version
+
+
+def vocab_size() -> int:
+    """Query the "/getVocabSize" endpoint of the derekovecs server.
+
+    Returns:
+        int: The vocabulary size of the model.
+    """
+    size = derekovecs_api_call("/getVocabSize")
+    return size
+
+
+def model_name() -> str:
+    """Query the "/getModelName" endpoint of the derekovecs server.
+
+    Returns:
+        str: The name of the model.
+    """
+    name = derekovecs_api_call("/getModelName")
+    return name
+
+
+def collocation_scores(w: str, c: str, **params) -> pd.DataFrame:
+    """Fetch the association scores between a node word and one collocate.
+
+    Args:
+        w: The target word/node.
+        c: The collocate.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        pd.DataFrame: A one row data frame with the collocate and its
+            association scores ("collocates" entry of the response).
+    """
+    params.update(w=w, c=c)
+    response = derekovecs_api_call("/getCollocationAssociation", **params)
+    scores = response["collocates"]
+    return pd.DataFrame(scores)
+
+
+def cosine_similarity(w1: str, w2: str, **params) -> float:
+    """Fetch the cosine similarity of two words in the derekovecs model.
+
+    Args:
+        w1: The first word.
+        w2: The second word.
+        **params: Extra query parameters forwarded to the API unchanged.
+
+    Returns:
+        float: The cosine similarity between the two words, exactly as
+            decoded from the "/getSimilarity" response.
+    """
+    params.update(w1=w1, w2=w2)
+    similarity = derekovecs_api_call("/getSimilarity", **params)
+    return similarity
\ No newline at end of file
diff --git a/pyderekovecs/utils.py b/pyderekovecs/utils.py
new file mode 100644
index 0000000..691591f
--- /dev/null
+++ b/pyderekovecs/utils.py
@@ -0,0 +1,67 @@
+"""
+Utility functions for the pyderekovecs package.
+"""
+import pandas as pd
+from typing import List, Dict, Any, Union
+
+
+def is_word(value: Any) -> bool:
+    """Tell whether a value is a string containing at least one character.
+
+    Args:
+        value: The value to check; any type is accepted.
+
+    Returns True for a non-empty str, False for "" and for non-strings.
+    """
+    if not isinstance(value, str):
+        return False
+    return len(value) > 0
+
+
+def merge_results(results: List[Dict[str, Any]]) -> pd.DataFrame:
+    """Stack several API call results into one DataFrame.
+
+    Args:
+        results: A list of dictionaries with API call results.
+
+    Returns:
+        pd.DataFrame: All rows with a fresh 0..n-1 index, or an empty
+            DataFrame when ``results`` is empty.
+    """
+    if results:
+        frames = [pd.DataFrame(entry) for entry in results]
+        return pd.concat(frames, ignore_index=True)
+    return pd.DataFrame()
+
+
+def filter_by_threshold(df: pd.DataFrame, column: str, threshold: float) -> pd.DataFrame:
+    """Keep only the rows whose value in one column reaches a threshold.
+
+    Args:
+        df: The DataFrame to filter.
+        column: The column to filter on.
+        threshold: Inclusive lower bound for the column values.
+
+    Returns:
+        pd.DataFrame: The rows of ``df`` with ``df[column] >= threshold``.
+    """
+    keep = df[column] >= threshold
+    return df[keep]
+
+
+def batch_api_calls(func, items: List[str], **kwargs) -> Dict[str, Any]:
+    """Call an API function once per item and collect the results.
+
+    Calls are made sequentially in the order of ``items``; an exception
+    raised by ``func`` propagates and aborts the batch.
+
+    Args:
+        func: The API function to call as ``func(item, **kwargs)``.
+        items: The list of items to process.
+        **kwargs: Additional parameters passed to every call.
+
+    Returns:
+        Dict[str, Any]: A dictionary mapping items to results; a repeated
+            item keeps the result of its last call.
+    """
+    return {item: func(item, **kwargs) for item in items}
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c8e4854
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,23 @@
+from setuptools import setup, find_packages
+
+with open("README.md", encoding="utf-8") as readme:  # README has non-ASCII text
+    long_description = readme.read()
+
+setup(
+    name="pyderekovecs",
+    version="0.1.0",
+    packages=find_packages(),
+    install_requires=["requests>=2.25.0", "pandas>=1.2.0"],
+    author="Marc Kupietz",
+    author_email="kupietz@ids-mannheim.de",
+    description="DeReKoVecs API Client Package in Python",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://korap.ids-mannheim.de/gerrit/plugins/gitiles/ids-kl/pyderekovecs",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)
diff --git a/tests/test_derekovecs.py b/tests/test_derekovecs.py
new file mode 100644
index 0000000..d634215
--- /dev/null
+++ b/tests/test_derekovecs.py
@@ -0,0 +1,149 @@
+"""
+Tests for the pyderekovecs package.
+"""
+import unittest
+import pandas as pd
+from unittest import mock
+
+from pyderekovecs import (
+ syntagmatic_neighbours,
+ countbased_collocates,
+ word_frequency,
+ corpus_size,
+ paradigmatic_neighbours,
+ word_embedding,
+ frequency_rank,
+ cosine_similarity,
+ collocation_scores,
+ derekovecs_api_call
+)
+from pyderekovecs.utils import is_word
+
+
+class MockResponse:
+    """Minimal stand-in for requests.Response: json() and raise_for_status() only."""
+
+    def __init__(self, json_data, status_code=200):
+        self.json_data, self.status_code = json_data, status_code
+
+    def json(self):
+        return self.json_data
+
+    def raise_for_status(self):
+        if self.status_code == 200:
+            return
+        raise Exception(f"HTTP Error: {self.status_code}")
+
+
+class TestDerekovecs(unittest.TestCase):
+ """Unit tests for the pyderekovecs API functions; requests.get is mocked in every test."""
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_paradigmatic_neighbours(self, mock_get):
+ """paradigmatic_neighbours turns the first "list" element into DataFrame rows."""
+ mock_response = {
+ "list": [
+ [{"word": "Test", "similarity": 1.0}, {"word": "Experiment", "similarity": 0.8}]
+ ]
+ }
+ mock_get.return_value = MockResponse(mock_response)
+
+ result = paradigmatic_neighbours("Test")
+ self.assertEqual(result.iloc[0]['word'], "Test")
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_syntagmatic_neighbours(self, mock_get):
+ """syntagmatic_neighbours builds its DataFrame from the "collocators" entry."""
+ mock_response = {
+ "collocators": [
+ {"word": "durchführen", "rank": 1, "average": 0.8}
+ ]
+ }
+ mock_get.return_value = MockResponse(mock_response)
+
+ result = syntagmatic_neighbours("Test")
+ self.assertTrue(is_word(result.iloc[0]['word']))
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_countbased_collocates(self, mock_get):
+ """countbased_collocates builds its DataFrame from the "collocates" entry."""
+ mock_response = {
+ "collocates": [
+ {"word": "durchführen", "f": 100, "pmi": 0.8}
+ ]
+ }
+ mock_get.return_value = MockResponse(mock_response)
+
+ result = countbased_collocates("Test")
+ self.assertTrue(is_word(result.iloc[0]['word']))
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_collocation_scores(self, mock_get):
+ """collocation_scores exposes the score columns (here "f2") of the single collocate row."""
+ mock_response = {
+ "collocates": [
+ {"word": "putzen", "f2": 500, "pmi": 0.8}
+ ]
+ }
+ mock_get.return_value = MockResponse(mock_response)
+
+ result = collocation_scores("Zähne", "putzen")
+ self.assertTrue(result.iloc[0]['f2'] > 0)
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_cosine_similarity(self, mock_get):
+ """cosine_similarity returns the decoded response value unchanged."""
+ # Identical words: the mocked server answers 1.0
+ mock_get.return_value = MockResponse(1.0)
+
+ result = cosine_similarity("Test", "Test")
+ self.assertEqual(result, 1.0)
+
+ # Different words: the mocked answer 0.7 must lie within [0, 1]
+ mock_get.return_value = MockResponse(0.7)
+
+ result = cosine_similarity("Test", "testen")
+ self.assertTrue(0 <= result <= 1.0)
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_word_embedding(self, mock_get):
+ """word_embedding returns the first vector of the first "list" element."""
+ # Create a mock vector of length 200
+ mock_vector = [0.1] * 200
+ mock_response = {
+ "list": [
+ {"vector": [mock_vector]}
+ ]
+ }
+ mock_get.return_value = MockResponse(mock_response)
+
+ result = word_embedding("Test")
+ self.assertEqual(len(result), 200)
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_frequency_rank(self, mock_get):
+ """frequency_rank returns the "frequencyRank" field as a positive int."""
+ mock_get.return_value = MockResponse({"frequencyRank": 500})
+
+ result = frequency_rank("Test")
+ self.assertTrue(isinstance(result, int) and result > 0)
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_word_frequency(self, mock_get):
+ """word_frequency returns the "f1" field as a positive int."""
+ mock_get.return_value = MockResponse({"f1": 1000})
+
+ result = word_frequency("Test")
+ self.assertTrue(isinstance(result, int) and result > 0)
+
+ @mock.patch('pyderekovecs.derekovecs.requests.get')
+ def test_corpus_size(self, mock_get):
+ """corpus_size works without arguments and returns the "N" field as an int."""
+ mock_get.return_value = MockResponse({"N": 1000000})
+
+ result = corpus_size()
+ self.assertTrue(isinstance(result, int) and result > 1000)
+
+
+if __name__ == '__main__':  # run the suite when this file is executed directly
+ unittest.main()
\ No newline at end of file