Initial import (translated from rderekovecs) Change-Id: Ib4a4747f6474dfe67d79288be3f8bdaf66a513b8

commit: 04784b96d4ac2e3a57e1bf4e503c808881d0a4e4 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sun May 04 13:38:12 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sun May 04 13:41:10 2025 +0200
tree: fd9f6908c479e09ad768d6d2995d58ee53026ce5
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..154780c
--- /dev/null
+++ b/.gitignore

@@ -0,0 +1,34 @@
+# Python virtual environments
+venv/
+env/
+.env/
+pyenv/
+.pyenv/
+.venv/
+ENV/
+
+# Python bytecode
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+
+# Coverage reports
+htmlcov/
+.coverage
+.coverage.*
+coverage.xml
+*.cover
+
+# IDE files
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# OS generated files
+.DS_Store
\ No newline at end of file

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b9cc498
--- /dev/null
+++ b/README.md

@@ -0,0 +1,78 @@
+# pyderekovecs
+
+A Python client package that makes the DeReKoVecs web service API accessible from Python.
+
+## Installation
+
+```bash
+pip install git+https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/pyderekovecs.git
+```
+
+Or clone the repository and install locally:
+
+```bash
+git clone https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/pyderekovecs.git
+cd pyderekovecs
+pip install -e .
+```
+
+## Usage
+
+```python
+import pyderekovecs as pdv
+
+# Get paradigmatic neighbours for a word
+neighbours = pdv.paradigmatic_neighbours("Haus")
+print(neighbours.head())
+
+# Get syntagmatic neighbours
+collocates = pdv.syntagmatic_neighbours("Haus")
+print(collocates.head())
+
+# Get word embedding
+embedding = pdv.word_embedding("Haus")
+print(len(embedding))  # Should be 200
+
+# Calculate cosine similarity between two words
+similarity = pdv.cosine_similarity("Haus", "Gebäude")
+print(f"Similarity: {similarity}")
+```
+
+## Accessing other DeReKoVecs instances
+
+### KoKoKom
+
+```python
+import os
+os.environ["DEREKOVECS_SERVER"] = "https://corpora.ids-mannheim.de/openlab/kokokomvecs"
+```
+
+### CoRoLa (Contemporary Reference Corpus of the Romanian Language)
+
+```python
+import os
+os.environ["DEREKOVECS_SERVER"] = "https://corpora.ids-mannheim.de/openlab/corolavecs"
+```
+
+## Available Functions
+
+- `syntagmatic_neighbours(word, **params)`: Get the syntagmatic neighbour predictions of a word
+- `countbased_collocates(w, **params)`: Get the collocates of a word in the count-based dereko model
+- `word_frequency(w, **params)`: Get the absolute frequency of a word in the corpus
+- `corpus_size(w, **params)`: Get the token size of the corpus used to train the model
+- `paradigmatic_neighbours(word, **params)`: Get the paradigmatic neighbours of a word
+- `word_embedding(word, **params)`: Get the normalized embedding vector of a word
+- `frequency_rank(word, **params)`: Get the frequency rank of a word in the training data
+- `server_version()`: Get the version of the derekovecs server
+- `vocab_size()`: Get the vocabulary size of the model
+- `model_name()`: Get the name of the model
+- `collocation_scores(w, c, **params)`: Calculate the association scores between a node and a collocate
+- `cosine_similarity(w1, w2, **params)`: Calculate the cosine similarity between two words
+
+## Development
+
+To run tests:
+
+```bash
+python -m unittest discover tests
+```
\ No newline at end of file

diff --git a/pyderekovecs/__init__.py b/pyderekovecs/__init__.py
new file mode 100644
index 0000000..828a875
--- /dev/null
+++ b/pyderekovecs/__init__.py

@@ -0,0 +1,41 @@
+"""
+pyderekovecs: DeReKoVecs API Client Package in Python
+
+A client package that makes the DeReKoVecs web service API accessible from Python.
+"""
+
+__version__ = "0.1.0"
+
+from .derekovecs import (
+    syntagmatic_neighbours,
+    countbased_collocates,
+    word_frequency,
+    corpus_size,
+    paradigmatic_neighbours,
+    word_embedding,
+    frequency_rank,
+    server_version,
+    vocab_size,
+    model_name,
+    collocation_scores,
+    cosine_similarity,
+    derekovecs_server,
+    derekovecs_api_call
+)
+
+__all__ = [
+    "syntagmatic_neighbours",
+    "countbased_collocates",
+    "word_frequency",
+    "corpus_size",
+    "paradigmatic_neighbours",
+    "word_embedding",
+    "frequency_rank",
+    "server_version",
+    "vocab_size",
+    "model_name",
+    "collocation_scores",
+    "cosine_similarity",
+    "derekovecs_server",
+    "derekovecs_api_call"
+]
\ No newline at end of file

diff --git a/pyderekovecs/derekovecs.py b/pyderekovecs/derekovecs.py
new file mode 100644
index 0000000..bd56982
--- /dev/null
+++ b/pyderekovecs/derekovecs.py

@@ -0,0 +1,220 @@
+"""
+DeReKoVecs API client module.
+
+This module provides functions to access the DeReKoVecs web service API.
+"""
+import os
+import requests
+import pandas as pd
+from typing import Dict, List, Union, Any
+
+
+def derekovecs_server() -> str:
+    """
+    Return the base URL of the DeReKoVecs server that API calls are sent to.
+
+    Returns:
+        str: The value of the DEREKOVECS_SERVER environment variable if it is
+            set and non-empty, otherwise the built-in default URL.
+    """
+    fallback = "https://corpora.ids-mannheim.de/openlab/derekovecs/"
+    # `or` also covers an empty variable, which is treated like an unset one.
+    return os.getenv("DEREKOVECS_SERVER") or fallback
+
+
+def derekovecs_api_call(method: str = "", **params) -> Any:
+    """
+    Call the DeReKoVecs API and return the decoded JSON payload.
+
+    Args:
+        method: The method path, e.g. "/getVersion"; "" addresses the server root.
+        **params: The query parameters to pass to the method.
+
+    Returns:
+        Any: The decoded JSON response (a dict, or a bare value for some methods).
+    """
+    url = f"{derekovecs_server().rstrip('/')}/{method.lstrip('/')}"  # exactly one "/"
+    response = requests.get(url, params=params, timeout=60)  # fail instead of hanging
+    response.raise_for_status()
+    return response.json()
+
+
+def syntagmatic_neighbours(word: str = "Test", **params) -> pd.DataFrame:
+    """
+    Get the syntagmatic neighbour predictions of a word from the DeReKoVecs model.
+
+    Args:
+        word: The word to get the syntagmatic neighbours for.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        pd.DataFrame: Data frame with the syntagmatic neighbours of a node predicted from derekovecs model.
+    """
+    params["word"] = word
+    params["json"] = 1
+    result = derekovecs_api_call("", **params)
+    return pd.DataFrame(result["collocators"])
+
+
+def countbased_collocates(w: str = "Test", **params) -> pd.DataFrame:
+    """
+    Get the collocates of a word in the count-based dereko model.
+
+    Args:
+        w: The word to get the collocates for.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        pd.DataFrame: A data frame with the most salient collocates and their association scores.
+    """
+    params["w"] = w
+    result = derekovecs_api_call("/getClassicCollocators", **params)
+    return pd.DataFrame(result["collocates"])
+
+
+def word_frequency(w: str = "Test", **params) -> int:
+    """
+    Gets the absolute frequency of a word in the corpus.
+
+    Args:
+        w: The word to get the frequency of.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        int: The absolute frequency of the word.
+    """
+    params["w"] = w
+    result = derekovecs_api_call("/getClassicCollocators", **params)
+    return result["f1"]
+
+
+def corpus_size(w: str = "Test", **params) -> int:
+    """
+    Gets the token size of the corpus used to train the model.
+
+    Args:
+        w: Probe word (defaults to 'Test') required for old derekovecs servers.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        int: The number of tokens in the corpus.
+    """
+    params["w"] = w
+    result = derekovecs_api_call("/getClassicCollocators", **params)
+    return result["N"]
+
+
+def paradigmatic_neighbours(word: str = "Test", **params) -> pd.DataFrame:
+    """
+    Get the paradigmatic neighbours of a word in the derekovecs model.
+
+    Args:
+        word: The word to get the paradigmatic neighbours for.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        pd.DataFrame: A data frame of words with their similarity scores.
+    """
+    params["word"] = word
+    params["json"] = 1
+    result = derekovecs_api_call("", **params)
+    return pd.DataFrame(result["list"][0])
+
+
+def word_embedding(word: str = "Test", **params) -> List[float]:
+    """
+    Get the normalized embedding vector of a word from the derekovecs model.
+
+    Args:
+        word: The word to get the embedding for.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        List[float]: Normalized embedding vector of the given word.
+    """
+    params["word"] = word
+    params["n"] = 1
+    params["json"] = 1
+    result = derekovecs_api_call("", **params)
+    return result["list"][0]["vector"][0]
+
+
+def frequency_rank(word: str = "Test", **params) -> int:
+    """
+    Gets the frequency rank of a word in the training data.
+
+    Args:
+        word: The word to get the frequency rank of.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        int: Frequency rank.
+    """
+    params["w"] = word
+    result = derekovecs_api_call("/getWord", **params)
+    return result["frequencyRank"]
+
+
+def server_version() -> str:
+    """
+    Get the version of the derekovecs server.
+
+    Returns:
+        str: The version of the derekovecs server.
+    """
+    return derekovecs_api_call("/getVersion")
+
+
+def vocab_size() -> int:
+    """
+    Get the vocabulary size of the model.
+
+    Returns:
+        int: The vocabulary size of the model.
+    """
+    return derekovecs_api_call("/getVocabSize")
+
+
+def model_name() -> str:
+    """
+    Get the name of the model.
+
+    Returns:
+        str: The name of the model.
+    """
+    return derekovecs_api_call("/getModelName")
+
+
+def collocation_scores(w: str, c: str, **params) -> pd.DataFrame:
+    """
+    Calculate the association scores between a node (target word) and words in a window around it.
+
+    Args:
+        w: The target word/node.
+        c: The collocate.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        pd.DataFrame: A one row data frame with collocate and its association scores.
+    """
+    params["w"] = w
+    params["c"] = c
+    result = derekovecs_api_call("/getCollocationAssociation", **params)
+    return pd.DataFrame(result["collocates"])
+
+
+def cosine_similarity(w1: str, w2: str, **params) -> float:
+    """
+    Calculate the cosine similarity between two words in the derekovecs model.
+
+    Args:
+        w1: The first word.
+        w2: The second word.
+        **params: Additional parameters to pass to the API.
+
+    Returns:
+        float: The cosine similarity between the two words.
+    """
+    params["w1"] = w1
+    params["w2"] = w2
+    return derekovecs_api_call("/getSimilarity", **params)
\ No newline at end of file

diff --git a/pyderekovecs/utils.py b/pyderekovecs/utils.py
new file mode 100644
index 0000000..691591f
--- /dev/null
+++ b/pyderekovecs/utils.py

@@ -0,0 +1,67 @@
+"""
+Utility functions for the pyderekovecs package.
+"""
+import pandas as pd
+from typing import List, Dict, Any, Union
+
+
+def is_word(value: Any) -> bool:
+    """
+    Tell whether a value qualifies as a word, i.e. is a string with content.
+
+    Args:
+        value: Any object; it does not have to be a string.
+
+    Returns:
+        bool: True for a string of at least one character, False for anything else.
+    """
+    return isinstance(value, str) and value != ""
+
+
+def merge_results(results: List[Dict[str, Any]]) -> pd.DataFrame:
+    """
+    Stack the results of several API calls into one DataFrame.
+
+    Args:
+        results: The API call results; each one is passed to pd.DataFrame().
+
+    Returns:
+        pd.DataFrame: All rows under a fresh 0..n-1 index; empty if `results` is empty.
+    """
+    if results:
+        frames = list(map(pd.DataFrame, results))
+        return pd.concat(frames, ignore_index=True)
+    return pd.DataFrame()
+
+
+def filter_by_threshold(df: pd.DataFrame, column: str, threshold: float) -> pd.DataFrame:
+    """
+    Keep only the rows whose value in `column` reaches `threshold`.
+
+    Args:
+        df: The DataFrame to filter.
+        column: The name of the column holding the values to compare.
+        threshold: The smallest value that is kept (the comparison is inclusive).
+
+    Returns:
+        pd.DataFrame: The matching rows, with their original index labels.
+    """
+    return df[df[column].ge(threshold)]
+
+
+def batch_api_calls(func, items: List[str], **kwargs) -> Dict[str, Any]:
+    """
+    Apply one API function to every item of a list and collect the results.
+
+    Args:
+        func: The API function; it is called as func(item, **kwargs).
+        items: The items to query, processed one after another in list order.
+        **kwargs: Additional parameters forwarded unchanged to every call.
+
+    Returns:
+        Dict[str, Any]: The result of each call, keyed by its item.
+    """
+    # An item that occurs more than once is queried each time; its last
+    # result is the one that ends up in the mapping.
+    collected = {item: func(item, **kwargs) for item in items}
+    return collected
\ No newline at end of file

diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c8e4854
--- /dev/null
+++ b/setup.py

@@ -0,0 +1,23 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="pyderekovecs",
+    version="0.1.0",
+    packages=find_packages(),
+    install_requires=[
+        "requests>=2.25.0",
+        "pandas>=1.2.0",
+    ],
+    author="Marc Kupietz",
+    author_email="kupietz@ids-mannheim.de",
+    description="DeReKoVecs API Client Package in Python",
+    long_description=open("README.md", encoding="utf-8").read(),  # README is not pure ASCII
+    long_description_content_type="text/markdown",
+    url="https://korap.ids-mannheim.de/gerrit/plugins/gitiles/ids-kl/pyderekovecs",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)

diff --git a/tests/test_derekovecs.py b/tests/test_derekovecs.py
new file mode 100644
index 0000000..d634215
--- /dev/null
+++ b/tests/test_derekovecs.py

@@ -0,0 +1,149 @@
+"""
+Tests for the pyderekovecs package.
+"""
+import unittest
+import pandas as pd
+from unittest import mock
+
+from pyderekovecs import (
+    syntagmatic_neighbours,
+    countbased_collocates,
+    word_frequency,
+    corpus_size,
+    paradigmatic_neighbours,
+    word_embedding,
+    frequency_rank,
+    cosine_similarity,
+    collocation_scores,
+    derekovecs_api_call
+)
+from pyderekovecs.utils import is_word
+
+
+class MockResponse:
+    """Mock response class for testing."""
+    
+    def __init__(self, json_data, status_code=200):
+        self.json_data = json_data
+        self.status_code = status_code
+        
+    def json(self):
+        return self.json_data
+    
+    def raise_for_status(self):
+        if self.status_code != 200:
+            raise Exception(f"HTTP Error: {self.status_code}")
+
+
+class TestDerekovecs(unittest.TestCase):
+    """Test cases for the pyderekovecs package."""
+
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_paradigmatic_neighbours(self, mock_get):
+        """Test that paradigmatic_neighbours works."""
+        mock_response = {
+            "list": [
+                [{"word": "Test", "similarity": 1.0}, {"word": "Experiment", "similarity": 0.8}]
+            ]
+        }
+        mock_get.return_value = MockResponse(mock_response)
+        
+        result = paradigmatic_neighbours("Test")
+        self.assertEqual(result.iloc[0]['word'], "Test")
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_syntagmatic_neighbours(self, mock_get):
+        """Test that syntagmatic_neighbours works."""
+        mock_response = {
+            "collocators": [
+                {"word": "durchführen", "rank": 1, "average": 0.8}
+            ]
+        }
+        mock_get.return_value = MockResponse(mock_response)
+        
+        result = syntagmatic_neighbours("Test")
+        self.assertTrue(is_word(result.iloc[0]['word']))
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_countbased_collocates(self, mock_get):
+        """Test that countbased_collocates works."""
+        mock_response = {
+            "collocates": [
+                {"word": "durchführen", "f": 100, "pmi": 0.8}
+            ]
+        }
+        mock_get.return_value = MockResponse(mock_response)
+        
+        result = countbased_collocates("Test")
+        self.assertTrue(is_word(result.iloc[0]['word']))
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_collocation_scores(self, mock_get):
+        """Test that collocation_scores works."""
+        mock_response = {
+            "collocates": [
+                {"word": "putzen", "f2": 500, "pmi": 0.8}
+            ]
+        }
+        mock_get.return_value = MockResponse(mock_response)
+        
+        result = collocation_scores("Zähne", "putzen")
+        self.assertTrue(result.iloc[0]['f2'] > 0)
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_cosine_similarity(self, mock_get):
+        """Test that cosine_similarity works."""
+        # Test for same word
+        mock_get.return_value = MockResponse(1.0)
+        
+        result = cosine_similarity("Test", "Test")
+        self.assertEqual(result, 1.0)
+        
+        # Test for different words
+        mock_get.return_value = MockResponse(0.7)
+        
+        result = cosine_similarity("Test", "testen")
+        self.assertTrue(0 <= result <= 1.0)
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_word_embedding(self, mock_get):
+        """Test that word_embedding works."""
+        # Create a mock vector of length 200
+        mock_vector = [0.1] * 200
+        mock_response = {
+            "list": [
+                {"vector": [mock_vector]}
+            ]
+        }
+        mock_get.return_value = MockResponse(mock_response)
+        
+        result = word_embedding("Test")
+        self.assertEqual(len(result), 200)
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_frequency_rank(self, mock_get):
+        """Test that frequency_rank works."""
+        mock_get.return_value = MockResponse({"frequencyRank": 500})
+        
+        result = frequency_rank("Test")
+        self.assertTrue(isinstance(result, int) and result > 0)
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_word_frequency(self, mock_get):
+        """Test that word_frequency works."""
+        mock_get.return_value = MockResponse({"f1": 1000})
+        
+        result = word_frequency("Test")
+        self.assertTrue(isinstance(result, int) and result > 0)
+        
+    @mock.patch('pyderekovecs.derekovecs.requests.get')
+    def test_corpus_size(self, mock_get):
+        """Test that corpus_size works."""
+        mock_get.return_value = MockResponse({"N": 1000000})
+        
+        result = corpus_size()
+        self.assertTrue(isinstance(result, int) and result > 1000)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
commit	04784b96d4ac2e3a57e1bf4e503c808881d0a4e4	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sun May 04 13:38:12 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sun May 04 13:41:10 2025 +0200
tree	fd9f6908c479e09ad768d6d2995d58ee53026ce5