
Commit cfc30ed

WIP test against gds-api-spec
ref GDSA-144
1 parent 84191e4 commit cfc30ed

2 files changed: +221 -0 lines changed
Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import json
from pathlib import Path
from typing import Any, List, Optional

from pydantic import BaseModel, ConfigDict, Field


class TypeInfo(BaseModel):
    """Represents type information for a parameter or return field."""

    model_config = ConfigDict(populate_by_name=True)

    typeName: str = Field(alias="typeName")
    optional: bool = Field(alias="optional")


class Parameter(BaseModel):
    """Represents a procedure parameter."""

    model_config = ConfigDict(populate_by_name=True)

    name: str
    type: TypeInfo
    defaultValue: Optional[Any] = None


class ReturnField(BaseModel):
    """Represents a return field of a procedure mode."""

    model_config = ConfigDict(populate_by_name=True)

    name: str
    type: TypeInfo


class Mode(BaseModel):
    """Represents an execution mode (stream, stats, mutate, write) of a procedure."""

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    mode: str
    parameters: List[Parameter]
    returnFields: List[ReturnField]


class Procedure(BaseModel):
    """Represents a GDS procedure with its parameters and modes."""

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    name: str
    parameters: List[Parameter]
    modes: List[Mode]

    def parameters_for_mode(self, mode_name: str) -> List[Parameter]:
        """Get the effective parameters for a specific mode: the procedure-level parameters plus the mode-specific ones."""
        result = self.parameters.copy()
        for mode in self.modes:
            if mode.mode == mode_name:
                result.extend(mode.parameters)
                return result
        raise ValueError(
            f"Mode '{mode_name}' not found in procedure '{self.name}'. Available modes: {[m.mode for m in self.modes]}."
        )


def resolve_spec_from_file(file_path: Path) -> list[Procedure]:
    """
    Load and parse a gds-api-spec.json file.

    Args:
        file_path: Path to the gds-api-spec.json file.

    Returns:
        The parsed API specification as a list of all procedures.
    """
    with open(file_path, "r") as f:
        data = json.load(f)

    # The JSON file is a list of procedures at the root level
    procedures = [Procedure(**proc) for proc in data]

    return procedures
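
For reference, a minimal usage sketch of the models above (not part of the committed file): the procedure name, type names and default value below are made up purely for illustration, and the import path assumes the module location used by the test file further down.

from graphdatascience.tests.integrationV2.procedure_surface.session.gds_api_spec import Procedure

procedure = Procedure(
    **{
        "name": "centrality.pagerank",
        "parameters": [
            {"name": "dampingFactor", "type": {"typeName": "Float", "optional": True}, "defaultValue": 0.85}
        ],
        "modes": [
            {
                "mode": "stream",
                "parameters": [{"name": "concurrency", "type": {"typeName": "Integer", "optional": True}}],
                "returnFields": [{"name": "nodeId", "type": {"typeName": "Integer", "optional": False}}],
            }
        ],
    }
)

# procedure-level parameters plus the mode-specific ones
assert [p.name for p in procedure.parameters_for_mode("stream")] == ["dampingFactor", "concurrency"]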
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
from pathlib import Path
import re
from collections import defaultdict
from typing import Any

import pytest

from graphdatascience.arrow_client.authenticated_flight_client import AuthenticatedArrowClient
from graphdatascience.session.session_v2_endpoints import SessionV2Endpoints
from graphdatascience.tests.integrationV2.procedure_surface.session.gds_api_spec import (
    Procedure,
    resolve_spec_from_file,
)

# endpoints that are known to be missing from the gds.v2 surface
MISSING_ENDPOINTS: set[str] = set()

# mapping of the snake-cased version of endpoint parts to the actual attribute names in SessionV2Endpoints
ENDPOINT_MAPPINGS = {
    # centrality algos
    "betweenness": "betweenness_centrality",
    "celf": "influence_maximization_celf",
    "closeness": "closeness_centrality",
    "degree": "degree_centrality",
    "eigenvector": "eigenvector_centrality",
    "harmonic": "harmonic_centrality",
    # community algos
    "cliquecounting": "clique_counting",
    "k1coloring": "k1_coloring",
    "kcore": "k_core_decomposition",
    "maxkcut": "max_k_cut",
    # embedding algos
    "fastrp": "fast_rp",
    "graphSage": "graphsage",
    "hashgnn": "hash_gnn",
    # pathfinding algos
    "source_target": "shortest_path",
    "single_source": "all_shortest_path",
    "delta_stepping": "delta",
    "kspanning_tree": "k_spanning_tree",
    "prizesteiner_tree": "prize_steiner_tree",
    "spanning_tree": "spanning_tree",
    "steiner_tree": "steiner_tree",
}


@pytest.fixture
def endpoints(arrow_client: AuthenticatedArrowClient) -> SessionV2Endpoints:
    return SessionV2Endpoints(arrow_client, db_client=None, show_progress=False)


def to_snake(camel: str) -> str:
    # adjusted version of pydantic.alias_generators.to_snake (without digit handling)

    # Handle the sequence of uppercase letters followed by a lowercase letter
    snake = re.sub(r"([A-Z]+)([A-Z][a-z])", lambda m: f"{m.group(1)}_{m.group(2)}", camel)
    # Insert an underscore between a lowercase letter and an uppercase letter
    snake = re.sub(r"([a-z])([A-Z])", lambda m: f"{m.group(1)}_{m.group(2)}", snake)
    # Replace hyphens with underscores to handle kebab-case
    snake = snake.replace("-", "_")
    return snake.lower()
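
# examples: to_snake("graphSage") -> "graph_sage", to_snake("delta-stepping") -> "delta_stepping",
# to_snake("HITS") -> "hits"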


def resolve_callable_object(endpoints: SessionV2Endpoints, endpoint: str) -> Any | None:
    """Check whether an algorithm is available through the gds.v2 interface and return its callable."""

    endpoint_parts = endpoint.split(".")
    endpoint_parts = [to_snake(part) for part in endpoint_parts]
    # endpoint_parts = [ENDPOINT_MAPPINGS.get(part, part) for part in endpoint_parts]

    callable_object = endpoints
    for endpoint_part in endpoint_parts:
        # Walk down the attribute chain towards the algorithm endpoint
        if not hasattr(callable_object, endpoint_part):
            return None

        callable_object = getattr(callable_object, endpoint_part)

    if not callable(callable_object):
        raise ValueError(f"Resolved object {callable_object} for endpoint {endpoint} is not callable")

    return callable_object


# TODO how to fetch the json? it is not published anywhere yet (could be published as part of the release?)
def test_api_spec_coverage(endpoints: SessionV2Endpoints) -> None:
    # Load the API spec (a list of all procedures)
    api_spec = resolve_spec_from_file(Path("/Users/florentin/repos/graph-data-science-client/gds-api-spec.json"))

    algo_prefixes = ["pathfinding", "centrality", "community", "similarity", "embedding"]
    # Filter to only v2 algorithm procedures (exclude graph, model, catalog operations)
    algorithm_actions: set[str] = {
        proc.name for proc in api_spec if any(proc.name.startswith(prefix) for prefix in algo_prefixes)
    }

    missing_endpoints: set[str] = set()
    available_endpoints: set[str] = set()

    algos_per_category = defaultdict(list)
    for action in algorithm_actions:
        category, algo_parts = action.split(".", maxsplit=1)
        algos_per_category[category].append(algo_parts)

    for category, algos in algos_per_category.items():
        for algo in algos:
            callable_object = resolve_callable_object(
                endpoints,
                algo,
            )
            if not callable_object:
                missing_endpoints.add(f"{category}.{algo}")
            else:
                available_endpoints.add(f"{category}.{algo}")
                # TODO verify parameters and return fields against the gds-api spec
                continue

    # Print summary
    print("\nGDS API Spec Coverage Summary:")
    print(f"Total algorithm actions found: {len(algorithm_actions)}")
    print(f"Available through gds.v2: {len(available_endpoints)}")

    # check if any previously missing algos are now available
    newly_available_endpoints = available_endpoints.intersection(MISSING_ENDPOINTS)
    assert not newly_available_endpoints, (
        f"Endpoints now available, please remove them from MISSING_ENDPOINTS: {sorted(newly_available_endpoints)}"
    )

    # check missing endpoints against the known missing algos
    missing_endpoints = missing_endpoints.difference(MISSING_ENDPOINTS)
    assert not missing_endpoints, f"{len(missing_endpoints)} unexpectedly missing endpoints: {sorted(missing_endpoints)}"


def get_api_spec() -> list[Procedure]:
    # WIP: the spec file is not published anywhere yet, so load it from a local checkout for now
    return resolve_spec_from_file(Path("/Users/florentin/repos/graph-data-science-client/gds-api-spec.json"))
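
A possible direction for the "verify against gds-api spec" TODO above, sketched here rather than implemented in the commit: compare a mode's spec parameters against the signature of the resolved callable. The snake-casing via pydantic's alias generator and the helper's name are assumptions, not part of the committed code.

import inspect
from typing import Any

from pydantic.alias_generators import to_snake as pydantic_to_snake

from graphdatascience.tests.integrationV2.procedure_surface.session.gds_api_spec import Procedure


def assert_mode_parameters_in_signature(procedure: Procedure, mode_name: str, endpoint_callable: Any) -> None:
    # parameters the spec expects for this mode, converted to the client's snake_case convention
    spec_parameters = {pydantic_to_snake(p.name) for p in procedure.parameters_for_mode(mode_name)}
    # keyword arguments actually accepted by the resolved gds.v2 callable
    signature_parameters = set(inspect.signature(endpoint_callable).parameters)

    missing = spec_parameters - signature_parameters
    assert not missing, (
        f"{procedure.name} ({mode_name}): spec parameters missing from the client signature: {sorted(missing)}"
    )

Wiring this into test_api_spec_coverage would also require resolving the matching Procedure and mode for each available endpoint, which the loop above does not do yet.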
