Skip to content

Commit 4d21e8a

Browse files
Clean extracted schema json (#432)
* schema extract clean json * remove fixtures * updated CHANGELOG * ruff formatted modified files
1 parent 4b39851 commit 4d21e8a

File tree

3 files changed

+45
-0
lines changed

3 files changed

+45
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Added
66

77
- Added automatic rate limiting with retry logic and exponential backoff for all Embedding providers using tenacity. The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely.
8+
- JSON response returned to `SchemaFromTextExtractor` is stripped of any Markdown code block markers before being loaded.
89

910
## 1.10.0
1011

src/neo4j_graphrag/experimental/components/schema.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from __future__ import annotations
1616

1717
import json
18+
import re
1819

1920
import neo4j
2021
import logging
@@ -554,6 +555,15 @@ def _filter_relationships_without_labels(
554555
relationship_types, "relationship type"
555556
)
556557

558+
def _clean_json_content(self, content: str) -> str:
559+
content = content.strip()
560+
561+
# Remove markdown code block markers if present
562+
content = re.sub(r"^```(?:json)?\s*", "", content, flags=re.MULTILINE)
563+
content = re.sub(r"```\s*$", "", content, flags=re.MULTILINE)
564+
565+
return content.strip()
566+
557567
@validate_call
558568
async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema:
559569
"""
@@ -575,6 +585,9 @@ async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema
575585
# Re-raise the LLMGenerationError
576586
raise LLMGenerationError("Failed to generate schema from text") from e
577587

588+
# Clean response
589+
content = self._clean_json_content(content)
590+
578591
try:
579592
extracted_schema: Dict[str, Any] = json.loads(content)
580593

tests/unit/experimental/components/test_schema.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,37 @@ async def test_schema_from_text_filters_relationships_without_labels(
960960
assert ("Person", "MANAGES", "Organization") in schema.patterns
961961

962962

963+
def test_clean_json_content_markdown_with_json_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A fenced block tagged `json` is reduced to the bare JSON payload."""
    expected = '{"node_types": [{"label": "Person"}]}'
    fenced = """```json
{"node_types": [{"label": "Person"}]}
```"""

    assert schema_from_text._clean_json_content(fenced) == expected
972+
973+
974+
def test_clean_json_content_markdown_without_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A fenced block with no language tag is reduced to the bare JSON payload."""
    expected = '{"node_types": [{"label": "Person"}]}'
    fenced = """```
{"node_types": [{"label": "Person"}]}
```"""

    assert schema_from_text._clean_json_content(fenced) == expected
983+
984+
985+
def test_clean_json_content_plain_json(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """JSON that carries no Markdown fencing passes through unchanged."""
    plain = '{"node_types": [{"label": "Person"}]}'

    # Input and expected output are the same string: nothing to strip.
    assert schema_from_text._clean_json_content(plain) == plain
992+
993+
963994
@pytest.mark.asyncio
964995
@patch("neo4j_graphrag.experimental.components.schema.get_structured_schema")
965996
async def test_schema_from_existing_graph(mock_get_structured_schema: Mock) -> None:

0 commit comments

Comments
 (0)