Новый раг
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
77
app/modules/rag/indexing/docs/chunkers/markdown_chunker.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SectionChunk:
|
||||
section_path: str
|
||||
section_title: str
|
||||
content: str
|
||||
order: int
|
||||
|
||||
|
||||
class MarkdownDocChunker:
|
||||
def __init__(self, text_chunker: DocTextChunker | None = None) -> None:
|
||||
self._fallback = text_chunker or DocTextChunker()
|
||||
|
||||
def chunk(self, text: str) -> list[SectionChunk]:
|
||||
lines = text.splitlines()
|
||||
sections: list[SectionChunk] = []
|
||||
stack: list[tuple[int, str]] = []
|
||||
current_title = "Document"
|
||||
current_lines: list[str] = []
|
||||
order = 0
|
||||
for line in lines:
|
||||
heading = self._heading(line)
|
||||
if heading is None:
|
||||
current_lines.append(line)
|
||||
continue
|
||||
self._flush_section(sections, stack, current_title, current_lines, order)
|
||||
order += 1
|
||||
level, title = heading
|
||||
stack = [item for item in stack if item[0] < level]
|
||||
stack.append((level, title))
|
||||
current_title = title
|
||||
current_lines = []
|
||||
self._flush_section(sections, stack, current_title, current_lines, order)
|
||||
if sections:
|
||||
return sections
|
||||
chunks = self._fallback.split(text)
|
||||
return [
|
||||
SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index)
|
||||
for index, chunk in enumerate(chunks)
|
||||
]
|
||||
|
||||
def _flush_section(
|
||||
self,
|
||||
sections: list[SectionChunk],
|
||||
stack: list[tuple[int, str]],
|
||||
current_title: str,
|
||||
current_lines: list[str],
|
||||
order: int,
|
||||
) -> None:
|
||||
content = "\n".join(current_lines).strip()
|
||||
if not content:
|
||||
return
|
||||
titles = [title for _, title in stack] or [current_title]
|
||||
sections.append(
|
||||
SectionChunk(
|
||||
section_path=" > ".join(titles),
|
||||
section_title=titles[-1],
|
||||
content=content,
|
||||
order=order,
|
||||
)
|
||||
)
|
||||
|
||||
def _heading(self, line: str) -> tuple[int, str] | None:
|
||||
stripped = line.strip()
|
||||
if not stripped.startswith("#"):
|
||||
return None
|
||||
level = len(stripped) - len(stripped.lstrip("#"))
|
||||
title = stripped[level:].strip()
|
||||
if not title:
|
||||
return None
|
||||
return level, title
|
||||
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
21
app/modules/rag/indexing/docs/chunkers/text_chunker.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class DocTextChunker:
    """Sliding character-window chunker with overlap between chunks."""

    def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None:
        """Configure the window size and overlap.

        Raises ValueError when the configuration is unusable — previously an
        ``overlap_chars >= max_chars`` silently made ``split`` loop forever,
        because the window start could never advance.
        """
        if max_chars <= 0:
            raise ValueError("max_chars must be positive")
        if overlap_chars < 0 or overlap_chars >= max_chars:
            raise ValueError("overlap_chars must be in [0, max_chars)")
        self._max_chars = max_chars
        self._overlap_chars = overlap_chars

    def split(self, text: str) -> list[str]:
        """Split *text* into overlapping chunks of at most max_chars.

        Consecutive chunks share the last overlap_chars characters; empty or
        whitespace-only chunks are dropped.
        """
        cleaned = text.strip()
        if not cleaned:
            return []
        chunks: list[str] = []
        length = len(cleaned)
        start = 0
        while start < length:
            end = min(length, start + self._max_chars)
            chunks.append(cleaned[start:end].strip())
            if end >= length:
                break
            # overlap < max_chars guarantees forward progress here.
            start = end - self._overlap_chars
        return [chunk for chunk in chunks if chunk]
|
||||
18
app/modules/rag/indexing/docs/classifier.py
Normal file
18
app/modules/rag/indexing/docs/classifier.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
from app.modules.rag.contracts import DocKind
|
||||
|
||||
|
||||
class DocsClassifier:
    """Maps a documentation file path to a coarse document kind."""

    def classify(self, path: str) -> str:
        """Classify *path*; precedence is runbook > readme > spec > misc."""
        filename = PurePosixPath(path).name.upper()
        lowered = path.lower()
        is_runbook = "runbook" in lowered or filename.startswith("RUNBOOK")
        if is_runbook:
            return DocKind.RUNBOOK
        if filename.startswith("README"):
            return DocKind.README
        is_spec = any(marker in lowered for marker in ("spec", "architecture"))
        return DocKind.SPEC if is_spec else DocKind.MISC
|
||||
115
app/modules/rag/indexing/docs/document_builder.py
Normal file
115
app/modules/rag/indexing/docs/document_builder.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from hashlib import sha256
|
||||
|
||||
from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource
|
||||
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import SectionChunk
|
||||
|
||||
|
||||
class DocsDocumentBuilder:
    """Builds RagDocument payloads for the docs indexing layers.

    Each ``build_*`` method targets one RagLayer; the metadata dicts carry
    the exact keys downstream retrieval filters on — do not rename them.
    """

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Build a DOCS_MODULE_CATALOG document.

        Returns None when the frontmatter lacks any of the required
        ``id``/``type``/``domain`` keys.
        """
        module_id = str(frontmatter.get("id") or "").strip()
        module_type = str(frontmatter.get("type") or "").strip()
        domain = str(frontmatter.get("domain") or "").strip()
        if not module_id or not module_type or not domain:
            return None
        links = frontmatter.get("links") or {}
        metadata = {
            "module_id": module_id,
            "type": module_type,
            "domain": domain,
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": links,
            "source_path": source.path,
            "summary_text": summary_text[:4000],  # cap stored summary size
            "doc_kind": doc_kind,
        }
        # Promote each known relation list from `links` to a top-level
        # metadata key; missing relations become empty lists.
        metadata.update({name: links.get(name, []) for name in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        )})
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=module_id,
            # Never index an empty text body; fall back to the module id.
            text=summary_text[:4000] or module_id,
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Build a DOCS_SECTION_INDEX document for one markdown section."""
        # Files without a frontmatter id are keyed by their path instead.
        module_id = str(frontmatter.get("id") or source.path)
        metadata = {
            "module_id": module_id,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{module_id}:{chunk.section_title}",
            text=chunk.content,
            metadata=metadata,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Build a DOCS_POLICY_INDEX document.

        Returns None when the frontmatter has no ``id`` to serve as the
        policy identifier.
        """
        policy_id = str(frontmatter.get("id") or "").strip()
        applies_to = frontmatter.get("applies_to") or frontmatter.get("type")
        if not policy_id:
            return None
        metadata = {
            "policy_id": policy_id,
            "applies_to": applies_to,
            "rules": chunk.content[:4000],  # capped copy of the policy text
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=chunk.content[:4000],
            metadata=metadata,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Build a DOCS_FACT_INDEX document for one (subject, predicate, object) triple.

        ``anchor`` names where in the source the fact was found (e.g. a
        section title); it is also recorded on the evidence link.
        """
        # Deterministic id: the same triple from the same file + anchor
        # hashes to the same fact_id, so re-indexing dedupes.
        fact_id = sha256(f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}".encode("utf-8")).hexdigest()
        metadata = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=metadata,
            links=[EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)],
        )
|
||||
21
app/modules/rag/indexing/docs/file_filter.py
Normal file
21
app/modules/rag/indexing/docs/file_filter.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
|
||||
class DocsFileFilter:
    """Decides whether a repository path looks like human documentation."""

    _EXTENSIONS = {".md", ".rst", ".txt", ".adoc"}
    _NAMES = ("README", "CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK")
    _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"}
    _DOC_DIRS = {"docs", "doc", "documentation"}

    def should_index(self, path: str) -> bool:
        """Return True when *path* should be indexed as documentation.

        A path qualifies when it has a docs extension, a well-known docs
        filename prefix, or lives under a docs directory — unless any path
        component is an excluded build/vendor directory.
        """
        candidate = PurePosixPath(path)
        if any(part in self._EXCLUDE_PARTS for part in candidate.parts):
            return False
        if candidate.suffix.lower() in self._EXTENSIONS:
            return True
        if candidate.name.upper().startswith(self._NAMES):
            return True
        # Compare whole directory components (case-insensitive) rather than a
        # substring scan: the old `"docs/" in joined` matched "apidocs/x.py".
        return any(part.lower() in self._DOC_DIRS for part in candidate.parts[:-1])
|
||||
135
app/modules/rag/indexing/docs/pipeline.py
Normal file
135
app/modules/rag/indexing/docs/pipeline.py
Normal file
@@ -0,0 +1,135 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
import yaml
|
||||
|
||||
from app.modules.rag.contracts import RagDocument, RagSource
|
||||
from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker
|
||||
from app.modules.rag.indexing.docs.classifier import DocsClassifier
|
||||
from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder
|
||||
from app.modules.rag.indexing.docs.file_filter import DocsFileFilter
|
||||
|
||||
|
||||
class DocsIndexingPipeline:
    """Turns one documentation file into a list of RagDocument payloads.

    Wires the filter, classifier, chunker and builder together:
    frontmatter + body -> module-catalog, section, policy and fact documents.
    """

    def __init__(self) -> None:
        self._filter = DocsFileFilter()
        self._classifier = DocsClassifier()
        self._chunker = MarkdownDocChunker()
        self._builder = DocsDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return True when *path* is eligible for docs indexing."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Index one file into catalog, section, policy and fact documents."""
        source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        frontmatter, body = self._split_frontmatter(content)
        doc_kind = self._classifier.classify(path)
        sections = self._chunker.chunk(body)
        summary_text = self._summary_from_sections(sections)
        docs: list[RagDocument] = []
        module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind)
        if module_doc is not None:
            docs.append(module_doc)
        for section in sections:
            docs.append(self._builder.build_section(source, section, frontmatter, doc_kind))
        if str(frontmatter.get("type") or "").strip() == "policy":
            # Only the first section is treated as the policy statement.
            for section in sections[:1]:
                policy = self._builder.build_policy(source, frontmatter, section, doc_kind)
                if policy is not None:
                    docs.append(policy)
        docs.extend(self._extract_facts(source, frontmatter, sections))
        return docs

    def _split_frontmatter(self, content: str) -> tuple[dict, str]:
        """Split leading YAML frontmatter from the markdown body.

        Returns ``({}, content)`` when there is no frontmatter, when the
        opening ``---`` is never closed (the previous ``split("---", 2)``
        raised ValueError on that input), or when the YAML payload is
        malformed or not a mapping.
        """
        if not content.startswith("---\n"):
            return {}, content
        raw, closing, body = content[4:].partition("---")
        if not closing:
            # Unterminated frontmatter: treat the whole file as body.
            return {}, content
        try:
            payload = yaml.safe_load(raw) or {}
        except yaml.YAMLError:
            # Malformed frontmatter must not abort indexing of the file.
            return {}, content
        return (payload if isinstance(payload, dict) else {}), body.strip()

    def _summary_from_sections(self, sections) -> str:
        """Join the first two section bodies into a summary capped at 4000 chars."""
        text = "\n\n".join(section.content for section in sections[:2]).strip()
        return text[:4000]

    def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]:
        """Collect fact documents from frontmatter links, tables and lists."""
        subject_id = str(frontmatter.get("id") or source.path)
        docs: list[RagDocument] = []
        links = frontmatter.get("links") or {}
        for predicate, values in links.items():
            for value in values or []:
                docs.append(
                    self._builder.build_fact(
                        source,
                        subject_id=subject_id,
                        predicate=predicate,
                        obj=str(value),
                        object_ref=str(value),
                        anchor="frontmatter.links",
                    )
                )
        for section in sections:
            docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content))
            docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content))
        return docs

    def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Parse a markdown table (header, separator, rows) into fact documents."""
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        if len(lines) < 3 or "|" not in lines[0]:
            return []
        headers = [part.strip().lower() for part in lines[0].strip("|").split("|")]
        if not all(headers):
            return []
        docs: list[RagDocument] = []
        for row in lines[2:]:  # lines[1] is assumed to be the |---| separator
            if "|" not in row:
                continue
            values = [part.strip() for part in row.strip("|").split("|")]
            if len(values) != len(headers):
                # Ragged row: cannot be zipped against the header safely.
                continue
            payload = dict(zip(headers, values))
            docs.extend(self._facts_from_row(source, subject_id, title, payload))
        return docs

    def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]:
        """Map one table row to facts, keyed by the section title it sits in."""
        docs: list[RagDocument] = []
        name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code")
        title_lower = title.lower()
        if "request" in title_lower or "response" in title_lower:
            if name:
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title))
                # Field attribute facts are only meaningful when the row has a
                # name; previously a missing name yielded "None:<value>" objects.
                if payload.get("required"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title))
                if payload.get("type"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title))
                if payload.get("validation"):
                    docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title))
        if "error" in title_lower:
            if payload.get("status"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title))
            if payload.get("error") or payload.get("code"):
                error_value = payload.get("error") or payload.get("code")
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title))
            if payload.get("client action"):
                docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title))
        if "constraint" in title_lower and name:
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title))
        return docs

    def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]:
        """Map bullet / numbered list items to facts via their "<kind>:" prefix."""
        docs: list[RagDocument] = []
        for line in content.splitlines():
            item = line.strip()
            if not item.startswith(("-", "*", "1.", "2.", "3.")):
                continue
            normalized = re.sub(r"^[-*0-9. ]+", "", item).strip()
            if not normalized:
                # Horizontal rules ("---") and empty bullets carry no fact.
                continue
            lowered = normalized.lower()
            if lowered.startswith("metric:"):
                predicate = "emits_metric"
            elif lowered.startswith("event:"):
                predicate = "emits_analytics_event"
            elif lowered.startswith("log:"):
                predicate = "logs_event"
            else:
                predicate = "validates_rule" if "rule" in title.lower() else "client_action"
            docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title))
        return docs
|
||||
Reference in New Issue
Block a user