Скелет проекта
This commit is contained in:
1
src/rag_agent/index/__init__.py
Normal file
1
src/rag_agent/index/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
__all__ = []
|
||||
BIN
src/rag_agent/index/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
src/rag_agent/index/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/rag_agent/index/__pycache__/embeddings.cpython-312.pyc
Normal file
BIN
src/rag_agent/index/__pycache__/embeddings.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/rag_agent/index/__pycache__/postgres.cpython-312.pyc
Normal file
BIN
src/rag_agent/index/__pycache__/postgres.cpython-312.pyc
Normal file
Binary file not shown.
29
src/rag_agent/index/embeddings.py
Normal file
29
src/rag_agent/index/embeddings.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, Protocol
|
||||
|
||||
|
||||
class EmbeddingClient(Protocol):
    """Structural interface for anything that can embed a batch of texts."""

    def embed_texts(self, texts: Iterable[str]) -> list[list[float]]:
        """Return one embedding vector per input text, in input order."""
        raise NotImplementedError
|
||||
|
||||
|
||||
@dataclass
class StubEmbeddingClient:
    """Deterministic, offline embedding client.

    Derives a pseudo-embedding from the SHA-256 digest of each text: the 32
    digest bytes are scaled into [0, 1], then tiled and truncated to ``dim``
    components.  Useful for tests and environments without a real provider.
    """

    # Target dimensionality of every returned vector.
    dim: int

    def embed_texts(self, texts: Iterable[str]) -> list[list[float]]:
        """Embed each text into a ``dim``-length vector of floats in [0, 1]."""
        results: list[list[float]] = []
        for item in texts:
            digest_bytes = hashlib.sha256(item.encode("utf-8")).digest()
            components = [byte / 255.0 for byte in digest_bytes]
            if len(components) < self.dim:
                # Tile the 32 base components until at least `dim` are available.
                repeats = (self.dim // len(components)) + 1
                components = (components * repeats)[: self.dim]
            results.append(components[: self.dim])
        return results
|
||||
|
||||
|
||||
def get_embedding_client(dim: int) -> EmbeddingClient:
    """Factory for the project's embedding client.

    Currently always returns the hash-based stub; swap the implementation
    here when a real embedding provider is wired in.
    """
    client = StubEmbeddingClient(dim=dim)
    return client
|
||||
194
src/rag_agent/index/postgres.py
Normal file
194
src/rag_agent/index/postgres.py
Normal file
@@ -0,0 +1,194 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable
|
||||
|
||||
import psycopg
|
||||
from pgvector.psycopg import register_vector
|
||||
|
||||
from rag_agent.ingest.chunker import TextChunk
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChunkRecord:
    """Immutable pairing of a text chunk with its embedding and source document.

    Attributes:
        document_path: Path of the source document within the story.
        document_version: Version string of the document at indexing time.
        chunk: The chunked text segment (project ``TextChunk``).
        embedding: Embedding vector computed for ``chunk``.
    """

    document_path: str
    document_version: str
    chunk: TextChunk
    embedding: list[float]
|
||||
|
||||
|
||||
def connect(dsn: str) -> psycopg.Connection:
    """Open a database connection and enable pgvector type adaptation.

    Args:
        dsn: A libpq-style connection string.

    Returns:
        A psycopg connection with the ``vector`` type registered.
    """
    connection = psycopg.connect(dsn)
    register_vector(connection)
    return connection
|
||||
|
||||
|
||||
def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
    """Create the pgvector extension, tables, and indexes if they are absent.

    Idempotent: every statement uses ``IF NOT EXISTS``.  Commits on success.

    Args:
        conn: Open psycopg connection.
        embeddings_dim: Dimensionality of the ``chunks.embedding`` vector column.

    Raises:
        ValueError: If ``embeddings_dim`` is not a positive integer.
    """
    # The vector(N) type modifier cannot be sent as a bound query parameter,
    # so the dimension must be interpolated into the DDL string.  Validate it
    # strictly first so the f-string below cannot inject arbitrary SQL.
    if (
        not isinstance(embeddings_dim, int)
        or isinstance(embeddings_dim, bool)
        or embeddings_dim <= 0
    ):
        raise ValueError(
            f"embeddings_dim must be a positive integer, got {embeddings_dim!r}"
        )
    with conn.cursor() as cur:
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS stories (
                id SERIAL PRIMARY KEY,
                slug TEXT UNIQUE NOT NULL,
                created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc')
            );
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS documents (
                id SERIAL PRIMARY KEY,
                story_id INTEGER NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
                path TEXT NOT NULL,
                version TEXT NOT NULL,
                updated_at TIMESTAMPTZ NOT NULL,
                UNIQUE(story_id, path)
            );
            """
        )
        cur.execute(
            f"""
            CREATE TABLE IF NOT EXISTS chunks (
                id SERIAL PRIMARY KEY,
                document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
                chunk_index INTEGER NOT NULL,
                hash TEXT NOT NULL,
                content TEXT NOT NULL,
                embedding vector({embeddings_dim}) NOT NULL
            );
            """
        )
        cur.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_documents_story_id
            ON documents(story_id);
            """
        )
        cur.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_chunks_document_id
            ON chunks(document_id);
            """
        )
        # NOTE(review): ivfflat built on a possibly-empty table uses default
        # lists; pgvector recommends creating it after data load — confirm.
        cur.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_chunks_embedding
            ON chunks USING ivfflat (embedding vector_cosine_ops);
            """
        )
    conn.commit()
|
||||
|
||||
|
||||
def get_or_create_story(conn: psycopg.Connection, slug: str) -> int:
    """Return the id for ``slug``, inserting the story if it does not exist.

    The no-op ``ON CONFLICT ... DO UPDATE`` makes ``RETURNING id`` yield the
    existing row's id on conflict instead of returning nothing.  Commits
    before returning.
    """
    normalized = slug.strip()
    with conn.cursor() as cur:
        cur.execute(
            """
            INSERT INTO stories (slug)
            VALUES (%s)
            ON CONFLICT (slug) DO UPDATE SET slug = EXCLUDED.slug
            RETURNING id;
            """,
            (normalized,),
        )
        row = cur.fetchone()
    conn.commit()
    return row[0]
|
||||
|
||||
|
||||
def get_story_id(conn: psycopg.Connection, slug: str) -> int | None:
    """Look up a story id by slug, or by numeric id when ``slug`` is all digits.

    Returns:
        The story id, or ``None`` if no matching story exists.
    """
    key = slug.strip()
    if key.isdigit():
        # A purely numeric "slug" is treated as a direct primary-key lookup.
        query, param = "SELECT id FROM stories WHERE id = %s;", int(key)
    else:
        query, param = "SELECT id FROM stories WHERE slug = %s;", key
    with conn.cursor() as cur:
        cur.execute(query, (param,))
        row = cur.fetchone()
    return row[0] if row else None
|
||||
|
||||
|
||||
def upsert_document(
    conn: psycopg.Connection, story_id: int, path: str, version: str
) -> int:
    """Insert or refresh a document row and return its id.

    On conflict with an existing (story_id, path) pair, the version and
    updated_at columns are overwritten.

    NOTE(review): unlike the sibling helpers, this does not commit —
    presumably so the caller can commit document + chunks together via
    ``replace_chunks``; confirm that callers rely on this.
    """
    now = datetime.now(timezone.utc)
    with conn.cursor() as cur:
        cur.execute(
            """
            INSERT INTO documents (story_id, path, version, updated_at)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (story_id, path) DO UPDATE
            SET version = EXCLUDED.version,
                updated_at = EXCLUDED.updated_at
            RETURNING id;
            """,
            (story_id, path, version, now),
        )
        return cur.fetchone()[0]
|
||||
|
||||
|
||||
def replace_chunks(
|
||||
conn: psycopg.Connection,
|
||||
document_id: int,
|
||||
chunks: Iterable[TextChunk],
|
||||
embeddings: Iterable[list[float]],
|
||||
) -> None:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"DELETE FROM chunks WHERE document_id = %s;",
|
||||
(document_id,),
|
||||
)
|
||||
for chunk, embedding in zip(chunks, embeddings):
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO chunks (document_id, chunk_index, hash, content, embedding)
|
||||
VALUES (%s, %s, %s, %s, %s);
|
||||
""",
|
||||
(document_id, chunk.index, chunk.hash, chunk.text, embedding),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def delete_document(
    conn: psycopg.Connection, story_id: int, path: str
) -> None:
    """Delete a document row; its chunks go with it via ON DELETE CASCADE.

    No-op when the document does not exist.  Commits before returning.
    """
    statement = "DELETE FROM documents WHERE story_id = %s AND path = %s;"
    with conn.cursor() as cur:
        cur.execute(statement, (story_id, path))
    conn.commit()
|
||||
|
||||
|
||||
def fetch_similar(
    conn: psycopg.Connection,
    query_embedding: list[float],
    top_k: int,
    story_id: int | None = None,
) -> list[tuple[str, str, float]]:
    """Return the ``top_k`` chunks nearest to ``query_embedding``.

    Uses the pgvector cosine-distance operator ``<=>``; smaller distance
    means more similar.  When ``story_id`` is given, results are restricted
    to that story's documents.

    Args:
        conn: Open psycopg connection with pgvector registered.
        query_embedding: Query vector; must match the column dimension.
        top_k: Maximum number of rows to return.
        story_id: Optional story filter.

    Returns:
        ``(document path, chunk content, cosine distance)`` tuples,
        nearest first.
    """
    # The two original branches differed only in the WHERE clause, so build
    # one statement conditionally instead of duplicating the SQL.
    where_clause = "WHERE d.story_id = %s" if story_id is not None else ""
    sql = f"""
        SELECT d.path, c.content, c.embedding <=> %s AS distance
        FROM chunks c
        JOIN documents d ON d.id = c.document_id
        {where_clause}
        ORDER BY c.embedding <=> %s
        LIMIT %s;
        """
    if story_id is not None:
        params = (query_embedding, story_id, query_embedding, top_k)
    else:
        params = (query_embedding, query_embedding, top_k)
    with conn.cursor() as cur:
        cur.execute(sql, params)
        rows = cur.fetchall()
    return [(row[0], row[1], row[2]) for row in rows]
|
||||
Reference in New Issue
Block a user