гих хук и сохранение изменений в контексте стори

This commit is contained in:
2026-01-31 00:32:36 +03:00
parent 5ce6335ad8
commit 20af12f47d
17 changed files with 695 additions and 40 deletions

View File

@@ -1 +1,11 @@
__all__ = []
from rag_agent.index.postgres import (
ChangedChunkRecord,
fetch_changed_chunks,
get_story_indexed_range,
)
__all__ = [
"ChangedChunkRecord",
"fetch_changed_chunks",
"get_story_indexed_range",
]

View File

@@ -2,13 +2,17 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable
from typing import Iterable, Sequence
import psycopg
from pgvector.psycopg import register_vector
from rag_agent.ingest.chunker import TextChunk
CHANGE_ADDED = "added"
CHANGE_MODIFIED = "modified"
CHANGE_UNCHANGED = "unchanged"
@dataclass(frozen=True)
class ChunkRecord:
@@ -18,6 +22,18 @@ class ChunkRecord:
embedding: list[float]
@dataclass(frozen=True)
class ChangedChunkRecord:
    """Chunk that was added or modified in a story (for test-case generation)."""

    # Path of the document the chunk belongs to (as stored in documents.path).
    path: str
    # Current text of the chunk.
    content: str
    # One of the CHANGE_* constants stored in chunks.change_type
    # ('added' or 'modified' for records returned by fetch_changed_chunks).
    change_type: str
    # 1-based line range of the chunk in its document; None when the
    # chunker did not record line positions.
    start_line: int | None
    end_line: int | None
    # Base-revision text for 'modified' chunks; None for 'added' ones.
    previous_content: str | None
def connect(dsn: str) -> psycopg.Connection:
conn = psycopg.connect(dsn)
register_vector(conn)
@@ -32,10 +48,22 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
CREATE TABLE IF NOT EXISTS stories (
id SERIAL PRIMARY KEY,
slug TEXT UNIQUE NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc')
created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc'),
indexed_base_ref TEXT,
indexed_head_ref TEXT,
indexed_at TIMESTAMPTZ
);
"""
)
for col_def in (
"ADD COLUMN IF NOT EXISTS indexed_base_ref TEXT",
"ADD COLUMN IF NOT EXISTS indexed_head_ref TEXT",
"ADD COLUMN IF NOT EXISTS indexed_at TIMESTAMPTZ",
):
try:
cur.execute(f"ALTER TABLE stories {col_def};")
except psycopg.ProgrammingError:
pass
cur.execute(
"""
CREATE TABLE IF NOT EXISTS documents (
@@ -56,10 +84,41 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
chunk_index INTEGER NOT NULL,
hash TEXT NOT NULL,
content TEXT NOT NULL,
embedding vector({embeddings_dim}) NOT NULL
embedding vector({embeddings_dim}) NOT NULL,
start_line INTEGER,
end_line INTEGER,
change_type TEXT NOT NULL DEFAULT 'added'
CHECK (change_type IN ('added', 'modified', 'unchanged')),
previous_content TEXT
);
"""
)
# Migrations: add columns if table already existed without them (Postgres 11+)
for col_def in (
"ADD COLUMN IF NOT EXISTS start_line INTEGER",
"ADD COLUMN IF NOT EXISTS end_line INTEGER",
"ADD COLUMN IF NOT EXISTS previous_content TEXT",
"ADD COLUMN IF NOT EXISTS change_type TEXT DEFAULT 'added'",
):
try:
cur.execute(f"ALTER TABLE chunks {col_def};")
except psycopg.ProgrammingError:
pass
try:
cur.execute(
"ALTER TABLE chunks ALTER COLUMN change_type SET NOT NULL;"
)
except psycopg.ProgrammingError:
pass
try:
cur.execute(
"""
ALTER TABLE chunks ADD CONSTRAINT chunks_change_type_check
CHECK (change_type IN ('added', 'modified', 'unchanged'));
"""
)
except psycopg.ProgrammingError:
pass # constraint may already exist
cur.execute(
"""
CREATE INDEX IF NOT EXISTS idx_documents_story_id
@@ -78,6 +137,12 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
ON chunks USING ivfflat (embedding vector_cosine_ops);
"""
)
cur.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_change_type
ON chunks(change_type);
"""
)
conn.commit()
@@ -97,6 +162,44 @@ def get_or_create_story(conn: psycopg.Connection, slug: str) -> int:
return story_id
def update_story_indexed_range(
    conn: psycopg.Connection,
    story_id: int,
    base_ref: str,
    head_ref: str,
) -> None:
    """Record that this story was indexed as all changes from base_ref to head_ref (all commits in story)."""
    # Normalise the refs once; indexed_at is stamped server-side in UTC.
    params = (base_ref.strip(), head_ref.strip(), story_id)
    with conn.cursor() as cur:
        cur.execute(
            """
            UPDATE stories
            SET indexed_base_ref = %s, indexed_head_ref = %s,
                indexed_at = (NOW() AT TIME ZONE 'utc')
            WHERE id = %s;
            """,
            params,
        )
    conn.commit()
def get_story_indexed_range(
    conn: psycopg.Connection, story_id: int
) -> tuple[str | None, str | None, datetime | None]:
    """Return (indexed_base_ref, indexed_head_ref, indexed_at) for the story, or (None, None, None)."""
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT indexed_base_ref, indexed_head_ref, indexed_at
            FROM stories WHERE id = %s;
            """,
            (story_id,),
        )
        row = cur.fetchone()
    # No row means the story id does not exist: report it as never indexed.
    if row is None:
        return (None, None, None)
    base_ref, head_ref, indexed_at = row
    return (base_ref, head_ref, indexed_at)
def get_story_id(conn: psycopg.Connection, slug: str) -> int | None:
s = slug.strip()
with conn.cursor() as cur:
@@ -127,28 +230,98 @@ def upsert_document(
return document_id
def _change_type_and_previous(
    chunk: TextChunk,
    base_by_range: dict[tuple[int, int], TextChunk],
) -> tuple[str, str | None]:
    """Determine change_type and previous_content for a chunk given base chunks keyed by (start_line, end_line)."""
    # A chunk without line positions can never be matched to a base chunk,
    # so it is reported as newly added.
    has_range = chunk.start_line is not None and chunk.end_line is not None
    base = base_by_range.get((chunk.start_line, chunk.end_line)) if has_range else None
    if base is None:
        return (CHANGE_ADDED, None)
    if base.hash == chunk.hash:
        return (CHANGE_UNCHANGED, None)
    # Same line range but different hash: the content changed; keep the
    # base-revision text so callers can diff against it.
    return (CHANGE_MODIFIED, base.text)
def replace_chunks(
    conn: psycopg.Connection,
    document_id: int,
    chunks: Iterable[TextChunk],
    embeddings: Iterable[list[float]],
    base_chunks: Sequence[TextChunk] | None = None,
) -> None:
    """Delete all chunks of a document and insert the given ones.

    Each inserted chunk is classified (added / modified / unchanged) against
    base_chunks by matching (start_line, end_line) ranges.
    """
    # Index the base revision by line range; chunks without positions are
    # skipped since they can never be matched.
    base_by_range: dict[tuple[int, int], TextChunk] = {
        (c.start_line, c.end_line): c
        for c in (base_chunks or ())
        if c.start_line is not None and c.end_line is not None
    }
    insert_sql = """
        INSERT INTO chunks (
            document_id, chunk_index, hash, content, embedding,
            start_line, end_line, change_type, previous_content
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
    """
    with conn.cursor() as cur:
        # Full replacement: wipe the document's old chunks first.
        cur.execute("DELETE FROM chunks WHERE document_id = %s;", (document_id,))
        for chunk, embedding in zip(chunks, embeddings):
            change_type, previous = _change_type_and_previous(chunk, base_by_range)
            cur.execute(
                insert_sql,
                (
                    document_id,
                    chunk.index,
                    chunk.hash,
                    chunk.text,
                    embedding,
                    chunk.start_line,
                    chunk.end_line,
                    change_type,
                    previous,
                ),
            )
    conn.commit()
def fetch_changed_chunks(
    conn: psycopg.Connection, story_id: int
) -> list[ChangedChunkRecord]:
    """Return chunks that were added or modified in this story (for test-case generation)."""
    query = """
        SELECT d.path, c.content, c.change_type, c.start_line, c.end_line,
               c.previous_content
        FROM chunks c
        JOIN documents d ON d.id = c.document_id
        WHERE d.story_id = %s
          AND c.change_type IN ('added', 'modified')
        ORDER BY d.path, c.start_line NULLS FIRST, c.chunk_index;
    """
    with conn.cursor() as cur:
        cur.execute(query, (story_id,))
        rows = cur.fetchall()
    # The SELECT column order matches ChangedChunkRecord's field order
    # (path, content, change_type, start_line, end_line, previous_content),
    # so positional construction is safe.
    return [ChangedChunkRecord(*row) for row in rows]
def delete_document(
conn: psycopg.Connection, story_id: int, path: str
) -> None: