гих хук и сохранение изменений в контексте стори

This commit is contained in:
2026-01-31 00:32:36 +03:00
parent 5ce6335ad8
commit 20af12f47d
17 changed files with 695 additions and 40 deletions

View File

@@ -1 +1,11 @@
__all__ = []
from rag_agent.index.postgres import (
ChangedChunkRecord,
fetch_changed_chunks,
get_story_indexed_range,
)
__all__ = [
"ChangedChunkRecord",
"fetch_changed_chunks",
"get_story_indexed_range",
]

View File

@@ -2,13 +2,17 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable
from typing import Iterable, Sequence
import psycopg
from pgvector.psycopg import register_vector
from rag_agent.ingest.chunker import TextChunk
CHANGE_ADDED = "added"
CHANGE_MODIFIED = "modified"
CHANGE_UNCHANGED = "unchanged"
@dataclass(frozen=True)
class ChunkRecord:
@@ -18,6 +22,18 @@ class ChunkRecord:
embedding: list[float]
@dataclass(frozen=True)
class ChangedChunkRecord:
    """Chunk that was added or modified in a story (for test-case generation)."""

    # Path of the document the chunk belongs to (as stored in documents.path).
    path: str
    # Current text of the chunk.
    content: str
    # One of the CHANGE_* constants stored in chunks.change_type
    # ('added' or 'modified' for records returned by fetch_changed_chunks).
    change_type: str
    # 1-based line range of the chunk in its document; None when the
    # chunker did not record line positions.
    start_line: int | None
    end_line: int | None
    # Base-revision text for 'modified' chunks; None for 'added' ones.
    previous_content: str | None
def connect(dsn: str) -> psycopg.Connection:
conn = psycopg.connect(dsn)
register_vector(conn)
@@ -32,10 +48,22 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
CREATE TABLE IF NOT EXISTS stories (
id SERIAL PRIMARY KEY,
slug TEXT UNIQUE NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc')
created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc'),
indexed_base_ref TEXT,
indexed_head_ref TEXT,
indexed_at TIMESTAMPTZ
);
"""
)
for col_def in (
"ADD COLUMN IF NOT EXISTS indexed_base_ref TEXT",
"ADD COLUMN IF NOT EXISTS indexed_head_ref TEXT",
"ADD COLUMN IF NOT EXISTS indexed_at TIMESTAMPTZ",
):
try:
cur.execute(f"ALTER TABLE stories {col_def};")
except psycopg.ProgrammingError:
pass
cur.execute(
"""
CREATE TABLE IF NOT EXISTS documents (
@@ -56,10 +84,41 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
chunk_index INTEGER NOT NULL,
hash TEXT NOT NULL,
content TEXT NOT NULL,
embedding vector({embeddings_dim}) NOT NULL
embedding vector({embeddings_dim}) NOT NULL,
start_line INTEGER,
end_line INTEGER,
change_type TEXT NOT NULL DEFAULT 'added'
CHECK (change_type IN ('added', 'modified', 'unchanged')),
previous_content TEXT
);
"""
)
# Migrations: add columns if table already existed without them (Postgres 11+)
for col_def in (
"ADD COLUMN IF NOT EXISTS start_line INTEGER",
"ADD COLUMN IF NOT EXISTS end_line INTEGER",
"ADD COLUMN IF NOT EXISTS previous_content TEXT",
"ADD COLUMN IF NOT EXISTS change_type TEXT DEFAULT 'added'",
):
try:
cur.execute(f"ALTER TABLE chunks {col_def};")
except psycopg.ProgrammingError:
pass
try:
cur.execute(
"ALTER TABLE chunks ALTER COLUMN change_type SET NOT NULL;"
)
except psycopg.ProgrammingError:
pass
try:
cur.execute(
"""
ALTER TABLE chunks ADD CONSTRAINT chunks_change_type_check
CHECK (change_type IN ('added', 'modified', 'unchanged'));
"""
)
except psycopg.ProgrammingError:
pass # constraint may already exist
cur.execute(
"""
CREATE INDEX IF NOT EXISTS idx_documents_story_id
@@ -78,6 +137,12 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
ON chunks USING ivfflat (embedding vector_cosine_ops);
"""
)
cur.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_change_type
ON chunks(change_type);
"""
)
conn.commit()
@@ -97,6 +162,44 @@ def get_or_create_story(conn: psycopg.Connection, slug: str) -> int:
return story_id
def update_story_indexed_range(
    conn: psycopg.Connection,
    story_id: int,
    base_ref: str,
    head_ref: str,
) -> None:
    """Record that this story was indexed as all changes from base_ref to head_ref (all commits in story)."""
    # Normalise the refs once; indexed_at is stamped server-side in UTC.
    params = (base_ref.strip(), head_ref.strip(), story_id)
    with conn.cursor() as cur:
        cur.execute(
            """
            UPDATE stories
            SET indexed_base_ref = %s, indexed_head_ref = %s,
                indexed_at = (NOW() AT TIME ZONE 'utc')
            WHERE id = %s;
            """,
            params,
        )
    conn.commit()
def get_story_indexed_range(
    conn: psycopg.Connection, story_id: int
) -> tuple[str | None, str | None, datetime | None]:
    """Return (indexed_base_ref, indexed_head_ref, indexed_at) for the story, or (None, None, None)."""
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT indexed_base_ref, indexed_head_ref, indexed_at
            FROM stories WHERE id = %s;
            """,
            (story_id,),
        )
        row = cur.fetchone()
    # No row means the story id does not exist: report it as never indexed.
    if row is None:
        return (None, None, None)
    base_ref, head_ref, indexed_at = row
    return (base_ref, head_ref, indexed_at)
def get_story_id(conn: psycopg.Connection, slug: str) -> int | None:
s = slug.strip()
with conn.cursor() as cur:
@@ -127,28 +230,98 @@ def upsert_document(
return document_id
def _change_type_and_previous(
    chunk: TextChunk,
    base_by_range: dict[tuple[int, int], TextChunk],
) -> tuple[str, str | None]:
    """Determine change_type and previous_content for a chunk given base chunks keyed by (start_line, end_line)."""
    # A chunk without line positions can never be matched to a base chunk,
    # so it is reported as newly added.
    has_range = chunk.start_line is not None and chunk.end_line is not None
    base = base_by_range.get((chunk.start_line, chunk.end_line)) if has_range else None
    if base is None:
        return (CHANGE_ADDED, None)
    if base.hash == chunk.hash:
        return (CHANGE_UNCHANGED, None)
    # Same line range but different hash: the content changed; keep the
    # base-revision text so callers can diff against it.
    return (CHANGE_MODIFIED, base.text)
def replace_chunks(
    conn: psycopg.Connection,
    document_id: int,
    chunks: Iterable[TextChunk],
    embeddings: Iterable[list[float]],
    base_chunks: Sequence[TextChunk] | None = None,
) -> None:
    """Delete all chunks of a document and insert the given ones.

    Each inserted chunk is classified (added / modified / unchanged) against
    base_chunks by matching (start_line, end_line) ranges.
    """
    # Index the base revision by line range; chunks without positions are
    # skipped since they can never be matched.
    base_by_range: dict[tuple[int, int], TextChunk] = {
        (c.start_line, c.end_line): c
        for c in (base_chunks or ())
        if c.start_line is not None and c.end_line is not None
    }
    insert_sql = """
        INSERT INTO chunks (
            document_id, chunk_index, hash, content, embedding,
            start_line, end_line, change_type, previous_content
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
    """
    with conn.cursor() as cur:
        # Full replacement: wipe the document's old chunks first.
        cur.execute("DELETE FROM chunks WHERE document_id = %s;", (document_id,))
        for chunk, embedding in zip(chunks, embeddings):
            change_type, previous = _change_type_and_previous(chunk, base_by_range)
            cur.execute(
                insert_sql,
                (
                    document_id,
                    chunk.index,
                    chunk.hash,
                    chunk.text,
                    embedding,
                    chunk.start_line,
                    chunk.end_line,
                    change_type,
                    previous,
                ),
            )
    conn.commit()
def fetch_changed_chunks(
    conn: psycopg.Connection, story_id: int
) -> list[ChangedChunkRecord]:
    """Return chunks that were added or modified in this story (for test-case generation)."""
    query = """
        SELECT d.path, c.content, c.change_type, c.start_line, c.end_line,
               c.previous_content
        FROM chunks c
        JOIN documents d ON d.id = c.document_id
        WHERE d.story_id = %s
          AND c.change_type IN ('added', 'modified')
        ORDER BY d.path, c.start_line NULLS FIRST, c.chunk_index;
    """
    with conn.cursor() as cur:
        cur.execute(query, (story_id,))
        rows = cur.fetchall()
    # The SELECT column order matches ChangedChunkRecord's field order
    # (path, content, change_type, start_line, end_line, previous_content),
    # so positional construction is safe.
    return [ChangedChunkRecord(*row) for row in rows]
def delete_document(
conn: psycopg.Connection, story_id: int, path: str
) -> None: