гит-хук и сохранение изменений в контексте стори
This commit is contained in:
@@ -1 +1,11 @@
|
||||
__all__ = []
|
||||
from rag_agent.index.postgres import (
|
||||
ChangedChunkRecord,
|
||||
fetch_changed_chunks,
|
||||
get_story_indexed_range,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ChangedChunkRecord",
|
||||
"fetch_changed_chunks",
|
||||
"get_story_indexed_range",
|
||||
]
|
||||
|
||||
@@ -2,13 +2,17 @@ from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable
|
||||
from typing import Iterable, Sequence
|
||||
|
||||
import psycopg
|
||||
from pgvector.psycopg import register_vector
|
||||
|
||||
from rag_agent.ingest.chunker import TextChunk
|
||||
|
||||
# Allowed values for chunks.change_type. Keep these in sync with the
# CHECK (change_type IN (...)) constraint declared on the chunks table.
CHANGE_ADDED = "added"  # chunk has no counterpart in the base version
CHANGE_MODIFIED = "modified"  # same line range as a base chunk, different hash
CHANGE_UNCHANGED = "unchanged"  # same line range and same hash as a base chunk
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ChunkRecord:
|
||||
@@ -18,6 +22,18 @@ class ChunkRecord:
|
||||
embedding: list[float]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChangedChunkRecord:
    """Immutable description of one chunk a story added or modified.

    Instances are intended as input for test-case generation.
    """

    # Path of the document the chunk belongs to.
    path: str
    # Current text of the chunk.
    content: str
    # Change classification stored with the chunk (e.g. "added", "modified").
    change_type: str
    # First and last line of the chunk; None when the range is not known.
    start_line: int | None
    end_line: int | None
    # Text of the base-version chunk for modified chunks; None otherwise.
    previous_content: str | None
|
||||
|
||||
|
||||
def connect(dsn: str) -> psycopg.Connection:
|
||||
conn = psycopg.connect(dsn)
|
||||
register_vector(conn)
|
||||
@@ -32,10 +48,22 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
|
||||
CREATE TABLE IF NOT EXISTS stories (
|
||||
id SERIAL PRIMARY KEY,
|
||||
slug TEXT UNIQUE NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc')
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc'),
|
||||
indexed_base_ref TEXT,
|
||||
indexed_head_ref TEXT,
|
||||
indexed_at TIMESTAMPTZ
|
||||
);
|
||||
"""
|
||||
)
|
||||
for col_def in (
|
||||
"ADD COLUMN IF NOT EXISTS indexed_base_ref TEXT",
|
||||
"ADD COLUMN IF NOT EXISTS indexed_head_ref TEXT",
|
||||
"ADD COLUMN IF NOT EXISTS indexed_at TIMESTAMPTZ",
|
||||
):
|
||||
try:
|
||||
cur.execute(f"ALTER TABLE stories {col_def};")
|
||||
except psycopg.ProgrammingError:
|
||||
pass
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
@@ -56,10 +84,41 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
|
||||
chunk_index INTEGER NOT NULL,
|
||||
hash TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
embedding vector({embeddings_dim}) NOT NULL
|
||||
embedding vector({embeddings_dim}) NOT NULL,
|
||||
start_line INTEGER,
|
||||
end_line INTEGER,
|
||||
change_type TEXT NOT NULL DEFAULT 'added'
|
||||
CHECK (change_type IN ('added', 'modified', 'unchanged')),
|
||||
previous_content TEXT
|
||||
);
|
||||
"""
|
||||
)
|
||||
# Migrations: add columns if table already existed without them (Postgres 11+)
|
||||
for col_def in (
|
||||
"ADD COLUMN IF NOT EXISTS start_line INTEGER",
|
||||
"ADD COLUMN IF NOT EXISTS end_line INTEGER",
|
||||
"ADD COLUMN IF NOT EXISTS previous_content TEXT",
|
||||
"ADD COLUMN IF NOT EXISTS change_type TEXT DEFAULT 'added'",
|
||||
):
|
||||
try:
|
||||
cur.execute(f"ALTER TABLE chunks {col_def};")
|
||||
except psycopg.ProgrammingError:
|
||||
pass
|
||||
try:
|
||||
cur.execute(
|
||||
"ALTER TABLE chunks ALTER COLUMN change_type SET NOT NULL;"
|
||||
)
|
||||
except psycopg.ProgrammingError:
|
||||
pass
|
||||
try:
|
||||
cur.execute(
|
||||
"""
|
||||
ALTER TABLE chunks ADD CONSTRAINT chunks_change_type_check
|
||||
CHECK (change_type IN ('added', 'modified', 'unchanged'));
|
||||
"""
|
||||
)
|
||||
except psycopg.ProgrammingError:
|
||||
pass # constraint may already exist
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_story_id
|
||||
@@ -78,6 +137,12 @@ def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None:
|
||||
ON chunks USING ivfflat (embedding vector_cosine_ops);
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_chunks_change_type
|
||||
ON chunks(change_type);
|
||||
"""
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
@@ -97,6 +162,44 @@ def get_or_create_story(conn: psycopg.Connection, slug: str) -> int:
|
||||
return story_id
|
||||
|
||||
|
||||
def update_story_indexed_range(
    conn: psycopg.Connection,
    story_id: int,
    base_ref: str,
    head_ref: str,
) -> None:
    """Record that a story was indexed as all changes from base_ref to head_ref.

    Stores the whitespace-stripped refs together with the current timestamp
    on the ``stories`` row identified by ``story_id`` and commits.

    Args:
        conn: Open database connection; the transaction is committed here.
        story_id: Primary key of the row in ``stories`` to update.
        base_ref: Ref the indexed range starts from.
        head_ref: Ref the indexed range ends at.
    """
    with conn.cursor() as cur:
        cur.execute(
            """
            UPDATE stories
            SET indexed_base_ref = %s, indexed_head_ref = %s,
                indexed_at = NOW()
            WHERE id = %s;
            """,
            # NOW() is already a timestamptz. Applying AT TIME ZONE 'utc'
            # would turn it into a naive timestamp that Postgres then
            # re-interprets in the session time zone when storing it into the
            # TIMESTAMPTZ column, shifting the instant on non-UTC sessions.
            (base_ref.strip(), head_ref.strip(), story_id),
        )
    conn.commit()
|
||||
|
||||
|
||||
def get_story_indexed_range(
    conn: psycopg.Connection, story_id: int
) -> tuple[str | None, str | None, datetime | None]:
    """Look up the indexed ref range recorded for a story.

    Returns:
        ``(indexed_base_ref, indexed_head_ref, indexed_at)`` from the story's
        row, or ``(None, None, None)`` when no row has the given id.
    """
    query = """
        SELECT indexed_base_ref, indexed_head_ref, indexed_at
        FROM stories WHERE id = %s;
        """
    with conn.cursor() as cur:
        cur.execute(query, (story_id,))
        found = cur.fetchone()

    # No such story: report an "unknown" range instead of raising.
    if found is None:
        return (None, None, None)

    base_ref, head_ref, indexed_at = found[0], found[1], found[2]
    return (base_ref, head_ref, indexed_at)
|
||||
|
||||
|
||||
def get_story_id(conn: psycopg.Connection, slug: str) -> int | None:
|
||||
s = slug.strip()
|
||||
with conn.cursor() as cur:
|
||||
@@ -127,28 +230,98 @@ def upsert_document(
|
||||
return document_id
|
||||
|
||||
|
||||
def _change_type_and_previous(
    chunk: TextChunk,
    base_by_range: dict[tuple[int, int], TextChunk],
) -> tuple[str, str | None]:
    """Classify a chunk against the base chunks and pick its previous text.

    Base chunks are looked up by their exact ``(start_line, end_line)`` range.

    Returns:
        ``(change_type, previous_content)``. ``previous_content`` is the base
        chunk's text for modified chunks and ``None`` in every other case.
    """
    first, last = chunk.start_line, chunk.end_line
    if first is not None and last is not None:
        counterpart = base_by_range.get((first, last))
        if counterpart is not None:
            # Same range in the base: the hash decides modified vs unchanged.
            if counterpart.hash != chunk.hash:
                return (CHANGE_MODIFIED, counterpart.text)
            return (CHANGE_UNCHANGED, None)
    # Unknown line range, or no base chunk at that range: treat as new.
    return (CHANGE_ADDED, None)
|
||||
|
||||
|
||||
def replace_chunks(
    conn: psycopg.Connection,
    document_id: int,
    chunks: Iterable[TextChunk],
    embeddings: Iterable[list[float]],
    base_chunks: Sequence[TextChunk] | None = None,
) -> None:
    """Replace all stored chunks of a document with the given ones.

    Every existing chunk row of ``document_id`` is deleted and one row is
    inserted per ``(chunk, embedding)`` pair. Each new chunk is classified
    against ``base_chunks`` (matched by exact line range) to fill in
    ``change_type`` and ``previous_content``.

    Args:
        conn: Open database connection; committed on success, rolled back
            on failure.
        document_id: Id of the document whose chunks are replaced.
        chunks: New chunks of the document.
        embeddings: One embedding per chunk, in the same order as ``chunks``.
        base_chunks: Chunks of the base version of the document, if any.
            Without them every chunk is classified as added.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    # Materialize and validate BEFORE touching the database: zip() would
    # silently drop the surplus items after the old rows are already deleted.
    chunk_list = list(chunks)
    embedding_list = list(embeddings)
    if len(chunk_list) != len(embedding_list):
        raise ValueError(
            "chunks and embeddings must have the same length "
            f"(got {len(chunk_list)} and {len(embedding_list)})"
        )

    # Index base chunks by line range; chunks without a range cannot match.
    base_by_range: dict[tuple[int, int], TextChunk] = {}
    for base in base_chunks or ():
        if base.start_line is not None and base.end_line is not None:
            base_by_range[(base.start_line, base.end_line)] = base

    try:
        with conn.cursor() as cur:
            cur.execute(
                "DELETE FROM chunks WHERE document_id = %s;",
                (document_id,),
            )
            for chunk, embedding in zip(chunk_list, embedding_list):
                change_type, previous_content = _change_type_and_previous(
                    chunk, base_by_range
                )
                cur.execute(
                    """
                    INSERT INTO chunks (
                        document_id, chunk_index, hash, content, embedding,
                        start_line, end_line, change_type, previous_content
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
                    """,
                    (
                        document_id,
                        chunk.index,
                        chunk.hash,
                        chunk.text,
                        embedding,
                        chunk.start_line,
                        chunk.end_line,
                        change_type,
                        previous_content,
                    ),
                )
        conn.commit()
    except Exception:
        # Do not leave the DELETE plus a partial set of INSERTs pending in
        # the open transaction, where a later commit could persist them.
        conn.rollback()
        raise
|
||||
|
||||
|
||||
def fetch_changed_chunks(
    conn: psycopg.Connection, story_id: int
) -> list[ChangedChunkRecord]:
    """Load the added and modified chunks of a story for test-case generation.

    Rows are ordered by document path, then start line (NULLs first), then
    chunk index.

    Args:
        conn: Open database connection.
        story_id: Id of the story whose documents are inspected.

    Returns:
        One record per chunk whose change_type is 'added' or 'modified'.
    """
    query = """
        SELECT d.path, c.content, c.change_type, c.start_line, c.end_line,
               c.previous_content
        FROM chunks c
        JOIN documents d ON d.id = c.document_id
        WHERE d.story_id = %s
          AND c.change_type IN ('added', 'modified')
        ORDER BY d.path, c.start_line NULLS FIRST, c.chunk_index;
        """
    with conn.cursor() as cur:
        cur.execute(query, (story_id,))
        fetched = cur.fetchall()

    # Column order of the SELECT mirrors the unpacking below.
    return [
        ChangedChunkRecord(
            path=record[0],
            content=record[1],
            change_type=record[2],
            start_line=record[3],
            end_line=record[4],
            previous_content=record[5],
        )
        for record in fetched
    ]
|
||||
|
||||
|
||||
def delete_document(
|
||||
conn: psycopg.Connection, story_id: int, path: str
|
||||
) -> None:
|
||||
|
||||
Reference in New Issue
Block a user