661 lines
27 KiB
Python
661 lines
27 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
import hashlib
|
|
from sqlalchemy import text
|
|
|
|
from app.modules.shared.db import get_engine
|
|
|
|
|
|
@dataclass
class RagJobRow:
    """A single row of the ``rag_index_jobs`` table.

    Snapshot of an indexing job's status and counters as read by
    :meth:`RagRepository.get_job`.
    """

    # Primary key of the job.
    index_job_id: str
    # Session this job indexed into.
    rag_session_id: str
    # Job state string, e.g. as written by create_job/update_job (VARCHAR(16) in DB).
    status: str
    # Files successfully indexed so far.
    indexed_files: int
    # Files that failed to index.
    failed_files: int
    # Files served from the blob/chunk cache.
    cache_hit_files: int
    # Files that missed the cache and were re-processed.
    cache_miss_files: int
    # Error details; all three are None while the job has not failed.
    error_code: str | None
    error_desc: str | None
    error_module: str | None
|
|
|
|
|
|
class RagRepository:
|
|
    def ensure_tables(self) -> None:
        """Create (idempotently) every table, column and index the RAG subsystem uses.

        Safe to call on every startup: all DDL is guarded by ``IF NOT EXISTS`` /
        ``ADD COLUMN IF NOT EXISTS``, so both fresh databases and databases
        created by an older schema version are brought up to date in place.
        """
        engine = get_engine()
        with engine.connect() as conn:
            # pgvector must be installed before any column of type "vector".
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            # One row per RAG session, keyed by a caller-supplied session id.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_sessions (
                        rag_session_id VARCHAR(64) PRIMARY KEY,
                        project_id VARCHAR(512) NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Indexing-job bookkeeping: status, per-file counters, error details.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_index_jobs (
                        index_job_id VARCHAR(64) PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        status VARCHAR(16) NOT NULL,
                        indexed_files INTEGER NOT NULL DEFAULT 0,
                        failed_files INTEGER NOT NULL DEFAULT 0,
                        cache_hit_files INTEGER NOT NULL DEFAULT 0,
                        cache_miss_files INTEGER NOT NULL DEFAULT 0,
                        error_code VARCHAR(128) NULL,
                        error_desc TEXT NULL,
                        error_module VARCHAR(64) NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Embedded content chunks, one row per (session, path, chunk_index).
            # NOTE: "embedding vector" declares no dimension, so the schema
            # accepts mixed embedding sizes; the <=> queries in retrieve()
            # assume stored vectors match the query vector's dimension.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunks (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        path TEXT NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Metadata columns added after the initial schema; ADD COLUMN IF NOT
            # EXISTS upgrades pre-existing deployments in place.
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL"))
            conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL"))
            # created_at/updated_at already appear in the CREATE TABLE above;
            # these ALTERs are no-ops there and only matter for tables created
            # by a schema version that lacked the columns.
            conn.execute(
                text(
                    """
                    ALTER TABLE rag_chunks
                    ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    """
                )
            )
            conn.execute(
                text(
                    """
                    ALTER TABLE rag_chunks
                    ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    """
                )
            )
            # File-level cache keyed by (repo_id, blob_sha, path): metadata only.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_blob_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        path TEXT NOT NULL,
                        artifact_type VARCHAR(16) NULL,
                        section TEXT NULL,
                        doc_id TEXT NULL,
                        doc_version TEXT NULL,
                        owner TEXT NULL,
                        system_component TEXT NULL,
                        last_modified TIMESTAMPTZ NULL,
                        staleness_score DOUBLE PRECISION NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path)
                    )
                    """
                )
            )
            # Chunk-level cache: content + embedding per (repo_id, blob_sha, index).
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_chunk_cache (
                        id BIGSERIAL PRIMARY KEY,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        embedding vector NULL,
                        section TEXT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
                        CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index)
                    )
                    """
                )
            )
            # Maps a session to the cached chunks it references.
            conn.execute(
                text(
                    """
                    CREATE TABLE IF NOT EXISTS rag_session_chunk_map (
                        id BIGSERIAL PRIMARY KEY,
                        rag_session_id VARCHAR(64) NOT NULL,
                        repo_id VARCHAR(512) NOT NULL,
                        blob_sha VARCHAR(128) NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        path TEXT NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
            )
            # Lookup indexes matching the query paths used elsewhere in this class.
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)"))
            conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)"))
            # Counter columns added after rag_index_jobs first shipped.
            conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0"))
            conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0"))
            conn.commit()
|
|
|
|
def upsert_session(self, rag_session_id: str, project_id: str) -> None:
|
|
with get_engine().connect() as conn:
|
|
conn.execute(
|
|
text(
|
|
"""
|
|
INSERT INTO rag_sessions (rag_session_id, project_id)
|
|
VALUES (:sid, :pid)
|
|
ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id
|
|
"""
|
|
),
|
|
{"sid": rag_session_id, "pid": project_id},
|
|
)
|
|
conn.commit()
|
|
|
|
def session_exists(self, rag_session_id: str) -> bool:
|
|
with get_engine().connect() as conn:
|
|
row = conn.execute(
|
|
text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"),
|
|
{"sid": rag_session_id},
|
|
).fetchone()
|
|
return bool(row)
|
|
|
|
def get_session(self, rag_session_id: str) -> dict | None:
|
|
with get_engine().connect() as conn:
|
|
row = conn.execute(
|
|
text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"),
|
|
{"sid": rag_session_id},
|
|
).mappings().fetchone()
|
|
return dict(row) if row else None
|
|
|
|
def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None:
|
|
with get_engine().connect() as conn:
|
|
conn.execute(
|
|
text(
|
|
"""
|
|
INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status)
|
|
VALUES (:jid, :sid, :status)
|
|
"""
|
|
),
|
|
{"jid": index_job_id, "sid": rag_session_id, "status": status},
|
|
)
|
|
conn.commit()
|
|
|
|
def update_job(
|
|
self,
|
|
index_job_id: str,
|
|
*,
|
|
status: str,
|
|
indexed_files: int,
|
|
failed_files: int,
|
|
cache_hit_files: int = 0,
|
|
cache_miss_files: int = 0,
|
|
error_code: str | None = None,
|
|
error_desc: str | None = None,
|
|
error_module: str | None = None,
|
|
) -> None:
|
|
with get_engine().connect() as conn:
|
|
conn.execute(
|
|
text(
|
|
"""
|
|
UPDATE rag_index_jobs
|
|
SET status = :status,
|
|
indexed_files = :indexed,
|
|
failed_files = :failed,
|
|
cache_hit_files = :cache_hit_files,
|
|
cache_miss_files = :cache_miss_files,
|
|
error_code = :ecode,
|
|
error_desc = :edesc,
|
|
error_module = :emodule,
|
|
updated_at = CURRENT_TIMESTAMP
|
|
WHERE index_job_id = :jid
|
|
"""
|
|
),
|
|
{
|
|
"jid": index_job_id,
|
|
"status": status,
|
|
"indexed": indexed_files,
|
|
"failed": failed_files,
|
|
"cache_hit_files": cache_hit_files,
|
|
"cache_miss_files": cache_miss_files,
|
|
"ecode": error_code,
|
|
"edesc": error_desc,
|
|
"emodule": error_module,
|
|
},
|
|
)
|
|
conn.commit()
|
|
|
|
def get_job(self, index_job_id: str) -> RagJobRow | None:
|
|
with get_engine().connect() as conn:
|
|
row = conn.execute(
|
|
text(
|
|
"""
|
|
SELECT index_job_id, rag_session_id, status, indexed_files, failed_files,
|
|
cache_hit_files, cache_miss_files, error_code, error_desc, error_module
|
|
FROM rag_index_jobs
|
|
WHERE index_job_id = :jid
|
|
"""
|
|
),
|
|
{"jid": index_job_id},
|
|
).mappings().fetchone()
|
|
if not row:
|
|
return None
|
|
return RagJobRow(**dict(row))
|
|
|
|
def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None:
|
|
with get_engine().connect() as conn:
|
|
conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id})
|
|
conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id})
|
|
self._insert_chunks(conn, rag_session_id, items)
|
|
conn.commit()
|
|
|
|
def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None:
|
|
with get_engine().connect() as conn:
|
|
if delete_paths:
|
|
conn.execute(
|
|
text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
{"sid": rag_session_id, "paths": delete_paths},
|
|
)
|
|
conn.execute(
|
|
text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
{"sid": rag_session_id, "paths": delete_paths},
|
|
)
|
|
if upserts:
|
|
paths = sorted({str(x["path"]) for x in upserts})
|
|
conn.execute(
|
|
text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
{"sid": rag_session_id, "paths": paths},
|
|
)
|
|
conn.execute(
|
|
text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"),
|
|
{"sid": rag_session_id, "paths": paths},
|
|
)
|
|
self._insert_chunks(conn, rag_session_id, upserts)
|
|
conn.commit()
|
|
|
|
def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]:
|
|
with get_engine().connect() as conn:
|
|
rows = conn.execute(
|
|
text(
|
|
"""
|
|
SELECT chunk_index, content, embedding::text AS embedding_txt, section
|
|
FROM rag_chunk_cache
|
|
WHERE repo_id = :repo_id AND blob_sha = :blob_sha
|
|
ORDER BY chunk_index ASC
|
|
"""
|
|
),
|
|
{"repo_id": repo_id, "blob_sha": blob_sha},
|
|
).mappings().fetchall()
|
|
output: list[dict] = []
|
|
for row in rows:
|
|
output.append(
|
|
{
|
|
"chunk_index": int(row["chunk_index"]),
|
|
"content": str(row["content"] or ""),
|
|
"embedding": self._parse_vector(str(row["embedding_txt"] or "")),
|
|
"section": row.get("section"),
|
|
}
|
|
)
|
|
return output
|
|
|
|
    def record_repo_cache(
        self,
        *,
        project_id: str,
        commit_sha: str | None,
        changed_files: list[str],
        summary: str,
    ) -> None:
        """Record webhook-reported file changes into the RAG cache tables.

        For each changed path, upserts a marker row into rag_blob_cache and a
        single placeholder chunk (index 0, no embedding) into rag_chunk_cache,
        then appends a row to rag_session_chunk_map under a synthetic
        per-repo session id. All rows are committed in one transaction.
        """
        # All webhook-driven rows are grouped under one synthetic session per repo.
        repo_session_id = f"repo:{project_id}"
        with get_engine().connect() as conn:
            for path in changed_files:
                # NOTE: this is NOT a git blob sha — it is a synthetic key
                # (sha256 of "<commit>:<path>") so repeated webhooks for the
                # same commit/path upsert the same cache rows.
                key = f"{commit_sha or 'no-commit'}:{path}"
                blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest()
                # File-level marker; on conflict only bump updated_at.
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_blob_cache (
                            repo_id,
                            blob_sha,
                            path,
                            artifact_type,
                            section
                        )
                        VALUES (
                            :repo_id,
                            :blob_sha,
                            :path,
                            :artifact_type,
                            :section
                        )
                        ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                            updated_at = CURRENT_TIMESTAMP
                        """
                    ),
                    {
                        "repo_id": project_id,
                        "blob_sha": blob_sha,
                        "path": path,
                        "artifact_type": "CODE",
                        "section": "repo_webhook",
                    },
                )
                # Placeholder chunk 0: truncated summary text, NULL embedding.
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_chunk_cache (
                            repo_id,
                            blob_sha,
                            chunk_index,
                            content,
                            embedding,
                            section
                        )
                        VALUES (
                            :repo_id,
                            :blob_sha,
                            0,
                            :content,
                            NULL,
                            :section
                        )
                        ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
                            content = EXCLUDED.content,
                            section = EXCLUDED.section,
                            updated_at = CURRENT_TIMESTAMP
                        """
                    ),
                    {
                        "repo_id": project_id,
                        "blob_sha": blob_sha,
                        # Summary capped at 300 chars to bound row size.
                        "content": f"repo_webhook:{path}:{summary[:300]}",
                        "section": "repo_webhook",
                    },
                )
                # Map the synthetic repo session to this cached chunk. Plain
                # INSERT (no upsert): the table has no unique constraint, so
                # repeated webhooks append additional rows.
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_session_chunk_map (
                            rag_session_id,
                            repo_id,
                            blob_sha,
                            chunk_index,
                            path
                        )
                        VALUES (
                            :rag_session_id,
                            :repo_id,
                            :blob_sha,
                            0,
                            :path
                        )
                        """
                    ),
                    {
                        "rag_session_id": repo_session_id,
                        "repo_id": project_id,
                        "blob_sha": blob_sha,
                        "path": path,
                    },
                )
            conn.commit()
|
|
|
|
    def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None:
        """Upsert one file's chunks (content + embedding) into the cache tables.

        Writes a single rag_blob_cache row with file-level metadata taken from
        the first item, then one rag_chunk_cache row per item. No-op when
        *items* is empty. All rows are committed in one transaction.
        """
        if not items:
            return
        # File-level metadata is assumed to be replicated on every item; the
        # first chunk is used as the representative — TODO confirm callers
        # always populate metadata uniformly.
        meta = items[0]
        with get_engine().connect() as conn:
            # Upsert the file-level row; on conflict refresh all metadata.
            conn.execute(
                text(
                    """
                    INSERT INTO rag_blob_cache (
                        repo_id,
                        blob_sha,
                        path,
                        artifact_type,
                        section,
                        doc_id,
                        doc_version,
                        owner,
                        system_component,
                        last_modified,
                        staleness_score
                    )
                    VALUES (
                        :repo_id,
                        :blob_sha,
                        :path,
                        :artifact_type,
                        :section,
                        :doc_id,
                        :doc_version,
                        :owner,
                        :system_component,
                        :last_modified,
                        :staleness_score
                    )
                    ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET
                        artifact_type = EXCLUDED.artifact_type,
                        section = EXCLUDED.section,
                        doc_id = EXCLUDED.doc_id,
                        doc_version = EXCLUDED.doc_version,
                        owner = EXCLUDED.owner,
                        system_component = EXCLUDED.system_component,
                        last_modified = EXCLUDED.last_modified,
                        staleness_score = EXCLUDED.staleness_score,
                        updated_at = CURRENT_TIMESTAMP
                    """
                ),
                {
                    "repo_id": repo_id,
                    "blob_sha": blob_sha,
                    "path": path,
                    "artifact_type": meta.get("artifact_type"),
                    "section": meta.get("section"),
                    "doc_id": meta.get("doc_id"),
                    "doc_version": meta.get("doc_version"),
                    "owner": meta.get("owner"),
                    "system_component": meta.get("system_component"),
                    "last_modified": meta.get("last_modified"),
                    "staleness_score": meta.get("staleness_score"),
                },
            )
            for item in items:
                emb = item.get("embedding") or []
                # pgvector text literal "[x1,x2,...]"; NULL when no embedding.
                emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_chunk_cache (
                            repo_id,
                            blob_sha,
                            chunk_index,
                            content,
                            embedding,
                            section
                        )
                        VALUES (
                            :repo_id,
                            :blob_sha,
                            :chunk_index,
                            :content,
                            CAST(:embedding AS vector),
                            :section
                        )
                        ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET
                            content = EXCLUDED.content,
                            embedding = EXCLUDED.embedding,
                            section = EXCLUDED.section,
                            updated_at = CURRENT_TIMESTAMP
                        """
                    ),
                    {
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": int(item["chunk_index"]),
                        "content": item["content"],
                        "embedding": emb_str,
                        "section": item.get("section"),
                    },
                )
            conn.commit()
|
|
|
|
def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]:
|
|
emb = "[" + ",".join(str(x) for x in query_embedding) + "]"
|
|
with get_engine().connect() as conn:
|
|
rows = conn.execute(
|
|
text(
|
|
"""
|
|
SELECT path, content
|
|
FROM rag_chunks
|
|
WHERE rag_session_id = :sid
|
|
ORDER BY embedding <=> CAST(:emb AS vector)
|
|
LIMIT :lim
|
|
"""
|
|
),
|
|
{"sid": rag_session_id, "emb": emb, "lim": limit},
|
|
).mappings().fetchall()
|
|
return [dict(x) for x in rows]
|
|
|
|
def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]:
|
|
with get_engine().connect() as conn:
|
|
rows = conn.execute(
|
|
text(
|
|
"""
|
|
SELECT path, content
|
|
FROM rag_chunks
|
|
WHERE rag_session_id = :sid
|
|
ORDER BY id DESC
|
|
LIMIT :lim
|
|
"""
|
|
),
|
|
{"sid": rag_session_id, "lim": limit},
|
|
).mappings().fetchall()
|
|
return [dict(x) for x in rows]
|
|
|
|
    def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None:
        """Insert chunk rows for a session on an existing connection.

        For each item one rag_chunks row is inserted; when the item also
        carries non-empty ``repo_id`` and ``blob_sha`` keys, a matching
        rag_session_chunk_map row is inserted so the session can later be
        traced back to the chunk cache. The caller owns the transaction —
        no commit happens here.
        """
        for item in items:
            emb = item.get("embedding") or []
            # pgvector text literal "[x1,x2,...]"; NULL when no embedding.
            emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None
            conn.execute(
                text(
                    """
                    INSERT INTO rag_chunks (
                        rag_session_id,
                        path,
                        chunk_index,
                        content,
                        embedding,
                        artifact_type,
                        section,
                        doc_id,
                        doc_version,
                        owner,
                        system_component,
                        last_modified,
                        staleness_score,
                        created_at,
                        updated_at
                    )
                    VALUES (
                        :sid,
                        :path,
                        :idx,
                        :content,
                        CAST(:emb AS vector),
                        :artifact_type,
                        :section,
                        :doc_id,
                        :doc_version,
                        :owner,
                        :system_component,
                        :last_modified,
                        :staleness_score,
                        CURRENT_TIMESTAMP,
                        CURRENT_TIMESTAMP
                    )
                    """
                ),
                {
                    "sid": rag_session_id,
                    "path": item["path"],
                    "idx": int(item["chunk_index"]),
                    "content": item["content"],
                    "emb": emb_str,
                    # Optional metadata; absent keys become NULL columns.
                    "artifact_type": item.get("artifact_type"),
                    "section": item.get("section"),
                    "doc_id": item.get("doc_id"),
                    "doc_version": item.get("doc_version"),
                    "owner": item.get("owner"),
                    "system_component": item.get("system_component"),
                    "last_modified": item.get("last_modified"),
                    "staleness_score": item.get("staleness_score"),
                },
            )
            # Only chunks that originate from the repo cache carry these keys.
            repo_id = str(item.get("repo_id") or "").strip()
            blob_sha = str(item.get("blob_sha") or "").strip()
            if repo_id and blob_sha:
                conn.execute(
                    text(
                        """
                        INSERT INTO rag_session_chunk_map (
                            rag_session_id,
                            repo_id,
                            blob_sha,
                            chunk_index,
                            path
                        ) VALUES (
                            :sid,
                            :repo_id,
                            :blob_sha,
                            :chunk_index,
                            :path
                        )
                        """
                    ),
                    {
                        "sid": rag_session_id,
                        "repo_id": repo_id,
                        "blob_sha": blob_sha,
                        "chunk_index": int(item["chunk_index"]),
                        "path": item["path"],
                    },
                )
|
|
|
|
def _parse_vector(self, value: str) -> list[float]:
|
|
text_value = value.strip()
|
|
if not text_value:
|
|
return []
|
|
if text_value.startswith("[") and text_value.endswith("]"):
|
|
text_value = text_value[1:-1]
|
|
if not text_value:
|
|
return []
|
|
return [float(part.strip()) for part in text_value.split(",") if part.strip()]
|