From 84ded7d7a977f286d45a4fa578ecc8c01cbf35d3 Mon Sep 17 00:00:00 2001 From: zosimovaa Date: Fri, 30 Jan 2026 22:21:12 +0300 Subject: [PATCH] =?UTF-8?q?=D0=A1=D0=BA=D0=B5=D0=BB=D0=B5=D1=82=20=D0=BF?= =?UTF-8?q?=D1=80=D0=BE=D0=B5=D0=BA=D1=82=D0=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 43 ++++ pyproject.toml | 17 ++ scripts/create_db.py | 29 +++ scripts/post-commit | 35 ++++ scripts/schema.sql | 33 +++ src/rag_agent/__init__.py | 3 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 183 bytes src/rag_agent/__pycache__/cli.cpython-312.pyc | Bin 0 -> 5976 bytes .../__pycache__/config.cpython-312.pyc | Bin 0 -> 2932 bytes src/rag_agent/agent/__init__.py | 1 + src/rag_agent/agent/pipeline.py | 51 +++++ src/rag_agent/cli.py | 117 +++++++++++ src/rag_agent/config.py | 57 +++++ src/rag_agent/index/__init__.py | 1 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 174 bytes .../__pycache__/embeddings.cpython-312.pyc | Bin 0 -> 1918 bytes .../__pycache__/postgres.cpython-312.pyc | Bin 0 -> 8410 bytes src/rag_agent/index/embeddings.py | 29 +++ src/rag_agent/index/postgres.py | 194 ++++++++++++++++++ src/rag_agent/ingest/__init__.py | 1 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 175 bytes .../__pycache__/chunker.cpython-312.pyc | Bin 0 -> 2144 bytes .../__pycache__/file_loader.cpython-312.pyc | Bin 0 -> 1327 bytes .../__pycache__/git_watcher.cpython-312.pyc | Bin 0 -> 1866 bytes src/rag_agent/ingest/chunker.py | 42 ++++ src/rag_agent/ingest/file_loader.py | 23 +++ src/rag_agent/ingest/git_watcher.py | 42 ++++ src/rag_agent/retrieval/__init__.py | 1 + src/rag_agent/retrieval/search.py | 24 +++ tests/test_chunker.py | 9 + 30 files changed, 752 insertions(+) create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 scripts/create_db.py create mode 100644 scripts/post-commit create mode 100644 scripts/schema.sql create mode 100644 src/rag_agent/__init__.py create mode 100644 src/rag_agent/__pycache__/__init__.cpython-312.pyc create mode 100644 src/rag_agent/__pycache__/cli.cpython-312.pyc create mode 100644 src/rag_agent/__pycache__/config.cpython-312.pyc create mode 100644 src/rag_agent/agent/__init__.py create mode 100644 src/rag_agent/agent/pipeline.py create mode 100644 src/rag_agent/cli.py create mode 100644 src/rag_agent/config.py create mode 100644 src/rag_agent/index/__init__.py create mode 100644 src/rag_agent/index/__pycache__/__init__.cpython-312.pyc create mode 100644 src/rag_agent/index/__pycache__/embeddings.cpython-312.pyc create mode 100644 src/rag_agent/index/__pycache__/postgres.cpython-312.pyc create mode 100644 src/rag_agent/index/embeddings.py create mode 100644 src/rag_agent/index/postgres.py create mode 100644 src/rag_agent/ingest/__init__.py create mode 100644 src/rag_agent/ingest/__pycache__/__init__.cpython-312.pyc create mode 100644 src/rag_agent/ingest/__pycache__/chunker.cpython-312.pyc create mode 100644 src/rag_agent/ingest/__pycache__/file_loader.cpython-312.pyc create mode 100644 src/rag_agent/ingest/__pycache__/git_watcher.cpython-312.pyc create mode 100644 src/rag_agent/ingest/chunker.py create mode 100644 src/rag_agent/ingest/file_loader.py create mode 100644 src/rag_agent/ingest/git_watcher.py create mode 100644 src/rag_agent/retrieval/__init__.py create mode 100644 src/rag_agent/retrieval/search.py create mode 100644 tests/test_chunker.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..e26ee72 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# RAG Agent (Postgres) + +Custom RAG agent that indexes text files from a git repository into Postgres +and answers queries using retrieval + LLM generation. Commits are tied to +**stories**; indexing and retrieval can be scoped by story. + +## Quick start + +1. Configure environment variables: + - `RAG_REPO_PATH` — path to git repo with text files + - `RAG_DB_DSN` — Postgres DSN (e.g. `postgresql://user:pass@localhost:5432/rag`) + - `RAG_EMBEDDINGS_DIM` — embedding vector dimension (e.g. `1536`) +2. Create DB schema: + - `python scripts/create_db.py` (or `psql "$RAG_DB_DSN" -f scripts/schema.sql`) +3. Index files for a story (e.g. branch name as story slug): + - `rag-agent index --story my-branch --changed --base-ref HEAD~1 --head-ref HEAD` +4. Ask a question (optionally scoped to a story): + - `rag-agent ask "What is covered?"` + - `rag-agent ask "What is covered?" --story my-branch` + +## Git hook (index on commit) + +Install the post-commit hook so changed files are indexed after each commit: + +```bash +cp scripts/post-commit .git/hooks/post-commit && chmod +x .git/hooks/post-commit +``` + +Story for the commit is taken from (in order): env `RAG_STORY`, file `.rag-story` in repo root (one line = slug), or current branch name. + +## DB structure + +- **stories** — story slug (e.g. branch name); documents and chunks are tied to a story. +- **documents** — path + version per story; unique `(story_id, path)`. +- **chunks** — text chunks with embeddings (pgvector); updated when documents are re-indexed. + +Scripts: `scripts/create_db.py` (Python, uses `ensure_schema` and `RAG_*` env), `scripts/schema.sql` (raw SQL). + +## Notes + +- The default embedding/LLM clients are stubs. Replace them in + `src/rag_agent/index/embeddings.py` and `src/rag_agent/agent/pipeline.py`. +- This project requires Postgres with the `pgvector` extension. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f6968af --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[project] +name = "rag-agent" +version = "0.1.0" +description = "Custom RAG agent with Postgres-backed vector index" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "psycopg[binary]>=3.1.18", + "pgvector>=0.2.5", + "pydantic>=2.7.0", +] + +[project.scripts] +rag-agent = "rag_agent.cli:main" + +[tool.ruff] +line-length = 100 diff --git a/scripts/create_db.py b/scripts/create_db.py new file mode 100644 index 0000000..a6b5246 --- /dev/null +++ b/scripts/create_db.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +""" +Create RAG vector DB schema in Postgres (extension + stories, documents, chunks). +Requires RAG_DB_DSN and RAG_EMBEDDINGS_DIM (optional, default 1536). +Run from repo root with package installed: pip install -e . && python scripts/create_db.py +""" +from __future__ import annotations + +import sys +from pathlib import Path + +# Allow importing rag_agent when run as scripts/create_db.py +repo_root = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(repo_root / "src")) + +from rag_agent.config import load_config +from rag_agent.index.postgres import connect, ensure_schema + + +def main() -> None: + config = load_config() + conn = connect(config.db_dsn) + ensure_schema(conn, config.embeddings_dim) + conn.close() + print("Schema created successfully.") + + +if __name__ == "__main__": + main() diff --git a/scripts/post-commit b/scripts/post-commit new file mode 100644 index 0000000..3c939cf --- /dev/null +++ b/scripts/post-commit @@ -0,0 +1,35 @@ +#!/usr/bin/env sh +# Git post-commit hook: index changed files into RAG vector DB for the current story. +# Install: cp scripts/post-commit .git/hooks/post-commit && chmod +x .git/hooks/post-commit +# Requires: RAG_REPO_PATH, RAG_DB_DSN, RAG_EMBEDDINGS_DIM; story from RAG_STORY or .rag-story or current branch name. + +set -e +cd "$(git rev-parse --show-toplevel)" + +# Resolve story: env RAG_STORY > file .rag-story > current branch name +if [ -n "${RAG_STORY}" ]; then + STORY="${RAG_STORY}" +elif [ -f .rag-story ]; then + STORY=$(head -n1 .rag-story | tr -d '\n\r') +else + STORY=$(git branch --show-current) +fi + +if [ -z "$STORY" ]; then + echo "post-commit: RAG_STORY or .rag-story or branch name required for indexing." + exit 0 +fi + +# Run index (changed files only: previous commit -> HEAD) +if command -v rag-agent >/dev/null 2>&1; then + rag-agent index --changed --base-ref HEAD~1 --head-ref HEAD --story "$STORY" +elif [ -n "${VIRTUAL_ENV}" ]; then + rag-agent index --changed --base-ref HEAD~1 --head-ref HEAD --story "$STORY" +else + # Try repo venv or python -m + if [ -f "venv/bin/rag-agent" ]; then + venv/bin/rag-agent index --changed --base-ref HEAD~1 --head-ref HEAD --story "$STORY" + else + PYTHONPATH=src python -m rag_agent.cli index --changed --base-ref HEAD~1 --head-ref HEAD --story "$STORY" 2>/dev/null || true + fi +fi diff --git a/scripts/schema.sql b/scripts/schema.sql new file mode 100644 index 0000000..f31463a --- /dev/null +++ b/scripts/schema.sql @@ -0,0 +1,33 @@ +-- RAG vector DB schema for Postgres (pgvector). +-- Run once against an empty DB. If RAG_EMBEDDINGS_DIM is not 1536, change vector(1536) below. +-- Usage: psql "$RAG_DB_DSN" -f scripts/schema.sql + +CREATE EXTENSION IF NOT EXISTS vector; + +CREATE TABLE IF NOT EXISTS stories ( + id SERIAL PRIMARY KEY, + slug TEXT UNIQUE NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc') +); + +CREATE TABLE IF NOT EXISTS documents ( + id SERIAL PRIMARY KEY, + story_id INTEGER NOT NULL REFERENCES stories(id) ON DELETE CASCADE, + path TEXT NOT NULL, + version TEXT NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + UNIQUE(story_id, path) +); + +CREATE TABLE IF NOT EXISTS chunks ( + id SERIAL PRIMARY KEY, + document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + hash TEXT NOT NULL, + content TEXT NOT NULL, + embedding vector(1536) NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_documents_story_id ON documents(story_id); +CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id); +CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON chunks USING ivfflat (embedding vector_cosine_ops); diff --git a/src/rag_agent/__init__.py b/src/rag_agent/__init__.py new file mode 100644 index 0000000..258f66c --- /dev/null +++ b/src/rag_agent/__init__.py @@ -0,0 +1,3 @@ +__all__ = [ + "config", +] diff --git a/src/rag_agent/__pycache__/__init__.cpython-312.pyc b/src/rag_agent/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b2e2792c310bcbb48d4eaa34086784760c4fc9a GIT binary patch literal 183 zcmX@j%ge<81hGGAGG&1DV-N=h7@>^MASKfoQW(-1qZld~HJNU)CFkd*Wv2USGTvg3 zk5A0WiH~2&@EN51mxX?4acWVqeqv5)g}zH_S$siJepYI7NpXCck$!QaesNK$ z-ceQ!Ht`#(G1%>BjVaUw|Rf9K-6eF%L{D@L-E31;nIK-@wi5*ZmOOoU-5 zZILYzi(XnIRw%79tJoqooyW>{#SwAnv`yxex=5W)+hwQXinw&zA-fe%#G`m4Ud0#j z>1~``ulOT=ovxDuN<*YU2}XiSW28~p6xpOSMVgf6NVC!sY0+hza;wr7X;a!G?NIlN zuAj4!4ly9Qq3ndx17(-kAbOz;i9yi^rw+x!LOk2W@U(kA( zAub`AOlg7^PbJlHsIzATZHl(bDM93;spMol232QtDwDj#YtmH>YQDH8VZFjn#$`zj zTZs<{+9+L(t6Dr6Bi>2_mXy?GNhJQ5r18-yAsLgzN-u^4X``e}NTLXR_^2G0l3Lh7 z958WGifY6sO0uL$yqJn+6yOn$lvFcV;??MsqzI&e4v@lp6iWh#sagt86E|(tNj@$T zFP74>5S3tZ*n&!YnY1cl%^WgpA)a$uX5!ST)5rAriCajjS70wMW+XfvWhy5}3mbe^ zgqXFz191!GP+~ooNI?n(fq&Fsm{{|>pj!Cc^`3O}w_F9}bZF!4UXK)g_S{t=; znX@E3kGIg8Xt`Zkt5GK4A#6;>+RmY{b^IIVaMqU5CVLNtIl`)-C4>`HKAlqISBXp2 zctwjV5}#Cw11f4fD-p+(pias031U~LguVL@5WAF&rbLONAiPP|4-He(2hvSC14%O8&-M9XC5(?^>$gJm;U= zx3XnmdCS03_rAs9Me$wv9eMHjCGSY#N2{*D^`UD+v)g~u_p847ref=!qHFJp>*%uU zXwfya#15^&$mlWMpm(C{#uVhYkY;Lzai)1jGqs`iy&E?JH|(I})0DG-ELW!2=eG^B zg*2DZ4zeiWHfq(Pf zV#6^xH9;J4RX0DFT3upQHH_)&B>_Q}Q&#}(05k!$D8gcD(hV@x4W_3ItN@W%7ShC} zuUCgwm9P|UaR9pL77hD>3VR7F;FwD6Wo#xag-XJia93~cxB{mHrdWr%#9<&ZXJ9p^ zPYKg|hEdBPY12#MGzU|O%fMg&Rh1qtSnG8O(|1PdqKe4JlcIDL4@12ZK2-smQb4Po z=2xF9jDlgkap2bA&B3B;b75qa_21Y!YhP*ZU2g8Z%l57My65)I2Y=uCc5BhMuW(|u z>52K#m0iQjyM~KRPcE^+`wrJDxtDUQ&EffND?1M_?>zk8$e&LC@$?7#J{tURu-JTI z#V$YG4p5f)5;X7N3J)@sCT$nxo!Pbv< zeYoqdeZL%8YPc}NL3Y2s=^MapZOX~?qI30 zXQgq^a^s#44t%=z?46#){qG+B!{Lwjo-H<>n{lpg>M6PFZ|wN>u{rjQvA6Pz=Zo79 zuURck&KVa>Ti0|$o4q){ZE@!(+~AjfnDpz1E)fpd&XRp$Ms#?hVZ`@JmW zf9`4+?RVBjt6#wF{6Eo3M0qQqK@GnJ@77GeMMtt4GKN)zF|38=Dw=^ORddWY(3_^; z^Ae-c7o;b!lC>}u&Vr^qTSYm;_5(%%LX;D%j>)k(+y8=fIqQPub_LY|y{dR;H*6@l z=G5!T`p;1>HUw^YXfl;aiibnsqS~}nn!kh(!UDqHs?kTRKc}8&kk095}Y%q=Tb$j8<)F9H(A4-cKbC(2Ux0eJ)IGHgU?bVptP< z**R9}Rw~cG{PZ34=?m&5JO&(U%+-0w{>z77@--Arl;Ck^U&^jFbj`7I;_ni_O~B)w z;YzNCC0E;Opna)p_hRtf)^}PL-Af&V#lX;#d+1vhV%6XIdRO7(s;8yIxl5e4#5I?= z0MOSx*E}U};CkX(qU3LRnAy?h!|*pI?zCTgm!)YoqZ>|Ost5*5$O zlmtR#;C2n|p>YAlqYX*4itK{T)L`oSSM*`MplBBz3!Eus0dJnd&W0q`iLB^ca2dGH zfS*`X&MkUG?}G1p(D5K=}GJqyB0!KZv#fIwIkCTer!hwBBjma**XYG zdi9Wm#?TP5#ApnwC_+*^53iRM;lU9OOD|^P7=mG&4w2cGF^ZHSJ%$d!8%%}Hq*N`2 zA)d2`nU|Q=z1wNTC!C>0+U{7(Nm*kXj81 zSPE%zNesc*+EiREONA|X9JY?nK$flR?X5Tn#7W1HcnxQyY;$>SsiZs|s_@GTh3(jd zsBS*={?6WB>MHeugOs(OI6ge`(>+=Dlh6{1#kCN$LI;N^O$r%VE6)f$s7pnA5SI4H zx=%pel!v`fW|C221#6P^4y%_!LdY~($LX3qKdD%-5OtBVy{F=eK9`2$(q%z@e7B+U znWfTn&4L`JXA0J|_tF>ik!G9Er_!OPV2||g0PTq-eHe)bs@il~!sm>$3fDmZjALm5 zs}k`J*WgkQ2-ixmj7Y6Rro(qIT2O&lZX{=UpJg# zme?zutS}gH>zZPW#}F-{1bOQ z2MVJlXZ?z^YuVW~cj3L@lC!Jm94Z{U->~V{(9NN_?Mu6#x-nF2I8``RV(VAfj%Bu^ z#Q7m|E_osk9LU!B2(dQ%11GZi%iP&Zi@TPN^Gj^UUG@dD&#b(7;tqt$(0F!z7HLN~Z{Ib=L0I}Zcn|=Vb?o?_+ zGWpc5*RcLMGv?HBo*55Y^=l$d{ko}ksQ76Z6dvabQ0Qo^Qm8Smm{o*$633t=37^^x z8AaEYSD%L)b@^9Z?aQwAqN@}Ba9JM^NG81RZe;aq|3R2BwgB z9@8Vk7idOZANt=%qd$tukyO3;N2jkG*61D%X|6WI??Eu3efleva5;`@{}!8CQl9=! z0uhR>7&-6`uVewdC?o6h)bq8Z&X&cH2~P1uIFLpp6{TG98IBL_4kNrrR}Vq|jad zWOgaZ1XUPu3dBeX%O^Pqg0{KTt#i;NfOF`N&%Myupr!{5S|GWkC$$RD)V=i0QlzD{ zP;=-2`{tYX-puUG_h#PwF&OkCDCDzd=?7kfKH`mD@pOT(y$WChVT1(}m4$*Ja4ec) zL4;Z|rLtU*U7c)t%HD#vJWv=AkcfzkJ@0sL3f*x;G=uQKD+nt$#jfUNw?YL42e9uI zx!}V=?1$QqLs*4cy(ntoPq>#RFx9A3s5ZY6aa!uAgVY#@RP0!9V#3@ndGSbQ3^E|Od%gU-`O-aZ=W z`zd|obCDlh6R9UigTq?83u6DVKiOGDXS=L?H2=l;yh$D}*;|WJjv|Wb-sSo57P4osR?!*tY5gLFGC>6Fh z0c;?9C)yCYu-l7!LTLiEz398J6W?7+SN0-uO>Bym*c9@}hipEBTbg1omh-66Sy2FzX~kX_aa+lR0N>0FyYNQ>GzZ%*SCC^Er6OOJ=z1C2|~ea1WK75SPlo0{FjDsqP|C z=n;}fP4PmXG?;Ry;6BZ1bQS(^2^GZ`4kMthDeV_DmhgIPe~)Y8>nRyVTa)r21sAO# zP0G@Anb+1@%2+$~wO^h4<+;rvCpGma_4xJEw_?9P_};_RQefHA6pKny4w*YHzLBiwAmtze}rN-+$5ivD(+}ds{(P z41Ph+5Dxb}`TgJ)JKRG!vV`#9+x@$*x1x8VH+OEaJko}8EVwW|qc3Dm&FiP8PoHq3 zvXR)MOO~u8g$!Gw1T#N3rH|?9e6A6HfcN->xc=D_XL8T!`D`ImT8AaG!3WCu=Q9hl z)2BX}gnwmGuFuRJ%cRrU+)Q3iXXi=?-X$#dIm6t1Iy2h{TXy~YHy)(J?*Gru&gosd z;@TlOGWYMMXJ_ZXmr3iH@1M@(^4a-ZUXvQqco{e3aeLJUNiADbD5sV3(54|S_%2c& z%yf{8C>KK@z;o|mrAcmb^Blijna8fzOyZtGO?)t_#lyUJA**K9z$iJyjw{1c-=g$dAnB?e_eL(D>rL-DY5Ic;-;|waA^d4k2v@3LZ5`b z4IjVA+%{|>*SPzu1^B~du?~a5!-c{vgIq3{+J6)hi}KXRy!OQ!Je?>{E)+s(7eOvF z?xxcN+~7Vic$&PgJl%a1eTVXg96zMiGaw+F3Brfyk&lq?L-g>UQR;7AB&zF=ejtqe jLsWz>Zy^xd5)uz=@#c1XKsYIE$7Eqf_%sv~x?A&aTs(xJ literal 0 HcmV?d00001 diff --git a/src/rag_agent/agent/__init__.py b/src/rag_agent/agent/__init__.py new file mode 100644 index 0000000..a9a2c5b --- /dev/null +++ b/src/rag_agent/agent/__init__.py @@ -0,0 +1 @@ +__all__ = [] diff --git a/src/rag_agent/agent/pipeline.py b/src/rag_agent/agent/pipeline.py new file mode 100644 index 0000000..8a890c7 --- /dev/null +++ b/src/rag_agent/agent/pipeline.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Protocol + +import psycopg + +from rag_agent.config import AppConfig +from rag_agent.index.embeddings import EmbeddingClient +from rag_agent.retrieval.search import search_similar + + +class LLMClient(Protocol): + def generate(self, prompt: str, model: str) -> str: + raise NotImplementedError + + +@dataclass +class StubLLMClient: + def generate(self, prompt: str, model: str) -> str: + return ( + "LLM client is not configured. " + "Replace StubLLMClient with a real implementation." + ) + + +def build_prompt(question: str, contexts: list[str]) -> str: + joined = "\n\n".join(contexts) + return ( + "You are a RAG assistant. Use the context below to answer the question.\n\n" + f"Context:\n{joined}\n\n" + f"Question: {question}\nAnswer:" + ) + + +def answer_query( + conn: psycopg.Connection, + config: AppConfig, + embedding_client: EmbeddingClient, + llm_client: LLMClient, + question: str, + top_k: int = 5, + story_id: int | None = None, +) -> str: + query_embedding = embedding_client.embed_texts([question])[0] + results = search_similar( + conn, query_embedding, top_k=top_k, story_id=story_id + ) + contexts = [f"Source: {item.path}\n{item.content}" for item in results] + prompt = build_prompt(question, contexts) + return llm_client.generate(prompt, model=config.llm_model) diff --git a/src/rag_agent/cli.py b/src/rag_agent/cli.py new file mode 100644 index 0000000..772cde5 --- /dev/null +++ b/src/rag_agent/cli.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import argparse +import hashlib +from pathlib import Path + +from rag_agent.config import load_config +from rag_agent.ingest.chunker import chunk_text +from rag_agent.ingest.file_loader import iter_text_files +from rag_agent.ingest.git_watcher import filter_existing, filter_removed, get_changed_files +from rag_agent.index.embeddings import get_embedding_client +from rag_agent.index.postgres import ( + connect, + delete_document, + ensure_schema, + get_or_create_story, + get_story_id, + replace_chunks, + upsert_document, +) +from rag_agent.agent.pipeline import StubLLMClient, answer_query + + +def _file_version(path: Path) -> str: + stat = path.stat() + payload = f"{path.as_posix()}:{stat.st_mtime_ns}:{stat.st_size}" + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def cmd_index(args: argparse.Namespace) -> None: + config = load_config() + conn = connect(config.db_dsn) + ensure_schema(conn, config.embeddings_dim) + story_id = get_or_create_story(conn, args.story) + embedding_client = get_embedding_client(config.embeddings_dim) + + if args.changed: + changed_files = get_changed_files(config.repo_path, args.base_ref, args.head_ref) + removed = filter_removed(changed_files) + existing = filter_existing(changed_files) + else: + removed = [] + existing = [path for path in Path(config.repo_path).rglob("*") if path.is_file()] + + for path in removed: + delete_document(conn, story_id, str(path)) + + for path, text in iter_text_files(existing, config.allowed_extensions): + chunks = chunk_text(text, config.chunk_size, config.chunk_overlap) + if not chunks: + continue + embeddings = embedding_client.embed_texts([chunk.text for chunk in chunks]) + document_id = upsert_document( + conn, story_id, str(path), _file_version(path) + ) + replace_chunks(conn, document_id, chunks, embeddings) + + +def cmd_ask(args: argparse.Namespace) -> None: + config = load_config() + conn = connect(config.db_dsn) + ensure_schema(conn, config.embeddings_dim) + story_id = None + if args.story: + story_id = get_story_id(conn, args.story) + if story_id is None: + raise SystemExit(f"Story not found: {args.story}") + embedding_client = get_embedding_client(config.embeddings_dim) + llm_client = StubLLMClient() + answer = answer_query( + conn, + config, + embedding_client, + llm_client, + args.question, + top_k=args.top_k, + story_id=story_id, + ) + print(answer) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="rag-agent") + sub = parser.add_subparsers(dest="command", required=True) + + index_parser = sub.add_parser("index", help="Index files into Postgres") + index_parser.add_argument( + "--story", + required=True, + help="Story slug (e.g. branch name or story id); documents are tied to this story", + ) + index_parser.add_argument("--changed", action="store_true", help="Index only changed files") + index_parser.add_argument("--base-ref", default="HEAD~1", help="Base git ref") + index_parser.add_argument("--head-ref", default="HEAD", help="Head git ref") + index_parser.set_defaults(func=cmd_index) + + ask_parser = sub.add_parser("ask", help="Ask a question") + ask_parser.add_argument("question", help="Question text") + ask_parser.add_argument( + "--story", + default=None, + help="Limit retrieval to this story slug (optional)", + ) + ask_parser.add_argument("--top-k", type=int, default=5, help="Top K chunks to retrieve") + ask_parser.set_defaults(func=cmd_ask) + + return parser + + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/src/rag_agent/config.py b/src/rag_agent/config.py new file mode 100644 index 0000000..3f0ff0e --- /dev/null +++ b/src/rag_agent/config.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Iterable, Sequence + + +@dataclass(frozen=True) +class AppConfig: + repo_path: str + db_dsn: str + chunk_size: int + chunk_overlap: int + embeddings_dim: int + embeddings_model: str + llm_model: str + allowed_extensions: Sequence[str] + + +def _env_int(name: str, default: int) -> int: + value = os.getenv(name, "").strip() + if not value: + return default + try: + return int(value) + except ValueError as exc: + raise ValueError(f"Invalid integer for {name}: {value}") from exc + + +def _env_list(name: str, default: Iterable[str]) -> list[str]: + value = os.getenv(name, "").strip() + if not value: + return list(default) + return [item.strip() for item in value.split(",") if item.strip()] + + +def load_config() -> AppConfig: + repo_path = os.getenv("RAG_REPO_PATH", "").strip() + if not repo_path: + raise ValueError("RAG_REPO_PATH is required") + + db_dsn = os.getenv("RAG_DB_DSN", "").strip() + if not db_dsn: + raise ValueError("RAG_DB_DSN is required") + + return AppConfig( + repo_path=repo_path, + db_dsn=db_dsn, + chunk_size=_env_int("RAG_CHUNK_SIZE", 400), + chunk_overlap=_env_int("RAG_CHUNK_OVERLAP", 50), + embeddings_dim=_env_int("RAG_EMBEDDINGS_DIM", 1536), + embeddings_model=os.getenv("RAG_EMBEDDINGS_MODEL", "stub-embeddings"), + llm_model=os.getenv("RAG_LLM_MODEL", "stub-llm"), + allowed_extensions=tuple( + _env_list("RAG_ALLOWED_EXTENSIONS", [".md", ".txt", ".rst"]) + ), + ) diff --git a/src/rag_agent/index/__init__.py b/src/rag_agent/index/__init__.py new file mode 100644 index 0000000..a9a2c5b --- /dev/null +++ b/src/rag_agent/index/__init__.py @@ -0,0 +1 @@ +__all__ = [] diff --git a/src/rag_agent/index/__pycache__/__init__.cpython-312.pyc b/src/rag_agent/index/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b12d8f8bc8e1db573911b7836334c9fe47c1cc82 GIT binary patch literal 174 zcmX@j%ge<81dD&vWby*(#~=<2FhUuhIe?7m4CxF}43!L;jDDJox7g$36LWIn<5x0# z1}XYwryp9JT2!o`n3Gze?~+;;Ur>~vm6}{q9A9RnU!164T$HR|l$ai$n4X$fqMwu+I7wBMs;DBAq@BZBCtC+dvQ<$uqMvvorH{ zKK`Y)wgv&@|0$+FP!W184pq?04BpKFvw#R9*g+Ybz*z8#qa+lds-tGKgeGO0qi2kS zkqIOM7%Av&MD$ri3>sJi-vSars!4eOw31AcC$PV=Mb?yJL%Y*n~ER2>QQEoJJb)p$6U`%x=ymH-69jpy z1VbDZ;%I*T<;*xGB%MvY<)mrWOI9vP2N*CUk9X^&15XMC`V5;`q|*?0y{nm=Lo;xd zlFN*_3=-2gbtZh>qqjZIg~RstX9KYtoH8D>9eO)9M5nDBbEjz1pIqh9lOAz_^QStKnu0i zXS>NgHs%|QdU=*DoOIIMyD8Cx!ZBwlr>B| z&MIQ{Kv=CnBD6$;;Q>IiXivdaY_tkXP^Hq}0JXb*j?#c=}P|F3BLM5e$htSvxLM7D`^_4Jnxat*IQJty` zK}8@5!E2RURXQa;@Ca0!x}+7gA#_(yDk+py#=x?q7j>c(!U)|{?u0Z{GD^B93MQdw zRJ|6_lpM~grc6JCJlQKn<9O|HehP4ds0FxQfUhCw_?s<|M^Tv))jI-3!Ziz(^xkvs52Je(Qu1CutNbkgI#!6)qtul4x`%_dz! zeS@S^ltUFAnq|#k>>GA2N3+BXFnBN|+t&|F!_>Hp`pc3RkBa5il_dCFzGwU?pX zjiNmZr;llw!y|JNj5ss}jLcS7_grABvH2;|L$SG`?P%M=t$VkYhE}dEU%MaJJacg~ z`ud;IfsN?EW^{1w%9a`XBYJM>(!9Oc^>unZdT#CtpzZoo^L<}^vUvU5(Qih7t?$}u zX|zi-{>>VJSY&Rp7P8GP{01M>6aPm}A-@6H=rk%qhRFXM}SOD{vf z;pO4=)}G%Zy*pt@Wha78G|koh`!s}JIL}3E{umiNZ^YRNU`?H!0>T;u66?;dx2!|E z)3W54vZ&A>Olp%j3M4Ae#di1tJqkrWAKue~dQ;g@sFVAX0tYd!a;92W&Fkz^(?u!#QwFCase|F4BCXH$O$%B_@$*+e+zf7 zYuz#*QVX>el^03KZK{1gc6Hff_{vQVAOcN5ro5?{qCSH(<{ZQUl z@+m~@BX9I9*k9gzZ?YcHiuCy#fOb@j@k7-47}Y&Q9e<&&Jww5r^OJiB$W9GX!hb7a V+_ZxL?W%Cx5y!nYjL*vN_zxSPrb7S# literal 0 HcmV?d00001 diff --git a/src/rag_agent/index/__pycache__/postgres.cpython-312.pyc b/src/rag_agent/index/__pycache__/postgres.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a34cf6eab557c090ca4c2f131d7f910f1adb428b GIT binary patch literal 8410 zcmd5>TWlNGnLfjr;hof7wk$gt-(;~-rP_|&IFaL0B9$_hs8pn!n5jW9oRLhK7v;>z z7lPinMT(VRFJz-Y?P9U0i?*nPEU*bspna>8?%Td7wOa^-1~pK$K;D?hg#hn9?Ejy+ zh!c0yb~yT8|$EKx7_irnCiMBhW31ha|?00FH~%v*US4w)9J%;r=mNMBU4 zT8Uy8N#(LOdVZmJAqhF>3@w$FV%B5@i+Cf$tsRHNZK9AACzF(88a4@7>KKMnmf~4T zfHIM#oRD&1>WVw$(S2-;DJ5H?vd*g-)dO-VyHHSyT6#XCo!49K>_vsDut4g8wz`YIiPW7v@{2-QPUTX45^tRm1c)%W;UIfg?SF; ziZYPRmsD+*D(c|;73zmhSP+1Ox(k9IkVhTe4}|X3?w$ui&#JHUfzY|yvg@I+i|(|D zvnB>1PYALX{@mIaByJPUuA4J)iA2^{Ar^kGa+)bMgj6`Il!J7FyvM~uyzZJ;uVhQ} zv$`u=Di&c;)D7JLPH-$3ID{n89*F23E9ioRin(j#Prml$Q%k4r9DU$>_FCjm zLdVU>LtItMwbN`l+1fcep$S`w%%r$N=IpgVjrmzpc8pwgR2*|AxU5=?<=WM3PT{Co z@8%TzwiH5z`)4AH|FS)$bKAaL+fukUC_qEO&(b=uV`R?W$qhbOeB#IzvY#9#DtF1j zk+Xz@-;G_D`LpB_7xKi*eWO#+a55@I-$+K|iP&UZij7I}$s`oS63K*Q2>MIqFT7IC zXEsWPN5-QK4OGwzIYpKF>l$F&oGc}xQ?c;4^x9NxB0P0gIvqVb&_t%@7iOg-%txAz z$G$TiWwVJ-kB>K#W~l-?OHOArDH)rHCX(Ta*ODo_ffR|3g{Q}pQh$8%_5P3)PBJnn zH5rdehZeN#p-|oEp_l5inF8Riu4Id4TiWNVEF3s@B@JAP#goxjqEj2|n2L@?r=s!E zXhJgC+nUHv9u34(ncY0Lv3>nG zV_epNgp=8tZR#KEdGj=yErqG2MyG3lbh^MXG?0X zsH98tYQt>a<8^0tfvP2{yOhgHc0p61pj0U2v=C391ud=z?cUV1oGa)&(xbHlMat~0 z9#D$v0#(v#_PkQa&_hty3xD;u5L_dxp1^YNQt$OWRr0*(y|aJS?OXOO`BplIAG%MP znX1DXXnoAPJ>F_N@%XX6>hNHZ4~ll$MXt7e)gak_cxCV5DsdbMR9iM7;<23w?fiDT zEsECD989YU7GCuHREQntm;U);|$VO3dz_ z1iL>vJ{^fh1~FeZfvIS6dMeH!FV|a2if&v;r28PE`ytRp@X&Mfx_d^^vghF>K@T8> zU|?Iou4y0Axh)l<3XlbCo8a!4RkU=8rVR^kC>r{8AW6bseG7s*h!Br)jxvCZvz?tR2ncc#TqkAg+W)PG) z&4yUTc57>)TkLpBOK_iS>3|B?+wfU4yec$;`yGLb6+z|^`U!AIH-U1y31uN(K9~SU z1&{~c&DhlBgbm==zYUHJmY$RjsxOrfZAoS#6bKE#IcW$6XQBF`4Mmy80k@{l;N+IApoSg{mb4Z?{)vRaE)FS z#XSHzpMUwp(uo@{t@>M*PcNNb*?H>U_utR_uJ3-&AN;2uS%u;I;rq`&^nZIb*tVQs z%3m)o^6Nh6;B!Ce?pZv#`1aEA6=B~U8G`iylvfD^h93NBxNU^zKjy^|zblJUVtsfY z!XWrJNPG!BC{%>aKqzw+uHvZh8|T3}Jc%`Y8Xo@GH^=T_teg?QZ))Hfp+;v4b9kz4 z_$)w~I#Xvh^BrLvs!Q z7_EOjO}!Sa_iQ)xX`R|c-D)Ci+8{CN%x^e-+B*pQr$)TJ6445_1>C{@jef&p#V)z?R5tn{LV{hj+ zY#}bSdUHc;P>3w}N60zg!*{_hUKOu8pCWh;5Ik`+VS`dz+<=MZ2c?^cUd1^hTBE|z zeprCZ#U`O5TZ!V9R|MdOe9b&$GR1AWzJ}he!G)>7e^iG~L`ki5nvHAy^Z~<}HjYaJ z5}t;j$T;r|NVXGPFNN6e=F*B%+u&>?ai~TPbiNFpiO9}px_|@ff(DnEhKu4dWZ?wx z99(j|J5+VAJq^Qv&{n9W?GUKw2H*hj8D4piD)aeFR!JLe=?D;fA1VI{0+0rOoiqe( zX;}5Yuo~F87=JAC9`9o}*?Zvb*u7UDNZ0X$j>5x*?eDs?b^g+eVJ|{*D+S%g(H0mSi30Kx#l0It`Z* zVgX%8h((NN_*;$}?E=tUbvAndHx3&1KMQx5NO6gTobk+_Mo$36(DWeXK{RBbN(OCZ zk{z_=SQ117}nBlT#760310B^KYDV6YY3xB9=t=xv8eS-c-qHWSk9*GifHD2cy8QT3X&-P~e8r=CdwprTO#)Lo1*{&;>j&&?As+R`>yS#7(QY zLN1@7bR39&hJ-(Xph4g76raHZL&u8mSzD94K$EwU_Puxe?@j(@dZl&r?;YMi5H4Q= zt$3V(D;gjOBEirH$5vWTJZ|$qbth@|V!W;%3{@ znVX3_ZMV+cN!)Gw@tNQI5B%PDY(+S>eiSJQ1poEL5#m3tw!s{4oeDqCf3&-6ga{u! zFNROLKq)>0%n~y<*qsChhysw2!w=s^y2Ql~t1{2s!h~w*FEdd_D zSeJ}|k#14{#0v*FsUop%yoO0;<1kIl#$oCj=SW}Jr0d!?Xv>YceG5|M4(mnpP5bHc z91UYlmt9lyT6oTw@FWm~+}q3q*G`d=;!Rwrb`mixMSjy)FgyLW0==HY>pqJu(r zBX#TitnrvLXznF@&(n9>?~{W-uc+D}^IR0l-cOAeO7^sO2z&ZE26#GVZa(vFDB44Y zd%6Qt?3of(`V;rC@tpcHJp);kMiq@%mFGC_Q?l=`WbdEJp-)NgUr6v%GVoV2__>of zd^Zm~cW(pwDnaznDP>*cg=^X0k_!% zW9%Pe>>uJsqGwHnxcUs)Emb|wa>rTc5I^dYsv^YDxxc0A!jzk|^e`IieBWaqlrdiY EH@U^}hX4Qo literal 0 HcmV?d00001 diff --git a/src/rag_agent/index/embeddings.py b/src/rag_agent/index/embeddings.py new file mode 100644 index 0000000..19cd659 --- /dev/null +++ b/src/rag_agent/index/embeddings.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import hashlib +from dataclasses import dataclass +from typing import Iterable, Protocol + + +class EmbeddingClient(Protocol): + def embed_texts(self, texts: Iterable[str]) -> list[list[float]]: + raise NotImplementedError + + +@dataclass +class StubEmbeddingClient: + dim: int + + def embed_texts(self, texts: Iterable[str]) -> list[list[float]]: + vectors: list[list[float]] = [] + for text in texts: + digest = hashlib.sha256(text.encode("utf-8")).digest() + values = [b / 255.0 for b in digest] + if len(values) < self.dim: + values = (values * ((self.dim // len(values)) + 1))[: self.dim] + vectors.append(values[: self.dim]) + return vectors + + +def get_embedding_client(dim: int) -> EmbeddingClient: + return StubEmbeddingClient(dim=dim) diff --git a/src/rag_agent/index/postgres.py b/src/rag_agent/index/postgres.py new file mode 100644 index 0000000..1aa2f3c --- /dev/null +++ b/src/rag_agent/index/postgres.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Iterable + +import psycopg +from pgvector.psycopg import register_vector + +from rag_agent.ingest.chunker import TextChunk + + +@dataclass(frozen=True) +class ChunkRecord: + document_path: str + document_version: str + chunk: TextChunk + embedding: list[float] + + +def connect(dsn: str) -> psycopg.Connection: + conn = psycopg.connect(dsn) + register_vector(conn) + return conn + + +def ensure_schema(conn: psycopg.Connection, embeddings_dim: int) -> None: + with conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") + cur.execute( + """ + CREATE TABLE IF NOT EXISTS stories ( + id SERIAL PRIMARY KEY, + slug TEXT UNIQUE NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT (NOW() AT TIME ZONE 'utc') + ); + """ + ) + cur.execute( + """ + CREATE TABLE IF NOT EXISTS documents ( + id SERIAL PRIMARY KEY, + story_id INTEGER NOT NULL REFERENCES stories(id) ON DELETE CASCADE, + path TEXT NOT NULL, + version TEXT NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + UNIQUE(story_id, path) + ); + """ + ) + cur.execute( + f""" + CREATE TABLE IF NOT EXISTS chunks ( + id SERIAL PRIMARY KEY, + document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + hash TEXT NOT NULL, + content TEXT NOT NULL, + embedding vector({embeddings_dim}) NOT NULL + ); + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_documents_story_id + ON documents(story_id); + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_chunks_document_id + ON chunks(document_id); + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_chunks_embedding + ON chunks USING ivfflat (embedding vector_cosine_ops); + """ + ) + conn.commit() + + +def get_or_create_story(conn: psycopg.Connection, slug: str) -> int: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO stories (slug) + VALUES (%s) + ON CONFLICT (slug) DO UPDATE SET slug = EXCLUDED.slug + RETURNING id; + """, + (slug.strip(),), + ) + story_id = cur.fetchone()[0] + conn.commit() + return story_id + + +def get_story_id(conn: psycopg.Connection, slug: str) -> int | None: + s = slug.strip() + with conn.cursor() as cur: + if s.isdigit(): + cur.execute("SELECT id FROM stories WHERE id = %s;", (int(s),)) + else: + cur.execute("SELECT id FROM stories WHERE slug = %s;", (s,)) + row = cur.fetchone() + return row[0] if row else None + + +def upsert_document( + conn: psycopg.Connection, story_id: int, path: str, version: str +) -> int: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO documents (story_id, path, version, updated_at) + VALUES (%s, %s, %s, %s) + ON CONFLICT (story_id, path) DO UPDATE + SET version = EXCLUDED.version, + updated_at = EXCLUDED.updated_at + RETURNING id; + """, + (story_id, path, version, datetime.now(timezone.utc)), + ) + document_id = cur.fetchone()[0] + return document_id + + +def replace_chunks( + conn: psycopg.Connection, + document_id: int, + chunks: Iterable[TextChunk], + embeddings: Iterable[list[float]], +) -> None: + with conn.cursor() as cur: + cur.execute( + "DELETE FROM chunks WHERE document_id = %s;", + (document_id,), + ) + for chunk, embedding in zip(chunks, embeddings): + cur.execute( + """ + INSERT INTO chunks (document_id, chunk_index, hash, content, embedding) + VALUES (%s, %s, %s, %s, %s); + """, + (document_id, chunk.index, chunk.hash, chunk.text, embedding), + ) + conn.commit() + + +def delete_document( + conn: psycopg.Connection, story_id: int, path: str +) -> None: + with conn.cursor() as cur: + cur.execute( + "DELETE FROM documents WHERE story_id = %s AND path = %s;", + (story_id, path), + ) + conn.commit() + + +def fetch_similar( + conn: psycopg.Connection, + query_embedding: list[float], + top_k: int, + story_id: int | None = None, +) -> list[tuple[str, str, float]]: + with conn.cursor() as cur: + if story_id is not None: + cur.execute( + """ + SELECT d.path, c.content, c.embedding <=> %s AS distance + FROM chunks c + JOIN documents d ON d.id = c.document_id + WHERE d.story_id = %s + ORDER BY c.embedding <=> %s + LIMIT %s; + """, + (query_embedding, story_id, query_embedding, top_k), + ) + else: + cur.execute( + """ + SELECT d.path, c.content, c.embedding <=> %s AS distance + FROM chunks c + JOIN documents d ON d.id = c.document_id + ORDER BY c.embedding <=> %s + LIMIT %s; + """, + (query_embedding, query_embedding, top_k), + ) + rows = cur.fetchall() + return [(row[0], row[1], row[2]) for row in rows] diff --git a/src/rag_agent/ingest/__init__.py b/src/rag_agent/ingest/__init__.py new file mode 100644 index 0000000..a9a2c5b --- /dev/null +++ b/src/rag_agent/ingest/__init__.py @@ -0,0 +1 @@ +__all__ = [] diff --git a/src/rag_agent/ingest/__pycache__/__init__.cpython-312.pyc b/src/rag_agent/ingest/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d48a4994a81bfa8f0491b9288dc2707fa8b261c9 GIT binary patch literal 175 zcmX@j%ge<81PgxDWby*(#~=<2FhUuhIe?7m4CxF}43!L;jDDJox7g$36LWIn<5x0# z1}XYwuOC{RT2!o`n3Gze?~+;;Ur>~vm6}{q9A9RnU!164T$HR|l$ai$n4X$fqMw2KczG$)vkyQXdcMsVvxf=Ff%eT-r(YCWG`X?asjs8 BEy(}? literal 0 HcmV?d00001 diff --git a/src/rag_agent/ingest/__pycache__/chunker.cpython-312.pyc b/src/rag_agent/ingest/__pycache__/chunker.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15f125adaa14a3633145fbefc64ffb6e319389ca GIT binary patch literal 2144 zcmaJ?-ER{|5Z}EopMCy_1BEmQl$Ipzl@_-lKoBSog;J#zc_F-*rl-TVah#mbX78LP z)>*O&MTv?;s-h-55vhHk3YGfaKcRgHHB#LZsF8RnZ>B~n<*BoGzJyZRwLCjJGdnl? z+nL!P6Nwmtaq)>;=o1inz!u>U0|vJjfmuNYGO&q?IEyjkIg`tBF!Lr~6taRZ6HKuv zWu;;y8^MS}L@>nL(mD?OaOizxL@pykUgrYIdN84^Y{ZS|Wg#0i5=IQ>SQP#9XCw^; z)aQt5n?$20O9r&Fe}K}(#>@4kQs%; zMKhh82_u?f2id^9=av0iy&xp%ap}#-| zbh4!;{tpuP>J_$M6~`$ve~SB5p0aTL`=v?X9x~)r*%R%uS#UhwB$me)3zjF$lz>Wl zxlAl0$ai+}pw#EpsPDjw`4yqrg(~qPr8z=Pz3fR&X_i>FC;3VnJUcpd{Lgwk`0~1f zT`ZThIo+%fo1xklj!wdD8&TFqcEI2M0Z1J+6X|Q&rR?V))ldJbq?@VqVsvv~U;T75 zo~XYUkXY*a{^hAJ5Bwb4doP(@9$0zMCUAM%GT>I{f{P zjsC-pJx4a=o~E3MW906xsk?H%DJPcGOR+_s0?da9 z`^9hyO+rJd@!PEdQW3jcm_%RU6?ibN?hw!d`i}nus~DQK4-%BO$Y8bU0QjSWjQEud z50MXWfUQ!1R;~rtLo=XahBhO=qbGILl2Cfr9|()Xi^9ER&+5+A>aC-VpVeLtjI2CG9wov75MGy;n6oY_#?rm6BQ)a<-&QCzVCs}8mgk1-j zJP|%M=b`IQ{`dZueLHxF&Ohg9QjN1WVX6WF!J9x?_0ioxJZ0-iBQ|BnfOK6bLw~2N zXHa$@+qv#z>H|bs`UMT0wZo_KiQv5%rNgjfC1igLq{U;5@1s{ApyYk@%5P}kk%Ty9 zx&JQS{U^ucWD5al^N8zdv1L1s@rkzFg}YmW0v>KEJuRNigK97jANA)=ao=MR-D@O85bvI^o$Oq8F~-xJLD#5|0d?>qXJTSN$%|;h!Qsn>Wti;r<|fR}EZx2N zVonb6;R1nxit$O}gUAtu`#*eHjd;tTD2SkM(fDweC#yQM@i@d>pWZNcfG-0)e$iKg^99p0o+A6uu9~j)Fxf3KwDgzm%iGZzfHIUa`Ew|s2j0Y<*DZv&%r(CmC5o?L)K%77x>5p~&b0+0&ZZb<)%tBZS=0GVTBDKh$OdU*pIrI6<7qj0@EcTqm zp0RizBw>KhXOGWVO$x$*P^k(%34$v0gQta6NJH%_{jg}zHR2je+h$j`UYyRYXf;VW z!Fd>5Oj>_{1%Cw9Q0K&;L znIT*bZ;5`>Z5zvPPY4A|H9~x)v-^X>(wrQ zU!8(ozcTXi(!nc7qm#YLv3?&bufY$V<2x=&OPxZH_ose*iV_s+g)aw+X+9TAZF!d#;^^-I&dD z=sz?b%60|Vu}&%diA?-Pp7@zOe_~O(L{Dr=Uq7~O>h_(nV;xs-POZO~PycPgUd8ggkXRy25#knt|JRs*g++n$5>#=k@6C7*^x_1Uh#E` zqn1MX5JnMXX%V39pB1sx%W6JsMXk_fC9eSw1CK0Anb@D=m<;AIqg<}IhHF>KY#Lk%4u%{NaKx*@s zdAo#}gpsG-MBhq*2cG&Fq`a~Q#3{Tcs6~ zFvewCEAy_ih69u5_jFT~Rw$2uSS!1BiR?)s=30bOt}>U}RW3w02X?ADw(Hns!g$E2 zR!P~S;>I!&9-*XK(W~&rxHfArqEj-*wRvJ#K2i)?fHjn`n&WbrESX%L0}mNj#CBX= zFq|4;5!fspjNLQ2j~JmWXE5IcDS-HFMM`=thB*-JD%65a#CK z2J}V4g?i8Ag*rC8!&0<{)x074iWidzX4fAO$|jl8(O@fr5YN5o`#l$e>0)Q zL#va`uCUtuD0O7LZ>_HpPu@^AW`0(7WT`89CDcr!j>PIzBiXy&vDWc0Ik=r1e2_f; z7n0SHD@r4IU_H7P{W^X{<*{TVp1OAKi*w%`xMAPy-x%9IH2iz~t!5Z3;rw`N_1v?a z9u)6=7#rM<4T>GQA9WsXboSqp??i7$AEt-5)58zaBlpv%eloZEC${46v}7p}eUj*E zs#2u;pQa2;cG*D)>8nGhPoiH=cAuG$??s2sjLG-LRKVu*`UM*mKEK|l0jwhrO`>bq zlnTg{F7`n6VGkEyvFPlRR@CtYpx8%?XcJFwVjdz(Hgnmg#8vSq zAkRJ%ogod2kJC|->`mXfl_riHshFL+OAZ$AGXGgX!E4hLTccfmrl77-WRe3$ct} zpV>I}b|@i zb>MzK5l#JKMxo-`Ryu>$zNe;HvFHR4(FAmTuI4ukor=a!MOWrwkrc@idA#Sdvp|Z;+e21AWa!#{1^>9MXAT=mB;7^yzJ45 mrhFKu8t_bg-gk75XNP_ML`}3*m~TyBJhr1@e2TseJN*Y8W3w{= literal 0 HcmV?d00001 diff --git a/src/rag_agent/ingest/chunker.py b/src/rag_agent/ingest/chunker.py new file mode 100644 index 0000000..35e63ce --- /dev/null +++ b/src/rag_agent/ingest/chunker.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import hashlib +from dataclasses import dataclass +from typing import Iterable, Iterator + + +@dataclass(frozen=True) +class TextChunk: + index: int + text: str + hash: str + + +def _hash_text(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def chunk_text(text: str, chunk_size: int, overlap: int) -> list[TextChunk]: + tokens = text.split() + if not tokens: + return [] + + chunks: list[TextChunk] = [] + start = 0 + index = 0 + while start < len(tokens): + end = min(start + chunk_size, len(tokens)) + chunk_text_value = " ".join(tokens[start:end]) + chunks.append(TextChunk(index=index, text=chunk_text_value, hash=_hash_text(chunk_text_value))) + index += 1 + if end == len(tokens): + break + start = max(end - overlap, 0) + return chunks + + +def iter_chunks( + texts: Iterable[str], chunk_size: int, overlap: int +) -> Iterator[list[TextChunk]]: + for text in texts: + yield chunk_text(text, chunk_size, overlap) diff --git a/src/rag_agent/ingest/file_loader.py b/src/rag_agent/ingest/file_loader.py new file mode 100644 index 0000000..2fa3d83 --- /dev/null +++ b/src/rag_agent/ingest/file_loader.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Iterable, Iterator + + +def is_allowed(path: Path, allowed_extensions: Iterable[str]) -> bool: + return path.suffix.lower() in {ext.lower() for ext in allowed_extensions} + + +def read_text_file(path: Path) -> str: + return path.read_text(encoding="utf-8", errors="ignore") + + +def iter_text_files( + paths: Iterable[Path], allowed_extensions: Iterable[str] +) -> Iterator[tuple[Path, str]]: + for path in paths: + if not path.is_file(): + continue + if not is_allowed(path, allowed_extensions): + continue + yield path, read_text_file(path) diff --git a/src/rag_agent/ingest/git_watcher.py b/src/rag_agent/ingest/git_watcher.py new file mode 100644 index 0000000..2e82a96 --- /dev/null +++ b/src/rag_agent/ingest/git_watcher.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path +from typing import Iterable + + +def get_changed_files( + repo_path: str, base_ref: str, head_ref: str = "HEAD" +) -> list[Path]: + args = [ + "git", + "-C", + repo_path, + "diff", + "--name-only", + base_ref, + head_ref, + ] + try: + result = subprocess.run( + args, check=True, capture_output=True, text=True + ) + except subprocess.CalledProcessError as exc: + raise RuntimeError( + f"git diff failed: {exc.stderr.strip() or exc}" + ) from exc + + files = [] + for line in result.stdout.splitlines(): + value = line.strip() + if value: + files.append(Path(repo_path) / value) + return files + + +def filter_existing(paths: Iterable[Path]) -> list[Path]: + return [path for path in paths if path.exists()] + + +def filter_removed(paths: Iterable[Path]) -> list[Path]: + return [path for path in paths if not path.exists()] diff --git a/src/rag_agent/retrieval/__init__.py b/src/rag_agent/retrieval/__init__.py new file mode 100644 index 0000000..a9a2c5b --- /dev/null +++ b/src/rag_agent/retrieval/__init__.py @@ -0,0 +1 @@ +__all__ = [] diff --git a/src/rag_agent/retrieval/search.py b/src/rag_agent/retrieval/search.py new file mode 100644 index 0000000..cda41bd --- /dev/null +++ b/src/rag_agent/retrieval/search.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from dataclasses import dataclass + +import psycopg + +from rag_agent.index.postgres import fetch_similar + + +@dataclass(frozen=True) +class SearchResult: + path: str + content: str + distance: float + + +def search_similar( + conn: psycopg.Connection, + query_embedding: list[float], + top_k: int = 5, + story_id: int | None = None, +) -> list[SearchResult]: + rows = fetch_similar(conn, query_embedding, top_k, story_id=story_id) + return [SearchResult(path=row[0], content=row[1], distance=row[2]) for row in rows] diff --git a/tests/test_chunker.py b/tests/test_chunker.py new file mode 100644 index 0000000..b13a734 --- /dev/null +++ b/tests/test_chunker.py @@ -0,0 +1,9 @@ +from rag_agent.ingest.chunker import chunk_text + + +def test_chunk_text_basic(): + text = "one two three four five six seven eight" + chunks = chunk_text(text, chunk_size=3, overlap=1) + assert len(chunks) == 3 + assert chunks[0].text == "one two three" + assert chunks[1].text.startswith("three four")