From 1ef0b4d68c1deb7f263c7377d561ca9fe14c43da Mon Sep 17 00:00:00 2001 From: zosimovaa Date: Sun, 1 Mar 2026 14:21:33 +0300 Subject: [PATCH] =?UTF-8?q?=D0=9D=D0=BE=D0=B2=D1=8B=D0=B9=20=D1=80=D0=B0?= =?UTF-8?q?=D0=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../__pycache__/application.cpython-312.pyc | Bin 2821 -> 2884 bytes .../__pycache__/step_registry.cpython-312.pyc | Bin 8461 -> 8141 bytes .../task_spec_builder.cpython-312.pyc | Bin 8278 -> 8209 bytes .../template_registry.cpython-312.pyc | Bin 16379 -> 15884 bytes .../explain_actions.cpython-312.pyc | Bin 4535 -> 19075 bytes .../orchestrator/actions/explain_actions.py | 256 +++++-- .../engine/orchestrator/step_registry.py | 2 - .../engine/orchestrator/task_spec_builder.py | 11 +- .../engine/orchestrator/template_registry.py | 4 +- app/modules/agent/service.py | 29 +- app/modules/application.py | 2 +- app/modules/rag/README.md | 300 ++++++++ app/modules/rag/__init__.py | 25 + .../rag/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 477 bytes app/modules/rag/contracts/__init__.py | 17 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 688 bytes .../__pycache__/documents.cpython-312.pyc | Bin 0 -> 4241 bytes .../__pycache__/enums.cpython-312.pyc | Bin 0 -> 1671 bytes .../__pycache__/evidence.cpython-312.pyc | Bin 0 -> 948 bytes .../__pycache__/retrieval.cpython-312.pyc | Bin 0 -> 1258 bytes app/modules/rag/contracts/documents.py | 77 ++ app/modules/rag/contracts/enums.py | 35 + app/modules/rag/contracts/evidence.py | 16 + app/modules/rag/contracts/retrieval.py | 23 + .../__pycache__/file_filter.cpython-312.pyc | Bin 0 -> 1221 bytes .../code/__pycache__/pipeline.cpython-312.pyc | Bin 0 -> 4449 bytes .../__pycache__/chunker.cpython-312.pyc | Bin 0 -> 3071 bytes .../document_builder.cpython-312.pyc | Bin 0 -> 1467 bytes .../rag/indexing/code/code_text/chunker.py | 57 ++ .../code/code_text/document_builder.py | 22 + .../document_builder.cpython-312.pyc | Bin 0 
-> 2090 bytes .../__pycache__/extractor.cpython-312.pyc | Bin 0 -> 7366 bytes .../indexing/code/edges/document_builder.py | 29 + .../rag/indexing/code/edges/extractor.py | 114 +++ .../document_builder.cpython-312.pyc | Bin 0 -> 1952 bytes .../fastapi_detector.cpython-312.pyc | Bin 0 -> 2030 bytes .../flask_detector.cpython-312.pyc | Bin 0 -> 1553 bytes .../__pycache__/registry.cpython-312.pyc | Bin 0 -> 1677 bytes .../typer_click_detector.cpython-312.pyc | Bin 0 -> 1644 bytes .../code/entrypoints/document_builder.py | 26 + .../code/entrypoints/fastapi_detector.py | 34 + .../code/entrypoints/flask_detector.py | 28 + .../rag/indexing/code/entrypoints/registry.py | 27 + .../code/entrypoints/typer_click_detector.py | 29 + app/modules/rag/indexing/code/file_filter.py | 13 + app/modules/rag/indexing/code/pipeline.py | 52 ++ .../__pycache__/ast_parser.cpython-312.pyc | Bin 0 -> 770 bytes .../document_builder.cpython-312.pyc | Bin 0 -> 2386 bytes .../__pycache__/extractor.cpython-312.pyc | Bin 0 -> 9179 bytes .../rag/indexing/code/symbols/ast_parser.py | 11 + .../indexing/code/symbols/document_builder.py | 32 + .../rag/indexing/code/symbols/extractor.py | 130 ++++ .../rag/indexing/common/document_upserter.py | 15 + app/modules/rag/indexing/common/report.py | 21 + .../__pycache__/classifier.cpython-312.pyc | Bin 0 -> 1255 bytes .../document_builder.cpython-312.pyc | Bin 0 -> 5550 bytes .../__pycache__/file_filter.cpython-312.pyc | Bin 0 -> 2221 bytes .../docs/__pycache__/pipeline.cpython-312.pyc | Bin 0 -> 10772 bytes .../markdown_chunker.cpython-312.pyc | Bin 0 -> 3657 bytes .../__pycache__/text_chunker.cpython-312.pyc | Bin 0 -> 1494 bytes .../docs/chunkers/markdown_chunker.py | 77 ++ .../indexing/docs/chunkers/text_chunker.py | 21 + app/modules/rag/indexing/docs/classifier.py | 18 + .../rag/indexing/docs/document_builder.py | 115 +++ app/modules/rag/indexing/docs/file_filter.py | 21 + app/modules/rag/indexing/docs/pipeline.py | 135 ++++ 
.../local_project_reader.cpython-312.pyc | Bin 0 -> 4058 bytes .../query_repository.cpython-312.pyc | Bin 0 -> 6698 bytes .../rag/persistence/cache_repository.py | 189 +++++ .../rag/persistence/document_repository.py | 122 ++++ app/modules/rag/persistence/job_repository.py | 95 +++ .../rag/persistence/query_repository.py | 111 +++ app/modules/rag/persistence/repository.py | 82 +++ .../rag/persistence/schema_repository.py | 179 +++++ .../rag/persistence/session_repository.py | 37 + .../__pycache__/query_router.cpython-312.pyc | Bin 0 -> 2285 bytes .../__pycache__/query_terms.cpython-312.pyc | Bin 0 -> 2640 bytes app/modules/rag/retrieval/query_router.py | 43 ++ app/modules/rag/retrieval/query_terms.py | 45 ++ app/modules/rag/services/rag_service.py | 197 ++++++ app/modules/rag_repo/module.py | 2 +- app/modules/rag_session/repository.py | 662 +----------------- app/modules/rag_session/service.py | 212 +----- ...plain_actions.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 10157 bytes .../orchestrator/test_explain_actions.py | 131 ++++ ...xing_pipeline.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 8490 bytes ...xing_pipeline.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 8680 bytes ...roject_reader.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 2902 bytes ...ository_terms.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 1422 bytes ..._query_router.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 3359 bytes ...t_query_terms.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 2189 bytes tests/rag/test_code_indexing_pipeline.py | 57 ++ tests/rag/test_docs_indexing_pipeline.py | 63 ++ tests/rag/test_query_router.py | 12 + tests/rag/test_query_terms.py | 9 + 95 files changed, 3145 insertions(+), 927 deletions(-) create mode 100644 app/modules/rag/README.md create mode 100644 app/modules/rag/__init__.py create mode 100644 app/modules/rag/__pycache__/__init__.cpython-312.pyc create mode 100644 app/modules/rag/contracts/__init__.py create mode 100644 app/modules/rag/contracts/__pycache__/__init__.cpython-312.pyc create mode 100644 
app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc create mode 100644 app/modules/rag/contracts/__pycache__/enums.cpython-312.pyc create mode 100644 app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc create mode 100644 app/modules/rag/contracts/__pycache__/retrieval.cpython-312.pyc create mode 100644 app/modules/rag/contracts/documents.py create mode 100644 app/modules/rag/contracts/enums.py create mode 100644 app/modules/rag/contracts/evidence.py create mode 100644 app/modules/rag/contracts/retrieval.py create mode 100644 app/modules/rag/indexing/code/__pycache__/file_filter.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/__pycache__/pipeline.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/code_text/__pycache__/chunker.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/code_text/__pycache__/document_builder.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/code_text/chunker.py create mode 100644 app/modules/rag/indexing/code/code_text/document_builder.py create mode 100644 app/modules/rag/indexing/code/edges/__pycache__/document_builder.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/edges/__pycache__/extractor.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/edges/document_builder.py create mode 100644 app/modules/rag/indexing/code/edges/extractor.py create mode 100644 app/modules/rag/indexing/code/entrypoints/__pycache__/document_builder.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/entrypoints/__pycache__/fastapi_detector.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/entrypoints/__pycache__/flask_detector.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/entrypoints/__pycache__/registry.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/entrypoints/__pycache__/typer_click_detector.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/entrypoints/document_builder.py 
create mode 100644 app/modules/rag/indexing/code/entrypoints/fastapi_detector.py create mode 100644 app/modules/rag/indexing/code/entrypoints/flask_detector.py create mode 100644 app/modules/rag/indexing/code/entrypoints/registry.py create mode 100644 app/modules/rag/indexing/code/entrypoints/typer_click_detector.py create mode 100644 app/modules/rag/indexing/code/file_filter.py create mode 100644 app/modules/rag/indexing/code/pipeline.py create mode 100644 app/modules/rag/indexing/code/symbols/__pycache__/ast_parser.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/symbols/__pycache__/document_builder.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/symbols/__pycache__/extractor.cpython-312.pyc create mode 100644 app/modules/rag/indexing/code/symbols/ast_parser.py create mode 100644 app/modules/rag/indexing/code/symbols/document_builder.py create mode 100644 app/modules/rag/indexing/code/symbols/extractor.py create mode 100644 app/modules/rag/indexing/common/document_upserter.py create mode 100644 app/modules/rag/indexing/common/report.py create mode 100644 app/modules/rag/indexing/docs/__pycache__/classifier.cpython-312.pyc create mode 100644 app/modules/rag/indexing/docs/__pycache__/document_builder.cpython-312.pyc create mode 100644 app/modules/rag/indexing/docs/__pycache__/file_filter.cpython-312.pyc create mode 100644 app/modules/rag/indexing/docs/__pycache__/pipeline.cpython-312.pyc create mode 100644 app/modules/rag/indexing/docs/chunkers/__pycache__/markdown_chunker.cpython-312.pyc create mode 100644 app/modules/rag/indexing/docs/chunkers/__pycache__/text_chunker.cpython-312.pyc create mode 100644 app/modules/rag/indexing/docs/chunkers/markdown_chunker.py create mode 100644 app/modules/rag/indexing/docs/chunkers/text_chunker.py create mode 100644 app/modules/rag/indexing/docs/classifier.py create mode 100644 app/modules/rag/indexing/docs/document_builder.py create mode 100644 app/modules/rag/indexing/docs/file_filter.py 
create mode 100644 app/modules/rag/indexing/docs/pipeline.py create mode 100644 app/modules/rag/ingestion/session/__pycache__/local_project_reader.cpython-312.pyc create mode 100644 app/modules/rag/persistence/__pycache__/query_repository.cpython-312.pyc create mode 100644 app/modules/rag/persistence/cache_repository.py create mode 100644 app/modules/rag/persistence/document_repository.py create mode 100644 app/modules/rag/persistence/job_repository.py create mode 100644 app/modules/rag/persistence/query_repository.py create mode 100644 app/modules/rag/persistence/repository.py create mode 100644 app/modules/rag/persistence/schema_repository.py create mode 100644 app/modules/rag/persistence/session_repository.py create mode 100644 app/modules/rag/retrieval/__pycache__/query_router.cpython-312.pyc create mode 100644 app/modules/rag/retrieval/__pycache__/query_terms.cpython-312.pyc create mode 100644 app/modules/rag/retrieval/query_router.py create mode 100644 app/modules/rag/retrieval/query_terms.py create mode 100644 app/modules/rag/services/rag_service.py create mode 100644 tests/agent/orchestrator/__pycache__/test_explain_actions.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/agent/orchestrator/test_explain_actions.py create mode 100644 tests/rag/__pycache__/test_code_indexing_pipeline.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/rag/__pycache__/test_docs_indexing_pipeline.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/rag/__pycache__/test_local_project_reader.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/rag/__pycache__/test_query_repository_terms.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/rag/__pycache__/test_query_router.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/rag/__pycache__/test_query_terms.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/rag/test_code_indexing_pipeline.py create mode 100644 tests/rag/test_docs_indexing_pipeline.py create mode 100644 tests/rag/test_query_router.py create mode 
100644 tests/rag/test_query_terms.py diff --git a/app/modules/__pycache__/application.cpython-312.pyc b/app/modules/__pycache__/application.cpython-312.pyc index ad16c08597f8a562d5e514444c82301f53bceaac..8a365844f92566ae227cf55a4e0c0abfe2252aca 100644 GIT binary patch delta 128 zcmZn_J0iw=nwOW00SFBEmt=n0$g9g{6RICtoLW?@pO}+cq3@Df7GF@5pOutU8r1Ip>;=BJeAq!#NHC8p~Y Z0L{xRE=kQxPThQ;lbw;#Ve)e>IRKWXE4=^! delta 46 zcmX>i)+)w(nwOW00SFeJSeW^4Bd;zSqtxaywjO3irO5_d5}W0@*ccgYCaZDF0RS(k B3#R}8 diff --git a/app/modules/agent/engine/orchestrator/__pycache__/step_registry.cpython-312.pyc b/app/modules/agent/engine/orchestrator/__pycache__/step_registry.cpython-312.pyc index d945215c9510c827bb4c25b0e779450dd2dd52b7..83b7e297b5818f3c24edfe781986400dd776fd16 100644 GIT binary patch delta 1658 zcmZvdO-vg{6vub`1vXwg7-O5@&-DkalUl|^kQxVY2p=tAwTYEdCDe6cA$E*G<8@Oi zOBA(IPL)#4g+mXn)Jo|wvD8Cuy|${CL|QEr*qZ7u-=0QvTscCjj3Hgx*67e@Q&>ro><0=cp!mxpQMCK+WZbG#Jxv=+YQZFDI!}5197}s$O**7u%Y^GIh>ZlX>|a|MYwBSj-;hXS{(%9hGmGdZ&dP) zsvaO-W{(|yY?t|YiJw<}fcy*_tlyAB*QC%jH2{Qz(zG8_Qg})Y0tqqxP`gai7d#nw zgR(avc@xdxupCTD!BmqQlDVYBC7Y+AaJ1dkm!VLamqU|MXtFsJlZWP{p}8i<%Unw0 zQZI=+D7)j5JKhWo%Ymd6NH(J*a&$?GE*+*(pz*7D6`@fwY6+n$WX0k}Nm8|N==H{d zWm?z?h0Q`m#JjQFnNyY~zu105sm9a3Ze93osAk^8`JHh%kQ^1>gj=5B?P(h~whP=Bc5wk< z5d_{x{5}=MaN;3P{&4jfTI-o6$F4BSH2edPv}l7oanJ6(3He}Cv2GN~`BHA}UcS63 zR0NSXtf}p8TDU@lW%4-6?0~D5k(;%l_6DrZ>7b0lD)au+4ZH1U{hOG8mrLJ^@x3 zfmrCTLftZ-F-MwVz3C{640`9$g&sY3bkU*b{*CwP9qk(-op&gvTyCRE%?i03rhftq z-l3vcPtH%FvF|gY%g`*v04Hp&+8PHbJ Ef1Hh*sQ>@~ delta 1752 zcmZ9M%Wo4$9LHz;isN|gkOVuy&#WKG8c2=vfF_O;$2bWkq_jy(Bm|3bmf&C<7~3Fd zBp=$U<$zRms>-)Isw#n0RjPdB+Dp}{Quj2K)k=szpjv9w1EmsLkIZ4o;tuG8;8|e4l$7`W6+aGpMeVzH@sxPh?`TcWysIkr>^J-*X zGxK}Rf&_p4J_}5#fhnyEh!qLm`X%N|sJ?{O4a9~7Uww=DCshB0)&s8&9l(6F%JX*GCSI|8T=3B!#j6K7O$M(YPMa5(2E5El}pCs7tVug1=6 zZXkn5h%|1nXhw}@v>_nJ4)^f@@ggDI_>e`?Y9y`sfcR03mu<3WT8*Z)0H7dDU15Q7 zH88FX0}+uh`ecAfORBW=G6W=ygizxw3(uHSV*KMRjCRn*o%BvJoaGRWYf}0!i_tgPb`a 
z^B4)9`V#ZTRc~BNV_*xM1siTQJgp8-YZ;xLR^c&Z4R~9lI2bnv?dkP^}>2udf zKBL3EctPd~#13)|B*#Uv5q;)G0Q%tFod&m*2jmvOBFH?*buKN-{tS7#=kB-Y>sg@B z+!1`S`P}`55#OYDJ=xj{?`rJm6yH6fM)97Gr*%$%6C8+x|H^z?^a*MkVrE-=16$1Q zc5jbaY#(!(#aHouv#V1G>5{8uljR##KCB|k1P&}ATw2ck^HXT1{oVKkY*xzxCv35# jKZiVm)Pn(DTa&kUHmgM?MZTo>0?W?Z7`y&KpnB4OvncMT diff --git a/app/modules/agent/engine/orchestrator/__pycache__/task_spec_builder.cpython-312.pyc b/app/modules/agent/engine/orchestrator/__pycache__/task_spec_builder.cpython-312.pyc index d8182c7c51771cdd39af4e98f770c46e346d397a..82a2c037227d33a137e495d0f1e516f5cf623fca 100644 GIT binary patch delta 1862 zcma)6No*WN6s_v%+4sdWp1nO@2RjfF99sk|FCmdRSXNlFfIS{O?a7c??CP=NL><{8 zL<&A2LIsIKkU$_r7JE9ND}1->ceHhw4;a1>v%I)>pS~#Hv^oRFf*IX4S1)R7AfDgy?ah z+w4_6s*f%UT`p0xlYB8(s>>Cn(GZ&PgkKXCxjaL{Ooc=MngP53J^(*J0w4(x00;tv z0Ga?|0BL{}fW$z7Fo|ZwtG2foTSB@-SH=ENG_w}(Sj#-N7R~%_H-{V-Y=62MNpn%e zo%FnY1P{`0>|Z2zvk>wqz#aw))JKw*NGDkK(f1tvc$jLAHrz+=IX?d%-Uq1HmBM|E zhh15WJLpk&(%%ch7A`DF(je)iu0)*v=JewJ#trv*jI)hC?*Q(3kV@7>_A+Uu}KhyvfTLv-C07n0{!pKys_`?XD zRgA+tqnDLku)aes)%_7KjKgR%3g(TR0V4DQF_J2023C40c<1x!b5e5vD%| zy?BcL8oa`?prImU@pn_}WC4VW04G4uLS6(u!9cSTdA6LNmW|D!D8vjtW@A&{OF6Jk z0f3MK>+CM1`)6;bYB-Hg(#zp@*shO9I&jxh;Cm2Y1OUFXD}o^iJ*Wr_Ae?>~jQdz| zSOxN>YF)|A)W{jyXg(azGLLC++jwk@uv^6I-%=_15dAFXqR&U8Vi8|A)AytMdo7#a zSHW5^T^Fz2kKSUpZM7E#78#5kOWL=1*bDZ}NUr@_Y=6?R#lul>?(q02)-oz=Q)W4B z?EVbOy2jXbR+UpS(X7gRh4s&A%?4dmsGX{21x>8xE3#%`N3Z1TbS|D7+QrVAwGlge zk*5Km{|U?wG7PYzyme-KP*PaWuwH2IT?F%O21{to8N9UcL!{?sr015i=XS8=qZ426 zT6u1Ar8u`bOjd@J)nI+aRcCJWLgL0E6n`umGiD8M6+Un{<5Pz-K6*Iw!KR$?Ap}>& zC(&I;xun!zD3qq_6UJ0}1(x_|;*8HH9>|9jXHAHHoa{REpTg%r0!ot=`4?$y3|38i zsa7j9a{^rW2Ipup6(8bFZ2V^{XWTrigQc>3Se}_+{(LU-YIAXD*X>q%I`zn67pS%X zi~_U)v;(vPWB~Xmgz6bn!3_)mN~K(7@G8=U4zokIpg@yu1!hBuv~CCHK+%Nm1m;5C zfbIt70Y5J=-#GHcbwAJm3QKwrSP1!IdK0iP3Z?W2ux1pF>QP`Zl$7*1umlRV>PcWJ z delta 1913 zcma)6TWB0r7@o5`v-jES>~8j&Np_QF(_#`yW1}^uO)n{glvYVAX|k-llV$1dCONaA zO`MQEv>>8jd%!2f2)@+D2MI_~gg*Edls*Irqk>@H@-n)4F(CE)XRO&2A6(dPzWL67 
zIp;s~|93}!9QAIAqKl)y)vKRYrrYj%pP~lwqF2d9H05VhKxQ2-x+A0Pmb07wD^0YU&0Knx%akOF9@pn@A2=0A7b zqvRH6q{%LSB9Q3b=^WgNb^dPWCD%KSJ=f+-LYYV1!6;XZ*z$(}2VhDgl)_5ZLwO>&+z8rmH9*5D!Zvo)aWWC;N0VeV&`lG57NB{1NpC2Fu9C@6 zWc&*C!c*XC)2(U)>(;W;z^{QiN1P-!6xZSN#`pKrG}@-M(C)S zFhhO`b)i{emG0AoUr1$07>;CSi_{Iz0hA$_4Nn8VMnSh@WuaOqDrVvt&;_s&%!KS) znFTcu0D)9kr^S_Jv$|RGVWb^Z$XAh%Y1TKQ^sr~Z_AJ0@0BDi>7Be7xXsU$*xRb|; z1o^hjhi;PZ+b$=2s9c1#XFn$;^cGNaACdWJmgHjs8X!t6J9y97;2-?s)}q^6 z^|E!tLEewOa@^Ts;cB&L(cP_j*+M=4ioKcgv{-mbo`ala#9~LuyYXyFY_Sqc-sh~g z;vH*;_eFi5?frEw=bNP0URQ4@Sof>S;*wG?D)~~SP{xItZm%i0R;ZM67M-sbYKm@~ zo2NoUcP^<)zF1Hdm39LCfJ^f?>bhNBs?nEaYVF2~x>RlC8_NrdPR=GSo;*ein{!~O zFTMZ(ErVg);8Ot4U7$*}gG*Z%6J$5xv#o;q=e=ZPXeXI{B6U1IGW;mFElvC^b*#Vr z@a1jk9I)kw@onjhVdvw)J;5${4bCol4L|3RNMG{HHMsN1T$?$!*k{Rx7h`OKF~<6x zF*eH>V*||ryG>BocU3EDW4cr+HfGETcN>=2m}BN_)-eVHZ^Z|e(<-T?(&w4_UxDvH z2q;pHXCuMv8&Gxrt@-&Xl_57~sqT;;Qi)TnVa#{BvLdowovT!p%gWpgwP%Bt)tv>0 z);O$^-gMvE5r`@Sj00oiU-6v zCuNKTuq5XSHdB-}Cs0JlH2b~6a8q^)k!D1skhThSwC9QmiwO6S3+*@KhYzyvI7x^3 ZQdJqj?~$L{y8`DeXc7Ivu?L3p{tZd!ld1p! 
diff --git a/app/modules/agent/engine/orchestrator/__pycache__/template_registry.cpython-312.pyc b/app/modules/agent/engine/orchestrator/__pycache__/template_registry.cpython-312.pyc index d944184c948168e95b5d40e0c3f35aaeacc89cff..0a99665a3968551056ddc698a4c525ebf399f0a7 100644 GIT binary patch delta 765 zcma)$OH31S5XN`5?Y6FM0i_@mY>gyPL!wYkXem|E8lVV5eMB$y!D?zKD_aUN)hGue z2Q(z(!AFQ9q?U;I*ckBuB%XX7s)iVQFkXxYV^YPa(Zu+lZS?FOe&5Wu-~4BOofA=&tVCl0_?5WkM9kYm=PcFM|2o5+rf;tmzRJAubLfT#d)(0 z@^HZ%!BNW%sKS7CG&qvD7QADByf)puFD*r0OFb{7p0s@OE5~i(Tlw!K9sK8YCR|n&H`X#Ew!q_{ zb4J}(d|s$%p+u0-O4v?lBkUk_5IWT#RIm+6zv%8Rk~D}CQRYSh6{pVXyF(0)ieIpP(NTuXxSCf+TT5Cs3ZqY`iK!*{-O&i9>j@+Y^q>wAywk=1Gz z_%~_og{WcrOuwKvEM*qE3EM0*``@n%q>=cba5RjzN+S<<)$Ct#+}tZmYI7s--(#rn{e}yFUm*53FFH1tG{n>|lWB)(Cr5;5ZyO6wqAd+T>cTewnsmyVerZ+V^mC2h1Zg1@3BK zOKETuYhixgY+{?KNr6x5fG}y=q4m$?w&BS~_MBflSgZxsYa8NPBfc27z+@2%%?FJi zY?-gtXGX^w{)oytf);{}m>so9TtajfzV{Yky|V>oWllSng)h2@F1S<$u609Ov zP0*alFEIlIGtRN?01Gp{p3^2VPMri!@g68iahi3uCF47i{LN)4nhvGZcs#7^R#bfK zEf(8J+rc-NLMPhG=fmpEs`B3d+l_{b`@_o~R@@t}(yU4XeXS$1RaUk~6QOugl4GG= zm3+^&O246#E3nDq#-Ej5G0cwKW(Q7|hdJSdk4E)!hAebr;R@N9J*?8Z0MSD_F0`y1gHKK3U^50D2G6NM2u>kV=Z zU~|oEIDx%24SeB~8h^zgl||AybJK59L0_S>E0pSNs?M;wHqlI%<%+spztE)g{jFXn zW+a%&A!}$&CS!^&rT@t%D*7?{RJv5TMKFjzYLAO1V}K1gGbigNz?j=q9!F5h;hU+B Zzp5IO=PGycd3|}|at00XNzj8X`whh`EBycf diff --git a/app/modules/agent/engine/orchestrator/actions/__pycache__/explain_actions.cpython-312.pyc b/app/modules/agent/engine/orchestrator/actions/__pycache__/explain_actions.cpython-312.pyc index ad3fd1e6f7087fa0d0c69c95a5d9fc3a1e7d3521..1c0860bf88a48c7594e055b6be3dae2fb53aca64 100644 GIT binary patch literal 19075 zcmc(HeQ;Cfx#u}Y-?D7UvL%0Duxt~+fbC#D0)&u&4Tc09u%QVgL>4--Eh9_j9LZou zPSTLJN-tZW=@#cEnK(0dChjD6uG1#PO`DWeG{BfSAv=9>db;orqtqbY<4aW^Etq&RdO~*|vZ3vnBEypc9Cva_?Xnche zO##DuI9)**AfYaBqR%W`6;(EHZTy8qR4RU zIUDfDD6b`gI%g@*wo8fzyM6xXk)eTrUmZXV1vTPtCy=6%yU4$yKdwV8 zAl4(+AvTB(QIFUt7K;YNCeev`>yMien-E(Nn-N=J)_E5gfMlvmfladEzHZoO=F 
zpI$4Ujr&ZqX)Pu4J=}ZQ_Gfun;5mF}`yLUZ7Omc2IEo;7uJ>+|%kj~4uu z=57cs*dcstJ7kCpaa~*=rPE=*xIrt6bA5I#cQ%d-TDfR^(=cM};YLhx0k~tT~m+6d}kIsAYa4OPL}$5|hwyK#EZI7qp^`iFJ-rHmJWh7Cx0Rsug?w zu+} zfvB(Ep3#Sbax|ll`s7m?Gv$fCs4t`IK^HnXDrNLy(2ry&5YE`W*@3b%#u!m4IFK>= zFk)ED*nFbs)rQFw(N@U;Sq6#q2E~kSAQqK~o@DHuM>>wS968#tr`7v-=acRA1sS~@ z2z6(4{^;2STP6`kWK09Tp-{vpO3Ua=PiG;OQ?kI9rPvrjrp@-u{_Z^AbW{#Va+5C< zINQ`37^I!<3;3h5cd)t17xaiya4-<|VD=uA_%YQcD(DG>qfN+c>W_#q9A!2q5bg2gsXzCnx=qCg}vGD#(|-X-5&{sFj#iazkUo%~~ zE?K#5y0R%**)&l&DWxiRr`#=xf}6`dVwkln_>E^N(qeE}(Os}Y&UeTOf(LAv&wc?R43ydY!glVRTt9u|} zOP4MmJvn;%;=!*to24kxO0!%^vmDztelk_LVXE?>iH?+e>$H1k(!Dd~ek{@Uc}Zp3 zQJ%JyPur@Kw(7K_blOpqbkw9v%hH~8H>+#MHod#`ovq{hCj3`=Kj@v_axl5&V5+V? zRsCeTsxDo*e5QaauD#6}i=BVenG1_%EEsvFk}F$wx%X0U+GbDNTsNurHA&l=w5=rV z^o*@nikr}^tQyVAtce25zPKteqL{00;~cno_oxvW>>k~L)45DECQCV~ruGAk;X&n5 zkA5nR4RRyG5_B%lT|1(S=5E`#kZ%kz-<_z(d~$cLIy!~%9DN@##Py>7)EdwrecT`k zq9JU{!HJf)gdRR_#64quQ;++G>m&GF@WYx5qD0)-ua79) zf?wS9UUsF7mfG4SBEu5hI3O$$znszgBVr&%T$M7K{HF3tSM9_3?v zen`=T^07zx6cs)}&W~A$;>qMUlP@FtJII)%8j~L6d^7ns%Fof_XGmS}plCwv=o97R zYZeB?>KLX|Mw->41I=u>5~ad;=FhKHu*@yVWOZvez0*zc2uJSz3`_`;&c z4-$krVZRhWf>HIRkZ%aorZLu^e=*6IX>{cSwtxwmG0kq$qkM>*kFnlgp!5^WZX)?@ zlzs2Mm)$trOf!}Or(;1W&=2Y*H__hG^cPG$G8mB9(i(@24c$RWj^ZvcQA?0C5K~!> z^$#$TmN9^l4#>Gn3I((WnSxNHC+PQrGzLQTX32xKN-Gd#Y%$PUZ$F4A2&hy;Sq0vp z?3H4;0eoSpo@x?+OKTCJ={icUr+{E0W6CxKtyk~Ej3C9NtyE-SW6EZvHR>)sNCkyp z+a=IkFRL(IwSdCTD{Vvr`PT?u;?hp{VjV%9r1!6$X9;ilVh8SXo&_!;7U+8$5_vH6xYX%?;ABn zGVDaHe2n(IT8!xBVU0obOi`!ScVDTNnvE9_J@Xu*Cv8GCmBZPL)Q=#dX7M9q>kfu} zA+IkipMk(ypkfxGk+hrw!ZGP#3U(mK*uZXzfoyGj5x!@?W-%?JKeFfPBa2W?+DSba z7(**PLR~&e8Af&JJZNT7kv}*b7%pFcV!4u5bRl0suq1lr$ETZ)(u+@Be&*6MW6QuF zxHe3=h(B;_O&AuS<3hwsZ+qJNC6!yzq?gQ7zeG?>c@T{%(il5oCG&yDENOAe2qI>5Ev%DtEE zLKAHyen>xL@I%n{3nRiAuHM)Vl1P`0M6mA9O#>fv+ioP8pU(IRj%WPJ2}o%pJWy_K zO_bA5%t@v{};al1vDOlwwxNP}MFxrbeq47cKAR5Y~0`Pbk zA--eT0j3#Du(We{!vY9dATu=X2u1w9ki4@o*W^j+jTrm?&i&~nZmMY8xGNExwpSMn}JP>!A%7gFwZQ|=Ap(hX;`QoJW&L~yg9 
z2os%Hb-ifY^@45bl8Ut5_4QXK&bgPD>4?9zY8S7c$5+yl7=W!pVy=D$Iwb~f3jqPr zsL-DOxL-yWx#NC_j$0%(Po4wUI`s=@g;ThB&+-zf+;~cd0pC0^uO<9FT8jeF+bqaz z1IXwY$mnAP70LvQk{8tIQak7z?r@@Of^--aGJ?N%F8oLx$Q*812syLdLmsF(f+jLR zJqtC?&TGQOO{2|kZ+(618{0oCX&5iPUeYjC(lX(^QvN~tPbw#)$qg;3l9r^iGYd_R zJj&PeQYX5=w1^vl6bzPO-o%L54p%NcF={QBAb`|nccSlW;&o?d%HFA%JLi)WI*Qzh z|06z#q{!f~!I4AHAPSvJN)~b9{hTgrM2E{O53z~!^J?i?zk2ZWhPbXrAQ?a^jvM-N z#0C?{{^EL#&;SgKd9_C-z@uR?VFEmw+hZ~Gqr64@^f|Dy6U0S>@jlNOZ}8wR6H`u% z@6Yl7$0#Qjml$x3y(fNn_LE9K0y_-G-b2!WC;1Nfk9t5sxs_Z&5Y*^Sl@0aSww^qc=({q*kdrcjZzEd8>8#LuFj+UEqa^401z~W zt;pz9+1z2lYMqB@wBrCa{`PJpa)?&8G+ouu4BE5OIG;t73Lpwci1H&IgjIR~eQ6k> zA0Qe#ho-7H(YO%w=3M)EkRpH{Wm1f&q_lGPSnI@*E5|=L{?W5xkHD~iU{868)=Pngl^AtUUzxJ+Na%lG18INxcnI?Tj6p|S^_OPO=1AL$FC2LF zz-Z}p+X@iq3(vp$Jho+g&vocb?7J27?!`z;h|9<{$+tk-IS5;?Yma>gxyt1nU?xb2 zBbgJz9pusp0%Sr42DS{cA>cujRX#|5H+g{xwA27uESME>flrGBM#7n;1V)lv+m1k5 z%}R3tk(64b?()#Ncz^-{RZE4EZYk2AM_|mNPveg`T&_eBC3ct6VatR8xif9U$Ko@c z!-V!w6JlIn;z<4}!+{olgAUYU#eq7Xp#!y87jJfs>3Rt%Mj`nja4ulIbXnG`4mhWq z!+PoRQA21-c#ZYasGF#xD+~Bo8`pvTdI0Q~9@O-&(>K$v%UJHuL&6qHh92 zBj~%Shi9V7T`BvBIc}EToU?1@yt`(ewOiyZx7?%Lx_H;~-6tAtgmpoX>S0@;Q(y2C z3*!0bXqz1adaA;>1!2)!Ibo3J!fTf3(SBb&Iu`f1ga8U{8fnPhFV=KiWHZ1mYsxEv zS~WPtE$bX9fCi2eOPB@c-|q{eOJ~=e6()GC#}V6{lg*D0b9QY}>%x1sb}c>jUC*~D zUL@7Ti}DHE8VrbTGT7X$7r;0_VvpN^0YAh^g1$i>Gy0YGm5&<5OSIl0A%CXlqTu}0 zA4bY!anzt&YXw~QFkC;tanFLu8*z@5iZy*C^e*_tOI5m{wH$E)Rx6f(y4tuCP+Avv zAzV3^2A&K_woFM&v$u86p*`)bd)ix$dD}X69om<1v^?nTeDY{V%N}pbzN76=bjHB4LD+({ z^)88W9!Ph(pL)rDDL~!I{Ef zKRNOQL@)8OF%mKwJY6j?#|L{3`od&`=)y?z8Sjsnv4yeZ=OA2y@z#tizEBV%BkFfa zP?m8qQn#W#MAvi7`g8>nDhaQ!9;uvq*RaslINY#!OqI`$$l$WkqYvg~)^#xK=^8HX zV#cEZNK;XmmLeFf1p`SW_I;s8kTFDq(NI953+#*kjbF@xRD(x(2V3?F%mCIgPgji8 z3~T zzzS8RWE68mlRLb%-abdK==4G@FM8P-dS$l5Ol()>;u3gy5U5d~Ud=*ggbs6hem?`<%J=0w6P5U#zMWrjZF37%;+*Bx?XP zvoYY^B2`hLM1hPTV}h0#UL$hGNZNvc$Q(3ex+ozw=KHDIST8CpvfRznO3+N?)sSzP zejr`qO4m1jRwj;>epojll1ISgnQKC7%M+>Q1Ii(Ps@eaydZkQE*bt;wum8*?jz&M! 
zO|)Lw_rbo&r>>q%ZQ7rD@NuQxpL)>$eNl0V2`d8Dr$2w6CsWNQsZT%k>Hq!qUCH7t zN?VuW@WDz!-U_Rej@9F~lw)(k@OgpbW>xi=;a$r+mhrU{CsUqn)1F6?o<}Cb*YI(8 z+H*ANIjVdkp!5V&p1xGosp+cI$*R+-DmiiRbGtk3a9y@tvMJR8rTf%$cO=;zQ3lQ` z!|_!2h!S{VhT~r3PY8&f6HZZd0v?ozLPD6L6%pZ&99KLb+|s%^aP8=CkNxJD@|0Kc z`BMi)6Q45I2;eh;-c!|$bNuK;xm+V2thQt+H}*`tXc7$7+<)m0xki7B{>;O3Oy|Z<}8?kClzrrpg

B{{dv3bQM@z}KWo*qj{8^f&DmG6vr(9bSd+*0nWc#j*J=5+rN%xwRd+m60 z(%m@i-jsB2y6)a`RU%K4a`Gf0eNau(UNg4py1ib@Se3M|8jD@GHy~r~J~vGm#?MUj zP7Y4?UiDvFeXT-S-g(1uB)fs5!{ZN(mra-_-IM04x~pwhw<*gHUU#&^TV-??6ikT& zY5TI#(bBAlD4vU{z#jRbRFsDmbe8pKUn-Z<*lCsNNFKr!t9A<-(byFoU8(@RV zmyNc*z3=sXW6w;~rm8kiRXsfEOqK7LF5jIj-<>LNP3*s){V1t=r%hWRt@!Wi#>97f z-{~Dci{4i4m|C^_s*tK}xo&TzTDDTv!-mY>*KLhyyL+^EY$&;6!<2mk%10}bj=E{b z+N5Lc_(L}w8)saY{~z33-RiNk@4oQP3#ru)rE0fMIwxzC+8tkU2Fv4o(zYV8|Kgdn ztK#ypOUFjfj7j7C=<_Mpy2ReuJsy?EwvDY*8crzBiOLCpU$?Tn=Z2$K+v05TPId%7 zW%<4vj{P$Q)IGd%Ts1k!vRe9h?A-q~043V;K*X}qGjET)KJvzkNqgf&{p6Wnj{I!o z=Pyn*x5J-^xhp+1UAH+|w>ee!(4;U~tkgZ6s@ySMxf|NTRAuW$6FDuFR$M-C>A={Q z@m1sHK%VAKLZKtVGZZ~5bWwCdjgAOD`sx%;Qgjk#7DTloA*(AO+{$*eacpp`S804o zdD^RViHc8Do(?Eabt~OHN=@&L(jXm3A3syhIZ7`Ky*f1NO4+LwbM@C>?H9Q60RMHy zC4QYL^UF0Ln2EZM4f^->2!2}Eu|@x@NpC9R|Jtlam@`EYlTd*FU8+e27DUG? zFy-dF>Y-fUbD8pbOS^gnlbtu#fPk^luYr4pdia~fFR4mmRmZ%L+a30(3^jE5#B7El zky9SW|^eGKe5T{@SLA^=xQ2J#GYA9fwX+eJk?2A9r-yaDx?nm;WOmi+^ z%t|X&W`tgQ$sSA~ctMl3iX6+DHP)Ytb1k4`$KX6AK z7bc?D^uIO##;hDZc4Oah#osgSk0t%F>w`ne@C*NwAf#&-4hgp@ z^RVz0Mex_*xTC^TlzE1KnlMu2j|o30xp?IA)0dte?FH3#t($U@vA4JxbulnJbXW{) z1-OyVTc1c4Kc*a(6-P8-xLH*8dTssx1?^*0+7d@s>5O9!RuaT%}!{8%Lv~ z-DA5(m*23hp3$NBmc3*?Mit_jSopIn>gl1q{%G91O&U_F*FlK62BrsYy*h z$R$Z=*vP)UI3f~oq(cZav(sxxXH3jxARx{+J<+-r%L{YGtKXo8ya$2minR99+Sk^@ zanJ6~a)oPB_H|SCq4A-~Hl?WrsjFwE+m0pMjw#1|skW2IP`bm4gm6eP4=vteJ-TK{ z2W1}j9pPS^{9R!dX%a#=wArtiMIo;>T{qvmgHC26-O8(w_>X+f>N+;DO8XxsIYI*?p^sy-Iyii-JqL#8Mz>-u}~J}VH)l+OP?DFq?7X3Tgez+8CUW6=!rQp-Ci>?XFEeb{+)I5;qQVBNoxhVr$%T z*VYAKHggsz5K-%0YZsz+(d<~5eL)-)fBvm_LEdvMj2Ct1W(4;91K2PV9qi9hMXoIsrrgJ z?7zzV5TC*na`U{Ch#s+v;vy2cYXUBy_jUth*ZL~J^Ai4{+B#W1Cu0Tf%2MR6q{nb2 zX-j;tmEAz^TLDb9QyK) z=%Gv=^}6c7q^Z}TCU0Vfnom>}`}VA39vfFx-REp2OmR=ZQ*^O%%4u#kL$1_=t4pOBS}%qg&;;2iSVQE10|8a(7X zMq4>|C$Yoqfem?&Eu89l&zat!zZYi$i6}tcW!QB*VaQF4-brO=Mrs3m;&Idj+qeaN zshiqGDvcilmgd=?W*0N5x{PL*l7Djn$)R9(z(3><1sbqf*kcvKkcJhDJ+Z>Be5m{s zMX)BpxxWqDmvyr`fl&CU!) 
zsAz9Qgstp8MxQmY47=t-i#mzMoph1janK9eMlUQd#6)A>cR)=khhPJ*uweD-58Nry z^57DvWlBWo^4}*9q9k{p6lGOjJiThfeFjkw-Z8qY2;u;}&%PTN)_WRudfMlSAvzc! zGdh?a=Sy=;2>J79^#8)#WU>sazOJ6bADo?#CtJo^#x{@Z$2;NEiD$2#!&ztVQa8_( zU9p$Y0tEGKaxK=t4VFhnt#Y(9XQ5sXCr;I9JsXZzBsGEm=gXo zs0*5S+`tEMq2sDPCk zv^1QQhEWUNr!+8G-!QPu@2A04bY2e#<{zjs`A>*@E}utf%800!`(FHCexvFOQMCImZ>E}FgLmzrrcz1 zbZ$ztr0vcNFTDDKvSRZM`zZ^r7Sx5oH03`pC&-qSH(NG*{z&}h3Tc!GkJ z6zrg2F9mHBkZ*z1MFFXDRdRWbQbc&z^~Cf!DlsC#U!s8imXS>6kXN|dLXBbh?Y(@d zp_9M8-eh>3zwOi;+_!DK;Rt`*Xg9R*w`&as>+Pa8-r%|2A(#+4O@`9jPQftD-!=;d zXpYRy2KOzuXU2dS-tVsJ855<7FU1R1s$_`GXxPbz(Ff`tumrh3*&* z9=jmXZJ#N~{`Cm*WKr!in?PNYKF$XK literal 4535 zcmb_gO>7&-72f6U@}Ck#{r^~6NpURN7HP|lT??vQTUJuLsccI!(yC3@8_tSan%re( zm$I~x03UqdA&2NB84X~im%5M-Mo|>*sXiKL3y@|3E*3Iipy(k$4@%@-eCnHBQW9mU z3DC}A=FOX#H$VHm?>+J_p^%?~^7%(!$sdL&>K~+2Eu16J+t)x?r4WVaG?k%eXqw2b zv@65RFt*I3*$g+s*)p4UXFM|=nsQMeP>8!vA-BYAFpjMm@1iH_{hD;5v=tB)Mb$-J zRuv7pp-TxOPUW&$73-h~k1t4x9MO%biY_e>UGNgtb2^|7qV$lR-BV)iHc3zvJYyw z2EBb8w5ya(JNr@85}n#RucA(|)$!dn<=K_15~7i7qq2UZVsqK`f3waUV4cy)UF`Bd zo}HDc>e&})%|%ntuik-OaxF1>U}q<`I)b<_nPo0XExVUoDFEI6vQbgs3rp@){r*vS zZz|PrfR;w?Lv9l}U^d|5v3H{a@#@KFV!Qc(*);O$EmeK0tt#!dOC0hqxn=rsplWm3 zv*bzAQ&coKVFj?56cP@GEy{|3IGfH%Nm)3L?sqISpf5)~44(PENdfkkpcx0tk~fH57*S<5YgX$4t95h0;! 
z&#EIQ)Fe#%uePQ>)nBc+k zWW~j_Dk4n21<*vr<-wxn1r1**Z$69P(j=_K#k8~#zaq^GS*)g{gsutmLvc~&5tipA zg@>c^Fp_|C#mPWYQuH_|;~5p@;MFS{NlD6z6jyO#PJ&B{x{BjKG=!bvlEVu^r60>K zTJ;GvorcYX%Cj2Xwd4`oH%@#`bAxx>qc&UG*ZLn0JQyf^)aF|C8sFrQRE*BR31)j?JdFwZ?~S58BG@ z$BXU9%k8mZd+gC?PcN6+FTQ9We>PNVdOz=b)%mX3cXqS0du{R8qu+SgMt|P3<)`|O z=7VOSr5xxg2D<(p=r)6GX4f&Z z28zJ}GuUWG`1O;8hB&Nf?SU1o z{(NAoj%w)2FBiOB+wdGz4Z#we*aZ7~5LSsR?oce5>y#xL(Lgelcv2;uW!E9De*g(D zGfO~M%eISb?;floJAAT1rRVxC4%Y2~%0Y95{)P0rUg%aWB!6r^bCyMLFD0 z9N{$443vrs=-@J`|=Y*B+GLY+2}{s;mfiB3VgzZ?{_mtBZX zRf59a`N)MB#3Bwsx&Wx5khg;}U;^?XKMJ5=(q9cBfUd*P6#5P{F*Rtan|MR}4gc-i zw=)vXh(MX9+VVlT542>;F#o=+Ai!%{nkTmR0@vsvGX{6pPgZAD0-SNwF$FWoM#KD+j&lh; zhb6?@G^*GcpTWS@tZ?;C0&*?|$R(h~r=YUflqxH`h_%@1@t;jc**%awO{M{4?HAIB zHxjHmP_6qUC#MmFBj?qOBm1ygJ4`HT3RDEFHK-)mjjlvrg}Y4P#c-b)?zd}x{rsad zk1nr|6vJ^dJZy$fo8j|jxOuBN5USf|LN?cS?O?aied-S<%R^(up|NKlmBK$Q)K3(= z6K{i;P>u~=a+p;r32FClXdt-5=+LF_WS4hWLighXA}EFVk}66C1( zJ$NyI$1cSUg92Y5T4onCz-$obHI2lO9kA3pXxOp!Ry5x?J_d`mA*k+A-@4wSsXLEH z$|pvOCq|y4Kg)lTpL34 z;68xiloQ;H52oiF_(3!TKveQPp3ogHYYwl#M@0ZjHB-Y^PN1!DQ_Kp^$^d3TRJ6Mi z2C``-TiUUbz+@kE_C`m`TO|_vgpdH+CicfipA(K{Lo(X-@y*W2hp>}I!Utr6jV)j3 zs|za&>rCP3`KOUm>&T1N%gLtNcq0D1dn>%dJ8R10bUAjh7`ynar4;^gq5f*Yd)3Au zIJ*=0NyM|<*!DcbmzFjC7vNqWLyqnh1Vzk9f?)Xtl3GBz402EqKFx_~X9N$BhdV>6 z5mNn#RO3+njL$ruw>vnwKKJy6}Jwq2dvk?mn0H$`u^gt_bV_B$T#CjB}R;I7fm zS~7bg>La_)=12??1Wa&iorU~8R>|LEB8hY{Cx4Ia&EI3yd_AFNGOB_}fUwSgzok<3 z&xh)7G;BZ9$?vz?PQx@O(X#JO{zTb*k2l^C=kTNWIt&ok)d+2EF*HrTr276zO}wP0 WUsB_*-PdXQ8vVt`|E6rgHt`=#c3c?% diff --git a/app/modules/agent/engine/orchestrator/actions/explain_actions.py b/app/modules/agent/engine/orchestrator/actions/explain_actions.py index eb97284..42ea04e 100644 --- a/app/modules/agent/engine/orchestrator/actions/explain_actions.py +++ b/app/modules/agent/engine/orchestrator/actions/explain_actions.py @@ -1,5 +1,7 @@ from __future__ import annotations +from collections import Counter + from app.modules.agent.engine.orchestrator.actions.common import ActionSupport 
from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext from app.modules.agent.engine.orchestrator.models import ArtifactType @@ -7,22 +9,28 @@ from app.modules.agent.engine.orchestrator.models import ArtifactType class ExplainActions(ActionSupport): def collect_sources(self, ctx: ExecutionContext) -> list[str]: + rag_items = list(ctx.task.metadata.get("rag_items", []) or []) rag_context = str(ctx.task.metadata.get("rag_context", "")) confluence_context = str(ctx.task.metadata.get("confluence_context", "")) files_map = dict(ctx.task.metadata.get("files_map", {}) or {}) payload = { + "rag_items": rag_items, "rag_context": rag_context, "confluence_context": confluence_context, "files_count": len(files_map), + "source_profile": self._source_profile(rag_items), } evidence_ids: list[str] = [] - if rag_context.strip(): + for item in rag_items[:5]: + snippet = str(item.get("content", "") or "").strip() + if not snippet: + continue evidence_ids.append( self.add_evidence( ctx, source_type="rag_chunk", - source_ref=ctx.task.rag_session_id, - snippet=rag_context, + source_ref=str(item.get("source", ctx.task.rag_session_id)), + snippet=snippet, score=0.9, ) ) @@ -38,50 +46,214 @@ class ExplainActions(ActionSupport): def extract_logic(self, ctx: ExecutionContext) -> list[str]: sources = self.get(ctx, "sources", {}) or {} message = ctx.task.user_message + profile = str(sources.get("source_profile", "docs")) + ru = self._is_russian(message) + notes = ( + "Используй код как основной источник и ссылайся на конкретные файлы и слои." + if profile == "code" and ru + else "Use code as the primary source and cite concrete files/layers." + if profile == "code" + else "Используй требования и документацию как основной источник." + if ru + else "Use requirements/docs as primary source over code." 
+ ) logic = { "request": message, - "assumptions": ["requirements-first"], - "notes": "Use requirements/docs as primary source over code.", + "assumptions": [f"{profile}-first"], + "notes": notes, "source_summary": sources, } return [self.put(ctx, "logic_model", ArtifactType.STRUCTURED_JSON, logic)] - def build_sequence(self, ctx: ExecutionContext) -> list[str]: - message = ctx.task.user_message - mermaid = "\n".join( - [ - "```mermaid", - "sequenceDiagram", - "participant User", - "participant Agent", - "participant Docs", - "User->>Agent: " + message[:80], - "Agent->>Docs: Find relevant requirements", - "Docs-->>Agent: Relevant context", - "Agent-->>User: Structured explanation", - "```", - ] - ) - return [self.put(ctx, "sequence_diagram", ArtifactType.TEXT, mermaid)] - - def build_use_cases(self, ctx: ExecutionContext) -> list[str]: - lines = [ - "### Use Cases", - "- Analyze requirement fragments relevant to user question", - "- Reconstruct behavior flow and decision points", - "- Return user-focused explanation with constraints", - ] - return [self.put(ctx, "use_cases", ArtifactType.TEXT, "\n".join(lines))] - def summarize(self, ctx: ExecutionContext) -> list[str]: - sequence = str(self.get(ctx, "sequence_diagram", "") or "") - use_cases = str(self.get(ctx, "use_cases", "") or "") - answer = "\n\n".join( - [ - "## Summary", - "The requested project part is explained from requirements/docs context.", - sequence, - use_cases, - ] - ) + sources = self.get(ctx, "sources", {}) or {} + profile = str(sources.get("source_profile", "docs")) + items = list(sources.get("rag_items", []) or []) + message = ctx.task.user_message + ru = self._is_russian(message) + answer = self._code_answer(items, russian=ru) if profile == "code" else self._docs_answer(items, russian=ru) return [self.put(ctx, "final_answer", ArtifactType.TEXT, answer)] + + def _source_profile(self, items: list[dict]) -> str: + layers = [str(item.get("layer", "") or "") for item in items] + if 
any(layer.startswith("C") for layer in layers): + return "code" + return "docs" + + def _is_russian(self, text: str) -> bool: + return any("а" <= ch.lower() <= "я" or ch.lower() == "ё" for ch in text) + + def _code_answer(self, items: list[dict], *, russian: bool) -> str: + if not items: + return ( + "Не удалось найти релевантный кодовый контекст по этому запросу." + if russian + else "No relevant code context was found for this request." + ) + details = self._code_details(items, russian=russian) + refs = self._code_references(items, russian=russian) + parts = [ + "## Кратко" if russian else "## Summary", + details, + ] + if refs: + parts.append(refs) + return "\n\n".join(part for part in parts if part.strip()) + + def _docs_answer(self, items: list[dict], *, russian: bool) -> str: + return ( + "Запрошенная часть проекта объяснена на основе требований и документации." + if russian + else "The requested project part is explained from requirements/docs context." + ) + + def _code_details(self, items: list[dict], *, russian: bool) -> str: + if not items: + return "" + symbol_items = [item for item in items if str(item.get("layer", "")) == "C1_SYMBOL_CATALOG"] + edge_items = [item for item in items if str(item.get("layer", "")) == "C2_DEPENDENCY_GRAPH"] + source_items = [item for item in items if str(item.get("layer", "")) == "C0_SOURCE_CHUNKS"] + + lines = ["### Что видно по коду" if russian else "### What the code shows"] + alias = self._find_alias_symbol(symbol_items) + if alias: + imported_from = str(alias.get("metadata", {}).get("lang_payload", {}).get("imported_from", "")).strip() + if russian: + lines.append(f"- `ConfigManager` в проекте доступен как alias в `{alias.get('source', '')}` и указывает на `{imported_from}`.") + else: + lines.append(f"- `ConfigManager` is exposed as an alias in `{alias.get('source', '')}` and points to `{imported_from}`.") + + management_hint = self._management_summary(symbol_items, edge_items, source_items, russian=russian) + if 
management_hint: + lines.extend(management_hint) + + symbol_lines = 0 + for item in symbol_items[:4]: + title = str(item.get("title", "") or "") + source = str(item.get("source", "") or "") + content = str(item.get("content", "") or "").strip() + summary = content.splitlines()[-1].strip() if content else "" + if not title: + continue + if self._is_test_path(source): + continue + if self._is_control_symbol(title): + continue + if russian: + lines.append(f"- Символ `{title}` из `{source}`: {summary}") + else: + lines.append(f"- Symbol `{title}` from `{source}`: {summary}") + symbol_lines += 1 + if symbol_lines >= 2: + break + + edge_map: dict[str, list[str]] = {} + for item in edge_items: + meta = item.get("metadata", {}) or {} + src_qname = str(meta.get("src_qname", "") or "").strip() + dst_ref = str(meta.get("dst_ref", "") or "").strip() + if not src_qname or not dst_ref: + continue + if self._is_test_path(str(item.get("source", "") or "")): + continue + edge_map.setdefault(src_qname, []) + if dst_ref not in edge_map[src_qname]: + edge_map[src_qname].append(dst_ref) + for src_qname, targets in list(edge_map.items())[:3]: + joined = ", ".join(targets[:4]) + if russian: + lines.append(f"- `{src_qname}` вызывает или использует: {joined}.") + else: + lines.append(f"- `{src_qname}` calls or uses: {joined}.") + + for item in source_items[:2]: + source = str(item.get("source", "") or "") + content = str(item.get("content", "") or "") + if self._is_test_path(source): + continue + if "management" in content.lower() or "control" in content.lower(): + snippet = " ".join(content.splitlines()[:4]).strip() + if russian: + lines.append(f"- В `{source}` есть прямое указание на управление через конфиг/API: `{snippet[:220]}`") + else: + lines.append(f"- `{source}` directly mentions config/API control: `{snippet[:220]}`") + + return "\n".join(lines) + + def _code_references(self, items: list[dict], *, russian: bool) -> str: + paths = [str(item.get("source", "") or "") for item in 
items if item.get("source") and not self._is_test_path(str(item.get("source", "") or ""))] + if not paths: + return "" + lines = ["### Где смотреть в проекте" if russian else "### Where to look in the project"] + for path, _count in Counter(paths).most_common(3): + lines.append(f"- `{path}`") + return "\n".join(lines) + + def _find_alias_symbol(self, items: list[dict]) -> dict | None: + for item in items: + meta = item.get("metadata", {}) or {} + payload = meta.get("lang_payload", {}) or {} + qname = str(meta.get("qname", "") or "") + if qname == "ConfigManager" and payload.get("import_alias"): + return item + return None + + def _is_test_path(self, path: str) -> bool: + lowered = path.lower() + return lowered.startswith("tests/") or "/tests/" in lowered or lowered.startswith("test_") or "/test_" in lowered + + def _is_control_symbol(self, title: str) -> bool: + lowered = title.lower() + return any(token in lowered for token in ("controlchannel", "controlchannelbridge", "on_start", "on_stop", "on_status")) + + def _management_summary( + self, + symbol_items: list[dict], + edge_items: list[dict], + source_items: list[dict], + *, + russian: bool, + ) -> list[str]: + qnames = {str((item.get("metadata", {}) or {}).get("qname", "") or ""): item for item in symbol_items if not self._is_test_path(str(item.get("source", "") or ""))} + source_texts = [str(item.get("content", "") or "") for item in source_items if not self._is_test_path(str(item.get("source", "") or ""))] + result: list[str] = [] + + if any("управление через api" in text.lower() or "section management" in text.lower() or "секция management" in text.lower() for text in source_texts): + result.append( + "- Для `ConfigManager` в коде предусмотрен отдельный интерфейс управления через API/конфиг: это прямо указано в публичной точке входа модуля." + if russian + else "- `ConfigManager` has a dedicated API/config-based management interface; this is stated in the module's public entrypoint." 
+ ) + + has_control_channel = "ControlChannel" in qnames + has_bridge = "ControlChannelBridge" in qnames + if has_control_channel: + result.append( + "- Базовый контракт управления задает `ControlChannel`: он определяет команды `start` и `stop` для внешнего канала управления." + if russian + else "- The base management contract is `ControlChannel`, which defines external `start` and `stop` commands." + ) + if has_bridge: + result.append( + "- `ControlChannelBridge` связывает внешний канал управления с lifecycle-методами менеджера: `on_start`, `on_stop`, `on_status`." + if russian + else "- `ControlChannelBridge` maps the external control channel to manager lifecycle methods: `on_start`, `on_stop`, `on_status`." + ) + + edge_refs = [] + for item in edge_items: + if self._is_test_path(str(item.get("source", "") or "")): + continue + meta = item.get("metadata", {}) or {} + src = str(meta.get("src_qname", "") or "") + dst = str(meta.get("dst_ref", "") or "") + if src.startswith("ControlChannelBridge.") and dst in {"self._start_runtime", "self._stop_runtime", "self._get_status"}: + edge_refs.append((src, dst)) + if edge_refs: + mappings = ", ".join(f"{src} -> {dst}" for src, dst in edge_refs[:3]) + result.append( + f"- По связям в коде видно, что команды управления маршрутизируются так: {mappings}." + if russian + else f"- The code relationships show the management command routing: {mappings}." 
+ ) + return result diff --git a/app/modules/agent/engine/orchestrator/step_registry.py b/app/modules/agent/engine/orchestrator/step_registry.py index 918809e..736c473 100644 --- a/app/modules/agent/engine/orchestrator/step_registry.py +++ b/app/modules/agent/engine/orchestrator/step_registry.py @@ -24,8 +24,6 @@ class StepRegistry: "finalize_graph_output": self._finalize_graph_output, "collect_sources": explain.collect_sources, "extract_logic": explain.extract_logic, - "build_sequence": explain.build_sequence, - "build_use_cases": explain.build_use_cases, "summarize": explain.summarize, "fetch_source_doc": review.fetch_source_doc, "normalize_document": review.normalize_document, diff --git a/app/modules/agent/engine/orchestrator/task_spec_builder.py b/app/modules/agent/engine/orchestrator/task_spec_builder.py index 1c85dda..858bea7 100644 --- a/app/modules/agent/engine/orchestrator/task_spec_builder.py +++ b/app/modules/agent/engine/orchestrator/task_spec_builder.py @@ -24,6 +24,7 @@ class TaskSpecBuilder: route: RoutingMeta, attachments: list[dict], files: list[dict], + rag_items: list[dict], rag_context: str, confluence_context: str, files_map: dict[str, dict], @@ -32,6 +33,7 @@ class TaskSpecBuilder: output_contract = self._output_contract(scenario) constraints = self._constraints_for(scenario) metadata = { + "rag_items": rag_items, "rag_context": rag_context, "confluence_context": confluence_context, "files_map": files_map, @@ -124,14 +126,7 @@ class TaskSpecBuilder: def _output_contract(self, scenario: Scenario) -> OutputContract: if scenario == Scenario.EXPLAIN_PART: - return OutputContract( - result_type="answer", - sections=[ - OutputSection(name="sequence_diagram", format="mermaid"), - OutputSection(name="use_cases", format="markdown"), - OutputSection(name="summary", format="markdown"), - ], - ) + return OutputContract(result_type="answer", sections=[OutputSection(name="summary", format="markdown")]) if scenario == Scenario.ANALYTICS_REVIEW: return 
OutputContract( result_type="review_report", diff --git a/app/modules/agent/engine/orchestrator/template_registry.py b/app/modules/agent/engine/orchestrator/template_registry.py index d1b6ff1..b6554a1 100644 --- a/app/modules/agent/engine/orchestrator/template_registry.py +++ b/app/modules/agent/engine/orchestrator/template_registry.py @@ -43,9 +43,7 @@ class ScenarioTemplateRegistry: steps = [ self._step("collect_sources", "Collect sources", "collect_sources", outputs=[self._out("sources", ArtifactType.STRUCTURED_JSON)]), self._step("extract_logic", "Extract logic", "extract_logic", depends_on=["collect_sources"], outputs=[self._out("logic_model", ArtifactType.STRUCTURED_JSON)]), - self._step("build_sequence", "Build sequence", "build_sequence", depends_on=["extract_logic"], outputs=[self._out("sequence_diagram", ArtifactType.TEXT)]), - self._step("build_use_cases", "Build use cases", "build_use_cases", depends_on=["extract_logic"], outputs=[self._out("use_cases", ArtifactType.TEXT)]), - self._step("summarize", "Summarize", "summarize", depends_on=["build_sequence", "build_use_cases"], outputs=[self._out("final_answer", ArtifactType.TEXT)]), + self._step("summarize", "Summarize", "summarize", depends_on=["extract_logic"], outputs=[self._out("final_answer", ArtifactType.TEXT)]), ] return self._plan(task, "explain_part_v1", steps, [self._gate("evidence_required"), self._gate("non_empty_answer_or_changeset")]) diff --git a/app/modules/agent/service.py b/app/modules/agent/service.py index 9467b56..0b3a114 100644 --- a/app/modules/agent/service.py +++ b/app/modules/agent/service.py @@ -117,6 +117,7 @@ class GraphAgentRuntime: route=route_meta, attachments=attachments, files=files, + rag_items=rag_ctx, rag_context=self._format_rag(rag_ctx), confluence_context=self._format_confluence(conf_pages), files_map=files_map, @@ -368,7 +369,33 @@ class GraphAgentRuntime: return pages def _format_rag(self, items: list[dict]) -> str: - return "\n".join(str(x.get("content", "")) for 
x in items) + blocks: list[str] = [] + for item in items: + source = str(item.get("source", "") or item.get("path", "") or "") + layer = str(item.get("layer", "") or "").strip() + title = str(item.get("title", "") or "").strip() + metadata = item.get("metadata", {}) or {} + lines = [] + if source: + lines.append(f"Source: {source}") + if layer: + lines.append(f"Layer: {layer}") + if title: + lines.append(f"Title: {title}") + if metadata: + hints = [] + for key in ("module_id", "qname", "predicate", "entry_type", "framework", "section_path"): + value = metadata.get(key) + if value: + hints.append(f"{key}={value}") + if hints: + lines.append("Meta: " + ", ".join(hints)) + content = str(item.get("content", "")).strip() + if content: + lines.append(content) + if lines: + blocks.append("\n".join(lines)) + return "\n\n".join(blocks) def _format_confluence(self, pages: list[dict]) -> str: return "\n".join(str(x.get("content_markdown", "")) for x in pages) diff --git a/app/modules/application.py b/app/modules/application.py index 8f4cbba..caab8f3 100644 --- a/app/modules/application.py +++ b/app/modules/application.py @@ -3,7 +3,7 @@ from app.modules.agent.repository import AgentRepository from app.modules.agent.story_context_repository import StoryContextRepository, StoryContextSchemaRepository from app.modules.chat.repository import ChatRepository from app.modules.chat.module import ChatModule -from app.modules.rag_session.repository import RagRepository +from app.modules.rag.persistence.repository import RagRepository from app.modules.rag_session.module import RagModule from app.modules.rag_repo.module import RagRepoModule from app.modules.shared.bootstrap import bootstrap_database diff --git a/app/modules/rag/README.md b/app/modules/rag/README.md new file mode 100644 index 0000000..a30afbc --- /dev/null +++ b/app/modules/rag/README.md @@ -0,0 +1,300 @@ +# Модуль rag + +## 1. Функции модуля +- Единое ядро RAG для индексации и retrieval по документации и коду проекта. 
+- Поддержка двух семейств индексации: `DOCS` и `CODE`, с разными слоями и разными pipeline. +- Хранение `rag_session`, index-jobs, многослойных документов, cache-слоев и retrieval-запросов. +- Поддержка индексации snapshot и changes с переиспользованием cache по `blob_sha`. +- Предоставление контекста для agent/chat, где `DOCS` используется по умолчанию, а `CODE` включается для явных вопросов по реализации. + +## 2. Диаграмма классов и взаимосвязей +```mermaid +classDiagram + class RagService + class RagRepository + class RagSchemaRepository + class RagDocumentUpserter + class DocsIndexingPipeline + class CodeIndexingPipeline + class RagQueryRouter + class GigaChatEmbedder + + RagService --> RagRepository + RagService --> DocsIndexingPipeline + RagService --> CodeIndexingPipeline + RagService --> RagQueryRouter + RagService --> GigaChatEmbedder + RagRepository --> RagSchemaRepository + RagService --> RagDocumentUpserter +``` + +## 3. Описание классов +- `RagService`: основной application-service модуля. + Методы: `index_snapshot` — индексирует полный набор файлов; `index_changes` — применяет инкрементальные изменения; `retrieve` — возвращает релевантный контекст из `DOCS` или `CODE`. +- `RagRepository`: фасад persistence-слоя RAG. + Методы: `ensure_tables` — создает/обновляет схему; `upsert_session/get_session/session_exists` — операции по `rag_session`; `create_job/update_job/get_job` — операции по index jobs; `replace_documents/apply_document_changes` — операции по документам; `get_cached_documents/cache_documents` — работа с cache; `retrieve/fallback_chunks` — retrieval. +- `RagSchemaRepository`: управление схемой БД для RAG. + Методы: `ensure_tables` — создает таблицы и индексы; `_ensure_columns` — добавляет новые поля; `_ensure_indexes` — поддерживает индексы для retrieval и фильтрации. +- `RagDocumentUpserter`: батчевый writer многослойных `RagDocument`. 
+ Методы: `replace` — полностью заменяет документы сессии; `apply_changes` — применяет upsert/delete по измененным путям. +- `DocsIndexingPipeline`: pipeline индексации документации. + Методы: `supports` — определяет, относится ли файл к docs; `index_file` — строит документы слоев `D1-D4` для одного файла. +- `CodeIndexingPipeline`: pipeline индексации Python-кода. + Методы: `supports` — определяет, относится ли файл к code; `index_file` — строит документы слоев `C0-C3` для одного файла. +- `RagQueryRouter`: выбирает retrieval mode и активные слои. + Методы: `resolve_mode` — определяет `docs` или `code`; `layers_for_mode` — возвращает набор слоев для retrieval. +- `GigaChatEmbedder`: адаптер embeddings-модели. + Методы: `embed` — возвращает embeddings для списка текстов. + +## 4. Сиквенс-диаграммы API и выполнения + +### Индексация snapshot через текущий `rag_session` facade +Назначение: создать/обновить `rag_session` и построить многослойный индекс по переданным файлам проекта. +```mermaid +sequenceDiagram + participant Router as RagModule.APIRouter + participant Sessions as RagSessionStore + participant Indexing as IndexingOrchestrator + participant Rag as RagService + participant Docs as DocsIndexingPipeline + participant Code as CodeIndexingPipeline + participant Repo as RagRepository + + Router->>Sessions: create(project_id) + Sessions-->>Router: rag_session_id + Router->>Indexing: enqueue_snapshot(rag_session_id, files) + Indexing->>Rag: index_snapshot(rag_session_id, files) + loop for each file + Rag->>Docs: supports/index_file + Rag->>Code: supports/index_file + Rag->>Repo: cache_documents(...) + end + Rag->>Repo: replace_documents(...) + Indexing-->>Router: index_job_id,status +``` + +### Retrieval для agent/chat +Назначение: вернуть релевантный контекст из нужного семейства слоев. 
+```mermaid +sequenceDiagram + participant Agent as GraphAgentRuntime + participant Rag as RagService + participant Router as RagQueryRouter + participant Repo as RagRepository + + Agent->>Rag: retrieve(rag_session_id, query) + Rag->>Router: resolve_mode(query) + Router-->>Rag: docs|code + layers + Rag->>Repo: retrieve(query_embedding, query_text, layers) + Repo-->>Rag: ranked items + Rag-->>Agent: items +``` + +## 5. Слои, фиксируемые в RAG + +### 5.1. Слои DOCS + +#### `D1_MODULE_CATALOG` +Назначение: каталог модулей документации и граф связей между ними. + +Основные атрибуты: +- `module_id` +- `type` +- `domain` +- `title` +- `status` +- `version` +- `tags` +- `owners` +- `links` +- `calls_api` +- `called_by` +- `uses_logic` +- `used_by` +- `reads_db` +- `writes_db` +- `integrates_with` +- `emits_events` +- `consumes_events` +- `source_path` +- `summary_text` + +#### `D2_FACT_INDEX` +Назначение: атомарные факты `subject-predicate-object` с evidence. + +Основные атрибуты: +- `fact_id` +- `subject_id` +- `predicate` +- `object` +- `object_ref` +- `source_path` +- `anchor` +- `line_start` +- `line_end` +- `confidence` +- `tags` + +#### `D3_SECTION_INDEX` +Назначение: семантические секции документации, нарезанные по заголовкам. + +Основные атрибуты: +- `chunk_id` +- `module_id` +- `section_path` +- `section_title` +- `content` +- `source_path` +- `order` +- `tags` +- `domain` +- `type` +- `embedding` + +#### `D4_POLICY_INDEX` +Назначение: глобальные правила и конвенции проекта. + +Основные атрибуты: +- `policy_id` +- `applies_to` +- `rules` +- `default_behaviors` +- `source_path` + +### 5.2. Слои CODE + +#### `C0_SOURCE_CHUNKS` +Назначение: сырой код как источник истины для цитирования и evidence. + +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `path` +- `span` +- `title` +- `text` +- `module_or_unit` +- `chunk_type` +- `symbol_id` +- `hash` + +#### `C1_SYMBOL_CATALOG` +Назначение: каталог символов кода и их деклараций. 
+ +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `symbol_id` +- `qname` +- `kind` +- `decl.path` +- `decl.start_line` +- `decl.end_line` +- `text` +- `visibility` +- `signature` +- `decorators_or_annotations` +- `docstring_or_javadoc` +- `parent_symbol_id` +- `package_or_module` +- `is_entry_candidate` +- `lang_payload` + +#### `C2_DEPENDENCY_GRAPH` +Назначение: связи между сущностями кода. + +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `edge_id` +- `edge_type` +- `src_symbol_id` +- `dst_symbol_id` +- `dst_ref` +- `evidence.path` +- `evidence.start_line` +- `evidence.end_line` +- `text` +- `resolution` +- `callsite_kind` +- `lang_payload` + +#### `C3_ENTRYPOINTS` +Назначение: точки входа приложения и их обработчики. + +Основные атрибуты: +- `lang` +- `repo_id` +- `commit_sha` +- `entry_id` +- `entry_type` +- `framework` +- `route_or_command` +- `handler_symbol_id` +- `evidence.path` +- `evidence.start_line` +- `evidence.end_line` +- `text` +- `http.methods` +- `http.auth` +- `request_model` +- `response_model` +- `cli.args_schema` +- `task.queue` +- `task.cron` +- `tags` +- `lang_payload` + +#### `C4_PUBLIC_API` +Назначение: публичная поверхность API/экспортируемых символов. + +Основные атрибуты: +- `api_id` +- `symbol_id` +- `stability` +- `source_of_truth` +- `versioning_tags` +- `lang_payload` + +#### `C5_BEHAVIOR_SUMMARIES` +Назначение: поведенческие summary с обязательными evidence links. + +Основные атрибуты: +- `target_type` +- `target_id` +- `text` +- `claims` +- `evidence_links` +- `confidence` +- `generated_by` +- `generated_at` + +#### `C6_RUNTIME_TRACES` +Назначение: runtime/trace слой для связи кода и реального исполнения. + +Основные атрибуты: +- `env` +- `trace_id` +- `span_id` +- `symbol_id` +- `entry_id` +- `text` +- `timings` +- `service` +- `host` +- `labels` + +## 6. Правила retrieval +- По умолчанию retrieval идет в `DOCS`. 
+- `CODE` используется только для явных вопросов по реализации, устройству кода, endpoint'ам, handler'ам и документации “из кода”. +- Для `DOCS` приоритет слоев: `D1 -> D2 -> D3 -> D4`. +- Для `CODE` приоритет слоев: `C3 -> C1 -> C2 -> C0`. + +## 7. Текущий статус реализации +- В первой итерации реализованы `DOCS D1-D4`. +- В первой итерации реализованы `CODE C0-C3`. +- `C4-C6` зафиксированы в контракте и зарезервированы под следующие этапы. +- Текущие `rag_session` и `rag_repo` работают как facade/adapter поверх нового пакета `rag`. diff --git a/app/modules/rag/__init__.py b/app/modules/rag/__init__.py new file mode 100644 index 0000000..153b59a --- /dev/null +++ b/app/modules/rag/__init__.py @@ -0,0 +1,25 @@ +from app.modules.rag.contracts import ( + DocKind, + EvidenceLink, + EvidenceType, + RagDocument, + RagLayer, + RagSource, + RagSpan, + RetrievalItem, + RetrievalMode, + RetrievalQuery, +) + +__all__ = [ + "DocKind", + "EvidenceLink", + "EvidenceType", + "RagDocument", + "RagLayer", + "RagSource", + "RagSpan", + "RetrievalItem", + "RetrievalMode", + "RetrievalQuery", +] diff --git a/app/modules/rag/__pycache__/__init__.cpython-312.pyc b/app/modules/rag/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f0862051eaf581fb176b6c1d6aca0e7ea28501a GIT binary patch literal 477 zcmYk3y-ve06ou`iZPKPihz(e}cBv(1#8d`EL8AN&UMwrQstj=wIcY>jo&vEV7M_JS z@P?S!5Db-=xEAH-4xi5Pv97P}r@^2{u$~7G>W&if;>=HZEAg?5;+7;Np()XnQ|)n2 z``p(p-qLN})*ar_0S|P@L*3>e*nD$A>CQY=6I231l%$ zRd)5~&YA*jG?R1Ox6qJPHbgU(4H(-+b($|s3^s7GkeMCKpfU<-nI2T2|JkEF0lW5_ zo-DvL$D_X8ltnStd9p~M92+?w$9Yy68CRw8aYWN{XghQq0*BC{>)_tXL=Js~4TO+s zD#RoCKubO^*m()2WHNLJ6~Y+7|*;4(CG+CbEoUK7cmK z8ML~9o|h+;fE52F9vA9u>it7FYC)R49sflKdPwJbYQ<>hhYlV14k8DEgNcLK!OX$J z!t-?BJm7Vm$x3u3G@0hBEU#3ndEPde*lyHpp4z=)?*0D`s7^K;C8LOOz8h%n-*Rc| z$SD3cMf%2~%_43G6EeojlCev4Wm9zhn!IX2H-wjPK@Op3wXWWP-G}uL2``pH7d@z@ zRkO669GZr>#>##=VvA(!1+iX`ofm|$LKQ7zSv_1wk2jnr`i>oFe>>FNweZu&7_ZUk 
aXLPzo^b4Y2(W5mw`4z;m_b${2mhuPYyT_#f literal 0 HcmV?d00001 diff --git a/app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/documents.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b54baa40ff5ba978c66f51a1bb8839ebabcf199 GIT binary patch literal 4241 zcmb7HO>7&-6`ox#|9@?fltfuJWh;g$CnT*pMqN2U4{{vafew+41}u!lnzN#oS(57P z%CTe_P~o(Y+WLCXy4p6&?y67qBsX#BHXq3z%0bCSC(VJZvD5_K6+a)P! zL-%J6Z{Oei?7Vs3y!m4==p)en9Q;K2nTL?SpwMnki{Z9E2WFX2LOGR`xE#kJ&#QdN zk#m%UoKSM+oTkp9x=QYx+vEk+Q}X7#9N~#9Q0Hf^2b?t?Pc9JZ{*X}51K!d+us$T` z>+0(TU*Fe!{atDwiv|q$`!OmU0*_Nux?$H@sBRrMxO> zn&F&MWR-%eOPiKPUq1s};QS4R%H_QLj#7UA3MibKTG6$9Yvtz0VF}#!X<(L#OmZA0 zIsPi_o9v+cInYuEaDvI9&S`QU>iQ6Nan7Cc8s2xM!enKZU6>CA95d8Q&NbjJr$ zav?cUDa(d0Un!LoU4+#c!i=O(=UZFbSq3bH+wL=$p?7Qxq#c4uvcXMka7+M?4bBiW zS)J0*FS#RSu4*#VGLkCa%#6!7#2Hp8%6VNAZ;WOnB}tibLoO$!nVBS5^0KC7&`^-e zdIpr4QiaZ{vX)^|A(O9^btXaI46Wp6OW>lVXXXsIUJ-H2=MQv#Hr-N%VGY{>AdBS7 zaO^()@xrEOU>jsIvB4WIChN1TYzWvplOdv?c~VAd26cmDH9cEQ)*L zEsCrc2HPO4A3iLO#NQ@fU^F!UfIJOG?!42CBcQm~yj0)E-811j#UxB$8K<;6Q8$$?mou zLlFs1tqlNyZ5oKJ`jY3$1 zw3o)<8=4EG`V2qp@OUc$7#?oJsY-J)Gfy61jFs+AUC~w95On#b4)_wL@{}~I>f#h6 zP=(F0L)g(%l647zYW|?AX!>{eqS7@O7QjiHm=Co=+CE8w28WA(geP137%rZdc)M(_Abw1KA>8Juqll?(YcPs=_3c-K79(V9H#v=mOAY@3chT00 z5t{+S-2^8*|pyU~B{~TPtGf*%}on*S-v4owoR*-0cg!5C?2+fh&00 ziFKRT5hBdCz!woDY>jpbfOelEB8IKCOMwLu;0hT5Vvcr9Ahg&eb+Fa>+ zMa84-ZQTS4op;XaQ?Gq9b%33Mek_Z`L@op^Lnv01GE1Qnk(-Sm2_nH;h+$A!8i?U2 z$T~ZTq8mjQlmQhg8{TR8CRGZurl%Y%h87RznfSguXE>Pjfd=V9lF6irwMI}bYqLxi zt^Aws--8M1eI{IxwgY65H2vXwV?P>e_Vg}~RmV01hnqdIB@b%P-aWhNPd57#E2r1a zuAXfSWNHJM%|!Mg_i*@e>QSmOI$j$c-#m4Gv-ho~x0|5@_onYoKaE6}eN|r*KHjSL zC*hTMk@Z*ojmWWD(fC zOOHy8GZ$)SF8sm2dG$N>Gw-3RIZ@xU1Kwb0+u;xUmK-noNblgvmM{58NJ zz+$c&UbJ_x%6}9Uz}Lj=E}75bFiRJG5icoQD?<_7-TS4ZV4Y!Wtx=LTHHa2|766@p z9-M?VbLP4~ASr$#6|@q6H^*)1Z6@`rFo>DxECZb3Gn3e?y;=zkwHNyam;gty0?2eyi&{TO5o%UkFnn723HkZsKHLRHM_AOtmUTpRbEZ?f$YV?lOdPkax!)qs3Pc{ 
z(cxNjcz>ra`wp#KS-ZA+t=@OCdFu7YiARY)#m=oq@6%(t8d zT52`bNTjzC>CdCh^ysgYPn8v6&AaMt^o?xwjr{gXvwv{y@ao~M{@2z=pAUp0zGo!l z_5JG~Q4)Vc!&LeGVDES+p5hsniDrq!@N}iZ_PR5PcI;u6Ll!*4zQNlFV`m|}iS;I; zW6-RSokz}Apq@Y-5{#3k0D%J4AG&j?8Hm+|Skq*&`2Ql7SU>$FmW6_q)`jdB$yXlI zC&{tJ^PB#`x-f>N?D$d|N?G{N#kZC&)rCRWbI^^{g@NYqiN%YX;i0;4Vq55O4!jr% zJAGD}jB~>o#mCFkEIe%BA;a*_F#MgTsH~Z9G=@)c^S=%Ew4_a|%6025Itov&bnD5L zW>O*Dd2*%Y)`OKHmRXOSW5}7my?-lqJm;)2-ei}-1))_d19|4)IPNQQ^lv0vBiXM= z;;&@vxr^}r`_U&{{GYswJM@eI*>(`V_Zf=M5AIr>Xa&-;0{1+!Yt3?S3V@i_=R9!c Gru-X>f&q5GAD*^=(^nY}K)mC@q45C>q6fQ}j@@kVr-rERg~!6)}Ng%V;--T8UIi z$*67$kpGZB&`Z&u(u;Z%7U-eqskf#+7|5x!q-EH$OMs8Vx5MGQnc4jrnG^0H$ghk#35aq*CofJ`x5RIR^mX~`A#<$&rI5Iz1krzP z7U37+oX^MmZ!6t^`a#7R(4Q(QJ;Nj4`=0j30X{pZIQ=s5`)AZGJEKt#dl6CJW?Fwja-o#m*;l zw7vL1o};ZxJ6hb^pQD{c>Ek)N7dbw7Hb=Ygjez;z}Dr2U*K_z95w+M;$38jSJi*W*)oSm96nkC_YQIZAS+v#sQ|Cl z$dUWpo!}m<+)i-0A*TshVoBBO9rsm?OV9g(Nj`PR5HcV3gPtxYZ{=*Y zHH9Ui*EPd9W|FB@)t1Ikt6?dtB4kLz=TL$>*aldNSS!uyi~QE?`87G*@oqU~SowcB z{Ct^8a+2k=vp%72k9MXb%F?{(I2buDlY;5+r8i)Rl2gjE&=VZM1j2#FQvBr=jFc{? zR;8xHaktu4ok`L0!H}lbHI{|GEl(kq;xBx=uBtzTtbGm-;0|^HK>fzdNolr@oAK%2)o^;K$GA;3>dA oaZwci5I($+vf`6>0>H&yOf0?=04_3!TL2ep3Gx2F0zjC^e_5WXF#rGn literal 0 HcmV?d00001 diff --git a/app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc b/app/modules/rag/contracts/__pycache__/evidence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cc701311499d60fb4ee09c0346f76c4415085c3 GIT binary patch literal 948 zcmYjQO=}ZD7@pakB%3b-wOS$7rry>=7rY6@Q^gNT4r(vCjN{ICy>;zg*3uqY@V#9K=Qp(o$j-Neq}+2?uRnRnig*-!O)1;P4Je;+)>2>mo- zx(XS=lXW1+h$D_e)Wsc)4Yxw8Yj^A}=@5)8BnY=Zk`stwmW3W7UN}I!cw%Lilk9^! 
zg<;8cejDZDq{^Zw)=UR+q+Ba0GQ|U5CuPnw^FyZ8bJ$c+VXT#(2S`75Ch+7XkYglJ z2XoZ1HXub|b88i>DYH`sDe?l?i@7TB5-2BE#Z;AZRZ4eOQdQ&C1JZGZ#H}Z_2ZMl% z$QSEDw3FCM%Y^7*Pb6i{^q79NTL~gPwSbpNO~U}Zj)-{1 z2sA+WdEaKmL4kO3BP~wPe0ylx-)GRgwu>c!9nc}7M7XKUzShMHZ$k;GJQj*K-kKQD zo{YDJuN58K^;pp2G8l-c#d^IKczmIhXB=B1(jFMSZp{0kP@ZI4o*zeAGC1pry!w^i zFe&MnnlbIpU+&3DZf!soss=DZUmMqs_`AK&PICepnE^|o`!Y(1Iq#Mehti031*J)a z((Gp7HA-LiS(tkolwNuzlu8p^)(u=SU@nwb0X`r(Z}^-6X9`6i>K?#b^u1Ewe>`r? z?YGAZ&C%N7om0{r-&`I&IjWqJ<#BUq^yqN=lq~)C{vo)qm_;=K+iuY$y4i;!AZcly zVN#tvR-w{?Nb}=l2{a9xC3WGfW;NcIx4>cMrCfkB8)N(p-TsB#^CGgUNAq9s^}kjb RFP^27wk!?p43{=fJHA6S%#g7u5NbY%p^3$L!k;i z_U1p}L8%A-50A~E4hV{f^w!dY;K}!Po1$27AisU@{oc;|+~=wTO*m;7j5N##Pz~g4&Vn_QtW1Jxm;-hfHOSQA?)$=^sz!l%z!eYVyo@Vl zx0D-2HUyp()OzLGupVCEN<_R%<9i)0dfJjoXj}0`rR`SA_!c%>TT!C46GyE`K{1CI zZ_-Yzf+h`BDtb%BIFjliy!lAGG3{|7b)`+!lb|hlGg{=bzElFX`l4D-6Rtg$hQYHa zVYQO>f*_$S9t64+1Q4(jqdXJ@&pR~UV-5wuA1w|7F$_70EHN7+ixJchBa8yPBfEg) z7xf=VNX4fyU-Yl@E@YEF<)M;6_mWSeDicwcCso>RSAhw+ls*!Vd7^wU`gu0e7xb|o zril_zEog&&U2OM+hcOXc_{pmPugI5DWo3RdgTtfCH&$m>>Yv6>zh(XLndO_S^BeZe zr^&M)?)N9JEZ<$bykTG2JUO|1YxQa0o)jkr_Z_?`3qb#OQ*e9iY}zO}ndHLu6hQO# zG=~*I%XXsY*2=oFzXkJ(w?q-*=mORZz0fv}5~h!X)sBi(%YT|J&qJ*kH-FC?8`M_U}}F zd9(QvTodQO8UI7%9Kg0^7{+HZ{ezs}afw-8JK8tKewwaP-6jCL7BP=+qq;NtCp(4g MRL1TAJC!B-8%g>*W&i*H literal 0 HcmV?d00001 diff --git a/app/modules/rag/contracts/documents.py b/app/modules/rag/contracts/documents.py new file mode 100644 index 0000000..e53fa4f --- /dev/null +++ b/app/modules/rag/contracts/documents.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from hashlib import sha256 + +from app.modules.rag.contracts.evidence import EvidenceLink + + +@dataclass(slots=True) +class RagSource: + repo_id: str + commit_sha: str | None + path: str + + def to_dict(self) -> dict: + return asdict(self) + + +@dataclass(slots=True) +class RagSpan: + start_line: int | None = None + end_line: int | None = None + + def to_dict(self) -> dict: + 
return asdict(self) + + +@dataclass(slots=True) +class RagDocument: + layer: str + source: RagSource + title: str + text: str + metadata: dict = field(default_factory=dict) + links: list[EvidenceLink] = field(default_factory=list) + span: RagSpan | None = None + doc_id: str | None = None + lang: str | None = None + embedding: list[float] | None = None + + def ensure_doc_id(self) -> str: + if self.doc_id: + return self.doc_id + span_key = "" + if self.span is not None: + span_key = f":{self.span.start_line}:{self.span.end_line}" + raw = "|".join( + [ + self.layer, + self.lang or "", + self.source.repo_id, + self.source.commit_sha or "", + self.source.path, + self.metadata.get("symbol_id", "") or self.metadata.get("module_id", ""), + self.title, + span_key, + ] + ) + self.doc_id = sha256(raw.encode("utf-8")).hexdigest() + return self.doc_id + + def to_record(self) -> dict: + return { + "doc_id": self.ensure_doc_id(), + "layer": self.layer, + "lang": self.lang, + "repo_id": self.source.repo_id, + "commit_sha": self.source.commit_sha, + "path": self.source.path, + "title": self.title, + "text": self.text, + "metadata": dict(self.metadata), + "links": [link.to_dict() for link in self.links], + "span_start": self.span.start_line if self.span else None, + "span_end": self.span.end_line if self.span else None, + "embedding": self.embedding or [], + } diff --git a/app/modules/rag/contracts/enums.py b/app/modules/rag/contracts/enums.py new file mode 100644 index 0000000..72502c6 --- /dev/null +++ b/app/modules/rag/contracts/enums.py @@ -0,0 +1,35 @@ +from __future__ import annotations + + +class RagLayer: + DOCS_MODULE_CATALOG = "D1_MODULE_CATALOG" + DOCS_FACT_INDEX = "D2_FACT_INDEX" + DOCS_SECTION_INDEX = "D3_SECTION_INDEX" + DOCS_POLICY_INDEX = "D4_POLICY_INDEX" + CODE_SOURCE_CHUNKS = "C0_SOURCE_CHUNKS" + CODE_SYMBOL_CATALOG = "C1_SYMBOL_CATALOG" + CODE_DEPENDENCY_GRAPH = "C2_DEPENDENCY_GRAPH" + CODE_ENTRYPOINTS = "C3_ENTRYPOINTS" + CODE_PUBLIC_API = "C4_PUBLIC_API" + 
CODE_BEHAVIOR_SUMMARIES = "C5_BEHAVIOR_SUMMARIES" + CODE_RUNTIME_TRACES = "C6_RUNTIME_TRACES" + + +class RetrievalMode: + DOCS = "docs" + CODE = "code" + + +class DocKind: + SPEC = "spec" + RUNBOOK = "runbook" + README = "readme" + MISC = "misc" + + +class EvidenceType: + CODE_SPAN = "code_span" + SYMBOL = "symbol" + EDGE = "edge" + DOC_SECTION = "doc_section" + DOC_FACT = "doc_fact" diff --git a/app/modules/rag/contracts/evidence.py b/app/modules/rag/contracts/evidence.py new file mode 100644 index 0000000..0b1e4d0 --- /dev/null +++ b/app/modules/rag/contracts/evidence.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass + + +@dataclass(slots=True) +class EvidenceLink: + type: str + target_id: str + path: str | None = None + start_line: int | None = None + end_line: int | None = None + note: str | None = None + + def to_dict(self) -> dict: + return asdict(self) diff --git a/app/modules/rag/contracts/retrieval.py b/app/modules/rag/contracts/retrieval.py new file mode 100644 index 0000000..97c2fe6 --- /dev/null +++ b/app/modules/rag/contracts/retrieval.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(slots=True) +class RetrievalQuery: + text: str + mode: str + limit: int = 5 + layers: list[str] = field(default_factory=list) + path_prefixes: list[str] = field(default_factory=list) + doc_kind: str | None = None + + +@dataclass(slots=True) +class RetrievalItem: + content: str + path: str + layer: str + title: str + score: float | None = None + metadata: dict | None = None diff --git a/app/modules/rag/indexing/code/__pycache__/file_filter.cpython-312.pyc b/app/modules/rag/indexing/code/__pycache__/file_filter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54cd85a5a5bbdc064c557446c9ce9dca299ef4c4 GIT binary patch literal 1221 zcmaJ=&ube;6n?WmWI1c6p;nYoH$+YYVIgRZa|i_EQtG6K7D+S>6hXRO?~bH(*1O8g 
ztRfkQ7;0z=J@nAlJqFrKF)cl|r~U!G#Bso3NKU2D+)7nwqEp^zB|G%cfqnDMzWLt! z=DnHUrl*epj{ec_!XF91?`km~Mg~ZK9>F#Q5ReGgNSzRcwMbjj>pB4q-hn{hg233( zGDh9#nRf1}TG&KdUK}UF6JZi_Tu#?gMr#QVyERX={EU&=5uNm3$MrT)sFMKdS_>0@ zOX_-H1lpE?!FAh|$7(&%O5!VFMDY`pElnV4V3eDo7?|Y_jXTl^LN25+#O6vGMuD_k zx83tSzeQbFPQ^(;-L)i0Bg*|z_MsGYbRB^7zeMmUY7tnRY;2I75x%W$Xsa)?Mmtu` z@!rFEX9NRs<01efJxwUR!*LfH1PhqQX!-HK!+9CT+;QG1xC*vWS=Y<$-WC5q7*bF& zTu`zvA^3{C3k0Qa!1Y%FZj%l2nYMZu_W*U^YqE_!{k&j64}Q>Z5t~%7QAgd&AHMzG zwdKoh?XCAexN7TCE6+=#?J=w=r%{8ebfwbfpy^!WlyS$4XxCY$9kTWCAP^FlsTR@8evzOQF4nise5rxnyLic*4d2OfdvsQIq7?G5C)#0 z!>W8-DzSp{PQ9#Z{nY=8om?&Kq@uy4)oTC@As+^#jZXT5PR zePEq`G;{LtOmVMR{kiw+onP)eELQhss`p>1-p^P27y>G`!Cu2z>Rd3HV^pLpmc~kw zMEH90t{Z!6>J1!mM-M=~2iLusdQs+K$B@lb!M+D}N?~;%mKz%dxGL%v$m1VvR{k*z zvqrL=%2&ZO7`%~UVob!;WswuASW&n#EN&L-EdD=?Kgcg;w_%B$Lx;ND`~;f6bVA4z TIP(Y0{hOO0HS#woJ0!jU+6pv) literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/__pycache__/pipeline.cpython-312.pyc b/app/modules/rag/indexing/code/__pycache__/pipeline.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00bf0df85cbe37b80f135701c656c0a7db9bd927 GIT binary patch literal 4449 zcmb6cTWcHX^&4q)u`ZTuE0V0WV{M#WTggUpO;!yh&CQq4O=?1%(yhA7RMCuODKi@N zn^B033=7SJ0t?&F2lp`#eaMnL_%S~r&=(8E$P5+)`cT@ZvR)|cQ_uNEGg5p>+`;~I z=G?yXojJGf@4=vtz_UO2w^GGV$UpGKdUzAy_O}7pBq9+xnJ8S2;}CYp4#k;sDtwM- z?@rmJxN~j>^Rh?r=DZAc$peZn=VP#2_A7y0fWaO)s0`)?8SIrqN;ns0@PIs|4CjU! 
z?2|{7NG`%)zkEuG=AsM^$T20Ji!(SVk1B~=g299GN6J`kjKLv!TsfUP%@GH=NyPAD zA`ZQ9m?z3jtPQ6|{)I0o&hY0e6;;pcCAFfZ9ESgHzBsQIYKm0RjRAOBR%^5%0b^)R z6{UO9BYkeQR(T*%K*cPSC2^xxl0`-uSrCiTVRR7D3y*Y~FX$?&7+t97bginED*9pW z(Z#%`U$2(>cqCowO2!uDy!PNoQsOZ7u2d{(uoG-Kc5kgJ(K)$PI9fV1CNhFfvM3QreE|Gi8 z=bWNjbigk!dPFDuy4LuV*NC8-e^n8sN2N;fPN^!%rHWMG`qa7sb`rwvuL5?H=r-*| zm#Iz!8>hQ0$v!Gbn~y$VL48zPLJuB6^;kH4$b$N0ZGHNwBdC50r;l1tpRB!`K7Ir> zWa0FWZD`VhEG_ychcHtIF-7N#u6Mvs5sRvyw;)RwP@@)ZjeO57rD7C~Fg(Hvixi3< z!te?OGeszR#s~@p_~a&MfW%xt|HlR z!;#Uh15ep57d=AVS?8?=wBgb1l2P|qu9zoZ&l2(#=-cpa4Ak9ayKwYrA!XLEdl-bu zcA4nm>#q7h-TTsE*=y~>k#8^Z_^B{Q4m@SMeDw%mb+*;} zy*K=IKb-CYoGftcx$MHy({jwY0+t^w%iv5IcBXhw)`E_)&R6%_)&=Xqu62XQT9>f& z^?r}BZtz&^#(J_ARGzS)-Y@vh|I%q`=~@;#*0N+z(t?h$EM)0C;5YX&N%2c5r{SSe zRTWC2;VY<$0_90t%~Na^P$T`Uq}z#!Si{q;+NmLm5l_+mhKn&815yQMD4N$L!wq#& zs)!UDoZ(_^fTBN*0P6vQSu*KIi1NUEg|iVwXHjf=^fWH$C&MSQ=|{-RGED*^6=ns3 zo>omi~Dhy47kiqLzk|;zc^ceF*hKsct zgU27k*Vjj|c_j}#W=--jjo1gJ^@$^kdXE8Hvi2q96_oX%#Iv7ot-t2q4E?&{-aT`+ z!S6;Bo61wA9ZfZ(spn_6Jgw-ZhJSDDlV=ZFW3xXHZr~pG&!6uG!r$F`a_g^r zJASSiKi7`WG~+W{(O0KlFKowWTJgI(f#v@=k^k*G{S5XQj&5FfdZ9f$)f}FB z&Oa}1>8;_*4bN_L95FEw1dkx|rhjcd&J>3PS{NGpD2CvX+@T;tv*PPY^1n~C%7M7Ej8zM6kM zwVlYe5_eI{ZLaY}pB6l5JDzREvs({et+(P|>;!J1mfLvJhMyJoB^c-3RMKm-V(?39 zMN08zfH+|grq}R$^Y>L%PWj9Tx&as#M(aLcNY;}m!z3speU5Ynq3rsQ3qRL zr`*`sD|tl{1j8qE0~_FgApEhGmrV+tL_O2ET!rO(!i;4>M6utqWdr~X%kg8f?+CcA z?vHV<%#ZGnD`IjZ<5DV#yR6jUgs>yU&SZ>2vZcF2NE(KaG$sRPlt!_?u|zd+QhA4o zg(eKwz%5E_5p>AJv^B@EvF_4Ir6Aqd{mB}!%rWrd`^8ztr4Na+Y45^GM7xzkrId|} zzxaSmZ#F!dgk zZ0*?#44huf9J`4SPJb-#e)3*6Q|1c&E#PqA(JsRR6@}wEAa3D|@?}HZ>>bJI#hnBv`QNZfpg@t{}n;NHV@l($kX|$F% zyYzz2oO{l>f9Ia>)BR^4;3v@bf{!vG4A$Gbkp`5rb10Q zm(exbtLbV(^QUo)Y!(aJbey-nvaBofnk?IXS)R|U3pvy~W%-v2O0MY%$?}2I%JQGc z9_)hb&nK=J8Z{D1PFqYQwcB!m=CfMbH00Z32_+L#Y38=3$CN@L2A;HL7zuRDX}XyJ zW1_K~k)X<4BBQI?Vn&}!q~TbOknto5XSW(XT_{qxec}jIjH4iaO+N1k-g~R&58S&{ z3rqLjt_8d9T?Xy`+cfG%x7|YXo|(PxfcvXkP%uvoX~mbp2y%v)^MchKL^-SS%&EH3Mm 
zeS1ujy!RJEmPIq*+AYy^Rgw+1)RuC<8*Uj}$^mbrWo#)oIJfi-=D}-bx9c!_UCk0z zMrAjed1;-{^9`|_<@s8ySLL()?tEW;Ey41r>=NX_@*W}X%vrqjL#>rFpXGynu2_6_ z;7|y3&10_3Vm>6Q&~^q92%vd}56K@m%V&8kuj>8Sx6vdRcYTID_#00wevvUE46#TS zEAddX7`W?Z1-lF`Vz%FfvYy{( z{QJx~n@615!u4D}ear4Niu2d=IoavO1&U=xv6d*7EXB&9SSazZ%_)Xy3k8K5n%yx| z)Jy z=xL-x?1W!8ihBBhg?7NQwr5TQ+62Plv-yk;Oli_g!4`oJ;9L&8<9?erwA>AwFDT|s zXYW93RMTv+xgVjq6&(PIGXTxm91b6W6%0IVtko#itt~hsm%0x@`m|&2!cj3u?p;!6 zIp6rM)Ki{ZnOvQH|3WEQ7s-j?l2{8z%b}G}H9S}nYSF&(rIkz1N!S-GC3mHPRrS%# zAFn;U_UZVaFMM+0@zhT8o$cg1m7%LorBp54w;S#K$g{?;-P~lVePh+=c3k{qFR2%1t93?x zd-*q)cf<&c+dX##YqxA`kXMnaoEX@G_oy?teNYx zRcY)==UANs`@Vsh`c-!_KpqER_>v8fP)=I0lpG(eP9=uA9zGS9%6OE{Y$u?T5Mxw9+$C)|JU#@#40VsA))74khr(vxYiZoXZ#cJX1&qLj%zzdFa_pLIk<7>UY zf2|sgmoC;rBouALRfDm57@$}RI1iQSxY$Ui7jxz06g69g(w|7T?I m-y&f~Vzctz-AgaGdFR4lnG+*8T?%`?6>N literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/code_text/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/code_text/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7893715c62fc1d6cb0674cdedd7230c95476828f GIT binary patch literal 1467 zcmZ`(&2Jk;6rb_#`YW+h8=BNXgy4c;vFepTxu7aYI1!bI5~(Jay{wj@Gl6oOAxQyFF;&zd~? z89nBUWFN<_M<{ck{A6K}W%z-OXRBb15SPAW@p%e_dh_-y5cwte1fOJj+5h@d9lZJL zUcR*O0lvJ@ufE`wH+gO?+IgY?e&ab1%oE(JUibEHUiT4b%_3TF?FG1cHm{s$B>Ro} zW9}@M?rI}Fzb~@=AwLR(OKe=?<^!eK9}2Z~u(K=abR;{L8xVRdimAbeeJaX+NMes# zA+zGZ<)UOW?sja4TN2iDLh};hRnHE(A{(XLYux1?6)Fefgc`vVh`dj^O>A!K%S>{? 
zG!UXH^IPqM!~3mWt99>i|I;>;m9bk=$lHrz#385Uxd9b<8jwU|6$z<*n;(fBqkYeY ziCbpW4+k`gJ015Jic%x$b=a!hsU;;b+5Sjo--LBJFb*TiBE$CRv9Uu3R-c7E>hQ=K zyl>dplFqq7*KlB76Jg2X4KjZt*5~Qm z%BE|H{^0eOjCM1D<@`kvh_BGa%KAlheNw$MuHKneYSUM0)9Tu&@vt$itd7*0C5J$>_?zYE%G`RPjOM)3(M6^b)Ri}v+=(mN*K2dhgV@09dVC@wtvETcST zK|&|AJ8xNm?NiGVMa!CxjU|>X>&w{oQiQz)5gsA-whVMz3i*GQ2+4IM#VLB4DQm@a zkA(O2qKw#yAsIwx^5u&f5YU_x&;$aSP8e_oa~WMtS9 list[CodeChunk]: + try: + tree = ast.parse(text) + except SyntaxError: + return self._window_chunks(path, text) + chunks: list[CodeChunk] = [] + lines = text.splitlines() + for node in tree.body: + if not isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)): + continue + start = int(getattr(node, "lineno", 1)) + end = int(getattr(node, "end_lineno", start)) + body = "\n".join(lines[start - 1 : end]).strip() + if not body: + continue + chunks.append( + CodeChunk( + title=f"{path}:{getattr(node, 'name', 'block')}", + text=body, + start_line=start, + end_line=end, + chunk_type="symbol_block", + ) + ) + return chunks or self._window_chunks(path, text) + + def _window_chunks(self, path: str, text: str) -> list[CodeChunk]: + lines = text.splitlines() + chunks: list[CodeChunk] = [] + size = 80 + overlap = 15 + start = 0 + while start < len(lines): + end = min(len(lines), start + size) + body = "\n".join(lines[start:end]).strip() + if body: + chunks.append(CodeChunk(f"{path}:{start + 1}-{end}", body, start + 1, end, "window")) + if end >= len(lines): + break + start = max(0, end - overlap) + return chunks diff --git a/app/modules/rag/indexing/code/code_text/document_builder.py b/app/modules/rag/indexing/code/code_text/document_builder.py new file mode 100644 index 0000000..c42f37e --- /dev/null +++ b/app/modules/rag/indexing/code/code_text/document_builder.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.code_text.chunker import CodeChunk + + 
+class CodeTextDocumentBuilder: + def build(self, source: RagSource, chunk: CodeChunk, *, chunk_index: int) -> RagDocument: + return RagDocument( + layer=RagLayer.CODE_SOURCE_CHUNKS, + lang="python", + source=source, + title=chunk.title, + text=chunk.text, + span=RagSpan(chunk.start_line, chunk.end_line), + metadata={ + "chunk_index": chunk_index, + "chunk_type": chunk.chunk_type, + "module_or_unit": source.path.replace("/", ".").removesuffix(".py"), + "artifact_type": "CODE", + }, + ) diff --git a/app/modules/rag/indexing/code/edges/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/edges/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e8a6b1cb0a1eb5c128bde5b5b735a556352ed88 GIT binary patch literal 2090 zcma)7&2JM&6rWwMKS=_ylazo+97u#Lfj2D$^@CQW#0r&=8v$umx5~13CT5fUFth74 zu2QK74(+Mp6e5ZeBqWMJYW{@w)I%>$d$2VEsZtMBZ_q?S%Bk<|dS#lVQs?mI_ug;j zH|sa=jeqIxP7xTJJ@<`dl#oBr35OB@X>$s|GNFV@Cb6WFBq5ef*@~1RmQqrzXenyN zN--;5id%_N!sjDq(n^(55|POyp~_7{qmSf3v9zxq%fQyH8UD7KPPPt|=+YPs%FR7#fgD-7TkXt58 z^4v>g2p1*yfP`H8$S{sGa5Cgb`ZD4{&V%8_LNu2MQD>&|k_1}*U3H>}?hqOQrt9C) zfVqpz-g}h)+C!D4$Q^Pw>g^8|Vda4kcV>uU9(IiIhJnQl?-+ZZ3DJQa3&Dlx9$ya8 zJ@N_B-2$j^aCaD@hjvG4l*YbGdPhRW8w$}}Hbk8nH1_&+(%jL`nh?R-|DCbpC%o?_$=rVP?qE+>}>F88+>)bPD^|Gg- zbS^3s|A?fg^9u7&Ua9Kdb&+x*;%TN~Gm&66^-(S%qNd-e3W|!D8#HUwGdz=l6kGI! 
z;zHYsgvC6a>Yffep#R-mn&Tz%SI}UK@HiOJk3U2J+%%wK!n zxlNlx;~PWc0M|eGI`U-yp(p7>&Gh(2dc4&;*vbsH`UjV1Zp}3MM%T`*(FgWDyD@gU zkv`MvJGflDRcvIAt-ZQ-<-yFona1er>*v(h}0mqe4;`6_X+x7qaeAPtwB?o~VK%e&oKvS#)^v5>Y&th$7wKt&tm6MUoPm^TM3Z znM-=vm>}%jfh&j5obX%1EWNDP1?k zVNJ~{faBIrr=I@$uH(=Y^=#hI({*HtO_rg5hpia}WJ~?_SpJg-HJ)297stutAn#MIBC!XAOBc zl{WHp(i9XeZJ5%iYNYTK5;4h?WEuA%%cf73NAt>f7TWCIdMdu)#Uqurj*H)}gxW8@Qt94!VepEp zENuL&t8a1V?XD*;9G+Hg3s2GkpvR~;XrpZ^Too-~RT~wT!d()QE>L~J45Bt1a(0>) z0MjWbHBe}-`YUM6ks^PAA0b1e=sHMdgfy>_H2)3=bxJ4-8D~vh4Wsr3>ITVA`1kq+ zG9^w4hV3H=DRM>8vE3IQX9u&Xen0A|{$5(_7QK zs?7lz*?mLPHBd5BJckWaWXBcrn1ZfmhiL@pnoN(!mgFg&n&}V*E6Ng!D%mmyO9m^#=^;RBtGq_Na2~_<=~amx227haQDpA<*Aai z?q6SeNY_zLN7-K9usQg~2OK?y4^s70ks5ukP}?TQGsM~NtIs3oNX;)0O$DxHc1wNL3%yiog0tO|@%mw-^r^(dBE$iw0m`Wc<3>V!7m31a@Xz54;RL9aA@NrsBGi15YRj;Z$ zoZ(HlVgmqBoUZ7!@Sd;Iw;8H%mCakGg&+Bt%+s~K0zMiHe2J_66B@uL$<+9Q z0lO5rRwnBh-V_h>^$x;bb&$LRyid7+W(&hrIFw=rU0+ivx*W>TvgvZvW4=#XY0uN(kAPs3QPNKk(NfP4|rljRgt5h>|h$wCue7QYLwrTNL5PSW8eBnU*{TjkUF8^&D)yzYY=Hl8!t)o=am;zs_5VegG)xsELj!+AVHhwDC|H z8(K)CZXV5~+6Djwg6ynYme$Q5zawq>|1m%h19>_Kpkc4w)?WAPleu)=h63OaorNYHNqu$7`OhGf$Q5~tS>V3cx z;1Nc)5N#ngYwGK;O5YEF>HJD$%W`B(Ir6n>&u8K8N+43F_513ye$T?lqVem=>yt~M z@Xvn%3crtgSlTt$R|5L~o3tl59dZnX19`9I3U{)MsI@DElPtP^2KOGwjc`{@~Uwo?4XmH5lc@s~dtD#wqNhTgmrIDtofvc*wjlwpRIgy^=`QMWcE z$EQv4tBL6vhGR!gFuhmM^o&|;E4mC+e^xS>a5WVt$8Z@*iC>;b0{rdn~h1R=Wy>lmKPs}Sn`OcSw_Zr;v zp}XPe{FV~x*;($Z$PL-qs>{k{+;E$P z3p(>1F*|Idi?0ub>>vvrjlyNw6NWf_D*zUqF}Ui$~R>K`L-~HitD;+&>YoD38JTWg^^I!D?nh4J zE(9#PSV(WV@s=#sd8H!XI?r|(cIs9+x1$M;CxYwtf4+xW7@G2|`bb#c!qY*JgJ8!X z#~_A1-l%!Jq5!WDg7XXqfzSKD^E4&FUP2(}rjluI8i1}*^NU3Z?tE?t%$_3NU|R5E zbUEwS+P@?5H890B*V)$WtS@>fuefuZ;wg%jt!m^W>4x9>zJtX)us2Z<$Bp49zVU!7 znr?h#%jN%$x@`KKx3*l~l-MKNIaQ(EFctGtBt`M&MDZvnN{eDxp(rEkQi}0_BFom> z8NQgo!rf4rt`XIssMV%h&EZRz={>DZC>p$W8FUwpdjKP47?j<0DW(F}%NL3}=Cu+J z_IdVY_R#~lPGk=w)~48gaHBc~R+f&`QW6#@3f+xv$hoh*j%z|7gbTz@8++GX3ttVd z^ln@3-L`n-x966$mFE)6&m}%NUw-b*^6nEWyH73eK2_eWlzY|bV5I^5rE>4i>EPW+ 
z@BG1QgI5RVpItb+7+v(=eCp=On@5+<-s-vK{iOX*yCESdolKR|O8KZ-8XPV~M^>V_ zPFGnW6BBbL0{{@3pfQNW|ya~+sF8fxb=&}^O3;AvDM!3gUHf*yYbi1N+ z$=T%H{tfS+UoaL=+&o*}xToB|_il9E`)@8BUwrltJ-0ee%imV>3&Zj82X1RN{?O1|W&5 zcW?Ikp|xnW$aZVA9tX3tRoo_H@1@lh-!TGkCmigdg}GxEz2-hzrBF7!vc>l?EAqgv{U2o+^`R4#B-_aIAlM}SWe-B zG=f0@4Pkd53=|@b`i?iTGl-Z2aDm+QcU@85KXC2v)x)>_TOcCM?VjB|op6R)9)U7i-tlsfj70(+eWt916x?zNvDI<}SqTPxvq z7TMsMffBKv6!CIi2;pn}Ti^t?4*+X1W5##JB> z_&#i5UP1>DJc0l-G6t}xe-pqZQso8lv1&jR`>G*M++1}tAd36AYRD~~i+K`_;wtoKESd zv)U;ZHmq2%8757C0>Z}|)WYgb>?=NKQ z=cMloH{tzPI&X8`U-5qKse1%K)kXN8d)WPIorhch1p#os^BBiPt~@jM{Ot4pAlPP1 F{TqBM>o@=a literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/edges/document_builder.py b/app/modules/rag/indexing/code/edges/document_builder.py new file mode 100644 index 0000000..cc6f784 --- /dev/null +++ b/app/modules/rag/indexing/code/edges/document_builder.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.edges.extractor import PyEdge + + +class EdgeDocumentBuilder: + def build(self, source: RagSource, edge: PyEdge) -> RagDocument: + dst = edge.dst_ref or edge.dst_symbol_id or "unknown" + return RagDocument( + layer=RagLayer.CODE_DEPENDENCY_GRAPH, + lang="python", + source=source, + title=f"{edge.src_qname}:{edge.edge_type}", + text=f"{edge.src_qname} {edge.edge_type} {dst}", + span=RagSpan(edge.start_line, edge.end_line), + metadata={ + "edge_id": edge.edge_id, + "edge_type": edge.edge_type, + "src_symbol_id": edge.src_symbol_id, + "src_qname": edge.src_qname, + "dst_symbol_id": edge.dst_symbol_id, + "dst_ref": edge.dst_ref, + "resolution": edge.resolution, + "lang_payload": edge.metadata, + "artifact_type": "CODE", + }, + links=[EvidenceLink(type=EvidenceType.EDGE, target_id=edge.edge_id, path=source.path, start_line=edge.start_line, end_line=edge.end_line)], + ) diff --git 
a/app/modules/rag/indexing/code/edges/extractor.py b/app/modules/rag/indexing/code/edges/extractor.py new file mode 100644 index 0000000..1eaee75 --- /dev/null +++ b/app/modules/rag/indexing/code/edges/extractor.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import ast +from dataclasses import dataclass, field +from hashlib import sha256 + + +@dataclass(slots=True) +class PyEdge: + edge_id: str + edge_type: str + src_symbol_id: str + src_qname: str + dst_symbol_id: str | None + dst_ref: str | None + path: str + start_line: int + end_line: int + resolution: str = "partial" + metadata: dict = field(default_factory=dict) + + +class EdgeExtractor: + def extract(self, path: str, ast_tree: ast.AST | None, symbols: list) -> list[PyEdge]: + if ast_tree is None: + return [] + qname_map = {symbol.qname: symbol.symbol_id for symbol in symbols} + visitor = _EdgeVisitor(path, qname_map) + visitor.visit(ast_tree) + return visitor.edges + + +class _EdgeVisitor(ast.NodeVisitor): + def __init__(self, path: str, qname_map: dict[str, str]) -> None: + self._path = path + self._qname_map = qname_map + self._scope: list[str] = [] + self.edges: list[PyEdge] = [] + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + current = self._enter(node.name) + for base in node.bases: + self._add_edge("inherits", current, self._name(base), base) + self.generic_visit(node) + self._scope.pop() + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._visit_function(node) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._visit_function(node) + + def visit_Import(self, node: ast.Import) -> None: + current = self._current_qname() + if not current: + return + for item in node.names: + self._add_edge("imports", current, item.name, node) + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + current = self._current_qname() + if not current: + return + module = node.module or "" + for item in node.names: + self._add_edge("imports", 
current, f"{module}.{item.name}".strip("."), node) + + def _visit_function(self, node) -> None: + current = self._enter(node.name) + for inner in ast.walk(node): + if isinstance(inner, ast.Call): + self._add_edge("calls", current, self._name(inner.func), inner, {"callsite_kind": "function_call"}) + self.generic_visit(node) + self._scope.pop() + + def _enter(self, name: str) -> str: + self._scope.append(name) + return self._current_qname() or name + + def _current_qname(self) -> str | None: + if not self._scope: + return None + return ".".join(self._scope) + + def _add_edge(self, edge_type: str, src_qname: str, dst_ref: str, node, extra: dict | None = None) -> None: + if not dst_ref: + return + src_symbol_id = self._qname_map.get(src_qname, sha256(src_qname.encode("utf-8")).hexdigest()) + dst_symbol_id = self._qname_map.get(dst_ref) + edge_id = sha256(f"{self._path}|{src_qname}|{edge_type}|{dst_ref}|{getattr(node, 'lineno', 1)}".encode("utf-8")).hexdigest() + self.edges.append( + PyEdge( + edge_id=edge_id, + edge_type=edge_type, + src_symbol_id=src_symbol_id, + src_qname=src_qname, + dst_symbol_id=dst_symbol_id, + dst_ref=dst_ref, + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + resolution="resolved" if dst_symbol_id else "partial", + metadata=extra or {}, + ) + ) + + def _name(self, node) -> str: + if isinstance(node, ast.Name): + return node.id + if isinstance(node, ast.Attribute): + return f"{self._name(node.value)}.{node.attr}" + if isinstance(node, ast.Call): + return self._name(node.func) + return "" diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/document_builder.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/document_builder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4693571fc0236fe13c23de9ff228aa19d2eebf6a GIT binary patch literal 1952 
zcma)7%}*Og6rWvx;y{88jcQ7W(?Y|7;w7P{q-sI{MUf(c0zGt9WjpZ>W??_v*)hdY z4n6Qcq*B^j4ylqu|AgFoVb#N0qeQCIL)9AzNU4x2^}SuYrj$eH@SFEB@6FryG5hnt zK!L#8KmFAHD^196$aEL*xRm{6z{kWQmf{dsF%$)9)luD)k#f^U+RYdlH)~|woRM?$ zMn2Z3oIbZ;6cnP8Tf|E55G(UsjU5u>R4c3H{y;`k3fZVqT5i zvAxY6x7unjdFmdmFZi{l%RC|Tz}%rN#$_K+D}IyLn9L#BpdR=wEP8^s8oup`T5@lK zogkF`Vc;JVMhwLwhH9lOWhZTn&Hq>W1hNg9xKjVtb~Gn=dP084h=J~Dus+?Tdpk(+ z3y!UBl|WlMrULflI6!(r+#o$6GLT~Ck1dd%JthJcijw|5C(2!o80k_Qqg|@EgO8IU zRyM|ZXSx)e{ofgRk$Ij;?CQ#k?DNi-t`IG^B(sf{Sogg{r6SYwcNZ45oXkUa@|I~^ zvXD>$d$(_mQvq4*qPc1n> zxvWz?=3OSJMFj;5n7=?9b){ zGwo`k+LHD^I_SCa#4O*Iqu1ZW-Y% zW0%$=>V(wUBjlm7puWTHuKFg~KnM>b3JujoA#MkR40|f5hJ5wlLUFLLM~Z!gefV~= zq-En8a!d!GM8HQ&W;qj0?!~PHuWQ~kJ(va4lm*jFN&|GiX@1?LPNLycVD}^8)96D( z==rv@o$CJph zkL8u)hTX;i4;HH)X`Cu&bvuBWkH0pFbOv1XqCd=knuPEf{{$5Hp9pYH_EL(Xye1$1 VPBtU5`I_8*mmN|HZwWwb{5SHs(s2L) literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/fastapi_detector.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/fastapi_detector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91effe964e57e0ba1d82c5fc1aa7418c20958b85 GIT binary patch literal 2030 zcmZ`)U2GIZ9G|)UxDQ(T&0T@oUQ3{N;vG~JY)Hk}B0@wXX!NwMOJ{qx=icr)yH|S6 zUf`h*ec++QheC(}qlqOokw+f!NfKXr)})&WJ{cZ(BNSuO81+AQw;b{@$Jgi!#! 
zg@`zhh_n)@YiH!LoCx0In*?TCRMWH^)zK}}hOKPp)R9+S1r(Vu9a=70y6H^S73*`u z3f|ZW`z55I3??WO$U!XM;*3B<5||e=Vp&K?toNvDJ8u;Arm1nw(WV@Wj=|9o6SA7a zgks5IV$rf4CKXj@D#v7^8C(uG{BoBa@CgXvjWO6Qp)9HhZeScO;>m7=Dxxdq8^Vt- zv)B$`E-SgxIGX$$BxP65gPkYGN`p`dy1_h;?TJC~N+TK)=mlOSeHMyAG!DnA~IyA z-KR8H-~?Uy3i?{gcQvU#A|cFM+PN78W$xWQP_b-e&IYq6K zqiy2hO`MnuyVFpxYEgGLu{$S~oavViF)`;jMHXyuSg@cuIg8kdFbiq?i6}Z@kvek9 zMU920sanv^TJ#InUNf#(RGG301=S?1BL}mgQN=D7&R7Q56`|;JECPP1qZqoWal52} zz?n)^M@?*H5u#05RE5W5Gs&>dYLrQK(a;?hQr?+3eeB)wQ%t6|L-it))nXBhP+oJC zcMTS9I+QhY$|4YGPMalqRs$9OmT4cClS-gT6A>obnlVjz!dbAc3vP8sE7;H^YQfX~ zn&C%bG0}vBGtp6XgGsfc+q{`T^ifX||o_j4Y%#4a9W8A|*q!(^So-b(3hbx|vP$!=!5$Q0rE9x^YzsX*8Qu zvCMl4?>ut@&uvMmx&(h*_AYpG=aDzKYeDk1^=+W;aOc8NueX1_H(BjX{@S~5`OLy` zuW!!-)Y;m(81|wAOXn`0TZ;}aie7Zb(%i+lYeQe1yGEvYjJ%& zZdc>>TD;^9CA`5Gy+q1O?psfeR+FP%UmS?*LkFru2fX;6@B6>$2Z@``E$8RTPnGKH zruM9sN&zWZe2(f?!;ehnc-;nkujETSm&YX#h*C|{OTqt2m&V2X~w zbRE%`Io{8wpg)BG*ex*4Bi454{IQ2(RO+bam(iQ7sFHpym3Ls>Lu02Agqhh7Q*)jnOe+@Sbh+7I$C3 Y82^q2@1pH@(c6E?+i}kW1XE4(Ki@(V7XSbN literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/flask_detector.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/flask_detector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..229a71e1cb008b147bbe6910d96aa62ef6551822 GIT binary patch literal 1553 zcmZ`(&2Jk;6rb^a_@hbde3dH2ZR{YLO;s<70u5EgDJ|_K7X$}GSZzIHd)@V}GqX

QoOrWqr=@|B_Raf#Z|C=B zejOalAXtZ|A6W$%q2GnjUm%Nx53d8=LIe?PqXsTvEck?-Xh1o{vW zc^wgDI}rt=lG;|a~F%XK)2DzBL{@4g3I=7!_3cGI;SuN())w3h`R zj)J^}C@NusN(mwnye^ldc0!ZGH*VS{zxy)5JoRYVb6FYpGKnOo;giJAd>{3`|2`gc=Mbu)s@%%5b~0J zotN%&K7I&C3H2iw`r3F0i{vX0(br0S@ZT085qZ1!fBn6F=A>7@JE9QPSN^+>k3pY3 z84KFsliHt>+o@RFuZz*<0~u|8FC$<22x+N>Rkg_6mPc2UenmjzA7Ou0X?c|^bD>=G zyk@9GdMy>EsKDK@NSKK^uid0!y28u`z2~yKVK&NSxXdWKjfUxvaHs~EnuFGAc+37vN9C>u6d3 zm{Z1e)21u>JY6=L%&o)Ba$|W$H?0Z6tYzv7djbE@|16>&&d` zmP6=@#?+ z^zg>bAUD~~6+5}&)7F@ z3_KgX@GO7n@$BQbpCo@)yK}cXbGLS{-rgD7d4K2f?$jsUsm0FJ;_l~kZ>qAFuXgiG zo&3^Xo(Gd!P|yNxCYXMwJAJJ)eJ#l4LD`+0?M%)F`Aa`s{O;n5q4e0~-`aF`|scKEk{J!vh&BEvO9H znRraXWX*o5EciJf`x3_ZH&l3z#-F3xf2*T->;M6YBL4yNIh4@= literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/entrypoints/__pycache__/registry.cpython-312.pyc b/app/modules/rag/indexing/code/entrypoints/__pycache__/registry.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e580fa57757607e775fddda9d372f8983faa555 GIT binary patch literal 1677 zcmaJ>&2Jk;6rb6h-Pn#3Cv84WMNE<+7Em2NPEc`5XpxGMpuHGX!{QmI8`isKW*y?l zl`17jK?1=o?Wq?IMadrke*?z^$z2Jl5(m^s1e%)E zx{Qi-NJ1He<#cylR_=P)3xwSAg7tDO6tbM8Dajyc<+=x1Y+q`&RR%*qu;J710(?y# zjZf};*qu1P^HH~OZl}_nnLlFl-Pz(1D|T~ZJ0GMfGm{;Z?872#y}AkNe@)`hLI()(jy~ptN@m z$Uf=N4&f+IMjNWW0=JjrqVEIio@wdVq520RjXp{xg;C)pdVv>lzP`5@Yqfh;aKV=5GS_k zA{tm;BpW);3=@U>mvE%1%B2eK`u_IC|4G)V*(^BF?*Kr$7MJ&|?%aj@g?oi3gl6BT zdsn;p$?vaybM5Zj{e^oA55&(`?k*h7ul|-_J2BDVQDJH?pX9gVFtvywG9L;!wv~)p zDu`Jn45VYHN5ET2&Y@mI_Bz1!1e$+(8tP3r^bBe-Fg1s;gzzdt5g{pLEDZJH0L}x^ zd4OHgGpM!vx6Q2S)DUg(C4l~~w;&|we}z{Wj}D(&sTBiB+V@G*C63Q-_AGNb?cn#+ z3K(NG>9+tN50w5+7XKu#{ACkk^oOZqI`h=X(l<^BK+hz`>PiP!f7=LePX8$G8&7UojW^FcGOg7aS4ND7^wG5MaFT1muo!P#b zO|r`bJmio=4)tOvMXexMiH8Lda`Y&8$(lpPp`N4%Z!xI|Ir+Vr-KIwI!F=!g|GxRY z-+Mm|45SgP<0rl{J{1xAjSIa6GGF-k3east5Wyy@;S$E2kD0NWP!d93FvXfwk}!&) zYlw&&h)6rJFc_8M&3G~KI~R&L=+i9Ab~V?qEeFK7Q_*H#dFm|y>1%%@@bZN>QddcnU1^Zc%~=e6Zjvs 
zK)a2~XkGAP^XSHR2(62rSnaw#zO7*=hQSlOJ_j$!ld6#6?Aqxn*5d?w@fFM@5({^{ zHAV#B5QCa6OvLqsCsd`g@d&wzp3Y76I1k@}PI`%uiq&vZJzKPg7U>T9EKwc!r-e%- zzSI5t-d^P$IJ0`SlZ2P>Bm` z=^#z0ZZi##;{=jvuR=L-tzHKY#;3;kr3g}|+2YjFZ30zP=o&G~)N$D`_v3hll0}D_ zON{p@NJK%9)^KUf0q}<<1*u0T28r-MN*EuL!+SX1T_>LMfkT<2XeM1#=IM%BXLc13 z?x-s>ie`)xW~@+a97-Jri%uO!;f69aN&%%(vq{6Gj>5FEVpxQ(8CF^0l`CN@!<*+Q zVgJ>z{VM6+Ai3TQ;*roQ{NGiRBPR}D=NBL!BY)!JrsSW?9ihS0(B>O{u5gf(J30C1 z+-!T|2xSI_Zl(Ou!WYVIWqan!^ON^x?>*DT?PB}Z5n<20T?sRkZ-Dsv4wKH5v)<{i=Gip_!3|O+cz|5G%mO@|^IOAtW3S4wp92 z!`O(FjeNLbp_t$qml|*~WM}+|3KA8~shGxMgxMKDf!sxb3@DI$D1eJdsQefTWJb$S zI%`IV509UQgr_B)B(Fr7=9mly{>V5_0XY;f#=oM82Wa#Gdh@S%7H5wTkTCLZc}TUM literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/code/entrypoints/document_builder.py b/app/modules/rag/indexing/code/entrypoints/document_builder.py new file mode 100644 index 0000000..0315cfe --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/document_builder.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from app.modules.rag.contracts import EvidenceLink, EvidenceType, RagDocument, RagLayer, RagSource, RagSpan +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class EntrypointDocumentBuilder: + def build(self, source: RagSource, entrypoint: Entrypoint) -> RagDocument: + return RagDocument( + layer=RagLayer.CODE_ENTRYPOINTS, + lang="python", + source=source, + title=entrypoint.route_or_command, + text=f"{entrypoint.framework} {entrypoint.entry_type} {entrypoint.route_or_command}", + span=RagSpan(entrypoint.start_line, entrypoint.end_line), + metadata={ + "entry_id": entrypoint.entry_id, + "entry_type": entrypoint.entry_type, + "framework": entrypoint.framework, + "route_or_command": entrypoint.route_or_command, + "handler_symbol_id": entrypoint.handler_symbol_id, + "lang_payload": entrypoint.metadata, + "artifact_type": "CODE", + }, + links=[EvidenceLink(type=EvidenceType.CODE_SPAN, target_id=entrypoint.entry_id, path=source.path, 
start_line=entrypoint.start_line, end_line=entrypoint.end_line)], + ) diff --git a/app/modules/rag/indexing/code/entrypoints/fastapi_detector.py b/app/modules/rag/indexing/code/entrypoints/fastapi_detector.py new file mode 100644 index 0000000..d048f9a --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/fastapi_detector.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from hashlib import sha256 + +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class FastApiEntrypointDetector: + _METHODS = {"get", "post", "put", "patch", "delete"} + + def detect(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for symbol in symbols: + decorators = symbol.decorators or [] + for decorator in decorators: + name = decorator.lower() + tail = name.split(".")[-1] + if tail not in self._METHODS and ".route" not in name: + continue + route = decorator.split("(")[-1].rstrip(")") if "(" in decorator else decorator + items.append( + Entrypoint( + entry_id=sha256(f"{path}|fastapi|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(), + entry_type="http", + framework="fastapi", + route_or_command=route, + handler_symbol_id=symbol.symbol_id, + path=path, + start_line=symbol.start_line, + end_line=symbol.end_line, + metadata={"methods": [tail.upper()] if tail in self._METHODS else []}, + ) + ) + return items diff --git a/app/modules/rag/indexing/code/entrypoints/flask_detector.py b/app/modules/rag/indexing/code/entrypoints/flask_detector.py new file mode 100644 index 0000000..25c0c27 --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/flask_detector.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from hashlib import sha256 + +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class FlaskEntrypointDetector: + def detect(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for symbol in symbols: + for decorator in 
symbol.decorators or []: + lowered = decorator.lower() + if ".route" not in lowered: + continue + items.append( + Entrypoint( + entry_id=sha256(f"{path}|flask|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(), + entry_type="http", + framework="flask", + route_or_command=decorator, + handler_symbol_id=symbol.symbol_id, + path=path, + start_line=symbol.start_line, + end_line=symbol.end_line, + ) + ) + return items diff --git a/app/modules/rag/indexing/code/entrypoints/registry.py b/app/modules/rag/indexing/code/entrypoints/registry.py new file mode 100644 index 0000000..d49f6f8 --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/registry.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(slots=True) +class Entrypoint: + entry_id: str + entry_type: str + framework: str + route_or_command: str + handler_symbol_id: str + path: str + start_line: int + end_line: int + metadata: dict = field(default_factory=dict) + + +class EntrypointDetectorRegistry: + def __init__(self, detectors: list) -> None: + self._detectors = detectors + + def detect_all(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for detector in self._detectors: + items.extend(detector.detect(path=path, symbols=symbols)) + return items diff --git a/app/modules/rag/indexing/code/entrypoints/typer_click_detector.py b/app/modules/rag/indexing/code/entrypoints/typer_click_detector.py new file mode 100644 index 0000000..139f948 --- /dev/null +++ b/app/modules/rag/indexing/code/entrypoints/typer_click_detector.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from hashlib import sha256 + +from app.modules.rag.indexing.code.entrypoints.registry import Entrypoint + + +class TyperClickEntrypointDetector: + def detect(self, *, path: str, symbols: list) -> list[Entrypoint]: + items: list[Entrypoint] = [] + for symbol in symbols: + for decorator in symbol.decorators or []: + lowered = 
decorator.lower() + if ".command" not in lowered and ".callback" not in lowered: + continue + framework = "typer" if "typer" in lowered else "click" + items.append( + Entrypoint( + entry_id=sha256(f"{path}|{framework}|{symbol.symbol_id}|{decorator}".encode("utf-8")).hexdigest(), + entry_type="cli", + framework=framework, + route_or_command=decorator, + handler_symbol_id=symbol.symbol_id, + path=path, + start_line=symbol.start_line, + end_line=symbol.end_line, + ) + ) + return items diff --git a/app/modules/rag/indexing/code/file_filter.py b/app/modules/rag/indexing/code/file_filter.py new file mode 100644 index 0000000..3c497e6 --- /dev/null +++ b/app/modules/rag/indexing/code/file_filter.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from pathlib import PurePosixPath + + +class PythonFileFilter: + _EXCLUDE_PARTS = {"venv", ".venv", "__pycache__", "node_modules", ".git", "dist", "build"} + + def should_index(self, path: str) -> bool: + candidate = PurePosixPath(path) + if candidate.suffix.lower() != ".py": + return False + return not any(part in self._EXCLUDE_PARTS for part in candidate.parts) diff --git a/app/modules/rag/indexing/code/pipeline.py b/app/modules/rag/indexing/code/pipeline.py new file mode 100644 index 0000000..fe91b01 --- /dev/null +++ b/app/modules/rag/indexing/code/pipeline.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument, RagSource +from app.modules.rag.indexing.code.code_text.chunker import CodeTextChunker +from app.modules.rag.indexing.code.code_text.document_builder import CodeTextDocumentBuilder +from app.modules.rag.indexing.code.edges.document_builder import EdgeDocumentBuilder +from app.modules.rag.indexing.code.edges.extractor import EdgeExtractor +from app.modules.rag.indexing.code.entrypoints.document_builder import EntrypointDocumentBuilder +from app.modules.rag.indexing.code.entrypoints.fastapi_detector import FastApiEntrypointDetector +from 
class CodeIndexingPipeline:
    """Turns one Python file into code-text, symbol, edge and entrypoint documents."""

    def __init__(self) -> None:
        self._filter = PythonFileFilter()
        self._chunker = CodeTextChunker()
        self._code_builder = CodeTextDocumentBuilder()
        self._parser = PythonAstParser()
        self._symbols = SymbolExtractor()
        self._symbol_builder = SymbolDocumentBuilder()
        self._edges = EdgeExtractor()
        self._edge_builder = EdgeDocumentBuilder()
        # Detector order determines the order of entrypoint documents.
        detectors = [FastApiEntrypointDetector(), FlaskEntrypointDetector(), TyperClickEntrypointDetector()]
        self._entrypoints = EntrypointDetectorRegistry(detectors)
        self._entrypoint_builder = EntrypointDocumentBuilder()

    def supports(self, path: str) -> bool:
        """Return whether the file filter accepts *path* for indexing."""
        return self._filter.should_index(path)

    def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]:
        """Build every document for one file.

        The result lists code-text chunks first, then symbols, then edges, then
        entrypoints, each group in the order its extractor produced it.
        """
        origin = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path)
        documents: list[RagDocument] = [
            self._code_builder.build(origin, piece, chunk_index=position)
            for position, piece in enumerate(self._chunker.chunk(path, content))
        ]
        # NOTE(review): the parse result is forwarded unchecked to both extractors;
        # confirm EdgeExtractor copes with a missing tree the way SymbolExtractor does.
        module_tree = self._parser.parse_module(content)
        found_symbols = self._symbols.extract(path, content, module_tree)
        documents.extend(self._symbol_builder.build(origin, item) for item in found_symbols)
        documents.extend(
            self._edge_builder.build(origin, link)
            for link in self._edges.extract(path, module_tree, found_symbols)
        )
        documents.extend(
            self._entrypoint_builder.build(origin, hit)
            for hit in self._entrypoints.detect_all(path=path, symbols=found_symbols)
        )
        return documents
z2d*5bg?m?{SE7(E~BUo|>Xj*i@Y>$~*5bR|6X)6PBBoulQQqqXRZ zHwUi`)*`!ZhOUKbO?q^!Iyzk*ovsaw)#35-@OW+SNOkXIdGBN`zW>gt+o!7W@p63p ze)oN)5}&Qc)8%;j!SR!o_#V`JsmSZ(*d-(P6!>v+1O zr!)8j^#p<&5EL|*@YW+td1Iun@qq`6c`1QR&)0JRuEH-#yb>gd1*Jy10&bI}b46LV zC3J$n%)d1Bbw2RBq5N+~ec)+g^5GJC>WX-RwhkB1C4$`OLXlUNBvJlNW>LsQvC%X{ zXd0qoKw&`Qo2{(0U9RUJ?=jD)J3R*8Q;BFNk10Yua2p1#pV4;iS%)|MQ}(XYbQb*J i?bPH|^2CiXevDrF1*Iz}{TQYG^hR;;4+KN~B>wsRQclI>WIJ*GL7OqwFKGb7vL zQm6rYQMpKAsf%4no1|8N0HI(Ps}u#|0tMcZ8EDZ0L4i$wc_S}&?LX}~ zcQ`|e9%YMMfM@QVd+z(3@0@do|5j4sq#*69`8515l@#?27L3H}8G7|^ATv!#ltf3V zVS0e3G0#Mq0S2CIlpW>#!jWNEGZ?WK`HCG@Oi>m=O9FHk} zAEP;>fEox!1B#;YgW*tA0v?-kIk0p0tNoDWm1ta5f(6}Dxo+vz6uh0LLeu~)Q3K2= zSbm6=nAhPg$+2dRGjlx2*?`Y(<^;3WfjQik5P#okY|DU6asj`aaFq<$rBWz)awUP3 z%FK3-f_6@+95^emZmh&tsR{3l^^FZ*ibpk8QDx1ckb)SNH2%l2z;H<8u7qQf#*GBj z%NiSwshU$!1F|Yc!?BPigkqAOb131VSU??>L$E&{DKr=ujjG~cAgIRWv4kTUR@8Ga zzDZfafFywtSk#<599hTD^E^4>T)k;rh~ z?4~Vgu4*!p;kJBT-;z?7>bq{7nB2U?b;%oyq1Fg-PQ&O`+?c!&ORPlS;s#jI7*6x( z;_{X;@gN7hMnWxsAH6yNnQ1D?-e88PK1$6gGesq7LxRV-Bp1nPjnN#RN!H+?Dc~sV zJ;&BZ&C`A#tCfoSR8EJLFo=`JU&V}O&+V;`*El5<9h6I&jGZop5A@ z@;jc}o*7|DXw7=+mOb7@k2fuN$)@{#0-2DE2rB!KklZ%NB}!p>?Fak%4VAFTA<(E; zqLiqQp{eJ6oLr9$8;~?2c?C%e5^}I!%xyFXinU?~a*KM(@O-PzgVolD|31_d9KKsW ze5piYKnOUgRAS-DOCE`Xr%fu8czD{Sa>)iyL8<__<^~*6r6j=9DOE`hc)H-}gr|GV z<*Sz4aPXY8K^E~XCphhp0l}e@l!)(18ls?NiV!K8ZO15iGcbWT@Bj>eQ8p3qDR@QR zf^QjlD?G@CE7-(1V*rkDEUb!RqIRtiwHplqZ zCY!is35X*^_IqI@SOOJ+7sW}Y$6#?0bNc!E>$JjvDWu04sJD>%Uh4ce6gAE!*@!LQ z%E&=$H|Pr+LFgz-plQs9V0x`7 z(AvTf!;IJ@2bBAmi{M5VC5bo2H=Yp``l4hw14&M@S&yTlf{>&(8@VJc*}9ykxf3x)p{Wo4n?vb9O)W}!bfl><%2(>wsONP1 zb2^zQNrs0<;$U_I(QrU{&X1~t+xKdG5d0NYv!N}H#lJ-r(VT`!j>QwK_YH6VJfoH9 zjYE<+D9492cVS=if{dpx;{nShNDwdOG9-AIas?8^JYSjI2@lPtn>CF`3#(|hfTBAJ zgz{*@M~L=8&BzGMMz?DywDJzpJ`_V6s5vf&u1n#e5VSx^((FSaaKXR{DFFi-lEXof z0J_gclvv|p&`3UjoLpE94QtM5JQ#=)51?R$WKuX7BD$gPWuzzZDA~iqum_0^MS5S% z@DlW(*kDHy^GPd83AJ}Tch$4uC=Rdv(HrjC6_l{!i%db9P-cN=FL zAJuoJxL-Nu9TUg08`@?r&js}2+)vU?yOx`}7n`~tY|1q4n|M3xF1vH$_K9WphDG;= 
zm9n~(md(KGoOjM87djUP7EU}YNw@iz+fFUEol2iRpJ}_0X%UxO!iz28Ov{yQ!{%(g z7YA*aZCKf`@zXu2u7$b>otaH(4y6C=^kVCor{#|Fir=%YlF}!3SkIFd%2Bc`G%N}YGhL5! zw#yps+J6w-0i697;EYK!R$!NdGsX()00Jp1n6m;g6$P3k)dVYOjlx}&0Ep-$i!cdr zWI|V-Niy9MLy;p0_(jQtBUa+9$5BnFQ%yJnIN~MS3vgu3XV( zx2VYjp!KO3E0-JqY)%Vc`yT>qCd}qHrwG_we)(0XOxSa%ChvigycbC~lGl**07=+& zsPf7fu*my?_&fxKL(vLF46R0^90Vr~$GQeCrymTken)UA zpoA2kO>>C>Ny>$-azAtjfobUah)j$W5jfq1vLo>kADb6FkVxzSHQ2OSrMlD|!V!8y zgNbHKxihN$&{R1M1niaIUKZ*Wg}RxlnQN)q^r^GUr^Llm;#WH_e%Zg!^~L_r_do1< zyz9uLofp$v153gs(m;p$CBX}&JFeR<3}GG%_0PWJsa*k@1W6y)9rTs1F#@P3%#&Iv#H|21S5BJ4u zxi1AAeb@T=K`X^IZvcpr7WIQgE5$8TL7(0K?f*iOF^ZD$bipMJsh8uDCKv*w@kFoy z-qB^1Pa#3Pm(KuM1Ns>462EsX=*t)2eWI!8Y}N?rS>P~Ku5rWNo?rH)x_;F&-?Jdk z?|;y_`0^VMYahLQC^9Vxd1-7 zJA9BO016QJKJbon5X12Rlnww&cASTZ&=Puw=mZQgZj)$B$Tcrz$L*>KuGTP$j#Sc4 zD8!y0^H#8G;Zm9Inz3YxGERP6fRRimE?=LA^l?W%`b-K*dlF5D1;pngQ`L0gz#zf3 zvtd3AM`s;Q+q&&?nqjn0I@WO7hbUtNI3;I+k#oXKO?R(gZRUNiY2=)F_b-C?KSe3O z8_;7>9Tw#Z>-UmCbe%^EdJZ=U_Z|1pltUVKL7OH$Uq~$@SeLQ(b+y|n?gd#HG!43G zW{IR;Gfb`%Os>-}?}jN5?@@OeyRm5ci(099yA0RT5Lil3?Q{KqBHW_8;DUgcIG7q2 ziHBpdgVpV^p>NMPkg8zGR>f~H+rphprTTtC~yx{h0^-5BGlgZf5t%O#M5V>UW{( z%4-K7ZcZP0FFo*n=C$)*)m~WGJj31f%z9GXwR)Gya;S*!u~HdC`59owG2 z>zV1ldv5kzs$y15z5A%S;~~WBZBJ|670wCUlLqi<@3?P+*VMQuG|n7e5;o4TA4{O8^MB0JmNfFbze-7#q}{jyUsu7&!oFmRt_P zC0rj~Oj_{3^}Pwpz=5|zMM>h<02`YFHsB=Jl@`%9z*!lGW{I^P6X#{U`Xpzp3wX$^ za~?Sh@L=;Nm{+_(^nZ+m_H!48BL^V!bNV<%6L07|Apbahn~4j^cVCr4fl1m z960`e%U+}Pt7e+0HQYEe?K|Z6pb^pFob!HzDByF)>v)F+p8vkYE5&MPqDA}bS(6<_(SeroxgT2H6PDZ9iMpX z>-Jrf(sXz#Jb7-8o@<+{P0{JfZE5$L532sD{vYZgv$AO$cvao=-3x;cE@d_y{L1s@ z%%Mqk+Ws?p+VkcF2P9ijGwqx5O*YOPgslqChEv>!@pN;0Dv#k;Rh+I8-y4;FG_2_dle|Xh1!}*zYT0H0!vYX5fJH%Qh_Sa z)~bnj(d*8`LNU#GG_E@j^g@f0F%K9i=K+Bij|R0UftSVHXN@uXz!BH&`C<5GToH~1 zqoYzt#4F(NrBM|EYL|QsdeEhr^W)cO20lzX4jLS~edVtPqNDJkn%o_8hXT>4#$g-v)lP zTFP-CPgt6!e?x8jhI0NVwe>et!|!Yq ast.AST | None: + try: + return ast.parse(text) + except SyntaxError: + return None diff --git 
class SymbolDocumentBuilder:
    """Converts an extracted symbol into a symbol-catalog document."""

    def build(self, source: RagSource, symbol: PySymbol) -> RagDocument:
        """Return the catalog document for *symbol*.

        The text is the "kind qname" header, the signature and the stripped
        docstring, one per line, with empty pieces left out.
        """
        docstring = symbol.docstring
        pieces = [f"{symbol.kind} {symbol.qname}", symbol.signature]
        if docstring:
            pieces.append(docstring.strip())
        rendered = "\n".join(filter(None, pieces))
        # "pkg/mod.py" -> "pkg.mod": slashes become dots, then the extension is dropped.
        module_name = source.path.replace("/", ".").removesuffix(".py")
        details = {
            "symbol_id": symbol.symbol_id,
            "qname": symbol.qname,
            "kind": symbol.kind,
            "signature": symbol.signature,
            "decorators_or_annotations": symbol.decorators,
            "docstring_or_javadoc": docstring,
            "parent_symbol_id": symbol.parent_symbol_id,
            "package_or_module": module_name,
            # Any decorated symbol is flagged as a possible entrypoint.
            "is_entry_candidate": bool(symbol.decorators),
            "lang_payload": symbol.lang_payload,
            "artifact_type": "CODE",
        }
        return RagDocument(
            layer=RagLayer.CODE_SYMBOL_CATALOG,
            lang="python",
            source=source,
            title=symbol.qname,
            text=rendered,
            span=RagSpan(symbol.start_line, symbol.end_line),
            metadata=details,
        )
None + lang_payload: dict = field(default_factory=dict) + + +class SymbolExtractor: + def extract(self, path: str, text: str, ast_tree: ast.AST | None) -> list[PySymbol]: + if ast_tree is None: + return [] + collector = _SymbolVisitor(path) + collector.visit(ast_tree) + return collector.symbols + + +class _SymbolVisitor(ast.NodeVisitor): + def __init__(self, path: str) -> None: + self._path = path + self._stack: list[tuple[str, str]] = [] + self.symbols: list[PySymbol] = [] + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + if self._stack: + return + module = node.module or "" + for item in node.names: + local_name = item.asname or item.name + imported_name = f"{module}.{item.name}".strip(".") + self.symbols.append( + PySymbol( + symbol_id=sha256(f"{self._path}|{local_name}|import_alias".encode("utf-8")).hexdigest(), + qname=local_name, + kind="const", + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + signature=f"{local_name} = {imported_name}", + lang_payload={"imported_from": imported_name, "import_alias": True}, + ) + ) + self.generic_visit(node) + + def visit_Import(self, node: ast.Import) -> None: + if self._stack: + return + for item in node.names: + local_name = item.asname or item.name + self.symbols.append( + PySymbol( + symbol_id=sha256(f"{self._path}|{local_name}|import".encode("utf-8")).hexdigest(), + qname=local_name, + kind="const", + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + signature=f"import {item.name}", + lang_payload={"imported_from": item.name, "import_alias": bool(item.asname)}, + ) + ) + self.generic_visit(node) + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + self._add_symbol(node, "class", {"bases": [self._expr_name(base) for base in node.bases]}) + self.generic_visit(node) + self._stack.pop() + + def visit_FunctionDef(self, node: 
ast.FunctionDef) -> None: + self._add_function(node, is_async=False) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._add_function(node, is_async=True) + + def _add_function(self, node, *, is_async: bool) -> None: + kind = "method" if self._stack and self._stack[-1][0] == "class" else "function" + self._add_symbol(node, kind, {"async": is_async}) + self.generic_visit(node) + self._stack.pop() + + def _add_symbol(self, node, kind: str, lang_payload: dict) -> None: + names = [name for _, name in self._stack] + [node.name] + qname = ".".join(names) + symbol_id = sha256(f"{self._path}|{qname}|{kind}".encode("utf-8")).hexdigest() + signature = self._signature(node) + symbol = PySymbol( + symbol_id=symbol_id, + qname=qname, + kind=kind, + path=self._path, + start_line=int(getattr(node, "lineno", 1)), + end_line=int(getattr(node, "end_lineno", getattr(node, "lineno", 1))), + signature=signature, + decorators=[self._expr_name(item) for item in getattr(node, "decorator_list", [])], + docstring=ast.get_docstring(node), + parent_symbol_id=self._stack[-1][1] if self._stack else None, + lang_payload=lang_payload, + ) + self.symbols.append(symbol) + self._stack.append((kind, qname)) + + def _signature(self, node) -> str: + if isinstance(node, ast.ClassDef): + bases = ", ".join(self._expr_name(base) for base in node.bases) + return f"{node.name}({bases})" if bases else node.name + args = [arg.arg for arg in getattr(node.args, "args", [])] + return f"{node.name}({', '.join(args)})" + + def _expr_name(self, node) -> str: + if isinstance(node, ast.Name): + return node.id + if isinstance(node, ast.Attribute): + return f"{self._expr_name(node.value)}.{node.attr}" + if isinstance(node, ast.Call): + return self._expr_name(node.func) + return ast.dump(node, include_attributes=False) diff --git a/app/modules/rag/indexing/common/document_upserter.py b/app/modules/rag/indexing/common/document_upserter.py new file mode 100644 index 0000000..7bf2dba --- 
/dev/null +++ b/app/modules/rag/indexing/common/document_upserter.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument +from app.modules.rag.persistence.repository import RagRepository + + +class RagDocumentUpserter: + def __init__(self, repository: RagRepository) -> None: + self._repository = repository + + def replace(self, rag_session_id: str, docs: list[RagDocument]) -> None: + self._repository.replace_documents(rag_session_id, docs) + + def apply_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None: + self._repository.apply_document_changes(rag_session_id, delete_paths, docs) diff --git a/app/modules/rag/indexing/common/report.py b/app/modules/rag/indexing/common/report.py new file mode 100644 index 0000000..53da748 --- /dev/null +++ b/app/modules/rag/indexing/common/report.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(slots=True) +class IndexReport: + indexed_files: int = 0 + failed_files: int = 0 + cache_hit_files: int = 0 + cache_miss_files: int = 0 + documents: int = 0 + warnings: list[str] = field(default_factory=list) + + def as_tuple(self) -> tuple[int, int, int, int]: + return ( + self.indexed_files, + self.failed_files, + self.cache_hit_files, + self.cache_miss_files, + ) diff --git a/app/modules/rag/indexing/docs/__pycache__/classifier.cpython-312.pyc b/app/modules/rag/indexing/docs/__pycache__/classifier.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..266921689beabcb71843cdb7654a27916b8644b7 GIT binary patch literal 1255 zcmZ`&&1(};5Pxq!vS~K0>sPh5($pZDqFGTq2&F~aq9WE}C_*mFy8GI0-0a5pHno)q z9(t%JEgo9%;=zmb(*MItLxq+{J&6ZzVgCUTW!_!_tlNz|9b^HnX!^7Je@&aOq8&ah!VLR59lAEa!#S;)*CEJNePzn8~g8a`4JxN~2 zmS7_BTNGCn_OBa_PF#XY=)jAPB;JlKp+ahkYqTCb48uoUyFW8^Mf8UXV+>uU>w4jDTp}|L?$@RYUtK|!c1zX;fA3u}A;iqtb5PIGo^g8J-8S+&#`&gke#a;_jpDX(X+_;V(z`aiI{f;=ingmC 
zTi2TU;M=P^`76!*mCf59^P_u5Ppyruj%{Vn?CQpj-rv;w*Kd8)&kFzO>gZN>Xjeb{ z>OoT?gh>_Kit?c05q28B`7mclS|316{O%)V+94wbvM0tu57)n4-fd~4U zwk$Cg%i7&-6`tk)E+tc>NJ^9}i*n<{Ol&GrZO4@zSGJ`DPGl*Gl){l?Y**Zsv^0O@ z*`;j>6q17vDi9!6dyot}Fardr0_7kB0g|4YLk|IZ5yci_7Y-1hD2m+VSZGn7`ra&; z`Z1LtMTfxKxBF(^zL}lxdvEyX`g(ytySwib^`f7UzvGS79HzoOz68nwk%`QuNSYhv zIMi(^TiQNqXSzM*NIOTJY1gPL?H+ZfJ)@p9Kgu(kBPFE0qh5~K$g4zl-XgMV*=Bkk zt($hoJm29>j5EAaCX>}AUCm}RxOHihQupx_F{e@2e_fT8OhOq@Ggr6nMy96}!~2#r z(U(o+(n>}*Ja980O)J#kK^xBIXhH!`b;C*my`G=UWv(X7SDVsqUbx3^fU-aoGRnzh z)F!)S?v`WJE_-AfTpcnm+u`bz9kTNt$CDrW$%5><1?w>FhavO*Ki zZ4uOg4nc5_KZhk*AUbE+^0s6pIO_>sWp26B<0hxsV0mrUeR8|k%8=#Cb=LW#e(jc@ zpXAr|qkbL#lV5k}n>kmw=Rd1ouI#a4 z@XImZH%M&XpkY&GgBT7S@#vDXX-Ul(&Jv*dvN$$v2susB#8h@dO&D&_*+UOiBv}*XF@wKBRUJBj z>QghiGC?K0y`kc(l(edAqH-OmMl*Z~pedjo>%pf^m^0%Zml>bo)KxvDz>HaBX+@W0 zNta?C!>;Mnuumwu;RH>cG8}9wuG|!KP^cSc>cN>d_6?mM7B3FXtQd~azJlY%?2|~yH5Z!7|Qk`#W z@L4*glybAPPq{8m(QFd9QxmUucS>qhrYcN6DossA!IDrktrHCsuppiAu+w~@)=8y_ zPWTAQO*J#o37@yUz}moPnK=w}xtlWMwR&RS(q0ETe4jiF zHZ6MZ`|kLb9Hr)?Yt27j36z4z)`LC8U{5J{dX9e>jx4@#zw=J#(zB(OuCeFNKz#S>4}RZjL@`stXEp zo{bj5`#<#k#80QjEwy(SBi-MlVUyVTx=jyhXnh!H zSr5dEfq1zwQVz71{cVfNqQ2^XX3gKV#Fs;Dn_gFgut8kB@SpE_7^)#l{mIwTAM)%- zQ@BxBn)(3NdqGf8By-KY~WVKxlN%k(0FT>ZAp6yL>BPl30dg(mS%hNCGYF|G^H&PT3w^7hSh?fK@7q6aU+wBE zb@Z=wy#93nu~x;QnLr=;JPdIG3vow^fg@H1WcXUl6&$PnLpxwb&q3gWF^*N6iOK<$ zV6s3$XK=&Q&W_|~0|Mvrb|xK6a!G3>46+)txPa36TBo$FGn|zWllVb*wdKk^oN+vP zTiy<7h@Ew+u&DWF*?!M6%j?xsgz?TrVlMhD+_3ya52F@CfysoOMPgox5t}1iN+yw&6@#BtZp!L}0=4}X zGJOo0ube$UV&alAAmf3G;yeux*hMr$X04_~dJ*k|Xt%>$DE>=9-$cQVL<6X?#D<^M zf^IZwf>$uXi)%G=U~lN6{Z}wtD}i{QY=%i=Wbw7d(}inGktOd+*UIop|CjzRPyF%J zA5MLF^s8%Mhrjl$M&4YHTq;H`t-kZ?QbZ~R#?}L=VjxutWaix8HXT6R0(!#{RG=yj zHvjgu#dDRM_{h&Qcm^$aK3@zx&k|z$?STSUh!t8_Awvca+&&HIF;eg@buA4q^{@K7 z*Zn7p{*xtt4*>UJWBVuXt{>_t9_smOVs&J6eMBsdh^ynNzci*-1DVZwHx{;TUSPO} zMSTtz3xU8`&~U(B(ajSRD`XfUqGq_LqUUJlNnqPQOVNCEFq=_!xamb-xbP;eUPi6z 
zg}#fo`1yB?VYLnG*^72Ide@jMIHqA+xNHxw7;z(*soaZ1=+wh1Y9t1_v5Jc-BCK`33cr7QTObb1M?>dX(0*X-(pAxpJY@bmi z5+-nQwIqmJxPGfnm2gV1QKl8@S;{n z_W3vJQQJpC?HfMS8i+r*;YTe%eC-?PV)JG{wxv~@W+)OfUhK9Gjvbzz zuBke$5|!3X9JT}NCJs!3!=?#ooPZO50GEYSO7%2O2nlY8A~AB}`<$dp*J%>sDf#;Q zp1;4ppReDK?d^U9>+|4OdX7Wr7q)3EzQ*94Az(I=IpDpjpz`Z{S^!5aG&pW$>#*sMii(p6B`{r8>6jbr$!U;*qNy~Z5x|PBxfd1&Lt-kup6BjJ2jb%jV0%1 zVw9hqj*ZclnYqNtsj1T=EzCF{UC-N zENZe$+e}N<oMXO+_va23FZde(`ut%c1^6e0+{SuJJzoEx#=%--( zuCw7>cPc%DrJlj9&|oPT|Ho#K{iL@KPbBx0l^wffJpoKpS-V+Tp{UI*01399Oh>)A zJ&od$7vVLO*KfmEI`&1~Srj?KLZd>SX!H6q^%wGajmV~w(;77c~C-q~6MK>aKA8$gR{?=b=(KRc~Uj1B={>*9FlE4-aW22(*v`%()+Y zN%RxeSIQ}7k$M@bZ7SG>dDZ~0VO@lf126%}SYmc^DltoW)(+}*TSEmoF?u!zhb$@4 zOAaziFdEJxZOtgAs;de>R)kYB0L1nJ`7jbWal-8*ZEHsBO5by!9AMfvfmGMfqptot z1Ldy68?BE5!E!MEp!n_OZ!T{I<6D9Fc4zN)psNz-D+T&0f&NmUzZ^Jp$Eh4XT7qBT z=yqq%cK5-}^Eb{{y2VnrSnhs(!(TJ=-P=DL`~FzNOwV>l*JkgH-do}q(ru~SF<9vs zDs>E%JBBxeN1<>zoO(Dsw-rv6L#cSu4VNwav%Y4okARgw0VVSnDCK>t#xB0HYr#w}C$7|oC5%L}= du-SIq{lQ_3e@3tTiu(T+Liim16EW(#`xl8{7xDlA literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/docs/__pycache__/pipeline.cpython-312.pyc b/app/modules/rag/indexing/docs/__pycache__/pipeline.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3267d8ef0e561caff14c919298a5108de458df8 GIT binary patch literal 10772 zcmb_iYit`=cAgo|@HM0;iIhY=Y|EA{i+WIs<@G~R>^PFG*p8LNlI^S+bR-E+>p|75kA32?Vd{?dQGjv&6mgz9J|LT*GLGDmO( zM+S%>IYg2ervlUvC5O5p9fZ079b|@>pnga%|EB|npmE43$C-dBXdW`laeaUdT81og z+z_w^ONL5Ff+G3|&iEn0neI_at3$R)%4Pl<6E0F>y`fMz;*I#jA+g^@N$gqgaBtW* z9^^w2$pndk@VMaPfwKIJR~Y5O6CwEBb9Fp4$_tQN0x7X4;1xyx6~CNg#~ihaSH}GT zE>C#EAK>AO@PbcaM)7l;ksE`Mo+Ef-h~x+u1C}E{q+tv!90eiGSvehq3|GR@5b8M_ z$3SS{N;y4*My`xAKxpF1p~v))8PX;Q*-5j@E;*2`lOc}3?hg&W;UD7z{t)jY3)ny| zbOb?e#AGfa+P`}_q8Q=P@*<2{vfwY2R$RoS#tQywB_h@$e+_EhBypGQcNqn&LNa)+ z$ipSzZ;AEzw2>C{m}B(#l&KJmO7gNc;IzmGu87ER{bR2EoX87eyEnjJZ|~)=dB%kB z2=9xCo@@Ksz5aSm@L%IY_1>|udidkxMX?=!48tO5hs^e1m>Uo9V!PlSZujTLu$_Ye zZ6C{xqpYILqS 
zo72u#$h59GThh+91pTMh4In3cxrKspoGN9y4nbs&h!G>0yUC4cQU6jg)Q5fMkq`>0{=Y@m{V6aZAkduqZJd?=5bHP3atBv$ePIlX&}(I`>5B#G$*GyzmQ z@^z^(n6;g1JRiy}nbcL8uP4S15H5YcOD7owek|*_OgovFIcrM`4scgk3uU}pxi_Nt7%A!TpK z*jrQf*2TfJeP6-|xpObgzLc>yrtFOik+i)P@~UbxRr^v^`<4!+tGW}G?5^DjI$Kda zcWL%grs80#;^4A=g}wiF@}e(Y!6nRDwj#;y%vwrsp1yH<-n?pQ$X4xmM3AOSWTH>a zrtes9Su@Vgl(Tc`#B$%t#pKzcwDaN@mbce+NcnGUNn+~e`#0X7zm~Q&KCrbe?n<+5 z8MZUUcBa`MEIS{t$Fj8z3$HA^Ip6(=&{>;i4B6^!cedTymZ|PcRb#EE?^j{1)fZ>< zS$lQLz9;K&&Yhk;y`cZXv3K17W!8;^$$FE$!Orw%9D7ray^H!KcKNNeQibA=i}$|7*jQ2;5q`rn5CaVoBh(|OTk z1Z9LxE)VkxDEWyvlNVQFOhgu2v5;D`K*g9437pWy;9Z3+h!;-F34i1&>?+*X5>k zHD~P=GZQl}CN0~uRomy{v+;# ze?iWXa6FXL#3b?8x(QN|n@9m?2DAps-60%=XrY|+*whX3BY%BNI1)Xw#e@_?ZFhGd z?DGc1BW|rcaw$5%`%mJ(KOi2|^es-zAG~w)7e`Yyecy<n z*dJ0Bkxp4e&|f@s08&NVO0Z~@6|)->(U$MQAXX&eD#TS&adYa%6tJ<}w6bsI*mCP? zO<&s9m$98m+0LYGuLIcG>@&w_1`|_BV~w&7kYClIyd3oFxcZt~7vzXjI^180C{*SU9E_ zGooodYWF~fBSthp$>$aW5|DA}n4v)C)#S!vd7SQJ`d$v+)c(Q(JwX%GgCN7`fU7a! z$_!lL0lCQmSHtXddkQ12nX1_k-$}mKr~8`J^RXaRdYtS#T+>owLo_77Xdq zrU#|&#a9wcc4yNLdY)hP+xt6Wo@w2*xeUAvUwS0!jmOAFxSr!F| zGqYzhj@Fc;HS64-ZD`BxYFVq>vB2CuwbszMF#fBq?2diwR-&S9gD_N7{y}FdEB~Di z>R8u7-Rn-mS^tQjOo2jCDC2aeobJVIOVO2snVz>&J#Qt4yqO_?YRI2-Ot%p15O8u-SH zP!{D5aB3Z((3^?^F;P^-myIBx=;w|+wHAQ=KL;Q*4Zvz6;fpg9YRpg=13~C(9K6S5 zOh86tdBJ4{XhfM5&4HpUFi5%p9}>EO0uVJk!Gl8R4FrUjF+&&ik11$pD?~dXJmIny zfEYrT#1w!Yjn&Hl4d4YPB_SdiuJT?mjfyhFfoL0^5Z;1fLJxQ%$^|N*$)SR3=O8=U zSRB=4O|$@f5j4faGO%TvU&u6fr<%Ld%}2g0eSKl~%I<$`{%!L=w9Yu@YG!MGT9+(+ zJ;8vt2Ab3OPrByGKRGf#u*fWW?;2CvIzSpuoXqY&yu9<1S3m9lq<^I|y}vKB|4eHC znft!<{Y6!N>94|)h{bfE<|RyxslnC zp9Yc@Cle>Y!)g!wWa{U>g}t`}4?tuub*@#`{Je91Vo{&2ZBJKr6ex@tXKTvYx_EwR zXr7!$1C(cZASnJ?qyh53@Z7jf{6hCVInoS|&xs=#-5f`j|0745 zLGL+nWYah*c@{@p>9f|4@IPD3Y?+GZLg#at*>afEGUWGr&S$C0hgh}+r`nzp2DhNl z>iCX()9_q+Y>yefXI~v^U!T*MSHPG%o;K!O^_xo|mk;d0XFPhS`T zBMa-8XYR1K*xS|=yJxXxY5dcvPp1Ah{=n7y&o+Bb0h{l=bLQ5WOx^xe-TtNX_sz+7 z{K>GmP7uea;}it_)Bu&N+rL2&+XkuM6GYiL3ao-MPhG3F#w}|JB`2nm*J8<794dLA 
zJb`ufQEyVe!_-+!y+;mUFis9);H7vBLX@aB7m=GA*P1)9YTLDCb0bM{A}L%?UY~?Y zrpQ;Yt{&$zjlsL*YZ&~Hx`e?MYE*45AU7A(n(JP*HEr44SaLF+y#8MD zz4xJ#W7Mly*GcLeHg}9VkExfb3m8n0Z(+bwehdU^LT&E4+}tFUWy@#yxzX9tkAf+- zK^@vdtG3-+HXcewrl7Joc^umtSH^ZgY41lcjJfH$BDXNCb?D%#twCKst9!e$+xOfV zy*0YHH@$t|gY74mhCdzsWOSu5ec&zupqYW@dzUcB|cI#SM#rOK64$)S>(n5XbaueXtv-j`S0U!IFL#^O$Ut$mS^Zf;&p>gRj#aW#75j*4)u{*j<0AMc59F zL=8b6?mT^6l7)wxTG68?i39_sQ0!DHG!EuZB{vq9je{K4K%;e1 zGKa!K5DwWWoX7@Qb>}(RZYbiMs0S5<%2{;*GQWd1E!!xs0j&sIOFOL`wHZfa%F&p1 zG(93pOy{UXFJ6GkIcg4pqp9kn>FQ&N zQ`yS;Ol4!LvN2uR1S+d`59M05wPx#@XHD5MXQr$nRo1ZZ>S|eAwyv`<>%?kVOSZ0Q z*7UC(2bTAI-tt+?Z`x8FeW|+E`<@xomiXaI`Zu3^3VsYye*pYAN3CzeKCN%h8w2rm zm|l@WB)UHw;$5_C8CDF%(=ar6r5z5q%*s8uY(K`!@F-jW3iYviEK^WIoFOwmVOz%d^;9=2!#Tr$0M0N>caz%Wbt_39rp&5KY{}bp{Ohz zLT@K}yU|0-wSa~m0c9xx7wKI>zBy7>EiPpNy%PAu4)8uCHYl2LZ&*mCeM4W)I5wIX z<`lV6LNT=)MmwaNOTpV+$vA%BQN}ow%CHJoDROva97IjvRzU0({usTNz>})sjf6Y* z^vexTzufqq0bf0caPa^{IMUL=%|!B}0(fKR=HT6eJNG7&-6`uX&l3a3^ltf9EEK{=Nh;~9cNgc$7lhm->xPc`%wS6d4VZm}&GG&TX zXP1^GED-~B0UB^2DLF_+4po5^l_4K|%rQZMB0w)vC`9W;3Zy6k)Hl|0VZ?{FZ)Uml zPYD!VL2uu@H?#BJ%=^9>{y7o}A$UHCep>i_5TU=3hW`kj!t7lJWfkd2XNqW+$ubPl z*&;j3WjU&IMSfPu3JhYA!Rh>6VUzL3vglo;i#L!S*knD+ruRZwk@_U?1$|#Ounc+$jZ!iC6nyO=k)v-MCgLO=t9S-pu5^6*M!mIebMg&zH2OB|W!*C77EE zqL@7(evQ73ch&hWrz2746HY|^)j1~|z40FI^t-m-fggeU=?>_k9?;nvLYC7dox8zj z`5>UPW71_)+#tbpY zYE~seeihnCYuOgi9cCiMVp8^QP@BpXi@(U_ucTO)H;v+yD_c--{_2=yiewUZkqVlK1i!UJ9s^Ai!)8Yx9`QE zTchx$z_VUYw>g`)1zWTOn?7YAJD_um5tuz-2Y(K0RSi|cHa~`z2| zC9#CAKp&I21(p@|mcm0W;N{y!6tcrptRFjr?68;ft$7~(AMtURwYdvq1gY&8o|*6q z258!hKW8;!LG<2ZM|>;f7F|;Eun&+Opn+x`ttyt{_thDlu@yUdm*3<)8|0ecQJ$+R z)#wEn$H0G8p7%6rokuBQ!j+)VE3*J*Ov>&7sf9Z{CPjitq*x{*3IgaVWX=_VhYBUb zgu=XR`_>Xp~?QoU;sy`IL(Lh?ggI?+VQ? 
zHV2w82S=Y7||;8^9MVW7ZWZ>A*)RIk|$@ zRAKt$3lBvcJ_eq}o;F~k0|^(IM_`ozX=>-HvZB=0!J6Q_l&VQY2u{yXE#e#;0injA z+1dLLg@XrbV@_h<>g>ww`dB@Ys=ezdiQ0#bx_?(4-d2a#FYc&^9ks`i)m{0(wtQe+ z+8DXF|MSR~l^yw#BgdBCzxiJ6jMF!C^IYxB@-gSok*_=7S<^SHd#CQqf7Z46;YSy@ zI^U_C0Rg?2&fPq>HnQHeHn1a)I{kySai@Ra=6ItU0%`Q1U}RV7+m`y)&aN99lb?-$ zd2B~||9-f`>FBGScVbSKXZLw{BKmxblZ zBMwD+*JnNtZC(7?R&MHRd3uinZv%0`(Dx5xsOyK!_j@W>%s4FC7o(BWKM=oog9rJ? z?%vbK#k>e+{~<`Btn_zK;6l)rm2BQeL_5}Lah7O{CbNJDODzSKq$X~FT_|n>z*-WZ zBS_I9RRciaaMiH6X$IJeX~zpzi2^-^-u{AAc|}#orSq zw8$mjPCocIJa`b2hvS@|DHlov1-LMWhr#B7CPYDl$-#q<5b-h*uMj~1N}=j98h#ZT zu1GOpngq#D>)b2x5wI-|wkh%F5quQ7nI#a{QA0%?-K(jU)Y^DGet0*2bUS{u9)GPC zxUVD}Iqr$ZT5C{L#*%5 zI0M7$KfOKU^c-*!eUE}dB>GP-09CWcp?Jn5Our$YN{i2(TclcOh5QMW|9x)RY`YrK zTTDwT#A=;Nc8m3&wY58uo_@jK)kRq1DrW=d>u4WpS0Am);;r!C!R0| zQ#?KaDFM4$Ic%Bp1-SQdMWf`3MOpzV!Q1yV=_FCpoS|c~J9Mk*deJTRcK0AS+TRwM zGLM+Fxb@-Dp%ob(_Ujr^y9AO;0@dZIFKUdP*;`l#slPp3m}$ zbkh6Y+xW*Y1G!F^&2JM&6rb6hT|30FiE%)3+Q>}ucKb zFWzc;Ym`|7>&S;V7kGSFiaZ54Kt?1GB#?l^3A+Iyt!Pqs!l;=o!@6ZM9%haDI%Tfe zh>#(WU>UPtmTxVugcRZW69IvX|b&Km(d`+(VaA$}sy7YWwBTO5H z<2k-zw9mYqk@-ZRKyZ8<$UW3Ae%vKnx&G9rUGhhI@=#>K8dOPOBy^+9`H-P)6yVk4 zOhxjD><(Z@3M4L7(N$DM9ofbm5)l8m4uP~guw8=_*j-=vj@G+$0urEAHPIOU+W?b3 z>L`J{Ix>_c2r5bfrHI}aYZ41af%2~N*S>)mvG<8Tns~Kwpg>HSgdhIfFc-%$z3uw+~qx-4_S+b-XDTF#|pmMwg^ntWBLSVO}UImIHbO&kD3KIo^4p+F(oJJ_S)C=XB z<1sKtp$dIMJsWn+7E4E(OyndaqBFvjIxiR*z* z!*%?2;eQPj?F!%gXCTl$`P1984`v_EbyF{Lr~9WSx?lA_nChziljB{bpFQ=x)jR*t zeU_bmnV*R9&ZS4ozg&5IWpC!Q{rrXQ#eOD#|MIt&d*^q~K9Zjn=bjfA_KFMp#q#f& zrG73yaFCnspPYDEIMc(u#ht?QkIH-SO_uiyOWjKcqbOI1_4c#V2N^Uo`)6*kclM`i z-zm>>i%-*w|H6-pYdtxa`eKPZnI}MD#GRcSLa&cNps7r}Y@zIUJ`?q39|~4w)Ne8` zBrA1~YJ^QhI#2{A9;Eh&>u~=%us1YCOr2-esA0gvVZ>7lIAa*Mo2DB}*qHD<3FI+i zS%F1ROiV#U#1k6DxryFLk}~%uol(lMqR?4^X9s-WY(h+M4ij?~8;d7*=e$8q^@1SGQg4|*M3h5!Hn literal 0 HcmV?d00001 diff --git a/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py b/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py new file mode 100644 index 0000000..2c218e5 --- 
/dev/null +++ b/app/modules/rag/indexing/docs/chunkers/markdown_chunker.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from app.modules.rag.indexing.docs.chunkers.text_chunker import DocTextChunker + + +@dataclass(slots=True) +class SectionChunk: + section_path: str + section_title: str + content: str + order: int + + +class MarkdownDocChunker: + def __init__(self, text_chunker: DocTextChunker | None = None) -> None: + self._fallback = text_chunker or DocTextChunker() + + def chunk(self, text: str) -> list[SectionChunk]: + lines = text.splitlines() + sections: list[SectionChunk] = [] + stack: list[tuple[int, str]] = [] + current_title = "Document" + current_lines: list[str] = [] + order = 0 + for line in lines: + heading = self._heading(line) + if heading is None: + current_lines.append(line) + continue + self._flush_section(sections, stack, current_title, current_lines, order) + order += 1 + level, title = heading + stack = [item for item in stack if item[0] < level] + stack.append((level, title)) + current_title = title + current_lines = [] + self._flush_section(sections, stack, current_title, current_lines, order) + if sections: + return sections + chunks = self._fallback.split(text) + return [ + SectionChunk(section_path="Document", section_title="Document", content=chunk, order=index) + for index, chunk in enumerate(chunks) + ] + + def _flush_section( + self, + sections: list[SectionChunk], + stack: list[tuple[int, str]], + current_title: str, + current_lines: list[str], + order: int, + ) -> None: + content = "\n".join(current_lines).strip() + if not content: + return + titles = [title for _, title in stack] or [current_title] + sections.append( + SectionChunk( + section_path=" > ".join(titles), + section_title=titles[-1], + content=content, + order=order, + ) + ) + + def _heading(self, line: str) -> tuple[int, str] | None: + stripped = line.strip() + if not stripped.startswith("#"): + return None + level = 
len(stripped) - len(stripped.lstrip("#")) + title = stripped[level:].strip() + if not title: + return None + return level, title diff --git a/app/modules/rag/indexing/docs/chunkers/text_chunker.py b/app/modules/rag/indexing/docs/chunkers/text_chunker.py new file mode 100644 index 0000000..c0df728 --- /dev/null +++ b/app/modules/rag/indexing/docs/chunkers/text_chunker.py @@ -0,0 +1,21 @@ +from __future__ import annotations + + +class DocTextChunker: + def __init__(self, max_chars: int = 4000, overlap_chars: int = 250) -> None: + self._max_chars = max_chars + self._overlap_chars = overlap_chars + + def split(self, text: str) -> list[str]: + cleaned = text.strip() + if not cleaned: + return [] + chunks: list[str] = [] + start = 0 + while start < len(cleaned): + end = min(len(cleaned), start + self._max_chars) + chunks.append(cleaned[start:end].strip()) + if end >= len(cleaned): + break + start = max(0, end - self._overlap_chars) + return [chunk for chunk in chunks if chunk] diff --git a/app/modules/rag/indexing/docs/classifier.py b/app/modules/rag/indexing/docs/classifier.py new file mode 100644 index 0000000..b7129e0 --- /dev/null +++ b/app/modules/rag/indexing/docs/classifier.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from pathlib import PurePosixPath + +from app.modules.rag.contracts import DocKind + + +class DocsClassifier: + def classify(self, path: str) -> str: + upper = PurePosixPath(path).name.upper() + lowered = path.lower() + if "runbook" in lowered or upper.startswith("RUNBOOK"): + return DocKind.RUNBOOK + if upper.startswith("README"): + return DocKind.README + if "spec" in lowered or "architecture" in lowered: + return DocKind.SPEC + return DocKind.MISC diff --git a/app/modules/rag/indexing/docs/document_builder.py b/app/modules/rag/indexing/docs/document_builder.py new file mode 100644 index 0000000..4420694 --- /dev/null +++ b/app/modules/rag/indexing/docs/document_builder.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +from 
class DocsDocumentBuilder:
    """Builds the document variants produced for documentation files."""

    def build_module_catalog(self, source: RagSource, frontmatter: dict, summary_text: str, doc_kind: str) -> RagDocument | None:
        """Return the module-catalog entry, or None unless id, type and domain are all non-blank."""
        identity = {key: str(frontmatter.get(key) or "").strip() for key in ("id", "type", "domain")}
        if not all(identity.values()):
            return None
        relations = frontmatter.get("links") or {}
        # Summary text is capped at 4000 characters in both metadata and body.
        excerpt = summary_text[:4000]
        metadata = {
            "module_id": identity["id"],
            "type": identity["type"],
            "domain": identity["domain"],
            "status": frontmatter.get("status"),
            "version": frontmatter.get("version"),
            "tags": frontmatter.get("tags") or [],
            "owners": frontmatter.get("owners") or [],
            "links": relations,
            "source_path": source.path,
            "summary_text": excerpt,
            "doc_kind": doc_kind,
        }
        # Each relation kind is also exposed as its own top-level key, defaulting to an empty list.
        for relation in (
            "calls_api", "called_by", "uses_logic", "used_by", "reads_db", "writes_db",
            "integrates_with", "emits_events", "consumes_events",
        ):
            metadata[relation] = relations.get(relation, [])
        return RagDocument(
            layer=RagLayer.DOCS_MODULE_CATALOG,
            source=source,
            title=identity["id"],
            text=excerpt or identity["id"],
            metadata=metadata,
        )

    def build_section(self, source: RagSource, chunk: SectionChunk, frontmatter: dict, doc_kind: str) -> RagDocument:
        """Return the section-index document for one chunk of a documentation file."""
        # Files without a frontmatter id are identified by their path.
        owner = str(frontmatter.get("id") or source.path)
        details = {
            "module_id": owner,
            "type": frontmatter.get("type"),
            "domain": frontmatter.get("domain"),
            "tags": frontmatter.get("tags") or [],
            "section_path": chunk.section_path,
            "section_title": chunk.section_title,
            "order": chunk.order,
            "doc_kind": doc_kind,
            "source_path": source.path,
            "artifact_type": "DOCS",
        }
        return RagDocument(
            layer=RagLayer.DOCS_SECTION_INDEX,
            source=source,
            title=f"{owner}:{chunk.section_title}",
            text=chunk.content,
            metadata=details,
        )

    def build_policy(self, source: RagSource, frontmatter: dict, chunk: SectionChunk, doc_kind: str) -> RagDocument | None:
        """Return the policy-index document, or None when the frontmatter has no non-blank id."""
        policy_id = str(frontmatter.get("id") or "").strip()
        if not policy_id:
            return None
        # Rule text is capped at 4000 characters in both metadata and body.
        rules_text = chunk.content[:4000]
        details = {
            "policy_id": policy_id,
            # An explicit "applies_to" wins; otherwise the frontmatter type is used.
            "applies_to": frontmatter.get("applies_to") or frontmatter.get("type"),
            "rules": rules_text,
            "default_behaviors": frontmatter.get("default_behaviors") or [],
            "doc_kind": doc_kind,
            "section_path": chunk.section_path,
            "source_path": source.path,
        }
        return RagDocument(
            layer=RagLayer.DOCS_POLICY_INDEX,
            source=source,
            title=policy_id,
            text=rules_text,
            metadata=details,
        )

    def build_fact(
        self,
        source: RagSource,
        *,
        subject_id: str,
        predicate: str,
        obj: str,
        object_ref: str | None,
        anchor: str,
        tags: list[str] | None = None,
    ) -> RagDocument:
        """Return the fact-index document for one (subject, predicate, object) statement.

        The fact id is the sha256 hex digest of subject, predicate, object,
        source path and anchor joined with "|". The document carries one
        evidence link pointing back at that id.
        """
        fingerprint = f"{subject_id}|{predicate}|{obj}|{source.path}|{anchor}"
        fact_id = sha256(fingerprint.encode("utf-8")).hexdigest()
        details = {
            "fact_id": fact_id,
            "subject_id": subject_id,
            "predicate": predicate,
            "object": obj,
            "object_ref": object_ref,
            "anchor": anchor,
            "tags": tags or [],
            "source_path": source.path,
        }
        evidence = EvidenceLink(type=EvidenceType.DOC_FACT, target_id=fact_id, path=source.path, note=anchor)
        return RagDocument(
            layer=RagLayer.DOCS_FACT_INDEX,
            source=source,
            title=f"{subject_id}:{predicate}",
            text=f"{subject_id} {predicate} {obj}".strip(),
            metadata=details,
            links=[evidence],
        )
"CHANGELOG", "CONTRIBUTING", "ARCHITECTURE", "SPEC", "RUNBOOK") + _EXCLUDE_PARTS = {"node_modules", ".git", "vendor", "dist", "build", "target", "__pycache__"} + + def should_index(self, path: str) -> bool: + candidate = PurePosixPath(path) + if any(part in self._EXCLUDE_PARTS for part in candidate.parts): + return False + if candidate.suffix.lower() in self._EXTENSIONS: + return True + upper_name = candidate.name.upper() + if any(upper_name.startswith(prefix) for prefix in self._NAMES): + return True + joined = "/".join(candidate.parts).lower() + return any(marker in joined for marker in ("docs/", "doc/", "documentation/")) diff --git a/app/modules/rag/indexing/docs/pipeline.py b/app/modules/rag/indexing/docs/pipeline.py new file mode 100644 index 0000000..073a742 --- /dev/null +++ b/app/modules/rag/indexing/docs/pipeline.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import re + +import yaml + +from app.modules.rag.contracts import RagDocument, RagSource +from app.modules.rag.indexing.docs.chunkers.markdown_chunker import MarkdownDocChunker +from app.modules.rag.indexing.docs.classifier import DocsClassifier +from app.modules.rag.indexing.docs.document_builder import DocsDocumentBuilder +from app.modules.rag.indexing.docs.file_filter import DocsFileFilter + + +class DocsIndexingPipeline: + def __init__(self) -> None: + self._filter = DocsFileFilter() + self._classifier = DocsClassifier() + self._chunker = MarkdownDocChunker() + self._builder = DocsDocumentBuilder() + + def supports(self, path: str) -> bool: + return self._filter.should_index(path) + + def index_file(self, *, repo_id: str, commit_sha: str | None, path: str, content: str) -> list[RagDocument]: + source = RagSource(repo_id=repo_id, commit_sha=commit_sha, path=path) + frontmatter, body = self._split_frontmatter(content) + doc_kind = self._classifier.classify(path) + sections = self._chunker.chunk(body) + summary_text = self._summary_from_sections(sections) + docs: list[RagDocument] = [] 
+ module_doc = self._builder.build_module_catalog(source, frontmatter, summary_text, doc_kind) + if module_doc is not None: + docs.append(module_doc) + for section in sections: + docs.append(self._builder.build_section(source, section, frontmatter, doc_kind)) + if str(frontmatter.get("type") or "").strip() == "policy": + for section in sections[:1]: + policy = self._builder.build_policy(source, frontmatter, section, doc_kind) + if policy is not None: + docs.append(policy) + docs.extend(self._extract_facts(source, frontmatter, sections)) + return docs + + def _split_frontmatter(self, content: str) -> tuple[dict, str]: + if not content.startswith("---\n"): + return {}, content + _, raw, body = content.split("---", 2) + payload = yaml.safe_load(raw) or {} + return payload if isinstance(payload, dict) else {}, body.strip() + + def _summary_from_sections(self, sections) -> str: + text = "\n\n".join(section.content for section in sections[:2]).strip() + return text[:4000] + + def _extract_facts(self, source: RagSource, frontmatter: dict, sections) -> list[RagDocument]: + subject_id = str(frontmatter.get("id") or source.path) + docs: list[RagDocument] = [] + links = frontmatter.get("links") or {} + for predicate, values in links.items(): + for value in values or []: + docs.append( + self._builder.build_fact( + source, + subject_id=subject_id, + predicate=predicate, + obj=str(value), + object_ref=str(value), + anchor="frontmatter.links", + ) + ) + for section in sections: + docs.extend(self._facts_from_table(source, subject_id, section.section_title, section.content)) + docs.extend(self._facts_from_lists(source, subject_id, section.section_title, section.content)) + return docs + + def _facts_from_table(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]: + lines = [line.strip() for line in content.splitlines() if line.strip()] + if len(lines) < 3 or "|" not in lines[0]: + return [] + headers = [part.strip().lower() for part in 
lines[0].strip("|").split("|")] + if not all(headers): + return [] + docs: list[RagDocument] = [] + for row in lines[2:]: + if "|" not in row: + continue + values = [part.strip() for part in row.strip("|").split("|")] + if len(values) != len(headers): + continue + payload = dict(zip(headers, values)) + docs.extend(self._facts_from_row(source, subject_id, title, payload)) + return docs + + def _facts_from_row(self, source: RagSource, subject_id: str, title: str, payload: dict) -> list[RagDocument]: + docs: list[RagDocument] = [] + name = payload.get("field") or payload.get("name") or payload.get("column") or payload.get("code") + if "request" in title.lower() or "response" in title.lower(): + if name: + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_field", obj=name, object_ref=None, anchor=title)) + if payload.get("required"): + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_required", obj=f"{name}:{payload['required']}", object_ref=None, anchor=title)) + if payload.get("type"): + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_type", obj=f"{name}:{payload['type']}", object_ref=None, anchor=title)) + if payload.get("validation"): + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="field_validation", obj=f"{name}:{payload['validation']}", object_ref=None, anchor=title)) + if "error" in title.lower(): + if payload.get("status"): + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_status", obj=payload["status"], object_ref=None, anchor=title)) + if payload.get("error") or payload.get("code"): + error_value = payload.get("error") or payload.get("code") + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="returns_error", obj=error_value, object_ref=None, anchor=title)) + if payload.get("client action"): + docs.append(self._builder.build_fact(source, 
subject_id=subject_id, predicate="client_action", obj=payload["client action"], object_ref=None, anchor=title)) + if "constraint" in title.lower() and name: + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate="has_constraint", obj=name, object_ref=None, anchor=title)) + return docs + + def _facts_from_lists(self, source: RagSource, subject_id: str, title: str, content: str) -> list[RagDocument]: + docs: list[RagDocument] = [] + for line in content.splitlines(): + item = line.strip() + if not item.startswith(("-", "*", "1.", "2.", "3.")): + continue + normalized = re.sub(r"^[-*0-9. ]+", "", item).strip() + lowered = normalized.lower() + if lowered.startswith("metric:"): + predicate = "emits_metric" + elif lowered.startswith("event:"): + predicate = "emits_analytics_event" + elif lowered.startswith("log:"): + predicate = "logs_event" + else: + predicate = "validates_rule" if "rule" in title.lower() else "client_action" + docs.append(self._builder.build_fact(source, subject_id=subject_id, predicate=predicate, obj=normalized, object_ref=None, anchor=title)) + return docs diff --git a/app/modules/rag/ingestion/session/__pycache__/local_project_reader.cpython-312.pyc b/app/modules/rag/ingestion/session/__pycache__/local_project_reader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6fc9351c0ecdfb2616adc742c921e863184d031 GIT binary patch literal 4058 zcmai1T})fa6`uQN8yhgO0cS&+HAzU~O=Hshuu+nd@Z%5?Vpw9TUANJV@x6du<4fmW z2oR&9hIUCeRfyCsTdk_H>Pnj^TQ!eJed}vq@EXh`A^gcaLg4;r!^yHB($^+-3ldU#Nw%#jvbpm5<7Fg1vs%UW{gCDk@S>i=<$+y z^CJ{H-1@hHStKfnatev^1F$J@lfMk3CMT3R5xFTTU}LU6swLI9aV?QjnV=h#iDR%5 zv&Xfxp{5O1Sz_dom_8B?h|D3&>DZ_$%giauqna|FLcUU#e>5ITl_YLi-m)uMen!^O z#|@)-Kv${W980N_&7JCmJXTy?mnY6N#}a;pCMMLhKQ=bz2T5Gj^=1?dgCor_*j!vr zZ>F)~=0ti})$!Dtbg0BTGc8&WW)XmPkGTYspn^Hzve zbX;!>cZGU-yI5sMZ#dH5-98ZM4nwc4zvFs$B-9ZZ=npaRc3-H2+4~2=?Y+G>TXE-L zaX5h%;fN=gs3dfQi6$=%jwezIvl%futQyQE%VSgVSbW61*tDjorF+)nC3J%|0zp6s 
zK#yC;6I~=3E<#)Sc{PI)voDGyE=svh8(sH>?VR#6_((T3koYvF^yN~9Bp zEKfJ=KBJ%|L{%Mcbe0qj9GMq?;Wn>sn`f0EHa+_g>@O0d6r}`wv?4ZH3L4NXH!Eh7 zrR-1^0AtB!_za(fY_hXia#{m&bZ??4i{Jbzf^`bkH+gbXYH`+@P#;Ptv$Q$#=r1D42)9YV z9-iJ`$^w4OxPB-C{vmBVtz7go?yYHQ+!#7_f!PwnX^pB7I}SA+*A&Ql%%)PRQ9U3~ z7c5B;?G!;1sHZmgFsC{>7E3GRkZPHos=Ah%P+`&}1iTIgby-PJwhuFYSTnBSk|8t* z)kdR+sxXNTr?f$42hAZY3zaf(>SR1Mu7I=(d;0j;SW#Kjp>{F5ZpeCKTBTTmSOw-@ z*-$4TmrykYwQWL`4UIWsx;&=oiAiRIY_6u2Vrs^41OIS)DytY#CzS-I_JEz9Dq7Pt zLpH0DNP%S%dZjZlVW^|#?wP|{MaR(e7h5r_i0I3-Z;y{BTe(tTJ3?4&i(e{uP-jeKfaQyJ^3YZS5(h;7QDX2%TF%7`g7b^EX?s?d=R?|>;qp@)C$Xes^ zr}jToKdXLmFL&ZfuJLNV@kX}sMy|2vwO!oj`L|%Js($4KXJ7eAP2GHDF{f_Lk$6ea zFTR-G-=0eU7^gRoM1Yvn?m`5Zj?(YK(Da+%%1_I8k)p!>=^R-8_wMUmCx!Sk# zwWqVSr*k#m0d%`PIZxa2`BhI_&fT`+Xxm+R(PtASe*)%x9$y7}UBg~@v9cyqiaqc$ z0d-;ut~H4nZb$on_4eodX1$%~0be}eizG2;?r3{rP6$V~fet-0L3b6HEue#nIfIbf z3eb}M%{VAQ&$knLPDr6}srM4b2;}rWn8$2mR2_mm>4ZG1QuD#R4S=*Cw5HGgk9#d= z=RL630u#4kukXe87B1$!C$ioXPwVsE(^*en340BPxPTq1!uSxnBBiu@P*7^*Na)=N z1WP0ozTMp$zRg5@^fSryln$WxuA)x3?N$f^sfbb&pix}B7)V!(_-w^}_`hi%XK)_k z8~|EyxgTG8cq#8Xm~|b@xehHI%^!X{3qRM}kW348Z!Es|cG^sajPa_+trN1usYc%LR_=ME7Fq#)R4LIVEp*r;lZ(=-#qT3Q837f7fR$smyF zeJQA--!BzzETMM-wqgV#FpFVma;VM3K}}165p(n9VXA;E;H1?!17CQw2FV>HcaiiW z!FZ%SNHB&e9u$S#Moia>hgXkXBxpoG0c4IaU+aokTk!l~_9m+vnGL@dIqB%Stxl?2 z5Asqw_u3&!?{GyUI5iw_;vc!8ao9Dy_bJ{EbCv&usyao?GduqMOC<)WS*-0?jj?C$ z$^3t1`m)`>4aeo;AGTJS02>$^`XwN*1diiglEZ%|jW5Z8mt_CHY=n0$)UI;%U-1H0 a|B3)v7YOfrh25_zk8|&EUlOFI-~R>Ct*0>n literal 0 HcmV?d00001 diff --git a/app/modules/rag/persistence/__pycache__/query_repository.cpython-312.pyc b/app/modules/rag/persistence/__pycache__/query_repository.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8320bca6da3b6c34f229bde724456f2ccf7522e6 GIT binary patch literal 6698 zcmcgQTWlNIb$1?oKP6I6TC0(4S){GE*Rm{2UdSXZYb{e7Nlt5dmFbu>mPzxWof+8@ zGfa)dA6&#hGXe)Sn?tCd;DTNw2lz3B7kfl9`Qs9rHKWC zz&M#A7fAS}QdF8=ptUhNWlEbD%mgCQB;rl?5pRA<>QWXgYh=js8GZ>7s)Ng9vN9(p 
zvKc7`@3bte%J6Gn5@c4$EG05RT<6oFS^ptk{S_b^NI(k&k6?L%CwU6~Xx{YLWJ9m} z;~l*DzIlPtI4qAz-pO0xo#tJ5yZ7wVL2-$zkP-B>JodCqlY@7j#xQT?O-n@HrVxr% zp?ULTDsNXzVxwY98tYZdvltQxq?i;tZykeK#Vo$Ngz^pFDV|qgxfgS{D^cA`}c~>nTC=P>Lv)YZJ zF+w2F+oPbQrL1_>k3tz#d2f|vUSkw2`$omGI^I*Zte*FlF&|%7#{5bh*jcaHSy!>! zhu?X-qK{BO;nyfUH9b+U)B~qagMJM*C_Wfz)L@h12fSH>El)g#j^M93@Rvty;#>JZ zP484__q~%L_#^oS#Uhe?8{lu`8~OHp6VE6OuttZ{2(VLW0(g`^R#T?USRZJ*p4jru zvdwU;aT1dbV`Tr=m^+Q~ElNw$wLfEEvd6%bW@8P-ZbXV?oiV%rYsgGOj5_=nbNNaTkS-$M%TpdqhvX`lH2R^A!H5hXA<@j2lQni(qn|kSlZwVJ!K# z#0CV^2X!=BL%*Qz5+MQyL`ssy5UG-V11hm9;TXy=d&&N*62zE(E+wq?M}#{NX0k~k zE=%m4p?)sG@M7YQkYTvxWd>&Af+Y3hi6tQ;_rqX+I?J!51gT%-mim`LP(qT0OkC*K zqnTI32A^$@XRzkGJ8Zwb;yF4 zW`$KQF0-mrd(p!)tGczfcs3(*iHyYNg5im|DDxMWqA@0wy(@^p@XW+?bS@kXzL(+B zLJ!m39SUJJbN$RbrW&Y8EUl$)WK&>K6~jrLVX!8HoRCx1())Dcp_;_is_5qC%Jipd z(pL*F)A#D~->gZG{a9AKOg^BKpRP%cy$1*EActS)7-pi=;LyQ=0VW!Ynhj6PPfX8DR@TsuRfTSd zjYQu8uS8?vYwYCg#5y?XHFnup*w}1?EHcPmnTcGT2C@SS zA676j#9o{T&$Cmo^k063a*o9sI?(^H^-wXjphrfYkiurpQ9 zTyz@bGWrp#oE+_o?qSLYH7&>-&&eE{l(Lx~Mq1`FtR!PR-5d*nl$X)ua*^r5GC+@BD2Jdjh|=W%*+QzVGOh0 zfxw&721mg7t(DAe>A)PEmDwn>XM%_qVR8=+%+1V3qO;6} zYX?k)Rl?vweCW$o=_8tJH3RhbuUGiMTGLZkrskPZ2*Q=&kVB=D*+fP)LrQ_YP;EkH zB`t`YET~S_D2&ClB;qNp%v>Ybz4{~=Lcing!6-hyAv@Dc4{dEQ`yT2MB}KW5B%mZ&Y$#N zO7&Ln2a8AC`}?82Q+SZZE>dYQUQ#IxkAUJ;AI+bkB2KZY)yToQz-tmTc8v5xC?M{k zQX}%zuZK4-J-h^|c#Hk$$oQ^*e8)ZhA+_TU7pS7|#CHA3f~DBr`z-!p*G|)=g86rj zx}wwfgV_7A?ZDti&Y@y^SK;FN$d2>K?;Op=mO$a{Puhc9ZM*GfzCe`g%tOaIwf@aw zbK9f%qpR!bb*k9Wy=mPd9;MdJ>(WC<(cAjc+p**A*u4CV+FIG}JpHkExMTs&k`=l9 zc%4V5HXDBy_(@=EVYg#sCouB-)K1|1N6xp3O)Ujy(d#espV;;G7EGVCbZ(yBZ3%sW z>TK@!rwZY9f3dY|GroCsyZK~csu<|rqBd8yTY3v`7u!y5g||+F8HMR0a}2CFyW?yt z9t&;`Z`EzK!u#ls^NnKH@lAR2=GMgKIT#t+aWX|`^Gj#@j=yTi)X{no!V#>I#L{oJo^reUL?*H&rLq>19XaLFL|tW?SG=|E_cZT z`hR{5g^3wr{nn=cXDvTz*&4$RXaonmXgt3?f34v7+>X4==N8h%fVb)!N)BXeDS2(K z))IwmO{Hei=l|5-zCI4pJN!lev29-$pvM92-1Z$UA?lc~WJdnL#@NHLP0Ow?xLqHD z(G#|k$L?_!s6RF$ck}n(ed+4hadqsvfP=725u0++*MwOM0ISwiIg*Bme39>9eHLiiFu>y{PB&$wHLj+cgxm*mOE 
zWZtAu(C^ZPn>U(U^_tfaB zluCzJ>3eKn+xOU@@3F?j7FZp2sug}j9IwPS3~)`J`(7ni;K=oVl=2VA{QA@>c4JM( z4{%iyC$t(7sOk?)*2;7?F@%i@Vt|vG*n>eY1~`d{eHip(FaRKA7Y8wp+rIr1EWU|f zPh$|qAc8>@gNpzpoV@nqyB^p*Hz6C|;NrK-l|invMtna+vw(&D62LuF9ma*7rm@!r z@e)b-zA7eI%vwxv|NqB?U%5vv^isdBZ@zGr{&lbE!myMbL6-4 zP~(&W5NyJwfDz%(l4pI8S!3P2eq4bro@^CD)^PnBBTzic( zP+c$G1`cT$z&-Sd$G0B-{#t?lB+&lj(Z3u0Aij0%AA&y*K6`8TMEJ#t%OA?S$ESA# zR|<4tVaM8B^!N&nSHKCk8W@%zeBd=N|2-hu-6Tq(JPH1)c-A0=`!7-<%ATT(c$TN& zR*qIExM71Y2;#FTiDe#oDpAa=m=*dCArTLnA&OV_XPfq~ ziCFbq%k_!g1+F*8E|!sj-m~oDw}*Q2q7~*p!x4Z`er~sJ&rTszeyD) zi=M_u$rqmBenHj!vG)X2Qnr&&8+cnEbv_y_d}rI*CeFeVA+H{l@R1YYZVKfbDQHQq z--MGWFX|PvHs^pjl}cn}Et2nfH7dA@T4WR^mdyxyb^R_DgG+3czLCwQR5MhfD`IBP zu94%Dz8JDrt%#+H*D=8PLDbl>(pEd_dCX7JQ)Y?_DXs8`EG?yG2>O@{&$)%xgRM%1RXs8!82TOGtT8{$l zB~4VxkL> list[RagDocument]: + with self._engine().connect() as conn: + rows = conn.execute( + text( + """ + SELECT layer, lang, path, title, content, metadata_json, links_json, span_start, span_end, + repo_id, commit_sha, embedding::text AS embedding_txt + FROM rag_chunk_cache + WHERE repo_id = :repo_id AND blob_sha = :blob_sha + ORDER BY chunk_index ASC + """ + ), + {"repo_id": repo_id, "blob_sha": blob_sha}, + ).mappings().fetchall() + docs: list[RagDocument] = [] + for row in rows: + metadata = self._loads(row.get("metadata_json")) + docs.append( + RagDocument( + layer=str(row["layer"]), + lang=row.get("lang"), + source=RagSource( + repo_id=str(row["repo_id"]), + commit_sha=row.get("commit_sha"), + path=str(row["path"]), + ), + title=str(row["title"] or row["path"]), + text=str(row["content"] or ""), + metadata=metadata, + links=[EvidenceLink(**item) for item in self._loads(row.get("links_json"), default=[])], + span=RagSpan(row.get("span_start"), row.get("span_end")), + embedding=self._parse_vector(str(row["embedding_txt"] or "")), + ) + ) + return docs + + def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None: + if not docs: + return + with 
self._engine().connect() as conn: + first = docs[0].to_record() + first_meta = first["metadata"] + conn.execute( + text( + """ + INSERT INTO rag_blob_cache ( + repo_id, blob_sha, path, artifact_type, section, doc_id, doc_version, owner, + system_component, last_modified, staleness_score, layer, lang, metadata_json + ) + VALUES ( + :repo_id, :blob_sha, :path, :artifact_type, :section, :doc_id, :doc_version, :owner, + :system_component, :last_modified, :staleness_score, :layer, :lang, :metadata_json + ) + ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET + artifact_type = EXCLUDED.artifact_type, + section = EXCLUDED.section, + doc_id = EXCLUDED.doc_id, + doc_version = EXCLUDED.doc_version, + owner = EXCLUDED.owner, + system_component = EXCLUDED.system_component, + last_modified = EXCLUDED.last_modified, + staleness_score = EXCLUDED.staleness_score, + layer = EXCLUDED.layer, + lang = EXCLUDED.lang, + metadata_json = EXCLUDED.metadata_json, + updated_at = CURRENT_TIMESTAMP + """ + ), + { + "repo_id": repo_id, + "blob_sha": blob_sha, + "path": path, + "artifact_type": first_meta.get("artifact_type"), + "section": first_meta.get("section") or first_meta.get("section_title"), + "doc_id": first_meta.get("doc_id"), + "doc_version": first_meta.get("doc_version"), + "owner": first_meta.get("owner"), + "system_component": first_meta.get("system_component"), + "last_modified": first_meta.get("last_modified"), + "staleness_score": first_meta.get("staleness_score"), + "layer": first["layer"], + "lang": first["lang"], + "metadata_json": json.dumps(first_meta, ensure_ascii=True), + }, + ) + conn.execute( + text("DELETE FROM rag_chunk_cache WHERE repo_id = :repo_id AND blob_sha = :blob_sha"), + {"repo_id": repo_id, "blob_sha": blob_sha}, + ) + for idx, doc in enumerate(docs): + row = doc.to_record() + metadata = row["metadata"] + emb = row["embedding"] or [] + emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None + conn.execute( + text( + """ + INSERT INTO 
rag_chunk_cache ( + repo_id, blob_sha, chunk_index, content, embedding, section, layer, lang, path, title, + metadata_json, links_json, span_start, span_end, commit_sha + ) + VALUES ( + :repo_id, :blob_sha, :chunk_index, :content, CAST(:embedding AS vector), :section, :layer, + :lang, :path, :title, :metadata_json, :links_json, :span_start, :span_end, :commit_sha + ) + """ + ), + { + "repo_id": repo_id, + "blob_sha": blob_sha, + "chunk_index": idx, + "content": row["text"], + "embedding": emb_str, + "section": metadata.get("section") or metadata.get("section_title"), + "layer": row["layer"], + "lang": row["lang"], + "path": row["path"], + "title": row["title"], + "metadata_json": json.dumps(metadata, ensure_ascii=True), + "links_json": json.dumps(row["links"], ensure_ascii=True), + "span_start": row["span_start"], + "span_end": row["span_end"], + "commit_sha": row["commit_sha"], + }, + ) + conn.commit() + + def record_repo_cache( + self, + *, + project_id: str, + commit_sha: str | None, + changed_files: list[str], + summary: str, + ) -> None: + docs: list[RagDocument] = [] + for idx, path in enumerate(changed_files): + docs.append( + RagDocument( + layer=RagLayer.CODE_SOURCE_CHUNKS, + lang="python" if path.endswith(".py") else None, + source=RagSource(project_id, commit_sha, path), + title=path, + text=f"repo_webhook:{path}:{summary[:300]}", + metadata={"chunk_index": idx, "artifact_type": "CODE", "section": "repo_webhook"}, + ) + ) + for doc in docs: + blob_sha = self._blob_sha(commit_sha, doc.source.path) + doc.metadata["blob_sha"] = blob_sha + self.cache_documents(project_id, doc.source.path, blob_sha, [doc]) + + def _blob_sha(self, commit_sha: str | None, path: str) -> str: + from hashlib import sha256 + + return sha256(f"{commit_sha or 'no-commit'}:{path}".encode("utf-8")).hexdigest() + + def _engine(self): + from app.modules.shared.db import get_engine + + return get_engine() + + def _loads(self, value, default=None): + if default is None: + default = {} + if 
not value: + return default + return json.loads(str(value)) + + def _parse_vector(self, value: str) -> list[float]: + text_value = value.strip() + if not text_value: + return [] + if text_value.startswith("[") and text_value.endswith("]"): + text_value = text_value[1:-1] + if not text_value: + return [] + return [float(part.strip()) for part in text_value.split(",") if part.strip()] diff --git a/app/modules/rag/persistence/document_repository.py b/app/modules/rag/persistence/document_repository.py new file mode 100644 index 0000000..b665094 --- /dev/null +++ b/app/modules/rag/persistence/document_repository.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +import json + +from sqlalchemy import text + +from app.modules.rag.contracts import RagDocument + + +class RagDocumentRepository: + def replace_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None: + conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id}) + conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id}) + self.insert_documents(conn, rag_session_id, docs) + + def apply_document_changes( + self, + conn, + rag_session_id: str, + delete_paths: list[str], + docs: list[RagDocument], + ) -> None: + if delete_paths: + conn.execute( + text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"), + {"sid": rag_session_id, "paths": delete_paths}, + ) + conn.execute( + text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"), + {"sid": rag_session_id, "paths": delete_paths}, + ) + if not docs: + return + paths = sorted({doc.source.path for doc in docs}) + conn.execute( + text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"), + {"sid": rag_session_id, "paths": paths}, + ) + conn.execute( + text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"), + {"sid": rag_session_id, 
"paths": paths}, + ) + self.insert_documents(conn, rag_session_id, docs) + + def insert_documents(self, conn, rag_session_id: str, docs: list[RagDocument]) -> None: + for doc in docs: + row = doc.to_record() + metadata = row["metadata"] + links = row["links"] + emb = row["embedding"] or [] + emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None + conn.execute( + text( + """ + INSERT INTO rag_chunks ( + rag_session_id, path, chunk_index, content, embedding, artifact_type, section, doc_id, + doc_version, owner, system_component, last_modified, staleness_score, created_at, updated_at, + rag_doc_id, layer, lang, repo_id, commit_sha, title, metadata_json, links_json, span_start, + span_end, symbol_id, qname, kind, framework, entrypoint_type, module_id, section_path, doc_kind + ) + VALUES ( + :sid, :path, :chunk_index, :content, CAST(:emb AS vector), :artifact_type, :section, :doc_id, + :doc_version, :owner, :system_component, :last_modified, :staleness_score, CURRENT_TIMESTAMP, + CURRENT_TIMESTAMP, :rag_doc_id, :layer, :lang, :repo_id, :commit_sha, :title, :metadata_json, + :links_json, :span_start, :span_end, :symbol_id, :qname, :kind, :framework, :entrypoint_type, + :module_id, :section_path, :doc_kind + ) + """ + ), + { + "sid": rag_session_id, + "path": row["path"], + "chunk_index": int(metadata.get("chunk_index", 0)), + "content": row["text"], + "emb": emb_str, + "artifact_type": metadata.get("artifact_type"), + "section": metadata.get("section") or metadata.get("section_title"), + "doc_id": metadata.get("doc_id"), + "doc_version": metadata.get("doc_version"), + "owner": metadata.get("owner"), + "system_component": metadata.get("system_component"), + "last_modified": metadata.get("last_modified"), + "staleness_score": metadata.get("staleness_score"), + "rag_doc_id": row["doc_id"], + "layer": row["layer"], + "lang": row["lang"], + "repo_id": row["repo_id"], + "commit_sha": row["commit_sha"], + "title": row["title"], + "metadata_json": 
json.dumps(metadata, ensure_ascii=True), + "links_json": json.dumps(links, ensure_ascii=True), + "span_start": row["span_start"], + "span_end": row["span_end"], + "symbol_id": metadata.get("symbol_id"), + "qname": metadata.get("qname"), + "kind": metadata.get("kind") or metadata.get("type"), + "framework": metadata.get("framework"), + "entrypoint_type": metadata.get("entry_type") or metadata.get("entrypoint_type"), + "module_id": metadata.get("module_id") or metadata.get("policy_id"), + "section_path": metadata.get("section_path"), + "doc_kind": metadata.get("doc_kind"), + }, + ) + repo_id = str(row["repo_id"] or "").strip() + blob_sha = str(metadata.get("blob_sha") or "").strip() + if repo_id and blob_sha: + conn.execute( + text( + """ + INSERT INTO rag_session_chunk_map ( + rag_session_id, repo_id, blob_sha, chunk_index, path + ) VALUES (:sid, :repo_id, :blob_sha, :chunk_index, :path) + """ + ), + { + "sid": rag_session_id, + "repo_id": repo_id, + "blob_sha": blob_sha, + "chunk_index": int(metadata.get("chunk_index", 0)), + "path": row["path"], + }, + ) diff --git a/app/modules/rag/persistence/job_repository.py b/app/modules/rag/persistence/job_repository.py new file mode 100644 index 0000000..72fba06 --- /dev/null +++ b/app/modules/rag/persistence/job_repository.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from sqlalchemy import text + +from app.modules.shared.db import get_engine + + +@dataclass +class RagJobRow: + index_job_id: str + rag_session_id: str + status: str + indexed_files: int + failed_files: int + cache_hit_files: int + cache_miss_files: int + error_code: str | None + error_desc: str | None + error_module: str | None + + +class RagJobRepository: + def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None: + with get_engine().connect() as conn: + conn.execute( + text( + """ + INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status) + VALUES (:jid, :sid, :status) + 
""" + ), + {"jid": index_job_id, "sid": rag_session_id, "status": status}, + ) + conn.commit() + + def update_job( + self, + index_job_id: str, + *, + status: str, + indexed_files: int, + failed_files: int, + cache_hit_files: int = 0, + cache_miss_files: int = 0, + error_code: str | None = None, + error_desc: str | None = None, + error_module: str | None = None, + ) -> None: + with get_engine().connect() as conn: + conn.execute( + text( + """ + UPDATE rag_index_jobs + SET status = :status, + indexed_files = :indexed, + failed_files = :failed, + cache_hit_files = :cache_hit_files, + cache_miss_files = :cache_miss_files, + error_code = :ecode, + error_desc = :edesc, + error_module = :emodule, + updated_at = CURRENT_TIMESTAMP + WHERE index_job_id = :jid + """ + ), + { + "jid": index_job_id, + "status": status, + "indexed": indexed_files, + "failed": failed_files, + "cache_hit_files": cache_hit_files, + "cache_miss_files": cache_miss_files, + "ecode": error_code, + "edesc": error_desc, + "emodule": error_module, + }, + ) + conn.commit() + + def get_job(self, index_job_id: str) -> RagJobRow | None: + with get_engine().connect() as conn: + row = conn.execute( + text( + """ + SELECT index_job_id, rag_session_id, status, indexed_files, failed_files, + cache_hit_files, cache_miss_files, error_code, error_desc, error_module + FROM rag_index_jobs + WHERE index_job_id = :jid + """ + ), + {"jid": index_job_id}, + ).mappings().fetchone() + return RagJobRow(**dict(row)) if row else None diff --git a/app/modules/rag/persistence/query_repository.py b/app/modules/rag/persistence/query_repository.py new file mode 100644 index 0000000..4e0d320 --- /dev/null +++ b/app/modules/rag/persistence/query_repository.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import json + +from sqlalchemy import text + +from app.modules.rag.retrieval.query_terms import extract_query_terms +from app.modules.shared.db import get_engine + + +class RagQueryRepository: + def retrieve( + self, + 
rag_session_id: str, + query_embedding: list[float], + *, + query_text: str = "", + limit: int = 5, + layers: list[str] | None = None, + path_prefixes: list[str] | None = None, + prefer_non_tests: bool = False, + ) -> list[dict]: + emb = "[" + ",".join(str(x) for x in query_embedding) + "]" + filters = ["rag_session_id = :sid"] + params: dict = {"sid": rag_session_id, "emb": emb, "lim": limit} + if layers: + filters.append("layer = ANY(:layers)") + params["layers"] = layers + if path_prefixes: + or_filters = [] + for idx, prefix in enumerate(path_prefixes): + key = f"path_{idx}" + params[key] = f"{prefix}%" + or_filters.append(f"path LIKE :{key}") + filters.append("(" + " OR ".join(or_filters) + ")") + term_filters = [] + terms = extract_query_terms(query_text) + for idx, term in enumerate(terms): + exact_key = f"term_exact_{idx}" + prefix_key = f"term_prefix_{idx}" + contains_key = f"term_contains_{idx}" + params[exact_key] = term + params[prefix_key] = f"{term}%" + params[contains_key] = f"%{term}%" + term_filters.append( + "CASE " + f"WHEN lower(COALESCE(qname, '')) = :{exact_key} THEN 0 " + f"WHEN lower(COALESCE(symbol_id, '')) = :{exact_key} THEN 1 " + f"WHEN lower(COALESCE(title, '')) = :{exact_key} THEN 2 " + f"WHEN lower(COALESCE(qname, '')) LIKE :{prefix_key} THEN 3 " + f"WHEN lower(COALESCE(title, '')) LIKE :{prefix_key} THEN 4 " + f"WHEN lower(COALESCE(path, '')) LIKE :{contains_key} THEN 5 " + f"WHEN lower(COALESCE(content, '')) LIKE :{contains_key} THEN 6 " + "ELSE 100 END" + ) + lexical_sql = "LEAST(" + ", ".join(term_filters) + ")" if term_filters else "100" + test_penalty_sql = ( + "CASE " + "WHEN lower(path) LIKE 'tests/%' OR lower(path) LIKE '%/tests/%' OR lower(path) LIKE 'test_%' OR lower(path) LIKE '%/test_%' " + "THEN 1 ELSE 0 END" + if prefer_non_tests + else "0" + ) + layer_rank_sql = ( + "CASE " + "WHEN layer = 'C3_ENTRYPOINTS' THEN 0 " + "WHEN layer = 'C1_SYMBOL_CATALOG' THEN 1 " + "WHEN layer = 'C2_DEPENDENCY_GRAPH' THEN 2 " + "WHEN layer 
= 'C0_SOURCE_CHUNKS' THEN 3 " + "WHEN layer = 'D1_MODULE_CATALOG' THEN 0 " + "WHEN layer = 'D2_FACT_INDEX' THEN 1 " + "WHEN layer = 'D3_SECTION_INDEX' THEN 2 " + "WHEN layer = 'D4_POLICY_INDEX' THEN 3 " + "ELSE 10 END" + ) + sql = f""" + SELECT path, content, layer, title, metadata_json, span_start, span_end, + {lexical_sql} AS lexical_rank, + {test_penalty_sql} AS test_penalty, + {layer_rank_sql} AS layer_rank, + (embedding <=> CAST(:emb AS vector)) AS distance + FROM rag_chunks + WHERE {' AND '.join(filters)} + ORDER BY lexical_rank ASC, test_penalty ASC, layer_rank ASC, embedding <=> CAST(:emb AS vector) + LIMIT :lim + """ + with get_engine().connect() as conn: + rows = conn.execute(text(sql), params).mappings().fetchall() + return [self._row_to_dict(row) for row in rows] + + def fallback_chunks(self, rag_session_id: str, *, limit: int = 5, layers: list[str] | None = None) -> list[dict]: + filters = ["rag_session_id = :sid"] + params: dict = {"sid": rag_session_id, "lim": limit} + if layers: + filters.append("layer = ANY(:layers)") + params["layers"] = layers + sql = f""" + SELECT path, content, layer, title, metadata_json, span_start, span_end + FROM rag_chunks + WHERE {' AND '.join(filters)} + ORDER BY id DESC + LIMIT :lim + """ + with get_engine().connect() as conn: + rows = conn.execute(text(sql), params).mappings().fetchall() + return [self._row_to_dict(row) for row in rows] + + def _row_to_dict(self, row) -> dict: + data = dict(row) + data["metadata"] = json.loads(str(data.pop("metadata_json") or "{}")) + return data diff --git a/app/modules/rag/persistence/repository.py b/app/modules/rag/persistence/repository.py new file mode 100644 index 0000000..a8418f5 --- /dev/null +++ b/app/modules/rag/persistence/repository.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagDocument +from app.modules.rag.persistence.cache_repository import RagCacheRepository +from app.modules.rag.persistence.document_repository 
import RagDocumentRepository +from app.modules.rag.persistence.job_repository import RagJobRepository, RagJobRow +from app.modules.rag.persistence.query_repository import RagQueryRepository +from app.modules.rag.persistence.schema_repository import RagSchemaRepository +from app.modules.rag.persistence.session_repository import RagSessionRepository +from app.modules.shared.db import get_engine + + +class RagRepository: + def __init__(self) -> None: + self._schema = RagSchemaRepository() + self._sessions = RagSessionRepository() + self._jobs = RagJobRepository() + self._documents = RagDocumentRepository() + self._cache = RagCacheRepository() + self._query = RagQueryRepository() + + def ensure_tables(self) -> None: + self._schema.ensure_tables() + + def upsert_session(self, rag_session_id: str, project_id: str) -> None: + self._sessions.upsert_session(rag_session_id, project_id) + + def session_exists(self, rag_session_id: str) -> bool: + return self._sessions.session_exists(rag_session_id) + + def get_session(self, rag_session_id: str) -> dict | None: + return self._sessions.get_session(rag_session_id) + + def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None: + self._jobs.create_job(index_job_id, rag_session_id, status) + + def update_job(self, index_job_id: str, **kwargs) -> None: + self._jobs.update_job(index_job_id, **kwargs) + + def get_job(self, index_job_id: str) -> RagJobRow | None: + return self._jobs.get_job(index_job_id) + + def replace_documents(self, rag_session_id: str, docs: list[RagDocument]) -> None: + with get_engine().connect() as conn: + self._documents.replace_documents(conn, rag_session_id, docs) + conn.commit() + + def apply_document_changes(self, rag_session_id: str, delete_paths: list[str], docs: list[RagDocument]) -> None: + with get_engine().connect() as conn: + self._documents.apply_document_changes(conn, rag_session_id, delete_paths, docs) + conn.commit() + + def get_cached_documents(self, repo_id: str, 
blob_sha: str) -> list[RagDocument]: + return self._cache.get_cached_documents(repo_id, blob_sha) + + def cache_documents(self, repo_id: str, path: str, blob_sha: str, docs: list[RagDocument]) -> None: + self._cache.cache_documents(repo_id, path, blob_sha, docs) + + def record_repo_cache(self, **kwargs) -> None: + self._cache.record_repo_cache(**kwargs) + + def retrieve( + self, + rag_session_id: str, + query_embedding: list[float], + *, + query_text: str = "", + limit: int = 5, + layers: list[str] | None = None, + prefer_non_tests: bool = False, + ) -> list[dict]: + return self._query.retrieve( + rag_session_id, + query_embedding, + query_text=query_text, + limit=limit, + layers=layers, + prefer_non_tests=prefer_non_tests, + ) + + def fallback_chunks(self, rag_session_id: str, limit: int = 5, layers: list[str] | None = None) -> list[dict]: + return self._query.fallback_chunks(rag_session_id, limit=limit, layers=layers) diff --git a/app/modules/rag/persistence/schema_repository.py b/app/modules/rag/persistence/schema_repository.py new file mode 100644 index 0000000..5648165 --- /dev/null +++ b/app/modules/rag/persistence/schema_repository.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +from sqlalchemy import text + +from app.modules.shared.db import get_engine + + +class RagSchemaRepository: + def ensure_tables(self) -> None: + engine = get_engine() + with engine.connect() as conn: + conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_sessions ( + rag_session_id VARCHAR(64) PRIMARY KEY, + project_id VARCHAR(512) NOT NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_index_jobs ( + index_job_id VARCHAR(64) PRIMARY KEY, + rag_session_id VARCHAR(64) NOT NULL, + status VARCHAR(16) NOT NULL, + indexed_files INTEGER NOT NULL DEFAULT 0, + failed_files INTEGER NOT NULL DEFAULT 0, + cache_hit_files INTEGER 
NOT NULL DEFAULT 0, + cache_miss_files INTEGER NOT NULL DEFAULT 0, + error_code VARCHAR(128) NULL, + error_desc TEXT NULL, + error_module VARCHAR(64) NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_chunks ( + id BIGSERIAL PRIMARY KEY, + rag_session_id VARCHAR(64) NOT NULL, + path TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + embedding vector NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_blob_cache ( + id BIGSERIAL PRIMARY KEY, + repo_id VARCHAR(512) NOT NULL, + blob_sha VARCHAR(128) NOT NULL, + path TEXT NOT NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path) + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_chunk_cache ( + id BIGSERIAL PRIMARY KEY, + repo_id VARCHAR(512) NOT NULL, + blob_sha VARCHAR(128) NOT NULL, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + embedding vector NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index) + ) + """ + ) + ) + conn.execute( + text( + """ + CREATE TABLE IF NOT EXISTS rag_session_chunk_map ( + id BIGSERIAL PRIMARY KEY, + rag_session_id VARCHAR(64) NOT NULL, + repo_id VARCHAR(512) NOT NULL, + blob_sha VARCHAR(128) NOT NULL, + chunk_index INTEGER NOT NULL, + path TEXT NOT NULL, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + ) + self._ensure_columns(conn) + self._ensure_indexes(conn) + conn.commit() + + def _ensure_columns(self, conn) -> None: + for statement in ( + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS 
artifact_type VARCHAR(16) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS rag_doc_id VARCHAR(128) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS repo_id VARCHAR(512) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS title TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS links_json TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_start INTEGER NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS span_end INTEGER NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS symbol_id TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS qname TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS kind TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS framework TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS entrypoint_type TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS module_id TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section_path TEXT NULL", + "ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_kind TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS artifact_type 
VARCHAR(16) NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS section TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_id TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS doc_version TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS owner TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS system_component TEXT NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL", + "ALTER TABLE rag_blob_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS section TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS layer VARCHAR(64) NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS lang VARCHAR(32) NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS path TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS title TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS metadata_json TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS links_json TEXT NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_start INTEGER NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS span_end INTEGER NULL", + "ALTER TABLE rag_chunk_cache ADD COLUMN IF NOT EXISTS commit_sha VARCHAR(128) NULL", + "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0", + ): + conn.execute(text(statement)) + + def _ensure_indexes(self, conn) -> None: + for statement in ( + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks 
(rag_session_id)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer ON rag_chunks (rag_session_id, layer)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_layer_path ON rag_chunks (rag_session_id, layer, path)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_qname ON rag_chunks (qname)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_symbol_id ON rag_chunks (symbol_id)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_module_id ON rag_chunks (module_id)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc_kind ON rag_chunks (doc_kind)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunks_entrypoint ON rag_chunks (entrypoint_type, framework)", + "CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)", + "CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)", + "CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)", + ): + conn.execute(text(statement)) diff --git a/app/modules/rag/persistence/session_repository.py b/app/modules/rag/persistence/session_repository.py new file mode 100644 index 0000000..5c36a5d --- /dev/null +++ b/app/modules/rag/persistence/session_repository.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from sqlalchemy import text + +from app.modules.shared.db import get_engine + + +class RagSessionRepository: + def upsert_session(self, rag_session_id: str, project_id: str) -> None: + with get_engine().connect() as conn: + conn.execute( + text( + """ + INSERT INTO rag_sessions (rag_session_id, project_id) + VALUES (:sid, :pid) + ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id + """ + ), + {"sid": rag_session_id, "pid": project_id}, + ) + conn.commit() + + def session_exists(self, rag_session_id: str) -> bool: + with get_engine().connect() as conn: + row = conn.execute( + text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"), + {"sid": rag_session_id}, + 
).fetchone() + return bool(row) + + def get_session(self, rag_session_id: str) -> dict | None: + with get_engine().connect() as conn: + row = conn.execute( + text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"), + {"sid": rag_session_id}, + ).mappings().fetchone() + return dict(row) if row else None diff --git a/app/modules/rag/retrieval/__pycache__/query_router.cpython-312.pyc b/app/modules/rag/retrieval/__pycache__/query_router.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..912715588d961d3a59c5040afcab548fbb74749f GIT binary patch literal 2285 zcma)7Uu+ab7@yhOyKbTFUFji$BHKc1PvV{U=fMPt_3ng*a#wq8V~>)}dOK~e?(LS@ zy}~smsn94IVl+nk>VppuG(n{iQL2w7zU%_YkqJKepuRQMMC8eDcJGb?jd35o`R4ci znQy-N&Fs&ut(y>hJHnq8aS)+j`K11dzJk}D2W1IqNW(fR;XKA%7j&T%$OlSdUc^X1 z2ay(7M4EU}@T2meBA!wGEk5t516KjKJyjCW(GGvx*LSy0b6ApscX&LYp=ki<&IXR=NMMEw?+vXzmnLR~3q}<9={NSwA{M|D?a>e|BQ7s(Z%y-dT38@kQOYoh$D9^8LAbc;x?bor|8BB9zEQOD>jX zbW$RQO||f!bj4uRy=%tT2ff+KgE=Y}PcT8k!DV30alW8>Y^r ziLz1PRARw`u38ogm54oMYK0p0AU8ssJpqL9+NYqrBP^kLJdHtp;Ja9xzbDw>?SE&5 z39R8oX&z12I9RvvF@@?e7i;-w#17Rxrlq=eZ_J@}_6>NOLop#!SYI4330{JGcy5!3D(H*A348%(E}&R|3Gru`IOX+CH%~z8HH&AH z<%PRiV#FfUN~k(HnMjdYWrmv5q+nah?6V2AC~LGhOAJ|^nUTR#AeNQjhDkWG1b~T> zsg-qNC8#=?p!I8;IL@i7P>;LunabVf{SW|rqzC5W_cJ?AiDl}uM1e#kzF#*Bs%{;K zH~f2{PHQWOJHMkl3+Q_9=#}B~(M#Jt-{tm>{$ug8IMddSV<8sww#EXgQDK{vWHyyn z4)te-a}=I1#6%7-F_lf`pdq3~^a&R9%B1{e*ZT1?a&F;~2r>fyjH{9t+#?A3!b=+{r4bayh z7;Vf_3YH~gH%Ks#?FO4SV&R+6EE)lZHkikA`(K@xIiSo&e7t9;{0)o+nWz^n+iM=> zFBfa^dpgiJmLAH%jJHzpXEA|3&Rs-)h!(F=miEfGawn-pFp5E}FrMLAwpb>Bj_ z^LWlS*5PgZNH#SxkXDj?!+itUgDmWUul6N}mHtdB{W{y?0l9Q?xIdezVeK9^m>uX( zj`n}*B+KQJS_2oY1mYSN69+@pOfAu?%&v5X}=$Jfb!oM ezc8~m|3qARRs0KxV`!}a literal 0 HcmV?d00001 diff --git a/app/modules/rag/retrieval/__pycache__/query_terms.cpython-312.pyc b/app/modules/rag/retrieval/__pycache__/query_terms.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..74a313ece012b6a3d6c7b4eef8385d428e8fbb73 GIT binary patch literal 2640 zcmbVOO>7fK6rR~1uVXuo^8@+e&-^s;CM1SX2uLWUYJfruNUeHFh*pbt6CBoSGrI{S zT}xI$s?#DRJ+$#LDjcY2CDiu7t-ZGgJ2r}0r6(?>r<8D@ocd<%jnk5X)RFS$`MsI9 zZ)U#lz5G&N??W(tZMzfS??LD>{oxL#qTscope!H-DOf`ZJd82b8I2ic6hUEca>K0R zRXCVArB2~t=E3fP*)z?Ez9%${h;6^D>q%2K<4N5(AH$X4j^shI2;sFqKv_WMx^JSI z!iKewgoUnSgMlQNdsTlLkE1kW;UP2{K`70dG@>&s_Fgp__}ak7S)9d8VFTatTFivt zuB*)!=kCDhRtJY}b#ut9bNv=_z5h3!#lYpU9q2xg^}G?FlEpPnQMS&Pr8ip^1zEx6(6?S<)h5sQ9gsKkiMntDAt zq)tgGlAKUurXfxBN9A}}A@M0y55w2P;EAb*5v7iCNFfTwXd@_3Y}CW)(4 zvKDm{f;9S4({?jtM&y_&ZLpIj$bp{Um zJ2Pj?q4uYU<$JQ+N^8&Ep_%U4uFYo6@|!Wh@$=4G4JRNWScn$qN$Dtbr*bqmg3Acxnbh0Qy= z?fUA6BqOE8O6q>u;PYNY(xa@DvTp%>-%`r|<@cJ^XFo5bR z!qfOQMV*D!*j>FD=3VYo%4R6iCq`(+936q;Ioc!6@D~qC7yD-#Bh&yzbhPRhMV8Pi zZ?neaB?27)iA?2z`LQ@LOoO(NaP`I7K_^t~g+HSo1XP|taPHLXseE(iUGM#U#lS%3 z4hLS; z9fw|^UQdz1nZ5t+O?}6;WK7nK!M^oHJv6{p5YK-{&ogMrH?qiQ56_=2`L`8(BYzq6 zDsKcEvB*tQ4i|3Ze=211_w(OV7V0KRYPP4sPTN=r8KD zxM7ZfWfa`BFcEkrMBb$0Fo>6})GrFQPm;zaA%rSPgg%MG0;jE33X_9%=M-JkOW9s| xGVk$9*B>V54o!2!mK?U literal 0 HcmV?d00001 diff --git a/app/modules/rag/retrieval/query_router.py b/app/modules/rag/retrieval/query_router.py new file mode 100644 index 0000000..eaa153c --- /dev/null +++ b/app/modules/rag/retrieval/query_router.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from app.modules.rag.contracts import RagLayer, RetrievalMode + + +class RagQueryRouter: + _CODE_HINTS = ( + "как работает код", + "explain code", + "explain the code", + "по коду", + "из кода", + "построй документацию по коду", + "документацию по коду", + "where is implemented", + "где реализовано", + "endpoint", + "handler", + "symbol", + "function", + "class", + "method", + ) + + _DOCS_LAYERS = [ + RagLayer.DOCS_MODULE_CATALOG, + RagLayer.DOCS_FACT_INDEX, + RagLayer.DOCS_SECTION_INDEX, + RagLayer.DOCS_POLICY_INDEX, + ] + _CODE_LAYERS = [ + RagLayer.CODE_ENTRYPOINTS, + RagLayer.CODE_SYMBOL_CATALOG, + 
RagLayer.CODE_DEPENDENCY_GRAPH, + RagLayer.CODE_SOURCE_CHUNKS, + ] + + def resolve_mode(self, query: str) -> str: + lowered = query.lower() + return RetrievalMode.CODE if any(hint in lowered for hint in self._CODE_HINTS) else RetrievalMode.DOCS + + def layers_for_mode(self, mode: str) -> list[str]: + return list(self._CODE_LAYERS if mode == RetrievalMode.CODE else self._DOCS_LAYERS) diff --git a/app/modules/rag/retrieval/query_terms.py b/app/modules/rag/retrieval/query_terms.py new file mode 100644 index 0000000..13c7a25 --- /dev/null +++ b/app/modules/rag/retrieval/query_terms.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import re + + +def extract_query_terms(query_text: str) -> list[str]: + raw_terms = re.findall(r"[A-Za-z_][A-Za-z0-9_]{2,}", query_text or "") + normalized: list[str] = [] + for term in raw_terms: + for variant in _identifier_variants(term): + if variant not in normalized: + normalized.append(variant) + for variant in _intent_variants(query_text): + if variant not in normalized: + normalized.append(variant) + return normalized[:6] + + +def _identifier_variants(term: str) -> list[str]: + lowered = term.lower() + variants = [lowered] + snake = _camel_to_snake(term) + if snake and snake not in variants: + variants.append(snake) + if lowered.endswith("manager") and len(lowered) > len("manager"): + manager_split = lowered[: -len("manager")] + "_manager" + if manager_split not in variants: + variants.append(manager_split) + compact = snake.replace("_", "") if snake else "" + if compact and compact not in variants: + variants.append(compact) + return variants + + +def _camel_to_snake(term: str) -> str: + first = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", term) + return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", first).lower() + + +def _intent_variants(query_text: str) -> list[str]: + lowered = (query_text or "").lower() + variants: list[str] = [] + if any(token in lowered for token in ("управ", "control", "manage", "management")): + 
variants.extend(["control", "management", "start", "stop", "status"]) + return variants diff --git a/app/modules/rag/services/rag_service.py b/app/modules/rag/services/rag_service.py new file mode 100644 index 0000000..ae2c996 --- /dev/null +++ b/app/modules/rag/services/rag_service.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +import asyncio +import hashlib +import os +from collections.abc import Awaitable, Callable +from inspect import isawaitable + +from app.modules.rag.contracts import RagDocument +from app.modules.rag.indexing.code.pipeline import CodeIndexingPipeline +from app.modules.rag.indexing.common.report import IndexReport +from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline +from app.modules.rag.persistence.repository import RagRepository +from app.modules.rag.retrieval.query_router import RagQueryRouter +from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder + + +class RagService: + def __init__( + self, + embedder: GigaChatEmbedder, + repository: RagRepository, + chunker=None, + ) -> None: + self._embedder = embedder + self._repo = repository + self._docs = DocsIndexingPipeline() + self._code = CodeIndexingPipeline() + self._queries = RagQueryRouter() + + async def index_snapshot( + self, + rag_session_id: str, + files: list[dict], + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, + ) -> tuple[int, int, int, int]: + report = await self._index_files(rag_session_id, files, progress_cb=progress_cb) + self._repo.replace_documents(rag_session_id, report.documents_list) + return report.as_tuple() + + async def index_changes( + self, + rag_session_id: str, + changed_files: list[dict], + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, + ) -> tuple[int, int, int, int]: + delete_paths: list[str] = [] + upserts: list[dict] = [] + for item in changed_files: + if str(item.get("op")) == "delete": + delete_paths.append(str(item.get("path", ""))) 
+ else: + upserts.append(item) + report = await self._index_files(rag_session_id, upserts, progress_cb=progress_cb) + self._repo.apply_document_changes(rag_session_id, delete_paths, report.documents_list) + return report.as_tuple() + + async def retrieve(self, rag_session_id: str, query: str) -> list[dict]: + mode = self._queries.resolve_mode(query) + layers = self._queries.layers_for_mode(mode) + prefer_non_tests = mode == "code" and "test" not in query.lower() and "тест" not in query.lower() + try: + query_embedding = self._embedder.embed([query])[0] + rows = self._repo.retrieve( + rag_session_id, + query_embedding, + query_text=query, + limit=8, + layers=layers, + prefer_non_tests=prefer_non_tests, + ) + except Exception: + rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=layers) + if not rows and mode != "docs": + rows = self._repo.fallback_chunks(rag_session_id, limit=8, layers=self._queries.layers_for_mode("docs")) + return [ + { + "source": row["path"], + "content": row["content"], + "layer": row.get("layer"), + "title": row.get("title"), + "metadata": row.get("metadata", {}), + "score": row.get("distance"), + } + for row in rows + ] + + async def _index_files( + self, + rag_session_id: str, + files: list[dict], + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, + ) -> "_PipelineReport": + total_files = len(files) + report = _PipelineReport() + repo_id = self._resolve_repo_id(rag_session_id) + for index, file in enumerate(files, start=1): + path = str(file.get("path", "")) + try: + blob_sha = self._blob_sha(file) + cached = await asyncio.to_thread(self._repo.get_cached_documents, repo_id, blob_sha) + if cached: + report.documents_list.extend(self._with_file_metadata(cached, file, repo_id, blob_sha)) + report.cache_hit_files += 1 + else: + built = self._build_documents(repo_id, path, file) + embedded = await asyncio.to_thread(self._embed_documents, built, file, repo_id, blob_sha) + 
report.documents_list.extend(embedded) + await asyncio.to_thread(self._repo.cache_documents, repo_id, path, blob_sha, embedded) + report.cache_miss_files += 1 + report.indexed_files += 1 + except Exception as exc: + report.failed_files += 1 + report.warnings.append(f"{path}: {exc}") + await self._notify_progress(progress_cb, index, total_files, path) + report.documents = len(report.documents_list) + return report + + def _build_documents(self, repo_id: str, path: str, file: dict) -> list[RagDocument]: + content = str(file.get("content") or "") + commit_sha = file.get("commit_sha") + docs: list[RagDocument] = [] + if self._docs.supports(path): + docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) + if self._code.supports(path): + docs.extend(self._code.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) + if not docs: + docs.extend(self._docs.index_file(repo_id=repo_id, commit_sha=commit_sha, path=path, content=content)) + return docs + + def _embed_documents(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]: + if not docs: + return [] + batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", "16"))) + metadata = self._document_metadata(file, repo_id, blob_sha) + for doc in docs: + doc.metadata.update(metadata) + for start in range(0, len(docs), batch_size): + batch = docs[start : start + batch_size] + vectors = self._embedder.embed([doc.text for doc in batch]) + for doc, vector in zip(batch, vectors): + doc.embedding = vector + return docs + + def _with_file_metadata(self, docs: list[RagDocument], file: dict, repo_id: str, blob_sha: str) -> list[RagDocument]: + metadata = self._document_metadata(file, repo_id, blob_sha) + for doc in docs: + doc.metadata.update(metadata) + doc.source.repo_id = repo_id + doc.source.path = str(file.get("path", doc.source.path)) + return docs + + def _document_metadata(self, file: dict, repo_id: str, blob_sha: 
str) -> dict: + return { + "blob_sha": blob_sha, + "repo_id": repo_id, + "artifact_type": file.get("artifact_type"), + "section": file.get("section"), + "doc_id": file.get("doc_id"), + "doc_version": file.get("doc_version"), + "owner": file.get("owner"), + "system_component": file.get("system_component"), + "last_modified": file.get("last_modified"), + "staleness_score": file.get("staleness_score"), + } + + def _resolve_repo_id(self, rag_session_id: str) -> str: + session = self._repo.get_session(rag_session_id) + if not session: + return rag_session_id + return str(session.get("project_id") or rag_session_id) + + def _blob_sha(self, file: dict) -> str: + raw = str(file.get("content_hash") or "").strip() + if raw: + return raw + content = str(file.get("content") or "") + return hashlib.sha256(content.encode("utf-8")).hexdigest() + + async def _notify_progress( + self, + progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None, + current_file_index: int, + total_files: int, + current_file_name: str, + ) -> None: + if not progress_cb: + return + result = progress_cb(current_file_index, total_files, current_file_name) + if isawaitable(result): + await result + + +class _PipelineReport(IndexReport): + def __init__(self) -> None: + super().__init__() + self.documents_list: list[RagDocument] = [] diff --git a/app/modules/rag_repo/module.py b/app/modules/rag_repo/module.py index 15f7c86..0f3c095 100644 --- a/app/modules/rag_repo/module.py +++ b/app/modules/rag_repo/module.py @@ -3,8 +3,8 @@ from __future__ import annotations from fastapi import APIRouter, Request from app.modules.agent.story_context_repository import StoryContextRepository +from app.modules.rag.persistence.repository import RagRepository from app.modules.rag_repo.webhook_service import RepoWebhookService -from app.modules.rag_session.repository import RagRepository class RagRepoModule: diff --git a/app/modules/rag_session/repository.py b/app/modules/rag_session/repository.py index 
bdeca25..a90d628 100644 --- a/app/modules/rag_session/repository.py +++ b/app/modules/rag_session/repository.py @@ -1,660 +1,4 @@ -from __future__ import annotations +from app.modules.rag.persistence.job_repository import RagJobRow +from app.modules.rag.persistence.repository import RagRepository -from dataclasses import dataclass -import hashlib -from sqlalchemy import text - -from app.modules.shared.db import get_engine - - -@dataclass -class RagJobRow: - index_job_id: str - rag_session_id: str - status: str - indexed_files: int - failed_files: int - cache_hit_files: int - cache_miss_files: int - error_code: str | None - error_desc: str | None - error_module: str | None - - -class RagRepository: - def ensure_tables(self) -> None: - engine = get_engine() - with engine.connect() as conn: - conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_sessions ( - rag_session_id VARCHAR(64) PRIMARY KEY, - project_id VARCHAR(512) NOT NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_index_jobs ( - index_job_id VARCHAR(64) PRIMARY KEY, - rag_session_id VARCHAR(64) NOT NULL, - status VARCHAR(16) NOT NULL, - indexed_files INTEGER NOT NULL DEFAULT 0, - failed_files INTEGER NOT NULL DEFAULT 0, - cache_hit_files INTEGER NOT NULL DEFAULT 0, - cache_miss_files INTEGER NOT NULL DEFAULT 0, - error_code VARCHAR(128) NULL, - error_desc TEXT NULL, - error_module VARCHAR(64) NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_chunks ( - id BIGSERIAL PRIMARY KEY, - rag_session_id VARCHAR(64) NOT NULL, - path TEXT NOT NULL, - chunk_index INTEGER NOT NULL, - content TEXT NOT NULL, - embedding vector NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT 
CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS artifact_type VARCHAR(16) NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS section TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_id TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS doc_version TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS owner TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS system_component TEXT NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ NULL")) - conn.execute(text("ALTER TABLE rag_chunks ADD COLUMN IF NOT EXISTS staleness_score DOUBLE PRECISION NULL")) - conn.execute( - text( - """ - ALTER TABLE rag_chunks - ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - """ - ) - ) - conn.execute( - text( - """ - ALTER TABLE rag_chunks - ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_blob_cache ( - id BIGSERIAL PRIMARY KEY, - repo_id VARCHAR(512) NOT NULL, - blob_sha VARCHAR(128) NOT NULL, - path TEXT NOT NULL, - artifact_type VARCHAR(16) NULL, - section TEXT NULL, - doc_id TEXT NULL, - doc_version TEXT NULL, - owner TEXT NULL, - system_component TEXT NULL, - last_modified TIMESTAMPTZ NULL, - staleness_score DOUBLE PRECISION NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - CONSTRAINT uq_rag_blob_cache UNIQUE (repo_id, blob_sha, path) - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_chunk_cache ( - id BIGSERIAL PRIMARY KEY, - repo_id VARCHAR(512) NOT NULL, - blob_sha VARCHAR(128) NOT NULL, - chunk_index INTEGER NOT NULL, - content TEXT NOT NULL, - embedding vector NULL, - section TEXT NULL, - created_at 
TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, - CONSTRAINT uq_rag_chunk_cache UNIQUE (repo_id, blob_sha, chunk_index) - ) - """ - ) - ) - conn.execute( - text( - """ - CREATE TABLE IF NOT EXISTS rag_session_chunk_map ( - id BIGSERIAL PRIMARY KEY, - rag_session_id VARCHAR(64) NOT NULL, - repo_id VARCHAR(512) NOT NULL, - blob_sha VARCHAR(128) NOT NULL, - chunk_index INTEGER NOT NULL, - path TEXT NOT NULL, - created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - ) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_session ON rag_chunks (rag_session_id)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_artifact_type ON rag_chunks (artifact_type)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_doc ON rag_chunks (doc_id, doc_version)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_component ON rag_chunks (system_component)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunks_path ON rag_chunks (path)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_blob_cache_repo_blob ON rag_blob_cache (repo_id, blob_sha)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chunk_cache_repo_blob ON rag_chunk_cache (repo_id, blob_sha, chunk_index)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_session_chunk_map_session ON rag_session_chunk_map (rag_session_id, created_at DESC)")) - conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_hit_files INTEGER NOT NULL DEFAULT 0")) - conn.execute(text("ALTER TABLE rag_index_jobs ADD COLUMN IF NOT EXISTS cache_miss_files INTEGER NOT NULL DEFAULT 0")) - conn.commit() - - def upsert_session(self, rag_session_id: str, project_id: str) -> None: - with get_engine().connect() as conn: - conn.execute( - text( - """ - INSERT INTO rag_sessions (rag_session_id, project_id) - VALUES (:sid, :pid) - ON CONFLICT (rag_session_id) DO UPDATE SET project_id = EXCLUDED.project_id - """ 
- ), - {"sid": rag_session_id, "pid": project_id}, - ) - conn.commit() - - def session_exists(self, rag_session_id: str) -> bool: - with get_engine().connect() as conn: - row = conn.execute( - text("SELECT 1 FROM rag_sessions WHERE rag_session_id = :sid"), - {"sid": rag_session_id}, - ).fetchone() - return bool(row) - - def get_session(self, rag_session_id: str) -> dict | None: - with get_engine().connect() as conn: - row = conn.execute( - text("SELECT rag_session_id, project_id FROM rag_sessions WHERE rag_session_id = :sid"), - {"sid": rag_session_id}, - ).mappings().fetchone() - return dict(row) if row else None - - def create_job(self, index_job_id: str, rag_session_id: str, status: str) -> None: - with get_engine().connect() as conn: - conn.execute( - text( - """ - INSERT INTO rag_index_jobs (index_job_id, rag_session_id, status) - VALUES (:jid, :sid, :status) - """ - ), - {"jid": index_job_id, "sid": rag_session_id, "status": status}, - ) - conn.commit() - - def update_job( - self, - index_job_id: str, - *, - status: str, - indexed_files: int, - failed_files: int, - cache_hit_files: int = 0, - cache_miss_files: int = 0, - error_code: str | None = None, - error_desc: str | None = None, - error_module: str | None = None, - ) -> None: - with get_engine().connect() as conn: - conn.execute( - text( - """ - UPDATE rag_index_jobs - SET status = :status, - indexed_files = :indexed, - failed_files = :failed, - cache_hit_files = :cache_hit_files, - cache_miss_files = :cache_miss_files, - error_code = :ecode, - error_desc = :edesc, - error_module = :emodule, - updated_at = CURRENT_TIMESTAMP - WHERE index_job_id = :jid - """ - ), - { - "jid": index_job_id, - "status": status, - "indexed": indexed_files, - "failed": failed_files, - "cache_hit_files": cache_hit_files, - "cache_miss_files": cache_miss_files, - "ecode": error_code, - "edesc": error_desc, - "emodule": error_module, - }, - ) - conn.commit() - - def get_job(self, index_job_id: str) -> RagJobRow | None: - with 
get_engine().connect() as conn: - row = conn.execute( - text( - """ - SELECT index_job_id, rag_session_id, status, indexed_files, failed_files, - cache_hit_files, cache_miss_files, error_code, error_desc, error_module - FROM rag_index_jobs - WHERE index_job_id = :jid - """ - ), - {"jid": index_job_id}, - ).mappings().fetchone() - if not row: - return None - return RagJobRow(**dict(row)) - - def replace_chunks(self, rag_session_id: str, items: list[dict]) -> None: - with get_engine().connect() as conn: - conn.execute(text("DELETE FROM rag_chunks WHERE rag_session_id = :sid"), {"sid": rag_session_id}) - conn.execute(text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid"), {"sid": rag_session_id}) - self._insert_chunks(conn, rag_session_id, items) - conn.commit() - - def apply_changes(self, rag_session_id: str, delete_paths: list[str], upserts: list[dict]) -> None: - with get_engine().connect() as conn: - if delete_paths: - conn.execute( - text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": delete_paths}, - ) - conn.execute( - text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": delete_paths}, - ) - if upserts: - paths = sorted({str(x["path"]) for x in upserts}) - conn.execute( - text("DELETE FROM rag_chunks WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": paths}, - ) - conn.execute( - text("DELETE FROM rag_session_chunk_map WHERE rag_session_id = :sid AND path = ANY(:paths)"), - {"sid": rag_session_id, "paths": paths}, - ) - self._insert_chunks(conn, rag_session_id, upserts) - conn.commit() - - def get_cached_chunks(self, repo_id: str, blob_sha: str) -> list[dict]: - with get_engine().connect() as conn: - rows = conn.execute( - text( - """ - SELECT chunk_index, content, embedding::text AS embedding_txt, section - FROM rag_chunk_cache - WHERE repo_id = :repo_id AND blob_sha = 
:blob_sha - ORDER BY chunk_index ASC - """ - ), - {"repo_id": repo_id, "blob_sha": blob_sha}, - ).mappings().fetchall() - output: list[dict] = [] - for row in rows: - output.append( - { - "chunk_index": int(row["chunk_index"]), - "content": str(row["content"] or ""), - "embedding": self._parse_vector(str(row["embedding_txt"] or "")), - "section": row.get("section"), - } - ) - return output - - def record_repo_cache( - self, - *, - project_id: str, - commit_sha: str | None, - changed_files: list[str], - summary: str, - ) -> None: - repo_session_id = f"repo:{project_id}" - with get_engine().connect() as conn: - for path in changed_files: - key = f"{commit_sha or 'no-commit'}:{path}" - blob_sha = hashlib.sha256(key.encode("utf-8")).hexdigest() - conn.execute( - text( - """ - INSERT INTO rag_blob_cache ( - repo_id, - blob_sha, - path, - artifact_type, - section - ) - VALUES ( - :repo_id, - :blob_sha, - :path, - :artifact_type, - :section - ) - ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": project_id, - "blob_sha": blob_sha, - "path": path, - "artifact_type": "CODE", - "section": "repo_webhook", - }, - ) - conn.execute( - text( - """ - INSERT INTO rag_chunk_cache ( - repo_id, - blob_sha, - chunk_index, - content, - embedding, - section - ) - VALUES ( - :repo_id, - :blob_sha, - 0, - :content, - NULL, - :section - ) - ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET - content = EXCLUDED.content, - section = EXCLUDED.section, - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": project_id, - "blob_sha": blob_sha, - "content": f"repo_webhook:{path}:{summary[:300]}", - "section": "repo_webhook", - }, - ) - conn.execute( - text( - """ - INSERT INTO rag_session_chunk_map ( - rag_session_id, - repo_id, - blob_sha, - chunk_index, - path - ) - VALUES ( - :rag_session_id, - :repo_id, - :blob_sha, - 0, - :path - ) - """ - ), - { - "rag_session_id": repo_session_id, - "repo_id": project_id, - 
"blob_sha": blob_sha, - "path": path, - }, - ) - conn.commit() - - def cache_file_chunks(self, repo_id: str, path: str, blob_sha: str, items: list[dict]) -> None: - if not items: - return - meta = items[0] - with get_engine().connect() as conn: - conn.execute( - text( - """ - INSERT INTO rag_blob_cache ( - repo_id, - blob_sha, - path, - artifact_type, - section, - doc_id, - doc_version, - owner, - system_component, - last_modified, - staleness_score - ) - VALUES ( - :repo_id, - :blob_sha, - :path, - :artifact_type, - :section, - :doc_id, - :doc_version, - :owner, - :system_component, - :last_modified, - :staleness_score - ) - ON CONFLICT (repo_id, blob_sha, path) DO UPDATE SET - artifact_type = EXCLUDED.artifact_type, - section = EXCLUDED.section, - doc_id = EXCLUDED.doc_id, - doc_version = EXCLUDED.doc_version, - owner = EXCLUDED.owner, - system_component = EXCLUDED.system_component, - last_modified = EXCLUDED.last_modified, - staleness_score = EXCLUDED.staleness_score, - updated_at = CURRENT_TIMESTAMP - """ - ), - { - "repo_id": repo_id, - "blob_sha": blob_sha, - "path": path, - "artifact_type": meta.get("artifact_type"), - "section": meta.get("section"), - "doc_id": meta.get("doc_id"), - "doc_version": meta.get("doc_version"), - "owner": meta.get("owner"), - "system_component": meta.get("system_component"), - "last_modified": meta.get("last_modified"), - "staleness_score": meta.get("staleness_score"), - }, - ) - for item in items: - emb = item.get("embedding") or [] - emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None - conn.execute( - text( - """ - INSERT INTO rag_chunk_cache ( - repo_id, - blob_sha, - chunk_index, - content, - embedding, - section - ) - VALUES ( - :repo_id, - :blob_sha, - :chunk_index, - :content, - CAST(:embedding AS vector), - :section - ) - ON CONFLICT (repo_id, blob_sha, chunk_index) DO UPDATE SET - content = EXCLUDED.content, - embedding = EXCLUDED.embedding, - section = EXCLUDED.section, - updated_at = CURRENT_TIMESTAMP 
- """ - ), - { - "repo_id": repo_id, - "blob_sha": blob_sha, - "chunk_index": int(item["chunk_index"]), - "content": item["content"], - "embedding": emb_str, - "section": item.get("section"), - }, - ) - conn.commit() - - def retrieve(self, rag_session_id: str, query_embedding: list[float], limit: int = 5) -> list[dict]: - emb = "[" + ",".join(str(x) for x in query_embedding) + "]" - with get_engine().connect() as conn: - rows = conn.execute( - text( - """ - SELECT path, content - FROM rag_chunks - WHERE rag_session_id = :sid - ORDER BY embedding <=> CAST(:emb AS vector) - LIMIT :lim - """ - ), - {"sid": rag_session_id, "emb": emb, "lim": limit}, - ).mappings().fetchall() - return [dict(x) for x in rows] - - def fallback_chunks(self, rag_session_id: str, limit: int = 5) -> list[dict]: - with get_engine().connect() as conn: - rows = conn.execute( - text( - """ - SELECT path, content - FROM rag_chunks - WHERE rag_session_id = :sid - ORDER BY id DESC - LIMIT :lim - """ - ), - {"sid": rag_session_id, "lim": limit}, - ).mappings().fetchall() - return [dict(x) for x in rows] - - def _insert_chunks(self, conn, rag_session_id: str, items: list[dict]) -> None: - for item in items: - emb = item.get("embedding") or [] - emb_str = "[" + ",".join(str(x) for x in emb) + "]" if emb else None - conn.execute( - text( - """ - INSERT INTO rag_chunks ( - rag_session_id, - path, - chunk_index, - content, - embedding, - artifact_type, - section, - doc_id, - doc_version, - owner, - system_component, - last_modified, - staleness_score, - created_at, - updated_at - ) - VALUES ( - :sid, - :path, - :idx, - :content, - CAST(:emb AS vector), - :artifact_type, - :section, - :doc_id, - :doc_version, - :owner, - :system_component, - :last_modified, - :staleness_score, - CURRENT_TIMESTAMP, - CURRENT_TIMESTAMP - ) - """ - ), - { - "sid": rag_session_id, - "path": item["path"], - "idx": int(item["chunk_index"]), - "content": item["content"], - "emb": emb_str, - "artifact_type": 
item.get("artifact_type"), - "section": item.get("section"), - "doc_id": item.get("doc_id"), - "doc_version": item.get("doc_version"), - "owner": item.get("owner"), - "system_component": item.get("system_component"), - "last_modified": item.get("last_modified"), - "staleness_score": item.get("staleness_score"), - }, - ) - repo_id = str(item.get("repo_id") or "").strip() - blob_sha = str(item.get("blob_sha") or "").strip() - if repo_id and blob_sha: - conn.execute( - text( - """ - INSERT INTO rag_session_chunk_map ( - rag_session_id, - repo_id, - blob_sha, - chunk_index, - path - ) VALUES ( - :sid, - :repo_id, - :blob_sha, - :chunk_index, - :path - ) - """ - ), - { - "sid": rag_session_id, - "repo_id": repo_id, - "blob_sha": blob_sha, - "chunk_index": int(item["chunk_index"]), - "path": item["path"], - }, - ) - - def _parse_vector(self, value: str) -> list[float]: - text_value = value.strip() - if not text_value: - return [] - if text_value.startswith("[") and text_value.endswith("]"): - text_value = text_value[1:-1] - if not text_value: - return [] - return [float(part.strip()) for part in text_value.split(",") if part.strip()] +__all__ = ["RagJobRow", "RagRepository"] diff --git a/app/modules/rag_session/service.py b/app/modules/rag_session/service.py index 89bcf84..acae846 100644 --- a/app/modules/rag_session/service.py +++ b/app/modules/rag_session/service.py @@ -1,211 +1,3 @@ -import asyncio -import hashlib -import os -from collections.abc import Awaitable, Callable -from inspect import isawaitable +from app.modules.rag.services.rag_service import RagService -from app.modules.rag_session.embedding.gigachat_embedder import GigaChatEmbedder -from app.modules.rag_session.repository import RagRepository -from app.modules.rag_session.retrieval.chunker import TextChunker - - -class RagService: - def __init__( - self, - embedder: GigaChatEmbedder, - repository: RagRepository, - chunker: TextChunker | None = None, - ) -> None: - self._embedder = embedder - self._repo = 
repository - self._chunker = chunker or TextChunker() - - async def index_snapshot( - self, - rag_session_id: str, - files: list[dict], - progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, - ) -> tuple[int, int, int, int]: - total_files = len(files) - indexed_files = 0 - failed_files = 0 - cache_hit_files = 0 - cache_miss_files = 0 - all_chunks: list[dict] = [] - repo_id = self._resolve_repo_id(rag_session_id) - for index, file in enumerate(files, start=1): - path = str(file.get("path", "")) - try: - blob_sha = self._blob_sha(file) - cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha) - if cached: - all_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached)) - cache_hit_files += 1 - else: - chunks = self._build_chunks_for_file(file) - embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha) - all_chunks.extend(embedded_chunks) - await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks) - cache_miss_files += 1 - indexed_files += 1 - except Exception: - failed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - await asyncio.to_thread(self._repo.replace_chunks, rag_session_id, all_chunks) - return indexed_files, failed_files, cache_hit_files, cache_miss_files - - async def index_changes( - self, - rag_session_id: str, - changed_files: list[dict], - progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None = None, - ) -> tuple[int, int, int, int]: - total_files = len(changed_files) - indexed_files = 0 - failed_files = 0 - cache_hit_files = 0 - cache_miss_files = 0 - delete_paths: list[str] = [] - upsert_chunks: list[dict] = [] - repo_id = self._resolve_repo_id(rag_session_id) - - for index, file in enumerate(changed_files, start=1): - path = str(file.get("path", "")) - op = str(file.get("op", "")) - try: - if op == "delete": - delete_paths.append(path) - indexed_files += 1 
- await self._notify_progress(progress_cb, index, total_files, path) - continue - if op == "upsert" and file.get("content") is not None: - blob_sha = self._blob_sha(file) - cached = await asyncio.to_thread(self._repo.get_cached_chunks, repo_id, blob_sha) - if cached: - upsert_chunks.extend(self._build_cached_items(path, file, repo_id, blob_sha, cached)) - cache_hit_files += 1 - else: - chunks = self._build_chunks_for_file(file) - embedded_chunks = await asyncio.to_thread(self._embed_chunks, chunks, file, repo_id, blob_sha) - upsert_chunks.extend(embedded_chunks) - await asyncio.to_thread(self._repo.cache_file_chunks, repo_id, path, blob_sha, embedded_chunks) - cache_miss_files += 1 - indexed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - continue - failed_files += 1 - except Exception: - failed_files += 1 - await self._notify_progress(progress_cb, index, total_files, path) - - await asyncio.to_thread( - self._repo.apply_changes, - rag_session_id, - delete_paths, - upsert_chunks, - ) - return indexed_files, failed_files, cache_hit_files, cache_miss_files - - async def retrieve(self, rag_session_id: str, query: str) -> list[dict]: - try: - query_embedding = self._embedder.embed([query])[0] - rows = self._repo.retrieve(rag_session_id, query_embedding, limit=5) - except Exception: - rows = self._repo.fallback_chunks(rag_session_id, limit=5) - return [{"source": row["path"], "content": row["content"]} for row in rows] - - def _build_chunks_for_file(self, file: dict) -> list[tuple[str, int, str]]: - path = str(file.get("path", "")) - content = str(file.get("content", "")) - output: list[tuple[str, int, str]] = [] - for idx, chunk in enumerate(self._chunker.chunk(content)): - output.append((path, idx, chunk)) - return output - - def _embed_chunks(self, raw_chunks: list[tuple[str, int, str]], file: dict, repo_id: str, blob_sha: str) -> list[dict]: - if not raw_chunks: - return [] - batch_size = max(1, int(os.getenv("RAG_EMBED_BATCH_SIZE", 
"16"))) - metadata = self._chunk_metadata(file) - - indexed: list[dict] = [] - for i in range(0, len(raw_chunks), batch_size): - batch = raw_chunks[i : i + batch_size] - texts = [x[2] for x in batch] - vectors = self._embedder.embed(texts) - for (path, chunk_index, content), vector in zip(batch, vectors): - indexed.append( - { - "path": path, - "chunk_index": chunk_index, - "content": content, - "embedding": vector, - "repo_id": repo_id, - "blob_sha": blob_sha, - **metadata, - } - ) - return indexed - - def _build_cached_items( - self, - path: str, - file: dict, - repo_id: str, - blob_sha: str, - cached: list[dict], - ) -> list[dict]: - metadata = self._chunk_metadata(file) - output: list[dict] = [] - for item in cached: - output.append( - { - "path": path, - "chunk_index": int(item["chunk_index"]), - "content": str(item["content"]), - "embedding": item.get("embedding") or [], - "repo_id": repo_id, - "blob_sha": blob_sha, - **metadata, - "section": item.get("section") or metadata.get("section"), - } - ) - return output - - def _resolve_repo_id(self, rag_session_id: str) -> str: - session = self._repo.get_session(rag_session_id) - if not session: - return rag_session_id - return str(session.get("project_id") or rag_session_id) - - def _blob_sha(self, file: dict) -> str: - raw = str(file.get("content_hash") or "").strip() - if raw: - return raw - content = str(file.get("content") or "") - return hashlib.sha256(content.encode("utf-8")).hexdigest() - - def _chunk_metadata(self, file: dict) -> dict: - return { - "artifact_type": file.get("artifact_type"), - "section": file.get("section"), - "doc_id": file.get("doc_id"), - "doc_version": file.get("doc_version"), - "owner": file.get("owner"), - "system_component": file.get("system_component"), - "last_modified": file.get("last_modified"), - "staleness_score": file.get("staleness_score"), - } - - async def _notify_progress( - self, - progress_cb: Callable[[int, int, str], Awaitable[None] | None] | None, - 
current_file_index: int, - total_files: int, - current_file_name: str, - ) -> None: - if not progress_cb: - return - result = progress_cb(current_file_index, total_files, current_file_name) - if isawaitable(result): - await result +__all__ = ["RagService"] diff --git a/tests/agent/orchestrator/__pycache__/test_explain_actions.cpython-312-pytest-9.0.2.pyc b/tests/agent/orchestrator/__pycache__/test_explain_actions.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29323aa09daab08dc373828724c316f68ed6a6f1 GIT binary patch literal 10157 zcmeG?ZEPD?a=X-$6txmXeL12f*^J~Uq9Rk4Y%8g2JEu^Jl1m~xmhC3$wd*zCie7p7 z;kQf0BB(;~xxH)e0O#(0*e#4UKMK@_{ljr%ROeiB*MA1u0?W#R77G^*(Ece<6b@|Q z0QXO4-f~xztT;ZvK>v8LoSm6BGw;25Zy(>x+ka|ksN>-JRr}wn|J%xO|AB;i2^Hi) z36P5%;RvsB(>#0nGBqh5&t-&ZKaV9oEszOL2N_Y*isd6>K zvj%<-{s+KC&hW|zPkf|?2!^jB$<4Z6bs6w9f-XV)BtU|s_Il`AMa~GhW!EZ*A-Wt= zcU`MuxIQl@g*N= zInR;S^W60|w{=roe9L4pV1S=?4i#&ODr>nkV1Dq|&Y?mQRL#FOKP&%5`9k?)^GkCn zX8zjzDrVj`zcN2B{}7QI=IvN{5gwnJUzN|n^H%v>3=K+onoB9Vo}+dU&{+jW&UZ#m z|I0t)_1~}Cf}!X}T(CnVmw^G3RASdvbQqq>WoJ~PWK#+bnylxtc0kVRA1M^3jYt)} zpc#@epI2-yE>QF;aABN559g8Hj0Us<8t{0b8C6qsDI@3OAv?&L3Dq_^9hY=yE=)wM zJ&cyB==y@LP$>ffa$2!#b$8MQsWTUC5o+^hUL!DJ#KDRVeJI4HU_H#{h-F+$ZjB^#6T`B-HX2C*Qm zWQ{=_DbQqij;2llNo6BPS%IWD@3aK(Cce~({QRTZ{hFLPN#ujYEmF!jo7k`AQnIEW z1jOO@LYs7at3Ku)0OZzn1{=&egA@58r*cyqeFacDfM6#8r%!Yc>0JPxSPj0Oi=25n zv&hjQz+uh_IP|uFp0~qkD(6p0Frji9c-U@mXf->RI}LNKmf{jiYXR8x;~yM;cl7N^ z>G0^0qZI8@d{Zg5lll>$`{OliHqzb5*aJYv#U&PtvG&>mdW60PEPS{19|2h8?uji{ zq}|%O!+K$x)i!;vDZ1L!TW;#T7wPy^T^hSHWqv5#jhwjG+OypH(!KWH8?W7M-~V;A z+v@1Dx_4O7j&DN#=DLS1jrD=Wv4=6PK8ofd?Vcg4HEKmWt#IUFLy*ZtxcbgdTd$1$ zhajq86fdsmF!?i)8vRYOPt&`wp*}r6UImgXI1XqqUU-lJY>MaCF3u!F{RGv{mqs_HiWoe7cTYHv*t|j+*4t57;`Gp$z z`4A(i$LVcV?bx;^;bNs)w+FOZah_lE@cMYN~$94k;w9IGcVy zE4PDA^K?7xAh@l?O};Ipazu&qk9-FX#Q9?LOMUtIp|~D9a3F^0p12T!=ucv5EqP73ZY<&G?Swl*>Hk}tc($ZT?65ST@N;;lLb|S&>q5cCt$GK 
zCH+*cpb=(YN^7~3Fd#TTHkF*vPw=v!NejuGkO-N%}{skvb(AIfw#@;!aEHqf<)O;js5fhXO?hpVBj6 zIu1WwgAFJI1a;y20j^`%iVj%O5vyau>P_C6E_Z(LjTmSM-;c1OgcaRyb-ZhJ^<8<{ z>>PHAni@lO4{Dmkz+&>@4z96lIlOIg;%h-%6*|g7$ED<5A!Z>JEep|0J$Hp|jA|3R}MpH(kj5d1f{Iayk6+mC%jhK2%Zzex4}#9paj)D(y<$K>%@SKpq5H?bcj--2>|Pb{UU6uLSVqD@ zJqQYy-t>xH)GG!Av@EgP6b8WPr!%HFuqxoaQeimMGNT;agP`;2j92cWUNIn`W{JI~ zu*Vd8uYm16s{-CD6^27CGs?j|2s&55HcCUei+aU?+!gjb9`)73G-Od92a(mcM}3&- zg2*p)xrWq1^d~~Dte!N$FH9PNkGu%*nn*MJs(BEt5Qr1V3#7dw7sXvO>4f|jXr=w` z9^?NrYFx|HLG*(j*>KHzxwW+d(nYpC7XA3hcD6tA&2bP#3K0D?xf-*Z#1bS1F(Xf2 zgq^0ZD*9Q|fjsZVYb|yp-3I8V6NezJqSRPtA%pMcTjTCTFTS=1G9N8f#V(T_lSpL=rQ*?4lxb5A}aPrm%zlh4SL zeHBmgPqj|Q**e+(e4X56oqXl_oP0KO5+d?}$>KJhrUvmiNlnAqQ5FtD=%6I2S=Epv z2+ip}_-nHpl1$(|r@S~0^12aQ$E;gN#SUP6R%J6WH=I}}aeUbJgNSYvn;$Pa3IS|` zn#t#=p%7_?<}!A@LrSuy$~qiF9DsJwgV?xwIO#}Bd3j#T$%MWRtl~?XbP7&!hTh8v13(a-{>33I5Zt4>3*zi;M*#rid&;} zlBy8hW1X$*&Aq`23I-V3ta7Kg&iorVhr9*h_gCeQ*-7M0^NaEYb{=_3)(i;FW4a+5 z1>HTigrNPiwWG`Ok05%#?Fdn0P&aaU04Pt#um?SxwJ8Cog&;(YN)7NwQ{Ofhs+9H(?Hnnjz!BZu-R56`>%J_XW6rx>KDRGc{-OD6MuE>^4B({LycGkF z-U8+=^B-cPhu@C%rPyRo%*z>V0JA2TCiydSseB>kyi=IBh$p~4orHRvd<&+RdU|?d z=HJ1P!pZYZu>3jL2ekSmn*IT5e24%OD)b0~qX>=xDBvpq4u)5)@^+p8OJD(7hlv~L z0yNj1IoKEMgF8*w%T&NqN7j&EV1fsm|0_K3#{L2P>EreaBcNgae0@#`YWb4QNbC*x z2RdT@J#+@lErG$CKz?D~poc(}z7L>)D+@bV$E?y9<)4Dve`5ZU8O27I7t80N;Fsl( zHyMKRh3dQyokpX7h~NhZ2!cOC@W%)w1Sb&42owY}2+{~nBEZp)2k9xqRRqk{vxsR3 zatQJObUd=Fdzw0cK=6D)VW<)lR*WAe6B zHhAC1bx!%L-u=Hxmpdod6+=elpw)5I>VMlje!ASLd4&h9=qpzAd#F0ar!-LozK`J^PsE>qZNio0$=refc!fcHv;;ZVzra&QlV&JD;_pfr@bs8{T; zE%6;wIA)6P+?g}QW2*w*D;0)AEi=l&JqS8?=Dczj^@;%jHB0Pgxr+Xcxe8!-xd>D2 zuTDb%{@u9>NMTswK2sP2cPv33WNcNyd!@o~sAWbuxCcRJ3F<~^D0fk>*kM~@(iGl@ zeAAr~Q+$6_z%?xJ2XAfRT66U_F+M%%#fbWL%hS`F~;wtZK4 z-}$D&M=?h+8Q)^px=B>_65CP9$<-#Dv?%0T6IA(#!q-rc-gT3+cCf-@>wtX#P9PGQ z94WxZG70wCM?%S_)vS`J{`w=~e( zq3ELk?$_`<|L+`ko?GEQ_&4t8Z+$`D_gfCY{njX-<3Aa@)cUjU-{%lFdv-ry$b;9m Q^LrnT@vremcs4};2Rn4f9smFU literal 0 HcmV?d00001 diff --git a/tests/agent/orchestrator/test_explain_actions.py 
b/tests/agent/orchestrator/test_explain_actions.py new file mode 100644 index 0000000..4cd41c9 --- /dev/null +++ b/tests/agent/orchestrator/test_explain_actions.py @@ -0,0 +1,131 @@ +from app.modules.agent.engine.orchestrator.actions.explain_actions import ExplainActions +from app.modules.agent.engine.orchestrator.execution_context import ExecutionContext +from app.modules.agent.engine.orchestrator.models import ( + ExecutionPlan, + OutputContract, + RoutingMeta, + Scenario, + TaskConstraints, + TaskSpec, +) + + +def _ctx(rag_items: list[dict]) -> ExecutionContext: + task = TaskSpec( + task_id="task-1", + dialog_session_id="dialog-1", + rag_session_id="rag-1", + user_message="Объясни по коду как работает task_processor", + scenario=Scenario.EXPLAIN_PART, + routing=RoutingMeta(domain_id="project", process_id="qa", confidence=0.9, reason="test"), + constraints=TaskConstraints(), + output_contract=OutputContract(result_type="answer"), + metadata={ + "rag_items": rag_items, + "rag_context": "", + "confluence_context": "", + "files_map": {}, + }, + ) + plan = ExecutionPlan( + plan_id="plan-1", + task_id="task-1", + scenario=Scenario.EXPLAIN_PART, + template_id="tpl", + template_version="1", + steps=[], + ) + return ExecutionContext(task=task, plan=plan, graph_resolver=lambda *_: None, graph_invoker=lambda *_: {}) + + +def test_explain_actions_switch_to_code_profile_when_code_layers_present() -> None: + ctx = _ctx( + [ + { + "source": "app/task_processor.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "task_processor.process_task", + "content": "function task_processor.process_task(task)", + "metadata": {"qname": "task_processor.process_task", "kind": "function"}, + }, + { + "source": "app/task_processor.py", + "layer": "C2_DEPENDENCY_GRAPH", + "title": "task_processor.process_task:calls", + "content": "task_processor.process_task calls queue.publish", + "metadata": {"edge_type": "calls"}, + }, + ] + ) + actions = ExplainActions() + + actions.collect_sources(ctx) + 
actions.extract_logic(ctx) + actions.summarize(ctx) + + sources = ctx.artifacts.get_content("sources", {}) + assert sources["source_profile"] == "code" + answer = str(ctx.artifacts.get_content("final_answer", "")) + assert "кодовых слоев индекса" not in answer + assert "CodeRAG" not in answer + assert "app/task_processor.py" in answer + assert "requirements/docs context" not in answer + + +def test_explain_actions_add_code_details_block() -> None: + ctx = _ctx( + [ + { + "source": "src/config_manager/__init__.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "ConfigManager", + "content": "const ConfigManager\nConfigManager = config_manager.v2.ConfigManagerV2", + "metadata": { + "qname": "ConfigManager", + "kind": "const", + "lang_payload": {"imported_from": "v2.ConfigManagerV2", "import_alias": True}, + }, + }, + { + "source": "src/config_manager/v2/control/base.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "ControlChannel", + "content": "class ControlChannel\nControlChannel(ABC)", + "metadata": {"qname": "ControlChannel", "kind": "class"}, + }, + { + "source": "src/config_manager/v2/core/control_bridge.py", + "layer": "C1_SYMBOL_CATALOG", + "title": "ControlChannelBridge", + "content": "class ControlChannelBridge\nПредоставляет halt и status как обработчики start/stop/status", + "metadata": {"qname": "ControlChannelBridge", "kind": "class"}, + }, + { + "source": "src/config_manager/v2/core/control_bridge.py", + "layer": "C2_DEPENDENCY_GRAPH", + "title": "ControlChannelBridge.on_start:calls", + "content": "ControlChannelBridge.on_start calls self._start_runtime", + "metadata": {"src_qname": "ControlChannelBridge.on_start", "dst_ref": "self._start_runtime"}, + }, + { + "source": "src/config_manager/v2/__init__.py", + "layer": "C0_SOURCE_CHUNKS", + "title": "src/config_manager/v2/__init__.py:1-6", + "content": '"""Контракт: управление через API (config.yaml, секция management)."""', + "metadata": {}, + }, + ] + ) + actions = ExplainActions() + + 
actions.collect_sources(ctx) + actions.extract_logic(ctx) + actions.summarize(ctx) + + answer = str(ctx.artifacts.get_content("final_answer", "")) + assert "### Что видно по коду" in answer + assert "ConfigManager` в проекте доступен как alias" in answer + assert "ControlChannelBridge.on_start" in answer + assert "### Где смотреть в проекте" in answer + assert "В индексе нет точного символа" not in answer + assert "отдельный интерфейс управления" in answer diff --git a/tests/rag/__pycache__/test_code_indexing_pipeline.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_code_indexing_pipeline.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7db10f9efab7d0d1cbe9d93d12628effca8ac471 GIT binary patch literal 8490 zcmeGhOKjZMk=*6}Nc}&t9sf3tEo)PGWl8?Wvg1hdCy8ZOvgBH-OVCTMOnHBOq-4w4 zH3TDv05K3CFcKYdFwmx0AuV$4A&1_3A+=IhAX2nwb7*gFltb-PXCBF2N?F7CG);nL z!JRknao)V)ym@bC-uq*HJy#a|!{x zZ6GTv=Q6T#OU+DORC7v7%_s?Hcv^sLj354Md>zW7v+m0e9i#x#71%s8jZ_6{T<)ECXOj^1-~A2+o!`WRxJyG zd?3%wGKFA1sQpbB+&9x8@5Y}DG)YztVUMKGl}QfkjV|SMlZ#bL_e*CPZHtG0OO?XK zs*--_=!tlHw7GQE@{2t(EXY&+iW+_L0hhZR`O=cNdwAB6JkPm2iI=k0=oLzkG$Qlw z`)|Yi6hg8f*X2VhOKV|$llzUHaW}hd)iNJ;$92oQVCT2kLZA?=>Sp@cY7ZCt8uMQN zmtd?J6-Wz_L_p1JE?0Y`%J2uJ-JIq(#=No^V_lYKtuw~+iIk*i(J@USqY9Z-6Us3@3O`v%MyC{A zoPigenvzO(MG+NMIc5_e5mEFRl5t<~{7d!_ap>5&s~hRH?$@1egPWN|r3zix#R*^I7abjZnkY9_|sKDMVbH`g1} z_}3i-L_=(UOp6a(7#tEuFN}>042c70$A(`W}7(`ERnI zeeQ;uk*#oYCX>+BY-SkjAV8z z7OHhiDdkr8pfW3>*(nKK6K8w7B{eD&bymql!Th4|CZT9rH@=tx^)D`1W-XiYsa&9f{(mtg8Wq#3%`wwa&9n~Kfs{^mC0SVIR`1B2GE$PUN+1hZg! 
z?%V0yZ(JB;D-bxdO)wB9I{|zP^RNfU!6?F>ht|sr1I50NjxHYkI9xt5ZXOvg{bc<9 zfZ2Na(Hfq++}+j1eh}(ntq4xEsL$n;6LhBu!2G5(D`fBfniWbMi)kl10S7RJooYD^3f6V=t!w+AHHu{JlsTD~Qu-}_m za>EKaPIZxe?CbAe^q6$N?FnF~kF^jyIkR}Wh{dFuQat9>@FUej6ag>{bJw_*ZxD#uz!tVA$aDiJK?s;T8 z(!wbrV+kml?Yerrl|H*-4|dR7Ic)Lv)+7QeZ;-hs6<&wSGOe5rBI{Mm=T`bKX4%)@O=U0Ye$VG27|o3^gDZe6Y4 zx!SbpVT7sMxyA(Qg7bq98<^dlf2!|X7cT%RGtisOdRSLX0p zaT@Ge1)5wR3V;-)D!NRGD?~N|EQ{k6QWy0S0S79;Zd2H8aB)M}T?`sRyv*UV;xyQ` z3N*Pm3V;-)D!NRGD?~N|EQ`Sksf&7vfWxQF?cgCFTv~3wQ)<35|H`V+Zg2+|o-J?b zGPiUUFPmEqLjBES29(0VGKbHK(_q(1ShLY3?jX?FxEBJ+r;V-P79Jc}ZoOP;ygYw) z6>V>uDQsJqr0PGvwC}vZorg&<01?iYIeb>!7Q0%aRU2Ode4j&Rw^V38ZEA!4@xkD7 z+i0n2bpG6`01IThDQsW3VG1u8+~J}Mzz|+2s-|$b%;B@*G}yHgt=aez(%YyPE^~(; zeYTZph-tVsf7ALihD!ia`BiQ~-i0Xazi|tq<2dvu2cM9G;=M?%pff~sP)-iRi5Qm; z&>R%+K2mzbQU`e`VjAyiI1>g0OQeL&@g z#lxw_UCR^i12{PK2*MZF)=2{m#2X#WG})S&vG0VQrk7@_mzE#4Ugnrs>+tO!nvKV2 zd)-WYAe%|5Q|H0`PAO!(H!5k-bxk}$3nOUWw|8hH(G6)RbP&^ak8Tmt1ISZF$O#C> zfVCsKyYQwGH-Cq@7QxKIs$cho!vt@lo)GX2)^O%Dl!83Qj%GzErAm<0K(a%#oEqs2 zd@!R80-#~^XAILm=<|koW9HGZQrDOiNpbCZoWm3)w*;?+KPx8(yA2^H=ma-au*JhCu1_2Bx{)R_lYoLX z7^Zp9=k*~;Q`m#i%_bT(?I~_D*F{axPziesVULY3A-y;FsgP8CQ^pkb6#?`c+|VD| z02o3)l)_M%!)L{5uxlk+v+*UQw^1)#=7vCcm<^v5v`@f>V|7*-6LAdH$q{&GZ4vF@ zK5mD}5OBwdauYH!2-GW0cnU?^=!{!C;h-R%&dM_>MT^79Kpd=2ClZ_|#FflU+Br<> z*huA`8UVSh#4C|I2V&3=mrmUxLoQ(hCb5sh$L�kgHI|GaC(W_a6FLmi-IEyuqw6 W2mZ{QSqtuFk9YA$R-aj-pvJ^b0n|`HksimmDVZ(hw4zQX6;4URoGObZpX{!#U!%~Q* z(#+Kw8b?A}Bwc&8)`G)YUAq2iJ-{_G^8s^}ouy-ue;~7w7;V;F5GT%xvkC#YeK;%W zr!ulKqh=<~t2sqeGm7L)YgHf<6^7sHlO~kGpxwa-4p0E83Ur>DM69PCIV6W=CQtvi z<^f|56+-#YF!dqGeuI)i*P;|v2o1q!*R5}Zeb3N^nnJ|a&E$5U!iB!ZzBj(!cUOCE z{fh7V8?Kyw{WbPI`g-4e{n^kfzVADqmKQwCf9~g-Uhn(UdVD4KeYY#6-24HB`GFg8 z-I!3Q&(}}xak2bbZrKKn0xP#d^k?&t_j{nld`~6PR5d0Bd+ocKOx01=^8LFYJcQ zbgyr1F7(sTLL4&FhC*203ijII+iRi5*zaQbS(=rZ{zBd+Z-3!zEtoY{$<~Z+cNW*y zHq0}hp@8gd`v#Wm<(V7c`4;bGgD;%_)w(NiYd`;!cf8gvMdde3yKlVK?&fLN@MGGW z&xvV``Npa-ZKv(O^Q?YE-X-%Je0yG0a!y{+7yk01?ic6Q_~jMYS@4`R{0iZl0VuY6 
zL~Mimh~C~_R+W>ym{a4|RSm8SCgKuNL_-nO%yd?j6xNu{DM>z;)l_Mgm9uG4%_MmT zmj#0N^(4#llBj9A0Bo?q^S$dhPK%lV2>lGn$7T#O7I{CbUstUh?ve!WegaU6-iBt8bGs(4rPk(15%v% z1R+_%zb6_*oxxe{B?5|K4di)AQx)KWOMc#bKz67P7~=9!g@QNlxH@AAj%%~$yD+6G zI?ML*X$7W~ye*15s&Tee1`GnIplf>#=_z1wU4aeORZIDE^`ewk5;>BcG$Z15sej<$ z+)f$pl@iW&CDt2AJT1oqRHe$Yub|9v+w;X^!#v{}h)8oVEhDL?=&JABWdqNmGe)0I(b0^sXZYv@jis=NerOb$i z7hZZSY=(ghivcVS%|%6BSBPO@ACBqRc^GDVd!rfOIX-kzI5jeS;(P061JdM1q@yQ> zE}l9!Ql0u+8&RJ>clOlKl`6!NcQXeI#zh~5-|7#bI788JbC;*CK`sL~b@U|kJ^dK& zT_aWPF3!t`=wH%zX_{K-W~f=}cj24#eR>3xT4Sl#R&`sI9l0MeLvc_$S_MwT^G~G2 zB~ZMc5H)2cF|14rXg`W%=)&|sLR5K~z=xdyuaM(mL{fA;fddm@7YPHbSWggf!h!_k z3f{-Q#1wMQEoMA7`=tIT;43pZlA0SqA{*7k5Ux+Dnyd?wXoy;NLKufnOBZwy0zNSC zO&M9Ry;%IHmIZ&Mr{W&Jw_p`I`uM+7|C*;Bc8uMVmPS9ha{tPwJ(c9;Qu1=S_wq+l zsbg$y74Ix>?>RvKy7mBV)~B;Fd`)2ZbK9TqLu2%l(6M7+Os!}R{V|K1?rtY-lrO98xmo}Q+hon&DrY1~QRTkkpLk^9vB5b3n1P-YBg z*9hL=5sXj&ZG9$z=V}KOwmzfDMdQw`d-8JAo>DZqwu<+|x1-JU+=0ON3d*oO0sy{8 z=yATF7+WyIVrKT`wQ@t$1uF)O5IlpDE+{r#A%i&6J}fYEf9XXu62+JzVAWPM`Cl&N zC~7OPv#F&+cYd_Ece$y%6iuyt#p*WObrC#R#loZwm25?cR?Bs*^Unk&-}KGK|91l3 zq;u?^z9fC3-d8_8QW?Bl8oXSNU;aohb&jnOya_iO9494Uyj&N@S8DN z%P)fYYKp&5g4Y`+`|aGUDNfME`0wS<3d~ zDCtF^TCCn+kTV(Z;D%WTX^IFrp-9l75jUcmZv`7~o2=uZfmM*CFW49`Sqv6}fv(Zy+}-dE+wrKe_fhkf z$Msahu2rh0Au>PwI7;p9`K+<$-q0QCL-ogMv9V|V9VkBM+A7@s61Ts|^cT7PONWbG ze}%z!#p$qHURh%L7mnDivJWm>$R3gn0L!Jr9;yolk${6GKv#+DDl)xAu4{3+$n{nj zd{>+fyXBQ7rWXZ3jIs|dTgV=g4FJo=pbr+ewBGDh$3WPKVv{ z$`X@60T83?gUc4Ohhzi5a;eipb-^GKaQLF71Jahoi_0Bj<(9Ge(~r0g+nDa%DskT| zGG~g3GgtzgI8)-zR2Y0$+#b7GwwgBl&?2$nUFiU#PcBDOMJ9#Pr^+0H zB`#HA{C5Qh?T)e!E;A`0u)7}uyq*U7q4SH@P6#-Q@^a@`xpizF_NueO?J9A*7NaFD z0j30{XI7q(dd36@YhUiBqOr9v+o{$|wB7<9>o;w}L1v|6k8p%pITD2ulFo>onT*6Z zvknt-3^!x|kj-s^9iZa22Zx{$C(2Et8W3!}E<8wDeWVX*cEN!mZhLS&4kv^-o+a@h zug8_lRN6TXc0Y~fo*v>DK;m9d!4PV>TPyKfg%)M`?RxFDu1)9Hwdd Y&lL3mwL%^G6Ls>RktE$m&cpD(0VX>8^Z)<= literal 0 HcmV?d00001 diff --git a/tests/rag/__pycache__/test_local_project_reader.cpython-312-pytest-9.0.2.pyc 
b/tests/rag/__pycache__/test_local_project_reader.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ec8529cccd39e1114100d2ece8b0850ac06449a GIT binary patch literal 2902 zcmeGeO=w(2c;4H2`=2yTOd6ZEE+mmW!QHe?OQ|9j8U?W>mKJP4o-aGI$&2r2nR%Ng z%Pth@p`uXn;tJmEp^_fFc@ybL@Ulx%Q|6+e9=sLn#h(1;y`9~qt*gX+3cdt<4^A60uLk5*&mpuvXd~?awqt}eX{A4vRTPUPB3n7j zBS0`QZYQ^)RucMgN$Ouc(vNv$B3sD*Q6Smv?kY4C%5_GdRRI0 zHMFznhF;d!P+5O(7@?-#(AV?IqmMt+KN-QOnP}=H*TfCnNNk{{(J=TI;XY-xKBA~T zCi#X@(a4bWsOa0>jv4S7?b|A2f^$A7wv0C>A;^PA`C;JwPY9v%lHfZ^z z*?X40Muy40FZFs7>XIZQWPc;+Xc3nRr&Q#Dc@HUBA2(8JUWGq|^RcNlanm59&nnsc zZz`GTR#JPao*avM^0{X{`FKxm!9AeC*VL~dbNswW*ww1QBq;NGmQ%LjlxeZLRi7lZ zQeZWI&ODth0K}JY;b^vQ)WXWkr{LO7Q@qiXzBc$^ef+{7aYGJ$>Sy$hP?l|sfz@tK9=93ZN-d&P!6UO8u}T3&KmxUX3U#TB`Dgg~|sVdP#hXaxF9jVN9 zjHx|5*THkQ4({TG-||Ox@`b(pY$rc^vwUmrD{m*iaCPyQ>A9csW^3X4#T%DDzTD26 zR~KQpmz!$inRaffRcq&9a{0U46~x?5#Km-P1fN!|r&nZ84FtITzp*FUc>4M)E!xga z?_v478}l)7i|6IW zLl+g}7KTsuWh4&G zD)Ey#FFNK?e|?P}KpezoVR)E+H4gO}D*-G2kntH9?k6-&`x&8Y=q{T49bLMg%xg2> RhxZYn_R___BxXf0{0GOCe@p-X literal 0 HcmV?d00001 diff --git a/tests/rag/__pycache__/test_query_repository_terms.cpython-312-pytest-9.0.2.pyc b/tests/rag/__pycache__/test_query_repository_terms.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a800eba0b4c6aea2da1a5ad0699a8c633ec1851b GIT binary patch literal 1422 zcmZux&1(}u6rb7bCTWvatqOinU8sT!l4$)ni1>ke5TSm6mtK~2cM`Ykmot;tlmx2w zAR>5FK|FbpUQ|THkNO9A*-%9o1O-ptgnIJio84>@#37mYd++_;o0&In_T$*tD1vcz z(-ZdsM(BrBvZ3_A+BAUs$UzSF&@4`*;^%AyBVV1>u=FTi-Y?7+0IN=JNu4dWu%Z2w z31$o|CN5d^iw#0smq-*cmxr`v_mU@~0CNuhYfG?)`-sDwj^&>0pf;M5jDLoMmwVg} zc9f3NR@%6UI=Oa^J>*(Op37%wARXY;u%pU|nzF2>o}rlHfSaMVlC9=wyp&nz${8BS zY~M}9HV@?-A22f1?w^1TGyk`GpBZ3BbMlXr1vpnnbqY?gt-3hTG`=CrJhbjaW*f-1 zmYpS&H{oXIaYyOoTF4lgX z)7?Oq@PNUD#tHSAp@C5|M;N-H?zC-atK9H1yMabQ|ndz8}V6 zT0~zJSr*MBKbcFU9Tj%c-fZSlZ!6Sl@#`8 z2zWsH?hN+b06wWxYp^bYM#)pc8#I_PMno~YQMwL%Vyl_9LnXZixS|Rr*Qv{)75XMo 
z=-H@z(PA%DwBmzj7GkZwM`+&*I|LfG zqVkT!c4y*0F|-hQyV8?4(S>~bQWmH*c>-B6s)2((l6LD?-l*sVvRGAo)NlX z+b@bpqFsX5Wl;`fxD~i_aOVTHCsjtzw@-p&_(^y>{5o6>SF~^?x)v=(A88=2hAZKn z@NW18jD8!gMmMzZyYQ|Cy4CO-_^g8V%jo@+%kq{oGgFbTxcd*bdPgg)^ll2Y84c02 zPyt9)hbpW}3Gjt3Sj-lMb2UfZ#_sorqBONOJn zMw7&&Iy347Bg8vS@q%p6Jxi%aVe*2h8eW<+IqY87AzC184g*jogp2)MZI}q`b_Uk*9=>;sNEs1 zX0%!wjF^P68V)pw>(_9dSdAJDj`VkMBh{0KRBiR*1AHB`TD{E((=E?r`lX(3k-E`# ze7Ya%$-~XoL0px79^bb%{=(IBKb3cFlqaI{M5uPd$%`NnPhMOr8-Ujd-5}kFba50k zfXm$t6~Fg|Cg~1jq;@|zn)KF&a2>5D67@ZvX^6|lz&QbFVx9(KU=bW+5R#^U*ku#u zB|YXATm`&_jPaOT{!iR;7M^YRS$HS(UoN6Q6i^e`H3g~fJgIvwrw7}# z{X{_tY^bp9(h7vYgW*5#nn5^O57mJ0n)S7E1MvF7;>JP{Ed+o=&;Tw68!CS93r*4; z$Vd%v*F=-GAzVjm*fmjo5AbAth|7%x4`iBH#K0mr#vlu6`hjr*z=a-w*;N#ved4#N zd%iLjj|k@F3m)&)6Rxf%ObPr6hm0<@ec;`JDlVov{85+;)>C&-cLJt)PbFk3%)fXZ$R|`HMe}Wr^3!pYGPki6qM_bQd|zcw;NQ2Wx8#HAF0_6pe*)3X>8JMF@5bGr7rC+9qwJbm z^pXmP`^X4l2k)G^@Nzo5K3&Ck>q+m2t^Qc zRSy#g=IRXLA13|K&s3xrCfD7N#BM~@(p%Ydf+SvNEMe+2=_V|41)g?9tF~6NkqKFr&_}GTG%<5kiNb7 zGQ0%4=(mNJq|FW%`0n~2t#i^`y>o3sWoz}w)Zbywb8?_b4oeGy2`$|PEZqP=^{}-p z&GWLgJn75U`6E+*hqH#99b_!Wr3KBTU-}DJx&eUfVe5HmuH1QTV#(IZk*UAKoaf{~ zlN^>71QV-t7qD~#_|;rFh8!U{xphWGpT!$aQ_U6Uis=VJohA99Z0AP;lG|qV1`7ln z$g)W_W^HUyYc9a{wxh)Fhm^NrKS3c_KnHHvt~}=1q0e^eYs8j8$IgOzH1~S0e1ok5 c)ekv81?KM>#`t%HZlgoA{!iV)FJ_wm0B None: + pipeline = CodeIndexingPipeline() + content = """ +from fastapi import APIRouter + +router = APIRouter() + +class UserService: + def get_user(self, user_id): + return user_id + +@router.get("/users/{user_id}") +async def get_user(user_id: str): + service = UserService() + return service.get_user(user_id) +""" + docs = pipeline.index_file( + repo_id="acme/proj", + commit_sha="abc123", + path="app/api/users.py", + content=content, + ) + + layers = {doc.layer for doc in docs} + assert RagLayer.CODE_SOURCE_CHUNKS in layers + assert RagLayer.CODE_SYMBOL_CATALOG in layers + assert RagLayer.CODE_DEPENDENCY_GRAPH in layers + assert RagLayer.CODE_ENTRYPOINTS in layers + + symbol_doc = next(doc for doc 
in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["kind"] == "function") + assert "get_user" in symbol_doc.metadata["qname"] + + edge_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_DEPENDENCY_GRAPH) + assert edge_doc.metadata["edge_type"] in {"calls", "imports", "inherits"} + + entry_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_ENTRYPOINTS) + assert entry_doc.metadata["framework"] == "fastapi" + + +def test_code_pipeline_indexes_import_alias_as_symbol() -> None: + pipeline = CodeIndexingPipeline() + content = "from .v2 import ConfigManagerV2 as ConfigManager\n" + + docs = pipeline.index_file( + repo_id="acme/proj", + commit_sha="abc123", + path="src/config_manager/__init__.py", + content=content, + ) + + alias_doc = next(doc for doc in docs if doc.layer == RagLayer.CODE_SYMBOL_CATALOG and doc.metadata["qname"] == "ConfigManager") + assert alias_doc.metadata["kind"] == "const" + assert alias_doc.metadata["lang_payload"]["import_alias"] is True diff --git a/tests/rag/test_docs_indexing_pipeline.py b/tests/rag/test_docs_indexing_pipeline.py new file mode 100644 index 0000000..675d3aa --- /dev/null +++ b/tests/rag/test_docs_indexing_pipeline.py @@ -0,0 +1,63 @@ +from app.modules.rag.contracts.enums import RagLayer +from app.modules.rag.indexing.docs.pipeline import DocsIndexingPipeline + + +def test_docs_pipeline_builds_catalog_facts_sections_and_policy() -> None: + pipeline = DocsIndexingPipeline() + content = """--- +id: api.billing.create_invoice +type: policy +domain: billing +links: + calls_api: + - api.billing.validate_invoice +tags: [billing] +status: active +--- +# Create Invoice + +## Spec Summary + +Creates an invoice in billing. 
+ +## Request Contract + +| field | type | required | validation | +| --- | --- | --- | --- | +| amount | decimal | yes | > 0 | + +## Error Matrix + +| status | error | client action | +| --- | --- | --- | +| 400 | invalid_amount | fix request | + +## Rules + +- metric: billing.invoice.created +- rule: amount must be positive +""" + docs = pipeline.index_file( + repo_id="acme/proj", + commit_sha="abc123", + path="docs/billing/create_invoice.md", + content=content, + ) + + layers = {doc.layer for doc in docs} + assert RagLayer.DOCS_MODULE_CATALOG in layers + assert RagLayer.DOCS_FACT_INDEX in layers + assert RagLayer.DOCS_SECTION_INDEX in layers + assert RagLayer.DOCS_POLICY_INDEX in layers + + module_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_MODULE_CATALOG) + assert module_doc.metadata["module_id"] == "api.billing.create_invoice" + assert module_doc.metadata["type"] == "policy" + + fact_texts = [doc.text for doc in docs if doc.layer == RagLayer.DOCS_FACT_INDEX] + assert any("calls_api" in text for text in fact_texts) + assert any("has_field" in text for text in fact_texts) + assert any("returns_error" in text for text in fact_texts) + + section_doc = next(doc for doc in docs if doc.layer == RagLayer.DOCS_SECTION_INDEX) + assert section_doc.metadata["section_path"] diff --git a/tests/rag/test_query_router.py b/tests/rag/test_query_router.py new file mode 100644 index 0000000..8da84d7 --- /dev/null +++ b/tests/rag/test_query_router.py @@ -0,0 +1,12 @@ +from app.modules.rag.contracts.enums import RetrievalMode +from app.modules.rag.retrieval.query_router import RagQueryRouter + + +def test_query_router_uses_docs_by_default() -> None: + router = RagQueryRouter() + assert router.resolve_mode("Какие есть требования по биллингу?") == RetrievalMode.DOCS + + +def test_query_router_switches_to_code_on_explicit_code_requests() -> None: + router = RagQueryRouter() + assert router.resolve_mode("Объясни как работает код endpoint create invoice") == 
RetrievalMode.CODE diff --git a/tests/rag/test_query_terms.py b/tests/rag/test_query_terms.py new file mode 100644 index 0000000..654ec3c --- /dev/null +++ b/tests/rag/test_query_terms.py @@ -0,0 +1,9 @@ +from app.modules.rag.retrieval.query_terms import extract_query_terms + + +def test_extract_query_terms_from_code_question() -> None: + terms = extract_query_terms("Объясни по коду как можно управлять COnfigmanager?") + + assert "configmanager" in terms + assert "config_manager" in terms + assert "control" in terms