
    h!                        d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Zej                  j                  dd      Z eej                  j                  dd            Z eej                  j                  d	d
            Z eej                  j                  dd            Zd Zdedededee   fdZd Zdee   deee      fdZe G d d             Zdedeeeef      fdZdeefdededededededee   fd Zd! Zd"ee   defd#ZdPdedede	e   dedef
d$Zefd%eded&edefd'Z dQd(edededefd)Z!d*Z"d%ed+edefd,Z#e$d-k(  rd dl%Z% e%jL                  d./      Z'e'jQ                  d0ed1d23       e'jQ                  d4ed56       e'jQ                  d7ed86       e'jQ                  d9ed:6       e'jS                         Z*e*jV                  r> e,d;e*jV                           ee*jV                  e*jZ                  <      Z. e,d=e.        e*j^                  r> e,d>e*j^                           e!e*j^                  e*jZ                  <      Z. e,d?e.        e*j`                  r e,d@e*jZ                   dAe*j`                           e e*j`                  e*jZ                  eB      Z1 e,dC       e1j                  dDg       D ]  Z2 e,dEe2dF   dGdHe2dI    dJe2dK             e,dL        e,e1j                  dMdN      ddO        yyy)R    )PathN)	dataclass)ListTupleOptionalOPENAI_EMBED_MODELztext-embedding-3-smallRAG_CHUNK_SIZE500RAG_CHUNK_OVERLAP80	RAG_TOP_K3c                  ,    dd l } | j                  d      S )Nr   cl100k_base)tiktokenget_encoding)r   s    /var/www/html/dynbot/rag_db.py_get_tokenizerr      s      //    text
max_tokensoverlapreturnc                     t               }|j                  |       }g }d}t        |      }||k  rNt        ||z   |      }||| }	|j	                  |j                  |	             ||k(  r	 |S ||z
  }|dk  rd}||k  rN|S )Nr   )r   encodelenminappenddecode)
r   r   r   enctokenschunksstartnendchunk_tokenss
             r   _split_by_tokensr'      s    

CZZFFEFA
!)%*$a(eC(cjj./!8 M g19E !) Mr   c                 T   t        j                  | t         j                        } t        j                  |t         j                        }| t         j                  j	                  | dd      dz   z  }|t         j                  j	                  |dd      dz   z  }||j
                  z  S )N)dtype   T)axiskeepdimsg-q=)npasarrayfloat32linalgnormT)aba_normb_norms       r   _cosine_sim_matrixr7   -   s}    


1BJJ'A


1BJJ'A"))..T.:UBCF"))..T.:UBCFFHHr   textsc                     ddl m}  |       }|j                  j                  t        |       }|j
                  D cg c]  }|j                   }}|S c c}w )Nr   )OpenAI)modelinput)openair:   
embeddingscreateEMBED_MODELdata	embedding)r8   r:   clientrespdvectorss         r   _embed_textsrG   5   sL    XF##+U#CD$(II.qq{{.G.N /s   Ac                   Z    e Zd ZU eed<   eed<   eed<   dZee   ed<   dZee	e
      ed<   y)RAGChunk	tenant_idtitlecontentNtagsrB   )__name__
__module____qualname__int__annotations__strrM   r   rB   r   float r   r   rI   rI   =   s2    NJLD(3-'+IxU$+r   rI   pdf_pathc                     dd l }g }|j                  |       5 }t        |      D ]/  \  }}|j                  d      }|s|j	                  |dz   |f       1 	 d d d        |S # 1 sw Y   |S xY w)Nr   r   r*   )fitzopen	enumerateget_textr   )rV   rX   pagesdocipager   s          r   extract_pdf_textr`   F   sr    E	8	 , ~ 	,GAt==(Da!eT]+	,,
 L,
 Ls   %A!A!!A+pdf,ragrJ   
base_titlerM   chunk_size_tokensoverlap_tokensc           
          g }t        |       }|D ]R  \  }}	t        |	||      }
t        |
      D ]2  \  }}| d|dd|dz   d}|j                  t	        ||||             4 T |S )Nz :: p02dz :: cr*   )rJ   rK   rL   rM   )r`   r'   rZ   r   rI   )rV   rJ   rb   rM   rc   rd   r"   
page_textspage_nor   piecesidxpiecerK   s                 r   make_chunks_from_pdfrl   Q   s      F!(+J# `!$(9>J#F+ 	`JC!l%}E#a%EEMM(YeUY]^_	``
 Mr   c                      	 ddl m}  	 ddlm} | |t        d      | |fS # t        $ r d } Y 'w xY w# t        $ r d }Y 1w xY w)Nr   )SessionLocal)Documentz\Could not import SessionLocal or Document. Adjust `_import_models()` to your project layout.)apprn   	Exceptionmodelsro   ImportError)rn   ro   s     r   _import_modelsrt   a   sg    $# x/j
 	
 !!    s   " 3 00A Ar"   c                 >   t               \  }} |       }	 d}d}t        dt        |       |      D ]  }| |||z    }|D cg c]  }|j                   }	}t	        |	      }
t        ||
      D ]e  \  }}t        j                  |      } ||j                  d |j                  |j                  ||j                  xs d      }|j                  |       g |j                          |t        |      z  } ||j                          S c c}w # |j                          w xY w)N   r   ra   )rJ   case_idrK   rL   rB   rM   )rt   ranger   rL   rG   zipjsondumpsrJ   rK   rM   addcommitclose)r"   rn   ro   sessionBATCHrows_writtenr^   batchcr8   rF   vecemb_jsonr]   s                 r   upsert_chunks_to_dbr   q   s   +-L(nGq#f+u- 	'A1QuW%E(-.1QYY.E."5)GeW- 
!3::c?kk ''II&,9 C 
! NNCJ&L#	'$ % /$ 	s   )D
 DB"D
 D
 
Dc                 ~    t        |       } |t        |       j                  }t        | |||      }t	        |      }|S )NrJ   rb   rM   )rS   r   stemrl   r   )rV   rJ   rb   rM   r"   writtens         r   preprocess_pdf_to_dbr      s@    8}H(^((
!(iJ]abF!&)GNr   
user_querytop_kc           
      v   t               \  }} |       }	 |j                  |      j                  |j                  |k(  |j                  j                  d             j                         }|sdg |dd|j                          S g g g g f\  }}}	}
|D ]  }	 t        j                  |j                        }|j                  |       |j                  |j                         |	j                  |j                         |
j                  |j                          |sdg |dd|j                          S t        | g      d   }t!        |g|      d   }t#        j$                  |       d | }g }t'        |d      D ]:  \  }}|j                  |t)        |
|         |	|   t+        ||         ||   d       < g }|D ]3  }|j                  d	|d
    d|d    d|d   j-                                 5 dj/                  |      }|||d|j                          S # t        $ r Y w xY w# |j                          w xY w)N z&No embedded documents for this tenant.)contextmatchesr   warningzNo valid embeddings parsed.r   r*   )r#   )rankdocument_idrK   
similarityrL   [r   z] rK   
rL   z

---

)r   r   r   )rt   queryfilterrJ   rB   isnotallr~   rz   loadsr   rL   rK   idrq   rG   r7   r-   argsortrZ   rQ   rT   stripjoin)r   rJ   r   rn   ro   r   docsr>   r8   titlesidsrE   r   	query_vecsimstop_idxr   r   rj   context_partsmr   s                         r   query_rag_context_from_dbr      sA   +-L(nG-}}X&--)+$$T*
 #% 	
 !b5MuvL 	I *,RR&
E63 	Ajj-!!#&QYY'agg&

144 	 !b5Mjk0 	- !*.q1	!9+z:1=**dU#FU+"7!4 	ID#NN"3s8}#DI. : 	  	XA  1QvYKr!G*R)@R@R@T?U!VW	X$$]3"wG9  8 	s>   AH& H& BH
H& 1CH& 	H#H& "H##H& &H8folderc                     t        |       }|j                         st        d|        d}t        |j	                  d            D ]'  }|t        t        |      ||j                  |      z  }) |S )NzFolder not found: r   z*.pdfr   )r   existsFileNotFoundErrorsortedglobr   rS   r   )r   rJ   rM   ptotalpdfs         r   preprocess_folder_pdfs_to_dbr      sq    VA88:"4VH =>>EaffWo& e%c#h)PSPXPX_cddeLr   ub  Du bist ein präziser Assistent. Nutze den folgenden Kontext aus den House of PM Best-Practices, falls relevant, um die Nutzerfrage zu beantworten.
Wenn der Kontext nicht passt, antworte ohne ihn zu zitieren. Zitiere kurz in Klammern die Quelle [Titel] bei Verwendung.

KONTEXT:
{context}

NUTZERFRAGE:
{question}

ANTWORT (prägnant, fachlich, deutsch):rag_contextc                 8    t         j                  |xs d|       S )Nz"(kein relevanter Kontext gefunden))r   question)RAG_PROMPT_TEMPLATEformat)r   r   s     r   build_rag_augmented_promptr      s    %%k.a=alv%wwr   __main__z1Preprocess PDFs into DB and/or query RAG context.)descriptionz--tenantr*   z	Tenant ID)typedefaulthelpz--pdfz"Path to a single PDF to preprocess)r   r   z--folderz(Path to a folder with PDFs to preprocessz--queryzQuery to test retrievalzPreprocessing PDF: )rJ   zStored chunks: zPreprocessing folder: zStored chunks (all PDFs): zQuerying RAG for tenant z: )rJ   r   zTop matches:r   z- (r   z.3fz) rK   z  id=r   z
---
Context preview:
r   r   i  )Nra   )ra   )3pathlibr   rz   osdataclassesr   typingr   r   r   numpyr-   environgetr@   rQ   CHUNK_SIZE_TOKENSCHUNK_OVERLAP_TOKENSDEFAULT_TOP_Kr   rS   r'   r7   rT   rG   rI   r`   rl   rt   r   r   dictr   r   r   r   rN   argparseArgumentParserparseradd_argument
parse_argsargsr   printtenantcntr   r   resr   rU   r   r   <module>r      s     	 ! ( (  jjnn13KL

'7?@ 2::>>*=tDE BJJNN;450
3 C # $s) $S	 d4;&7  , , ,s tE#s(O'<  &/2C/C3 $'%(  # -0	
 *-
 IMX " X 3 :3 3 HSM `c tw  MZ 0# 0# 0c 0^b 0f  C X[ 	+ x3 xS xS x z$X$$1deF

akJ
c0TU

3]^
	2KLDxx#DHH:./"488t{{Cu%&{{&t{{m45*4;;$++N*3%01zz(R

|DE'

dkkQ^_nB' 	TAC,,Bqzl%-@P?QRS	T)*cggi$Ud+, ' r   