
    1h'                        d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Zd dlmZ  e ee      j                  d             ej                   j#                  dd	      Z eej                   j#                  d
d            Z eej                   j#                  dd            Z eej                   j#                  dd            Zd Zdedededee   fdZd Zdee   deee      fdZe G d d             Zdedeeeef      fdZdeefdededed ed!ed"edee   fd#Zd$ Z d%ee   defd&Z!d^dedede	e   d edef
d'Z"efd(eded)ede#fd*Z$d_d+eded edefd,Z%d-Z&d(ed.edefd/Z'e(d0k(  rd dl)Z) e)jT                  d12      Z+e+jY                  d3ed4d56       e+jY                  d7ed89       e+jY                  d:ed;9       e+jY                  d<ed=9       e+jY                  d>ed?d@       e+jY                  dAedB9       e+jY                  dCedD9       e+jY                  dEedF9       e+jY                  dGedH9       e+j[                         Z.e.j^                  re.j^                  Ze.j`                  re.j`                  Ze.jb                  re.jb                  Ze.jd                  re.jd                  Ze.jf                  rI e4dIe.jf                           e"e.jf                  e.jj                  e.jl                  J      Z7 e4dKe7        e.jp                  rI e4dLe.jp                           e%e.jp                  e.jj                  e.jl                  J      Z7 e4dMe7        e.jr                  r e4dNe.jj                   dOe.jr                           e$e.jr                  e.jj                  eP      Z: e4dQ       e:j#                  dRg       D ]  Z; e4dSe;dT   dUdVe;dW    dXe;dY             e4dZ        e4e:j#                  d[d\      dd]        yyy)`    )PathN)	dataclass)ListTupleOptional)load_dotenvz.env)dotenv_pathOPENAI_EMBED_MODELztext-embedding-3-smallRAG_CHUNK_SIZE500RAG_CHUNK_OVERLAP80	RAG_TOP_K3c                  ,    dd l } | j                  d      S )Nr   cl100k_base)tiktokenget_encoding)r   s    /var/www/html/dynbot/rag_db.py_get_tokenizerr      s      //    text
max_tokensoverlapreturnc                 *   t               }|j                  |       }g }d}t        |      }||k  rct        ||z   |      }||| }	|j	                  |	      j                         }
|
r|j                  |
       ||k(  r	 |S t        d||z
        }||k  rc|S )Nr   )r   encodelenmindecodestripappendmax)r   r   r   enctokenschunksstartnendchunk_tokens
chunk_texts              r   _split_by_tokensr,      s    

CZZFFEFA
!)%*$a(eC(ZZ-335
MM*%!8M AsW}% !) Mr   c                 T   t        j                  | t         j                        } t        j                  |t         j                        }| t         j                  j	                  | dd      dz   z  }|t         j                  j	                  |dd      dz   z  }||j
                  z  S )N)dtype   T)axiskeepdimsg-q=)npasarrayfloat32linalgnormT)aba_normb_norms       r   _cosine_sim_matrixr<   -   s}    


1BJJ'A


1BJJ'A"))..T.:UBCF"))..T.:UBCFFHHr   textsc                     ddl m}  |       }|j                  j                  t        |       }|j
                  D cg c]  }|j                   c}S c c}w )Nr   )OpenAI)modelinput)openair?   
embeddingscreateEMBED_MODELdata	embedding)r=   r?   clientrespds        r   _embed_textsrK   5   sC    XF##+U#CD!%+AAKK+++s   Ac                   Z    e Zd ZU eed<   eed<   eed<   dZee   ed<   dZee	e
      ed<   y)RAGChunk	tenant_idtitlecontentNtagsrG   )__name__
__module____qualname__int__annotations__strrQ   r   rG   r   float r   r   rM   rM   <   s2    NJLD(3-'+IxU$+r   rM   pdf_pathc                     dd l }g }|j                  |       5 }t        |      D ]A  \  }}|j                  d      xs dj	                         }|s,|j                  |dz   |f       C 	 d d d        |S # 1 sw Y   |S xY w)Nr   r    r/   )fitzopen	enumerateget_textr!   r"   )rZ   r]   pagesdocipager   s          r   extract_pdf_textre   E   s    E	8	 , ~ 	,GAtMM&)/R668Da!eT]+	,,
 L,
 Ls   7A3A33A=pdf,ragrN   
base_titlerQ   chunk_size_tokensoverlap_tokensc           
          g }t        |       }|D ]c  \  }}	t        |	||      }
t        |
      D ]C  \  }}|j                         s| d|dd|dz   d}|j	                  t        ||||             E e |S )Nz :: p02dz :: cr/   )rN   rO   rP   rQ   )re   r,   r_   r!   r"   rM   )rZ   rN   rg   rQ   rh   ri   r&   
page_textspage_nor   piecesidxpiecerO   s                 r   make_chunks_from_pdfrq   P   s      F!(+J# `!$(9>J#F+ 	`JC;;=!l%}E#a%EEMM(YeUY]^_		`` Mr   c                      	 ddl m}  	 ddlm} | |t        d      | |fS # t        $ r d } Y 'w xY w# t        $ r d }Y 1w xY w)Nr   )SessionLocal)Documentz\Could not import SessionLocal or Document. Adjust `_import_models()` to your project layout.)apprs   	Exceptionmodelsrt   ImportError)rs   rt   s     r   _import_modelsry   b   sg    $# x/j
 	
 !!    s   " 3 00A Ar&   c                 >   t               \  }} |       }	 d}d}t        dt        |       |      D ]  }| |||z    }|D cg c]  }|j                   }	}t	        |	      }
t        ||
      D ]e  \  }}t        j                  |      } ||j                  d |j                  |j                  ||j                  xs d      }|j                  |       g |j                          |t        |      z  } ||j                          S c c}w # |j                          w xY w)N   r   rf   )rN   case_idrO   rP   rG   rQ   )ry   ranger   rP   rK   zipjsondumpsrN   rO   rQ   addcommitclose)r&   rs   rt   sessionBATCHrows_writtenrc   batchcr=   vectorsvecemb_jsonrb   s                 r   upsert_chunks_to_dbr   r   s   +-L(nGq#f+u- 	'A1QuW%E(-.1QYY.E."5)GeW- 
!3::c?kk ''II&,9 C 
! NNCJ&L#	'$ % /$ 	s   )D
 DB"D
 D
 
Dc                     t        |       } |t        |       j                  }t        | |||t        t
              }t        |      }|S )N)rN   rg   rQ   rh   ri   )rW   r   stemrq   CHUNK_SIZE_TOKENSCHUNK_OVERLAP_TOKENSr   )rZ   rN   rg   rQ   r&   writtens         r   preprocess_pdf_to_dbr      sM    8}H(^((
!++F "&)GNr   
user_querytop_kc           
      v   t               \  }} |       }	 |j                  |      j                  |j                  |k(  |j                  j                  d             j                         }|sdg |dd|j                          S g g g g f\  }}}	}
|D ]  }	 t        j                  |j                        }|j                  |       |j                  |j                         |	j                  |j                         |
j                  |j                          |sdg |dd|j                          S t        | g      d   }t!        |g|      d   }t#        j$                  |       d | }g }t'        |d      D ]:  \  }}|j                  |t)        |
|         |	|   t+        ||         ||   d       < g }|D ]3  }|j                  d	|d
    d|d    d|d   j-                                 5 dj/                  |      }|||d|j                          S # t        $ r Y w xY w# |j                          w xY w)Nr\   z&No embedded documents for this tenant.)contextmatchesr   warningzNo valid embeddings parsed.r   r/   )r'   )rankdocument_idrO   
similarityrP   [r   z] rO   
rP   z

---

)r   r   r   )ry   queryfilterrN   rG   isnotallr   r   loadsr"   rP   rO   idrv   rK   r<   r2   argsortr_   rU   rX   r!   join)r   rN   r   rs   rt   r   docsrC   r=   titlesidsrJ   r   	query_vecsimstop_idxr   r   ro   context_partsmr   s                         r   query_rag_context_from_dbr      sA   +-L(nG-}}X&--)+$$T*
 #% 	
 !b5MuvL 	I *,RR&
E63 	Ajj-!!#&QYY'agg&

144 	 !b5Mjk0 	- !*.q1	!9+z:1=**dU#FU+"7!4 	ID#NN"3s8}#DI. : 	  	XA  1QvYKr!G*R)@R@R@T?U!VW	X$$]3"wG9  8 	s>   AH& H& BH
H& 1CH& 	H#H& "H##H& &H8folderc                     t        |       }|j                         st        d|        d}t        |j	                  d            D ]'  }|t        t        |      ||j                  |      z  }) |S )NzFolder not found: r   z*.pdf)rN   rg   rQ   )r   existsFileNotFoundErrorsortedglobr   rW   r   )r   rN   rQ   ptotalpdfs         r   preprocess_folder_pdfs_to_dbr      sq    VA88:"4VH =>>EaffWo& e%c#h)PSPXPX_cddeLr   u  Du bist ein präziser Assistent. Nutze den folgenden Kontext aus den Realcube Domumenten, falls relevant, um die Nutzerfrage zu beantworten.
Wenn der Kontext nicht passt, antworte ohne ihn zu zitieren. Zitiere kurz in Klammern die Quelle [Titel] bei Verwendung. Realcube ist eine Daten-Warehouse Plattform, 
die alle denkbaren digitalen Immobilien-Daten sinnvoll sammelt und verarbeitet. Realcube arbeitet auch zunehmend mit KI. Entscheidend ist, dass der hier generierte
Kontent zum Ausdruck bringt, dass mit der Realcube Plattform Dinge möglich sind, die sonst nicht möglich sind und dass die Realcube Plattform perfekte Antowrten auf
die gestellten Fragen liefert.   

KONTEXT:
{context}

NUTZERFRAGE:
{question}

ANTWORT (prägnant, fachlich, deutsch):rag_contextc                 8    t         j                  |xs d|       S )Nz"(kein relevanter Kontext gefunden))r   question)RAG_PROMPT_TEMPLATEformat)r   r   s     r   build_rag_augmented_promptr      s    %%k.a=alv%wwr   __main__z1Preprocess PDFs into DB and/or query RAG context.)descriptionz--tenantr/   z	Tenant ID)typedefaulthelpz--pdfz"Path to a single PDF to preprocess)r   r   z--folderz(Path to a folder with PDFs to preprocessz--queryzQuery to test retrievalz--tagszComma-separated tags to store)r   r   r   z--embed-modelzEmbedding model idz--chunk-sizezChunk size in tokensz--chunk-overlapzOverlap in tokensz--top-kzTop K retrievalzPreprocessing PDF: )rN   rQ   zStored chunks: zPreprocessing folder: zStored chunks (all PDFs): zQuerying RAG for tenant z: )rN   r   zTop matches:r   z- (r   z.3fz) rO   z  id=r   z
---
Context preview:
r   r\   i  )Nrf   )rf   )<pathlibr   r   osdataclassesr   typingr   r   r   numpyr2   dotenvr   __file__	with_nameenvirongetrE   rU   r   r   DEFAULT_TOP_Kr   rW   r,   r<   rX   rK   rM   re   rq   ry   r   r   dictr   r   r   r   rR   argparseArgumentParserparseradd_argument
parse_argsargsembed_model
chunk_sizechunk_overlapr   r   printtenantrQ   cntr   r   resr   rY   r   r   <module>r      s     	 ! ( (   X008 9
 jjnn13KL

'7?@ 2::>>*=tDE BJJNN;450
3 C # $s) $,S	 ,d4;&7 , , , ,s tE#s(O'<  &/2C/C3 $'%(  # -0	
 *-
 IMX$" X 3 :3 3 HSM `c tw   MZ 0# 0# 0c 0^b 0f  C X[ + x3 xS xS x z$X$$1deF

akJ
c0TU

3]^
	2KL
s1PZcd c8LM
S7MN
):MN
	2CDD && OO#11zz

xx#DHH:./"488t{{Su%&{{&t{{m45*4;;$++TXT]T]^*3%01zz(R

|DE'

dkkQ^_nB' 	TAC,,Bqzl%-@P?QRS	T)*cggi$Ud+, M r   