Skip to content

Commit ad4f64c

Browse files
authored
Merge pull request #281 from rmusser01/dev
Fixes + RAG Enhancements
2 parents ff73357 + fb051e7 commit ad4f64c

17 files changed

+762
-350
lines changed

.gitignore

32 Bytes
Binary file not shown.

App_Function_Libraries/Audio/Audio_Transcription_Lib.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@
5050

5151
class WhisperModel(OriginalWhisperModel):
5252
tldw_dir = os.path.dirname(os.path.dirname(__file__))
53-
default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')
53+
default_download_root = os.path.join(tldw_dir, 'models', 'Whisper')
5454

5555
valid_model_sizes = [
5656
"tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
@@ -207,8 +207,8 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
207207

208208
try:
209209
_, file_ending = os.path.splitext(audio_file_path)
210-
out_file = audio_file_path.replace(file_ending, ".segments.json")
211-
prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
210+
out_file = audio_file_path.replace(file_ending, "-whisper_model-"+whisper_model+".segments.json")
211+
prettified_out_file = audio_file_path.replace(file_ending, "-whisper_model-"+whisper_model+".segments_pretty.json")
212212
if os.path.exists(out_file):
213213
logging.info("speech-to-text: Segments file already exists: %s", out_file)
214214
with open(out_file) as f:

App_Function_Libraries/Chunk_Lib.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -476,22 +476,22 @@ def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100, unit='
476476
#
477477
# Embedding Chunking
478478

479-
def chunk_for_embedding(text: str, file_name: str, full_summary: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
479+
def chunk_for_embedding(text: str, file_name: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
480480
options = chunk_options.copy()
481481
if custom_chunk_options:
482482
options.update(custom_chunk_options)
483483

484+
logging.info(f"Chunking options: {options}")
484485
chunks = improved_chunking_process(text, options)
485486
total_chunks = len(chunks)
487+
logging.info(f"Total chunks created: {total_chunks}")
486488

487489
chunked_text_with_headers = []
488490
for i, chunk in enumerate(chunks, 1):
489491
chunk_text = chunk['text']
490492
chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
491-
492493
chunk_header = f"""
493494
Original Document: {file_name}
494-
Full Document Summary: {full_summary or "Full document summary not available."}
495495
Chunk: {i} of {total_chunks}
496496
Position: {chunk_position}
497497

App_Function_Libraries/DB/DB_Manager.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@
5050
check_media_and_whisper_model as sqlite_check_media_and_whisper_model, \
5151
create_document_version as sqlite_create_document_version,
5252
get_document_version as sqlite_get_document_version, sqlite_search_db, add_media_chunk as sqlite_add_media_chunk,
53-
sqlite_update_fts_for_media, sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
53+
sqlite_update_fts_for_media, get_unprocessed_media as sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
5454
search_media_database as sqlite_search_media_database, mark_as_trash as sqlite_mark_as_trash, \
5555
get_media_transcripts as sqlite_get_media_transcripts, get_specific_transcript as sqlite_get_specific_transcript, \
5656
get_media_summaries as sqlite_get_media_summaries, get_specific_summary as sqlite_get_specific_summary, \
@@ -68,7 +68,7 @@
6868
get_workflow_chat as sqlite_get_workflow_chat, update_media_content_with_version as sqlite_update_media_content_with_version, \
6969
check_existing_media as sqlite_check_existing_media, get_all_document_versions as sqlite_get_all_document_versions, \
7070
fetch_paginated_data as sqlite_fetch_paginated_data, get_latest_transcription as sqlite_get_latest_transcription, \
71-
71+
mark_media_as_processed as sqlite_mark_media_as_processed,
7272
)
7373
#
7474
# Local Imports
@@ -417,7 +417,7 @@ def update_fts_for_media(media_id: int):
417417
raise ValueError(f"Unsupported database type: {db_type}")
418418

419419

420-
def get_unprocessed_media():
420+
def get_unprocessed_media(*args, **kwargs):
421421
if db_type == 'sqlite':
422422
return sqlite_get_unprocessed_media(db)
423423
elif db_type == 'elasticsearch':
@@ -427,6 +427,16 @@ def get_unprocessed_media():
427427
raise ValueError(f"Unsupported database type: {db_type}")
428428

429429

430+
def mark_media_as_processed(*args, **kwargs):
    """Dispatch ``mark_media_as_processed`` to the backend selected by ``db_type``.

    All positional and keyword arguments are forwarded unchanged to
    ``sqlite_mark_media_as_processed`` when the backend is SQLite, and its
    return value is passed straight back to the caller.

    Raises:
        NotImplementedError: when ``db_type`` is ``'elasticsearch'``.
        ValueError: when ``db_type`` is any other value.
    """
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of mark_media_as_processed not yet implemented")
    if db_type != 'sqlite':
        raise ValueError(f"Unsupported database type: {db_type}")
    return sqlite_mark_media_as_processed(*args, **kwargs)
438+
439+
430440
#
431441
# End of DB-Ingestion functions
432442
############################################################################################################

App_Function_Libraries/DB/SQLite_DB.py

Lines changed: 38 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -301,7 +301,8 @@ def create_tables(db) -> None:
301301
is_trash BOOLEAN DEFAULT 0,
302302
trash_date DATETIME,
303303
vector_embedding BLOB,
304-
chunking_status TEXT DEFAULT 'pending'
304+
chunking_status TEXT DEFAULT 'pending',
305+
vector_processing INTEGER DEFAULT 0
305306
)
306307
''',
307308
'''
@@ -564,11 +565,14 @@ def sqlite_update_fts_for_media(db, media_id: int):
564565
conn.commit()
565566

566567

567-
def sqlite_get_unprocessed_media(db):
568-
with db.get_connection() as conn:
569-
cursor = conn.cursor()
570-
cursor.execute("SELECT id, content, type FROM Media WHERE id NOT IN (SELECT DISTINCT media_id FROM MediaChunks)")
571-
return cursor.fetchall()
568+
def get_unprocessed_media(db):
    """Fetch the Media rows whose ``vector_processing`` flag is still 0.

    The SELECT asks for ``id``, ``content``, ``type`` and ``file_name`` (the
    title, or an empty string when the title is NULL), ordered by ``id``.

    Args:
        db: object exposing ``execute_query(query)``.

    Returns:
        Whatever ``db.execute_query`` returns for the SELECT statement.
    """
    # Assembled line by line; the resulting text is the same statement,
    # including the leading and trailing newline.
    sql_lines = (
        "",
        "SELECT id, content, type, COALESCE(title, '') as file_name",
        "FROM Media",
        "WHERE vector_processing = 0",
        "ORDER BY id",
        "",
    )
    return db.execute_query("\n".join(sql_lines))
572576

573577
def get_next_media_id():
574578
try:
@@ -580,8 +584,18 @@ def get_next_media_id():
580584
finally:
581585
conn.close()
582586

587+
588+
def mark_media_as_processed(database, media_id):
    """Set ``vector_processing = 1`` on the Media row with the given id.

    Args:
        database: object exposing ``execute_query(query, params)``.
        media_id: value bound to the ``id = ?`` placeholder.

    Raises:
        Exception: any failure inside the ``try`` body is logged through
            ``logger.error`` and then re-raised unchanged.
    """
    update_sql = "UPDATE Media SET vector_processing = 1 WHERE id = ?"
    try:
        database.execute_query(update_sql, (media_id,))
        logger.info(f"Marked media_id {media_id} as processed")
    except Exception as err:
        logger.error(f"Error marking media_id {media_id} as processed: {err}")
        raise
596+
583597
#
584-
# End of Media-related Functions
598+
# End of Vector-chunk-related Functions
585599
#######################################################################################################################
586600

587601

@@ -2896,6 +2910,23 @@ def update_media_table(db):
28962910
# Add chunking_status column if it doesn't exist
28972911
add_missing_column_if_not_exists(db, 'Media', 'chunking_status', "TEXT DEFAULT 'pending'")
28982912

2913+
# Vector check FIXME/Delete later
2914+
def alter_media_table(db):
    """Add the ``vector_processing`` column to the Media table if it is missing.

    Runs ``ALTER TABLE Media ADD COLUMN vector_processing INTEGER DEFAULT 0``
    through ``db.execute_query``. When the column already exists SQLite
    rejects the statement with a "duplicate column name" error; that case is
    treated as success so repeated calls are harmless.

    Args:
        db: object exposing ``execute_query(query)``.

    Raises:
        Exception: any failure other than the duplicate-column case is logged
            at ERROR level and re-raised unchanged.
    """
    alter_query = (
        "\n"
        "ALTER TABLE Media ADD COLUMN vector_processing INTEGER DEFAULT 0\n"
    )
    try:
        db.execute_query(alter_query)
    except Exception as e:
        # Classify the failure before logging: an already-migrated database is
        # the normal case on every run after the first and must not be
        # reported as an error.
        if "duplicate column name" in str(e).lower():
            logging.info("Media table already has the vector_processing column; nothing to alter.")
            return
        logging.error(f"Error altering Media table: {str(e)}")
        raise
    logging.info("Media table altered successfully to include vector_processing column.")
2926+
2927+
# Vector check FIXME/Delete later
2928+
alter_media_table(db)
2929+
28992930
#
29002931
# End of Functions to manage media chunks
29012932
#######################################################################################################################

App_Function_Libraries/Gradio_UI/Chat_ui.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -214,7 +214,7 @@ def create_chat_interface():
214214
value="You are a helpful AI assitant",
215215
lines=3,
216216
visible=False)
217-
with gr.Column():
217+
with gr.Column(scale=2):
218218
chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
219219
msg = gr.Textbox(label="Enter your message")
220220
submit = gr.Button("Submit")

App_Function_Libraries/Gradio_UI/Embeddings_tab.py

Lines changed: 47 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -7,14 +7,17 @@
77
#
88
# External Imports
99
import gradio as gr
10+
from tqdm import tqdm
1011

11-
from App_Function_Libraries.Chunk_Lib import improved_chunking_process, determine_chunk_position
12+
from App_Function_Libraries.Chunk_Lib import improved_chunking_process, chunk_for_embedding
1213
#
1314
# Local Imports
1415
from App_Function_Libraries.DB.DB_Manager import get_all_content_from_database
1516
from App_Function_Libraries.RAG.ChromaDB_Library import chroma_client, \
16-
store_in_chroma
17-
from App_Function_Libraries.RAG.Embeddings_Create import create_embedding
17+
store_in_chroma, situate_context
18+
from App_Function_Libraries.RAG.Embeddings_Create import create_embedding, create_embeddings_batch
19+
20+
1821
#
1922
########################################################################################################################
2023
#
@@ -174,17 +177,23 @@ def create_view_embeddings_tab():
174177
value="words"
175178
)
176179
max_chunk_size = gr.Slider(
177-
minimum=1, maximum=8000, step=1, value=500,
180+
minimum=1, maximum=8000, step=5, value=500,
178181
label="Max Chunk Size"
179182
)
180183
chunk_overlap = gr.Slider(
181-
minimum=0, maximum=5000, step=1, value=200,
184+
minimum=0, maximum=5000, step=5, value=200,
182185
label="Chunk Overlap"
183186
)
184187
adaptive_chunking = gr.Checkbox(
185188
label="Use Adaptive Chunking",
186189
value=False
187190
)
191+
contextual_api_choice = gr.Dropdown(
192+
choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter", "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
193+
label="Select API for Contextualized Embeddings",
194+
value="OpenAI"
195+
)
196+
contextual_api_key = gr.Textbox(label="API Key", lines=1)
188197

189198
def get_items_with_embedding_status():
190199
try:
@@ -242,7 +251,7 @@ def check_embedding_status(selected_item, item_mapping):
242251
logging.error(f"Error in check_embedding_status: {str(e)}")
243252
return f"Error processing item: {selected_item}. Details: {str(e)}", "", ""
244253

245-
def create_new_embedding_for_item(selected_item, provider, model, api_url, method, max_size, overlap, adaptive, item_mapping):
254+
def create_new_embedding_for_item(selected_item, provider, model, api_url, method, max_size, overlap, adaptive, item_mapping, contextual_api_choice=None):
246255
if not selected_item:
247256
return "Please select an item", "", ""
248257

@@ -263,31 +272,30 @@ def create_new_embedding_for_item(selected_item, provider, model, api_url, metho
263272
'adaptive': adaptive
264273
}
265274

266-
chunks = improved_chunking_process(item['content'], chunk_options)
275+
logging.info(f"Chunking content for item: {item['title']} (ID: {item_id})")
276+
chunks = chunk_for_embedding(item['content'], item['title'], chunk_options)
267277
collection_name = "all_content_embeddings"
268278
collection = chroma_client.get_or_create_collection(name=collection_name)
269279

270280
# Delete existing embeddings for this item
271281
existing_ids = [f"doc_{item_id}_chunk_{i}" for i in range(len(chunks))]
272282
collection.delete(ids=existing_ids)
283+
logging.info(f"Deleted {len(existing_ids)} existing embeddings for item {item_id}")
273284

274-
for i, chunk in enumerate(chunks):
285+
texts, ids, metadatas = [], [], []
286+
chunk_count = 0
287+
logging.info("Generating contextual summaries and preparing chunks for embedding")
288+
for i, chunk in tqdm(enumerate(chunks), total=len(chunks), desc="Processing chunks"):
275289
chunk_text = chunk['text']
276290
chunk_metadata = chunk['metadata']
277-
chunk_position = determine_chunk_position(chunk_metadata['relative_position'])
278-
279-
chunk_header = f"""
280-
Original Document: {item['title']}
281-
Chunk: {i + 1} of {len(chunks)}
282-
Position: {chunk_position}
283-
Header: {chunk_metadata.get('header_text', 'N/A')}
284-
285-
--- Chunk Content ---
286-
"""
287-
288-
full_chunk_text = chunk_header + chunk_text
291+
if chunk_count == 0:
292+
chunk_count = 1
293+
# Generate contextual summary
294+
logging.debug(f"Generating contextual summary for chunk {chunk_count}")
295+
context = situate_context(contextual_api_choice, item['content'], chunk_text)
296+
contextualized_text = f"{chunk_text}\n\nContextual Summary: {context}"
297+
289298
chunk_id = f"doc_{item_id}_chunk_{i}"
290-
embedding = create_embedding(full_chunk_text, provider, model, api_url)
291299
metadata = {
292300
"media_id": str(item_id),
293301
"chunk_index": i,
@@ -298,13 +306,26 @@ def create_new_embedding_for_item(selected_item, provider, model, api_url, metho
298306
"adaptive_chunking": adaptive,
299307
"embedding_model": model,
300308
"embedding_provider": provider,
309+
"original_text": chunk_text,
310+
"contextual_summary": context,
301311
**chunk_metadata
302312
}
303-
store_in_chroma(collection_name, [full_chunk_text], [embedding], [chunk_id], [metadata])
304313

305-
embedding_preview = str(embedding[:50])
306-
status = f"New embeddings created and stored for item: {item['title']} (ID: {item_id})"
307-
return status, f"First 50 elements of new embedding:\n{embedding_preview}", json.dumps(metadata, indent=2)
314+
texts.append(contextualized_text)
315+
ids.append(chunk_id)
316+
metadatas.append(metadata)
317+
chunk_count = chunk_count+1
318+
319+
# Create embeddings in batch
320+
logging.info(f"Creating embeddings for {len(texts)} chunks")
321+
embeddings = create_embeddings_batch(texts, provider, model, api_url)
322+
323+
# Store in Chroma
324+
store_in_chroma(collection_name, texts, embeddings, ids, metadatas)
325+
326+
embedding_preview = str(embeddings[0][:50]) if embeddings else "No embeddings created"
327+
status = f"New contextual embeddings created and stored for item: {item['title']} (ID: {item_id})"
328+
return status, f"First 50 elements of new embedding:\n{embedding_preview}", json.dumps(metadatas[0], indent=2)
308329
except Exception as e:
309330
logging.error(f"Error in create_new_embedding_for_item: {str(e)}")
310331
return f"Error creating embedding: {str(e)}", "", ""
@@ -321,7 +342,7 @@ def create_new_embedding_for_item(selected_item, provider, model, api_url, metho
321342
create_new_embedding_button.click(
322343
create_new_embedding_for_item,
323344
inputs=[item_dropdown, embedding_provider, embedding_model, embedding_api_url,
324-
chunking_method, max_chunk_size, chunk_overlap, adaptive_chunking, item_mapping],
345+
chunking_method, max_chunk_size, chunk_overlap, adaptive_chunking, item_mapping, contextual_api_choice],
325346
outputs=[embedding_status, embedding_preview, embedding_metadata]
326347
)
327348
embedding_provider.change(

App_Function_Libraries/Gradio_UI/RAG_QA_Chat_tab.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
from App_Function_Libraries.Books.Book_Ingestion_Lib import read_epub
1616
from App_Function_Libraries.DB.DB_Manager import DatabaseError, get_paginated_files, add_media_with_keywords
1717
from App_Function_Libraries.PDF.PDF_Ingestion_Lib import extract_text_and_format_from_pdf
18-
from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer
18+
from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer, enhanced_rag_pipeline
1919
from App_Function_Libraries.RAG.RAG_QA_Chat import search_database, rag_qa_chat
2020
# Eventually... FIXME
2121
from App_Function_Libraries.RAG.RAG_QA_Chat import load_chat_history, save_chat_history
@@ -31,9 +31,9 @@ def create_rag_qa_chat_tab():
3131
with gr.Row():
3232
with gr.Column(scale=1):
3333
context_source = gr.Radio(
34-
["Existing File", "Search Database", "Upload File"],
34+
["All Files in the Database", "Search Database", "Upload File"],
3535
label="Context Source",
36-
value="Existing File"
36+
value="All Files in the Database"
3737
)
3838
existing_file = gr.Dropdown(label="Select Existing File", choices=[], interactive=True)
3939
file_page = gr.State(value=1)
@@ -127,9 +127,10 @@ def rag_qa_chat_wrapper(message, history, context_source, existing_file, search_
127127
rephrased_question = message
128128
logging.info(f"First question, no rephrasing: {message}")
129129

130-
if context_source == "Existing File":
131-
context = f"media_id:{existing_file.split('(ID: ')[1][:-1]}"
132-
logging.info(f"Using existing file with context: {context}")
130+
if context_source == "All Files in the Database":
131+
# Use the enhanced_rag_pipeline to search the entire database
132+
context = enhanced_rag_pipeline(rephrased_question, api_choice)
133+
logging.info(f"Using enhanced_rag_pipeline for database search")
133134
elif context_source == "Search Database":
134135
context = f"media_id:{search_results.split('(ID: ')[1][:-1]}"
135136
logging.info(f"Using search result with context: {context}")

App_Function_Libraries/Gradio_UI/Search_Tab.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,8 @@ def update_content_for_version(selected_item, item_mapping, selected_version):
6666
return "", "", ""
6767

6868
def format_as_html(content, title):
69+
if content is None:
70+
content = "No content available"
6971
escaped_content = html.escape(content)
7072
formatted_content = escaped_content.replace('\n', '<br>')
7173
return f"""
@@ -79,9 +81,9 @@ def format_as_html(content, title):
7981

8082
def create_search_tab():
8183
with gr.TabItem("Search / Detailed View"):
84+
gr.Markdown("# Search across all ingested items in the Database")
8285
with gr.Row():
8386
with gr.Column(scale=1):
84-
gr.Markdown("# Search across all ingested items in the Database")
8587
gr.Markdown("by Title / URL / Keyword / or Content via SQLite Full-Text-Search")
8688
search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
8789
search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",

0 commit comments

Comments
 (0)