From 4fd4b7c0aded5acb6772402def8e4dd4fcca497a Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:26:00 +0530 Subject: [PATCH 1/5] feat: add audio data input --- .gitignore | 1 + app.py | 7 +-- data_search/data_search_page.py | 22 +++++++-- data_upload/data_upload_page.py | 8 +-- data_upload/input_sources_utils/audio_util.py | 29 +++++++++++ utils.py | 11 +++++ vectordb.py | 49 ++++++++++++++++++- 7 files changed, 116 insertions(+), 11 deletions(-) create mode 100644 data_upload/input_sources_utils/audio_util.py diff --git a/.gitignore b/.gitignore index 0388cc3..957b7f0 100644 --- a/.gitignore +++ b/.gitignore @@ -170,6 +170,7 @@ cython_debug/ # PyPI configuration file .pypirc +audio/ images/ vectorstore/ trial.py diff --git a/app.py b/app.py index 1f6c2d6..9afc696 100644 --- a/app.py +++ b/app.py @@ -12,7 +12,7 @@ from data_search import data_search_page from data_annotations import data_annotation_page from model_finetuning import model_finetuning_page -from utils import load_clip_model, load_text_embedding_model +from utils import load_clip_model, load_text_embedding_model, load_whisper_model os.environ['KMP_DUPLICATE_LIB_OK']='True' @@ -21,6 +21,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" clip_model, preprocess = load_clip_model() text_embedding_model = load_text_embedding_model() +whisper_model = load_whisper_model() os.makedirs("annotations/", exist_ok=True) os.makedirs("images/", exist_ok=True) @@ -34,9 +35,9 @@ ) if page == "Data Upload": - data_upload_page.data_upload(clip_model, preprocess, text_embedding_model) + data_upload_page.data_upload(clip_model, preprocess, text_embedding_model, whisper_model) if page == "Data Search": - data_search_page.data_search(clip_model, preprocess, text_embedding_model, device) + data_search_page.data_search(clip_model, preprocess, text_embedding_model, whisper_model, device) if page == "Data Annotation": data_annotation_page.data_annotations() if page == "Model Fine-Tuning": diff --git a/data_search/data_search_page.py b/data_search/data_search_page.py index badf0e2..4cb6bf7 100644 --- a/data_search/data_search_page.py +++ b/data_search/data_search_page.py @@ -6,13 +6,13 @@ import sys import torch from vectordb import search_image_index, search_text_index, search_image_index_with_image, search_text_index_with_image -from utils import load_image_index, load_text_index, get_local_files +from utils import load_image_index, load_text_index, load_audio_index, get_local_files from data_search import adapter_utils sys.path.append(os.path.dirname(os.path.abspath(__file__))) -def data_search(clip_model, preprocess, text_embedding_model, device): +def data_search(clip_model, preprocess, text_embedding_model, whisper_model, device): @st.cache_resource def load_finetuned_model(file_name): @@ -68,6 +68,8 @@ def load_adapter(): image_index, image_data = load_image_index() if os.path.exists("./vectorstore/text_index.index"): text_index, text_data = load_text_index() + if os.path.exists("./vectorstore/audio_index.index"): + audio_index, audio_data = load_audio_index() with torch.no_grad(): if not os.path.exists("./vectorstore/image_data.csv"): st.warning("No Image Index Found. So not searching for images.") @@ -75,6 +77,8 @@ def load_adapter(): if not os.path.exists("./vectorstore/text_data.csv"): st.warning("No Text Index Found. So not searching for text.") text_index = None + if not os.path.exists("./vectorstore/audio_data.csv"): + st.warning("No Audio Index Found. 
So not searching for audio.") if image_input: image = Image.open(image_input) image = preprocess(image).unsqueeze(0).to(device) @@ -85,12 +89,16 @@ def load_adapter(): image_indices = search_image_index_with_image(image_features, image_index, clip_model, k=3) if text_index is not None: text_indices = search_text_index_with_image(adapted_text_embeddings, text_index, text_embedding_model, k=3) + if audio_index is not None: + audio_indices = search_text_index_with_image(adapted_text_embeddings, audio_index, text_embedding_model, k=3) else: if image_index is not None: image_indices = search_image_index(text_input, image_index, clip_model, k=3) if text_index is not None: text_indices = search_text_index(text_input, text_index, text_embedding_model, k=3) - if not image_index and not text_index: + if audio_index is not None: + audio_indices = search_text_index(text_input, audio_index, text_embedding_model, k=3) + if not image_index and not text_index and not audio_index: st.error("No Data Found! Please add data to the database.") st.subheader("Top 3 Results") cols = st.columns(3) @@ -111,4 +119,10 @@ def load_adapter(): with cols[i]: if text_index: text_content = text_data['content'].iloc[text_indices[0][i]] - st.write(text_content) \ No newline at end of file + st.write(text_content) + cols = st.columns(3) + for i in range(3): + with cols[i]: + if audio_index: + audio_path = audio_data['path'].iloc[audio_indices[0][i]] + st.audio(audio_path) \ No newline at end of file diff --git a/data_upload/data_upload_page.py b/data_upload/data_upload_page.py index f1e536a..311c13e 100644 --- a/data_upload/data_upload_page.py +++ b/data_upload/data_upload_page.py @@ -2,15 +2,15 @@ import streamlit as st import sys -from data_upload.input_sources_utils import image_util, pdf_util, website_util +from data_upload.input_sources_utils import image_util, pdf_util, website_util, audio_util sys.path.append(os.path.dirname(os.path.abspath(__file__))) -def data_upload(clip_model, preprocess, text_embedding_model): +def data_upload(clip_model, preprocess, text_embedding_model, whisper_model): st.title("Data Upload") st.warning("Please note that this is a public application. 
Make sure you are not uploading any sensitive data.")
 
-    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link"], label="Select Upload Type")
+    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link", "Audio Recording"], label="Select Upload Type")
     if upload_choice == "Upload Image":
         image_util.upload_image(clip_model, preprocess)
     elif upload_choice == "Add Image from URL / Link":
@@ -19,3 +19,5 @@
         pdf_util.upload_pdf(clip_model, preprocess, text_embedding_model)
     elif upload_choice == "Website Link":
         website_util.data_from_website(clip_model, preprocess, text_embedding_model)
+    elif upload_choice == "Audio Recording":
+        audio_util.upload_audio(whisper_model, text_embedding_model)
diff --git a/data_upload/input_sources_utils/audio_util.py b/data_upload/input_sources_utils/audio_util.py
new file mode 100644
index 0000000..3ddb6ba
--- /dev/null
+++ b/data_upload/input_sources_utils/audio_util.py
@@ -0,0 +1,29 @@
+import os
+import requests
+import streamlit as st
+import sys
+import whisper
+
+from vectordb import add_image_to_index, add_pdf_to_index, add_audio_to_index
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+def upload_audio(whisper_model, text_embedding_model):
+    st.title("Upload Audio")
+    recorded_audio = st.audio_input("Record Audio")
+    st.write("---")
+    uploaded_audios = st.file_uploader("Upload Audio", type=["mp3", "wav"], accept_multiple_files=True)
+    if recorded_audio:
+        st.audio(recorded_audio)
+        if st.button("Add Audio", key="add_recorded_audio"):
+            add_audio_to_index(recorded_audio, whisper_model, text_embedding_model)
+            st.success("Audio Added to Database")
+    if uploaded_audios:
+        for audio in uploaded_audios:
+            st.audio(audio)
+        if st.button("Add Audio", key="add_uploaded_audios"):
+            progress_bar = st.progress(0, f"Adding Audio... | 0/{len(uploaded_audios)}")
+            for count, audio in enumerate(uploaded_audios):
+                add_audio_to_index(audio, whisper_model, text_embedding_model)
+                progress_bar.progress((count + 1) / len(uploaded_audios), f"Adding Audio... 
| {count + 1}/{len(uploaded_audios)}") + st.success("Audio Added to Database") diff --git a/utils.py b/utils.py index 858ea04..82cb260 100644 --- a/utils.py +++ b/utils.py @@ -6,6 +6,7 @@ from sentence_transformers import SentenceTransformer import streamlit as st import torch +import whisper device = "cuda" if torch.cuda.is_available() else "cpu" @@ -19,6 +20,11 @@ def load_text_embedding_model(): model = SentenceTransformer("all-MiniLM-L6-v2") return model +@st.cache_resource +def load_whisper_model(): + model = whisper.load_model("small") + return model + def load_image_index(): index = faiss.read_index('./vectorstore/image_index.index') data = pd.read_csv("./vectorstore/image_data.csv") @@ -29,6 +35,11 @@ def load_text_index(): data = pd.read_csv("./vectorstore/text_data.csv") return index, data +def load_audio_index(): + index = faiss.read_index('./vectorstore/audio_index.index') + data = pd.read_csv("./vectorstore/audio_data.csv") + return index, data + def cosine_similarity(a, b): return torch.cosine_similarity(a, b) diff --git a/vectordb.py b/vectordb.py index 4147668..ca867b3 100644 --- a/vectordb.py +++ b/vectordb.py @@ -11,14 +11,17 @@ import streamlit as st import torch import time +import whisper device = "cuda" if torch.cuda.is_available() else "cpu" os.makedirs("./vectorstore", exist_ok=True) -def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None): +def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None, audio_path: str = None): if not image_path and not text_content: raise ValueError("Either image_path or text_content must be provided.") + if audio_path and not text_content: + raise ValueError("text_content must be provided when audio_path is provided.") if not os.path.exists(f"./vectorstore/{index_path}"): if image_path: index = faiss.IndexFlatL2(512) @@ -42,6 +45,15 @@ def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = new_entry_df = pd.DataFrame({"path": image_path, "index": len(df)}, index=[0]) df = pd.concat([df, new_entry_df], ignore_index=True) df.to_csv("./vectorstore/image_data.csv", index=False) + elif audio_path: + if not os.path.exists("./vectorstore/audio_data.csv"): + df = pd.DataFrame([{"path": audio_path, "content": text_content, "index": 0}]).reset_index(drop=True) + df.to_csv("./vectorstore/audio_data.csv", index=False) + else: + df = pd.read_csv("./vectorstore/audio_data.csv").reset_index(drop=True) + new_entry_df = pd.DataFrame({"path": audio_path, "content": text_content, "index": len(df)}, index=[0]) + df = pd.concat([df, new_entry_df], ignore_index=True) + df.to_csv("./vectorstore/audio_data.csv", index=False) elif text_content: if not os.path.exists("./vectorstore/text_data.csv"): df = pd.DataFrame([{"content": text_content, "index": 0}]).reset_index(drop=True) @@ -120,6 +132,41 @@ def add_pdf_to_index(pdf, clip_model: clip.model.CLIP, preprocess, text_embeddin progress_bar.progress(percent_complete, f"Processing Page {page_num + 1}/{len(pdf_reader.pages)}") return pdf_pages_data + +def add_audio_to_index(audio, whisper_model: whisper.Whisper, text_embedding_model: SentenceTransformer): + if not os.path.exists("./vectorstore/"): + os.makedirs("./vectorstore") + if not os.path.exists("./audio"): + os.makedirs("./audio") + if hasattr(audio, "name"): + audio_name = audio.name + else: + audio_name = f"{time.time()}.wav" + audio_name = audio_name.replace(" ", "_") + with open(f"./audio/{audio_name}", "wb") as f: + 
try: + f.write(audio.read()) + except: + if hasattr(audio, "data"): + audio = io.BytesIO(audio.data) + else: + audio = io.BytesIO(audio) + f.write(audio.read()) + audio_transcript: str = whisper_model.transcribe(f"./audio/{audio_name}")["text"] + text_splitter = CharacterTextSplitter( + separator="\n", + chunk_size=1000, + chunk_overlap=200, + length_function=len, + is_separator_regex=False, + ) + chunks = text_splitter.split_text(audio_transcript) + text_embeddings = text_embedding_model.encode(chunks) + for i, chunk in enumerate(chunks): + update_vectordb(index_path="audio_index.index", embedding=text_embeddings[i], text_content=chunk, audio_path=f"./audio/{audio_name}") + return audio_transcript + + def search_image_index_with_image(image_features, index: faiss.IndexFlatL2, clip_model: clip.model.CLIP, k: int = 3): with torch.no_grad(): distances, indices = index.search(image_features.cpu().numpy(), k) From 68cd444d9df8155fcdae337842fa9edd028c96d7 Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:27:22 +0530 Subject: [PATCH 2/5] feat: add requirements --- requirements.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4d76931..8ff406c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ cachetools==5.5.0 certifi==2024.12.14 charset-normalizer==3.4.1 click==8.1.8 -clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 +clip==1.0 colorama==0.4.6 contourpy==1.3.1 cycler==0.12.1 @@ -26,6 +26,7 @@ fonttools==4.55.3 frozenlist==1.5.0 fsspec==2024.9.0 ftfy==6.3.1 +future==1.0.0 gitdb==4.0.11 GitPython==3.1.43 greenlet==3.1.1 @@ -48,20 +49,24 @@ langchain-core==0.3.28 langchain-experimental==0.3.4 langchain-text-splitters==0.3.4 langsmith==0.1.147 +llvmlite==0.43.0 lxml==5.1.0 markdown-it-py==3.0.0 MarkupSafe==3.0.2 marshmallow==3.23.2 matplotlib==3.10.0 mdurl==0.1.2 +more-itertools==10.5.0 mpmath==1.3.0 multidict==6.1.0 multiprocess==0.70.16 mypy-extensions==1.0.0 narwhals==1.19.1 networkx==3.4.2 +numba==0.60.0 numpy==1.26.4 open_clip_torch==2.29.0 +openai-whisper==20240930 orjson==3.10.12 packaging==24.2 pandas==2.2.3 @@ -100,6 +105,7 @@ streamlit-option-menu==0.4.0 sympy==1.13.1 tenacity==8.5.0 threadpoolctl==3.5.0 +tiktoken==0.8.0 timm==1.0.12 tokenizers==0.21.0 toml==0.10.2 From acf79a60a00a8584bb8a8da0f8c962b154803359 Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:27:59 +0530 Subject: [PATCH 3/5] feat: add requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8ff406c..6a0f80c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ cachetools==5.5.0 certifi==2024.12.14 charset-normalizer==3.4.1 click==8.1.8 -clip==1.0 +clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 colorama==0.4.6 contourpy==1.3.1 cycler==0.12.1 From 1c179c34114a60824c320f3733a0169ff61d90f2 Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:55:09 +0530 Subject: [PATCH 4/5] feat: add captions to retrieved audios --- data_search/data_search_page.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/data_search/data_search_page.py b/data_search/data_search_page.py index 4cb6bf7..cedf0fa 100644 --- a/data_search/data_search_page.py +++ b/data_search/data_search_page.py @@ -100,7 +100,7 @@ def load_adapter(): audio_indices = search_text_index(text_input, audio_index, text_embedding_model, k=3) 
if not image_index and not text_index and not audio_index: st.error("No Data Found! Please add data to the database.") - st.subheader("Top 3 Results") + st.subheader("Image Results") cols = st.columns(3) for i in range(3): with cols[i]: @@ -114,15 +114,19 @@ def load_adapter(): cosine_similarity = torch.cosine_similarity(image_features, text_features) st.write(f"Similarity: {cosine_similarity.item() * 100:.2f}%") st.image(image_path) + st.subheader("Text Results") cols = st.columns(3) for i in range(3): with cols[i]: if text_index: text_content = text_data['content'].iloc[text_indices[0][i]] st.write(text_content) + st.subheader("Audio Results") cols = st.columns(3) for i in range(3): with cols[i]: if audio_index: audio_path = audio_data['path'].iloc[audio_indices[0][i]] - st.audio(audio_path) \ No newline at end of file + audio_content = audio_data['content'].iloc[audio_indices[0][i]] + st.audio(audio_path) + st.write(f"_{audio_content}_") \ No newline at end of file From 32799629fb972d94f62934895da0358bef9f6c8d Mon Sep 17 00:00:00 2001 From: Shrirang Mahajan <85283622+NotShrirang@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:58:57 +0530 Subject: [PATCH 5/5] Update README.md --- README.md | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 027d463..4886801 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ ![GitHub repo size](https://img.shields.io/github/repo-size/NotShrirang/LoomRAG) -This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages OpenAI's CLIP model for neural cross-modal retrieval and semantic search. The system allows users to input text queries and retrieve both text and image responses seamlessly through vector embeddings. It features a comprehensive annotation interface for creating custom datasets and supports CLIP model fine-tuning with configurable parameters for domain-specific applications. The system also supports uploading images and PDFs for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface. +This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages **OpenAI's CLIP** model for neural cross-modal image retrieval and semantic search, and **OpenAI's Whisper** model for audio processing. The system allows users to input text queries, images, or audio to retrieve multimodal responses seamlessly through vector embeddings. It features a comprehensive annotation interface for creating custom datasets and supports CLIP model fine-tuning with configurable parameters for domain-specific applications. The system also supports uploading images, PDFs, and audio files (including real-time recording) for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface. 
 Experience the project in action:
 
@@ -33,10 +33,11 @@
 
 - πŸ”„ **Cross-Modal Retrieval**: Search text to retrieve both text and image results using deep learning
 - πŸ–ΌοΈ **Image-Based Search**: Search the database by uploading an image to find similar content
-- 🧠 **Embedding-Based Search**: Uses OpenAI's CLIP model to align text and image embeddings in a shared latent space
+- 🧠 **Embedding-Based Search**: Uses OpenAI's CLIP and Whisper together with SentenceTransformers embedding models to embed the input data
 - 🎯 **CLIP Fine-Tuning**: Supports custom model training with configurable parameters including test dataset split size, learning rate, optimizer, and weight decay
 - πŸ”¨ **Fine-Tuned Model Integration**: Seamlessly load and utilize fine-tuned CLIP models for enhanced search and retrieval
-- πŸ“€ **Upload Options**: Allows users to upload images and PDFs for AI-powered processing and retrieval
+- πŸ“€ **Upload Options**: Allows users to upload images, PDFs, and audio files for AI-powered processing and retrieval
+- πŸŽ™οΈ **Audio Integration**: Upload audio files or record audio directly through the interface
 - πŸ”— **URL Integration**: Add images directly using URLs and scrape website data including text and images
 - πŸ•·οΈ **Web Scraping**: Automatically extract and index content from websites for comprehensive search capabilities
 - 🏷️ **Image Annotation**: Enables users to annotate uploaded images through an intuitive interface
@@ -45,6 +46,17 @@
 
 ---
 
+## πŸ—ΊοΈ Roadmap
+
+- [x] Fine-tuning CLIP for domain-specific datasets
+- [x] Image-based search and retrieval
+- [x] Adding support for audio modalities
+- [ ] Adding support for video modalities
+- [ ] Improving the re-ranking system for better contextual relevance
+- [ ] Enhanced PDF parsing with semantic section segmentation
+
+---
+
 ## πŸ—οΈ Architecture Overview
 
 1. **Data Indexing**:
@@ -56,13 +68,14 @@
 2. **Query Processing**:
 
    - Text queries / image-based queries are converted into embeddings for semantic search
-   - Uploaded images and PDFs are processed and embedded for comparison
-   - The system performs a nearest neighbor search in the vector database to retrieve relevant text and images
+   - Uploaded images, audio files, and PDFs are processed and embedded for comparison
+   - The system performs a nearest neighbor search in the vector database to retrieve relevant text, images, and audio
 
 3. **Response Generation**:
 
    - For text results: Optionally refined or augmented using a language model
    - For image results: Directly returned or enhanced with image captions
+   - For audio results: Returned with relevant metadata and transcriptions where applicable
    - For PDFs: Extracts text content and provides relevant sections
 
 4. **Image Annotation**:
@@ -108,6 +121,7 @@
    - Access the interface in your browser to:
      - Submit natural language queries
      - Upload images or PDFs to retrieve contextually relevant results
+     - Upload or record audio files
     - Add images using URLs
     - Scrape and index website content
     - Search using uploaded images
@@ -132,16 +146,6 @@
 
 ---
 
-## πŸ—ΊοΈ Roadmap
-
-- [x] Fine-tuning CLIP for domain-specific datasets
-- [x] Image-based search and retrieval
-- [ ] Adding support for audio and video modalities
-- [ ] Improving the re-ranking system for better contextual relevance
-- [ ] Enhanced PDF parsing with semantic section segmentation
-
----
-
 ## 🀝 Contributing
 
 Contributions are welcome! Please open an issue or submit a pull request for any feature requests or bug fixes.
@@ -157,5 +161,6 @@ This project is licensed under the Apache-2.0 License. See the [LICENSE](LICENSE
 ## πŸ™ Acknowledgments
 
 - [OpenAI CLIP](https://openai.com/research/clip)
+- [OpenAI Whisper](https://github.com/openai/whisper)
 - [FAISS](https://github.com/facebookresearch/faiss)
 - [Hugging Face](https://huggingface.co/)
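
Below is a minimal, self-contained sketch of the audio path these patches wire together (Whisper transcription, transcript chunking, SentenceTransformer embeddings, FAISS search). It is illustrative only, not code from the repository: `sample.wav` and the query string are placeholders, it builds a throwaway in-memory index rather than the `./vectorstore/audio_index.index` and `audio_data.csv` pair that `update_vectordb()` maintains, and it assumes `openai-whisper`, `sentence-transformers`, `faiss`, `numpy`, and `langchain-text-splitters` are installed, plus `ffmpeg` on the PATH for Whisper.

```python
import faiss
import numpy as np
import whisper
from langchain_text_splitters import CharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Same checkpoints the app loads in utils.load_whisper_model() / load_text_embedding_model().
whisper_model = whisper.load_model("small")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# 1. Transcribe the audio file to text ("sample.wav" is a hypothetical local file).
transcript = whisper_model.transcribe("sample.wav")["text"]

# 2. Split the transcript into overlapping chunks, mirroring the splitter settings
#    used by add_audio_to_index() in vectordb.py.
splitter = CharacterTextSplitter(
    separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
)
chunks = splitter.split_text(transcript)

# 3. Embed the chunks (384-dim vectors for all-MiniLM-L6-v2) and index them with FAISS.
embeddings = np.asarray(embedder.encode(chunks), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# 4. Search the audio index with a plain text query; each hit maps back to a transcript
#    chunk (and, in the app, to the saved file path stored in audio_data.csv).
query = np.asarray(embedder.encode(["placeholder query about the recording"]), dtype="float32")
distances, indices = index.search(query, 3)
for rank, chunk_id in enumerate(indices[0]):
    print(rank, float(distances[0][rank]), chunks[chunk_id][:80])
```

Because the transcript chunks are embedded with the same text model used for PDFs and website content, the existing `search_text_index()` helper can be reused unchanged for the audio index, which is what `data_search_page.py` does in these patches.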