From c8b18cf7220629c5ea3d68ad882914ff99aa05d7 Mon Sep 17 00:00:00 2001
From: notshrirang
Date: Wed, 1 Jan 2025 22:19:40 +0530
Subject: [PATCH 1/4] fix: minor bugs

---
 README.md | 2 +-
 app.py    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b003cf1..281760e 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ This project implements a Multimodal Retrieval-Augmented Generation (RAG) system
 
 Experience the project in action:
 
-[![LoomRAG Streamlit App](https://img.shields.io/badge/Streamlit%20App-red?style=for-the-badge&logo=streamlit&labelColor=white)](https://loomrag.streamlit.app/)
+[![LoomRAG Streamlit App](https://img.shields.io/badge/Streamlit%20App-red?style=for-the-badge&logo=streamlit&labelColor=white)](https://huggingface.co/spaces/NotShrirang/LoomRAG)
 
 ---
 
diff --git a/app.py b/app.py
index aa006f5..1f6c2d6 100644
--- a/app.py
+++ b/app.py
@@ -21,8 +21,8 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 clip_model, preprocess = load_clip_model()
 text_embedding_model = load_text_embedding_model()
-
-sidebar = st.sidebar
+os.makedirs("annotations/", exist_ok=True)
+os.makedirs("images/", exist_ok=True)
 
 with st.sidebar:
     st.title("LoomRAG")

From 75f9507aa7e4f822f0c095533503103dfcc0c38b Mon Sep 17 00:00:00 2001
From: notshrirang
Date: Wed, 1 Jan 2025 22:21:33 +0530
Subject: [PATCH 2/4] fix: minor bugs

---
 data_search/data_search_page.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/data_search/data_search_page.py b/data_search/data_search_page.py
index 7fd655b..a051545 100644
--- a/data_search/data_search_page.py
+++ b/data_search/data_search_page.py
@@ -21,6 +21,11 @@ def load_finetuned_model(file_name):
 
     st.title("Data Search")
 
+    images = os.listdir("images/")
+    if images == []:
+        st.warning("No Images Found! Please upload images to the database.")
+        return
+
     annotation_projects = get_local_files("annotations/", get_details=True)
 
     if annotation_projects or st.session_state.get('selected_annotation_project', None) is not None:

From cc538391d90713a0c01ca30c2b3afa40040110ab Mon Sep 17 00:00:00 2001
From: Shrirang Mahajan <85283622+NotShrirang@users.noreply.github.com>
Date: Wed, 1 Jan 2025 22:26:58 +0530
Subject: [PATCH 3/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 281760e..ad5d906 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 ![GitHub](https://img.shields.io/github/license/NotShrirang/LoomRAG)
 ![GitHub last commit](https://img.shields.io/github/last-commit/NotShrirang/LoomRAG)
 ![GitHub repo size](https://img.shields.io/github/repo-size/NotShrirang/LoomRAG)
- 
+
 This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages OpenAI's CLIP model for neural cross-modal retrieval and semantic search. The system allows users to input text queries and retrieve both text and image responses seamlessly through vector embeddings. It also supports uploading images and PDFs for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface.
 
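Editor's note on patches 1/4 and 2/4: together they create the `annotations/` and `images/` working directories at startup and guard the Data Search page against an empty image store. Below is a minimal standalone sketch of the combined pattern; it is illustrative only, with the page function simplified from the repository's actual code, and it uses the idiomatic `if not images:` in place of the patch's `if images == []:` (both behave the same here).

```python
import os

import streamlit as st

# Ensure the working directories exist before any page touches them.
# exist_ok=True makes the calls idempotent across Streamlit reruns.
os.makedirs("annotations/", exist_ok=True)
os.makedirs("images/", exist_ok=True)


def data_search_page() -> None:
    st.title("Data Search")

    # Guard clause from patch 2/4: bail out early when nothing has been
    # uploaded yet, instead of failing deeper in the retrieval pipeline.
    images = os.listdir("images/")
    if not images:
        st.warning("No Images Found! Please upload images to the database.")
        return

    st.write(f"{len(images)} image(s) available for search.")


data_search_page()
```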
From 19eda0a11ae14294fe1288afea0a4ba4ba60e85d Mon Sep 17 00:00:00 2001
From: Shrirang Mahajan <85283622+NotShrirang@users.noreply.github.com>
Date: Wed, 1 Jan 2025 22:58:40 +0530
Subject: [PATCH 4/4] Update README.md

---
 README.md | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ad5d906..6bf96a9 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 ![GitHub repo size](https://img.shields.io/github/repo-size/NotShrirang/LoomRAG)
 
-This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages OpenAI's CLIP model for neural cross-modal retrieval and semantic search. The system allows users to input text queries and retrieve both text and image responses seamlessly through vector embeddings. It also supports uploading images and PDFs for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface.
+This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages OpenAI's CLIP model for neural cross-modal retrieval and semantic search. The system allows users to input text queries and retrieve both text and image responses seamlessly through vector embeddings. It features an annotation interface for creating custom datasets and supports CLIP model fine-tuning with configurable parameters for domain-specific applications. The system also supports uploading images and PDFs for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface.
 
 Experience the project in action:
 
@@ -20,9 +20,13 @@ Experience the project in action:
 
 ## πŸ“Έ Implementation Screenshots
 
-| ![Screenshot 2024-12-30 111906](https://github.com/user-attachments/assets/13c0bd0d-1569-4d9e-aae5-ea5801a69beb) | ![Screenshot 2024-12-30 114200](https://github.com/user-attachments/assets/d74e9d75-7716-4705-9564-0c6fdc26790b) |
+| ![Screenshot 2025-01-01 184852](https://github.com/user-attachments/assets/ad79d0f0-d200-4a82-8c2f-0890a9fe8189) | ![Screenshot 2025-01-01 222334](https://github.com/user-attachments/assets/7307857d-a41f-4f60-8808-00d6db6e8e3e) |
 | ---------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
-| Screenshot 1 | Screenshot 2 |
+| Data Upload Page | Data Search / Retrieval |
+| | |
+| ![Screenshot 2025-01-01 222412](https://github.com/user-attachments/assets/e38273f4-426b-444d-80f0-501fa9563779) | ![Screenshot 2025-01-01 223948](https://github.com/user-attachments/assets/21724a92-ef79-44ae-83e6-25f8de29c45a) |
+| Data Annotation Page | CLIP Fine-Tuning |
+
 
 ---
 
@@ -33,6 +37,9 @@ Experience the project in action:
 - πŸ“€ **Upload Options**: Allows users to upload images and PDFs for AI-powered processing and retrieval
 - 🧠 **Embedding-Based Search**: Uses OpenAI's CLIP model to align text and image embeddings in a shared latent space
 - πŸ” **Augmented Text Generation**: Enhances text results using LLMs for contextually rich outputs
+- 🏷️ **Image Annotation**: Enables users to annotate uploaded images through an intuitive interface
+- 🎯 **CLIP Fine-Tuning**: Supports custom model training with configurable parameters, including test dataset split size, learning rate, optimizer, and weight decay
+- πŸ”¨ **Fine-Tuned Model Integration**: Seamlessly loads and uses fine-tuned CLIP models for enhanced search and retrieval
 
 ---
 
@@ -54,6 +61,17 @@ Experience the project in action:
    - For image results: Directly returned or enhanced with image captions
    - For PDFs: Extracts text content and provides relevant sections
 
+4. **Image Annotation**:
+   - Dedicated annotation page for managing uploaded images
+   - Support for creating and managing multiple datasets simultaneously
+   - Flexible annotation workflow for efficient data labeling
+   - Dataset organization and management capabilities
+
+5. **Model Fine-Tuning**:
+   - Custom CLIP model training on annotated images
+   - Configurable training parameters for optimization
+   - Integration of fine-tuned models into the search pipeline
+
 ---
 
 ## πŸš€ Installation
@@ -85,6 +103,9 @@ Experience the project in action:
    - Access the interface in your browser to:
     - Submit natural language queries
    - Upload images or PDFs to retrieve contextually relevant results
+    - Annotate uploaded images
+    - Fine-tune CLIP models with custom parameters
+    - Use fine-tuned models for improved search results
 
 2. **Example Queries**:
    - **Text Query**: "sunset over mountains"
@@ -99,12 +120,13 @@ Experience the project in action:
 - πŸ“Š **Vector Database**: It uses FAISS for efficient similarity search
 - πŸ€– **Model**: Uses OpenAI CLIP for neural embedding generation
 - ✍️ **Augmentation**: Optional LLM-based augmentation for text responses
+- πŸŽ›οΈ **Fine-Tuning**: Configurable parameters for model training and optimization
 
 ---
 
 ## πŸ—ΊοΈ Roadmap
 
-- [ ] Fine-tuning CLIP for domain-specific datasets
+- [x] Fine-tuning CLIP for domain-specific datasets
 - [ ] Adding support for audio and video modalities
 - [ ] Improving the re-ranking system for better contextual relevance
 - [ ] Enhanced PDF parsing with semantic section segmentation
@@ -119,7 +141,7 @@ Experience the project in action:
 
 ## πŸ“„ License
 
-This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+This project is licensed under the Apache-2.0 License. See the [LICENSE](LICENSE) file for details.
 
 ---
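Editor's note on patch 4/4: the README it updates describes the core retrieval mechanism as CLIP embeddings aligned in a shared latent space with FAISS for similarity search. The sketch below illustrates that idea under stated assumptions: it uses the public `clip` and `faiss` packages, a ViT-B/32 backbone (512-dimensional embeddings), and random placeholder vectors standing in for real image embeddings. LoomRAG's actual index construction and model loading may differ.

```python
import clip   # pip install git+https://github.com/openai/CLIP.git
import faiss  # pip install faiss-cpu
import numpy as np
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)  # ViT-B/32 -> 512-dim embeddings

# Index pre-computed image embeddings; inner product over L2-normalized
# vectors is cosine similarity.
image_embeddings = np.random.rand(100, 512).astype("float32")  # placeholder corpus
faiss.normalize_L2(image_embeddings)
index = faiss.IndexFlatIP(512)
index.add(image_embeddings)

# Embed a text query into the same latent space and retrieve the closest images.
with torch.no_grad():
    tokens = clip.tokenize(["sunset over mountains"]).to(device)
    query = model.encode_text(tokens).float().cpu().numpy()
faiss.normalize_L2(query)
scores, ids = index.search(query, 3)
print(ids[0], scores[0])  # indices and similarities of the top-3 matches
```

In the real application the corpus vectors would come from `model.encode_image(...)` over the uploaded files (or from a fine-tuned CLIP checkpoint) rather than random placeholders.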