From 4fd4b7c0aded5acb6772402def8e4dd4fcca497a Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:26:00 +0530 Subject: [PATCH 1/5] feat: add audio data input --- .gitignore | 1 + app.py | 7 +-- data_search/data_search_page.py | 22 +++++++-- data_upload/data_upload_page.py | 8 +-- data_upload/input_sources_utils/audio_util.py | 29 +++++++++++ utils.py | 11 +++++ vectordb.py | 49 ++++++++++++++++++- 7 files changed, 116 insertions(+), 11 deletions(-) create mode 100644 data_upload/input_sources_utils/audio_util.py diff --git a/.gitignore b/.gitignore index 0388cc3..957b7f0 100644 --- a/.gitignore +++ b/.gitignore @@ -170,6 +170,7 @@ cython_debug/ # PyPI configuration file .pypirc +audio/ images/ vectorstore/ trial.py diff --git a/app.py b/app.py index 1f6c2d6..9afc696 100644 --- a/app.py +++ b/app.py @@ -12,7 +12,7 @@ from data_search import data_search_page from data_annotations import data_annotation_page from model_finetuning import model_finetuning_page -from utils import load_clip_model, load_text_embedding_model +from utils import load_clip_model, load_text_embedding_model, load_whisper_model os.environ['KMP_DUPLICATE_LIB_OK']='True' @@ -21,6 +21,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" clip_model, preprocess = load_clip_model() text_embedding_model = load_text_embedding_model() +whisper_model = load_whisper_model() os.makedirs("annotations/", exist_ok=True) os.makedirs("images/", exist_ok=True) @@ -34,9 +35,9 @@ ) if page == "Data Upload": - data_upload_page.data_upload(clip_model, preprocess, text_embedding_model) + data_upload_page.data_upload(clip_model, preprocess, text_embedding_model, whisper_model) if page == "Data Search": - data_search_page.data_search(clip_model, preprocess, text_embedding_model, device) + data_search_page.data_search(clip_model, preprocess, text_embedding_model, whisper_model, device) if page == "Data Annotation": data_annotation_page.data_annotations() if page == "Model Fine-Tuning": diff --git a/data_search/data_search_page.py b/data_search/data_search_page.py index badf0e2..4cb6bf7 100644 --- a/data_search/data_search_page.py +++ b/data_search/data_search_page.py @@ -6,13 +6,13 @@ import sys import torch from vectordb import search_image_index, search_text_index, search_image_index_with_image, search_text_index_with_image -from utils import load_image_index, load_text_index, get_local_files +from utils import load_image_index, load_text_index, load_audio_index, get_local_files from data_search import adapter_utils sys.path.append(os.path.dirname(os.path.abspath(__file__))) -def data_search(clip_model, preprocess, text_embedding_model, device): +def data_search(clip_model, preprocess, text_embedding_model, whisper_model, device): @st.cache_resource def load_finetuned_model(file_name): @@ -68,6 +68,8 @@ def load_adapter(): image_index, image_data = load_image_index() if os.path.exists("./vectorstore/text_index.index"): text_index, text_data = load_text_index() + if os.path.exists("./vectorstore/audio_index.index"): + audio_index, audio_data = load_audio_index() with torch.no_grad(): if not os.path.exists("./vectorstore/image_data.csv"): st.warning("No Image Index Found. So not searching for images.") @@ -75,6 +77,8 @@ def load_adapter(): if not os.path.exists("./vectorstore/text_data.csv"): st.warning("No Text Index Found. So not searching for text.") text_index = None + if not os.path.exists("./vectorstore/audio_data.csv"): + st.warning("No Audio Index Found. 
So not searching for audio.") if image_input: image = Image.open(image_input) image = preprocess(image).unsqueeze(0).to(device) @@ -85,12 +89,16 @@ def load_adapter(): image_indices = search_image_index_with_image(image_features, image_index, clip_model, k=3) if text_index is not None: text_indices = search_text_index_with_image(adapted_text_embeddings, text_index, text_embedding_model, k=3) + if audio_index is not None: + audio_indices = search_text_index_with_image(adapted_text_embeddings, audio_index, text_embedding_model, k=3) else: if image_index is not None: image_indices = search_image_index(text_input, image_index, clip_model, k=3) if text_index is not None: text_indices = search_text_index(text_input, text_index, text_embedding_model, k=3) - if not image_index and not text_index: + if audio_index is not None: + audio_indices = search_text_index(text_input, audio_index, text_embedding_model, k=3) + if not image_index and not text_index and not audio_index: st.error("No Data Found! Please add data to the database.") st.subheader("Top 3 Results") cols = st.columns(3) @@ -111,4 +119,10 @@ def load_adapter(): with cols[i]: if text_index: text_content = text_data['content'].iloc[text_indices[0][i]] - st.write(text_content) \ No newline at end of file + st.write(text_content) + cols = st.columns(3) + for i in range(3): + with cols[i]: + if audio_index: + audio_path = audio_data['path'].iloc[audio_indices[0][i]] + st.audio(audio_path) \ No newline at end of file diff --git a/data_upload/data_upload_page.py b/data_upload/data_upload_page.py index f1e536a..311c13e 100644 --- a/data_upload/data_upload_page.py +++ b/data_upload/data_upload_page.py @@ -2,15 +2,15 @@ import streamlit as st import sys -from data_upload.input_sources_utils import image_util, pdf_util, website_util +from data_upload.input_sources_utils import image_util, pdf_util, website_util, audio_util sys.path.append(os.path.dirname(os.path.abspath(__file__))) -def data_upload(clip_model, preprocess, text_embedding_model): +def data_upload(clip_model, preprocess, text_embedding_model, whisper_model): st.title("Data Upload") st.warning("Please note that this is a public application. 
Make sure you are not uploading any sensitive data.")
 
-    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link"], label="Select Upload Type")
+    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link", "Audio Recording"], label="Select Upload Type")
     if upload_choice == "Upload Image":
         image_util.upload_image(clip_model, preprocess)
     elif upload_choice == "Add Image from URL / Link":
@@ -19,3 +19,5 @@
         pdf_util.upload_pdf(clip_model, preprocess, text_embedding_model)
     elif upload_choice == "Website Link":
         website_util.data_from_website(clip_model, preprocess, text_embedding_model)
+    elif upload_choice == "Audio Recording":
+        audio_util.upload_audio(whisper_model, text_embedding_model)
diff --git a/data_upload/input_sources_utils/audio_util.py b/data_upload/input_sources_utils/audio_util.py
new file mode 100644
index 0000000..3ddb6ba
--- /dev/null
+++ b/data_upload/input_sources_utils/audio_util.py
@@ -0,0 +1,29 @@
+import os
+import requests
+import streamlit as st
+import sys
+import whisper
+
+from vectordb import add_image_to_index, add_pdf_to_index, add_audio_to_index
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+def upload_audio(whisper_model, text_embedding_model):
+    st.title("Upload Audio")
+    recorded_audio = st.audio_input("Record Audio")
+    st.write("---")
+    uploaded_audios = st.file_uploader("Upload Audio", type=["mp3", "wav"], accept_multiple_files=True)
+    if recorded_audio:
+        st.audio(recorded_audio)
+        if st.button("Add Audio", key="add_recorded_audio"):
+            add_audio_to_index(recorded_audio, whisper_model, text_embedding_model)
+            st.success("Audio Added to Database")
+    if uploaded_audios:
+        for audio in uploaded_audios:
+            st.audio(audio)
+        if st.button("Add Audio", key="add_uploaded_audios"):
+            progress_bar = st.progress(0, f"Adding Audio... | 0/{len(uploaded_audios)}")
+            for count, audio in enumerate(uploaded_audios):
+                add_audio_to_index(audio, whisper_model, text_embedding_model)
+                progress_bar.progress((count + 1) / len(uploaded_audios), f"Adding Audio... 
| {count + 1}/{len(uploaded_audios)}") + st.success("Audio Added to Database") diff --git a/utils.py b/utils.py index 858ea04..82cb260 100644 --- a/utils.py +++ b/utils.py @@ -6,6 +6,7 @@ from sentence_transformers import SentenceTransformer import streamlit as st import torch +import whisper device = "cuda" if torch.cuda.is_available() else "cpu" @@ -19,6 +20,11 @@ def load_text_embedding_model(): model = SentenceTransformer("all-MiniLM-L6-v2") return model +@st.cache_resource +def load_whisper_model(): + model = whisper.load_model("small") + return model + def load_image_index(): index = faiss.read_index('./vectorstore/image_index.index') data = pd.read_csv("./vectorstore/image_data.csv") @@ -29,6 +35,11 @@ def load_text_index(): data = pd.read_csv("./vectorstore/text_data.csv") return index, data +def load_audio_index(): + index = faiss.read_index('./vectorstore/audio_index.index') + data = pd.read_csv("./vectorstore/audio_data.csv") + return index, data + def cosine_similarity(a, b): return torch.cosine_similarity(a, b) diff --git a/vectordb.py b/vectordb.py index 4147668..ca867b3 100644 --- a/vectordb.py +++ b/vectordb.py @@ -11,14 +11,17 @@ import streamlit as st import torch import time +import whisper device = "cuda" if torch.cuda.is_available() else "cpu" os.makedirs("./vectorstore", exist_ok=True) -def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None): +def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None, audio_path: str = None): if not image_path and not text_content: raise ValueError("Either image_path or text_content must be provided.") + if audio_path and not text_content: + raise ValueError("text_content must be provided when audio_path is provided.") if not os.path.exists(f"./vectorstore/{index_path}"): if image_path: index = faiss.IndexFlatL2(512) @@ -42,6 +45,15 @@ def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = new_entry_df = pd.DataFrame({"path": image_path, "index": len(df)}, index=[0]) df = pd.concat([df, new_entry_df], ignore_index=True) df.to_csv("./vectorstore/image_data.csv", index=False) + elif audio_path: + if not os.path.exists("./vectorstore/audio_data.csv"): + df = pd.DataFrame([{"path": audio_path, "content": text_content, "index": 0}]).reset_index(drop=True) + df.to_csv("./vectorstore/audio_data.csv", index=False) + else: + df = pd.read_csv("./vectorstore/audio_data.csv").reset_index(drop=True) + new_entry_df = pd.DataFrame({"path": audio_path, "content": text_content, "index": len(df)}, index=[0]) + df = pd.concat([df, new_entry_df], ignore_index=True) + df.to_csv("./vectorstore/audio_data.csv", index=False) elif text_content: if not os.path.exists("./vectorstore/text_data.csv"): df = pd.DataFrame([{"content": text_content, "index": 0}]).reset_index(drop=True) @@ -120,6 +132,41 @@ def add_pdf_to_index(pdf, clip_model: clip.model.CLIP, preprocess, text_embeddin progress_bar.progress(percent_complete, f"Processing Page {page_num + 1}/{len(pdf_reader.pages)}") return pdf_pages_data + +def add_audio_to_index(audio, whisper_model: whisper.Whisper, text_embedding_model: SentenceTransformer): + if not os.path.exists("./vectorstore/"): + os.makedirs("./vectorstore") + if not os.path.exists("./audio"): + os.makedirs("./audio") + if hasattr(audio, "name"): + audio_name = audio.name + else: + audio_name = f"{time.time()}.wav" + audio_name = audio_name.replace(" ", "_") + with open(f"./audio/{audio_name}", "wb") as f: + 
try: + f.write(audio.read()) + except: + if hasattr(audio, "data"): + audio = io.BytesIO(audio.data) + else: + audio = io.BytesIO(audio) + f.write(audio.read()) + audio_transcript: str = whisper_model.transcribe(f"./audio/{audio_name}")["text"] + text_splitter = CharacterTextSplitter( + separator="\n", + chunk_size=1000, + chunk_overlap=200, + length_function=len, + is_separator_regex=False, + ) + chunks = text_splitter.split_text(audio_transcript) + text_embeddings = text_embedding_model.encode(chunks) + for i, chunk in enumerate(chunks): + update_vectordb(index_path="audio_index.index", embedding=text_embeddings[i], text_content=chunk, audio_path=f"./audio/{audio_name}") + return audio_transcript + + def search_image_index_with_image(image_features, index: faiss.IndexFlatL2, clip_model: clip.model.CLIP, k: int = 3): with torch.no_grad(): distances, indices = index.search(image_features.cpu().numpy(), k) From 68cd444d9df8155fcdae337842fa9edd028c96d7 Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:27:22 +0530 Subject: [PATCH 2/5] feat: add requirements --- requirements.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4d76931..8ff406c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ cachetools==5.5.0 certifi==2024.12.14 charset-normalizer==3.4.1 click==8.1.8 -clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 +clip==1.0 colorama==0.4.6 contourpy==1.3.1 cycler==0.12.1 @@ -26,6 +26,7 @@ fonttools==4.55.3 frozenlist==1.5.0 fsspec==2024.9.0 ftfy==6.3.1 +future==1.0.0 gitdb==4.0.11 GitPython==3.1.43 greenlet==3.1.1 @@ -48,20 +49,24 @@ langchain-core==0.3.28 langchain-experimental==0.3.4 langchain-text-splitters==0.3.4 langsmith==0.1.147 +llvmlite==0.43.0 lxml==5.1.0 markdown-it-py==3.0.0 MarkupSafe==3.0.2 marshmallow==3.23.2 matplotlib==3.10.0 mdurl==0.1.2 +more-itertools==10.5.0 mpmath==1.3.0 multidict==6.1.0 multiprocess==0.70.16 mypy-extensions==1.0.0 narwhals==1.19.1 networkx==3.4.2 +numba==0.60.0 numpy==1.26.4 open_clip_torch==2.29.0 +openai-whisper==20240930 orjson==3.10.12 packaging==24.2 pandas==2.2.3 @@ -100,6 +105,7 @@ streamlit-option-menu==0.4.0 sympy==1.13.1 tenacity==8.5.0 threadpoolctl==3.5.0 +tiktoken==0.8.0 timm==1.0.12 tokenizers==0.21.0 toml==0.10.2 From acf79a60a00a8584bb8a8da0f8c962b154803359 Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:27:59 +0530 Subject: [PATCH 3/5] feat: add requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8ff406c..6a0f80c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ cachetools==5.5.0 certifi==2024.12.14 charset-normalizer==3.4.1 click==8.1.8 -clip==1.0 +clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 colorama==0.4.6 contourpy==1.3.1 cycler==0.12.1 From 1c179c34114a60824c320f3733a0169ff61d90f2 Mon Sep 17 00:00:00 2001 From: notshrirang Date: Sat, 4 Jan 2025 17:55:09 +0530 Subject: [PATCH 4/5] feat: add captions to retrieved audios --- data_search/data_search_page.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/data_search/data_search_page.py b/data_search/data_search_page.py index 4cb6bf7..cedf0fa 100644 --- a/data_search/data_search_page.py +++ b/data_search/data_search_page.py @@ -100,7 +100,7 @@ def load_adapter(): audio_indices = search_text_index(text_input, audio_index, text_embedding_model, k=3) 
if not image_index and not text_index and not audio_index: st.error("No Data Found! Please add data to the database.") - st.subheader("Top 3 Results") + st.subheader("Image Results") cols = st.columns(3) for i in range(3): with cols[i]: @@ -114,15 +114,19 @@ def load_adapter(): cosine_similarity = torch.cosine_similarity(image_features, text_features) st.write(f"Similarity: {cosine_similarity.item() * 100:.2f}%") st.image(image_path) + st.subheader("Text Results") cols = st.columns(3) for i in range(3): with cols[i]: if text_index: text_content = text_data['content'].iloc[text_indices[0][i]] st.write(text_content) + st.subheader("Audio Results") cols = st.columns(3) for i in range(3): with cols[i]: if audio_index: audio_path = audio_data['path'].iloc[audio_indices[0][i]] - st.audio(audio_path) \ No newline at end of file + audio_content = audio_data['content'].iloc[audio_indices[0][i]] + st.audio(audio_path) + st.write(f"_{audio_content}_") \ No newline at end of file From 32799629fb972d94f62934895da0358bef9f6c8d Mon Sep 17 00:00:00 2001 From: Shrirang Mahajan <85283622+NotShrirang@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:58:57 +0530 Subject: [PATCH 5/5] Update README.md --- README.md | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 027d463..4886801 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ ![GitHub repo size](https://img.shields.io/github/repo-size/NotShrirang/LoomRAG) -This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages OpenAI's CLIP model for neural cross-modal retrieval and semantic search. The system allows users to input text queries and retrieve both text and image responses seamlessly through vector embeddings. It features a comprehensive annotation interface for creating custom datasets and supports CLIP model fine-tuning with configurable parameters for domain-specific applications. The system also supports uploading images and PDFs for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface. +This project implements a Multimodal Retrieval-Augmented Generation (RAG) system, named **LoomRAG**, that leverages **OpenAI's CLIP** model for neural cross-modal image retrieval and semantic search, and **OpenAI's Whisper** model for audio processing. The system allows users to input text queries, images, or audio to retrieve multimodal responses seamlessly through vector embeddings. It features a comprehensive annotation interface for creating custom datasets and supports CLIP model fine-tuning with configurable parameters for domain-specific applications. The system also supports uploading images, PDFs, and audio files (including real-time recording) for enhanced interaction and intelligent retrieval capabilities through a Streamlit-based interface. 
 Experience the project in action:
 
@@ -33,10 +33,11 @@
 
 - πŸ”„ **Cross-Modal Retrieval**: Search text to retrieve both text and image results using deep learning
 - πŸ–ΌοΈ **Image-Based Search**: Search the database by uploading an image to find similar content
-- 🧠 **Embedding-Based Search**: Uses OpenAI's CLIP model to align text and image embeddings in a shared latent space
+- 🧠 **Embedding-Based Search**: Uses OpenAI's CLIP and Whisper together with SentenceTransformers embedding models to embed the input data
 - 🎯 **CLIP Fine-Tuning**: Supports custom model training with configurable parameters including test dataset split size, learning rate, optimizer, and weight decay
 - πŸ”¨ **Fine-Tuned Model Integration**: Seamlessly load and utilize fine-tuned CLIP models for enhanced search and retrieval
-- πŸ“€ **Upload Options**: Allows users to upload images and PDFs for AI-powered processing and retrieval
+- πŸ“€ **Upload Options**: Allows users to upload images, PDFs, and audio files for AI-powered processing and retrieval
+- πŸŽ™οΈ **Audio Integration**: Upload audio files or record audio directly through the interface
 - πŸ”— **URL Integration**: Add images directly using URLs and scrape website data including text and images
 - πŸ•·οΈ **Web Scraping**: Automatically extract and index content from websites for comprehensive search capabilities
 - 🏷️ **Image Annotation**: Enables users to annotate uploaded images through an intuitive interface
@@ -45,6 +46,17 @@
 
 ---
 
+## πŸ—ΊοΈ Roadmap
+
+- [x] Fine-tuning CLIP for domain-specific datasets
+- [x] Image-based search and retrieval
+- [x] Adding support for audio modalities
+- [ ] Adding support for video modalities
+- [ ] Improving the re-ranking system for better contextual relevance
+- [ ] Enhanced PDF parsing with semantic section segmentation
+
+---
+
 ## πŸ—οΈ Architecture Overview
 
 1. **Data Indexing**:
@@ -56,13 +68,14 @@
 2. **Query Processing**:
 
    - Text queries / image-based queries are converted into embeddings for semantic search
-   - Uploaded images and PDFs are processed and embedded for comparison
-   - The system performs a nearest neighbor search in the vector database to retrieve relevant text and images
+   - Uploaded images, audio files, and PDFs are processed and embedded for comparison
+   - The system performs a nearest neighbor search in the vector database to retrieve relevant text, images, and audio
 
 3. **Response Generation**:
 
    - For text results: Optionally refined or augmented using a language model
    - For image results: Directly returned or enhanced with image captions
+   - For audio results: Returned with relevant metadata and transcriptions where applicable
    - For PDFs: Extracts text content and provides relevant sections
 
 4. **Image Annotation**:
@@ -108,6 +121,7 @@
    - Access the interface in your browser to:
      - Submit natural language queries
      - Upload images or PDFs to retrieve contextually relevant results
+     - Upload or record audio files
     - Add images using URLs
     - Scrape and index website content
     - Search using uploaded images
@@ -132,16 +146,6 @@
 
 ---
 
-## πŸ—ΊοΈ Roadmap
-
-- [x] Fine-tuning CLIP for domain-specific datasets
-- [x] Image-based search and retrieval
-- [ ] Adding support for audio and video modalities
-- [ ] Improving the re-ranking system for better contextual relevance
-- [ ] Enhanced PDF parsing with semantic section segmentation
-
----
-
 ## 🀝 Contributing
 
 Contributions are welcome! Please open an issue or submit a pull request for any feature requests or bug fixes.
@@ -157,5 +161,6 @@ This project is licensed under the Apache-2.0 License. See the [LICENSE](LICENSE
 ## πŸ™ Acknowledgments
 
 - [OpenAI CLIP](https://openai.com/research/clip)
+- [OpenAI Whisper](https://github.com/openai/whisper)
 - [FAISS](https://github.com/facebookresearch/faiss)
 - [Hugging Face](https://huggingface.co/)
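
Below is a minimal, self-contained sketch of the audio path these patches wire together (Whisper transcription, transcript chunking, SentenceTransformer embeddings, FAISS search). It is illustrative only, not code from the repository: `sample.wav` and the query string are placeholders, it builds a throwaway in-memory index rather than the `./vectorstore/audio_index.index` and `audio_data.csv` pair that `update_vectordb()` maintains, and it assumes `openai-whisper`, `sentence-transformers`, `faiss`, `numpy`, and `langchain-text-splitters` are installed, plus `ffmpeg` on the PATH for Whisper.

```python
import faiss
import numpy as np
import whisper
from langchain_text_splitters import CharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Same checkpoints the app loads in utils.load_whisper_model() / load_text_embedding_model().
whisper_model = whisper.load_model("small")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# 1. Transcribe the audio file to text ("sample.wav" is a hypothetical local file).
transcript = whisper_model.transcribe("sample.wav")["text"]

# 2. Split the transcript into overlapping chunks, mirroring the splitter settings
#    used by add_audio_to_index() in vectordb.py.
splitter = CharacterTextSplitter(
    separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
)
chunks = splitter.split_text(transcript)

# 3. Embed the chunks (384-dim vectors for all-MiniLM-L6-v2) and index them with FAISS.
embeddings = np.asarray(embedder.encode(chunks), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# 4. Search the audio index with a plain text query; each hit maps back to a transcript
#    chunk (and, in the app, to the saved file path stored in audio_data.csv).
query = np.asarray(embedder.encode(["placeholder query about the recording"]), dtype="float32")
distances, indices = index.search(query, 3)
for rank, chunk_id in enumerate(indices[0]):
    print(rank, float(distances[0][rank]), chunks[chunk_id][:80])
```

Because the transcript chunks are embedded with the same text model used for PDFs and website content, the existing `search_text_index()` helper can be reused unchanged for the audio index, which is what `data_search_page.py` does in these patches.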