# langchain_ollama_llama3_rag_for_docx.py
# pip install --upgrade nltk
# pip install langchain unstructured python-docx sentence-transformers transformers torch accelerate
# pip install langchain-community
# pip install "unstructured[docx,pptx]"
import nltk
import time
nltk.download("punkt")
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
# from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# from langchain_openai import ChatOpenAI
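
# Pipeline overview: load the .docx with Unstructured, split it into overlapping chunks,
# embed the chunks into a Chroma vector store with nomic-embed-text, then answer a
# retrieval-augmented query about the document with a local llama3.1 model.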

def load_docx(file_path):
    loader = UnstructuredWordDocumentLoader(file_path)
    documents = loader.load()
    print(f"Loaded {len(documents)} documents")
    return documents

def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=1000)
    chunks = text_splitter.split_documents(documents)
    # Preview the first chunk as a quick sanity check
    document = chunks[0]
    print(document.page_content)
    print(document.metadata)
    print(f"Split into {len(chunks)} chunks")
    return chunks

def create_vector_store(chunks):
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
        collection_name="local-rag",
    )
    return vector_db
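
# Optional sketch (not used above): to keep the index between runs, Chroma.from_documents
# also accepts a persist_directory argument, e.g.
#   Chroma.from_documents(documents=chunks, embedding=..., persist_directory="./chroma_db")
# where "./chroma_db" is only an example path.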

def setup_llm():
    local_model = "llama3.1:latest"
    llm = ChatOllama(model=local_model)
    return llm

def create_question_extraction_pipeline(vectorstore, llm):
    retriever = vectorstore.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
    )
    return qa_chain
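
# Sketch (assumption, not wired in above): the MultiQueryRetriever imported at the top could
# wrap the plain retriever so the LLM rewrites each query into several variants, which may
# improve recall on loosely worded documents:
#   retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)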

def extract_questions(qa_chain):
    query = """
    [INST] Based on the content of the document, find all the questions for assessment task 1.
    Format your response as a numbered list. [/INST]
    """
    result = qa_chain.invoke({"query": query})
    return result["result"]

def main():
    file_path = "./data/Student_Assessment_Tasks.docx"
    documents = load_docx(file_path)
    splits = split_documents(documents)
    # for i in range(len(splits)):
    #     print(f'======splits[{i}]========== {splits[i]}\n\n')
    vectorstore = create_vector_store(splits)
    llm = setup_llm()  # need to use OpenAI here
    start_time = time.time()
    qa_chain = create_question_extraction_pipeline(vectorstore, llm)
    questions = extract_questions(qa_chain)
    end_time = time.time()
    print("Extracted Questions:")
    print(questions)
    print(f"Time spent: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()