PDF Parse Workflow with Docling and Elastic Search (#1137)
Commit 498dc94 (1 parent: 25303e2). Showing 9 changed files with 346 additions and 17 deletions.
New file (135 additions): ElasticSearchWriter, an Indexify function that writes text and image embeddings into Elasticsearch indices.
from elastic_transport import ApiError
from elasticsearch import Elasticsearch
from typing import Union
import base64
import uuid

from common_objects import ImageWithEmbedding, TextChunk
from indexify.functions_sdk.indexify_functions import IndexifyFunction

from images import st_image


class ElasticSearchWriter(IndexifyFunction):
    name = "elastic_search_writer"
    image = st_image

    def __init__(self):
        super().__init__()
        # Connect to Elasticsearch
        self._client = Elasticsearch(
            hosts=["http://elasticsearch:9200"],  # <User Change>: default is service name in the docker compose file.
            verify_certs=False,
            ssl_show_warn=False,
            # basic_auth=("elastic", "your_password"),
            retry_on_timeout=True,
            max_retries=3,
            request_timeout=5,
        )

        # Create indices if they don't exist
        self._create_indices_if_not_exists()

    def _create_indices_if_not_exists(self):
        # Text index mapping
        text_mapping = {
            "mappings": {
                "properties": {
                    "embedding": {
                        "type": "dense_vector",
                        "dims": 768,
                        "index": True,
                        "similarity": "cosine",
                        "index_options": {
                            "type": "hnsw",
                            "m": 16,
                            "ef_construction": 100
                        }
                    },
                    "page_number": {"type": "integer"},
                    "chunk": {"type": "text"}
                }
            },
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
            },
        }

        # Image index mapping
        image_mapping = {
            "mappings": {
                "properties": {
                    "embedding": {
                        "type": "dense_vector",
                        "dims": 512,
                        "index": True,
                        "similarity": "cosine",
                        "index_options": {
                            "type": "hnsw",
                            "m": 16,
                            "ef_construction": 100
                        }
                    },
                    "page_number": {"type": "integer"},
                    "image_data": {"type": "binary"}
                }
            },
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
            },
        }

        try:
            self._client.indices.create(index="text_embeddings", body=text_mapping)
        except ApiError as e:
            if e.status_code == 400 and "resource_already_exists_exception" in str(e):
                print("Text index already exists. Continuing.")
            else:
                raise e

        try:
            self._client.indices.create(index="image_embeddings", body=image_mapping)
        except ApiError as e:
            if e.status_code == 400 and "resource_already_exists_exception" in str(e):
                print("Image index already exists. Continuing.")
            else:
                raise e

    def run(self, input: Union[ImageWithEmbedding, TextChunk]) -> bool:
        try:
            if isinstance(input, ImageWithEmbedding):
                # Convert image bytes to base64 for storage
                image_base64 = base64.b64encode(input.image_bytes).decode('utf-8')

                document = {
                    "embedding": input.embedding,
                    "page_number": input.page_number,
                    "image_data": image_base64
                }

                self._client.index(
                    index="image_embeddings",
                    id=str(uuid.uuid4()),
                    document=document
                )

            elif isinstance(input, TextChunk):
                document = {
                    "embedding": input.embeddings,
                    "page_number": input.page_number,
                    "chunk": input.chunk
                }

                self._client.index(
                    index="text_embeddings",
                    id=str(uuid.uuid4()),
                    document=document
                )

            return True

        except Exception as e:
            print(f"Error indexing document: {str(e)}")
            return False
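Not part of this commit: once ElasticSearchWriter has populated the text_embeddings index, the chunks can be retrieved with Elasticsearch's approximate kNN search over the dense_vector field. The sketch below assumes an Elasticsearch 8.x cluster at the same host as above and a 768-dimensional sentence-transformers model (all-mpnet-base-v2 is a stand-in; the embedding model used elsewhere in the workflow is not shown in this diff).

# Retrieval sketch (assumptions: Elasticsearch 8.x client, 768-dim embedding model).
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

client = Elasticsearch(hosts=["http://elasticsearch:9200"], verify_certs=False, ssl_show_warn=False)
model = SentenceTransformer("all-mpnet-base-v2")  # stand-in 768-dim model, matches the text index mapping

# Embed the query with the same dimensionality as the "embedding" field (dims=768).
query_vector = model.encode("What does the document say about revenue?").tolist()

response = client.search(
    index="text_embeddings",
    knn={
        "field": "embedding",
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 50,
    },
)

for hit in response["hits"]["hits"]:
    source = hit["_source"]
    print(hit["_score"], source["page_number"], source["chunk"][:80])

The same pattern applies to the image_embeddings index, except the query vector must be 512-dimensional to match its mapping.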
New file (52 additions): PDFParserDocling, an Indexify function that parses a PDF with Docling into per-page markdown text and extracted images.
from common_objects import PDFParserDoclingOutput
from indexify.functions_sdk.data_objects import File
from indexify.functions_sdk.indexify_functions import IndexifyFunction

from images import inkwell_image_gpu


class PDFParserDocling(IndexifyFunction):
    name = "pdf-parse-docling"
    description = "Parser class that captures a pdf file"
    # Change to gpu_image to use GPU
    image = inkwell_image_gpu

    def __init__(self):
        super().__init__()

    def run(self, file: File) -> PDFParserDoclingOutput:
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        IMAGE_RESOLUTION_SCALE = 2.0
        pipeline_options = PdfPipelineOptions()
        pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
        pipeline_options.generate_page_images = True

        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.base_models import InputFormat

        import tempfile
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as f:
            f.write(file.data)
            f.flush()  # ensure the PDF bytes are on disk before Docling opens the file by name
            converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
                }
            )
            result = converter.convert(f.name)

        texts = []
        for i in range(len(result.pages)):
            page_result = result.document.export_to_markdown(page_no=i + 1)
            texts.append(page_result)

        images = []
        for element, _level in result.document.iterate_items():
            from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
            if isinstance(element, PictureItem):
                pil_image = element.get_image(result.document)

                # Using docling APIs to avoid confusion.
                b64 = element._image_to_base64(pil_image)
                images.append(b64)

        return PDFParserDoclingOutput(texts=texts, images=images)
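For reference, and not part of the commit, the same Docling conversion can be run outside Indexify to inspect what PDFParserDocling produces for each page. The file name sample.pdf is a placeholder; the pipeline options mirror the ones set in run() above.

# Standalone sketch of the Docling conversion used by PDFParserDocling.
# "sample.pdf" is a placeholder input path.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import PictureItem

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0          # same resolution scale as the function above
pipeline_options.generate_page_images = True

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("sample.pdf")

# Per-page markdown, as returned in PDFParserDoclingOutput.texts
for page_no in range(1, len(result.pages) + 1):
    print(f"--- page {page_no} ---")
    print(result.document.export_to_markdown(page_no=page_no)[:200])

# Embedded pictures, as returned (base64-encoded) in PDFParserDoclingOutput.images
pictures = [
    element
    for element, _level in result.document.iterate_items()
    if isinstance(element, PictureItem)
]
print(f"extracted {len(pictures)} pictures")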
Updated dependency list (the file grows from 4 to 10 lines):
httpx
indexify
pydantic
py-inkwell
chromadb
docling==2.14.0
docling-core
sentence-transformers
chromadb
elasticsearch
py-inkwell[api]
langchain-text-splitters
elastic-transport
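A quick sanity check, not part of the commit: before running the workflow, you can confirm that the Elasticsearch instance assumed by ElasticSearchWriter is reachable. The host below is the docker-compose service name the writer defaults to; adjust it for your setup.

# Connectivity check sketch; the host assumes the docker-compose service name.
from elasticsearch import Elasticsearch

client = Elasticsearch(hosts=["http://elasticsearch:9200"], request_timeout=5)

if client.ping():
    print("Connected to Elasticsearch", client.info()["version"]["number"])
    print("text_embeddings exists:", bool(client.indices.exists(index="text_embeddings")))
    print("image_embeddings exists:", bool(client.indices.exists(index="image_embeddings")))
else:
    print("Elasticsearch is not reachable at http://elasticsearch:9200")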