Skip to content

Commit

Permalink
allow skipping excerpting
Browse files Browse the repository at this point in the history
  • Loading branch information
zzstoatzz committed Nov 14, 2024
1 parent b9d1e7b commit b4c0a6d
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 2 deletions.
35 changes: 35 additions & 0 deletions examples/scrape_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# /// script
# dependencies = [
# "raggy",
# "trafilatura",
# ]
# ///

import asyncio

from bs4 import BeautifulSoup

import raggy
from raggy.loaders.web import SitemapLoader


def html_parser(html: str) -> str:
    """Convert a raw HTML document into plain text.

    ``trafilatura.extract`` is tried first; if it produces a falsy result
    (``None`` or an empty string), the text of the whole document as
    returned by BeautifulSoup's ``get_text()`` is used instead.

    Args:
        html: The HTML markup to extract text from.

    Returns:
        The extracted text.
    """
    import trafilatura

    # NOTE(review): a fresh config object is built on every call.
    config = trafilatura.settings.use_config()  # type: ignore
    # presumably "0" turns off trafilatura's extraction timeout -- TODO confirm
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    extracted = trafilatura.extract(html, config=config)
    if extracted:
        return extracted
    return BeautifulSoup(html, "html.parser").get_text()


async def main(urls: list[str]):
    """Load every document reachable from the given sitemaps and print a count.

    Args:
        urls: Sitemap URLs passed to ``SitemapLoader``.
    """
    # Register the custom parser on raggy's settings before loading anything.
    raggy.settings.html_parser = html_parser

    # create_excerpts=False asks the loader not to split documents into excerpts.
    docs = await SitemapLoader(urls=urls, create_excerpts=False).load()
    print(f"scraped {len(docs)} documents")


if __name__ == "__main__":
    # Script entry point: scrape the single sitemap below.
    sitemap_urls = ["https://prefect.io/blog/sitemap.xml"]
    asyncio.run(main(sitemap_urls))
15 changes: 13 additions & 2 deletions src/raggy/loaders/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,12 @@ class URLLoader(WebLoader):
Attributes:
urls: The URLs to load from.
create_excerpts: Whether to split documents into excerpts. Defaults to True.
"""

source_type: str = "url"
urls: list[str] = Field(default_factory=list)
create_excerpts: bool = Field(default=True)

async def load(self) -> list[Document]:
headers = await self.get_headers()
Expand All @@ -70,7 +72,10 @@ async def load_url_task(url):
final_documents = []
for d in documents:
if d is not None:
final_documents.extend(await document_to_excerpts(d))
if self.create_excerpts:
final_documents.extend(await document_to_excerpts(d))
else:
final_documents.append(d)
return final_documents

async def load_url(self, url, client) -> Document | None:
Expand Down Expand Up @@ -136,6 +141,7 @@ class SitemapLoader(URLLoader):
include: A list of strings or regular expressions. Only URLs that match one of these will be included.
exclude: A list of strings or regular expressions. URLs that match one of these will be excluded.
url_loader: The loader to use for loading the URLs.
create_excerpts: Whether to split documents into excerpts. Defaults to True.
Examples:
Load all URLs from a sitemap:
```python
Expand All @@ -150,14 +156,19 @@ class SitemapLoader(URLLoader):
exclude: list[str | re.Pattern] = Field(default_factory=list)
url_loader: URLLoader = Field(default_factory=HTMLLoader)
url_processor: Callable[[str], str] = lambda x: x # noqa: E731
create_excerpts: bool = Field(default=True)

async def _get_loader(self: Self) -> MultiLoader:
urls = await run_concurrent_tasks(
[lambda u=url: self.load_sitemap(u) for url in self.urls], max_concurrent=5
)
return MultiLoader(
loaders=[
type(self.url_loader)(urls=url_batch, headers=await self.get_headers()) # type: ignore
type(self.url_loader)(
urls=url_batch,
headers=await self.get_headers(),
create_excerpts=self.create_excerpts,
) # type: ignore
for url_batch in batched(
[self.url_processor(u) for url_list in urls for u in url_list], # type: ignore
10,
Expand Down

0 comments on commit b4c0a6d

Please sign in to comment.