Skip to content

Commit

Permalink
fix typing
Browse files Browse the repository at this point in the history
  • Loading branch information
zzstoatzz committed Dec 21, 2024
1 parent e4b408f commit 3f7b4e2
Show file tree
Hide file tree
Showing 24 changed files with 1,156 additions and 176 deletions.
13 changes: 7 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ repos:
- id: ruff-format
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
- repo: local
hooks:
- id: mypy
additional_dependencies:
- "aiofiles"
- "types-aiofiles"
- id: typecheck
name: Typecheck
entry: uv run mypy .
types: [python]
language: system
pass_filenames: false
16 changes: 10 additions & 6 deletions examples/chat_with_X/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@

import asyncio
import warnings
from typing import Any

import httpx
from marvin.beta.assistants import Assistant
from marvin.beta.assistants import Assistant # type: ignore
from prefect import flow, task
from prefect.context import TaskRunContext
from rich.status import Status

from raggy.documents import Document
Expand All @@ -22,7 +24,9 @@
TPUF_NS = "demo"


def get_last_commit_sha(context, parameters) -> str | None:
def get_last_commit_sha(
context: TaskRunContext, parameters: dict[str, Any]
) -> str | None:
"""Cache based on Last-Modified header of the first URL."""
try:
return httpx.get(
Expand All @@ -47,10 +51,10 @@ async def ingest_repo(repo: str):
Args:
repo: The repository to ingest (format: "owner/repo").
"""
documents = await gather_documents(repo)
documents: list[Document] = await gather_documents(repo) # type: ignore
with TurboPuffer(namespace=TPUF_NS) as tpuf:
print(f"Upserting {len(documents)} documents into {TPUF_NS}")
await task(tpuf.upsert_batched)(documents)
print(f"Upserting {len(documents)} documents into {TPUF_NS}") # type: ignore
await task(tpuf.upsert_batched)(documents) # type: ignore


@task(task_run_name="querying: {query_texts}")
Expand Down Expand Up @@ -84,7 +88,7 @@ async def chat_with_repo(initial_message: str | None = None, clean_up: bool = Tr
ingest_repo,
do_research,
],
) as assistant:
) as assistant: # type: ignore
assistant.chat(initial_message=initial_message) # type: ignore

finally:
Expand Down
20 changes: 12 additions & 8 deletions examples/chat_with_X/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
import re
import warnings
from datetime import timedelta
from typing import Any

import httpx
from marvin.beta.assistants import Assistant
from marvin.beta.assistants import Assistant # type: ignore
from prefect import flow, task
from prefect.context import TaskRunContext
from rich.status import Status

from raggy.documents import Document
Expand All @@ -24,7 +26,9 @@
TPUF_NS = "demo"


def get_last_modified(context, parameters):
def get_last_modified(
context: TaskRunContext, parameters: dict[str, Any]
) -> str | None:
"""Cache based on Last-Modified header of the first URL."""
try:
with httpx.Client() as client:
Expand All @@ -40,25 +44,25 @@ def get_last_modified(context, parameters):
cache_expiration=timedelta(hours=24),
)
async def gather_documents(
urls: list[str], exclude: list[str | re.Pattern] | None = None
urls: list[str], exclude: list[str | re.Pattern[str]] | None = None
) -> list[Document]:
return await SitemapLoader(urls=urls, exclude=exclude or []).load()


@flow(flow_run_name="{urls}")
async def ingest_website(
urls: list[str], exclude: list[str | re.Pattern] | None = None
urls: list[str], exclude: list[str | re.Pattern[str]] | None = None
):
"""Ingest a website into the vector database.
Args:
urls: The URLs to ingest (exact or glob patterns).
exclude: The URLs to exclude (exact or glob patterns).
"""
documents = await gather_documents(urls, exclude)
documents: list[Document] = await gather_documents(urls, exclude) # type: ignore
with TurboPuffer(namespace=TPUF_NS) as tpuf:
print(f"Upserting {len(documents)} documents into {TPUF_NS}")
await tpuf.upsert_batched(documents)
print(f"Upserting {len(documents)} documents into {TPUF_NS}") # type: ignore
await tpuf.upsert_batched(documents) # type: ignore


@task(task_run_name="querying: {query_texts}")
Expand Down Expand Up @@ -92,7 +96,7 @@ async def chat_with_website(initial_message: str | None = None, clean_up: bool =
ingest_website,
do_research,
],
) as assistant:
) as assistant: # type: ignore
assistant.chat(initial_message=initial_message) # type: ignore

finally:
Expand Down
36 changes: 19 additions & 17 deletions examples/reddit_thread.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

from functools import lru_cache

import marvin
import praw
from marvin.utilities.logging import get_logger
import marvin # type: ignore
import praw # type: ignore
from marvin.utilities.logging import get_logger # type: ignore
from pydantic_settings import BaseSettings, SettingsConfigDict

from raggy.documents import Document, document_to_excerpts
from raggy.vectorstores.tpuf import TurboPuffer, query_namespace

logger = get_logger("reddit_thread_example")
logger = get_logger("reddit_thread_example") # type: ignore


class Settings(BaseSettings):
Expand All @@ -26,45 +26,47 @@ class Settings(BaseSettings):
settings = Settings()


def create_reddit_client() -> praw.Reddit:
return praw.Reddit(
def create_reddit_client() -> praw.Reddit: # type: ignore
return praw.Reddit( # type: ignore
client_id=getattr(settings, "reddit_client_id"),
client_secret=getattr(settings, "reddit_client_secret"),
user_agent="testscript by /u/_n80n8",
)


@lru_cache
def read_thread(submission_id: str):
logger.info(f"Reading thread {submission_id}")
submission = create_reddit_client().submission(submission_id)
def read_thread(submission_id: str) -> str:
logger.info(f"Reading thread {submission_id}") # type: ignore
submission: praw.models.Submission = create_reddit_client().submission( # type: ignore
submission_id
)

text_buffer = ""
text_buffer += f"Title: {submission.title}\n"
text_buffer += f"Selftext: {submission.selftext}\n"
text_buffer += f"Title: {submission.title}\n" # type: ignore
text_buffer += f"Selftext: {submission.selftext}\n" # type: ignore

submission.comments.replace_more(limit=None) # Retrieve all comments
for comment in submission.comments.list():
submission.comments.replace_more(limit=None) # type: ignore
for comment in submission.comments.list(): # type: ignore
text_buffer += "\n---\n"
text_buffer += f"Comment Text: {comment.body}\n"
text_buffer += f"Comment Text: {comment.body}\n" # type: ignore

return text_buffer


@marvin.fn
@marvin.fn # type: ignore
def summarize_results(relevant_excerpts: str) -> str: # type: ignore[empty-body]
"""give a summary of the relevant excerpts"""


async def main(thread_id: str):
logger.info("Starting Reddit thread example")
logger.info("Starting Reddit thread example") # type: ignore
thread_text = read_thread(thread_id)
chunked_documents = await document_to_excerpts(Document(text=thread_text))

with TurboPuffer(namespace="reddit_thread") as tpuf:
tpuf.upsert(chunked_documents)

logger.info("Thread saved!")
logger.info("Thread saved!") # type: ignore

query = "how do people feel about the return of the water taxis?"
results = query_namespace(query, namespace="reddit_thread")
Expand Down
4 changes: 2 additions & 2 deletions examples/refresh_vectorstore/chroma_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ def refresh_chroma(
mode: Literal["upsert", "reset"] = "upsert",
):
"""Flow updating vectorstore with info from the Prefect community."""
documents = [
documents: list[Document] = [
doc
for future in run_loader.map(prefect_loaders) # type: ignore
for doc in future.result()
for doc in future.result() # type: ignore
]

print(f"Loaded {len(documents)} documents from the Prefect community.")
Expand Down
18 changes: 12 additions & 6 deletions examples/refresh_vectorstore/tpuf_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

import os
from datetime import timedelta
from typing import Any, Sequence

from prefect import flow, task
from prefect.context import TaskRunContext
from prefect.tasks import task_input_hash
from prefect.utilities.annotations import quote

Expand Down Expand Up @@ -58,7 +60,9 @@
}


def _cache_key_with_invalidation(context, parameters):
def _cache_key_with_invalidation(
context: TaskRunContext, parameters: dict[str, Any]
) -> str:
return f"{task_input_hash(context, parameters)}:{os.getenv("RAGGY_CACHE_VERSION", "0")}"


Expand All @@ -81,16 +85,16 @@ async def run_loader(loader: Loader) -> list[Document]:
)
def refresh_tpuf_namespace(
namespace: str,
namespace_loaders: list[Loader],
namespace_loaders: Sequence[Loader],
reset: bool = False,
batch_size: int = 100,
max_concurrent: int = 8,
):
"""Flow updating vectorstore with info from the Prefect community."""
documents: list[Document] = [
doc
for future in run_loader.map(quote(namespace_loaders))
for doc in future.result()
for future in run_loader.map(quote(namespace_loaders)) # type: ignore
for doc in future.result() # type: ignore
]

print(f"Loaded {len(documents)} documents from the Prefect community.")
Expand All @@ -100,8 +104,10 @@ def refresh_tpuf_namespace(
task(tpuf.reset)()
print(f"RESETTING: Deleted all documents from tpuf ns {namespace!r}.")

task(tpuf.upsert_batched).submit(
documents=documents, batch_size=batch_size, max_concurrent=max_concurrent
task(tpuf.upsert_batched).submit( # type: ignore
documents=documents,
batch_size=batch_size,
max_concurrent=max_concurrent,
).wait()

print(f"Updated tpuf ns {namespace!r} with {len(documents)} documents.")
Expand Down
42 changes: 42 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies = [
"chardet",
"fake-useragent",
"gh-util",
"prefect",
"pydantic-ai-slim[openai]",
"pypdf",
"tenacity",
Expand Down Expand Up @@ -103,3 +104,44 @@ skip-magic-trailing-comma = false

[tool.setuptools_scm]
write_to = "src/raggy/_version.py"

[tool.mypy]
ignore_missing_imports = true
check_untyped_defs = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_return_any = true
warn_unreachable = true
strict_optional = true
exclude = ["examples/.*"]

[[tool.mypy.overrides]]
module = [
"bs4.*",
"fake_useragent.*",
"gh_util.*",
"turbopuffer.*",
"prefect.*",
"marvin.*",
"praw.*",
"yake.*",
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"raggy.cli",
"raggy.settings",
"raggy.utilities.text",
"raggy.utilities.asyncutils",
"raggy.vectorstores.tpuf",
"raggy.vectorstores.chroma",
"raggy.loaders.web",
"raggy.loaders.pdf",
]
disable_error_code = [
"unreachable",
"no-any-return",
"unused-ignore",
"no-redef",
]
2 changes: 2 additions & 0 deletions src/raggy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from .documents import Document
from .settings import settings

__all__ = ["Document", "settings"]
Loading

0 comments on commit 3f7b4e2

Please sign in to comment.