fix typing

zzstoatzz · Dec 21, 2024 · 3f7b4e2 · 3f7b4e2
1 parent e4b408f
commit 3f7b4e2
Show file tree

Hide file tree

Showing 24 changed files with 1,156 additions and 176 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,10 +5,11 @@ repos:
       - id: ruff-format
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
+  - repo: local
     hooks:
-      - id: mypy
-        additional_dependencies:
-          - "aiofiles"
-          - "types-aiofiles"
+      - id: typecheck
+        name: Typecheck
+        entry: uv run mypy .
+        types: [python]
+        language: system
+        pass_filenames: false
diff --git a/examples/chat_with_X/repo.py b/examples/chat_with_X/repo.py
@@ -9,10 +9,12 @@
 
 import asyncio
 import warnings
+from typing import Any
 
 import httpx
-from marvin.beta.assistants import Assistant
+from marvin.beta.assistants import Assistant  # type: ignore
 from prefect import flow, task
+from prefect.context import TaskRunContext
 from rich.status import Status
 
 from raggy.documents import Document
@@ -22,7 +24,9 @@
 TPUF_NS = "demo"
 
 
-def get_last_commit_sha(context, parameters) -> str | None:
+def get_last_commit_sha(
+    context: TaskRunContext, parameters: dict[str, Any]
+) -> str | None:
     """Cache based on Last-Modified header of the first URL."""
     try:
         return httpx.get(
@@ -47,10 +51,10 @@ async def ingest_repo(repo: str):
     Args:
         repo: The repository to ingest (format: "owner/repo").
     """
-    documents = await gather_documents(repo)
+    documents: list[Document] = await gather_documents(repo)  # type: ignore
     with TurboPuffer(namespace=TPUF_NS) as tpuf:
-        print(f"Upserting {len(documents)} documents into {TPUF_NS}")
-        await task(tpuf.upsert_batched)(documents)
+        print(f"Upserting {len(documents)} documents into {TPUF_NS}")  # type: ignore
+        await task(tpuf.upsert_batched)(documents)  # type: ignore
 
 
 @task(task_run_name="querying: {query_texts}")
@@ -84,7 +88,7 @@ async def chat_with_repo(initial_message: str | None = None, clean_up: bool = Tr
                 ingest_repo,
                 do_research,
             ],
-        ) as assistant:
+        ) as assistant:  # type: ignore
             assistant.chat(initial_message=initial_message)  # type: ignore
 
     finally:

diff --git a/examples/chat_with_X/website.py b/examples/chat_with_X/website.py
@@ -11,10 +11,12 @@
 import re
 import warnings
 from datetime import timedelta
+from typing import Any
 
 import httpx
-from marvin.beta.assistants import Assistant
+from marvin.beta.assistants import Assistant  # type: ignore
 from prefect import flow, task
+from prefect.context import TaskRunContext
 from rich.status import Status
 
 from raggy.documents import Document
@@ -24,7 +26,9 @@
 TPUF_NS = "demo"
 
 
-def get_last_modified(context, parameters):
+def get_last_modified(
+    context: TaskRunContext, parameters: dict[str, Any]
+) -> str | None:
     """Cache based on Last-Modified header of the first URL."""
     try:
         with httpx.Client() as client:
@@ -40,25 +44,25 @@ def get_last_modified(context, parameters):
     cache_expiration=timedelta(hours=24),
 )
 async def gather_documents(
-    urls: list[str], exclude: list[str | re.Pattern] | None = None
+    urls: list[str], exclude: list[str | re.Pattern[str]] | None = None
 ) -> list[Document]:
     return await SitemapLoader(urls=urls, exclude=exclude or []).load()
 
 
 @flow(flow_run_name="{urls}")
 async def ingest_website(
-    urls: list[str], exclude: list[str | re.Pattern] | None = None
+    urls: list[str], exclude: list[str | re.Pattern[str]] | None = None
 ):
     """Ingest a website into the vector database.
 
     Args:
         urls: The URLs to ingest (exact or glob patterns).
         exclude: The URLs to exclude (exact or glob patterns).
     """
-    documents = await gather_documents(urls, exclude)
+    documents: list[Document] = await gather_documents(urls, exclude)  # type: ignore
     with TurboPuffer(namespace=TPUF_NS) as tpuf:
-        print(f"Upserting {len(documents)} documents into {TPUF_NS}")
-        await tpuf.upsert_batched(documents)
+        print(f"Upserting {len(documents)} documents into {TPUF_NS}")  # type: ignore
+        await tpuf.upsert_batched(documents)  # type: ignore
 
 
 @task(task_run_name="querying: {query_texts}")
@@ -92,7 +96,7 @@ async def chat_with_website(initial_message: str | None = None, clean_up: bool =
                 ingest_website,
                 do_research,
             ],
-        ) as assistant:
+        ) as assistant:  # type: ignore
             assistant.chat(initial_message=initial_message)  # type: ignore
 
     finally:

diff --git a/examples/reddit_thread.py b/examples/reddit_thread.py
@@ -8,15 +8,15 @@
 
 from functools import lru_cache
 
-import marvin
-import praw
-from marvin.utilities.logging import get_logger
+import marvin  # type: ignore
+import praw  # type: ignore
+from marvin.utilities.logging import get_logger  # type: ignore
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from raggy.documents import Document, document_to_excerpts
 from raggy.vectorstores.tpuf import TurboPuffer, query_namespace
 
-logger = get_logger("reddit_thread_example")
+logger = get_logger("reddit_thread_example")  # type: ignore
 
 
 class Settings(BaseSettings):
@@ -26,45 +26,47 @@ class Settings(BaseSettings):
 settings = Settings()
 
 
-def create_reddit_client() -> praw.Reddit:
-    return praw.Reddit(
+def create_reddit_client() -> praw.Reddit:  # type: ignore
+    return praw.Reddit(  # type: ignore
         client_id=getattr(settings, "reddit_client_id"),
         client_secret=getattr(settings, "reddit_client_secret"),
         user_agent="testscript by /u/_n80n8",
     )
 
 
 @lru_cache
-def read_thread(submission_id: str):
-    logger.info(f"Reading thread {submission_id}")
-    submission = create_reddit_client().submission(submission_id)
+def read_thread(submission_id: str) -> str:
+    logger.info(f"Reading thread {submission_id}")  # type: ignore
+    submission: praw.models.Submission = create_reddit_client().submission(  # type: ignore
+        submission_id
+    )
 
     text_buffer = ""
-    text_buffer += f"Title: {submission.title}\n"
-    text_buffer += f"Selftext: {submission.selftext}\n"
+    text_buffer += f"Title: {submission.title}\n"  # type: ignore
+    text_buffer += f"Selftext: {submission.selftext}\n"  # type: ignore
 
-    submission.comments.replace_more(limit=None)  # Retrieve all comments
-    for comment in submission.comments.list():
+    submission.comments.replace_more(limit=None)  # type: ignore
+    for comment in submission.comments.list():  # type: ignore
         text_buffer += "\n---\n"
-        text_buffer += f"Comment Text: {comment.body}\n"
+        text_buffer += f"Comment Text: {comment.body}\n"  # type: ignore
 
     return text_buffer
 
 
-@marvin.fn
+@marvin.fn  # type: ignore
 def summarize_results(relevant_excerpts: str) -> str:  # type: ignore[empty-body]
     """give a summary of the relevant excerpts"""
 
 
 async def main(thread_id: str):
-    logger.info("Starting Reddit thread example")
+    logger.info("Starting Reddit thread example")  # type: ignore
     thread_text = read_thread(thread_id)
     chunked_documents = await document_to_excerpts(Document(text=thread_text))
 
     with TurboPuffer(namespace="reddit_thread") as tpuf:
         tpuf.upsert(chunked_documents)
 
-    logger.info("Thread saved!")
+    logger.info("Thread saved!")  # type: ignore
 
     query = "how do people feel about the return of the water taxis?"
     results = query_namespace(query, namespace="reddit_thread")

diff --git a/examples/refresh_vectorstore/chroma_collection.py b/examples/refresh_vectorstore/chroma_collection.py
@@ -65,10 +65,10 @@ def refresh_chroma(
     mode: Literal["upsert", "reset"] = "upsert",
 ):
     """Flow updating vectorstore with info from the Prefect community."""
-    documents = [
+    documents: list[Document] = [
         doc
         for future in run_loader.map(prefect_loaders)  # type: ignore
-        for doc in future.result()
+        for doc in future.result()  # type: ignore
     ]
 
     print(f"Loaded {len(documents)} documents from the Prefect community.")

diff --git a/examples/refresh_vectorstore/tpuf_namespace.py b/examples/refresh_vectorstore/tpuf_namespace.py
@@ -7,8 +7,10 @@
 
 import os
 from datetime import timedelta
+from typing import Any, Sequence
 
 from prefect import flow, task
+from prefect.context import TaskRunContext
 from prefect.tasks import task_input_hash
 from prefect.utilities.annotations import quote
 
@@ -58,7 +60,9 @@
 }
 
 
-def _cache_key_with_invalidation(context, parameters):
+def _cache_key_with_invalidation(
+    context: TaskRunContext, parameters: dict[str, Any]
+) -> str:
     return f"{task_input_hash(context, parameters)}:{os.getenv("RAGGY_CACHE_VERSION", "0")}"
 
 
@@ -81,16 +85,16 @@ async def run_loader(loader: Loader) -> list[Document]:
 )
 def refresh_tpuf_namespace(
     namespace: str,
-    namespace_loaders: list[Loader],
+    namespace_loaders: Sequence[Loader],
     reset: bool = False,
     batch_size: int = 100,
     max_concurrent: int = 8,
 ):
     """Flow updating vectorstore with info from the Prefect community."""
     documents: list[Document] = [
         doc
-        for future in run_loader.map(quote(namespace_loaders))
-        for doc in future.result()
+        for future in run_loader.map(quote(namespace_loaders))  # type: ignore
+        for doc in future.result()  # type: ignore
     ]
 
     print(f"Loaded {len(documents)} documents from the Prefect community.")
@@ -100,8 +104,10 @@ def refresh_tpuf_namespace(
             task(tpuf.reset)()
             print(f"RESETTING: Deleted all documents from tpuf ns {namespace!r}.")
 
-        task(tpuf.upsert_batched).submit(
-            documents=documents, batch_size=batch_size, max_concurrent=max_concurrent
+        task(tpuf.upsert_batched).submit(  # type: ignore
+            documents=documents,
+            batch_size=batch_size,
+            max_concurrent=max_concurrent,
         ).wait()
 
     print(f"Updated tpuf ns {namespace!r} with {len(documents)} documents.")

diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "chardet",
     "fake-useragent",
     "gh-util",
+    "prefect",
     "pydantic-ai-slim[openai]",
     "pypdf",
     "tenacity",
@@ -103,3 +104,44 @@ skip-magic-trailing-comma = false
 
 [tool.setuptools_scm]
 write_to = "src/raggy/_version.py"
+
+[tool.mypy]
+ignore_missing_imports = true
+check_untyped_defs = true
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_return_any = true
+warn_unreachable = true
+strict_optional = true
+exclude = ["examples/.*"]
+
+[[tool.mypy.overrides]]
+module = [
+    "bs4.*",
+    "fake_useragent.*",
+    "gh_util.*",
+    "turbopuffer.*",
+    "prefect.*",
+    "marvin.*",
+    "praw.*",
+    "yake.*",
+]
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = [
+    "raggy.cli",
+    "raggy.settings",
+    "raggy.utilities.text",
+    "raggy.utilities.asyncutils",
+    "raggy.vectorstores.tpuf",
+    "raggy.vectorstores.chroma",
+    "raggy.loaders.web",
+    "raggy.loaders.pdf",
+]
+disable_error_code = [
+    "unreachable",
+    "no-any-return",
+    "unused-ignore",
+    "no-redef",
+]
diff --git a/src/raggy/__init__.py b/src/raggy/__init__.py
@@ -1,2 +1,4 @@
 from .documents import Document
 from .settings import settings
+
+__all__ = ["Document", "settings"]