From 6d14b8df5acfc7a0be6c74b7f507a6d301e493f3 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:44:11 -0700 Subject: [PATCH] - fix limit parameter - fix specific for_rent apartment listing prices --- README.md | 2 + homeharvest/core/scrapers/models.py | 3 + homeharvest/core/scrapers/realtor/__init__.py | 20 ++++-- homeharvest/utils.py | 2 + pyproject.toml | 2 +- tests/test_realtor.py | 61 +++++++++++++++++-- 6 files changed, 81 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 26a8773..3f6a65f 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,8 @@ Property ├── Property Listing Details: │ ├── days_on_mls │ ├── list_price +│ ├── list_price_min +│ ├── list_price_max │ ├── list_date │ ├── pending_date │ ├── sold_price diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 2060a0d..15b6da7 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -113,6 +113,9 @@ class Property: address: Address | None = None list_price: int | None = None + list_price_min: int | None = None + list_price_max: int | None = None + list_date: str | None = None pending_date: str | None = None last_sold_date: str | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index f9d202e..7045512 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -20,6 +20,7 @@ class RealtorScraper(Scraper): PROPERTY_GQL = "https://graph.realtor.com/graphql" ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest" NUM_PROPERTY_WORKERS = 20 + DEFAULT_PAGE_SIZE = 200 def __init__(self, scraper_input): super().__init__(scraper_input) @@ -76,7 +77,6 @@ def handle_listing(self, listing_id: str) -> list[Property]: baths_half lot_sqft sold_price - sold_price type price status @@ -326,6 +326,8 @@ def 
general_search(self, variables: dict, search_type: str) -> Dict[str, Union[i last_sold_price last_sold_date list_price + list_price_max + list_price_min price_per_sqft flags { is_contingent @@ -551,6 +553,8 @@ def process_property(result: dict) -> Property | None: ), status="PENDING" if is_pending else result["status"].upper(), list_price=result["list_price"], + list_price_min=result["list_price_min"], + list_price_max=result["list_price_max"], list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, prc_sqft=result.get("price_per_sqft"), last_sold_date=result.get("last_sold_date"), @@ -571,9 +575,17 @@ def process_property(result: dict) -> Property | None: ) return realty_property + properties_list = response_json["data"][search_key]["results"] + total_properties = response_json["data"][search_key]["total"] + offset = variables.get("offset", 0) + + #: limit the number of properties to be processed + #: example, if your offset is 200, and your limit is 250, return 50 + properties_list = properties_list[:self.limit - offset] + with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: futures = [ - executor.submit(process_property, result) for result in response_json["data"][search_key]["results"] + executor.submit(process_property, result) for result in properties_list ] for future in as_completed(futures): @@ -582,7 +594,7 @@ def process_property(result: dict) -> Property | None: properties.append(result) return { - "total": response_json["data"][search_key]["total"], + "total": total_properties, "properties": properties, } @@ -654,7 +666,7 @@ def search(self): variables=search_variables | {"offset": i}, search_type=search_type, ) - for i in range(200, min(total, self.limit), 200) + for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE) ] for future in as_completed(futures): diff --git a/homeharvest/utils.py b/homeharvest/utils.py index dd21349..a84d399 100644 --- a/homeharvest/utils.py +++ 
b/homeharvest/utils.py @@ -24,6 +24,8 @@ "year_built", "days_on_mls", "list_price", + "list_price_min", + "list_price_max", "list_date", "sold_price", "last_sold_date", diff --git a/pyproject.toml b/pyproject.toml index bc1a00d..a5589d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.3.33" +version = "0.3.34" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest" diff --git a/tests/test_realtor.py b/tests/test_realtor.py index ca6207a..7387a77 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -105,8 +105,8 @@ def test_realtor(): location="2530 Al Lipscomb Way", listing_type="for_sale", ), - scrape_property(location="Phoenix, AZ", listing_type="for_rent"), #: does not support "city, state, USA" format - scrape_property(location="Dallas, TX", listing_type="sold"), #: does not support "city, state, USA" format + scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format + scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format scrape_property(location="85281"), ] @@ -117,6 +117,7 @@ def test_realtor_city(): results = scrape_property( location="Atlanta, GA", listing_type="for_sale", + limit=1000 ) assert results is not None and len(results) > 0 @@ -140,7 +141,7 @@ def test_realtor_foreclosed(): def test_realtor_agent(): - scraped = scrape_property(location="Detroit, MI", listing_type="for_sale") + scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000) assert scraped["agent"].nunique() > 1 @@ -182,6 +183,58 @@ def test_style_value_error(): location="Alaska, AK", listing_type="sold", extra_property_data=False, + limit=1000, ) - assert results is not None and len(results) > 0 \ No newline at end of file + assert results is not None and 
len(results) > 0 + + +def test_primary_image_error(): + results = scrape_property( + location="Spokane, PA", + listing_type="for_rent", # or (for_sale, for_rent, pending) + past_days=360, + radius=3, + extra_property_data=False, + ) + + assert results is not None and len(results) > 0 + + +def test_limit(): + over_limit = 876 + extra_params = {"limit": over_limit} + + over_results = scrape_property( + location="Waddell, AZ", + listing_type="for_sale", + **extra_params, + ) + + assert over_results is not None and len(over_results) <= over_limit + + under_limit = 1 + under_results = scrape_property( + location="Waddell, AZ", + listing_type="for_sale", + limit=under_limit, + ) + + assert under_results is not None and len(under_results) == under_limit + + +def test_apartment_list_price(): + results = scrape_property( + location="Spokane, WA", + listing_type="for_rent", # or (for_sale, for_rent, pending) + extra_property_data=False, + ) + + assert results is not None + + results = results[results["style"] == "APARTMENT"] + + #: get the fraction of results with at least one non-null value among list_price, list_price_min, list_price_max + assert len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len( + results + ) > 0.5