Skip to content

Commit

Permalink
- fix limit parameter
Browse files Browse the repository at this point in the history
- fix specific for_rent apartment listing prices
  • Loading branch information
ZacharyHampton committed Aug 13, 2024
1 parent 3f44744 commit 6d14b8d
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 9 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ Property
├── Property Listing Details:
│ ├── days_on_mls
│ ├── list_price
│ ├── list_price_min
│ ├── list_price_max
│ ├── list_date
│ ├── pending_date
│ ├── sold_price
Expand Down
3 changes: 3 additions & 0 deletions homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ class Property:
address: Address | None = None

list_price: int | None = None
list_price_min: int | None = None
list_price_max: int | None = None

list_date: str | None = None
pending_date: str | None = None
last_sold_date: str | None = None
Expand Down
20 changes: 16 additions & 4 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class RealtorScraper(Scraper):
PROPERTY_GQL = "https://graph.realtor.com/graphql"
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200

def __init__(self, scraper_input):
super().__init__(scraper_input)
Expand Down Expand Up @@ -76,7 +77,6 @@ def handle_listing(self, listing_id: str) -> list[Property]:
baths_half
lot_sqft
sold_price
sold_price
type
price
status
Expand Down Expand Up @@ -326,6 +326,8 @@ def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[i
last_sold_price
last_sold_date
list_price
list_price_max
list_price_min
price_per_sqft
flags {
is_contingent
Expand Down Expand Up @@ -551,6 +553,8 @@ def process_property(result: dict) -> Property | None:
),
status="PENDING" if is_pending else result["status"].upper(),
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"),
Expand All @@ -571,9 +575,17 @@ def process_property(result: dict) -> Property | None:
)
return realty_property

properties_list = response_json["data"][search_key]["results"]
total_properties = response_json["data"][search_key]["total"]
offset = variables.get("offset", 0)

#: cap the number of results processed from this page so the overall limit is respected
#: e.g. with an offset of 200 and a limit of 250, only the first 50 results of this page are kept
properties_list = properties_list[:self.limit - offset]

with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [
executor.submit(process_property, result) for result in response_json["data"][search_key]["results"]
executor.submit(process_property, result) for result in properties_list
]

for future in as_completed(futures):
Expand All @@ -582,7 +594,7 @@ def process_property(result: dict) -> Property | None:
properties.append(result)

return {
"total": response_json["data"][search_key]["total"],
"total": total_properties,
"properties": properties,
}

Expand Down Expand Up @@ -654,7 +666,7 @@ def search(self):
variables=search_variables | {"offset": i},
search_type=search_type,
)
for i in range(200, min(total, self.limit), 200)
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
]

for future in as_completed(futures):
Expand Down
2 changes: 2 additions & 0 deletions homeharvest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
"year_built",
"days_on_mls",
"list_price",
"list_price_min",
"list_price_max",
"list_date",
"sold_price",
"last_sold_date",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.33"
version = "0.3.34"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"
Expand Down
61 changes: 57 additions & 4 deletions tests/test_realtor.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ def test_realtor():
location="2530 Al Lipscomb Way",
listing_type="for_sale",
),
scrape_property(location="Phoenix, AZ", listing_type="for_rent"), #: does not support "city, state, USA" format
scrape_property(location="Dallas, TX", listing_type="sold"), #: does not support "city, state, USA" format
scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format
scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format
scrape_property(location="85281"),
]

Expand All @@ -117,6 +117,7 @@ def test_realtor_city():
results = scrape_property(
location="Atlanta, GA",
listing_type="for_sale",
limit=1000
)

assert results is not None and len(results) > 0
Expand All @@ -140,7 +141,7 @@ def test_realtor_foreclosed():


def test_realtor_agent():
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale")
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000)
assert scraped["agent"].nunique() > 1


Expand Down Expand Up @@ -182,6 +183,58 @@ def test_style_value_error():
location="Alaska, AK",
listing_type="sold",
extra_property_data=False,
limit=1000,
)

assert results is not None and len(results) > 0
assert results is not None and len(results) > 0


def test_primary_image_error():
    """Run a for-rent search with a radius and a past-days window and expect results.

    The call passes ``extra_property_data=False`` and only checks that the
    search returns a non-empty result.
    """
    #: NOTE(review): "Spokane, PA" is kept exactly as written; confirm the state
    #: is intentional (the other Spokane test in this file uses "WA").
    search_kwargs = dict(
        location="Spokane, PA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        past_days=360,
        radius=3,
        extra_property_data=False,
    )
    listings = scrape_property(**search_kwargs)

    assert listings is not None
    assert len(listings) > 0


def test_limit():
    """Check that the ``limit`` argument bounds how many results come back.

    Two searches over the same location are made: one with a large limit,
    where the result count must not exceed it, and one with a limit of 1,
    where exactly one result is expected.
    """
    #: large limit: the result count may be smaller, but never larger
    high_cap = 876
    high_cap_results = scrape_property(
        location="Waddell, AZ",
        listing_type="for_sale",
        limit=high_cap,
    )
    assert high_cap_results is not None
    assert len(high_cap_results) <= high_cap

    #: minimal limit: exactly one result must be returned
    low_cap = 1
    low_cap_results = scrape_property(
        location="Waddell, AZ",
        listing_type="for_sale",
        limit=low_cap,
    )
    assert low_cap_results is not None
    assert len(low_cap_results) == low_cap


def test_apartment_list_price():
    """Check that most for-rent apartment results carry some list price.

    The scraped results are filtered to rows whose ``style`` equals
    ``"APARTMENT"``. More than half of those rows must have a non-null
    value in at least one of ``list_price``, ``list_price_min`` or
    ``list_price_max``.
    """
    results = scrape_property(
        location="Spokane, WA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        extra_property_data=False,
    )

    assert results is not None

    apartments = results[results["style"] == "APARTMENT"]

    #: without this guard an empty filter result makes the ratio below
    #: divide by zero and the test aborts with an unhelpful ZeroDivisionError
    assert len(apartments) > 0, "no APARTMENT listings returned for Spokane, WA"

    #: share of rows with at least one of the three price columns populated
    price_columns = ["list_price", "list_price_min", "list_price_max"]
    has_any_price = apartments[price_columns].notnull().any(axis=1)
    assert has_any_price.sum() / len(apartments) > 0.5

0 comments on commit 6d14b8d

Please sign in to comment.