Remove reppy, add Crawl-delay, prep for 0.4.0 (#11)
* Remove reppy, improve typing, support Crawl-delay, and more
* Update dependencies to newer versions
micahcochran authored Sep 27, 2023
1 parent 95fee3b commit 674ec47
Showing 7 changed files with 119 additions and 53 deletions.
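
At a glance, the change swaps reppy's `Robots.fetch()` / `agent.allowed()` calls for the standard library's `urllib.robotparser`, which also exposes the Crawl-delay and Request-rate directives that the new throttling code reads. A minimal sketch of that stdlib API, with an illustrative host and the crawler's robots user-agent string:

```python
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser("https://example.com/robots.txt")
rp.read()  # fetch and parse robots.txt

# Permission check (replaces reppy's agent.allowed(url))
allowed = rp.can_fetch("RecipeCrawlerPY", "https://example.com/recipe/42")

# Politeness directives; both return None when the directive is absent
delay = rp.crawl_delay("RecipeCrawlerPY")    # seconds to wait between requests
rate = rp.request_rate("RecipeCrawlerPY")    # RequestRate(requests, seconds) named tuple
```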
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
__pycache__/
donotinclude/
.venv/
*.json
*.log
licenses*.md
6 changes: 5 additions & 1 deletion README.md
@@ -81,10 +81,14 @@ OR Install the python libraries individually:
* [Pendulum](https://pendulum.eustace.io/)
* [PyYAML](https://pyyaml.org/)
* [requests](https://docs.python-requests.org/)
* [reppy](https://github.com/seomoz/reppy)
* [recipe-scrapers](https://github.com/hhursev/recipe-scrapers)
* [scrape-schema-recipe](https://github.com/micahcochran/scrape-schema-recipe)
For development, install the dependent libraries by typing:
```
> pip install -r requirements-dev.txt
```
### Optional Dependency
If you want to download pages using [Brotli compression](https://en.wikipedia.org/wiki/Brotli)
* ensure requests >= 2.26.0 is installed, in order to check your request version (two ways)
141 changes: 92 additions & 49 deletions recipe_crawler.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

#
# Copyright 2021 Micah Cochran
# Copyright 2021-2023 Micah Cochran
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -49,7 +49,7 @@
# * Sitemap support. This might help some of the crawlers that exhaust all their links.
#
# * Some websites almost instantly exit out. I'm not too sure why, but I'm sure other web crawlers have encountered this.
# Some of it may be due to using non-standard forms for URLs. Prehaps adding a sitemap mode might be a way to work around this.
# Some of it may be due to using non-standard forms for URLs. Perhaps adding a sitemap mode might be a way to work around this.
#
# * There could be more logic to detecting if recipes are the same by the URL, but this would require some pattern matching code in the URL.
# Logging the matching recipes would aid in figuring that out.
@@ -75,19 +75,19 @@
# ----- Python native imports -----
import argparse
from datetime import datetime, timedelta
from io import StringIO
from itertools import cycle
import json
from random import randint
import os.path
import sys
from time import sleep
import time
import urllib.parse
import urllib.robotparser

# Use this typing syntax.
# Python 3.9 allows builtins (dict, list) to be used
# In Python 3.7 and 3.8, use `from __future__ import annotations`
from typing import Dict, List, Tuple, Union
from typing import Any, Dict, List, Tuple, Union


# ----- external imports -----
@@ -97,8 +97,8 @@
import pendulum
import recipe_scrapers
from recipe_scrapers.settings import settings as recipe_scrapers_settings
from recipe_scrapers._abstract import AbstractScraper
from recipe_scrapers._exceptions import ElementNotFoundInHtml
import reppy
import requests
import scrape_schema_recipe
import yaml
@@ -107,18 +107,13 @@
# isn't performing correctly, when set to True. False turns this off.
SLOW_DEBUG = True

__version__ = "0.3.0"
__version__ = "0.4.0"
# This is the user-agent
USER_AGENT_ROBOTS = "RecipeCrawlerPY"
USER_AGENT = f"RecipeCrawlerPY/{__version__}"
REQUESTS_HEADERS = {"user-agent": USER_AGENT}


# put it in test mode for RecipeScrapersCrawler
if recipe_scrapers_settings.TEST_MODE is not False:
raise RuntimeError("TEST_MODE should be False.")
recipe_scrapers_settings.TEST_MODE = True


###### class MultiCrawler ##############################################
class MultiCrawler:
@@ -139,7 +134,7 @@ def add_crawler(
self,
url: str,
recipe_url: str = "",
license=None,
license: Union[str, None] = None,
start_url: str = "",
site_title: str = "",
) -> None:
@@ -171,8 +166,19 @@ def run(self) -> None:
# loop until the recipe limit is reached or there are no more crawlers left
while self.num_recipes < self.recipe_limit or len(self.crawlers) < 1:
crawler = next(self.crawler_iter)

# if True, delay crawling this URL due to Crawl-delay in robots.txt
if crawler.delay_crawl():
if SLOW_DEBUG is True:
time.sleep(1)
continue

try:
self.num_recipes += crawler.crawl()
try:
self.num_recipes += crawler.crawl()
except requests.exceptions.ReadTimeout as e:
logger.info(f"Caught ReadTimeout exception: {e}")
continue
except AnchorListsEmptyError as e:
logger.info(f"E is: {e}")
logger.info(f"Terminating crawler for {crawler.base_url}")
@@ -182,8 +188,9 @@ def run(self) -> None:
if SLOW_DEBUG is True:
logger.debug(f"self.num_recipes = {self.num_recipes}")
# Sleeps for 1 second, allow for the user to watch for the crawler messing up.
# This somewhat "throttles" how much bandwidth. Would need more code to actually implement throttling.
sleep(1)
# This somewhat "throttles" how much bandwidth is needed.
# Would need more code to actually implement throttling.
time.sleep(1)

def results_dict(self) -> List[Dict]:
"""resulting singular list of dictionaries that represent schema-recipes"""
@@ -280,12 +287,24 @@ def __init__(

robots_txt_url = urllib.parse.urljoin(url, "robots.txt")

try:
robots = reppy.robots.Robots.fetch(robots_txt_url)
except reppy.exceptions.ConnectionExceptions as e:
raise RecipeCrawlerNetworkError(e)
self.agent = robots.agent("*")
logger.debug(f"Reading robots.txt at: {robots_txt_url}")
self.robots_txt = urllib.robotparser.RobotFileParser(robots_txt_url)
self.robots_txt.read()

self.crawl_delay = False

if self.robots_txt.crawl_delay(USER_AGENT_ROBOTS) is not None:
self.crawl_delay = True
self.crawl_delay_seconds: int = int(
self.robots_txt.crawl_delay(USER_AGENT_ROBOTS)
)
self.last_crawl = time.time()

if self.robots_txt.request_rate(USER_AGENT_ROBOTS) is not None:
raise NotImplementedError(
"Software does not support Request-rate in robots.txt file."
)

logger.debug(f"Read robots.txt at: {robots_txt_url}")

self.urltest = URLTest(url)
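
For context, here is a hypothetical robots.txt illustrating how the two directives read above behave with `urllib.robotparser` (parsed from a string purely for illustration): a `Crawl-delay` value enables the throttle, while a `Request-rate` line would trip the `NotImplementedError` path.

```python
import urllib.robotparser

sample_robots_txt = """\
User-agent: *
Crawl-delay: 10
Disallow: /private/
"""

rp = urllib.robotparser.RobotFileParser()
rp.parse(sample_robots_txt.splitlines())

print(rp.crawl_delay("RecipeCrawlerPY"))   # 10   -> crawl_delay is not None, throttle enabled
print(rp.request_rate("RecipeCrawlerPY"))  # None -> no Request-rate, no NotImplementedError
print(rp.can_fetch("RecipeCrawlerPY", "https://example.com/private/x"))  # False
```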

@@ -316,7 +335,7 @@ def crawl(self) -> int:
logger.debug(f"URL 1: {scrapings['url']}")
logger.debug(f"URL 2: {self.recipe_json[similar_idx]['url']}")
else: # for lists
raise NotImplemented(
raise NotImplementedError(
"Recipes websites currently don't have multiple recipes on one webpage, not sure of the implications."
)
num_recipes += len(scrapings)
@@ -357,6 +376,7 @@ def _download_page(self) -> requests.models.Response:
returns a requests.Response object
"""
# logger.debug("called _download_page()")
# TODO Why is this code being ignored?
url = None
while not url:
# this picks a url
@@ -378,6 +398,10 @@ def _download_page(self) -> requests.models.Response:
self.been_there_urls.append(url)
logger.debug(f"Visiting {url}")

# set last_crawl time to now
if self.crawl_delay:
self.last_crawl = time.time()

self.num_get_calls += 1
resp = requests.get(url, headers=REQUESTS_HEADERS, timeout=5)
if resp.headers.get("Content-Length"):
@@ -471,9 +495,7 @@ def _rank_url(self, url: str) -> Tuple[int, str]:
return (-4, url)

# Check if robots.txt rules allow going to this URL
# if self.robotparse.can_fetch(USER_AGENT_ROBOTS, url) is False:
# if self.robotparse.can_fetch("*", url) is False:
if self.agent.allowed(url) is False:
if self.robots_txt.can_fetch(USER_AGENT_ROBOTS, url) is False:
return (-5, url)

if self._recipe_url != "" and url.startswith(self._recipe_url):
@@ -538,6 +560,14 @@ def license_report(self) -> str:

return md

def delay_crawl(self) -> bool:
"""Should this crawl be delayed due to a time delay?"""

if self.crawl_delay is False:
return False

return (time.time() - self.last_crawl) < self.crawl_delay_seconds
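
Put together, the throttle works like this: a self-contained sketch with an assumed delay value, mirroring the `delay_crawl()` check above and the `last_crawl` stamp set in `_download_page()`.

```python
import time

crawl_delay_seconds = 10      # assumed value read from robots.txt Crawl-delay
last_crawl = time.time()      # updated whenever a page is actually fetched

def delay_crawl() -> bool:
    """True while this site must still wait before its next request."""
    return (time.time() - last_crawl) < crawl_delay_seconds

# In the round-robin loop: skip this crawler for now and try the next one
if delay_crawl():
    time.sleep(1)
```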


def load_website_sources_list(
config_filename: str = "website_sources.yaml",
@@ -573,30 +603,37 @@ def __init__(
elif "healthyeating.nhlbi.nih.gov" in url:
self.ScraperDriver = recipe_scrapers.NIHHealthyEating
else:
raise NotImplemented(f"Website '{url}' has not been implemented.")
raise NotImplementedError(f"Website '{url}' has not been implemented.")

super().__init__(url, recipe_url, license, start_url, site_title)

def _scrape_page(self, response: requests.models.Response) -> Dict:
def _scrape_page(
self, response: requests.models.Response
) -> Union[Dict[str, Any], None]:
"""
scrapes a page
input: response is a requests.models.Response object from requests.get()
return dict or List, if it is empty it will be None
"""
logger.debug("called RecipeScrapersCrawler._scrape_page()")
with StringIO(response.text) as fp:
# NOTE does recipe_scraper_obj need to be kept?
recipe_scraper_obj = self.ScraperDriver(fp)

dict = self._convert_recipe_scraper_to_schema_dict(
recipe_scraper_obj, response.url
)
return dict
# NOTE does recipe_scraper_obj need to be kept?
recipe_scraper_obj = self.ScraperDriver(url=response.url, html=response.text)

recipe_dict = self._convert_recipe_scraper_to_schema_dict(
recipe_scraper_obj, response.url
)
return recipe_dict
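
The scraper is now built straight from the downloaded HTML plus its URL instead of a StringIO wrapper, and the old TEST_MODE workaround is gone. A hedged usage sketch following the same constructor call the crawler uses, with a made-up recipe URL:

```python
import requests
import recipe_scrapers

resp = requests.get(
    "https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=1",  # hypothetical recipe page
    headers={"user-agent": "RecipeCrawlerPY/0.4.0"},
    timeout=5,
)
scraper = recipe_scrapers.NIHHealthyEating(url=resp.url, html=resp.text)
print(scraper.title())
print(scraper.total_time())  # minutes; may raise if the page lacks the element
```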

# TODO: This function will need to be rewritten based on how exceptions work in recipe-scrapers versions after 13.3.0
def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, None]:
def _convert_recipe_scraper_to_schema_dict(
self,
rs: AbstractScraper,
url: str,
) -> Union[Dict[str, Any], None]:
"""Convert recipe-scraper object into a recipe schema dictionary"""

logger.debug("called _convert_recipe_scraper_to_schema_dict()")
d = {}

Expand All @@ -620,7 +657,7 @@ def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, No
d["totalTime"] = isodate.duration_isoformat(
timedelta(minutes=rs.total_time())
)
except (NotImplemented, ElementNotFoundInHtml):
except (NotImplementedError, ElementNotFoundInHtml):
pass

try:
Expand Down Expand Up @@ -651,7 +688,7 @@ def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, No
class URLTest:
"""Class of tests for URLs"""

def __init__(self, baseurl: str = None):
def __init__(self, baseurl: Union[str, None] = None):
"""baseurl is the start URL of the website"""
self.baseurl = baseurl
self.basesplit = None
@@ -735,6 +772,7 @@ def __str__(self) -> str:
def main(sys_args: List = sys.argv[1:]) -> None:
website_sources_list: List = []

# handle command line arguments
parser = argparse.ArgumentParser(
        description="Recipe Crawler that saves a cookbook to a JSON file."
)
@@ -809,15 +847,20 @@ def arg_in_list(a):
for source in website_sources_list:
# logger.debug('KEYS: {}'.format(source['site'].keys()))

logger.debug(f"Adding crawler for: {source['site']['url']}")
mc.add_crawler(
source["site"]["url"],
source["site"].get("recipe_url"),
source["site"].get("license"),
# this is a URL the one that most-likely to get to recipes quickly, such as an index or landing pages for recipes.
source["site"].get("start_url"),
source["site"].get("title"),
)
try:
logger.debug(f"Adding crawler for: {source['site']['url']}")
mc.add_crawler(
source["site"]["url"],
source["site"].get("recipe_url"),
source["site"].get("license"),
# URL the one that most-likely to get to recipes quickly, such as an index or landing pages for recipes.
source["site"].get("start_url"),
source["site"].get("title"),
)
except NotImplementedError as e:
logger.debug(
f"Skipping crawler : '{source['site']['url']}' due to a NotImplementedError '{e}'"
)

mc.run()

@@ -827,7 +870,7 @@ def arg_in_list(a):
with open(args.output, "w") as fp:
json.dump(recipes_dict, fp)

license_filename = f"license-{args.output[:-5]}.md"
license_filename = f"licenses-{args.output[:-5]}.md"
with open(license_filename, "w") as fp:
fp.write(mc.generate_license_report())

@@ -842,7 +885,7 @@ def arg_in_list(a):
logger.info(" * Metric is not accurate.")
runtime = pendulum.now().diff(start_time)
# README.md row printer, INCOMPLETE
# logger.info(f"row: | {__version__} | {args.limit} | | {mc.results_num_get_calls()} | | {mc.results_num_get_calls() / args.limit} | |")
# logger.info(f"row: | {__version__} | {args.limit} | | {mc.results_num_get_calls()} | | {mc.results_num_get_calls() / args.limit} | |")
logger.info(f"Program's Runtime: {runtime.in_words()}")


4 changes: 4 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,4 @@
# Development related requirements including types
types-beautifulsoup4
types-PyYAML
types-requests
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
beautifulsoup4
loguru
recipe-scrapers>=13.3.3
recipe-scrapers>=14.48.0
requests
scrape-schema-recipe>=0.1.3
scrape-schema-recipe>=0.2.2
pendulum
PyYAML
9 changes: 8 additions & 1 deletion website_source-open_source.yaml
@@ -41,4 +41,11 @@ site:
url: https://healthyeating.nhlbi.nih.gov/
title: NIH Healthy Eating
recipe_url: https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?
license: https://www.nhlbi.nih.gov/about/contact/trademark-branding-and-logo
license: https://www.nhlbi.nih.gov/about/contact/trademark-branding-and-logo
---
site:
url: https://www.foodista.com/
title: Foodista
start_url: https://www.foodista.com/blog/recipes-cooking
recipe_url: https://www.foodista.com/recipe/
license: http://creativecommons.org/licenses/by/3.0/
7 changes: 7 additions & 0 deletions website_sources.yaml
@@ -71,3 +71,10 @@ site:
start_url: https://medlineplus.gov/recipes/
recipe_url: https://medlineplus.gov/recipes/
license: https://www.nlm.nih.gov/web_policies.html#copyright
---
site:
url: https://www.foodista.com/
title: Foodista
start_url: https://www.foodista.com/blog/recipes-cooking
recipe_url: https://www.foodista.com/recipe/
license: http://creativecommons.org/licenses/by/3.0/
