diff --git a/Pipfile b/Pipfile index 0b7a1b90..8b8bed95 100644 --- a/Pipfile +++ b/Pipfile @@ -35,6 +35,7 @@ gunicorn = "23.0.0" flask-api = {editable = true, ref = "develop", git = "git+https://github.com/flask-api/flask-api.git"} setuptools = "==75.6.0" certifi = "==2024.12.14" +2captcha-python = "*" [dev-packages] exceptiongroup = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 47db1f6f..7f52e4d7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "a94f84e411b121957f50530b8701f58ee38bc7de1c01daba577ccb64a6642bdd" + "sha256": "80889a97685207cfec6b000a448fbf8a5360ef0920f72772f9c65050ad71e2f3" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,15 @@ ] }, "default": { + "2captcha-python": { + "hashes": [ + "sha256:139cafc7ef60c0f3cea5b95c6c4211e50aedd602ea489cc21b32b49425a62f73", + "sha256:afafed5d4045f9d343a65be4b424c510622025891a07e87a0ee2432a17078296" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==1.5.0" + }, "apprise": { "hashes": [ "sha256:7192c953eeb282a7afee012512d3de0104b5a6a11bdda29283435df5a79dfe7f", @@ -35,11 +44,11 @@ }, "attrs": { "hashes": [ - "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346", - "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2" + "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff", + "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308" ], - "markers": "python_version >= '3.7'", - "version": "==24.2.0" + "markers": "python_version >= '3.8'", + "version": "==24.3.0" }, "backoff": { "hashes": [ diff --git a/flathunter/abstract_crawler.py b/flathunter/abstract_crawler.py index 2e8bc3c8..08b34e94 100644 --- a/flathunter/abstract_crawler.py +++ b/flathunter/abstract_crawler.py @@ -5,6 +5,9 @@ from typing import Optional, Any import json +from io import BytesIO +import base64 + import backoff import requests # pylint: disable=unused-import @@ -13,10 +16,11 @@ from bs4 import 
# JavaScript returning the shadow root that hosts the AWS WAF captcha widget.
_AWSWAF_SHADOW_ROOT_SCRIPT = "return document.querySelector('awswaf-captcha').shadowRoot"

# Subtracted from every solver coordinate before it is used as a click offset.
# NOTE(review): presumably half of the widget's rendered width/height, turning
# top-left based screenshot coordinates into the centre-based offsets expected
# by ActionChains.move_to_element_with_offset - confirm against the live widget.
_AWSWAF_CLICK_OFFSET_X = 160
_AWSWAF_CLICK_OFFSET_Y = 211

# Distance from the widget's right/bottom edge of the extra click that is
# appended after the solver's points.
# NOTE(review): presumably the position of the confirm button - confirm.
_AWSWAF_BUTTON_INSET = 30


def _parse_amazon_click_points(token):
    """Turn a coordinates answer into a list of ``[x, y]`` integer pairs.

    The text after the first ``:`` is split on ``;`` into points, each point
    on ``,`` into ``key=value`` items, and every value is converted to ``int``
    (e.g. ``coordinates:x=39,y=59;x=252,y=72`` -> ``[[39, 59], [252, 72]]``).

    Raises:
        ValueError: if the token has no payload after ``:``.
        IndexError: if an item has no ``=`` separator.
    """
    segments = token.split(':')
    if len(segments) < 2 or not segments[1]:
        raise ValueError(f"Unexpected coordinates answer: {token!r}")
    return [
        [int(item.split('=')[1]) for item in point.split(',')]
        for point in segments[1].split(';')
    ]


@backoff.on_exception(wait_gen=backoff.constant,
                      exception=CaptchaUnsolvableError,
                      max_tries=3)
def resolve_amazon(self, driver):
    """Resolve the AWS WAF ("Amazon") captcha shown in ``driver``.

    A screenshot of the captcha widget is sent to
    ``self.captcha_solver.solve_amazon``; the returned coordinates are clicked
    one after another, followed by the confirm button. The captcha counts as
    solved once no ``awswaf-captcha`` element is left in the page.

    Args:
        driver: Selenium web driver currently displaying the captcha page.

    Raises:
        CaptchaUnsolvableError: if the captcha is still present afterwards or
            any step fails. The page is refreshed before raising, and the
            ``backoff`` decorator retries the whole method up to 3 times.
    """
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(3)

        shadow_root = driver.execute_script(_AWSWAF_SHADOW_ROOT_SCRIPT)
        widget = shadow_root.find_element(By.ID, "root")
        size = widget.size

        # Open the widget's <select> and move one entry down.
        # NOTE(review): presumably switches to a puzzle variant the solver
        # can handle - confirm.
        selector = widget.find_element(By.TAG_NAME, "select")
        selector.click()
        sleep(1)
        selector.send_keys(Keys.DOWN)
        sleep(3)

        # Look the widget up again before taking the screenshot.
        # NOTE(review): presumably it is re-rendered after the selection.
        shadow_root = driver.execute_script(_AWSWAF_SHADOW_ROOT_SCRIPT)
        widget = shadow_root.find_element(By.ID, "root")
        # The PNG is already a bytes object, so it can be encoded directly.
        base64_screenshot = base64.b64encode(widget.screenshot_as_png).decode('utf-8')

        # Send the image to the captcha solving service.
        result = self.captcha_solver.solve_amazon(base64_screenshot)
        logger.info(result.token)

        click_points = _parse_amazon_click_points(result.token)
        click_points.append([size['width'] - _AWSWAF_BUTTON_INSET,
                             size['height'] - _AWSWAF_BUTTON_INSET])

        actions = ActionChains(driver)
        for point_x, point_y in click_points:
            actions.move_to_element_with_offset(
                widget,
                point_x - _AWSWAF_CLICK_OFFSET_X,
                point_y - _AWSWAF_CLICK_OFFSET_Y,
            ).click()
            actions.perform()
            sleep(0.5)
            # Clear the queued actions so the next perform() does not replay them.
            actions.reset_actions()
        sleep(1)

        try:
            confirm_button = widget.find_element(By.ID, "amzn-btn-verify-internal")
            actions.move_to_element_with_offset(confirm_button, 40, 15).click()
            actions.perform()
            sleep(4)
        except NoSuchElementException:
            # No confirm button to press; fall through to the final check.
            pass

        try:
            driver.find_element(By.TAG_NAME, "awswaf-captcha")
        except NoSuchElementException:
            logger.info("Captcha solved")
        else:
            raise CaptchaUnsolvableError()
    except CaptchaUnsolvableError:
        # Already the right type: keep its message instead of wrapping it.
        driver.refresh()
        raise
    except Exception as ex:
        driver.refresh()
        raise CaptchaUnsolvableError() from ex
def solve_amazon(self, image_b64: str) -> AwsAwfResponse:
    """Solve an AWS WAF captcha from an image of the challenge.

    The image is submitted through the ``coordinates`` method of a
    ``TwoCaptcha`` client created with ``self.api_key``.

    Args:
        image_b64: the captcha image, passed to the client unchanged.

    Returns:
        An ``AwsAwfResponse`` built from the ``"code"`` entry of the answer.

    Raises:
        CaptchaUnsolvableError: if the client returns ``None``.
    """
    client = TwoCaptcha(self.api_key, defaultTimeout=60, pollingInterval=5)
    answer = client.coordinates(image_b64, lang='en')
    if answer is not None:
        return AwsAwfResponse(answer["code"])
    raise CaptchaUnsolvableError("Got None from 2captcha solve")
in self.get_driver_force().page_source: + self.resolve_amazon(self.get_driver_force()) try: result_json = self.get_driver_force().execute_script('return window.IS24.resultList;') except JavascriptException: diff --git a/flathunter/gmaps_duration_processor.py b/flathunter/gmaps_duration_processor.py index 843f2975..6ab8f81c 100644 --- a/flathunter/gmaps_duration_processor.py +++ b/flathunter/gmaps_duration_processor.py @@ -24,6 +24,8 @@ def process_expose(self, expose): def get_formatted_durations(self, address): """Return a formatted list of GoogleMaps durations""" + if address is None: + return "" out = "" for duration in self.config.get('durations', []): if 'destination' in duration and 'name' in duration: