From 0176bc0523715f0c4ce648ebb5757a6a71dd637f Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 16:48:35 +0200 Subject: [PATCH 01/10] PlaywrightFetcher - Make proxies available without enabling stealth Thanks for @AbdullahY36 --- scrapling/engines/pw.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 80ff55b..9330094 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -176,6 +176,7 @@ def fetch(self, url: str) -> Response: ) else: context = browser.new_context( + proxy=self.proxy, color_scheme='dark', user_agent=useragent, device_scale_factor=2, From 60d551d83972b2a9bf4617f8b5d6548be7b5f522 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 17:39:03 +0200 Subject: [PATCH 02/10] PlaywrightFetcher - Add the option to set locale of the browser --- README.md | 1 + scrapling/engines/pw.py | 6 +++++- scrapling/fetchers.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3669b3a..de324cb 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ | | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ | | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ | +| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ | | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ | | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ | | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ | diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 9330094..d4dc5ea 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -26,6 +26,7 @@ def __init__( timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, + locale: Optional[str] = 'en-US', wait_selector_state: Optional[str] = 'attached', stealth: Optional[bool] = False, real_chrome: Optional[bool] = False, @@ -50,6 +51,7 @@ def __init__( :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000 :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. :param wait_selector: Wait for a specific css selector to be in a specific state. + :param locale: Set the locale for the browser if wanted. The default value is `en-US`. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`. :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently. :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. @@ -64,6 +66,7 @@ def __init__( :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class. 
""" self.headless = headless + self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale') self.disable_resources = disable_resources self.network_idle = bool(network_idle) self.stealth = bool(stealth) @@ -159,7 +162,7 @@ def fetch(self, url: str) -> Response: # Creating the context if self.stealth: context = browser.new_context( - locale='en-US', + locale=self.locale, is_mobile=False, has_touch=False, proxy=self.proxy, @@ -176,6 +179,7 @@ def fetch(self, url: str) -> Response: ) else: context = browser.new_context( + locale=self.locale, proxy=self.proxy, color_scheme='dark', user_agent=useragent, diff --git a/scrapling/fetchers.py b/scrapling/fetchers.py index bc47c8e..1e81ffd 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -148,7 +148,7 @@ def fetch( useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000, page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached', hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True, - proxy: Optional[Union[str, Dict[str, str]]] = None, + proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US', stealth: Optional[bool] = False, real_chrome: Optional[bool] = False, cdp_url: Optional[str] = None, nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None, @@ -163,6 +163,7 @@ def fetch( :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000 + :param locale: Set the locale for the browser if wanted. The default value is `en-US`. :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. :param wait_selector: Wait for a specific css selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`. @@ -180,6 +181,7 @@ def fetch( """ engine = PlaywrightEngine( proxy=proxy, + locale=locale, timeout=timeout, stealth=stealth, cdp_url=cdp_url, From d6a59774eab4c6e29b99c0c653e0dc5d7993a5bf Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 20:23:02 +0200 Subject: [PATCH 03/10] StealthyFetcher - The option to disable ads with `ublock origin` addon --- README.md | 1 + scrapling/engines/camo.py | 8 +++++++- scrapling/fetchers.py | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index de324cb..bf792c1 100644 --- a/README.md +++ b/README.md @@ -263,6 +263,7 @@ True | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ | | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ | | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ | +| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. 
| ✔️ | | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ | | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ | | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ | diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py index 0807a97..5853a18 100644 --- a/scrapling/engines/camo.py +++ b/scrapling/engines/camo.py @@ -12,6 +12,7 @@ generate_convincing_referer, ) +from camoufox import DefaultAddons from camoufox.sync_api import Camoufox @@ -21,7 +22,8 @@ def __init__( block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True, timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None, wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, - proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None + proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, + adaptor_arguments: Dict = None, ): """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation. @@ -36,6 +38,7 @@ def __init__( :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. + :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000 :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. @@ -54,6 +57,7 @@ def __init__( self.network_idle = bool(network_idle) self.google_search = bool(google_search) self.os_randomize = bool(os_randomize) + self.disable_ads = bool(disable_ads) self.extra_headers = extra_headers or {} self.proxy = construct_proxy_dict(proxy) self.addons = addons or [] @@ -75,9 +79,11 @@ def fetch(self, url: str) -> Response: :param url: Target url. 
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ + addons = [] if self.disable_ads else [DefaultAddons.UBO] with Camoufox( proxy=self.proxy, addons=self.addons, + exclude_addons=addons, headless=self.headless, humanize=self.humanize, i_know_what_im_doing=True, # To turn warnings off with the user configurations diff --git a/scrapling/fetchers.py b/scrapling/fetchers.py index 1e81ffd..e4e974f 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -78,7 +78,7 @@ def fetch( block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None, timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True, wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None, - os_randomize: Optional[bool] = None + os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, ) -> Response: """ Opens up a browser and do your request based on your chosen options below. @@ -92,6 +92,7 @@ def fetch( This can help save your proxy usage but be careful with this option as it makes some websites never finish loading. :param block_webrtc: Blocks WebRTC entirely. :param addons: List of Firefox addons to use. Must be paths to extracted addons. + :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. 
@@ -111,6 +112,7 @@ def fetch( timeout=timeout, headless=headless, humanize=humanize, + disable_ads=disable_ads, allow_webgl=allow_webgl, page_action=page_action, network_idle=network_idle, From 1cf7cee1a20d2769b5d8c9d99a54c52515b6b12f Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 20:26:54 +0200 Subject: [PATCH 04/10] Fetchers - Wait for page to be stable again if the user wanted to wait for a selector --- scrapling/engines/camo.py | 5 +++++ scrapling/engines/pw.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py index 5853a18..90d3378 100644 --- a/scrapling/engines/camo.py +++ b/scrapling/engines/camo.py @@ -111,6 +111,11 @@ def fetch(self, url: str) -> Response: if self.wait_selector and type(self.wait_selector) is str: waiter = page.locator(self.wait_selector) waiter.first.wait_for(state=self.wait_selector_state) + # Wait again after waiting for the selector, helpful with protections like Cloudflare + page.wait_for_load_state(state="load") + page.wait_for_load_state(state="domcontentloaded") + if self.network_idle: + page.wait_for_load_state('networkidle') # This will be parsed inside `Response` encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index d4dc5ea..3f6a187 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -226,6 +226,11 @@ def fetch(self, url: str) -> Response: if self.wait_selector and type(self.wait_selector) is str: waiter = page.locator(self.wait_selector) waiter.first.wait_for(state=self.wait_selector_state) + # Wait again after waiting for the selector, helpful with protections like Cloudflare + page.wait_for_load_state(state="load") + page.wait_for_load_state(state="domcontentloaded") + if self.network_idle: + page.wait_for_load_state('networkidle') # This will be parsed inside `Response` encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding From dbd674aa7c276d55584e50c3f8c60bdca1437afc Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 21:24:41 +0200 Subject: [PATCH 05/10] Adding proxy support for basic requests --- README.md | 4 +++- scrapling/engines/static.py | 28 ++++++++++++++++++++-------- scrapling/fetchers.py | 20 ++++++++++++-------- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index bf792c1..7b94658 100644 --- a/README.md +++ b/README.md @@ -233,9 +233,11 @@ Also, the `Response` object returned from all fetchers is the same as `Adaptor` This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests. For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. 
+ +You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030` ```python >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True) ->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}) +>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030') >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'}) >> page = Fetcher().delete('https://httpbin.org/delete') ``` diff --git a/scrapling/engines/static.py b/scrapling/engines/static.py index 16e5350..d6b5a6c 100644 --- a/scrapling/engines/static.py +++ b/scrapling/engines/static.py @@ -63,54 +63,66 @@ def _prepare_response(self, response: httpxResponse) -> Response: **self.adaptor_arguments ) - def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: """Make basic HTTP GET request for you but with some added flavors. :param url: Target url. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request had came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details. :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers) - request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + with httpx.Client(proxy=proxy) as client: + request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + return self._prepare_response(request) - def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: """Make basic HTTP POST request for you but with some added flavors. :param url: Target url. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request had came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details. 
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers) - request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + with httpx.Client(proxy=proxy) as client: + request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + return self._prepare_response(request) - def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: """Make basic HTTP DELETE request for you but with some added flavors. :param url: Target url. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request had came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details. :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers) - request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + with httpx.Client(proxy=proxy) as client: + request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + return self._prepare_response(request) - def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: """Make basic HTTP PUT request for you but with some added flavors. :param url: Target url. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request had came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details. 
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers) - request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + with httpx.Client(proxy=proxy) as client: + request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs) + return self._prepare_response(request) diff --git a/scrapling/fetchers.py b/scrapling/fetchers.py index e4e974f..552f924 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -9,7 +9,7 @@ class Fetcher(BaseFetcher): Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly. """ - def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response: """Make basic HTTP GET request for you but with some added flavors. :param url: Target url. @@ -17,13 +17,14 @@ def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[i :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request had came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details. :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ - response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs) + response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs) return response_object - def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response: """Make basic HTTP POST request for you but with some added flavors. :param url: Target url. @@ -31,13 +32,14 @@ def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[ :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request came from Google's search of this URL's domain. 
+ :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details. :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ - response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs) + response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs) return response_object - def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response: """Make basic HTTP PUT request for you but with some added flavors. :param url: Target url @@ -45,14 +47,15 @@ def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[i :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details. :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ - response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs) + response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs) return response_object - def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response: + def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response: """Make basic HTTP DELETE request for you but with some added flavors. :param url: Target url @@ -60,10 +63,11 @@ def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Unio :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds. :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and create a referer header as if this request came from Google's search of this URL's domain. + :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030` :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details. 
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ - response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs) + response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs) return response_object From ccf38606d82a811a7991186bec19b911036c59ac Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 21:25:16 +0200 Subject: [PATCH 06/10] Making the README clearer a bit --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7b94658..08e43b1 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements) * [Extraction By Text Speed Test](#extraction-by-text-speed-test) * [Installation](#installation) - * [Fetching Websites Features](#fetching-websites-features) - * [Fetcher](#fetcher) - * [StealthyFetcher](#stealthyfetcher) - * [PlayWrightFetcher](#playwrightfetcher) + * [Fetching Websites](#fetching-websites) + * [Features](#features) + * [Fetcher class](#fetcher) + * [StealthyFetcher class](#stealthyfetcher) + * [PlayWrightFetcher class](#playwrightfetcher) * [Advanced Parsing Features](#advanced-parsing-features) * [Smart Navigation](#smart-navigation) * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements) @@ -210,7 +211,10 @@ playwright install chromium python -m browserforge update ``` -## Fetching Websites Features +## Fetching Websites +Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you want then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page. + +### Features You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way ```python from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher From d3fbcab3df28c0a2478a646996e3d5104f355104 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 21:27:22 +0200 Subject: [PATCH 07/10] Pumping version up to 0.2.7 --- scrapling/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapling/__init__.py b/scrapling/__init__.py index 33342f4..26d815d 100644 --- a/scrapling/__init__.py +++ b/scrapling/__init__.py @@ -4,7 +4,7 @@ from scrapling.core.custom_types import TextHandler, AttributesHandler __author__ = "Karim Shoair (karim.shoair@pm.me)" -__version__ = "0.2.6" +__version__ = "0.2.7" __copyright__ = "Copyright (c) 2024 Karim Shoair" diff --git a/setup.cfg b/setup.cfg index be5aa12..1aa408a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = scrapling -version = 0.2.6 +version = 0.2.7 author = Karim Shoair author_email = karim.shoair@pm.me description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python. 
diff --git a/setup.py b/setup.py index 281f567..91aa89c 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="scrapling", - version="0.2.6", + version="0.2.7", description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It simplifies the process of extracting data from websites, even when they undergo structural changes, and offers impressive speed improvements over many popular scraping tools.""", From 14ffdbd6fc49b650bbeb7a88207f85f09daa554e Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 22:27:05 +0200 Subject: [PATCH 08/10] Handling problematic encodings --- scrapling/engines/toolbelt/custom.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py index f0053d7..0cdc65e 100644 --- a/scrapling/engines/toolbelt/custom.py +++ b/scrapling/engines/toolbelt/custom.py @@ -39,7 +39,7 @@ def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]: @classmethod @cache(maxsize=None) - def get_value(cls, content_type: Optional[str]) -> str: + def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str: """Determine the appropriate character encoding from a content-type header. The encoding is determined by these rules in order: @@ -50,26 +50,30 @@ def get_value(cls, content_type: Optional[str]) -> str: 5. Default to UTF-8 if nothing else matches :param content_type: Content-Type header value or None + :param text: A text to test the encoding on it :return: String naming the character encoding """ if not content_type: return cls.__DEFAULT_ENCODING try: + encoding = None content_type, params = cls.__parse_content_type(content_type) # First check for explicit charset parameter if "charset" in params: encoding = params["charset"].strip("'\"") - "test".encode(encoding) # Validate encoding - return encoding # Apply content-type specific rules if content_type in cls.__ISO_8859_1_CONTENT_TYPES: - return "ISO-8859-1" + encoding = "ISO-8859-1" if content_type == "application/json": - return cls.__DEFAULT_ENCODING + encoding = cls.__DEFAULT_ENCODING + + if encoding: + _ = text.encode(encoding) # Validate encoding and validate it can encode the given text + return encoding return cls.__DEFAULT_ENCODING @@ -87,7 +91,7 @@ def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, c self.cookies = cookies self.headers = headers self.request_headers = request_headers - encoding = ResponseEncoding.get_value(encoding) + encoding = ResponseEncoding.get_value(encoding, text) super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments) # For back-ward compatibility self.adaptor = self From 28b783e86bc68f7e681c57cf13ff6f728a0691f4 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 22:41:04 +0200 Subject: [PATCH 09/10] Improving the stealth mode a bit for Playwright --- scrapling/engines/constants.py | 2 +- scrapling/engines/pw.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/scrapling/engines/constants.py b/scrapling/engines/constants.py index 245e5c0..926e238 100644 --- a/scrapling/engines/constants.py +++ b/scrapling/engines/constants.py @@ -44,7 +44,7 @@ '--disable-default-apps', '--disable-print-preview', '--disable-dev-shm-usage', - '--disable-popup-blocking', + # '--disable-popup-blocking', '--metrics-recording-only', '--disable-crash-reporter', '--disable-partial-raster', 
diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 3f6a187..818720b 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -90,6 +90,14 @@ def __init__( self.nstbrowser_mode = bool(nstbrowser_mode) self.nstbrowser_config = nstbrowser_config self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {} + self.harmful_default_args = [ + # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884 + '--enable-automation', + '--disable-popup-blocking', + # '--disable-component-update', + # '--disable-default-apps', + # '--disable-extensions', + ] def _cdp_url_logic(self, flags: Optional[List] = None) -> str: """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is @@ -154,10 +162,10 @@ def fetch(self, url: str) -> Response: else: if self.stealth: browser = p.chromium.launch( - headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium' + headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium' ) else: - browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium') + browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium') # Creating the context if self.stealth: From 06a47f9644c57fddaac1fdff77851a7a5c0fb55e Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Tue, 26 Nov 2024 22:55:22 +0200 Subject: [PATCH 10/10] Small fix for the `ResponseEncoding` class --- scrapling/engines/toolbelt/custom.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py index 0cdc65e..2c7cd94 100644 --- a/scrapling/engines/toolbelt/custom.py +++ b/scrapling/engines/toolbelt/custom.py @@ -65,10 +65,10 @@ def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> encoding = params["charset"].strip("'\"") # Apply content-type specific rules - if content_type in cls.__ISO_8859_1_CONTENT_TYPES: + elif content_type in cls.__ISO_8859_1_CONTENT_TYPES: encoding = "ISO-8859-1" - if content_type == "application/json": + elif content_type == "application/json": encoding = cls.__DEFAULT_ENCODING if encoding:
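Taken together, the patches above add a handful of user-facing options. The sketch below is a minimal, hypothetical usage example assembled only from the signatures introduced in these diffs (`proxy` and `locale` on `PlayWrightFetcher.fetch`, `disable_ads` on `StealthyFetcher.fetch`, and `proxy` on the basic `Fetcher` methods); the URLs and the proxy address are placeholders, and the fetchers are instantiated the same way as in the README examples.

```python
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher

# Placeholder proxy in the format the README documents: http://username:password@host:port
proxy = 'http://username:password@localhost:8030'

# Patch 05: the basic Fetcher methods now accept a `proxy` string for HTTP and HTTPS traffic.
page = Fetcher().get('https://httpbin.org/get', proxy=proxy)

# Patches 01-02: PlayWrightFetcher now applies the proxy even with stealth mode off (the default)
# and lets you set the browser locale instead of the previously hard-coded 'en-US'.
page = PlayWrightFetcher().fetch('https://httpbin.org/get', proxy=proxy, locale='de-DE')

# Patch 03: StealthyFetcher installs the uBlock Origin addon by default (disable_ads=True);
# pass disable_ads=False to leave the addon out.
page = StealthyFetcher().fetch('https://httpbin.org/get', disable_ads=False)
```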