website_scraper.py
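"""Scrape a web page through Bright Data's Scraping Browser (via Selenium Remote)
and convert its main content to Markdown."""
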
import os

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from selenium.webdriver import ChromeOptions, Remote
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from selenium.webdriver.remote.remote_connection import ClientConfig

BRIGHTDATA_USERNAME = os.getenv("BRIGHTDATA_USERNAME")
BRIGHTDATA_PASSWORD = os.getenv("BRIGHTDATA_PASSWORD")
BRIGHTDATA_REMOTE_SERVER_ADDR = os.getenv("BRIGHTDATA_REMOTE_SERVER_ADDR")
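
# Note: passing credentials and the endpoint via ClientConfig assumes a recent
# Selenium 4 release (roughly 4.22+), where the remote connection accepts a
# client_config instead of credentials embedded in the WebDriver URL.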
config = ClientConfig(
    username=BRIGHTDATA_USERNAME,
    password=BRIGHTDATA_PASSWORD,
    remote_server_addr=BRIGHTDATA_REMOTE_SERVER_ADDR,
)


def scrape_url(url):
    """Fetch `url` through the Scraping Browser and return its main content as Markdown."""
    print("Connecting to Scraping Browser...")
    options = ChromeOptions()
    options.page_load_strategy = "normal"
    options.set_capability("timeouts", {"pageLoad": 60000, "script": 60000})
    # The endpoint and credentials come from the ClientConfig above, so the
    # positional remote_server_addr argument is left as None.
    sbr_connection = ChromiumRemoteConnection(
        None, "goog", "chrome", client_config=config
    )
    with Remote(sbr_connection, options=options) as driver:
        try:
            print("Connected! Navigating...")
            driver.set_page_load_timeout(60)
            driver.set_script_timeout(60)
            driver.get(url)
            print("Navigated! Scraping page content...")
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # Extract the main content: prefer <main>, fall back to <article>,
            # then to a generic <div class="content"> container.
            main_content = soup.find("main")
            if not main_content:
                main_content = soup.find("article")
            if not main_content:
                main_content = soup.find("div", {"class": "content"})
            if not main_content:
                return None

            # Strip images before converting the HTML to Markdown.
            for img in main_content.find_all("img"):
                img.decompose()

            return md(str(main_content))
        except Exception as e:
            print(f"Error occurred: {e}")
            return None


if __name__ == "__main__":
    url = "https://www.bbc.com/news/articles/ckgzprprlyeo"
    content = scrape_url(url)
    print(content)
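
# Example usage (values below are illustrative, not real credentials; use the
# WebDriver endpoint shown in your Bright Data Scraping Browser zone settings):
#
#   export BRIGHTDATA_USERNAME="brd-customer-<id>-zone-<zone_name>"
#   export BRIGHTDATA_PASSWORD="<zone_password>"
#   export BRIGHTDATA_REMOTE_SERVER_ADDR="https://brd.superproxy.io:9515"
#   python website_scraper.py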