-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinkedin.py
97 lines (82 loc) · 4.17 KB
/
linkedin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import asyncio
import os
import time
from urllib.parse import urlparse, parse_qs

from bs4 import BeautifulSoup
from zenrows import ZenRowsClient
# Options sent along with every ZenRows request issued by this script.
# The key names suggest JS rendering, anti-bot bypass and premium proxies;
# see the ZenRows API documentation for their exact semantics.
zenrows_params = {"js_render": "true", "antibot": "true", "premium_proxy": "true"}

# Shared ZenRows client: up to 10 concurrent requests, 1 retry per request.
# SECURITY: the API key should come from the ZENROWS_API_KEY environment
# variable. The literal fallback only preserves the previous behaviour; that
# key has been committed to source control and should be rotated and removed.
client = ZenRowsClient(
    os.environ.get("ZENROWS_API_KEY", "7b26afa746c5aa85d837d1440875a2c44279615a"),
    concurrency=10,
    retries=1,
)
def parse_html(html_content):
    """Build a BeautifulSoup tree from raw HTML.

    Args:
        html_content: The HTML markup to parse.

    Returns:
        The BeautifulSoup object produced with the ``html.parser`` backend;
        callers query it to extract whatever data they need.
    """
    return BeautifulSoup(html_content, features='html.parser')
def _text_of(element):
    """Return the stripped text of a parsed element, or "" when it is missing."""
    return element.get_text(strip=True) if element is not None else ""


def _item_at(items, index):
    """Return ``items[index]``, or None when the list is too short."""
    return items[index] if len(items) > index else None


def _extract_profile(soup):
    """Collect the top-card fields of a parsed profile page into a dict.

    Every field falls back to "" when its element is absent, so a page with
    an unexpected layout yields an incomplete record instead of an exception.
    """
    subline = soup.find('h3', class_='top-card-layout__first-subline')
    # The location is the first <span> inside the subline, when there is one.
    location_spans = subline.find_all('span') if subline is not None else []
    # The first description span is used as experience, the second as education.
    descriptions = soup.find_all('span', class_='top-card-link__description')
    return {
        "Name": _text_of(soup.find('h1', class_='top-card-layout__title')),
        "Role": _text_of(soup.find('h2', class_='top-card-layout__headline')),
        "location": _text_of(_item_at(location_spans, 0)),
        "experience": _text_of(_item_at(descriptions, 0)),
        "education": _text_of(_item_at(descriptions, 1)),
    }


async def scrap_linkedin(urls):
    """Fetch profile pages concurrently and extract their top-card data.

    All URLs are requested through the module-level ZenRows ``client`` with
    ``zenrows_params``. Each response with status 200 is parsed and its
    record is printed; any other response, or a request that raised, is
    reported as "Not available" for that URL without aborting the batch.

    Args:
        urls: Iterable of profile URLs to fetch.

    Returns:
        A list with one dict per successfully fetched profile, in input
        order, with the keys "Name", "Role", "location", "experience" and
        "education" (missing fields are empty strings).
    """
    # Materialise once so any iterable works and can be paired with responses.
    urls = list(urls)
    # return_exceptions=True keeps one failed request from discarding the rest;
    # gather returns results in the same order as the awaitables it was given.
    responses = await asyncio.gather(
        *[client.get_async(url, zenrows_params) for url in urls],
        return_exceptions=True,
    )
    result = []
    for url, response in zip(urls, responses):
        if isinstance(response, BaseException):
            print(f"Not available --> {url} ({response!r})")
            continue
        if response.status_code != 200:
            print(f"Not available --> {url}")
            continue
        profile = _extract_profile(parse_html(response.content))
        print(profile)
        result.append(profile)
    return result
if __name__ == '__main__':
    # Script entry point: scrape the listed profiles and report the elapsed time.
    started_at = time.monotonic()
    # Only the first profile is active; the rest are kept as ready-to-enable candidates.
    urls = [
        "https://www.linkedin.com/in/michael-bage-10214a112",
        # "https://www.linkedin.com/in/michael-bage-242a6b204",
        # "https://www.linkedin.com/in/michael-bage-4781a970",
        # "https://www.linkedin.com/in/mike-bage-78757532",
        # "https://uk.linkedin.com/in/michael-bage-13125447",
        # "https://uk.linkedin.com/in/michael-bage-cgli-tmiet-33b87643",
        # "https://uk.linkedin.com/in/michael-bage-8397154b",
        # "https://uk.linkedin.com/in/mike-bage-357bb1b9",
        # "https://br.linkedin.com/in/michael-bag%C3%A9-576968122"
    ]
    asyncio.run(scrap_linkedin(urls))
    elapsed = time.monotonic() - started_at
    # ANSI escape codes: switch to green for the summary line, then reset.
    green, reset = "\033[32m", "\033[0m"
    print(green + f"Extracting Done ... [{elapsed:.2f}s]." + reset)