#! python3
# Program web-scrapes https://jobs.sap.com for a given search string.
import os, sys, requests, bs4, csv, re
from datetime import datetime
from jobs_to_db import create_table, insert_row

_url = 'https://jobs.sap.com/search/?q='

# Function to return number of hits given a search string
def get_hits(string):
    string = string.strip().lower()
    url_string = string.replace(' ', '+')
    url = _url + url_string
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, features="lxml")
    return int(soup.select('.paginationLabel > b')[1].text)
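
# Note: get_hits() relies on the second <b> inside '.paginationLabel' holding
# the total hit count; the label presumably reads something like
# "Results 1 - 25 of 257" (the exact wording is an assumption about the markup).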

# Function to extract relevant links from the up to 25 results returned per page
def search_page(string, page_after):
    string = string.strip().lower()
    url_string = string.replace(' ', '+')
    url = _url + url_string + '&startrow={}'.format(page_after)
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, features="lxml")
    # CSS selector for link to job
    all_jobs = soup.select('.jobTitle.hidden-phone > .jobTitle-link')
    # Take only those jobs which have the search string in the title
    filtered_jobs = [job for job in all_jobs if string in job.text.lower()]
    # By choosing a dictionary, we omit jobs with the same job title, as they
    # are often duplicates
    job_links = {job.text: 'https://jobs.sap.com' + job.get('href')
                 for job in filtered_jobs}
    return job_links
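
# The returned dict maps unique job titles to absolute links; the title and
# path below are hypothetical examples of its shape:
#   {'Developer - Java': 'https://jobs.sap.com/job/Developer-Java/...', ...}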

# Function to collect results from all result pages
def get_joblinks(string):
    job_links = {}
    for i in range(0, get_hits(string), 25):
        job_links.update(search_page(string, i))
    return job_links
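
# For example, with 60 hits the loop above calls search_page() with startrow
# values 0, 25, and 50, covering all three result pages.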

# Function to extract all relevant features and return them as a list of lists
# given a dict of job titles with job links
def job_results(job_links):
    # List of features to extract
    feature_classes = {'Date Posted': 'datePosted', 'Location': 'jobLocation'}
    job_desc = ['Requisition ID', 'Expected Travel', 'Career Status',
                'Employment Type']
    req = ['skill', 'competencies', 'requirement', 'bonus', 'pre-requisite',
           'prerequisite', 'experience', 'you have achieved',
           'competency', 'who you are', 'education', 'qualification',
           'what do you need', 'qualifications', 'requirements', 'prerequisites',
           'experiences', 'skills', 'you will need', 'also have']
    jobs = [['Job Title'] + list(feature_classes.keys()) + job_desc +
            ['Requirements']]
    for title, link in job_links.items():
        job_data = [title]
        res = requests.get(link)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, features="lxml")
        # Extract 'feature_classes'
        for feature in feature_classes.values():
            feature_tag = soup.select('span[itemprop="{}"]'.format(feature))[0]
            # Strip bullets, spaces, tabs, and newlines
            feature_value = feature_tag.text.strip('•\t\n\r ')
            job_data.append(feature_value)
        # Extract 'job_desc' from the job description block
        job_desc_tags = soup.select('.jobdescription > p > span > span > strong')
        job_desc_dict_all = {tag.text.strip(': '): tag.next_sibling
                             for tag in job_desc_tags}
        job_desc_list_wanted = [job_desc_dict_all.get(key) for key in job_desc]
        job_data.extend(job_desc_list_wanted)
        # Extract 'req' from job description block
        req_tags = soup.select('.jobdescription > p')
        job_req = ''
        for tag in req_tags:
            # Check if tag is a sub-header (8 words or fewer) containing any of
            # the req key words
            hit1 = len(tag.text.split()) <= 8 and \
                any(key in tag.text.lower() for key in req)
            # Check if tag begins (first three words) with any of the req key words
            hit2 = any(key in ' '.join(tag.text.lower().split()[:3])
                       for key in req)
            if hit1:
                # Find 1st and 2nd siblings of the current tag that are neither
                # NavigableStrings nor non-breaking spaces
                first_sib = tag.next_sibling
                while first_sib is not None and \
                        (isinstance(first_sib, bs4.element.NavigableString) or
                         first_sib.text == '\xa0'):
                    first_sib = first_sib.next_sibling
                second_sib = first_sib.next_sibling if first_sib else None
                while second_sib is not None and \
                        (isinstance(second_sib, bs4.element.NavigableString) or
                         second_sib.text == '\xa0'):
                    second_sib = second_sib.next_sibling
                # Fetch first sibling
                if first_sib:
                    job_req = re.sub(r'\n+', '\n', job_req +
                                     first_sib.text).strip('•\t\n\r ') + '\n'
                # Fetch second sibling if longer than 8 words (not a sub-heading)
                if second_sib and len(second_sib.text.split()) > 8 and \
                        first_sib.name != 'ul':
                    job_req = re.sub(r'\n+', '\n', job_req +
                                     second_sib.text).strip('•\t\n\r ') + '\n'
            elif hit2:
                job_req = re.sub(r'\n+', '\n', job_req +
                                 tag.text).strip('•\t\n\r ') + '\n'
        job_data.append(re.sub(r'\xa0', ' ', job_req).strip('•\t\n\r '))
        # Append finished job to list
        jobs.append(job_data)
    return jobs
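
# Shape of the returned list of lists (the header row follows from the code
# above; the data row values are hypothetical):
#   [['Job Title', 'Date Posted', 'Location', 'Requisition ID',
#     'Expected Travel', 'Career Status', 'Employment Type', 'Requirements'],
#    ['Developer - Java', 'Jan 1, 2020', 'Walldorf, DE', '123456', '0 - 10%',
#     'Professional', 'Regular Full Time', 'Strong Java skills ...'], ...]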

# Wrapper function to get all jobs matching a search string as a list of lists
def get_jobs(string):
    print('Fetching results...')  # display text while downloading
    jobs = job_results(get_joblinks(string))
    print('Done.')
    print('Found {} results.'.format(len(jobs) - 1))
    return jobs
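
# Example (illustrative; actual counts depend on current postings):
#   jobs = get_jobs('developer')  # header row plus one row per matching job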

# Wrapper function to get all jobs matching a search string as a csv file
def get_jobs_to_csv(string):
    # Check for forbidden filename characters.
    forbidden = ['.', ':', '/', '\\']
    if string == '' or any(char in string for char in forbidden):
        flag = 'output'
    else:
        flag = string
    print('Writing results to file "{}.csv"...'.format(flag))
    # Change to the directory containing this script (abspath avoids an empty
    # dirname when the script is invoked by bare filename)
    base_path = os.path.dirname(os.path.abspath(sys.argv[0]))
    os.chdir(base_path)
    csv_data = job_results(get_joblinks(string))
    # Write data to csv; newline='' prevents blank rows on Windows
    with open('{}.csv'.format(flag), 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(csv_data)
    print('Done.')
    print('Found {} results.'.format(len(csv_data) - 1))
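
# Example usage (requires network access; the filename falls back to
# 'output.csv' when the search string is empty or contains forbidden
# characters):
#   get_jobs_to_csv('data scientist')  # -> writes 'data scientist.csv'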

# Wrapper function to get all jobs matching a search string into a MySQL DB
def get_jobs_to_db(string):
    print('Fetching results...')  # display text while downloading
    jobs = job_results(get_joblinks(string))
    string = string.replace(' ', '_') + '@' + datetime.now().strftime('%Y-%m-%d')
    create_table(string)
    for row in jobs[1:]:
        insert_row(string, row)
    print('Done.')
    print('Added {} records to table {} in MySQL database.'.format(len(jobs) - 1,
                                                                   string))
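
# The target table is named after the search string and run date, e.g. a run
# for 'data scientist' on 2020-01-01 uses the table 'data_scientist@2020-01-01'.
# create_table() and insert_row() come from the local jobs_to_db module.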

# Main entry point for script
def main():
    # An empty search string matches every posting, so this writes all jobs
    # to 'output.csv'
    get_jobs_to_csv('')


if __name__ == '__main__':
    sys.exit(main())