Skip to content
This repository has been archived by the owner on Dec 9, 2022. It is now read-only.

Commit

Permalink
Merge pull request #8 from viniciusgava/feature/non-selenium-version
Browse files Browse the repository at this point in the history
Non Selenium download implementation
  • Loading branch information
viniciusgava authored Feb 8, 2019
2 parents fac698b + 36b99f1 commit 97f64f3
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 128 deletions.
9 changes: 1 addition & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
FROM python:3.6-alpine3.8

# update apk repo
RUN apk add --no-cache chromium chromium-chromedriver \
&& rm -rf /var/lib/apt/lists/* \
/var/cache/apk/* \
/usr/share/man \
/tmp/*

RUN pip3 install selenium==3.8.0 requests
RUN pip3 install requests

COPY src/ /usr/workspace

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
build-docker:cleanup
docker build -t viniciusgava/portaldorh-holerite-download:latest .
docker build --no-cache -t viniciusgava/portaldorh-holerite-download:latest .

publish-image:
docker push viniciusgava/portaldorh-holerite-download:latest
Expand Down
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Portal do RH Download
Download holerite PDF using selenium, chrome and python at Portal do RH.
Download holerite PDF using python at Portal do RH.

## Additional Integrations
- E-mail with downloaded PDF attachment by Mail Gun
Expand Down Expand Up @@ -145,13 +145,11 @@ Notification Body

## Usage - Local
The Makefile and the instructions below expect you to use python3.

It also expected you already have a chrome webdrive installed.


1. Clone repository
2. Run ``make prepare-local``
3. Edit ``src/settings/local.py`` file with your information.
4. Run ``python3 src/app.py local`
4. Run ``python3 src/app.py local``

## Integrations Placeholder
Some integration fields accept placeholders, which means you can use the integration's internal fields in your texts.
Expand Down
180 changes: 71 additions & 109 deletions src/automate/downloader.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,37 @@
# coding=utf-8
import os
import shutil
import time
import tempfile
import re
import requests
from html.parser import HTMLParser

import selenium
from selenium.webdriver.support.ui import Select

class GenericWebFormsParser(HTMLParser):
    """Collect hidden WebForms state fields from an HTML document.

    After ``feed()`` has been called, ``inputs`` maps the ``name`` of every
    ``<input>`` element whose name starts with ``__`` (for example
    ``__VIEWSTATE``) to its ``value`` attribute. Inputs without a ``name``,
    with a name that does not start with ``__``, or without a ``value``
    attribute are skipped.
    """

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        # name -> value of every "__"-prefixed <input> seen so far
        self.inputs = {}

    def handle_starttag(self, tag, attrs):
        # Void elements are frequently written without the trailing slash
        # (``<input ...>``); HTMLParser reports those here.
        self._collect_input(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        # Self-closed form (``<input ... />``).
        self._collect_input(tag, attrs)

    def _collect_input(self, tag, attrs):
        """Store the value of a ``__``-prefixed ``<input>`` in ``inputs``."""
        if tag != 'input':
            return
        attr = dict(attrs)

        # ``name`` may be absent, or None when written without a value;
        # neither can be a WebForms state field, so skip instead of raising.
        name = attr.get('name')
        if not name or not name.startswith('__'):
            return
        if 'value' not in attr:
            return

        self.inputs[name] = attr['value']


class Downloader:
tmp_download_path = tempfile.mkdtemp()

def __init__(self, settings, logger):
self.session = requests.Session()
self.settings = settings
self.logger = logger

# Define driver options
driver_profile = {
"plugins.plugins_list": [
{
"enabled": False,
"name": "Chrome PDF Viewer"
}
],
"plugins.always_open_pdf_externally": True,
"download.default_directory": self.tmp_download_path,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"safebrowsing.enabled": True
}

driver_options = selenium.webdriver.ChromeOptions()
driver_options.add_experimental_option("prefs", driver_profile)

# Should use headless?
if settings.headless is True:
driver_options.add_argument('--no-sandbox')
driver_options.add_argument('--headless')
driver_options.add_argument('--disable-gpu')

# Configure driver
self.driver = selenium.webdriver.Chrome(chrome_options=driver_options)
self.driver.implicitly_wait(10)

# Workaround to fix headless problem to set default download path
if settings.headless is True:
self.enable_download_in_headless_chrome()

def run(self):
self.logger.info("Initializing Holerite download")
self.logger.info("Download path: " + self.settings.default_download_path)
Expand All @@ -56,129 +40,107 @@ def run(self):
self.logger.info("Portal do RH username: " + self.settings.portal_rh.username)
self.logger.info("Holerite year: " + self.settings.search_year)
self.logger.info("Holerite month: " + self.settings.search_month)
self.logger.info("Headless: %r" % self.settings.headless)

self.session = requests.Session()
self.session.verify = False

# Login
if self.login() is False:
return False

# Search document
if self.search_document() is False:
return False
response = self.search_document()

# Check result
result = self.check_result()
result = self.check_result(response)

self.driver.close()
self.logger.info("download finished")

return result

def login(self):
# Open login page
self.driver.get(self.settings.portal_rh.url)
home_response = self.session.get(self.settings.portal_rh.url)

# Input username
username_element = self.driver.find_element_by_id("CtrlLogin1_txtIDNumerico")
username_element.clear()
username_element.send_keys(self.settings.portal_rh.username)
# Fetch webforms validation fields
parser = GenericWebFormsParser()
parser.feed(home_response.text)
form_data = parser.inputs

# Input password
password_element = self.driver.find_element_by_id("CtrlLogin1_txtSenhaAlfanumerico")
password_element.clear()
password_element.send_keys(self.settings.portal_rh.password)
# Input username and password
form_data['CtrlLogin1$txtIDNumerico'] = self.settings.portal_rh.username
form_data['CtrlLogin1$txtSenhaAlfanumerico'] = self.settings.portal_rh.password
form_data['CtrlLogin1$btnIniciar'] = 'Iniciar'

self.logger.info("Username and password has been filled. Logging..")

# Sign-in
self.driver.find_element_by_id("CtrlLogin1_btnIniciar").click()
login_response = self.session.post(self.settings.portal_rh.url, data=form_data, allow_redirects=False)

home_pattern = re.compile(".*Auto_Default\.aspx.*")
# Sign-in?
if login_response.status_code == 302:
return True

# Sign-in Fail?
if home_pattern.match(self.driver.current_url) is not None:
self.logger.warning('Username or password invalid')
return False
# Process login error
m = re.search('<span id="CtrlLogin1_lblMensagemAcesso".+>(.+)</span>', login_response.text)
error_msg = m.group(1)

return True
self.logger.warning('Username or password invalid - "%s"' % error_msg)

return False

def search_document(self):
# Select mainFrame, where is the search form
self.driver.switch_to.frame("mainFrame")

# Choose document type
self.logger.info("Choosing document type: MENSAL")
type_select = Select(self.driver.find_element_by_id("controlsAscx111_cboFolha"))
type_select.select_by_visible_text("MENSAL")
# Open Search page
search_page_url = self.settings.portal_rh.url
search_page_url = search_page_url.replace('auto_default.aspx', 'Auto_PrincipalConteudo.aspx')
search_page_response = self.session.get(search_page_url)

# Given search date
self.logger.info("Inputing search date: " + self.get_search_date())
date_element = self.driver.find_element_by_id("controlsAscx111_txtDataRef")
date_element.clear()
date_element.send_keys(self.get_search_date())
# Fetch webforms validation fields
parser = GenericWebFormsParser()
parser.feed(search_page_response.text)

# Perform search
self.driver.find_element_by_id("controlsAscx111_btnDemoConsultar").click()
# Fill required fields to fetch PDF
form_data = parser.inputs
form_data['controlsAscx111$cboFolha'] = 'MENSAL 1'
form_data['controlsAscx111$txtDataRef'] = self.get_search_date()
form_data['controlsAscx111$btnDemoConsultar'] = 'Consultar'
form_data['controlsAscx113$cboAno'] = '2017'
form_data['PG'] = ''
form_data['scrollLeft'] = 0
form_data['scrollTop'] = 0

return True
# Fetch PDF
pdf_response = self.session.post(search_page_url, data=form_data)

def check_result(self):
return pdf_response

def check_result(self, response):
# Regex to check if given date is invalid
invalid_date_pattern = re.compile(
"Demonstrativo de Pagamento (.+) não liberado para emissão!. Será liberado a partir do dia ([0-9]{2}\/[0-9]{2}\/[0-9]{4})\.")

# is a invalid date?
if invalid_date_pattern.search(self.driver.page_source) is not None:
if invalid_date_pattern.search(response.text) is not None:
self.logger.warning("Invalid search date: " + self.get_search_date())
return False

# It is a valid date

# Download file path
download_file_path = os.path.join(self.tmp_download_path, 'Auto_PrincipalConteudo.aspx')

# Wait file download
self.logger.info("Waiting pdf download")
if self.wait_file_exists(download_file_path) is False:
return False

# Success - File downloaded
self.logger.info("Download finished. Moving file to final path...")

# Pdf file name
pdf_file_name = "%s-%s.pdf" % (self.settings.search_year, self.settings.search_month)

# Final file path
pdf_file_path = os.path.abspath(self.settings.default_download_path)
pdf_file_path = os.path.join(pdf_file_path, pdf_file_name)

# Move
shutil.move(download_file_path, pdf_file_path)
self.logger.info("File has been moved to: " + pdf_file_path)
# Save PDF
with open(pdf_file_path, 'wb') as f:
f.write(response.content)
f.close()
self.logger.info("File saved at: " + pdf_file_path)

return True

def get_search_date(self):
return "%s/%s" % (self.settings.search_month, self.settings.search_year)

def wait_file_exists(self, file_path):
timeout = 0
timeout_limit = 60
while not os.path.exists(file_path):
self.logger.info("Waiting download...")

time.sleep(3)
timeout += 3
if timeout > timeout_limit:
self.logger.error("timeout - Could not download file")
return False

return True

def enable_download_in_headless_chrome(self):
# add missing support for chrome "send_command" to selenium webdriver
self.driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')

params = {'cmd': 'Page.setDownloadBehavior',
'params': {'behavior': 'allow', 'downloadPath': self.tmp_download_path}}
self.driver.execute("send_command", params)
2 changes: 0 additions & 2 deletions src/settings/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@
if os.path.exists(default_download_path) is False:
os.makedirs(default_download_path)

headless = True

# Mail Gun - Enable?
mailgun = ObjectDic({
'enable': os.environ.get("MAIL_GUN_ENABLE", 'false')
Expand Down
3 changes: 0 additions & 3 deletions src/settings/local.py.dist
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ default_download_path = os.path.realpath(os.path.dirname(os.path.abspath(__file_
if os.path.exists(default_download_path) is False:
os.makedirs(default_download_path)

# Use Headless?
headless = False

# Mail Gun - Enable?
mailgun = ObjectDic({'enable': False})

Expand Down

0 comments on commit 97f64f3

Please sign in to comment.