From 36b99f19c11e95e4be758578abf0b585d7060013 Mon Sep 17 00:00:00 2001 From: Vinicius Gava Date: Fri, 8 Feb 2019 01:29:42 -0200 Subject: [PATCH] New implementation for download Holerites PDFs without Selenium and Chromium. --- Dockerfile | 9 +- Makefile | 2 +- README.md | 8 +- src/automate/downloader.py | 180 +++++++++++++++---------------------- src/settings/docker.py | 2 - src/settings/local.py.dist | 3 - 6 files changed, 76 insertions(+), 128 deletions(-) diff --git a/Dockerfile b/Dockerfile index b97ac06..5e8ba45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,6 @@ FROM python:3.6-alpine3.8 -# update apk repo -RUN apk add --no-cache chromium chromium-chromedriver \ - && rm -rf /var/lib/apt/lists/* \ - /var/cache/apk/* \ - /usr/share/man \ - /tmp/* - -RUN pip3 install selenium==3.8.0 requests +RUN pip3 install requests COPY src/ /usr/workspace diff --git a/Makefile b/Makefile index 12131e0..754530b 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ build-docker:cleanup - docker build -t viniciusgava/portaldorh-holerite-download:latest . + docker build --no-cache -t viniciusgava/portaldorh-holerite-download:latest . publish-image: docker push viniciusgava/portaldorh-holerite-download:latest diff --git a/README.md b/README.md index 0bb56ee..71bbfb7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Portal do RH Download -Download holerite PDF using selenium, chrome and python at Portal do RH. +Download holerite PDF using python at Portal do RH. ## Additional Integrations - E-mail with downloaded PDF attachment by Mail Gun @@ -145,13 +145,11 @@ Notification Body ## Usage - Local Makefile and instruction bellow expected you uses python3. - -It also expected you already have a chrome webdrive installed. - + 1. Clone repository 2. Run ``make prepare-local`` 3. Edit ``src/settings/local.py`` file with your information. -4. Run ``python3 src/app.py local` +4. Run ``python3 src/app.py local`` ## Integrations Placeholder Some integration fields accept placeholder, that means you can use internal fields used on integration on your texts. diff --git a/src/automate/downloader.py b/src/automate/downloader.py index 9d57086..9692e1e 100644 --- a/src/automate/downloader.py +++ b/src/automate/downloader.py @@ -1,53 +1,37 @@ # coding=utf-8 import os -import shutil -import time import tempfile import re +import requests +from html.parser import HTMLParser -import selenium -from selenium.webdriver.support.ui import Select + +class GenericWebFormsParser(HTMLParser): + + def __init__(self, *, convert_charrefs=True): + super().__init__(convert_charrefs=convert_charrefs) + self.inputs = {} + + def handle_startendtag(self, tag, attrs): + if tag != 'input': + return + attr = dict(attrs) + if not attr['name'].startswith('__'): + return + if 'value' not in attr: + return + + self.inputs[attr['name']] = attr['value'] class Downloader: tmp_download_path = tempfile.mkdtemp() def __init__(self, settings, logger): + self.session = requests.Session() self.settings = settings self.logger = logger - # Define driver options - driver_profile = { - "plugins.plugins_list": [ - { - "enabled": False, - "name": "Chrome PDF Viewer" - } - ], - "plugins.always_open_pdf_externally": True, - "download.default_directory": self.tmp_download_path, - "download.prompt_for_download": False, - "download.directory_upgrade": True, - "safebrowsing.enabled": True - } - - driver_options = selenium.webdriver.ChromeOptions() - driver_options.add_experimental_option("prefs", driver_profile) - - # Should use headless? - if settings.headless is True: - driver_options.add_argument('--no-sandbox') - driver_options.add_argument('--headless') - driver_options.add_argument('--disable-gpu') - - # Configure driver - self.driver = selenium.webdriver.Chrome(chrome_options=driver_options) - self.driver.implicitly_wait(10) - - # Workaround to fix headless problem to set default download path - if settings.headless is True: - self.enable_download_in_headless_chrome() - def run(self): self.logger.info("Initializing Holerite download") self.logger.info("Download path: " + self.settings.default_download_path) @@ -56,95 +40,93 @@ def run(self): self.logger.info("Portal do RH username: " + self.settings.portal_rh.username) self.logger.info("Holerite year: " + self.settings.search_year) self.logger.info("Holerite month: " + self.settings.search_month) - self.logger.info("Headless: %r" % self.settings.headless) + + self.session = requests.Session() + self.session.verify = False # Login if self.login() is False: return False # Search document - if self.search_document() is False: - return False + response = self.search_document() # Check result - result = self.check_result() + result = self.check_result(response) - self.driver.close() self.logger.info("download finished") return result def login(self): # Open login page - self.driver.get(self.settings.portal_rh.url) + home_response = self.session.get(self.settings.portal_rh.url) - # Input username - username_element = self.driver.find_element_by_id("CtrlLogin1_txtIDNumerico") - username_element.clear() - username_element.send_keys(self.settings.portal_rh.username) + # Fetch webforms validation fields + parser = GenericWebFormsParser() + parser.feed(home_response.text) + form_data = parser.inputs - # Input password - password_element = self.driver.find_element_by_id("CtrlLogin1_txtSenhaAlfanumerico") - password_element.clear() - password_element.send_keys(self.settings.portal_rh.password) + # Input username and password + form_data['CtrlLogin1$txtIDNumerico'] = self.settings.portal_rh.username + form_data['CtrlLogin1$txtSenhaAlfanumerico'] = self.settings.portal_rh.password + form_data['CtrlLogin1$btnIniciar'] = 'Iniciar' self.logger.info("Username and password has been filled. Logging..") # Sign-in - self.driver.find_element_by_id("CtrlLogin1_btnIniciar").click() + login_response = self.session.post(self.settings.portal_rh.url, data=form_data, allow_redirects=False) - home_pattern = re.compile(".*Auto_Default\.aspx.*") + # Sign-in? + if login_response.status_code == 302: + return True - # Sign-in Fail? - if home_pattern.match(self.driver.current_url) is not None: - self.logger.warning('Username or password invalid') - return False + # Process login error + m = re.search('(.+)', login_response.text) + error_msg = m.group(1) - return True + self.logger.warning('Username or password invalid - "%s"' % error_msg) + + return False def search_document(self): - # Select mainFrame, where is the search form - self.driver.switch_to.frame("mainFrame") - # Choose document type - self.logger.info("Choosing document type: MENSAL") - type_select = Select(self.driver.find_element_by_id("controlsAscx111_cboFolha")) - type_select.select_by_visible_text("MENSAL") + # Open Search page + search_page_url = self.settings.portal_rh.url + search_page_url = search_page_url.replace('auto_default.aspx', 'Auto_PrincipalConteudo.aspx') + search_page_response = self.session.get(search_page_url) - # Given search date - self.logger.info("Inputing search date: " + self.get_search_date()) - date_element = self.driver.find_element_by_id("controlsAscx111_txtDataRef") - date_element.clear() - date_element.send_keys(self.get_search_date()) + # Fetch webforms validation fields + parser = GenericWebFormsParser() + parser.feed(search_page_response.text) - # Perform search - self.driver.find_element_by_id("controlsAscx111_btnDemoConsultar").click() + # Fill required fields to fetch PDF + form_data = parser.inputs + form_data['controlsAscx111$cboFolha'] = 'MENSAL 1' + form_data['controlsAscx111$txtDataRef'] = self.get_search_date() + form_data['controlsAscx111$btnDemoConsultar'] = 'Consultar' + form_data['controlsAscx113$cboAno'] = '2017' + form_data['PG'] = '' + form_data['scrollLeft'] = 0 + form_data['scrollTop'] = 0 - return True + # Fetch PDF + pdf_response = self.session.post(search_page_url, data=form_data) - def check_result(self): + return pdf_response + + def check_result(self, response): # Regex to check if given date is invalid invalid_date_pattern = re.compile( "Demonstrativo de Pagamento (.+) não liberado para emissão!. Será liberado a partir do dia ([0-9]{2}\/[0-9]{2}\/[0-9]{4})\.") # is a invalid date? - if invalid_date_pattern.search(self.driver.page_source) is not None: + if invalid_date_pattern.search(response.text) is not None: self.logger.warning("Invalid search date: " + self.get_search_date()) return False # It is a valid date - # Download file path - download_file_path = os.path.join(self.tmp_download_path, 'Auto_PrincipalConteudo.aspx') - - # Wait file download - self.logger.info("Waiting pdf download") - if self.wait_file_exists(download_file_path) is False: - return False - - # Success - File downloaded - self.logger.info("Download finished. Moving file to final path...") - # Pdf file name pdf_file_name = "%s-%s.pdf" % (self.settings.search_year, self.settings.search_month) @@ -152,33 +134,13 @@ def check_result(self): pdf_file_path = os.path.abspath(self.settings.default_download_path) pdf_file_path = os.path.join(pdf_file_path, pdf_file_name) - # Move - shutil.move(download_file_path, pdf_file_path) - self.logger.info("File has been moved to: " + pdf_file_path) + # Save PDF + with open(pdf_file_path, 'wb') as f: + f.write(response.content) + f.close() + self.logger.info("File saved at: " + pdf_file_path) return True def get_search_date(self): return "%s/%s" % (self.settings.search_month, self.settings.search_year) - - def wait_file_exists(self, file_path): - timeout = 0 - timeout_limit = 60 - while not os.path.exists(file_path): - self.logger.info("Waiting download...") - - time.sleep(3) - timeout += 3 - if timeout > timeout_limit: - self.logger.error("timeout - Could not download file") - return False - - return True - - def enable_download_in_headless_chrome(self): - # add missing support for chrome "send_command" to selenium webdriver - self.driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command') - - params = {'cmd': 'Page.setDownloadBehavior', - 'params': {'behavior': 'allow', 'downloadPath': self.tmp_download_path}} - self.driver.execute("send_command", params) diff --git a/src/settings/docker.py b/src/settings/docker.py index ba57f6b..e632901 100644 --- a/src/settings/docker.py +++ b/src/settings/docker.py @@ -44,8 +44,6 @@ if os.path.exists(default_download_path) is False: os.makedirs(default_download_path) -headless = True - # Mail Gun - Enable? mailgun = ObjectDic({ 'enable': os.environ.get("MAIL_GUN_ENABLE", 'false') diff --git a/src/settings/local.py.dist b/src/settings/local.py.dist index 2006f02..cc97802 100644 --- a/src/settings/local.py.dist +++ b/src/settings/local.py.dist @@ -25,9 +25,6 @@ default_download_path = os.path.realpath(os.path.dirname(os.path.abspath(__file_ if os.path.exists(default_download_path) is False: os.makedirs(default_download_path) -# Use Headless? -headless = False - # Mail Gun - Enable? mailgun = ObjectDic({'enable': False})