Merge pull request #8 from viniciusgava/feature/non-selenium-version

Non Selenium download implementation
viniciusgava · Feb 8, 2019 · 97f64f3 · 97f64f3
2 parents fac698b + 36b99f1
commit 97f64f3
Show file tree

Hide file tree

Showing 6 changed files with 76 additions and 128 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,13 +1,6 @@
 FROM python:3.6-alpine3.8
 
-# update apk repo
-RUN apk add --no-cache chromium chromium-chromedriver \
-    && rm -rf /var/lib/apt/lists/* \
-    /var/cache/apk/* \
-    /usr/share/man \
-    /tmp/*
-
-RUN pip3 install selenium==3.8.0 requests
+RUN pip3 install requests
 
 COPY src/ /usr/workspace
 

diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 build-docker:cleanup
-	docker build -t viniciusgava/portaldorh-holerite-download:latest .
+	docker build --no-cache -t viniciusgava/portaldorh-holerite-download:latest .
 
 publish-image:
 	docker push viniciusgava/portaldorh-holerite-download:latest

diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # Portal do RH Download
-Download holerite PDF using selenium, chrome and python at Portal do RH.
+Download holerite PDF using python at Portal do RH.
 
 ## Additional Integrations
 - E-mail with downloaded PDF attachment by Mail Gun
@@ -145,13 +145,11 @@ Notification Body
 
 ## Usage - Local
 The Makefile and the instructions below expect you to use python3.
-
-It also expected you already have a chrome webdrive installed. 
-
+
 1. Clone repository
 2. Run ``make prepare-local``
 3. Edit ``src/settings/local.py`` file with your information.
-4. Run ``python3 src/app.py local`
+4. Run ``python3 src/app.py local``
 
 ## Integrations Placeholder
 Some integration fields accept placeholders, which means you can reference the integration's internal fields in your texts.

diff --git a/src/automate/downloader.py b/src/automate/downloader.py
@@ -1,53 +1,37 @@
 # coding=utf-8
 import os
-import shutil
-import time
 import tempfile
 import re
+import requests
+from html.parser import HTMLParser
 
-import selenium
-from selenium.webdriver.support.ui import Select
+
+class GenericWebFormsParser(HTMLParser):
+
+    def __init__(self, *, convert_charrefs=True):
+        super().__init__(convert_charrefs=convert_charrefs)
+        self.inputs = {}
+
+    def handle_startendtag(self, tag, attrs):
+        if tag != 'input':
+            return
+        attr = dict(attrs)
+        if not attr['name'].startswith('__'):
+            return
+        if 'value' not in attr:
+            return
+
+        self.inputs[attr['name']] = attr['value']
 
 
 class Downloader:
     tmp_download_path = tempfile.mkdtemp()
 
     def __init__(self, settings, logger):
+        self.session = requests.Session()
         self.settings = settings
         self.logger = logger
 
-        # Define driver options
-        driver_profile = {
-            "plugins.plugins_list": [
-                {
-                    "enabled": False,
-                    "name": "Chrome PDF Viewer"
-                }
-            ],
-            "plugins.always_open_pdf_externally": True,
-            "download.default_directory": self.tmp_download_path,
-            "download.prompt_for_download": False,
-            "download.directory_upgrade": True,
-            "safebrowsing.enabled": True
-        }
-
-        driver_options = selenium.webdriver.ChromeOptions()
-        driver_options.add_experimental_option("prefs", driver_profile)
-
-        # Should use headless?
-        if settings.headless is True:
-            driver_options.add_argument('--no-sandbox')
-            driver_options.add_argument('--headless')
-            driver_options.add_argument('--disable-gpu')
-
-        # Configure driver
-        self.driver = selenium.webdriver.Chrome(chrome_options=driver_options)
-        self.driver.implicitly_wait(10)
-
-        # Workaround to fix headless problem to set default download path
-        if settings.headless is True:
-            self.enable_download_in_headless_chrome()
-
     def run(self):
         self.logger.info("Initializing Holerite download")
         self.logger.info("Download path: " + self.settings.default_download_path)
@@ -56,129 +40,107 @@ def run(self):
         self.logger.info("Portal do RH username: " + self.settings.portal_rh.username)
         self.logger.info("Holerite year: " + self.settings.search_year)
         self.logger.info("Holerite month: " + self.settings.search_month)
-        self.logger.info("Headless: %r" % self.settings.headless)
+
+        self.session = requests.Session()
+        self.session.verify = False
 
         # Login
         if self.login() is False:
             return False
 
         # Search document
-        if self.search_document() is False:
-            return False
+        response = self.search_document()
 
         # Check result
-        result = self.check_result()
+        result = self.check_result(response)
 
-        self.driver.close()
         self.logger.info("download finished")
 
         return result
 
     def login(self):
         # Open login page
-        self.driver.get(self.settings.portal_rh.url)
+        home_response = self.session.get(self.settings.portal_rh.url)
 
-        # Input username
-        username_element = self.driver.find_element_by_id("CtrlLogin1_txtIDNumerico")
-        username_element.clear()
-        username_element.send_keys(self.settings.portal_rh.username)
+        # Fetch webforms validation fields
+        parser = GenericWebFormsParser()
+        parser.feed(home_response.text)
+        form_data = parser.inputs
 
-        # Input password
-        password_element = self.driver.find_element_by_id("CtrlLogin1_txtSenhaAlfanumerico")
-        password_element.clear()
-        password_element.send_keys(self.settings.portal_rh.password)
+        # Input username and password
+        form_data['CtrlLogin1$txtIDNumerico'] = self.settings.portal_rh.username
+        form_data['CtrlLogin1$txtSenhaAlfanumerico'] = self.settings.portal_rh.password
+        form_data['CtrlLogin1$btnIniciar'] = 'Iniciar'
 
         self.logger.info("Username and password has been filled. Logging..")
 
         # Sign-in
-        self.driver.find_element_by_id("CtrlLogin1_btnIniciar").click()
+        login_response = self.session.post(self.settings.portal_rh.url, data=form_data, allow_redirects=False)
 
-        home_pattern = re.compile(".*Auto_Default\.aspx.*")
+        # Sign-in?
+        if login_response.status_code == 302:
+            return True
 
-        # Sign-in Fail?
-        if home_pattern.match(self.driver.current_url) is not None:
-            self.logger.warning('Username or password invalid')
-            return False
+        # Process login error
+        m = re.search('<span id="CtrlLogin1_lblMensagemAcesso".+>(.+)</span>', login_response.text)
+        error_msg = m.group(1)
 
-        return True
+        self.logger.warning('Username or password invalid - "%s"' % error_msg)
+
+        return False
 
     def search_document(self):
-        # Select mainFrame, where is the search form
-        self.driver.switch_to.frame("mainFrame")
 
-        # Choose document type
-        self.logger.info("Choosing document type: MENSAL")
-        type_select = Select(self.driver.find_element_by_id("controlsAscx111_cboFolha"))
-        type_select.select_by_visible_text("MENSAL")
+        # Open Search page
+        search_page_url = self.settings.portal_rh.url
+        search_page_url = search_page_url.replace('auto_default.aspx', 'Auto_PrincipalConteudo.aspx')
+        search_page_response = self.session.get(search_page_url)
 
-        # Given search date
-        self.logger.info("Inputing search date: " + self.get_search_date())
-        date_element = self.driver.find_element_by_id("controlsAscx111_txtDataRef")
-        date_element.clear()
-        date_element.send_keys(self.get_search_date())
+        # Fetch webforms validation fields
+        parser = GenericWebFormsParser()
+        parser.feed(search_page_response.text)
 
-        # Perform search
-        self.driver.find_element_by_id("controlsAscx111_btnDemoConsultar").click()
+        # Fill required fields to fetch PDF
+        form_data = parser.inputs
+        form_data['controlsAscx111$cboFolha'] = 'MENSAL	1'
+        form_data['controlsAscx111$txtDataRef'] = self.get_search_date()
+        form_data['controlsAscx111$btnDemoConsultar'] = 'Consultar'
+        form_data['controlsAscx113$cboAno'] = '2017'
+        form_data['PG'] = ''
+        form_data['scrollLeft'] = 0
+        form_data['scrollTop'] = 0
 
-        return True
+        # Fetch PDF
+        pdf_response = self.session.post(search_page_url, data=form_data)
 
-    def check_result(self):
+        return pdf_response
+
+    def check_result(self, response):
         # Regex to check if given date is invalid
         invalid_date_pattern = re.compile(
             "Demonstrativo de Pagamento (.+) não liberado para emissão!. Será liberado a partir do dia ([0-9]{2}\/[0-9]{2}\/[0-9]{4})\.")
 
         # is a invalid date?
-        if invalid_date_pattern.search(self.driver.page_source) is not None:
+        if invalid_date_pattern.search(response.text) is not None:
             self.logger.warning("Invalid search date: " + self.get_search_date())
             return False
 
         # It is a valid date
 
-        # Download file path
-        download_file_path = os.path.join(self.tmp_download_path, 'Auto_PrincipalConteudo.aspx')
-
-        # Wait file download
-        self.logger.info("Waiting pdf download")
-        if self.wait_file_exists(download_file_path) is False:
-            return False
-
-        # Success - File downloaded
-        self.logger.info("Download finished. Moving file to final path...")
-
         # Pdf file name
         pdf_file_name = "%s-%s.pdf" % (self.settings.search_year, self.settings.search_month)
 
         # Final file path
         pdf_file_path = os.path.abspath(self.settings.default_download_path)
         pdf_file_path = os.path.join(pdf_file_path, pdf_file_name)
 
-        # Move
-        shutil.move(download_file_path, pdf_file_path)
-        self.logger.info("File has been moved to: " + pdf_file_path)
+        # Save PDF
+        with open(pdf_file_path, 'wb') as f:
+            f.write(response.content)
+        f.close()
+        self.logger.info("File saved at: " + pdf_file_path)
 
         return True
 
     def get_search_date(self):
         return "%s/%s" % (self.settings.search_month, self.settings.search_year)
-
-    def wait_file_exists(self, file_path):
-        timeout = 0
-        timeout_limit = 60
-        while not os.path.exists(file_path):
-            self.logger.info("Waiting download...")
-
-            time.sleep(3)
-            timeout += 3
-            if timeout > timeout_limit:
-                self.logger.error("timeout - Could not download file")
-                return False
-
-        return True
-
-    def enable_download_in_headless_chrome(self):
-        # add missing support for chrome "send_command"  to selenium webdriver
-        self.driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
-
-        params = {'cmd': 'Page.setDownloadBehavior',
-                  'params': {'behavior': 'allow', 'downloadPath': self.tmp_download_path}}
-        self.driver.execute("send_command", params)
diff --git a/src/settings/docker.py b/src/settings/docker.py
@@ -44,8 +44,6 @@
 if os.path.exists(default_download_path) is False:
     os.makedirs(default_download_path)
 
-headless = True
-
 # Mail Gun - Enable?
 mailgun = ObjectDic({
     'enable': os.environ.get("MAIL_GUN_ENABLE", 'false')

diff --git a/src/settings/local.py.dist b/src/settings/local.py.dist
@@ -25,9 +25,6 @@ default_download_path = os.path.realpath(os.path.dirname(os.path.abspath(__file_
 if os.path.exists(default_download_path) is False:
     os.makedirs(default_download_path)
 
-# Use Headless?
-headless = False
-
 # Mail Gun - Enable?
 mailgun = ObjectDic({'enable': False})