From 3410b059e79835445162483ef5d890819ce6cdc8 Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Wed, 16 Jun 2021 09:09:44 -0700 Subject: [PATCH 01/26] Update prereq_script Add option for metadata only retrieval --- ldcoolp/scripts/prereq_script | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ldcoolp/scripts/prereq_script b/ldcoolp/scripts/prereq_script index 7fb4b28..0a8c464 100755 --- a/ldcoolp/scripts/prereq_script +++ b/ldcoolp/scripts/prereq_script @@ -31,6 +31,7 @@ if __name__ == '__main__': parser.add_argument('--article_id', required=True, help='Figshare article ID') parser.add_argument('--url_open', action='store_true', help='Whether to use urlopen') parser.add_argument('--browser', action='store_true', help='Whether to use urlopen') + parser.add_argument('--metadata_only', action='store_false', help='Do not retrieve data, only metadata') # parser.add_argument('--api_token', required=True, help='Figshare API token') args = parser.parse_args() @@ -95,7 +96,7 @@ if __name__ == '__main__': # Run pre-req steps main.workflow(articles[ii], url_open=args.url_open, browser=args.browser, - log=log, config_dict=config_dict) + log=log, config_dict=config_dict, metadata_only=args.metadata_only) count += 1 log.info(f"Completed: {articles[ii]} ...") From b3d9f7f0bbd0f58a0166ee2d8838d3b4ba415956 Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Wed, 16 Jun 2021 09:15:10 -0700 Subject: [PATCH 02/26] Update main.py Add flag for metadata-only download --- ldcoolp/curation/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ldcoolp/curation/main.py b/ldcoolp/curation/main.py index 0154a82..590dce5 100644 --- a/ldcoolp/curation/main.py +++ b/ldcoolp/curation/main.py @@ -129,7 +129,7 @@ def move_to_next(self): def workflow(article_id, url_open=False, browser=True, log=None, - config_dict=config_default_dict): + config_dict=config_default_dict, metadata_only=False): """ Purpose: This function follows our initial set-up to: @@ -145,6 +145,7 @@ def workflow(article_id, url_open=False, browser=True, log=None, :param log: logger.LogClass object. Default is stdout via python logging :param config_dict: dict of dict with hierarchy of sections (figshare, curation, qualtrics) follow by options + :param metadata_only: When True, only downloads the item metadata. """ # If log is not defined, then output log to stdout @@ -160,7 +161,8 @@ def workflow(article_id, url_open=False, browser=True, log=None, pw.reserve_doi() # Retrieve data and place in 1.ToDo curation folder - pw.download_data() + if not metadata_only: + pw.download_data() # Download curation report pw.download_report() From 1a4bc056804bc378b04101cb8264713e2497b7d1 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 13:56:41 -0700 Subject: [PATCH 03/26] Add metadata_only instance variable for PrerequisiteWorkflow - main.workflow: Require running download_data method --- ldcoolp/curation/main.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ldcoolp/curation/main.py b/ldcoolp/curation/main.py index 590dce5..d0fa56f 100644 --- a/ldcoolp/curation/main.py +++ b/ldcoolp/curation/main.py @@ -38,7 +38,8 @@ class PrerequisiteWorkflow: """ def __init__(self, article_id, log=None, url_open=False, - config_dict=config_default_dict): + config_dict=config_default_dict, + metadata_only=False): # If log is not defined, then output log to stdout if isinstance(log, type(None)): @@ -71,6 +72,8 @@ def __init__(self, article_id, log=None, url_open=False, self.url_open = url_open + self.metadata_only = metadata_only + # Check if dataset has been retrieved try: source_stage = self.mc.get_source_stage(self.dn.folderName, verbose=False) @@ -153,7 +156,8 @@ def workflow(article_id, url_open=False, browser=True, log=None, log = log_stdout() pw = PrerequisiteWorkflow(article_id, url_open=url_open, log=log, - config_dict=config_dict) + config_dict=config_dict, + metadata_only=metadata_only) # Perform prerequisite workflow if dataset is entirely new if pw.new_set: @@ -161,8 +165,7 @@ def workflow(article_id, url_open=False, browser=True, log=None, pw.reserve_doi() # Retrieve data and place in 1.ToDo curation folder - if not metadata_only: - pw.download_data() + pw.download_data() # Download curation report pw.download_report() From e74d1dfc745517eb23a8b661c72bb6c8ff02f127 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 14:05:53 -0700 Subject: [PATCH 04/26] Add metadata_only option in retrieve.download_files() --- ldcoolp/curation/retrieve.py | 80 +++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/ldcoolp/curation/retrieve.py b/ldcoolp/curation/retrieve.py index b51bcc2..52f8495 100644 --- a/ldcoolp/curation/retrieve.py +++ b/ldcoolp/curation/retrieve.py @@ -63,7 +63,8 @@ def private_file_retrieve(url, filename=None, token=None, url_open=False, def download_files(article_id, fs, root_directory=None, data_directory=None, - metadata_directory=None, log=None, url_open=False): + metadata_directory=None, log=None, url_open=False, + metadata_only=False): """ Purpose: Retrieve data for a Figshare deposit following data curation workflow @@ -75,6 +76,8 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, :param metadata_directory: Relative folder path for primary location of metadata (str) :param log: logger.LogClass object. Default is stdout via python logging :param url_open: bool indicates using urlopen over urlretrieve. Default: False + :param metadata_only: bool indicates whether to retrieve metadata. Default: True + If set, no files are downloaded """ if isinstance(log, type(None)): @@ -107,49 +110,52 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, metadata_directory=metadata_directory, save_csv=True, log=log) - for n, file_dict in zip(range(n_files), file_list): - log.info(f"Retrieving {n+1} of {n_files} : {file_dict['name']} ({file_dict['size']})") - log.info(f"URL: {file_dict['download_url']}") - filename = os.path.join(dir_path, file_dict['name']) - retrieve_cnt = 0 - checksum_flag = False - if not exists(filename): - while retrieve_cnt < N_TRIES_MD5: - log.info(f"Retrieval attempt #{retrieve_cnt + 1}") - try: - private_file_retrieve(file_dict['download_url'], - filename=filename, token=fs.token, - url_open=url_open, log=log) - log.info("Download successful!") - retrieve_cnt += 1 - except HTTPError: - log.info(f"URL might be public: {file_dict['download_url']}") - log.info("Attempting retrieval without token") + if metadata_only: + for n, file_dict in zip(range(n_files), file_list): + log.info(f"Retrieving {n+1} of {n_files} : {file_dict['name']} ({file_dict['size']})") + log.info(f"URL: {file_dict['download_url']}") + filename = os.path.join(dir_path, file_dict['name']) + retrieve_cnt = 0 + checksum_flag = False + if not exists(filename): + while retrieve_cnt < N_TRIES_MD5: + log.info(f"Retrieval attempt #{retrieve_cnt + 1}") try: private_file_retrieve(file_dict['download_url'], - filename=filename, + filename=filename, token=fs.token, url_open=url_open, log=log) log.info("Download successful!") + retrieve_cnt += 1 except HTTPError: - log.warning(f"Failed to retrieve: {filename}") - retrieve_cnt += 1 - - # Perform checksum - if exists(filename): - if not file_dict['is_link_only']: - checksum_flag = check_md5(filename, - file_dict['supplied_md5']) - if checksum_flag: + log.info(f"URL might be public: {file_dict['download_url']}") + log.info("Attempting retrieval without token") + try: + private_file_retrieve(file_dict['download_url'], + filename=filename, + url_open=url_open, log=log) + log.info("Download successful!") + except HTTPError: + log.warning(f"Failed to retrieve: {filename}") + retrieve_cnt += 1 + + # Perform checksum + if exists(filename): + if not file_dict['is_link_only']: + checksum_flag = check_md5(filename, + file_dict['supplied_md5']) + if checksum_flag: + break + else: + log.info("Not performing checksum on linked-only record") break - else: - log.info("Not performing checksum on linked-only record") - break + else: + if not checksum_flag: + log.warning("File retrieval unsuccessful! " + f"Aborted after {N_TRIES_MD5} tries") else: - if not checksum_flag: - log.warning("File retrieval unsuccessful! " - f"Aborted after {N_TRIES_MD5} tries") - else: - log.info("File exists! Not overwriting!") + log.info("File exists! Not overwriting!") + else: + log.info(f"No file retrieval: metadata_only={metadata_only}") # Change permissions on folders and files # permissions.curation(dir_path) From 71585e30f5b591f344f2e507a68004efcee1c71f Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 14:12:57 -0700 Subject: [PATCH 05/26] PrerequisiteWorkflow: Pass metadata_only to download_files --- ldcoolp/curation/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ldcoolp/curation/main.py b/ldcoolp/curation/main.py index d0fa56f..c66d99a 100644 --- a/ldcoolp/curation/main.py +++ b/ldcoolp/curation/main.py @@ -120,7 +120,8 @@ def download_data(self): root_directory=self.root_directory, data_directory=self.data_directory, metadata_directory=self.metadata_directory, - log=self.log, url_open=self.url_open) + log=self.log, url_open=self.url_open, + metadata_only=self.metadata_only) def download_report(self): if self.new_set: From b4e23645dd2442e68dbe45cbe5b377ed9844705e Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 15:24:42 -0700 Subject: [PATCH 06/26] Reverse if logic --- ldcoolp/curation/retrieve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ldcoolp/curation/retrieve.py b/ldcoolp/curation/retrieve.py index 52f8495..c85a835 100644 --- a/ldcoolp/curation/retrieve.py +++ b/ldcoolp/curation/retrieve.py @@ -110,7 +110,7 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, metadata_directory=metadata_directory, save_csv=True, log=log) - if metadata_only: + if not metadata_only: for n, file_dict in zip(range(n_files), file_list): log.info(f"Retrieving {n+1} of {n_files} : {file_dict['name']} ({file_dict['size']})") log.info(f"URL: {file_dict['download_url']}") From 4f08ebd89156c410ebc90ab946b89c2761c51fae Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 15:30:02 -0700 Subject: [PATCH 07/26] Adjust log messages for workflow --- ldcoolp/curation/retrieve.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ldcoolp/curation/retrieve.py b/ldcoolp/curation/retrieve.py index c85a835..a6c7d2c 100644 --- a/ldcoolp/curation/retrieve.py +++ b/ldcoolp/curation/retrieve.py @@ -84,7 +84,10 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, log = log_stdout() log.info("") - log.info("** DOWNLOADING DATA **") + if metadata_only: + log.info(f"** NO FILE RETRIEVAL: metadata_only={metadata_only} **") + else: + log.info("** DOWNLOADING DATA **") if root_directory is None: root_directory = os.getcwd() @@ -154,8 +157,6 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, f"Aborted after {N_TRIES_MD5} tries") else: log.info("File exists! Not overwriting!") - else: - log.info(f"No file retrieval: metadata_only={metadata_only}") # Change permissions on folders and files # permissions.curation(dir_path) From 9493903f5cd267364b730ffafae38c8f0e440228 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 15:31:01 -0700 Subject: [PATCH 08/26] Minor PEP8 for log messages [ci skip] --- ldcoolp/curation/retrieve.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ldcoolp/curation/retrieve.py b/ldcoolp/curation/retrieve.py index a6c7d2c..85a604e 100644 --- a/ldcoolp/curation/retrieve.py +++ b/ldcoolp/curation/retrieve.py @@ -115,7 +115,8 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, if not metadata_only: for n, file_dict in zip(range(n_files), file_list): - log.info(f"Retrieving {n+1} of {n_files} : {file_dict['name']} ({file_dict['size']})") + log.info(f"Retrieving {n+1} of {n_files} : " + f"{file_dict['name']} ({file_dict['size']})") log.info(f"URL: {file_dict['download_url']}") filename = os.path.join(dir_path, file_dict['name']) retrieve_cnt = 0 @@ -130,7 +131,8 @@ def download_files(article_id, fs, root_directory=None, data_directory=None, log.info("Download successful!") retrieve_cnt += 1 except HTTPError: - log.info(f"URL might be public: {file_dict['download_url']}") + log.info(f"URL might be public: " + f"{file_dict['download_url']}") log.info("Attempting retrieval without token") try: private_file_retrieve(file_dict['download_url'], From acda5f027e61b85b35094a02948e0b49e1f44ee4 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 22 Jun 2021 15:42:38 -0700 Subject: [PATCH 09/26] Apply suggestions from code review Definitely require from testing --- ldcoolp/scripts/prereq_script | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ldcoolp/scripts/prereq_script b/ldcoolp/scripts/prereq_script index 0a8c464..c665a07 100755 --- a/ldcoolp/scripts/prereq_script +++ b/ldcoolp/scripts/prereq_script @@ -31,7 +31,7 @@ if __name__ == '__main__': parser.add_argument('--article_id', required=True, help='Figshare article ID') parser.add_argument('--url_open', action='store_true', help='Whether to use urlopen') parser.add_argument('--browser', action='store_true', help='Whether to use urlopen') - parser.add_argument('--metadata_only', action='store_false', help='Do not retrieve data, only metadata') + parser.add_argument('--metadata_only', action='store_true', help='Do not retrieve data, only metadata') # parser.add_argument('--api_token', required=True, help='Figshare API token') args = parser.parse_args() From 46065e8752dbaa52c33e8eb5f1ec50cbabd57969 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 10:38:52 -0700 Subject: [PATCH 10/26] Minor adjustment for PEP8 [ci skip] --- ldcoolp/scripts/prereq_script | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ldcoolp/scripts/prereq_script b/ldcoolp/scripts/prereq_script index c665a07..f731928 100755 --- a/ldcoolp/scripts/prereq_script +++ b/ldcoolp/scripts/prereq_script @@ -95,8 +95,9 @@ if __name__ == '__main__': log.info(f"Retrieving: {articles[ii]} ...") # ... {ii+1} / {num_articles}") # Run pre-req steps - main.workflow(articles[ii], url_open=args.url_open, browser=args.browser, - log=log, config_dict=config_dict, metadata_only=args.metadata_only) + main.workflow(articles[ii], url_open=args.url_open, + browser=args.browser, log=log, config_dict=config_dict, + metadata_only=args.metadata_only) count += 1 log.info(f"Completed: {articles[ii]} ...") From 90387bf931d8e13682c96a27581ba4177469ee64 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 12:28:22 -0700 Subject: [PATCH 11/26] Switch Qualtrics to use full config_dict --- ldcoolp/curation/api/qualtrics.py | 13 ++++--------- ldcoolp/curation/inspection/readme/__init__.py | 2 +- ldcoolp/curation/main.py | 2 +- ldcoolp/scripts/generate_qualtrics_links | 3 +-- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py index f0c8b36..38d1a23 100644 --- a/ldcoolp/curation/api/qualtrics.py +++ b/ldcoolp/curation/api/qualtrics.py @@ -52,13 +52,7 @@ class Qualtrics: A Python interface for interaction with Qualtrics API for Deposit Agreement form survey - :param qualtrics_dict: Dict that contains Qualtrics configuration. - This should include: - - survey_id - - token - - datacenter - - download_url - - generate_url + :param config_dict: Dict that contains LD-Cool-P configuration. Default: config_default_dict from config/default.ini @@ -110,12 +104,13 @@ class Qualtrics: Generate URL with customized query strings based on Figshare metadata """ - def __init__(self, qualtrics_dict=config_default_dict['qualtrics'], log=None, + def __init__(self, config_dict=config_default_dict, log=None, interactive=True): self.interactive = interactive - self.dict = qualtrics_dict + self.curation_dict = config_dict['curation'] + self.dict = config_dict['qualtrics'] self.token = self.dict['token'] self.data_center = self.dict['datacenter'] diff --git a/ldcoolp/curation/inspection/readme/__init__.py b/ldcoolp/curation/inspection/readme/__init__.py index 680adc2..c7a6d5d 100644 --- a/ldcoolp/curation/inspection/readme/__init__.py +++ b/ldcoolp/curation/inspection/readme/__init__.py @@ -123,7 +123,7 @@ def __init__(self, dn, config_dict=config_default_dict, update=False, if q: self.q = q else: - self.q = Qualtrics(qualtrics_dict=self.config_dict['qualtrics'], + self.q = Qualtrics(config_dict=self.config_dict, interactive=interactive, log=self.log) self.curation_dict = self.config_dict['curation'] diff --git a/ldcoolp/curation/main.py b/ldcoolp/curation/main.py index 0154a82..a19600f 100644 --- a/ldcoolp/curation/main.py +++ b/ldcoolp/curation/main.py @@ -174,7 +174,7 @@ def workflow(article_id, url_open=False, browser=True, log=None, curation_dict['folder_ual_rdm'], ) log.debug(f"out_path: {out_path}") - q = Qualtrics(qualtrics_dict=config_dict['qualtrics'], log=log) + q = Qualtrics(config_dict=config_dict, log=log) q.retrieve_deposit_agreement(pw.dn.name_dict, out_path=out_path, browser=browser) diff --git a/ldcoolp/scripts/generate_qualtrics_links b/ldcoolp/scripts/generate_qualtrics_links index 1dce0d2..2345943 100755 --- a/ldcoolp/scripts/generate_qualtrics_links +++ b/ldcoolp/scripts/generate_qualtrics_links @@ -79,8 +79,7 @@ if __name__ == '__main__': fs_dict = config_dict['figshare'] fs_admin = FigshareInstituteAdmin(**fs_dict, log=log) - q_dict = config_dict['qualtrics'] - q = qualtrics.Qualtrics(qualtrics_dict=q_dict, log=log) + q = qualtrics.Qualtrics(config_dict=config_dict, log=log) dn = depositor_name.DepositorName(args.article_id, fs_admin, verbose=False) From 7ff05070a7004ac6e9ffde8dff55d0c07843b5c1 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 12:59:42 -0700 Subject: [PATCH 12/26] metadata.save_metadata: Add metadata_source input --- ldcoolp/curation/metadata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ldcoolp/curation/metadata.py b/ldcoolp/curation/metadata.py index d596e0c..edf3ebe 100644 --- a/ldcoolp/curation/metadata.py +++ b/ldcoolp/curation/metadata.py @@ -10,6 +10,7 @@ def save_metadata(json_response: Union[list, dict], out_file_prefix: str, + metadata_source: str = 'CURATION', root_directory: str = '', metadata_directory: str = '', save_csv: bool = False, @@ -23,6 +24,7 @@ def save_metadata(json_response: Union[list, dict], :param root_directory: Full path containing the working directory :param metadata_directory: Metadata path :param save_csv: Save a CSV file. Default: False + :param metadata_source: Source of metadata, :param log: LogClass or logging object. Default: log_stdout() """ @@ -31,7 +33,7 @@ def save_metadata(json_response: Union[list, dict], log.debug("starting ...") log.info("") - log.info("** SAVING CURATION METADATA **") + log.info(f"** SAVING {metadata_source} METADATA **") if not root_directory: root_directory = os.getcwd() From 4e7b13e5bfe6f168c864cb09d0e26178ec08a665 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 13:05:27 -0700 Subject: [PATCH 13/26] Pass in full DepositorName object instead of name_dict instance var --- ldcoolp/curation/api/qualtrics.py | 25 ++++++++++++++----------- ldcoolp/curation/main.py | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py index 38d1a23..40fbbf5 100644 --- a/ldcoolp/curation/api/qualtrics.py +++ b/ldcoolp/curation/api/qualtrics.py @@ -33,6 +33,7 @@ from figshare.figshare import issue_request # Read in default configuration settings +from ..depositor_name import DepositorName from ...config import config_default_dict # for quote and urlencode @@ -89,16 +90,16 @@ class Qualtrics: Retrieve pandas DataFrame containing responses for a survey See: https://api.qualtrics.com/docs/getting-survey-responses-via-the-new-export-apis - find_deposit_agreement(dn_dict) + find_deposit_agreement(dn) Call get_survey_responses() and identify response that matches based on depositor name (implemented) and deposit title (to be implemented). Returns ResponseID if a unique match is available - retrieve_deposit_agreement(dn_dict=, ResponseId=, browser=True) + retrieve_deposit_agreement(dn=, ResponseId=, browser=True) Opens up web browser to an HTML page containing the deposit agreement. It will call find_deposit_agreement() with DepositorName dict if ResponseId is not provided. Otherwise, it will use the provided - ResponseId. Note that either dn_dict or ResponseId must be provided + ResponseId. Note that either dn or ResponseId must be provided generate_url(dn_dict) Generate URL with customized query strings based on Figshare metadata @@ -244,11 +245,13 @@ def lookup_survey_shortname(self, lookup_survey_id): except KeyError: self.log.warn("survey_id not found among list") - def find_deposit_agreement(self, dn_dict): + def find_deposit_agreement(self, dn: DepositorName): """Get Response ID based on a match search for depositor name""" merged_df = self.merge_survey() + dn_dict = dn.name_dict + # First perform search via article_id or curation_id self.log.info("Attempting to identify using article_id or curation_id ...") article_id = str(dn_dict['article_id']) @@ -308,7 +311,7 @@ def find_deposit_agreement(self, dn_dict): raise ValueError - def retrieve_deposit_agreement(self, dn_dict=None, ResponseId=None, out_path='', + def retrieve_deposit_agreement(self, dn=None, ResponseId=None, out_path='', browser=True): """Opens web browser to navigate to a page with Deposit Agreement Form""" @@ -317,7 +320,7 @@ def retrieve_deposit_agreement(self, dn_dict=None, ResponseId=None, out_path='', if isinstance(ResponseId, type(None)): try: - ResponseId, SurveyId = self.find_deposit_agreement(dn_dict) + ResponseId, SurveyId = self.find_deposit_agreement(dn) self.log.info(f"Qualtrics ResponseID : {ResponseId}") self.log.info(f"Qualtrics SurveyID : {SurveyId}") except ValueError: @@ -335,7 +338,7 @@ def retrieve_deposit_agreement(self, dn_dict=None, ResponseId=None, out_path='', SurveyId = '' if ResponseId == '' or SurveyId == '': - custom_url = self.generate_url(dn_dict) + custom_url = self.generate_url(dn.name_dict) self.log.info("CUSTOM URL BELOW : ") self.log.info(custom_url) ResponseId = None @@ -500,9 +503,10 @@ def generate_readme_url(self, dn): return full_url - def find_qualtrics_readme(self, dn_dict): + def find_qualtrics_readme(self, dn: DepositorName): """Get Response ID based on a article_id,curation_id search""" + dn_dict = dn.name_dict qualtrics_df = self.get_survey_responses(self.readme_survey_id) # First perform search via article_id or curation_id @@ -535,13 +539,12 @@ def find_qualtrics_readme(self, dn_dict): def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): """Retrieve response to Qualtrics README form""" - dn_dict = dn.name_dict if ResponseId: response_df = self.get_survey_response(self.readme_survey_id, ResponseId) else: try: - ResponseId, response_df = self.find_qualtrics_readme(dn_dict) + ResponseId, response_df = self.find_qualtrics_readme(dn) self.log.info(f"Qualtrics README ResponseID : {ResponseId}") except ValueError: self.log.warn("Error with retrieving ResponseId") @@ -589,7 +592,7 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): self.log.info("Appending Deposit Agreement's Corresponding Author metadata") if not self.da_response_id: self.log.info("NO METADATA - Retrieving Deposit Agreement metadata") - self.find_deposit_agreement(dn_dict) + self.find_deposit_agreement(dn) else: self.log.info(f"Parsed ResponseId : {self.da_response_id}") self.log.info(f"Parsed SurveyID : {self.da_survey_id}") diff --git a/ldcoolp/curation/main.py b/ldcoolp/curation/main.py index a19600f..fcf993a 100644 --- a/ldcoolp/curation/main.py +++ b/ldcoolp/curation/main.py @@ -175,7 +175,7 @@ def workflow(article_id, url_open=False, browser=True, log=None, ) log.debug(f"out_path: {out_path}") q = Qualtrics(config_dict=config_dict, log=log) - q.retrieve_deposit_agreement(pw.dn.name_dict, out_path=out_path, + q.retrieve_deposit_agreement(pw.dn, out_path=out_path, browser=browser) # Check for README file and create one if it does not exist From fbd80842bf83310d00b6b9fc3d9d44292177816a Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 13:14:17 -0700 Subject: [PATCH 14/26] Add save_metadata method to Qualtrics --- ldcoolp/curation/api/qualtrics.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py index 40fbbf5..24795e2 100644 --- a/ldcoolp/curation/api/qualtrics.py +++ b/ldcoolp/curation/api/qualtrics.py @@ -24,6 +24,7 @@ # Convert single-entry DataFrame to dictionary from ldcoolp.curation import df_to_dict_single +from ldcoolp.curation import metadata # Logging from redata.commons.logger import log_stdout @@ -604,3 +605,20 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): qualtrics_dict['corr_author_affil'] = DA_dict['Q6_3'] return qualtrics_dict + + def save_metadata(self, response_dict: dict, dn: DepositorName, + out_file_prefix: str = 'qualtrics'): + """Save Qualtrics metadata to JSON file""" + + root_directory = join( + self.curation_dict[self.curation_dict['parent_dir']], + self.curation_dict['folder_todo'], + dn.folderName + ) + metadata_directory = self.curation_dict['folder_metadata'] + + metadata.save_metadata(response_dict, out_file_prefix, + metadata_source='QUALTRICS', + root_directory=root_directory, + metadata_directory=metadata_directory, + log=self.log) From 1e350a61cc073f57048dd448f3c7b58a669c5ec6 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 13:34:49 -0700 Subject: [PATCH 15/26] Call save_metadata method for Deposit Agreement, README form - Note: DA will not work if multiple responses are provided --- ldcoolp/curation/api/qualtrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py index 24795e2..ff5e8d4 100644 --- a/ldcoolp/curation/api/qualtrics.py +++ b/ldcoolp/curation/api/qualtrics.py @@ -295,6 +295,8 @@ def find_deposit_agreement(self, dn: DepositorName): else: if response_df.shape[0] == 1: response_dict = df_to_dict_single(response_df) + self.save_metadata(response_dict, dn, out_file_prefix= + f'deposit_agreement_original_{article_id}') self.pandas_write_buffer(response_df[cols_order]) self.log.info("Only one entry found!") self.log.info(f"Survey completed on {response_dict['Date Completed']}") @@ -573,6 +575,8 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): qualtrics_dict['references'] = [] else: qualtrics_dict = df_to_dict_single(response_df[readme_custom_content]) + self.save_metadata(qualtrics_dict, dn, out_file_prefix= + f"readme_original_{dn.name_dict['article_id']}") for key in qualtrics_dict.keys(): if isinstance(qualtrics_dict[key], float): qualtrics_dict[key] = str(qualtrics_dict[key]) From 0f7a102fa9a90d3327032adfde4b516df86d32ac Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 23 Jun 2021 13:43:43 -0700 Subject: [PATCH 16/26] Define write_json function - Add optional overwrite option in metadata.save_metadata --- ldcoolp/curation/metadata.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/ldcoolp/curation/metadata.py b/ldcoolp/curation/metadata.py index edf3ebe..77a084f 100644 --- a/ldcoolp/curation/metadata.py +++ b/ldcoolp/curation/metadata.py @@ -14,6 +14,7 @@ def save_metadata(json_response: Union[list, dict], root_directory: str = '', metadata_directory: str = '', save_csv: bool = False, + overwrite: bool = False, log=None): """ @@ -21,10 +22,11 @@ def save_metadata(json_response: Union[list, dict], :param json_response: Content in list or dict :param out_file_prefix: Filename prefix. Appends .json and .csv + :param metadata_source: Source of metadata, :param root_directory: Full path containing the working directory :param metadata_directory: Metadata path :param save_csv: Save a CSV file. Default: False - :param metadata_source: Source of metadata, + :param overwrite: Overwrite file if it exists. Default: False :param log: LogClass or logging object. Default: log_stdout() """ @@ -45,17 +47,30 @@ def save_metadata(json_response: Union[list, dict], # Write JSON file json_out_file = f"{out_file_prefix}.json" if not os.path.exists(json_out_file): - log.info(f"Writing: {json_out_file}") - with open(json_out_file, 'w') as f: - json.dump(json_response, f, indent=4) + write_json(json_out_file, json_response, log) else: - log.info(f"File exists: {out_file_prefix}") + log.info(f"File exists: {json_out_file}") + if overwrite: + log.info("Overwriting!") + write_json(json_out_file, json_response, log) # Write CSV file if save_csv: - csv_out_file = f"{out_file_prefix}.csv" df = pd.DataFrame.from_dict(json_response, orient='columns') - log.info(f"Writing: {csv_out_file}") - df.to_csv(csv_out_file, index=False) + csv_out_file = f"{out_file_prefix}.csv" + if not os.path.exists(csv_out_file): + log.info(f"Writing: {csv_out_file}") + df.to_csv(csv_out_file, index=False) + else: + log.info(f"File exists: {csv_out_file}") + if overwrite: + log.info("Overwriting!") + df.to_csv(csv_out_file, index=False) log.debug("finished.") + + +def write_json(json_out_file, json_response, log): + log.info(f"Writing: {json_out_file}") + with open(json_out_file, 'w') as f: + json.dump(json_response, f, indent=4) From 1d26aaa904ecdaf79ceea2a296d575badee095b4 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Thu, 24 Jun 2021 13:20:14 -0700 Subject: [PATCH 17/26] Call Qualtrics.save_metadata method when README.txt are updated --- ldcoolp/curation/inspection/readme/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ldcoolp/curation/inspection/readme/__init__.py b/ldcoolp/curation/inspection/readme/__init__.py index c7a6d5d..1078c6e 100644 --- a/ldcoolp/curation/inspection/readme/__init__.py +++ b/ldcoolp/curation/inspection/readme/__init__.py @@ -427,6 +427,13 @@ def update(self): f = open(self.readme_file_path, 'w') f.writelines(content_list) f.close() + + # Saving Qualtrics README for metadata for updated README.txt + cur_time = datetime.now() + out_file_prefix = f"readme_revised_{self.article_id}_" + \ + f"{cur_time.isoformat(timespec='seconds').replace(':', '')}" + self.q.save_metadata(self.qualtrics_readme_dict, self.dn, + out_file_prefix=out_file_prefix) else: self.log.info("README.txt does not exist. Creating new one") From bf2b49e83a2f07b279e25c02348d0b397acdffba Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Thu, 24 Jun 2021 13:24:26 -0700 Subject: [PATCH 18/26] retrieve_qualtrics_readme: Move save_metadata call - Get full Qualtrics metadata for README construction --- ldcoolp/curation/api/qualtrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py index ff5e8d4..e03c86b 100644 --- a/ldcoolp/curation/api/qualtrics.py +++ b/ldcoolp/curation/api/qualtrics.py @@ -575,8 +575,6 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): qualtrics_dict['references'] = [] else: qualtrics_dict = df_to_dict_single(response_df[readme_custom_content]) - self.save_metadata(qualtrics_dict, dn, out_file_prefix= - f"readme_original_{dn.name_dict['article_id']}") for key in qualtrics_dict.keys(): if isinstance(qualtrics_dict[key], float): qualtrics_dict[key] = str(qualtrics_dict[key]) @@ -608,6 +606,10 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): qualtrics_dict['corr_author_email'] = DA_dict['Q6_2'] qualtrics_dict['corr_author_affil'] = DA_dict['Q6_3'] + # Save Qualtrics README metadata + self.save_metadata(qualtrics_dict, dn, out_file_prefix= + f"readme_original_{dn.name_dict['article_id']}") + return qualtrics_dict def save_metadata(self, response_dict: dict, dn: DepositorName, From b32476e54f9726303929f7865b3ba56da9e6603b Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Thu, 24 Jun 2021 14:06:01 -0700 Subject: [PATCH 19/26] retrieve_qualtrics_readme: Add save_metadata bool flag - README metadata should be done in the ReadmeClass --- ldcoolp/curation/api/qualtrics.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py index e03c86b..d9b11d7 100644 --- a/ldcoolp/curation/api/qualtrics.py +++ b/ldcoolp/curation/api/qualtrics.py @@ -540,7 +540,8 @@ def find_qualtrics_readme(self, dn: DepositorName): self.log.warn("Multiple entries found") raise ValueError - def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): + def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True, + save_metadata: bool = False): """Retrieve response to Qualtrics README form""" if ResponseId: @@ -607,8 +608,11 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True): qualtrics_dict['corr_author_affil'] = DA_dict['Q6_3'] # Save Qualtrics README metadata - self.save_metadata(qualtrics_dict, dn, out_file_prefix= - f"readme_original_{dn.name_dict['article_id']}") + if save_metadata: + out_file_prefix = "qualtrics_readme_original_" + \ + f"{dn.name_dict['article_id']}" + self.save_metadata(qualtrics_dict, dn, + out_file_prefix=out_file_prefix) return qualtrics_dict From 656a64d2aabc74b78faa5ac3170e0244ff01dcf5 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Thu, 24 Jun 2021 14:07:26 -0700 Subject: [PATCH 20/26] ReadmeClass: Add save_metadata method - Call in construct - Call in update --- .../curation/inspection/readme/__init__.py | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/ldcoolp/curation/inspection/readme/__init__.py b/ldcoolp/curation/inspection/readme/__init__.py index 1078c6e..b4e1910 100644 --- a/ldcoolp/curation/inspection/readme/__init__.py +++ b/ldcoolp/curation/inspection/readme/__init__.py @@ -12,12 +12,14 @@ # Logging from redata.commons.logger import log_stdout +from ... import metadata from ....admin import permissions, move # Read in default configuration settings from ....config import config_default_dict from ...api.qualtrics import Qualtrics +from ...depositor_name import DepositorName class ReadmeClass: @@ -394,6 +396,9 @@ def construct(self): qualtrics_dict=self.qualtrics_readme_dict) f.writelines(content_list) f.close() + + out_file_prefix = f"readme_original_{self.article_id}" + self.save_metadata(out_file_prefix=out_file_prefix) else: self.log.warn("Default README.txt file found! Not overwriting with template!") @@ -432,8 +437,7 @@ def update(self): cur_time = datetime.now() out_file_prefix = f"readme_revised_{self.article_id}_" + \ f"{cur_time.isoformat(timespec='seconds').replace(':', '')}" - self.q.save_metadata(self.qualtrics_readme_dict, self.dn, - out_file_prefix=out_file_prefix) + self.save_metadata(out_file_prefix=out_file_prefix) else: self.log.info("README.txt does not exist. Creating new one") @@ -454,6 +458,27 @@ def main(self): else: raise SystemExit("SKIPPING README.txt CONSTRUCTION") + def save_metadata(self, out_file_prefix: str = 'readme'): + """Save README metadata to JSON file""" + + response_dict = { + 'figshare': self.figshare_readme_dict, + 'qualtrics': self.qualtrics_readme_dict, + } + + root_directory = join( + self.curation_dict[self.curation_dict['parent_dir']], + self.curation_dict['folder_todo'], + self.dn.folderName + ) + metadata_directory = self.curation_dict['folder_metadata'] + + metadata.save_metadata(response_dict, out_file_prefix, + metadata_source='QUALTRICS', + root_directory=root_directory, + metadata_directory=metadata_directory, + log=self.log) + def walkthrough(data_path, ignore='', log=None): """ From 6248356e1b6eeeb3965bf73038d6bdf8ac2cb424 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Thu, 24 Jun 2021 14:08:38 -0700 Subject: [PATCH 21/26] Type hinting for dn input [ci skip] --- ldcoolp/curation/inspection/readme/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ldcoolp/curation/inspection/readme/__init__.py b/ldcoolp/curation/inspection/readme/__init__.py index b4e1910..d37405b 100644 --- a/ldcoolp/curation/inspection/readme/__init__.py +++ b/ldcoolp/curation/inspection/readme/__init__.py @@ -87,8 +87,9 @@ class ReadmeClass: Construct README.txt by calling retrieve """ - def __init__(self, dn, config_dict=config_default_dict, update=False, - q: Qualtrics = None, interactive=True, log=None): + def __init__(self, dn: DepositorName, config_dict=config_default_dict, + update=False, q: Qualtrics = None, interactive=True, + log=None): self.config_dict = config_dict self.interactive = interactive From a08608eeac620b24de29f1907f0f630255e9f75d Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Thu, 24 Jun 2021 14:31:12 -0700 Subject: [PATCH 22/26] Change metadata_source input --- ldcoolp/curation/inspection/readme/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ldcoolp/curation/inspection/readme/__init__.py b/ldcoolp/curation/inspection/readme/__init__.py index d37405b..c2aa990 100644 --- a/ldcoolp/curation/inspection/readme/__init__.py +++ b/ldcoolp/curation/inspection/readme/__init__.py @@ -475,7 +475,7 @@ def save_metadata(self, out_file_prefix: str = 'readme'): metadata_directory = self.curation_dict['folder_metadata'] metadata.save_metadata(response_dict, out_file_prefix, - metadata_source='QUALTRICS', + metadata_source='README', root_directory=root_directory, metadata_directory=metadata_directory, log=self.log) From 09e1231492395cc09916fc1dc2a34dc453e2c6f7 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Fri, 25 Jun 2021 08:55:45 -0700 Subject: [PATCH 23/26] Update CHANGELOG.md for unreleased changes #160 [ci skip] --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec9f049..5fefab7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [UNRELEASED](https://github.com/UAL-ODIS/LD-Cool-P/tree/HEAD) (YYYY-MM-DD) + +**Implemented enhancements:** + - Enhancement: Dump JSON metadata from Qualtrics API + [#226](https://github.com/UAL-ODIS/LD-Cool-P/pull/226) + +**Closed issues:** + - Enhancement: Dump JSON metadata from Qualtrics API + [#160](https://github.com/UAL-ODIS/LD-Cool-P/issues/160) + + ## [v1.1.1](https://github.com/UAL-ODIS/LD-Cool-P/tree/v1.1.1) (2021-06-10) **Fixed bugs:** From 8ab776aba807dd174ce003c1504ab449be2363c8 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 28 Jun 2021 12:40:04 -0700 Subject: [PATCH 24/26] Update CHANGELOG.md for unreleased changes [ci skip] --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec9f049..017729d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [UNRELEASED](https://github.com/UAL-ODIS/LD-Cool-P/tree/HEAD) (YYYY-MM-DD) + +**Merged pull requests:** + - Allow for downloading only the metadata + [#223](https://github.com/UAL-ODIS/LD-Cool-P/pull/223) + + ## [v1.1.1](https://github.com/UAL-ODIS/LD-Cool-P/tree/v1.1.1) (2021-06-10) **Fixed bugs:** From 71b22f7ff76fa852fb8e0df3dbd4e5ebf26e0b01 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 28 Jun 2021 12:48:04 -0700 Subject: [PATCH 25/26] Minor edit to CHANGELOG.md [ci skip] --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2de504c..1234447 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,10 @@ ## [v1.1.2](https://github.com/UAL-ODIS/LD-Cool-P/tree/v1.1.2) (2021-06-28) **Implemented enhancements:** - - Enhancement: Dump JSON metadata from Qualtrics API - [#226](https://github.com/UAL-ODIS/LD-Cool-P/pull/226) - Allow for downloading only the metadata [#223](https://github.com/UAL-ODIS/LD-Cool-P/pull/223) + - Enhancement: Dump JSON metadata from Qualtrics API + [#226](https://github.com/UAL-ODIS/LD-Cool-P/pull/226) **Closed issues:** - Enhancement: Dump JSON metadata from Qualtrics API From 9217ea021583fffe2cc51d3054e5ddeef297d38d Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 28 Jun 2021 12:49:40 -0700 Subject: [PATCH 26/26] Bump version: v1.1.1 -> v1.1.2 --- README.md | 2 +- ldcoolp/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 42e241b..a8f39d3 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ You can confirm installation via `conda list` (curation) $ conda list ldcoolp ``` -You should see that the version is `1.1.1`. +You should see that the version is `1.1.2`. ### Configuration Settings diff --git a/ldcoolp/__init__.py b/ldcoolp/__init__.py index 31f8f15..df6d6da 100644 --- a/ldcoolp/__init__.py +++ b/ldcoolp/__init__.py @@ -1,6 +1,6 @@ from os import path -__version__ = "1.1.1" +__version__ = "1.1.2" CODE_NAME = "LD-Cool-P" diff --git a/setup.py b/setup.py index ff3ddd7..906b4df 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='ldcoolp', - version='1.1.1', + version='1.1.2', packages=['ldcoolp'], url='https://github.com/UAL-ODIS/LD-Cool-P', license='MIT License',