From 809bca79ca35a91308ff3f0c6fa86ed905c6c36f Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 9 Dec 2024 12:43:03 -0500 Subject: [PATCH 1/8] work on partial processing and lint --- bedboss/bbuploader/main.py | 8 ++++- bedboss/bedboss.py | 31 +++++++++++-------- bedboss/refgenome_validator/main.py | 24 ++++++++------ bedboss/skipper.py | 1 - requirements/requirements-all.txt | 2 +- scripts/bb_text_search/main.py | 1 - scripts/bb_text_search/search_test.py | 1 - scripts/bbuploader/main.py | 10 +++++- .../ref_genome_validating/grab_chrom_sizes.py | 2 -- .../process_exclude_ranges.py | 1 - .../stats_compat_testing.py | 1 - 11 files changed, 49 insertions(+), 33 deletions(-) diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 9d2db3f..be7f84d 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -283,6 +283,7 @@ def upload_gse( reinit_skipper=False, overwrite=False, overwrite_bedset=False, + light=False, ): """ Upload bed files from GEO series to BedBase @@ -302,10 +303,11 @@ def upload_gse( :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned :param overwrite: overwrite existing bedfiles :param overwrite_bedset: overwrite existing bedset + :param light: light mode, where skipping statistic processing for memory optimization and time saving :return: None """ - bbagent = BedBaseAgent(config=bedbase_config) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not light) with Session(bbagent.config.db_engine.engine) as session: _LOGGER.info(f"Processing: '{gse}'") @@ -352,6 +354,7 @@ def upload_gse( overwrite_bedset=overwrite_bedset, use_skipper=use_skipper, reinit_skipper=reinit_skipper, + light=light, ) except Exception as e: _LOGGER.error(f"Processing of '{gse}' failed with error: {e}") @@ -403,6 +406,7 @@ def _upload_gse( use_skipper: bool = True, reinit_skipper: bool = False, preload: bool = True, + light=False, ) -> ProjectProcessingStatus: """ Upload bed files from GEO series to BedBase @@ -421,6 +425,7 @@ def _upload_gse( and failed files. :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs will be :param preload: pre - download files to the local folder (used for faster reproducibility) + :param light: light mode, where skipping statistic processing for memory optimization and time saving :return: None """ if isinstance(bedbase_config, str): @@ -540,6 +545,7 @@ def _upload_gse( upload_s3=True, upload_qdrant=True, force_overwrite=overwrite, + light=light, ) uploaded_files.append(file_digest) if skipper_obj: diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 0f7387b..9945487 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -68,6 +68,7 @@ def run_all( upload_qdrant: bool = False, upload_s3: bool = False, upload_pephub: bool = False, + light: bool = False, # Universes universe: bool = False, universe_method: str = None, @@ -99,6 +100,7 @@ def run_all( :param bool upload_qdrant: whether to skip qdrant indexing :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) + :param bool light: whether to run light version of the pipeline :param bool universe: whether to add the sample as the universe [Default: False] :param str universe_method: method used to create the universe [Default: None] @@ -145,17 +147,20 @@ def run_all( if not other_metadata: other_metadata = {"sample_name": name} - statistics_dict = bedstat( - bedfile=bed_metadata.bed_file, - outfolder=outfolder, - genome=genome, - ensdb=ensdb, - bed_digest=bed_metadata.bed_digest, - open_signal_matrix=open_signal_matrix, - just_db_commit=just_db_commit, - rfg_config=rfg_config, - pm=pm, - ) + if light: + statistics_dict = {} + else: + statistics_dict = bedstat( + bedfile=bed_metadata.bed_file, + outfolder=outfolder, + genome=genome, + ensdb=ensdb, + bed_digest=bed_metadata.bed_digest, + open_signal_matrix=open_signal_matrix, + just_db_commit=just_db_commit, + rfg_config=rfg_config, + pm=pm, + ) statistics_dict["bed_type"] = bed_metadata.bed_type statistics_dict["bed_format"] = bed_metadata.bed_format.value @@ -211,11 +216,12 @@ def run_all( classification=classification.model_dump(exclude_unset=True), ref_validation=ref_valid_stats, license_id=license_id, - upload_qdrant=upload_qdrant, + upload_qdrant=upload_qdrant and not light, upload_pephub=upload_pephub, upload_s3=upload_s3, local_path=outfolder, overwrite=force_overwrite, + processed=not light, nofail=True, ) @@ -310,7 +316,6 @@ def insert_pep( skipper.reinitialize() for i, pep_sample in enumerate(pep.samples): - is_processed = skipper.is_processed(pep_sample.sample_name) if is_processed: m.print_success( diff --git a/bedboss/refgenome_validator/main.py b/bedboss/refgenome_validator/main.py index 532693d..174ad1b 100644 --- a/bedboss/refgenome_validator/main.py +++ b/bedboss/refgenome_validator/main.py @@ -251,8 +251,10 @@ def determine_compatibility( for genome_model in self.genome_models: # First and Second Layer of Compatibility - model_compat_stats[genome_model.genome_alias]: CompatibilityStats = ( - self.calculate_chrom_stats(bed_chrom_info, genome_model.chrom_sizes) + model_compat_stats[ + genome_model.genome_alias + ]: CompatibilityStats = self.calculate_chrom_stats( + bed_chrom_info, genome_model.chrom_sizes ) # Third layer - IGD, only if layer 1 and layer 2 have passed @@ -264,13 +266,15 @@ def determine_compatibility( genome_model.genome_alias ].chrom_length_stats.beyond_range ): - model_compat_stats[genome_model.genome_alias].igd_stats = ( - self.get_igd_overlaps(bedfile) - ) + model_compat_stats[ + genome_model.genome_alias + ].igd_stats = self.get_igd_overlaps(bedfile) # Calculate compatibility rating - model_compat_stats[genome_model.genome_alias].compatibility = ( - self.calculate_rating(model_compat_stats[genome_model.genome_alias]) + model_compat_stats[ + genome_model.genome_alias + ].compatibility = self.calculate_rating( + model_compat_stats[genome_model.genome_alias] ) if concise: concise_dict = {} @@ -424,9 +428,9 @@ def predict(self, bedfile: str) -> Union[str, None]: """ _LOGGER.info(f"Predicting compatibility of {bedfile} with reference genomes...") - compatibility_stats: Dict[str, CompatibilityConcise] = ( - self.determine_compatibility(bedfile, concise=True) - ) + compatibility_stats: Dict[ + str, CompatibilityConcise + ] = self.determine_compatibility(bedfile, concise=True) best_rankings = [] diff --git a/bedboss/skipper.py b/bedboss/skipper.py index 12f118c..59fb58c 100644 --- a/bedboss/skipper.py +++ b/bedboss/skipper.py @@ -4,7 +4,6 @@ class Skipper: - def __init__(self, output_path: str, name: str): self.output_path = output_path self.name = name diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index bf90d2d..e344b67 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -6,7 +6,7 @@ yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.3 # bbconf>=0.8.1 -bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf +bbconf @ git+https://github.com/databio/bbconf.git@partial_processing#egg=bbconf refgenconf>=0.12.2 pandas>=2.0.0 ubiquerg>=0.6.2 diff --git a/scripts/bb_text_search/main.py b/scripts/bb_text_search/main.py index f6e0609..adc7dfd 100644 --- a/scripts/bb_text_search/main.py +++ b/scripts/bb_text_search/main.py @@ -18,7 +18,6 @@ def upload_text_embeddings(): - # lab qdrant client # qc = QdrantClient( # host=os.environ.get("QDRATN_HOST"), diff --git a/scripts/bb_text_search/search_test.py b/scripts/bb_text_search/search_test.py index 9e84fa2..34f5845 100644 --- a/scripts/bb_text_search/search_test.py +++ b/scripts/bb_text_search/search_test.py @@ -3,7 +3,6 @@ def search_test(): - # backend for text embeddings and bed embeddings text_backend = QdrantBackend( dim=384, diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index 2340569..fe04685 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -29,9 +29,14 @@ def runn(): def another_test(): from bedboss.bbuploader.main import upload_gse + # time it: + import time + + time1 = time.time() upload_gse( # gse="gse261411", - gse="gse261536", + # gse="gse261536", + gse="gse274130", # gse="gse246900", # gse="gse247593", # gse="gse241222", @@ -47,7 +52,10 @@ def another_test(): run_failed=True, run_skipped=True, reinit_skipper=True, + light=True, ) + time2 = time.time() + print(f"Time taken: {time2 - time1}") def upload_time(): diff --git a/scripts/ref_genome_validating/grab_chrom_sizes.py b/scripts/ref_genome_validating/grab_chrom_sizes.py index 46914ff..0cb6934 100644 --- a/scripts/ref_genome_validating/grab_chrom_sizes.py +++ b/scripts/ref_genome_validating/grab_chrom_sizes.py @@ -5,7 +5,6 @@ def main(): - # file_path = "/home/drc/Downloads/ncbi_ref_genome/ncbi_dataset/GCF_000001405.40_GRCh38.p14_genomic.fa" file_path = "/home/drc/Downloads/backup ref genome/GCA_000001405.29.fasta" FastaFile = open(file_path, "r") @@ -14,7 +13,6 @@ def main(): "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ensembl_hg38.chrom.sizes", "w", ) as file: - for rec in SeqIO.parse(FastaFile, "fasta"): name = rec.id seq = rec.seq diff --git a/scripts/ref_genome_validating/process_exclude_ranges.py b/scripts/ref_genome_validating/process_exclude_ranges.py index b43cdc2..fc12c8c 100644 --- a/scripts/ref_genome_validating/process_exclude_ranges.py +++ b/scripts/ref_genome_validating/process_exclude_ranges.py @@ -40,7 +40,6 @@ def main(species): print("Must supply species,e.g. mouse, homosapiens, rat, cow!") else: - # Make sure to have the IDE ignore these folders!!!! data_output_path = os.path.abspath("data") results_path = os.path.abspath("results") diff --git a/scripts/ref_genome_validating/stats_compat_testing.py b/scripts/ref_genome_validating/stats_compat_testing.py index 023b35f..cbf72dc 100644 --- a/scripts/ref_genome_validating/stats_compat_testing.py +++ b/scripts/ref_genome_validating/stats_compat_testing.py @@ -18,7 +18,6 @@ def main(): - # pull from tier rating column to get the final assessment tier_rating_keys = ["mm10", "hg38", "hg19"] From 1d57cfbb4c4ba2a15d12b333e43db25175e9ea88 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 9 Dec 2024 13:18:33 -0500 Subject: [PATCH 2/8] Added cli light --- bedboss/bbuploader/cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bedboss/bbuploader/cli.py b/bedboss/bbuploader/cli.py index 9073dbf..212b66f 100644 --- a/bedboss/bbuploader/cli.py +++ b/bedboss/bbuploader/cli.py @@ -124,6 +124,9 @@ def upload_gse( reinit_skipper: bool = typer.Option( False, help="Reinitialize skipper. [Default: False]" ), + light: bool = typer.Option( + False, help="Run the pipeline in light mode. [Default: False]" + ), ): from .main import upload_gse as upload_gse_function @@ -142,6 +145,7 @@ def upload_gse( reinit_skipper=reinit_skipper, overwrite=overwrite, overwrite_bedset=overwrite_bedset, + light=light, ) From 15821dde6fe9114ccdff8876b1ed5bb81f48f8fd Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 9 Dec 2024 14:04:31 -0500 Subject: [PATCH 3/8] strict versions --- requirements/requirements-all.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index e344b67..3e2a2ed 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -5,6 +5,7 @@ peppy>=0.40.7 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.3 +pybiocfilecache==0.4.1 # TODO: remove after geniml release # bbconf>=0.8.1 bbconf @ git+https://github.com/databio/bbconf.git@partial_processing#egg=bbconf refgenconf>=0.12.2 From 3726bd1a8d6672d226deac2822b53f5556fe0089 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 9 Dec 2024 16:23:28 -0500 Subject: [PATCH 4/8] added light mode to bedmaker --- bedboss/bedboss.py | 1 + bedboss/bedmaker/bedmaker.py | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 9945487..f59dc67 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -142,6 +142,7 @@ def run_all( narrowpeak=narrowpeak, check_qc=check_qc, chrom_sizes=chrom_sizes, + light=light, pm=pm, ) if not other_metadata: diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index d3c66b2..539c2a6 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -315,6 +315,7 @@ def make_all( chrom_sizes: str = None, narrowpeak: bool = False, check_qc: bool = True, + light: bool = False, pm: pypiper.PipelineManager = None, ) -> BedMakerOutput: """ @@ -338,6 +339,7 @@ def make_all( :param narrowpeak: whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks) :param check_qc: run quality control during bedmaking + :param light: run the pipeline in light mode (without producing bigBed files) :param pm: pypiper object :return: dict with generated bed metadata - BedMakerOutput object: @@ -382,15 +384,19 @@ def make_all( f"Quality control failed for {output_path}. Error: {e}" ) try: - output_bigbed = make_bigbed( - bed_path=output_bed, - output_path=output_path, - genome=genome, - bed_type=bed_type, - rfg_config=rfg_config, - chrom_sizes=chrom_sizes, - pm=pm, - ) + if light: + _LOGGER.info("Skipping bigBed generation due to light mode.") + output_bigbed = None + else: + output_bigbed = make_bigbed( + bed_path=output_bed, + output_path=output_path, + genome=genome, + bed_type=bed_type, + rfg_config=rfg_config, + chrom_sizes=chrom_sizes, + pm=pm, + ) except BedBossException: output_bigbed = None if pm_clean: From 97d4570c52135e6ef17de0ddb807ab9163d227fb Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Dec 2024 16:32:42 -0500 Subject: [PATCH 5/8] Added light mode to bedbuncher and cli --- bedboss/bbuploader/cli.py | 4 ++++ bedboss/bbuploader/main.py | 6 +++++- bedboss/bedboss.py | 16 ++++++++++------ bedboss/bedbuncher/bedbuncher.py | 3 +++ bedboss/cli.py | 8 ++++++++ bedboss/utils.py | 3 +++ scripts/bbuploader/main.py | 11 ++++++----- 7 files changed, 39 insertions(+), 12 deletions(-) diff --git a/bedboss/bbuploader/cli.py b/bedboss/bbuploader/cli.py index 212b66f..a810d57 100644 --- a/bedboss/bbuploader/cli.py +++ b/bedboss/bbuploader/cli.py @@ -61,6 +61,9 @@ def upload_all( reinit_skipper: bool = typer.Option( False, help="Reinitialize skipper. [Default: False]" ), + light: bool = typer.Option( + False, help="Run the pipeline in light mode. [Default: False]" + ), ): from .main import upload_all as upload_all_function @@ -83,6 +86,7 @@ def upload_all( reinit_skipper=reinit_skipper, overwrite=overwrite, overwrite_bedset=overwrite_bedset, + light=light, ) diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index be7f84d..29de60c 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -54,6 +54,7 @@ def upload_all( reinit_skipper=False, overwrite=False, overwrite_bedset=False, + light=False, ): """ This is main function that is responsible for processing bed files from PEPHub. @@ -75,12 +76,13 @@ def upload_all( :param use_skipper: use skipper to skip already processed logged locally. Skipper creates local log of processed and failed files. :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned + :param light: light mode, where skipping statistic processing for memory optimization and time saving """ phc = PEPHubClient() os.makedirs(outfolder, exist_ok=True) - bbagent = BedBaseAgent(config=bedbase_config) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not light) genome = standardize_genome_name(genome) pep_annotation_list = find_peps( @@ -154,6 +156,7 @@ def upload_all( preload=preload, overwrite=overwrite, overwrite_bedset=overwrite_bedset, + light=light, ) except Exception as err: _LOGGER.error( @@ -577,6 +580,7 @@ def _upload_gse( upload_s3=True, no_fail=True, force_overwrite=overwrite_bedset, + light=light, ) else: diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index f59dc67..b231a13 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -95,12 +95,12 @@ def run_all( :param dict other_metadata: a dict containing all attributes from the sample :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata [optional] (basically genomes that's not in GDdata) - :param bool just_db_commit: whether just to commit the JSON to the database (default: False) - :param bool force_overwrite: force overwrite analysis (default: False) - :param bool upload_qdrant: whether to skip qdrant indexing + :param bool just_db_commit: whether just to commit the JSON to the database [Default: False] + :param bool force_overwrite: force overwrite analysis [Default: False] + :param bool upload_qdrant: whether to skip qdrant indexing [Default: False] :param bool upload_s3: whether to upload to s3 - :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) - :param bool light: whether to run light version of the pipeline + :param bool upload_pephub: whether to push bedfiles and metadata to pephub [Default: False] + :param bool light: whether to run light version of the pipeline [Default: False] :param bool universe: whether to add the sample as the universe [Default: False] :param str universe_method: method used to create the universe [Default: None] @@ -109,7 +109,7 @@ def run_all( :return str bed_digest: bed digest """ if isinstance(bedbase_config, str): - bbagent = BedBaseAgent(bedbase_config) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not light) elif isinstance(bedbase_config, bbconf.BedBaseAgent): bbagent = bedbase_config else: @@ -259,6 +259,7 @@ def insert_pep( upload_qdrant: bool = False, no_fail: bool = False, standardize_pep: bool = False, + light: bool = False, rerun: bool = False, pm: pypiper.PipelineManager = None, ) -> None: @@ -285,6 +286,7 @@ def insert_pep( :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param bool upload_qdrant: whether to execute qdrant indexing :param bool no_fail: whether to raise an error if bedset was not added to the database + :param bool light: whether to run light version of the pipeline :param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False) :param bool rerun: whether to rerun processed samples :param pypiper.PipelineManager pm: pypiper object @@ -358,6 +360,7 @@ def insert_pep( universe=pep_sample.get("universe"), universe_method=pep_sample.get("universe_method"), universe_bedset=pep_sample.get("universe_bedset"), + light=light, pm=pm, ) @@ -384,6 +387,7 @@ def insert_pep( no_fail=no_fail, force_overwrite=force_overwrite, annotation=bedset_annotation, + light=light, ) else: _LOGGER.info( diff --git a/bedboss/bedbuncher/bedbuncher.py b/bedboss/bedbuncher/bedbuncher.py index e2c6a3f..80f8bc6 100644 --- a/bedboss/bedbuncher/bedbuncher.py +++ b/bedboss/bedbuncher/bedbuncher.py @@ -101,6 +101,7 @@ def run_bedbuncher( upload_s3: bool = False, no_fail: bool = False, force_overwrite: bool = False, + light: bool = False, ) -> None: """ Add bedset to the database @@ -118,6 +119,7 @@ def run_bedbuncher( :param upload_pephub: whether to create a view in pephub :param upload_s3: whether to upload files to s3 :param force_overwrite: whether to overwrite the record in the database + :param light: whether to run the pipeline in light mode # TODO: force_overwrite is not working!!! Fix it! :return: """ @@ -162,6 +164,7 @@ def run_bedbuncher( no_fail=no_fail, overwrite=force_overwrite, annotation=annotation, + processed=not light, ) diff --git a/bedboss/cli.py b/bedboss/cli.py index 670ef2d..9522021 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -89,6 +89,9 @@ def run_all( force_overwrite: bool = typer.Option( False, help="Force overwrite the output files" ), + light: bool = typer.Option( + False, help="Run the pipeline in light mode. [Default: False]" + ), upload_qdrant: bool = typer.Option(False, help="Upload to Qdrant"), upload_s3: bool = typer.Option(False, help="Upload to S3"), upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"), @@ -129,6 +132,7 @@ def run_all( open_signal_matrix=open_signal_matrix, ensdb=ensdb, other_metadata=None, + light=light, just_db_commit=just_db_commit, force_overwrite=force_overwrite, upload_qdrant=upload_qdrant, @@ -170,6 +174,9 @@ def run_pep( no_fail: bool = typer.Option(False, help="Do not fail on error"), license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"), standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"), + light: bool = typer.Option( + False, help="Run the pipeline in light mode. [Default: False]" + ), rerun: bool = typer.Option(False, help="Rerun already processed samples"), # PipelineManager multi: bool = typer.Option(False, help="Run multiple samples"), @@ -199,6 +206,7 @@ def run_pep( upload_qdrant=upload_qdrant, no_fail=no_fail, standardize_pep=standardize_pep, + light=light, rerun=rerun, pm=create_pm( outfolder=outfolder, diff --git a/bedboss/utils.py b/bedboss/utils.py index 1231a4b..c7450e8 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -24,6 +24,8 @@ def standardize_genome_name(input_genome: str, bedfile: str = None) -> str: :param bedfile: path to bed file :return: genome name string """ + if not isinstance(input_genome, str): + input_genome = "" input_genome = input_genome.strip().lower() # TODO: we have to add more genome options and preprocessing of the string if input_genome == "hg38" or input_genome == "grch38": @@ -47,6 +49,7 @@ def standardize_genome_name(input_genome: str, bedfile: str = None) -> str: return input_genome +# %% def download_file(url: str, path: str, no_fail: bool = False) -> None: """ Download file from the url to specific location diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index fe04685..0051dc5 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -64,19 +64,20 @@ def upload_time(): upload_all( bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", outfolder="/home/bnt4me/virginia/repos/bbuploader/data", - start_date="2024/06/01", - # end_date="2024/08/28", + start_date="2020/06/01", + end_date="2020/07/15", search_limit=1000, download_limit=10000, search_offset=0, - genome="hg38", + # genome="hg38", rerun=True, run_skipped=True, + light=True, ) if __name__ == "__main__": # runn() - another_test() - # upload_time() + # another_test() + upload_time() From 60563bde25da5d0f416009ebcbd444abc7d31747 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Dec 2024 09:44:54 -0500 Subject: [PATCH 6/8] Added run unprocessed files --- bedboss/bbuploader/main.py | 2 +- bedboss/bedboss.py | 151 ++++++++++++++++++++++++++++++++----- bedboss/utils.py | 24 +++++- scripts/all/run_all.py | 12 +++ scripts/bbuploader/main.py | 10 ++- 5 files changed, 176 insertions(+), 23 deletions(-) create mode 100644 scripts/all/run_all.py diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 29de60c..8523c5f 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -285,7 +285,7 @@ def upload_gse( use_skipper=True, reinit_skipper=False, overwrite=False, - overwrite_bedset=False, + overwrite_bedset=True, light=False, ): """ diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index b231a13..5de0264 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -4,6 +4,7 @@ from typing import Union import bbconf +import yaml import pephubclient import peppy import pypiper @@ -11,8 +12,10 @@ from bbconf.const import DEFAULT_LICENSE from bbconf.models.base_models import FileModel from eido import validate_project +import datetime from pephubclient.helpers import MessageHandler as m from pephubclient.helpers import is_registry_path +from geniml.bbclient import BBClient from bedboss._version import __version__ from bedboss.bedbuncher import run_bedbuncher @@ -29,7 +32,7 @@ ) from bedboss.refgenome_validator.main import ReferenceValidator from bedboss.skipper import Skipper -from bedboss.utils import get_genome_digest, standardize_genome_name +from bedboss.utils import get_genome_digest, standardize_genome_name, calculate_time from bedboss.utils import standardize_pep as pep_standardizer _LOGGER = logging.getLogger(PKG_NAME) @@ -65,6 +68,7 @@ def run_all( other_metadata: dict = None, just_db_commit: bool = False, force_overwrite: bool = False, + update: bool = False, upload_qdrant: bool = False, upload_s3: bool = False, upload_pephub: bool = False, @@ -97,6 +101,7 @@ def run_all( (basically genomes that's not in GDdata) :param bool just_db_commit: whether just to commit the JSON to the database [Default: False] :param bool force_overwrite: force overwrite analysis [Default: False] + :param bool update: whether to update the record in the database [Default: False] (if True, overwrites 'force_overwrite' and ignores it) :param bool upload_qdrant: whether to skip qdrant indexing [Default: False] :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub [Default: False] @@ -208,23 +213,42 @@ def run_all( else: ref_valid_stats = None - bbagent.bed.add( - identifier=bed_metadata.bed_digest, - stats=stats.model_dump(exclude_unset=True), - metadata=other_metadata, - plots=plots.model_dump(exclude_unset=True), - files=files.model_dump(exclude_unset=True), - classification=classification.model_dump(exclude_unset=True), - ref_validation=ref_valid_stats, - license_id=license_id, - upload_qdrant=upload_qdrant and not light, - upload_pephub=upload_pephub, - upload_s3=upload_s3, - local_path=outfolder, - overwrite=force_overwrite, - processed=not light, - nofail=True, - ) + if update: + bbagent.bed.update( + identifier=bed_metadata.bed_digest, + stats=stats.model_dump(exclude_unset=True), + metadata=other_metadata, + plots=plots.model_dump(exclude_unset=True), + files=files.model_dump(exclude_unset=True), + classification=classification.model_dump(exclude_unset=True), + ref_validation=ref_valid_stats, + license_id=license_id, + upload_qdrant=upload_qdrant and not light, + upload_pephub=upload_pephub, + upload_s3=upload_s3, + local_path=outfolder, + overwrite=True, + processed=not light, + nofail=True, + ) + else: + bbagent.bed.add( + identifier=bed_metadata.bed_digest, + stats=stats.model_dump(exclude_unset=True), + metadata=other_metadata, + plots=plots.model_dump(exclude_unset=True), + files=files.model_dump(exclude_unset=True), + classification=classification.model_dump(exclude_unset=True), + ref_validation=ref_valid_stats, + license_id=license_id, + upload_qdrant=upload_qdrant and not light, + upload_pephub=upload_pephub, + upload_s3=upload_s3, + local_path=outfolder, + overwrite=force_overwrite, + processed=not light, + nofail=True, + ) if universe: bbagent.bed.add_universe( @@ -400,3 +424,94 @@ def insert_pep( m.print_error(f"Failed samples: {failed_samples}") return None + + +@calculate_time +def run_unprocessed_beds( + bedbase_config: Union[str, BedBaseAgent], + output_folder: str, + limit: int = 10, + nofail: bool = False, +): + """ + Run bedboss pipeline for all unprocessed beds in the bedbase + + :param bedbase_config: bedbase configuration file path + :param output_folder: output folder of the pipeline + :param limit: limit of the number of beds to process + :param nofail: whether to raise an error if bedset was not added to the database + + :return: None + """ + + if isinstance(bedbase_config, str): + bbagent = BedBaseAgent(config=bedbase_config) + elif isinstance(bedbase_config, bbconf.BedBaseAgent): + bbagent = bedbase_config + else: + raise BedBossException("Incorrect bedbase_config type. Exiting...") + + unprocessed_beds = bbagent.bed.get_unprocessed(limit=limit) + + bbclient = BBClient() + failed_samples = [] + for bed_annot in unprocessed_beds.results: + bed_file = bbclient.load_bed(bed_annot.id) + + try: + run_all( + input_file=bed_file.path, + input_type="bed", + outfolder=output_folder, + genome=bed_annot.genome_alias, + bedbase_config=bbagent, + name=bed_annot.name, + license_id=bed_annot.license_id, + rfg_config=None, + check_qc=False, + validate_reference=True, + chrom_sizes=None, + open_signal_matrix=None, + ensdb=None, + other_metadata=None, + just_db_commit=False, + update=True, + upload_qdrant=True, + upload_s3=True, + upload_pephub=True, + light=False, + universe=False, + universe_method=None, + universe_bedset=None, + pm=None, + ) + except Exception as e: + _LOGGER.error(f"Failed to process {bed_annot.name}. See {e}") + if nofail: + raise BedBossException(f"Failed to process {bed_annot.name}. See {e}") + + failed_samples.append( + { + "id": bed_annot.id, + "error": e, + } + ) + + if failed_samples: + date_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + with open( + os.path.join(output_folder, f"failed_samples_{date_now}.yaml"), "w" + ) as file: + yaml.dump(failed_samples, file) + + from rich import print + + m.print_success(f"Processing completed successfully") + + print_values = dict( + unprocessed_files=unprocessed_beds.count, + processing_files=unprocessed_beds.limit, + failed_files=len(failed_samples), + success_files=unprocessed_beds.limit - len(failed_samples), + ) + print(print_values) diff --git a/bedboss/utils.py b/bedboss/utils.py index c7450e8..0272498 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -2,6 +2,7 @@ import logging import os import urllib.request +import time import peppy import requests @@ -9,6 +10,7 @@ from pephubclient.files_manager import FilesManager from peppy.const import SAMPLE_RAW_DICT_KEY from pypiper import PipelineManager +from functools import wraps from bedboss.refgenome_validator.main import ReferenceValidator @@ -37,7 +39,7 @@ def standardize_genome_name(input_genome: str, bedfile: str = None) -> str: elif input_genome == "mm9" or input_genome == "grcm37": return "mm9" - elif not input_genome or len(input_genome) > 10: + elif not input_genome or len(input_genome) > 7: if bedfile: predictor = ReferenceValidator() return predictor.predict(bedfile) or "" @@ -202,3 +204,23 @@ def cleanup_pm_temp(pm: PipelineManager) -> None: except Exception as e: _LOGGER.error(f"Error cleaning up: {e}") pm.cleanup_list_conditional = [] + + +def calculate_time(func): + @wraps(func) + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + execution_time = end_time - start_time + + hours, remainder = divmod(execution_time, 3600) + minutes, seconds = divmod(remainder, 60) + + print( + f"Function '{func.__name__}' executed in {int(hours)} hours, {int(minutes)} minutes, and {seconds:.2f} seconds" + ) + + return result + + return wrapper diff --git a/scripts/all/run_all.py b/scripts/all/run_all.py new file mode 100644 index 0000000..1d9cd0a --- /dev/null +++ b/scripts/all/run_all.py @@ -0,0 +1,12 @@ +def unprocessed_run(): + from bedboss.bedboss import run_unprocessed_beds + + run_unprocessed_beds( + bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", + limit=10, + output_folder="/home/bnt4me/virginia/repos/bbuploader/data", + ) + + +if __name__ == "__main__": + unprocessed_run() diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index 0051dc5..de5fce6 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -36,7 +36,9 @@ def another_test(): upload_gse( # gse="gse261411", # gse="gse261536", - gse="gse274130", + # gse="gse274130", + # Genome hg19 and mm10 + gse="gse151780", # gse="gse246900", # gse="gse247593", # gse="gse241222", @@ -53,6 +55,8 @@ def another_test(): run_skipped=True, reinit_skipper=True, light=True, + overwrite=True, + overwrite_bedset=True, ) time2 = time.time() print(f"Time taken: {time2 - time1}") @@ -79,5 +83,5 @@ def upload_time(): if __name__ == "__main__": # runn() - # another_test() - upload_time() + another_test() + # upload_time() From 5ce2d6d49bd9ae1db7e651ead0c137309bb4150a Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Dec 2024 16:25:16 -0500 Subject: [PATCH 7/8] added reprocess functions for bed and bedsets and cli options --- bedboss/_version.py | 2 +- bedboss/bbuploader/main.py | 5 +- bedboss/bedboss.py | 129 +++++++++++++++++++++++++++++++++++-- bedboss/cli.py | 79 +++++++++++++++++++++++ bedboss/utils.py | 5 +- scripts/all/run_all.py | 15 ++++- scripts/bbuploader/main.py | 6 +- 7 files changed, 223 insertions(+), 18 deletions(-) diff --git a/bedboss/_version.py b/bedboss/_version.py index dd9b22c..906d362 100644 --- a/bedboss/_version.py +++ b/bedboss/_version.py @@ -1 +1 @@ -__version__ = "0.5.1" +__version__ = "0.6.0" diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 8523c5f..ff5fa32 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -8,7 +8,6 @@ from pephubclient import PEPHubClient from pephubclient.helpers import MessageHandler from pephubclient.models import SearchReturnModel -from setuptools.command.egg_info import overwrite_arg from sqlalchemy import and_, select from sqlalchemy.orm import Session @@ -28,13 +27,14 @@ from bedboss.bedbuncher.bedbuncher import run_bedbuncher from bedboss.exceptions import BedBossException from bedboss.skipper import Skipper -from bedboss.utils import download_file, standardize_genome_name +from bedboss.utils import calculate_time, download_file, standardize_genome_name from bedboss.utils import standardize_pep as pep_standardizer _LOGGER = logging.getLogger(PKG_NAME) _LOGGER.setLevel(logging.DEBUG) +@calculate_time def upload_all( bedbase_config: str, outfolder: str = os.getcwd(), @@ -271,6 +271,7 @@ def find_peps( ) +@calculate_time def upload_gse( gse: str, bedbase_config: Union[str, BedBaseAgent], diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 5de0264..f7047e7 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -1,21 +1,21 @@ +import datetime import logging import os import subprocess from typing import Union import bbconf -import yaml import pephubclient import peppy import pypiper +import yaml from bbconf.bbagent import BedBaseAgent from bbconf.const import DEFAULT_LICENSE from bbconf.models.base_models import FileModel from eido import validate_project -import datetime +from geniml.bbclient import BBClient from pephubclient.helpers import MessageHandler as m from pephubclient.helpers import is_registry_path -from geniml.bbclient import BBClient from bedboss._version import __version__ from bedboss.bedbuncher import run_bedbuncher @@ -32,7 +32,7 @@ ) from bedboss.refgenome_validator.main import ReferenceValidator from bedboss.skipper import Skipper -from bedboss.utils import get_genome_digest, standardize_genome_name, calculate_time +from bedboss.utils import calculate_time, get_genome_digest, standardize_genome_name from bedboss.utils import standardize_pep as pep_standardizer _LOGGER = logging.getLogger(PKG_NAME) @@ -50,6 +50,7 @@ def requirements_check() -> None: ) +@calculate_time def run_all( input_file: str, input_type: str, @@ -264,6 +265,7 @@ def run_all( return bed_metadata.bed_digest +@calculate_time def insert_pep( bedbase_config: str, output_folder: str, @@ -278,6 +280,7 @@ def insert_pep( ensdb: str = None, just_db_commit: bool = False, force_overwrite: bool = False, + update: bool = False, upload_s3: bool = False, upload_pephub: bool = False, upload_qdrant: bool = False, @@ -306,6 +309,7 @@ def insert_pep( :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata :param bool just_db_commit: whether save only to the database (Without saving locally ) :param bool force_overwrite: whether to overwrite the existing record + :param bool update: whether to update the record in the database. This option will overwrite the force_overwrite option. [Default: False] :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param bool upload_qdrant: whether to execute qdrant indexing @@ -378,6 +382,7 @@ def insert_pep( ensdb=ensdb, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + update=update, upload_qdrant=upload_qdrant, upload_s3=upload_s3, upload_pephub=upload_pephub, @@ -427,12 +432,12 @@ def insert_pep( @calculate_time -def run_unprocessed_beds( +def reprocess_all( bedbase_config: Union[str, BedBaseAgent], output_folder: str, limit: int = 10, nofail: bool = False, -): +) -> None: """ Run bedboss pipeline for all unprocessed beds in the bedbase @@ -504,7 +509,7 @@ def run_unprocessed_beds( ) as file: yaml.dump(failed_samples, file) - from rich import print + m.print_warning(f"Logs with failed samples are saved in {output_folder}") m.print_success(f"Processing completed successfully") @@ -515,3 +520,113 @@ def run_unprocessed_beds( success_files=unprocessed_beds.limit - len(failed_samples), ) print(print_values) + + +@calculate_time +def reprocess_one( + bedbase_config: Union[str, BedBaseAgent], + output_folder: str, + identifier: str, +) -> None: + """ + Run bedboss pipeline for one bed in the bedbase [Reprocess] + + :param bedbase_config: bedbase configuration file path + :param output_folder: output folder of the pipeline + :param identifier: bed identifier + + :return: None + """ + + if isinstance(bedbase_config, str): + bbagent = BedBaseAgent(config=bedbase_config) + elif isinstance(bedbase_config, bbconf.BedBaseAgent): + bbagent = bedbase_config + else: + raise BedBossException("Incorrect bedbase_config type. Exiting...") + + bbclient = BBClient() + + bed_annot = bbagent.bed.get(identifier) + bed_file = bbclient.load_bed(bed_annot.id) + + run_all( + input_file=bed_file.path, + input_type="bed", + outfolder=output_folder, + genome=bed_annot.genome_alias, + bedbase_config=bbagent, + name=bed_annot.name, + license_id=bed_annot.license_id, + rfg_config=None, + check_qc=False, + validate_reference=True, + chrom_sizes=None, + open_signal_matrix=None, + ensdb=None, + other_metadata=None, + just_db_commit=False, + update=True, + upload_qdrant=True, + upload_s3=True, + upload_pephub=True, + light=False, + universe=False, + universe_method=None, + universe_bedset=None, + pm=None, + ) + + _LOGGER.info(f"Successfully processed {identifier}") + + +@calculate_time +def reprocess_bedset( + bedbase_config: Union[str, BedBaseAgent], + output_folder: str, + identifier: str, + no_fail: bool = True, + heavy: bool = False, +): + """ + Recalculate bedset from the bedbase + + :param bedbase_config: bedbase configuration file path + :param output_folder: output folder of the pipeline + :param identifier: bedset identifier + :param no_fail: whether to raise an error if bedset was not added to the database + :param heavy: whether to use heavy processing. Calculate plots for bedset + + :return: None + """ + + if isinstance(bedbase_config, str): + bbagent = BedBaseAgent(config=bedbase_config) + elif isinstance(bedbase_config, bbconf.BedBaseAgent): + bbagent = bedbase_config + else: + raise BedBossException("Incorrect bedbase_config type. Exiting...") + + bedset_annot = bbagent.bedset.get(identifier) + + run_bedbuncher( + bedbase_config=bbagent, + record_id=bedset_annot.id, + bed_set=bedset_annot.bed_ids, + name=bedset_annot.name, + output_folder=output_folder, + description=bedset_annot.description, + heavy=heavy, + upload_pephub=False, + upload_s3=heavy, + no_fail=no_fail, + force_overwrite=True, + annotation={ + **bedset_annot.model_dump( + exclude={ + "bed_ids", + } + ) + }, + light=False, + ) diff --git a/bedboss/cli.py b/bedboss/cli.py index 9522021..13d78db 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -89,6 +89,10 @@ def run_all( force_overwrite: bool = typer.Option( False, help="Force overwrite the output files" ), + update: bool = typer.Option( + False, + help="Update the bedbase database with the new record if it exists. This overwrites 'force_overwrite' option", + ), light: bool = typer.Option( False, help="Run the pipeline in light mode. [Default: False]" ), @@ -135,6 +139,7 @@ def run_all( light=light, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + update=update, upload_qdrant=upload_qdrant, upload_s3=upload_s3, upload_pephub=upload_pephub, @@ -168,6 +173,10 @@ def run_pep( force_overwrite: bool = typer.Option( False, help="Force overwrite the output files" ), + update: bool = typer.Option( + False, + help="Update the bedbase database with the new record if it exists. This overwrites 'force_overwrite' option", + ), upload_qdrant: bool = typer.Option(True, help="Upload to Qdrant"), upload_s3: bool = typer.Option(True, help="Upload to S3"), upload_pephub: bool = typer.Option(True, help="Upload to PEPHub"), @@ -200,6 +209,7 @@ def run_pep( ensdb=ensdb, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + update=update, license_id=license_id, upload_s3=upload_s3, upload_pephub=upload_pephub, @@ -218,6 +228,75 @@ def run_pep( ) +@app.command(help="Run unprocessed files, or reprocess them") +def reprocess_all( + bedbase_config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + outfolder: str = typer.Option(..., help="Path to the output folder"), + limit: int = typer.Option(100, help="Limit the number of files to reprocess"), + no_fail: bool = typer.Option(True, help="Do not fail on error"), +): + from bedboss.bedboss import reprocess_all as reprocess_all_function + + reprocess_all( + bedbase_config=bedbase_config, + output_folder=outfolder, + limit=limit, + no_fail=no_fail, + ) + + +@app.command(help="Run unprocessed file, or reprocess it [Only 1 file]") +def reprocess_one( + bedbase_config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + outfolder: str = typer.Option(..., help="Path to the output folder"), + identifier: str = typer.Option(..., help="Identifier of the bed file"), +): + from bedboss.bedboss import reprocess_one as reprocess_one_function + + reprocess_one( + bedbase_config=bedbase_config, + output_folder=outfolder, + identifier=identifier, + ) + + +@app.command(help="Reprocess a bedset") +def reprocess_bedset( + bedbase_config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + outfolder: str = typer.Option(..., help="Path to the output folder"), + identifier: str = typer.Option(..., help="Bedset ID"), + no_fail: bool = typer.Option(True, help="Do not fail on error"), + heavy: bool = typer.Option(False, help="Run the heavy version of the pipeline"), +): + from bedboss.bedboss import reprocess_bedset as reprocess_bedset_function + + reprocess_bedset_function( + bedbase_config=bedbase_config, + output_folder=outfolder, + identifier=identifier, + no_fail=no_fail, + heavy=heavy, + ) + + @app.command(help=f"Create a bed files form a [{', '.join(options_list)}] file") def make_bed( input_file: str = typer.Option( diff --git a/bedboss/utils.py b/bedboss/utils.py index 0272498..a8d6829 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -1,8 +1,9 @@ import glob import logging import os -import urllib.request import time +import urllib.request +from functools import wraps import peppy import requests @@ -10,7 +11,6 @@ from pephubclient.files_manager import FilesManager from peppy.const import SAMPLE_RAW_DICT_KEY from pypiper import PipelineManager -from functools import wraps from bedboss.refgenome_validator.main import ReferenceValidator @@ -51,7 +51,6 @@ def standardize_genome_name(input_genome: str, bedfile: str = None) -> str: return input_genome -# %% def download_file(url: str, path: str, no_fail: bool = False) -> None: """ Download file from the url to specific location diff --git a/scripts/all/run_all.py b/scripts/all/run_all.py index 1d9cd0a..7ad2e1f 100644 --- a/scripts/all/run_all.py +++ b/scripts/all/run_all.py @@ -1,5 +1,5 @@ def unprocessed_run(): - from bedboss.bedboss import run_unprocessed_beds + from bedboss.bedboss import reprocess_all run_unprocessed_beds( bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", @@ -8,5 +8,16 @@ def unprocessed_run(): ) +def reprocess_one(): + from bedboss.bedboss import reprocess_one + + reprocess_one( + identifier="a0f1889fd8026780df8bba6a8ddac00e", + bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", + output_folder="/home/bnt4me/virginia/repos/bbuploader/data", + ) + + if __name__ == "__main__": - unprocessed_run() + # unprocessed_run() + reprocess_one() diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index de5fce6..279d9e4 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -27,18 +27,18 @@ def runn(): def another_test(): - from bedboss.bbuploader.main import upload_gse - # time it: import time + from bedboss.bbuploader.main import upload_gse + time1 = time.time() upload_gse( # gse="gse261411", # gse="gse261536", # gse="gse274130", # Genome hg19 and mm10 - gse="gse151780", + gse="gse280839", # gse="gse246900", # gse="gse247593", # gse="gse241222", From a4f4cea182959519d0fc3eced1d1d27800661634 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Dec 2024 13:02:54 -0500 Subject: [PATCH 8/8] changed light -> lite --- bedboss/bbuploader/cli.py | 12 +++++------ bedboss/bbuploader/main.py | 24 +++++++++++----------- bedboss/bedboss.py | 32 ++++++++++++++--------------- bedboss/bedbuncher/bedbuncher.py | 6 +++--- bedboss/bedmaker/bedmaker.py | 8 ++++---- bedboss/cli.py | 12 +++++------ bedboss/refgenome_validator/main.py | 24 +++++++++------------- scripts/bbuploader/main.py | 4 ++-- 8 files changed, 59 insertions(+), 63 deletions(-) diff --git a/bedboss/bbuploader/cli.py b/bedboss/bbuploader/cli.py index a810d57..cb415c1 100644 --- a/bedboss/bbuploader/cli.py +++ b/bedboss/bbuploader/cli.py @@ -61,8 +61,8 @@ def upload_all( reinit_skipper: bool = typer.Option( False, help="Reinitialize skipper. [Default: False]" ), - light: bool = typer.Option( - False, help="Run the pipeline in light mode. [Default: False]" + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. [Default: False]" ), ): from .main import upload_all as upload_all_function @@ -86,7 +86,7 @@ def upload_all( reinit_skipper=reinit_skipper, overwrite=overwrite, overwrite_bedset=overwrite_bedset, - light=light, + lite=lite, ) @@ -128,8 +128,8 @@ def upload_gse( reinit_skipper: bool = typer.Option( False, help="Reinitialize skipper. [Default: False]" ), - light: bool = typer.Option( - False, help="Run the pipeline in light mode. [Default: False]" + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. [Default: False]" ), ): from .main import upload_gse as upload_gse_function @@ -149,7 +149,7 @@ def upload_gse( reinit_skipper=reinit_skipper, overwrite=overwrite, overwrite_bedset=overwrite_bedset, - light=light, + lite=lite, ) diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index ff5fa32..170f8e0 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -54,7 +54,7 @@ def upload_all( reinit_skipper=False, overwrite=False, overwrite_bedset=False, - light=False, + lite=False, ): """ This is main function that is responsible for processing bed files from PEPHub. @@ -76,13 +76,13 @@ def upload_all( :param use_skipper: use skipper to skip already processed logged locally. Skipper creates local log of processed and failed files. :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned - :param light: light mode, where skipping statistic processing for memory optimization and time saving + :param lite: lite mode, where skipping statistic processing for memory optimization and time saving """ phc = PEPHubClient() os.makedirs(outfolder, exist_ok=True) - bbagent = BedBaseAgent(config=bedbase_config, init_ml=not light) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not lite) genome = standardize_genome_name(genome) pep_annotation_list = find_peps( @@ -156,7 +156,7 @@ def upload_all( preload=preload, overwrite=overwrite, overwrite_bedset=overwrite_bedset, - light=light, + lite=lite, ) except Exception as err: _LOGGER.error( @@ -287,7 +287,7 @@ def upload_gse( reinit_skipper=False, overwrite=False, overwrite_bedset=True, - light=False, + lite=False, ): """ Upload bed files from GEO series to BedBase @@ -307,11 +307,11 @@ def upload_gse( :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned :param overwrite: overwrite existing bedfiles :param overwrite_bedset: overwrite existing bedset - :param light: light mode, where skipping statistic processing for memory optimization and time saving + :param lite: lite mode, where skipping statistic processing for memory optimization and time saving :return: None """ - bbagent = BedBaseAgent(config=bedbase_config, init_ml=not light) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not lite) with Session(bbagent.config.db_engine.engine) as session: _LOGGER.info(f"Processing: '{gse}'") @@ -358,7 +358,7 @@ def upload_gse( overwrite_bedset=overwrite_bedset, use_skipper=use_skipper, reinit_skipper=reinit_skipper, - light=light, + lite=lite, ) except Exception as e: _LOGGER.error(f"Processing of '{gse}' failed with error: {e}") @@ -410,7 +410,7 @@ def _upload_gse( use_skipper: bool = True, reinit_skipper: bool = False, preload: bool = True, - light=False, + lite=False, ) -> ProjectProcessingStatus: """ Upload bed files from GEO series to BedBase @@ -429,7 +429,7 @@ def _upload_gse( and failed files. :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs will be :param preload: pre - download files to the local folder (used for faster reproducibility) - :param light: light mode, where skipping statistic processing for memory optimization and time saving + :param lite: lite mode, where skipping statistic processing for memory optimization and time saving :return: None """ if isinstance(bedbase_config, str): @@ -549,7 +549,7 @@ def _upload_gse( upload_s3=True, upload_qdrant=True, force_overwrite=overwrite, - light=light, + lite=lite, ) uploaded_files.append(file_digest) if skipper_obj: @@ -581,7 +581,7 @@ def _upload_gse( upload_s3=True, no_fail=True, force_overwrite=overwrite_bedset, - light=light, + lite=lite, ) else: diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index f7047e7..621824b 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -73,7 +73,7 @@ def run_all( upload_qdrant: bool = False, upload_s3: bool = False, upload_pephub: bool = False, - light: bool = False, + lite: bool = False, # Universes universe: bool = False, universe_method: str = None, @@ -106,7 +106,7 @@ def run_all( :param bool upload_qdrant: whether to skip qdrant indexing [Default: False] :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub [Default: False] - :param bool light: whether to run light version of the pipeline [Default: False] + :param bool lite: whether to run lite version of the pipeline [Default: False] :param bool universe: whether to add the sample as the universe [Default: False] :param str universe_method: method used to create the universe [Default: None] @@ -115,7 +115,7 @@ def run_all( :return str bed_digest: bed digest """ if isinstance(bedbase_config, str): - bbagent = BedBaseAgent(config=bedbase_config, init_ml=not light) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not lite) elif isinstance(bedbase_config, bbconf.BedBaseAgent): bbagent = bedbase_config else: @@ -148,13 +148,13 @@ def run_all( narrowpeak=narrowpeak, check_qc=check_qc, chrom_sizes=chrom_sizes, - light=light, + lite=lite, pm=pm, ) if not other_metadata: other_metadata = {"sample_name": name} - if light: + if lite: statistics_dict = {} else: statistics_dict = bedstat( @@ -224,12 +224,12 @@ def run_all( classification=classification.model_dump(exclude_unset=True), ref_validation=ref_valid_stats, license_id=license_id, - upload_qdrant=upload_qdrant and not light, + upload_qdrant=upload_qdrant and not lite, upload_pephub=upload_pephub, upload_s3=upload_s3, local_path=outfolder, overwrite=True, - processed=not light, + processed=not lite, nofail=True, ) else: @@ -242,12 +242,12 @@ def run_all( classification=classification.model_dump(exclude_unset=True), ref_validation=ref_valid_stats, license_id=license_id, - upload_qdrant=upload_qdrant and not light, + upload_qdrant=upload_qdrant and not lite, upload_pephub=upload_pephub, upload_s3=upload_s3, local_path=outfolder, overwrite=force_overwrite, - processed=not light, + processed=not lite, nofail=True, ) @@ -286,7 +286,7 @@ def insert_pep( upload_qdrant: bool = False, no_fail: bool = False, standardize_pep: bool = False, - light: bool = False, + lite: bool = False, rerun: bool = False, pm: pypiper.PipelineManager = None, ) -> None: @@ -314,7 +314,7 @@ def insert_pep( :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param bool upload_qdrant: whether to execute qdrant indexing :param bool no_fail: whether to raise an error if bedset was not added to the database - :param bool light: whether to run light version of the pipeline + :param bool lite: whether to run lite version of the pipeline :param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False) :param bool rerun: whether to rerun processed samples :param pypiper.PipelineManager pm: pypiper object @@ -389,7 +389,7 @@ def insert_pep( universe=pep_sample.get("universe"), universe_method=pep_sample.get("universe_method"), universe_bedset=pep_sample.get("universe_bedset"), - light=light, + lite=lite, pm=pm, ) @@ -416,7 +416,7 @@ def insert_pep( no_fail=no_fail, force_overwrite=force_overwrite, annotation=bedset_annotation, - light=light, + lite=lite, ) else: _LOGGER.info( @@ -484,7 +484,7 @@ def reprocess_all( upload_qdrant=True, upload_s3=True, upload_pephub=True, - light=False, + lite=False, universe=False, universe_method=None, universe_bedset=None, @@ -570,7 +570,7 @@ def reprocess_one( upload_qdrant=True, upload_s3=True, upload_pephub=True, - light=False, + lite=False, universe=False, universe_method=None, universe_bedset=None, @@ -628,5 +628,5 @@ def reprocess_bedset( } ) }, - light=False, + lite=False, ) diff --git a/bedboss/bedbuncher/bedbuncher.py b/bedboss/bedbuncher/bedbuncher.py index 80f8bc6..9550431 100644 --- a/bedboss/bedbuncher/bedbuncher.py +++ b/bedboss/bedbuncher/bedbuncher.py @@ -101,7 +101,7 @@ def run_bedbuncher( upload_s3: bool = False, no_fail: bool = False, force_overwrite: bool = False, - light: bool = False, + lite: bool = False, ) -> None: """ Add bedset to the database @@ -119,7 +119,7 @@ def run_bedbuncher( :param upload_pephub: whether to create a view in pephub :param upload_s3: whether to upload files to s3 :param force_overwrite: whether to overwrite the record in the database - :param light: whether to run the pipeline in light mode + :param lite: whether to run the pipeline in lite mode # TODO: force_overwrite is not working!!! Fix it! :return: """ @@ -164,7 +164,7 @@ def run_bedbuncher( no_fail=no_fail, overwrite=force_overwrite, annotation=annotation, - processed=not light, + processed=not lite, ) diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index 539c2a6..7de8945 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -315,7 +315,7 @@ def make_all( chrom_sizes: str = None, narrowpeak: bool = False, check_qc: bool = True, - light: bool = False, + lite: bool = False, pm: pypiper.PipelineManager = None, ) -> BedMakerOutput: """ @@ -339,7 +339,7 @@ def make_all( :param narrowpeak: whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks) :param check_qc: run quality control during bedmaking - :param light: run the pipeline in light mode (without producing bigBed files) + :param lite: run the pipeline in lite mode (without producing bigBed files) :param pm: pypiper object :return: dict with generated bed metadata - BedMakerOutput object: @@ -384,8 +384,8 @@ def make_all( f"Quality control failed for {output_path}. Error: {e}" ) try: - if light: - _LOGGER.info("Skipping bigBed generation due to light mode.") + if lite: + _LOGGER.info("Skipping bigBed generation due to lite mode.") output_bigbed = None else: output_bigbed = make_bigbed( diff --git a/bedboss/cli.py b/bedboss/cli.py index 13d78db..ffe2782 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -93,8 +93,8 @@ def run_all( False, help="Update the bedbase database with the new record if it exists. This overwrites 'force_overwrite' option", ), - light: bool = typer.Option( - False, help="Run the pipeline in light mode. [Default: False]" + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. [Default: False]" ), upload_qdrant: bool = typer.Option(False, help="Upload to Qdrant"), upload_s3: bool = typer.Option(False, help="Upload to S3"), @@ -136,7 +136,7 @@ def run_all( open_signal_matrix=open_signal_matrix, ensdb=ensdb, other_metadata=None, - light=light, + lite=lite, just_db_commit=just_db_commit, force_overwrite=force_overwrite, update=update, @@ -183,8 +183,8 @@ def run_pep( no_fail: bool = typer.Option(False, help="Do not fail on error"), license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"), standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"), - light: bool = typer.Option( - False, help="Run the pipeline in light mode. [Default: False]" + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. [Default: False]" ), rerun: bool = typer.Option(False, help="Rerun already processed samples"), # PipelineManager @@ -216,7 +216,7 @@ def run_pep( upload_qdrant=upload_qdrant, no_fail=no_fail, standardize_pep=standardize_pep, - light=light, + lite=lite, rerun=rerun, pm=create_pm( outfolder=outfolder, diff --git a/bedboss/refgenome_validator/main.py b/bedboss/refgenome_validator/main.py index 174ad1b..532693d 100644 --- a/bedboss/refgenome_validator/main.py +++ b/bedboss/refgenome_validator/main.py @@ -251,10 +251,8 @@ def determine_compatibility( for genome_model in self.genome_models: # First and Second Layer of Compatibility - model_compat_stats[ - genome_model.genome_alias - ]: CompatibilityStats = self.calculate_chrom_stats( - bed_chrom_info, genome_model.chrom_sizes + model_compat_stats[genome_model.genome_alias]: CompatibilityStats = ( + self.calculate_chrom_stats(bed_chrom_info, genome_model.chrom_sizes) ) # Third layer - IGD, only if layer 1 and layer 2 have passed @@ -266,15 +264,13 @@ def determine_compatibility( genome_model.genome_alias ].chrom_length_stats.beyond_range ): - model_compat_stats[ - genome_model.genome_alias - ].igd_stats = self.get_igd_overlaps(bedfile) + model_compat_stats[genome_model.genome_alias].igd_stats = ( + self.get_igd_overlaps(bedfile) + ) # Calculate compatibility rating - model_compat_stats[ - genome_model.genome_alias - ].compatibility = self.calculate_rating( - model_compat_stats[genome_model.genome_alias] + model_compat_stats[genome_model.genome_alias].compatibility = ( + self.calculate_rating(model_compat_stats[genome_model.genome_alias]) ) if concise: concise_dict = {} @@ -428,9 +424,9 @@ def predict(self, bedfile: str) -> Union[str, None]: """ _LOGGER.info(f"Predicting compatibility of {bedfile} with reference genomes...") - compatibility_stats: Dict[ - str, CompatibilityConcise - ] = self.determine_compatibility(bedfile, concise=True) + compatibility_stats: Dict[str, CompatibilityConcise] = ( + self.determine_compatibility(bedfile, concise=True) + ) best_rankings = [] diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index 279d9e4..f698c54 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -54,7 +54,7 @@ def another_test(): run_failed=True, run_skipped=True, reinit_skipper=True, - light=True, + lite=True, overwrite=True, overwrite_bedset=True, ) @@ -76,7 +76,7 @@ def upload_time(): # genome="hg38", rerun=True, run_skipped=True, - light=True, + lite=True, )