diff --git a/bedboss/_version.py b/bedboss/_version.py index dd9b22c..906d362 100644 --- a/bedboss/_version.py +++ b/bedboss/_version.py @@ -1 +1 @@ -__version__ = "0.5.1" +__version__ = "0.6.0" diff --git a/bedboss/bbuploader/cli.py b/bedboss/bbuploader/cli.py index 9073dbf..cb415c1 100644 --- a/bedboss/bbuploader/cli.py +++ b/bedboss/bbuploader/cli.py @@ -61,6 +61,9 @@ def upload_all( reinit_skipper: bool = typer.Option( False, help="Reinitialize skipper. [Default: False]" ), + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. [Default: False]" + ), ): from .main import upload_all as upload_all_function @@ -83,6 +86,7 @@ def upload_all( reinit_skipper=reinit_skipper, overwrite=overwrite, overwrite_bedset=overwrite_bedset, + lite=lite, ) @@ -124,6 +128,9 @@ def upload_gse( reinit_skipper: bool = typer.Option( False, help="Reinitialize skipper. [Default: False]" ), + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. [Default: False]" + ), ): from .main import upload_gse as upload_gse_function @@ -142,6 +149,7 @@ def upload_gse( reinit_skipper=reinit_skipper, overwrite=overwrite, overwrite_bedset=overwrite_bedset, + lite=lite, ) diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 9d2db3f..170f8e0 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -8,7 +8,6 @@ from pephubclient import PEPHubClient from pephubclient.helpers import MessageHandler from pephubclient.models import SearchReturnModel -from setuptools.command.egg_info import overwrite_arg from sqlalchemy import and_, select from sqlalchemy.orm import Session @@ -28,13 +27,14 @@ from bedboss.bedbuncher.bedbuncher import run_bedbuncher from bedboss.exceptions import BedBossException from bedboss.skipper import Skipper -from bedboss.utils import download_file, standardize_genome_name +from bedboss.utils import calculate_time, download_file, standardize_genome_name from bedboss.utils import standardize_pep as 
pep_standardizer _LOGGER = logging.getLogger(PKG_NAME) _LOGGER.setLevel(logging.DEBUG) +@calculate_time def upload_all( bedbase_config: str, outfolder: str = os.getcwd(), @@ -54,6 +54,7 @@ def upload_all( reinit_skipper=False, overwrite=False, overwrite_bedset=False, + lite=False, ): """ This is main function that is responsible for processing bed files from PEPHub. @@ -75,12 +76,13 @@ def upload_all( :param use_skipper: use skipper to skip already processed logged locally. Skipper creates local log of processed and failed files. :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned + :param lite: lite mode, where skipping statistic processing for memory optimization and time saving """ phc = PEPHubClient() os.makedirs(outfolder, exist_ok=True) - bbagent = BedBaseAgent(config=bedbase_config) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not lite) genome = standardize_genome_name(genome) pep_annotation_list = find_peps( @@ -154,6 +156,7 @@ def upload_all( preload=preload, overwrite=overwrite, overwrite_bedset=overwrite_bedset, + lite=lite, ) except Exception as err: _LOGGER.error( @@ -268,6 +271,7 @@ def find_peps( ) +@calculate_time def upload_gse( gse: str, bedbase_config: Union[str, BedBaseAgent], @@ -282,7 +286,8 @@ def upload_gse( use_skipper=True, reinit_skipper=False, overwrite=False, - overwrite_bedset=False, + overwrite_bedset=True, + lite=False, ): """ Upload bed files from GEO series to BedBase @@ -302,10 +307,11 @@ def upload_gse( :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned :param overwrite: overwrite existing bedfiles :param overwrite_bedset: overwrite existing bedset + :param lite: lite mode, where skipping statistic processing for memory optimization and time saving :return: None """ - bbagent = BedBaseAgent(config=bedbase_config) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not 
lite) with Session(bbagent.config.db_engine.engine) as session: _LOGGER.info(f"Processing: '{gse}'") @@ -352,6 +358,7 @@ def upload_gse( overwrite_bedset=overwrite_bedset, use_skipper=use_skipper, reinit_skipper=reinit_skipper, + lite=lite, ) except Exception as e: _LOGGER.error(f"Processing of '{gse}' failed with error: {e}") @@ -403,6 +410,7 @@ def _upload_gse( use_skipper: bool = True, reinit_skipper: bool = False, preload: bool = True, + lite=False, ) -> ProjectProcessingStatus: """ Upload bed files from GEO series to BedBase @@ -421,6 +429,7 @@ def _upload_gse( and failed files. :param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs will be :param preload: pre - download files to the local folder (used for faster reproducibility) + :param lite: lite mode, where skipping statistic processing for memory optimization and time saving :return: None """ if isinstance(bedbase_config, str): @@ -540,6 +549,7 @@ def _upload_gse( upload_s3=True, upload_qdrant=True, force_overwrite=overwrite, + lite=lite, ) uploaded_files.append(file_digest) if skipper_obj: @@ -571,6 +581,7 @@ def _upload_gse( upload_s3=True, no_fail=True, force_overwrite=overwrite_bedset, + lite=lite, ) else: diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 0f7387b..621824b 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -1,3 +1,4 @@ +import datetime import logging import os import subprocess @@ -7,10 +8,12 @@ import pephubclient import peppy import pypiper +import yaml from bbconf.bbagent import BedBaseAgent from bbconf.const import DEFAULT_LICENSE from bbconf.models.base_models import FileModel from eido import validate_project +from geniml.bbclient import BBClient from pephubclient.helpers import MessageHandler as m from pephubclient.helpers import is_registry_path @@ -29,7 +32,7 @@ ) from bedboss.refgenome_validator.main import ReferenceValidator from bedboss.skipper import Skipper -from bedboss.utils import get_genome_digest, 
standardize_genome_name +from bedboss.utils import calculate_time, get_genome_digest, standardize_genome_name from bedboss.utils import standardize_pep as pep_standardizer _LOGGER = logging.getLogger(PKG_NAME) @@ -47,6 +50,7 @@ def requirements_check() -> None: ) +@calculate_time def run_all( input_file: str, input_type: str, @@ -65,9 +69,11 @@ def run_all( other_metadata: dict = None, just_db_commit: bool = False, force_overwrite: bool = False, + update: bool = False, upload_qdrant: bool = False, upload_s3: bool = False, upload_pephub: bool = False, + lite: bool = False, # Universes universe: bool = False, universe_method: str = None, @@ -94,11 +100,13 @@ def run_all( :param dict other_metadata: a dict containing all attributes from the sample :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata [optional] (basically genomes that's not in GDdata) - :param bool just_db_commit: whether just to commit the JSON to the database (default: False) - :param bool force_overwrite: force overwrite analysis (default: False) - :param bool upload_qdrant: whether to skip qdrant indexing + :param bool just_db_commit: whether just to commit the JSON to the database [Default: False] + :param bool force_overwrite: force overwrite analysis [Default: False] + :param bool update: whether to update the record in the database [Default: False] (if True, overwrites 'force_overwrite' and ignores it) + :param bool upload_qdrant: whether to skip qdrant indexing [Default: False] :param bool upload_s3: whether to upload to s3 - :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) + :param bool upload_pephub: whether to push bedfiles and metadata to pephub [Default: False] + :param bool lite: whether to run lite version of the pipeline [Default: False] :param bool universe: whether to add the sample as the universe [Default: False] :param str universe_method: method used to create the universe [Default: None] @@ -107,7 
+115,7 @@ def run_all( :return str bed_digest: bed digest """ if isinstance(bedbase_config, str): - bbagent = BedBaseAgent(bedbase_config) + bbagent = BedBaseAgent(config=bedbase_config, init_ml=not lite) elif isinstance(bedbase_config, bbconf.BedBaseAgent): bbagent = bedbase_config else: @@ -140,22 +148,26 @@ def run_all( narrowpeak=narrowpeak, check_qc=check_qc, chrom_sizes=chrom_sizes, + lite=lite, pm=pm, ) if not other_metadata: other_metadata = {"sample_name": name} - statistics_dict = bedstat( - bedfile=bed_metadata.bed_file, - outfolder=outfolder, - genome=genome, - ensdb=ensdb, - bed_digest=bed_metadata.bed_digest, - open_signal_matrix=open_signal_matrix, - just_db_commit=just_db_commit, - rfg_config=rfg_config, - pm=pm, - ) + if lite: + statistics_dict = {} + else: + statistics_dict = bedstat( + bedfile=bed_metadata.bed_file, + outfolder=outfolder, + genome=genome, + ensdb=ensdb, + bed_digest=bed_metadata.bed_digest, + open_signal_matrix=open_signal_matrix, + just_db_commit=just_db_commit, + rfg_config=rfg_config, + pm=pm, + ) statistics_dict["bed_type"] = bed_metadata.bed_type statistics_dict["bed_format"] = bed_metadata.bed_format.value @@ -202,22 +214,42 @@ def run_all( else: ref_valid_stats = None - bbagent.bed.add( - identifier=bed_metadata.bed_digest, - stats=stats.model_dump(exclude_unset=True), - metadata=other_metadata, - plots=plots.model_dump(exclude_unset=True), - files=files.model_dump(exclude_unset=True), - classification=classification.model_dump(exclude_unset=True), - ref_validation=ref_valid_stats, - license_id=license_id, - upload_qdrant=upload_qdrant, - upload_pephub=upload_pephub, - upload_s3=upload_s3, - local_path=outfolder, - overwrite=force_overwrite, - nofail=True, - ) + if update: + bbagent.bed.update( + identifier=bed_metadata.bed_digest, + stats=stats.model_dump(exclude_unset=True), + metadata=other_metadata, + plots=plots.model_dump(exclude_unset=True), + files=files.model_dump(exclude_unset=True), + 
classification=classification.model_dump(exclude_unset=True), + ref_validation=ref_valid_stats, + license_id=license_id, + upload_qdrant=upload_qdrant and not lite, + upload_pephub=upload_pephub, + upload_s3=upload_s3, + local_path=outfolder, + overwrite=True, + processed=not lite, + nofail=True, + ) + else: + bbagent.bed.add( + identifier=bed_metadata.bed_digest, + stats=stats.model_dump(exclude_unset=True), + metadata=other_metadata, + plots=plots.model_dump(exclude_unset=True), + files=files.model_dump(exclude_unset=True), + classification=classification.model_dump(exclude_unset=True), + ref_validation=ref_valid_stats, + license_id=license_id, + upload_qdrant=upload_qdrant and not lite, + upload_pephub=upload_pephub, + upload_s3=upload_s3, + local_path=outfolder, + overwrite=force_overwrite, + processed=not lite, + nofail=True, + ) if universe: bbagent.bed.add_universe( @@ -233,6 +265,7 @@ def run_all( return bed_metadata.bed_digest +@calculate_time def insert_pep( bedbase_config: str, output_folder: str, @@ -247,11 +280,13 @@ def insert_pep( ensdb: str = None, just_db_commit: bool = False, force_overwrite: bool = False, + update: bool = False, upload_s3: bool = False, upload_pephub: bool = False, upload_qdrant: bool = False, no_fail: bool = False, standardize_pep: bool = False, + lite: bool = False, rerun: bool = False, pm: pypiper.PipelineManager = None, ) -> None: @@ -274,10 +309,12 @@ def insert_pep( :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata :param bool just_db_commit: whether save only to the database (Without saving locally ) :param bool force_overwrite: whether to overwrite the existing record + :param bool update: whether to update the record in the database. This option will overwrite the force_overwrite option. 
[Default: False] :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param bool upload_qdrant: whether to execute qdrant indexing :param bool no_fail: whether to raise an error if bedset was not added to the database + :param bool lite: whether to run lite version of the pipeline :param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False) :param bool rerun: whether to rerun processed samples :param pypiper.PipelineManager pm: pypiper object @@ -310,7 +347,6 @@ def insert_pep( skipper.reinitialize() for i, pep_sample in enumerate(pep.samples): - is_processed = skipper.is_processed(pep_sample.sample_name) if is_processed: m.print_success( @@ -346,12 +382,14 @@ def insert_pep( ensdb=ensdb, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + update=update, upload_qdrant=upload_qdrant, upload_s3=upload_s3, upload_pephub=upload_pephub, universe=pep_sample.get("universe"), universe_method=pep_sample.get("universe_method"), universe_bedset=pep_sample.get("universe_bedset"), + lite=lite, pm=pm, ) @@ -378,6 +416,7 @@ def insert_pep( no_fail=no_fail, force_overwrite=force_overwrite, annotation=bedset_annotation, + lite=lite, ) else: _LOGGER.info( @@ -390,3 +429,204 @@ def insert_pep( m.print_error(f"Failed samples: {failed_samples}") return None + + +@calculate_time +def reprocess_all( + bedbase_config: Union[str, BedBaseAgent], + output_folder: str, + limit: int = 10, + nofail: bool = False, +) -> None: + """ + Run bedboss pipeline for all unprocessed beds in the bedbase + + :param bedbase_config: bedbase configuration file path + :param output_folder: output folder of the pipeline + :param limit: limit of the number of beds to process + :param nofail: whether to raise an error if bedset was not added to the database + + :return: None + """ + + if isinstance(bedbase_config, str): + bbagent = 
BedBaseAgent(config=bedbase_config) + elif isinstance(bedbase_config, bbconf.BedBaseAgent): + bbagent = bedbase_config + else: + raise BedBossException("Incorrect bedbase_config type. Exiting...") + + unprocessed_beds = bbagent.bed.get_unprocessed(limit=limit) + + bbclient = BBClient() + failed_samples = [] + for bed_annot in unprocessed_beds.results: + bed_file = bbclient.load_bed(bed_annot.id) + + try: + run_all( + input_file=bed_file.path, + input_type="bed", + outfolder=output_folder, + genome=bed_annot.genome_alias, + bedbase_config=bbagent, + name=bed_annot.name, + license_id=bed_annot.license_id, + rfg_config=None, + check_qc=False, + validate_reference=True, + chrom_sizes=None, + open_signal_matrix=None, + ensdb=None, + other_metadata=None, + just_db_commit=False, + update=True, + upload_qdrant=True, + upload_s3=True, + upload_pephub=True, + lite=False, + universe=False, + universe_method=None, + universe_bedset=None, + pm=None, + ) + except Exception as e: + _LOGGER.error(f"Failed to process {bed_annot.name}. See {e}") + if not nofail: # nofail=True means: record the failure and keep processing + raise BedBossException(f"Failed to process {bed_annot.name}.
See {e}") from e + + failed_samples.append( + { + "id": bed_annot.id, + "error": str(e), # plain text keeps the YAML report loadable with safe_load + } + ) + + if failed_samples: + date_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + with open( + os.path.join(output_folder, f"failed_samples_{date_now}.yaml"), "w" + ) as file: + yaml.dump(failed_samples, file) + + m.print_warning(f"Logs with failed samples are saved in {output_folder}") + + m.print_success(f"Processing completed successfully") + + print_values = dict( + unprocessed_files=unprocessed_beds.count, + processing_files=unprocessed_beds.limit, + failed_files=len(failed_samples), + success_files=unprocessed_beds.limit - len(failed_samples), + ) + print(print_values) + + +@calculate_time +def reprocess_one( + bedbase_config: Union[str, BedBaseAgent], + output_folder: str, + identifier: str, +) -> None: + """ + Run bedboss pipeline for one bed in the bedbase [Reprocess] + + :param bedbase_config: bedbase configuration file path + :param output_folder: output folder of the pipeline + :param identifier: bed identifier + + :return: None + """ + + if isinstance(bedbase_config, str): + bbagent = BedBaseAgent(config=bedbase_config) + elif isinstance(bedbase_config, bbconf.BedBaseAgent): + bbagent = bedbase_config + else: + raise BedBossException("Incorrect bedbase_config type.
Exiting...") + + bbclient = BBClient() + + bed_annot = bbagent.bed.get(identifier) + bed_file = bbclient.load_bed(bed_annot.id) + + run_all( + input_file=bed_file.path, + input_type="bed", + outfolder=output_folder, + genome=bed_annot.genome_alias, + bedbase_config=bbagent, + name=bed_annot.name, + license_id=bed_annot.license_id, + rfg_config=None, + check_qc=False, + validate_reference=True, + chrom_sizes=None, + open_signal_matrix=None, + ensdb=None, + other_metadata=None, + just_db_commit=False, + update=True, + upload_qdrant=True, + upload_s3=True, + upload_pephub=True, + lite=False, + universe=False, + universe_method=None, + universe_bedset=None, + pm=None, + ) + + _LOGGER.info(f"Successfully processed {identifier}") + + +@calculate_time +def reprocess_bedset( + bedbase_config: Union[str, BedBaseAgent], + output_folder: str, + identifier: str, + no_fail: bool = True, + heavy: bool = False, +): + """ + Recalculate bedset from the bedbase + + :param bedbase_config: bedbase configuration file path + :param output_folder: output folder of the pipeline + :param identifier: bedset identifier + :param no_fail: whether to raise an error if bedset was not added to the database + :param heavy: whether to use heavy processing. Calculate plots for bedset + + :return: None + """ + + if isinstance(bedbase_config, str): + bbagent = BedBaseAgent(config=bedbase_config) + elif isinstance(bedbase_config, bbconf.BedBaseAgent): + bbagent = bedbase_config + else: + raise BedBossException("Incorrect bedbase_config type. 
Exiting...") + + bedset_annot = bbagent.bedset.get(identifier) + + run_bedbuncher( + bedbase_config=bbagent, + record_id=bedset_annot.id, + bed_set=bedset_annot.bed_ids, + name=bedset_annot.name, + output_folder=output_folder, + description=bedset_annot.description, + heavy=heavy, + upload_pephub=False, + upload_s3=heavy, + no_fail=no_fail, + force_overwrite=True, + annotation={ + **bedset_annot.model_dump( + exclude={ + "bed_ids", + } + ) + }, + lite=False, + ) diff --git a/bedboss/bedbuncher/bedbuncher.py b/bedboss/bedbuncher/bedbuncher.py index e2c6a3f..9550431 100644 --- a/bedboss/bedbuncher/bedbuncher.py +++ b/bedboss/bedbuncher/bedbuncher.py @@ -101,6 +101,7 @@ def run_bedbuncher( upload_s3: bool = False, no_fail: bool = False, force_overwrite: bool = False, + lite: bool = False, ) -> None: """ Add bedset to the database @@ -118,6 +119,7 @@ def run_bedbuncher( :param upload_pephub: whether to create a view in pephub :param upload_s3: whether to upload files to s3 :param force_overwrite: whether to overwrite the record in the database + :param lite: whether to run the pipeline in lite mode # TODO: force_overwrite is not working!!! Fix it! 
:return: """ @@ -162,6 +164,7 @@ def run_bedbuncher( no_fail=no_fail, overwrite=force_overwrite, annotation=annotation, + processed=not lite, ) diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index d3c66b2..7de8945 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -315,6 +315,7 @@ def make_all( chrom_sizes: str = None, narrowpeak: bool = False, check_qc: bool = True, + lite: bool = False, pm: pypiper.PipelineManager = None, ) -> BedMakerOutput: """ @@ -338,6 +339,7 @@ def make_all( :param narrowpeak: whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks) :param check_qc: run quality control during bedmaking + :param lite: run the pipeline in lite mode (without producing bigBed files) :param pm: pypiper object :return: dict with generated bed metadata - BedMakerOutput object: @@ -382,15 +384,19 @@ def make_all( f"Quality control failed for {output_path}. Error: {e}" ) try: - output_bigbed = make_bigbed( - bed_path=output_bed, - output_path=output_path, - genome=genome, - bed_type=bed_type, - rfg_config=rfg_config, - chrom_sizes=chrom_sizes, - pm=pm, - ) + if lite: + _LOGGER.info("Skipping bigBed generation due to lite mode.") + output_bigbed = None + else: + output_bigbed = make_bigbed( + bed_path=output_bed, + output_path=output_path, + genome=genome, + bed_type=bed_type, + rfg_config=rfg_config, + chrom_sizes=chrom_sizes, + pm=pm, + ) except BedBossException: output_bigbed = None if pm_clean: diff --git a/bedboss/cli.py b/bedboss/cli.py index 670ef2d..ffe2782 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -89,6 +89,13 @@ def run_all( force_overwrite: bool = typer.Option( False, help="Force overwrite the output files" ), + update: bool = typer.Option( + False, + help="Update the bedbase database with the new record if it exists. This overwrites 'force_overwrite' option", + ), + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. 
[Default: False]" + ), upload_qdrant: bool = typer.Option(False, help="Upload to Qdrant"), upload_s3: bool = typer.Option(False, help="Upload to S3"), upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"), @@ -129,8 +136,10 @@ def run_all( open_signal_matrix=open_signal_matrix, ensdb=ensdb, other_metadata=None, + lite=lite, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + update=update, upload_qdrant=upload_qdrant, upload_s3=upload_s3, upload_pephub=upload_pephub, @@ -164,12 +173,19 @@ def run_pep( force_overwrite: bool = typer.Option( False, help="Force overwrite the output files" ), + update: bool = typer.Option( + False, + help="Update the bedbase database with the new record if it exists. This overwrites 'force_overwrite' option", + ), upload_qdrant: bool = typer.Option(True, help="Upload to Qdrant"), upload_s3: bool = typer.Option(True, help="Upload to S3"), upload_pephub: bool = typer.Option(True, help="Upload to PEPHub"), no_fail: bool = typer.Option(False, help="Do not fail on error"), license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"), standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"), + lite: bool = typer.Option( + False, help="Run the pipeline in lite mode. 
[Default: False]" + ), rerun: bool = typer.Option(False, help="Rerun already processed samples"), # PipelineManager multi: bool = typer.Option(False, help="Run multiple samples"), @@ -193,12 +209,14 @@ def run_pep( ensdb=ensdb, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + update=update, license_id=license_id, upload_s3=upload_s3, upload_pephub=upload_pephub, upload_qdrant=upload_qdrant, no_fail=no_fail, standardize_pep=standardize_pep, + lite=lite, rerun=rerun, pm=create_pm( outfolder=outfolder, @@ -210,6 +228,75 @@ def run_pep( ) +@app.command(help="Run unprocessed files, or reprocess them") +def reprocess_all( + bedbase_config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + outfolder: str = typer.Option(..., help="Path to the output folder"), + limit: int = typer.Option(100, help="Limit the number of files to reprocess"), + no_fail: bool = typer.Option(True, help="Do not fail on error"), +): + from bedboss.bedboss import reprocess_all as reprocess_all_function + + reprocess_all_function( + bedbase_config=bedbase_config, + output_folder=outfolder, + limit=limit, + nofail=no_fail, + ) + + +@app.command(help="Run unprocessed file, or reprocess it [Only 1 file]") +def reprocess_one( + bedbase_config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + outfolder: str = typer.Option(..., help="Path to the output folder"), + identifier: str = typer.Option(..., help="Identifier of the bed file"), +): + from bedboss.bedboss import reprocess_one as reprocess_one_function + + reprocess_one_function( + bedbase_config=bedbase_config, + output_folder=outfolder, + identifier=identifier, + ) + + +@app.command(help="Reprocess a bedset") +def reprocess_bedset( + bedbase_config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + outfolder: str =
typer.Option(..., help="Path to the output folder"), + identifier: str = typer.Option(..., help="Bedset ID"), + no_fail: bool = typer.Option(True, help="Do not fail on error"), + heavy: bool = typer.Option(False, help="Run the heavy version of the pipeline"), +): + from bedboss.bedboss import reprocess_bedset as reprocess_bedset_function + + reprocess_bedset_function( + bedbase_config=bedbase_config, + output_folder=outfolder, + identifier=identifier, + no_fail=no_fail, + heavy=heavy, + ) + + @app.command(help=f"Create a bed files form a [{', '.join(options_list)}] file") def make_bed( input_file: str = typer.Option( diff --git a/bedboss/skipper.py b/bedboss/skipper.py index 12f118c..59fb58c 100644 --- a/bedboss/skipper.py +++ b/bedboss/skipper.py @@ -4,7 +4,6 @@ class Skipper: - def __init__(self, output_path: str, name: str): self.output_path = output_path self.name = name diff --git a/bedboss/utils.py b/bedboss/utils.py index 1231a4b..a8d6829 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -1,7 +1,9 @@ import glob import logging import os +import time import urllib.request +from functools import wraps import peppy import requests @@ -24,6 +26,8 @@ def standardize_genome_name(input_genome: str, bedfile: str = None) -> str: :param bedfile: path to bed file :return: genome name string """ + if not isinstance(input_genome, str): + input_genome = "" input_genome = input_genome.strip().lower() # TODO: we have to add more genome options and preprocessing of the string if input_genome == "hg38" or input_genome == "grch38": @@ -35,7 +39,7 @@ def standardize_genome_name(input_genome: str, bedfile: str = None) -> str: elif input_genome == "mm9" or input_genome == "grcm37": return "mm9" - elif not input_genome or len(input_genome) > 10: + elif not input_genome or len(input_genome) > 7: if bedfile: predictor = ReferenceValidator() return predictor.predict(bedfile) or "" @@ -199,3 +203,23 @@ def cleanup_pm_temp(pm: PipelineManager) -> None: except Exception as e: 
_LOGGER.error(f"Error cleaning up: {e}") pm.cleanup_list_conditional = [] + + +def calculate_time(func): + @wraps(func) + def wrapper(*args, **kwargs): + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + execution_time = end_time - start_time + + hours, remainder = divmod(execution_time, 3600) + minutes, seconds = divmod(remainder, 60) + + print( + f"Function '{func.__name__}' executed in {int(hours)} hours, {int(minutes)} minutes, and {seconds:.2f} seconds" + ) + + return result + + return wrapper diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index bf90d2d..3e2a2ed 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -5,8 +5,9 @@ peppy>=0.40.7 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.3 +pybiocfilecache==0.4.1 # TODO: remove after geniml release # bbconf>=0.8.1 -bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf +bbconf @ git+https://github.com/databio/bbconf.git@partial_processing#egg=bbconf refgenconf>=0.12.2 pandas>=2.0.0 ubiquerg>=0.6.2 diff --git a/scripts/all/run_all.py b/scripts/all/run_all.py new file mode 100644 index 0000000..7ad2e1f --- /dev/null +++ b/scripts/all/run_all.py @@ -0,0 +1,23 @@ +def unprocessed_run(): + from bedboss.bedboss import reprocess_all + + reprocess_all( + bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", + limit=10, + output_folder="/home/bnt4me/virginia/repos/bbuploader/data", + ) + + +def reprocess_one(): + from bedboss.bedboss import reprocess_one + + reprocess_one( + identifier="a0f1889fd8026780df8bba6a8ddac00e", + bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", + output_folder="/home/bnt4me/virginia/repos/bbuploader/data", + ) + + +if __name__ == "__main__": + # unprocessed_run() + reprocess_one() diff --git a/scripts/bb_text_search/main.py b/scripts/bb_text_search/main.py index f6e0609..adc7dfd 100644 ---
a/scripts/bb_text_search/main.py +++ b/scripts/bb_text_search/main.py @@ -18,7 +18,6 @@ def upload_text_embeddings(): - # lab qdrant client # qc = QdrantClient( # host=os.environ.get("QDRATN_HOST"), diff --git a/scripts/bb_text_search/search_test.py b/scripts/bb_text_search/search_test.py index 9e84fa2..34f5845 100644 --- a/scripts/bb_text_search/search_test.py +++ b/scripts/bb_text_search/search_test.py @@ -3,7 +3,6 @@ def search_test(): - # backend for text embeddings and bed embeddings text_backend = QdrantBackend( dim=384, diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index 2340569..f698c54 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -27,11 +27,18 @@ def runn(): def another_test(): + # time it: + import time + from bedboss.bbuploader.main import upload_gse + time1 = time.time() upload_gse( # gse="gse261411", - gse="gse261536", + # gse="gse261536", + # gse="gse274130", + # Genome hg19 and mm10 + gse="gse280839", # gse="gse246900", # gse="gse247593", # gse="gse241222", @@ -47,7 +54,12 @@ def another_test(): run_failed=True, run_skipped=True, reinit_skipper=True, + lite=True, + overwrite=True, + overwrite_bedset=True, ) + time2 = time.time() + print(f"Time taken: {time2 - time1}") def upload_time(): @@ -56,14 +68,15 @@ def upload_time(): upload_all( bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", outfolder="/home/bnt4me/virginia/repos/bbuploader/data", - start_date="2024/06/01", - # end_date="2024/08/28", + start_date="2020/06/01", + end_date="2020/07/15", search_limit=1000, download_limit=10000, search_offset=0, - genome="hg38", + # genome="hg38", rerun=True, run_skipped=True, + lite=True, ) diff --git a/scripts/ref_genome_validating/grab_chrom_sizes.py b/scripts/ref_genome_validating/grab_chrom_sizes.py index 46914ff..0cb6934 100644 --- a/scripts/ref_genome_validating/grab_chrom_sizes.py +++ b/scripts/ref_genome_validating/grab_chrom_sizes.py @@ -5,7 +5,6 @@ def main(): - # 
file_path = "/home/drc/Downloads/ncbi_ref_genome/ncbi_dataset/GCF_000001405.40_GRCh38.p14_genomic.fa" file_path = "/home/drc/Downloads/backup ref genome/GCA_000001405.29.fasta" FastaFile = open(file_path, "r") @@ -14,7 +13,6 @@ def main(): "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ensembl_hg38.chrom.sizes", "w", ) as file: - for rec in SeqIO.parse(FastaFile, "fasta"): name = rec.id seq = rec.seq diff --git a/scripts/ref_genome_validating/process_exclude_ranges.py b/scripts/ref_genome_validating/process_exclude_ranges.py index b43cdc2..fc12c8c 100644 --- a/scripts/ref_genome_validating/process_exclude_ranges.py +++ b/scripts/ref_genome_validating/process_exclude_ranges.py @@ -40,7 +40,6 @@ def main(species): print("Must supply species,e.g. mouse, homosapiens, rat, cow!") else: - # Make sure to have the IDE ignore these folders!!!! data_output_path = os.path.abspath("data") results_path = os.path.abspath("results") diff --git a/scripts/ref_genome_validating/stats_compat_testing.py b/scripts/ref_genome_validating/stats_compat_testing.py index 023b35f..cbf72dc 100644 --- a/scripts/ref_genome_validating/stats_compat_testing.py +++ b/scripts/ref_genome_validating/stats_compat_testing.py @@ -18,7 +18,6 @@ def main(): - # pull from tier rating column to get the final assessment tier_rating_keys = ["mm10", "hg38", "hg19"]