From bb957c7db7074494a89fc1fbcfe4a5d63d91435b Mon Sep 17 00:00:00 2001
From: George Grant
Date: Wed, 8 May 2024 06:49:38 -0400
Subject: [PATCH 1/2] VS-1335 bgzip on extract ah var store edition (#8809)

* Optionally extract to bgz format.

* Set bgzipping to be off (everywhere) by default.

* Update assert_identical_outputs to handle bgzipped outputs.
---
 .../variantstore/docs/aou/AOU_DELIVERABLES.md |  1 +
 scripts/variantstore/wdl/GvsExtractCallset.wdl |  8 +++++---
 .../wdl/GvsExtractCohortFromSampleNames.wdl   |  2 ++
 .../wdl/GvsJointVariantCalling.wdl            |  2 ++
 .../wdl/GvsQuickstartHailIntegration.wdl      |  6 +++++-
 .../wdl/GvsQuickstartIntegration.wdl          |  2 ++
 .../wdl/GvsQuickstartVcfIntegration.wdl       | 18 +++++++++++-------
 7 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
index 31993322c71..4636408944a 100644
--- a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
+++ b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
     - Specify the `interval_weights_bed` appropriate for the PGEN / VCF extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart.
     - For both `GvsExtractCallset` and `GvsExtractCallsetPgenMerged`, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5
     - For `GvsExtractCallset`, make sure to specify the appropriate `maximum_alternate_alleles` value (currently 100).
+    - For `GvsExtractCallset`, set the `bgzip_output_vcfs` input to `true` to generate VCFs that are compressed using bgzip.
     - These workflows do not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
 1. `GvsCalculatePrecisionAndSensitivity` workflow
     - Please see the detailed instructions for running the Precision and Sensitivity workflow [here](../../tieout/AoU_PRECISION_SENSITIVITY.md).
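For reference, the `.vcf.bgz` outputs produced when `bgzip_output_vcfs` is `true` are block-gzipped (BGZF), so they remain readable by ordinary gzip tooling and can additionally be indexed with `tabix`. The following is a minimal local sketch of that behavior, assuming htslib's `bgzip` and `tabix` are on the PATH and using a hypothetical coordinate-sorted `cohort.vcf`:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Block-compress a VCF the same way the extract writes its .vcf.bgz shards.
bgzip --stdout cohort.vcf > cohort.vcf.bgz

# BGZF is a valid gzip stream, so plain gzip/zcat can still read it.
gzip -t cohort.vcf.bgz
zcat cohort.vcf.bgz | head -5

# Unlike output from plain gzip, the bgzipped file can be tabix-indexed.
tabix -p vcf cohort.vcf.bgz
```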
diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl
index d585a0f356c..501f3807f23 100644
--- a/scripts/variantstore/wdl/GvsExtractCallset.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl
 workflow GvsExtractCallset {
         Int? scatter_count
         Int? extract_memory_override_gib
         Int? disk_override
+        Boolean bgzip_output_vcfs = false
         Boolean zero_pad_output_vcf_filenames = true

         # set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes

     Boolean emit_pls = false
     Boolean emit_ads = true

-    String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.vcf.gz.interval_list' else '-scattered.interval_list'
+    String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.interval_list' else '-scattered.interval_list'
+    String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'

     if (!defined(git_hash) || !defined(gatk_docker) || !defined(cloud_sdk_docker) || !defined(variants_docker)) {
         call Utils.GetToolVersions {

     scatter(i in range(length(SplitIntervals.interval_files))) {
         String interval_filename = basename(SplitIntervals.interval_files[i])
-        String vcf_filename = if (zero_pad_output_vcf_filenames) then sub(interval_filename, ".interval_list", "") else "~{output_file_base_name}_${i}.vcf.gz"
+        String vcf_filename = if (zero_pad_output_vcf_filenames) then sub(interval_filename, ".interval_list", "") else "~{output_file_base_name}_${i}"

         call ExtractTask {
             input:

                 fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table,
                 filter_set_name = filter_set_name,
                 drop_state = drop_state,
-                output_file = vcf_filename,
+                output_file = vcf_filename + vcf_extension,
                 output_gcs_dir = output_gcs_dir,
                 max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
                 extract_preemptible_override = extract_preemptible_override,
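The net effect of the filename changes above: the shard VCF name is now built by stripping `.interval_list` from the scatter interval filename (or falling back to `~{output_file_base_name}_${i}`), and the extension selected from `bgzip_output_vcfs` is appended when the name is handed to `ExtractTask`. A small bash sketch of the same derivation, under the assumption of a hypothetical shard name:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

bgzip_output_vcfs=true
interval_filename="0001-my_cohort.interval_list"   # hypothetical scatter shard

# Mirrors: String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'
if [[ "${bgzip_output_vcfs}" == "true" ]]; then
    vcf_extension=".vcf.bgz"
else
    vcf_extension=".vcf.gz"
fi

# Mirrors: sub(interval_filename, ".interval_list", "") + vcf_extension
output_file="${interval_filename%.interval_list}${vcf_extension}"
echo "${output_file}"   # -> 0001-my_cohort.vcf.bgz
```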
diff --git a/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl b/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
index 00ab98d330c..0e936917979 100644
--- a/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
 workflow GvsExtractCohortFromSampleNames {
     String? output_gcs_dir
     # set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes
     String drop_state = "NONE"
+    Boolean bgzip_output_vcfs = false
     File? interval_list
     Int? extract_preemptible_override

         output_gcs_dir = output_gcs_dir,
         drop_state = drop_state,
+        bgzip_output_vcfs = bgzip_output_vcfs,
         extract_preemptible_override = extract_preemptible_override,
         extract_maxretries_override = extract_maxretries_override,
         split_intervals_disk_size_override = split_intervals_disk_size_override,

diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
index 57fbcaa8c64..dfab2a8f76c 100644
--- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
+++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
 workflow GvsJointVariantCalling {
     String project_id
     String call_set_identifier
     String? extract_output_gcs_dir
+    Boolean bgzip_output_vcfs = false
     String drop_state = "FORTY"
     Boolean use_classic_VQSR = false
     Boolean use_compressed_references = false

         split_intervals_mem_override = split_intervals_mem_override,
         do_not_filter_override = extract_do_not_filter_override,
         drop_state = drop_state,
+        bgzip_output_vcfs = bgzip_output_vcfs,
         is_wgs = is_wgs,
         maximum_alternate_alleles = maximum_alternate_alleles,
     }

diff --git a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
index 68aff1021ce..91710eae55d 100644
--- a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
 workflow GvsQuickstartHailIntegration {
     Boolean extract_do_not_filter_override
     String dataset_suffix = "hail"
     Boolean use_default_dockers = false
+    Boolean bgzip_output_vcfs = false
     String? basic_docker
     String? cloud_sdk_docker

         use_default_dockers = use_default_dockers,
         check_expected_cost_and_table_size_outputs = false,
         gatk_override = gatk_override,
+        bgzip_output_vcfs = bgzip_output_vcfs,
         is_wgs = is_wgs,
         interval_list = interval_list,
         expected_output_prefix = expected_output_prefix,

         vds_path = GvsExtractAvroFilesForHail.vds_output_path,
         tieout_vcfs = GvsQuickstartVcfIntegration.output_vcfs,
         tieout_vcf_indexes = GvsQuickstartVcfIntegration.output_vcf_indexes,
+        tieout_vcf_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz",
         cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
         hail_version = effective_hail_version,
     }

 task TieOutVds {
         String vds_path
         Array[File] tieout_vcfs
         Array[File] tieout_vcf_indexes
+        String tieout_vcf_suffix
         String cloud_sdk_slim_docker
         String hail_version
     }

         export JOINED_MATRIX_TABLE_PATH=${WORK}/joined.mt

-        python3 ./hail_join_vds_vcfs.py --vds-path ${VDS_PATH} --joined-matrix-table-path ${JOINED_MATRIX_TABLE_PATH} *.vcf.gz
+        python3 ./hail_join_vds_vcfs.py --vds-path ${VDS_PATH} --joined-matrix-table-path ${JOINED_MATRIX_TABLE_PATH} *.vcf~{tieout_vcf_suffix}

         pip install pytest
         ln -s ${WORK}/joined.mt .
diff --git a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
index 58e1b7331d1..f038ae69b5b 100644
--- a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
     # necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow,
     # though in practice likely they are the same.
     if (run_hail_integration) {
+        # This test workflow is probably the most representative of the AoU workflow; parameters used here should match those used for AoU callsets.
         call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRLiteIntegration {
             input:
                 git_branch_or_tag = git_branch_or_tag,

                 vcf_files_column_name = wgs_vcf_files_column_name,
                 vcf_index_files_column_name = wgs_vcf_index_files_column_name,
                 sample_set_name = select_first([wgs_sample_set_name, "wgs_integration_sample_set"]),
+                bgzip_output_vcfs = true,
                 basic_docker = effective_basic_docker,
                 cloud_sdk_docker = effective_cloud_sdk_docker,
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,

diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
index 02b39cfa048..570242cfc49 100644
--- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
 workflow GvsQuickstartVcfIntegration {
     Boolean use_compressed_references = false
     Boolean load_vcf_headers = false
     String drop_state = "FORTY"
+    Boolean bgzip_output_vcfs = false
     String dataset_suffix
     Boolean is_wgs = true
     File? interval_list

             # (and the initial version of this integration test does not allow for inexact matching of actual and expected results.)
             extract_do_not_filter_override = extract_do_not_filter_override,
             drop_state = drop_state,
+            bgzip_output_vcfs = bgzip_output_vcfs,
             is_wgs = is_wgs,
             interval_list = interval_list,
             sample_id_column_name = sample_id_column_name,

     call AssertIdenticalOutputs {
         input:
             expected_output_prefix = expected_prefix,
+            expected_output_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz",
             actual_vcfs = JointVariantCalling.output_vcfs,
-            cloud_sdk_docker = effective_cloud_sdk_docker,
+            gatk_docker = effective_gatk_docker
     }

     if (check_expected_cost_and_table_size_outputs) {

 task AssertIdenticalOutputs {
     input {
         String expected_output_prefix
+        String expected_output_suffix
         Array[File] actual_vcfs
-        String cloud_sdk_docker
+        String gatk_docker
     }
     parameter_meta {
         actual_vcfs: {

         # Download all the expected data
         mkdir expected
         cd expected
-        gcloud storage cp -r "${expected_prefix}"'/*.vcf.gz' .
-        gzip -d *.gz
+        gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' .
+        gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix}
         cd ..
         mkdir actual

         cat actual_manifest.txt | gcloud storage cp -I .

         # Unzip actual result data.
-        ls -1 | grep -E '\.vcf\.gz$' | xargs gzip -d
+        ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d
         cd ..

         echo "Header Check"

     >>>

     runtime {
-        docker: cloud_sdk_docker
+        docker: gatk_docker
         disks: "local-disk 500 HDD"
     }

             DIFF_FOUND=$(echo $EXP_BYTES $OBS_BYTES | awk '{print ($1-$2)/$1}')
         fi

-        if ! awk "BEGIN{ exit ($DIFF_FOUND -gt $TOLERANCE) }"
+        if ! awk "BEGIN{ exit ($DIFF_FOUND > $TOLERANCE) }"
         then
             echo "FAIL!!! The relative difference between these is $DIFF_FOUND, which is greater than the allowed tolerance ($TOLERANCE)"
             echo "1" > ret_val.txt
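On the tolerance check above: awk compares numbers with `>` (`-gt` is shell `test` syntax, not valid awk), and the `exit (condition)` plus leading `!` combination makes the failure branch run only when the relative difference exceeds the tolerance. A simplified standalone sketch of the same pattern, using hypothetical byte counts and taking the absolute relative difference:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

TOLERANCE=0.05        # allow 5% relative drift
EXP_BYTES=1000000     # hypothetical expected size
OBS_BYTES=1030000     # hypothetical observed size

# Absolute relative difference, computed in awk for floating-point math.
DIFF_FOUND=$(echo "$EXP_BYTES $OBS_BYTES" | awk '{ d = ($1 - $2) / $1; if (d < 0) d = -d; print d }')

# awk exits non-zero when DIFF_FOUND > TOLERANCE, so the negated test
# takes the failure branch only in that case.
if ! awk "BEGIN{ exit ($DIFF_FOUND > $TOLERANCE) }"
then
    echo "FAIL: relative difference $DIFF_FOUND exceeds tolerance $TOLERANCE"
    exit 1
fi
echo "OK: relative difference $DIFF_FOUND is within tolerance $TOLERANCE"
```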
From df03bd77c51419a7446dc7ab17583465df241850 Mon Sep 17 00:00:00 2001
From: George Grant
Date: Wed, 8 May 2024 10:07:00 -0400
Subject: [PATCH 2/2] VS_1327 ensure sample ids are unique (#8818)

* Have GvsAssignIds.wdl validate that input sample names (in the provided input file) are unique.
---
 scripts/variantstore/wdl/GvsAssignIds.wdl    | 55 +++++++++++++++++++-
 scripts/variantstore/wdl/GvsCreateTables.wdl |  9 ++--
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl
index d7ceebe8e11..1edc7b94b4b 100644
--- a/scripts/variantstore/wdl/GvsAssignIds.wdl
+++ b/scripts/variantstore/wdl/GvsAssignIds.wdl
     String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
     String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash])

+    call ValidateSamples {
+        input:
+            sample_names_file = external_sample_names,
+            cloud_sdk_docker = effective_cloud_sdk_docker,
+    }
+
     call GvsCreateTables.CreateTables as CreateSampleInfoTable {
         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "sample_info",
             schema_json = sample_info_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "sample_load_status",
             schema_json = sample_load_status_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "vcf_header_lines_scratch",
             schema_json = vcf_header_lines_scratch_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "vcf_header_lines",
             schema_json = vcf_header_lines_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "sample_vcf_header",
             schema_json = sample_vcf_header_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             cloud_sdk_docker = effective_cloud_sdk_docker,
     }

         String sample_info_table
         File sample_names
         Boolean samples_are_controls
-        String table_creation_done
+        Boolean table_creation_done
         String cloud_sdk_docker
     }
     meta {

 task CreateCostObservabilityTable {
     input {
         String project_id
         String dataset_name
+        Boolean go
         String cloud_sdk_docker
     }

     }
 }

+task ValidateSamples {
+    input {
+        File sample_names_file
+        String cloud_sdk_docker
+    }
+
+    command <<<
+        # Prepend date, time and pwd to xtrace log entries.
+        PS4='\D{+%F %T} \w $ '
+        set -o errexit -o nounset -o pipefail -o xtrace
+
+        if [[ ! -s ~{sample_names_file} ]]
+        then
+            echo "ERROR: The input file ~{sample_names_file} is empty"
+            exit 1;
+        fi
+
+        sort ~{sample_names_file} | uniq -d > output.txt
+        if [[ -s output.txt ]]
+        then
+            echo "ERROR: The input file ~{sample_names_file} contains the following duplicate entries:"
+            cat output.txt
+            exit 1;
+        fi
+
+    >>>
+
+    runtime {
+        docker: cloud_sdk_docker
+        memory: "3 GB"
+        cpu: "1"
+        preemptible: 1
+        maxRetries: 0
+        disks: "local-disk 100 HDD"
+    }
+
+    output {
+        Boolean done = true
+    }
+}
\ No newline at end of file
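The duplicate check in `ValidateSamples` relies on `sort | uniq -d`, which prints only lines that occur more than once in the sorted input. A quick local illustration of the same two checks, using a hypothetical sample-name file with one duplicate:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Hypothetical sample-name list containing a duplicate.
cat > sample_names.txt <<EOF
sample_A
sample_B
sample_A
sample_C
EOF

# Same check the task performs: an empty input file is an error...
if [[ ! -s sample_names.txt ]]; then
    echo "ERROR: sample_names.txt is empty"
    exit 1
fi

# ...and any duplicated name is reported and fails the run.
sort sample_names.txt | uniq -d > duplicates.txt
if [[ -s duplicates.txt ]]; then
    echo "ERROR: duplicate sample names found:"
    cat duplicates.txt   # prints: sample_A
    exit 1
fi
echo "All sample names are unique."
```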
diff --git a/scripts/variantstore/wdl/GvsCreateTables.wdl b/scripts/variantstore/wdl/GvsCreateTables.wdl
index 743fd28891a..f144c8d7194 100644
--- a/scripts/variantstore/wdl/GvsCreateTables.wdl
+++ b/scripts/variantstore/wdl/GvsCreateTables.wdl
         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = true,
             datatype = "vet",
             max_table_id = max_table_id,
             schema_json = vet_schema_json,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = true,
             datatype = "ref_ranges",
             max_table_id = max_table_id,
             schema_json = ref_ranges_schema_used,

     }

     output {
-        String vetDone = CreateVetTables.done
-        String refDone = CreateRefRangesTables.done
+        Boolean vetDone = CreateVetTables.done
+        Boolean refDone = CreateRefRangesTables.done
         String recorded_git_hash = effective_git_hash
     }
 }

 task CreateTables {
     input {
         String project_id
         String dataset_name
+        Boolean go
         String datatype
         Int max_table_id
         String schema_json

     >>>

     output {
-        String done = "true"
+        Boolean done = true
     }

     runtime {