From bb957c7db7074494a89fc1fbcfe4a5d63d91435b Mon Sep 17 00:00:00 2001
From: George Grant
Date: Wed, 8 May 2024 06:49:38 -0400
Subject: [PATCH 1/2] VS-1335 bgzip on extract ah var store edition (#8809)

* Optionally extract to bgz format.

* Set bgzipping to be off (everywhere) by default.

* Update assert_identical_outputs to handle bgzipped outputs.
---
 .../variantstore/docs/aou/AOU_DELIVERABLES.md |  1 +
 scripts/variantstore/wdl/GvsExtractCallset.wdl |  8 +++++---
 .../wdl/GvsExtractCohortFromSampleNames.wdl   |  2 ++
 .../wdl/GvsJointVariantCalling.wdl            |  2 ++
 .../wdl/GvsQuickstartHailIntegration.wdl      |  6 +++++-
 .../wdl/GvsQuickstartIntegration.wdl          |  2 ++
 .../wdl/GvsQuickstartVcfIntegration.wdl       | 18 +++++++++++-------
 7 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
index 31993322c71..4636408944a 100644
--- a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
+++ b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
     - Specify the `interval_weights_bed` appropriate for the PGEN / VCF extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart.
     - For both `GvsExtractCallset` and `GvsExtractCallsetPgenMerged`, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5
     - For `GvsExtractCallset`, make sure to specify the appropriate `maximum_alternate_alleles` value (currently 100).
+    - For `GvsExtractCallset`, set the `bgzip_output_vcfs` input to `true` to generate VCFs that are compressed using bgzip.
     - These workflows do not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
 1. `GvsCalculatePrecisionAndSensitivity` workflow
     - Please see the detailed instructions for running the Precision and Sensitivity workflow [here](../../tieout/AoU_PRECISION_SENSITIVITY.md).
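For reference, the `.vcf.bgz` outputs produced when `bgzip_output_vcfs` is `true` are block-gzipped (BGZF), so they remain readable by ordinary gzip tooling and can additionally be indexed with `tabix`. The following is a minimal local sketch of that behavior, assuming htslib's `bgzip` and `tabix` are on the PATH and using a hypothetical coordinate-sorted `cohort.vcf`:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Block-compress a VCF the same way the extract writes its .vcf.bgz shards.
bgzip --stdout cohort.vcf > cohort.vcf.bgz

# BGZF is a valid gzip stream, so plain gzip/zcat can still read it.
gzip -t cohort.vcf.bgz
zcat cohort.vcf.bgz | head -5

# Unlike output from plain gzip, the bgzipped file can be tabix-indexed.
tabix -p vcf cohort.vcf.bgz
```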
diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl
index d585a0f356c..501f3807f23 100644
--- a/scripts/variantstore/wdl/GvsExtractCallset.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl
 workflow GvsExtractCallset {
         Int? scatter_count
         Int? extract_memory_override_gib
         Int? disk_override
+        Boolean bgzip_output_vcfs = false
         Boolean zero_pad_output_vcf_filenames = true

         # set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes

     Boolean emit_pls = false
     Boolean emit_ads = true

-    String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.vcf.gz.interval_list' else '-scattered.interval_list'
+    String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.interval_list' else '-scattered.interval_list'
+    String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'

     if (!defined(git_hash) || !defined(gatk_docker) || !defined(cloud_sdk_docker) || !defined(variants_docker)) {
         call Utils.GetToolVersions {

     scatter(i in range(length(SplitIntervals.interval_files))) {
         String interval_filename = basename(SplitIntervals.interval_files[i])
-        String vcf_filename = if (zero_pad_output_vcf_filenames) then sub(interval_filename, ".interval_list", "") else "~{output_file_base_name}_${i}.vcf.gz"
+        String vcf_filename = if (zero_pad_output_vcf_filenames) then sub(interval_filename, ".interval_list", "") else "~{output_file_base_name}_${i}"

         call ExtractTask {
             input:

                 fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table,
                 filter_set_name = filter_set_name,
                 drop_state = drop_state,
-                output_file = vcf_filename,
+                output_file = vcf_filename + vcf_extension,
                 output_gcs_dir = output_gcs_dir,
                 max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
                 extract_preemptible_override = extract_preemptible_override,
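The net effect of the filename changes above: the shard VCF name is now built by stripping `.interval_list` from the scatter interval filename (or falling back to `~{output_file_base_name}_${i}`), and the extension selected from `bgzip_output_vcfs` is appended when the name is handed to `ExtractTask`. A small bash sketch of the same derivation, under the assumption of a hypothetical shard name:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

bgzip_output_vcfs=true
interval_filename="0001-my_cohort.interval_list"   # hypothetical scatter shard

# Mirrors: String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'
if [[ "${bgzip_output_vcfs}" == "true" ]]; then
    vcf_extension=".vcf.bgz"
else
    vcf_extension=".vcf.gz"
fi

# Mirrors: sub(interval_filename, ".interval_list", "") + vcf_extension
output_file="${interval_filename%.interval_list}${vcf_extension}"
echo "${output_file}"   # -> 0001-my_cohort.vcf.bgz
```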
diff --git a/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl b/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
index 00ab98d330c..0e936917979 100644
--- a/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
 workflow GvsExtractCohortFromSampleNames {
     String? output_gcs_dir
     # set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes
     String drop_state = "NONE"
+    Boolean bgzip_output_vcfs = false
     File? interval_list
     Int? extract_preemptible_override

         output_gcs_dir = output_gcs_dir,
         drop_state = drop_state,
+        bgzip_output_vcfs = bgzip_output_vcfs,
         extract_preemptible_override = extract_preemptible_override,
         extract_maxretries_override = extract_maxretries_override,
         split_intervals_disk_size_override = split_intervals_disk_size_override,

diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
index 57fbcaa8c64..dfab2a8f76c 100644
--- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
+++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
 workflow GvsJointVariantCalling {
     String project_id
     String call_set_identifier
     String? extract_output_gcs_dir
+    Boolean bgzip_output_vcfs = false
     String drop_state = "FORTY"
     Boolean use_classic_VQSR = false
     Boolean use_compressed_references = false

         split_intervals_mem_override = split_intervals_mem_override,
         do_not_filter_override = extract_do_not_filter_override,
         drop_state = drop_state,
+        bgzip_output_vcfs = bgzip_output_vcfs,
         is_wgs = is_wgs,
         maximum_alternate_alleles = maximum_alternate_alleles,
     }

diff --git a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
index 68aff1021ce..91710eae55d 100644
--- a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
 workflow GvsQuickstartHailIntegration {
     Boolean extract_do_not_filter_override
     String dataset_suffix = "hail"
     Boolean use_default_dockers = false
+    Boolean bgzip_output_vcfs = false
     String? basic_docker
     String? cloud_sdk_docker

         use_default_dockers = use_default_dockers,
         check_expected_cost_and_table_size_outputs = false,
         gatk_override = gatk_override,
+        bgzip_output_vcfs = bgzip_output_vcfs,
         is_wgs = is_wgs,
         interval_list = interval_list,
         expected_output_prefix = expected_output_prefix,

         vds_path = GvsExtractAvroFilesForHail.vds_output_path,
         tieout_vcfs = GvsQuickstartVcfIntegration.output_vcfs,
         tieout_vcf_indexes = GvsQuickstartVcfIntegration.output_vcf_indexes,
+        tieout_vcf_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz",
         cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
         hail_version = effective_hail_version,
     }

 task TieOutVds {
         String vds_path
         Array[File] tieout_vcfs
         Array[File] tieout_vcf_indexes
+        String tieout_vcf_suffix
         String cloud_sdk_slim_docker
         String hail_version
     }

         export JOINED_MATRIX_TABLE_PATH=${WORK}/joined.mt

-        python3 ./hail_join_vds_vcfs.py --vds-path ${VDS_PATH} --joined-matrix-table-path ${JOINED_MATRIX_TABLE_PATH} *.vcf.gz
+        python3 ./hail_join_vds_vcfs.py --vds-path ${VDS_PATH} --joined-matrix-table-path ${JOINED_MATRIX_TABLE_PATH} *.vcf~{tieout_vcf_suffix}

         pip install pytest
         ln -s ${WORK}/joined.mt .
diff --git a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
index 58e1b7331d1..f038ae69b5b 100644
--- a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
     # necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow,
     # though in practice likely they are the same.
     if (run_hail_integration) {
+        # This test workflow is probably the most representative of the AoU workflow; parameters used here should match those used for AoU callsets.
         call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRLiteIntegration {
             input:
                 git_branch_or_tag = git_branch_or_tag,

                 vcf_files_column_name = wgs_vcf_files_column_name,
                 vcf_index_files_column_name = wgs_vcf_index_files_column_name,
                 sample_set_name = select_first([wgs_sample_set_name, "wgs_integration_sample_set"]),
+                bgzip_output_vcfs = true,
                 basic_docker = effective_basic_docker,
                 cloud_sdk_docker = effective_cloud_sdk_docker,
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,

diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
index 02b39cfa048..570242cfc49 100644
--- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
 workflow GvsQuickstartVcfIntegration {
     Boolean use_compressed_references = false
     Boolean load_vcf_headers = false
     String drop_state = "FORTY"
+    Boolean bgzip_output_vcfs = false
     String dataset_suffix
     Boolean is_wgs = true
     File? interval_list

             # (and the initial version of this integration test does not allow for inexact matching of actual and expected results.)
             extract_do_not_filter_override = extract_do_not_filter_override,
             drop_state = drop_state,
+            bgzip_output_vcfs = bgzip_output_vcfs,
             is_wgs = is_wgs,
             interval_list = interval_list,
             sample_id_column_name = sample_id_column_name,

     call AssertIdenticalOutputs {
         input:
             expected_output_prefix = expected_prefix,
+            expected_output_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz",
             actual_vcfs = JointVariantCalling.output_vcfs,
-            cloud_sdk_docker = effective_cloud_sdk_docker,
+            gatk_docker = effective_gatk_docker
     }

     if (check_expected_cost_and_table_size_outputs) {

 task AssertIdenticalOutputs {
     input {
         String expected_output_prefix
+        String expected_output_suffix
         Array[File] actual_vcfs
-        String cloud_sdk_docker
+        String gatk_docker
     }
     parameter_meta {
         actual_vcfs: {

         # Download all the expected data
         mkdir expected
         cd expected
-        gcloud storage cp -r "${expected_prefix}"'/*.vcf.gz' .
-        gzip -d *.gz
+        gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' .
+        gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix}
         cd ..
         mkdir actual

         cat actual_manifest.txt | gcloud storage cp -I .

         # Unzip actual result data.
-        ls -1 | grep -E '\.vcf\.gz$' | xargs gzip -d
+        ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d
         cd ..

         echo "Header Check"

     >>>

     runtime {
-        docker: cloud_sdk_docker
+        docker: gatk_docker
         disks: "local-disk 500 HDD"
     }

             DIFF_FOUND=$(echo $EXP_BYTES $OBS_BYTES | awk '{print ($1-$2)/$1}')
         fi

-        if ! awk "BEGIN{ exit ($DIFF_FOUND -gt $TOLERANCE) }"
+        if ! awk "BEGIN{ exit ($DIFF_FOUND > $TOLERANCE) }"
         then
             echo "FAIL!!! The relative difference between these is $DIFF_FOUND, which is greater than the allowed tolerance ($TOLERANCE)"
             echo "1" > ret_val.txt
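On the tolerance check above: awk compares numbers with `>` (`-gt` is shell `test` syntax, not valid awk), and the `exit (condition)` plus leading `!` combination makes the failure branch run only when the relative difference exceeds the tolerance. A simplified standalone sketch of the same pattern, using hypothetical byte counts and taking the absolute relative difference:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

TOLERANCE=0.05        # allow 5% relative drift
EXP_BYTES=1000000     # hypothetical expected size
OBS_BYTES=1030000     # hypothetical observed size

# Absolute relative difference, computed in awk for floating-point math.
DIFF_FOUND=$(echo "$EXP_BYTES $OBS_BYTES" | awk '{ d = ($1 - $2) / $1; if (d < 0) d = -d; print d }')

# awk exits non-zero when DIFF_FOUND > TOLERANCE, so the negated test
# takes the failure branch only in that case.
if ! awk "BEGIN{ exit ($DIFF_FOUND > $TOLERANCE) }"
then
    echo "FAIL: relative difference $DIFF_FOUND exceeds tolerance $TOLERANCE"
    exit 1
fi
echo "OK: relative difference $DIFF_FOUND is within tolerance $TOLERANCE"
```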
From df03bd77c51419a7446dc7ab17583465df241850 Mon Sep 17 00:00:00 2001
From: George Grant
Date: Wed, 8 May 2024 10:07:00 -0400
Subject: [PATCH 2/2] VS_1327 ensure sample ids are unique (#8818)

* Have GvsAssignIds.wdl validate that input sample names (in the provided input file) are unique.
---
 scripts/variantstore/wdl/GvsAssignIds.wdl    | 55 +++++++++++++++++++-
 scripts/variantstore/wdl/GvsCreateTables.wdl |  9 ++--
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl
index d7ceebe8e11..1edc7b94b4b 100644
--- a/scripts/variantstore/wdl/GvsAssignIds.wdl
+++ b/scripts/variantstore/wdl/GvsAssignIds.wdl
     String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
     String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash])

+    call ValidateSamples {
+        input:
+            sample_names_file = external_sample_names,
+            cloud_sdk_docker = effective_cloud_sdk_docker,
+    }
+
     call GvsCreateTables.CreateTables as CreateSampleInfoTable {
         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "sample_info",
             schema_json = sample_info_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "sample_load_status",
             schema_json = sample_load_status_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "vcf_header_lines_scratch",
             schema_json = vcf_header_lines_scratch_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "vcf_header_lines",
             schema_json = vcf_header_lines_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             datatype = "sample_vcf_header",
             schema_json = sample_vcf_header_schema_json,
             max_table_id = 1,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = ValidateSamples.done,
             cloud_sdk_docker = effective_cloud_sdk_docker,
     }

         String sample_info_table
         File sample_names
         Boolean samples_are_controls
-        String table_creation_done
+        Boolean table_creation_done
         String cloud_sdk_docker
     }
     meta {

 task CreateCostObservabilityTable {
     input {
         String project_id
         String dataset_name
+        Boolean go
         String cloud_sdk_docker
     }

     }
 }

+task ValidateSamples {
+    input {
+        File sample_names_file
+        String cloud_sdk_docker
+    }
+
+    command <<<
+        # Prepend date, time and pwd to xtrace log entries.
+        PS4='\D{+%F %T} \w $ '
+        set -o errexit -o nounset -o pipefail -o xtrace
+
+        if [[ ! -s ~{sample_names_file} ]]
+        then
+            echo "ERROR: The input file ~{sample_names_file} is empty"
+            exit 1;
+        fi
+
+        sort ~{sample_names_file} | uniq -d > output.txt
+        if [[ -s output.txt ]]
+        then
+            echo "ERROR: The input file ~{sample_names_file} contains the following duplicate entries:"
+            cat output.txt
+            exit 1;
+        fi
+
+    >>>
+
+    runtime {
+        docker: cloud_sdk_docker
+        memory: "3 GB"
+        cpu: "1"
+        preemptible: 1
+        maxRetries: 0
+        disks: "local-disk 100 HDD"
+    }
+
+    output {
+        Boolean done = true
+    }
+}
\ No newline at end of file
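The duplicate check in `ValidateSamples` relies on `sort | uniq -d`, which prints only lines that occur more than once in the sorted input. A quick local illustration of the same two checks, using a hypothetical sample-name file with one duplicate:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Hypothetical sample-name list containing a duplicate.
cat > sample_names.txt <<EOF
sample_A
sample_B
sample_A
sample_C
EOF

# Same check the task performs: an empty input file is an error...
if [[ ! -s sample_names.txt ]]; then
    echo "ERROR: sample_names.txt is empty"
    exit 1
fi

# ...and any duplicated name is reported and fails the run.
sort sample_names.txt | uniq -d > duplicates.txt
if [[ -s duplicates.txt ]]; then
    echo "ERROR: duplicate sample names found:"
    cat duplicates.txt   # prints: sample_A
    exit 1
fi
echo "All sample names are unique."
```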
diff --git a/scripts/variantstore/wdl/GvsCreateTables.wdl b/scripts/variantstore/wdl/GvsCreateTables.wdl
index 743fd28891a..f144c8d7194 100644
--- a/scripts/variantstore/wdl/GvsCreateTables.wdl
+++ b/scripts/variantstore/wdl/GvsCreateTables.wdl
         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = true,
             datatype = "vet",
             max_table_id = max_table_id,
             schema_json = vet_schema_json,

         input:
             project_id = project_id,
             dataset_name = dataset_name,
+            go = true,
             datatype = "ref_ranges",
             max_table_id = max_table_id,
             schema_json = ref_ranges_schema_used,

     }

     output {
-        String vetDone = CreateVetTables.done
-        String refDone = CreateRefRangesTables.done
+        Boolean vetDone = CreateVetTables.done
+        Boolean refDone = CreateRefRangesTables.done
         String recorded_git_hash = effective_git_hash
     }
 }

 task CreateTables {
     input {
         String project_id
         String dataset_name
+        Boolean go
         String datatype
         Int max_table_id
         String schema_json

     >>>

     output {
-        String done = "true"
+        Boolean done = true
     }

     runtime {