Merge branch 'ah_var_store' into rsa_vs_1300
rsasch committed May 8, 2024
2 parents c40fd6f + df03bd7 commit 7d52fc4
Showing 9 changed files with 88 additions and 16 deletions.
1 change: 1 addition & 0 deletions scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
@@ -95,6 +95,7 @@
- Specify the `interval_weights_bed` appropriate for the PGEN / VCF extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart.
- For both `GvsExtractCallset` and `GvsExtractCallsetPgenMerged`, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5
- For `GvsExtractCallset`, make sure to specify the appropriate `maximum_alternate_alleles` value (currently 100).
- For `GvsExtractCallset`, to produce output VCFs compressed with bgzip, set the `bgzip_output_vcfs` input to `true` (see the inputs sketch below).
- These workflows do not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
1. `GvsCalculatePrecisionAndSensitivity` workflow
- Please see the detailed instructions for running the Precision and Sensitivity workflow [here](../../tieout/AoU_PRECISION_SENSITIVITY.md).
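For illustration, the extract settings called out above can be captured as WDL defaults. This is a sketch only, not part of the commit: the workflow name is made up, and in practice these values are supplied directly as `GvsExtractCallset` inputs in Terra.

```wdl
version 1.0

# Illustrative stand-in: mirrors the extract inputs discussed above with the
# AoU-recommended values as defaults. It performs no work.
workflow AouExtractInputsExample {
    input {
        File interval_weights_bed = "gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed"
        Int maximum_alternate_alleles = 100
        Boolean bgzip_output_vcfs = true
    }

    output {
        # Extension the extract workflow will append to each output VCF.
        String vcf_extension = if (bgzip_output_vcfs) then ".vcf.bgz" else ".vcf.gz"
    }
}
```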
55 changes: 54 additions & 1 deletion scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -39,10 +39,17 @@ workflow GvsAssignIds {
String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash])

call ValidateSamples {
input:
sample_names_file = external_sample_names,
cloud_sdk_docker = effective_cloud_sdk_docker,
}

call GvsCreateTables.CreateTables as CreateSampleInfoTable {
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "sample_info",
schema_json = sample_info_schema_json,
max_table_id = 1,
@@ -55,6 +62,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "sample_load_status",
schema_json = sample_load_status_schema_json,
max_table_id = 1,
@@ -68,6 +76,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "vcf_header_lines_scratch",
schema_json = vcf_header_lines_scratch_schema_json,
max_table_id = 1,
@@ -80,6 +89,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "vcf_header_lines",
schema_json = vcf_header_lines_schema_json,
max_table_id = 1,
@@ -92,6 +102,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "sample_vcf_header",
schema_json = sample_vcf_header_schema_json,
max_table_id = 1,
@@ -105,6 +116,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
cloud_sdk_docker = effective_cloud_sdk_docker,
}

@@ -147,7 +159,7 @@ task AssignIds {
String sample_info_table
File sample_names
Boolean samples_are_controls
String table_creation_done
Boolean table_creation_done
String cloud_sdk_docker
}
meta {
@@ -235,6 +247,7 @@ task CreateCostObservabilityTable {
input {
String project_id
String dataset_name
Boolean go
String cloud_sdk_docker
}

@@ -280,3 +293,43 @@ task CreateCostObservabilityTable {
}
}

task ValidateSamples {
input {
File sample_names_file
String cloud_sdk_docker
}

command <<<
# Prepend date, time and pwd to xtrace log entries.
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

if [[ ! -s ~{sample_names_file} ]]
then
echo "ERROR: The input file ~{sample_names_file} is empty"
exit 1;
fi

sort ~{sample_names_file} | uniq -d > output.txt
if [[ -s output.txt ]]
then
echo "ERROR: The input file ~{sample_names_file} contains the following duplicate entries:"
cat output.txt
exit 1;
fi

>>>

runtime {
docker: cloud_sdk_docker
memory: "3 GB"
cpu: "1"
preemptible: 1
maxRetries: 0
disks: "local-disk 100 HDD"
}

output {
Boolean done = true
}
}
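The `go = ValidateSamples.done` wiring above is the standard WDL idiom for ordering calls that share no real data dependency: a Boolean output of one task is consumed as an otherwise-unused input of the next, so the engine schedules them sequentially. That is also why `CreateTables` and `CreateCostObservabilityTable` gain a `Boolean go` input and why the `done` outputs become typed Booleans. A minimal self-contained sketch of the pattern (workflow, task, and docker names here are illustrative, not from this change):

```wdl
version 1.0

workflow GoDoneExample {
    # CreateTable consumes Validate.done, so it cannot start until
    # Validate has finished successfully.
    call Validate
    call CreateTable { input: go = Validate.done }
}

task Validate {
    command <<<
        echo "validating inputs"
    >>>
    runtime { docker: "ubuntu:22.04" }
    output { Boolean done = true }
}

task CreateTable {
    input {
        Boolean go
    }
    command <<<
        echo "upstream validation reported: ~{go}"
        echo "creating table"
    >>>
    runtime { docker: "ubuntu:22.04" }
    output { Boolean done = true }
}
```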
9 changes: 6 additions & 3 deletions scripts/variantstore/wdl/GvsCreateTables.wdl
@@ -36,6 +36,7 @@ workflow CreateBQTables {
input:
project_id = project_id,
dataset_name = dataset_name,
go = true,
datatype = "vet",
max_table_id = max_table_id,
schema_json = vet_schema_json,
@@ -49,6 +50,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
go = true,
datatype = "ref_ranges",
max_table_id = max_table_id,
schema_json = ref_ranges_schema_used,
@@ -59,8 +61,8 @@
}

output {
String vetDone = CreateVetTables.done
String refDone = CreateRefRangesTables.done
Boolean vetDone = CreateVetTables.done
Boolean refDone = CreateRefRangesTables.done
String recorded_git_hash = effective_git_hash
}
}
@@ -71,6 +73,7 @@ task CreateTables {
input {
String project_id
String dataset_name
Boolean go
String datatype
Int max_table_id
String schema_json
@@ -126,7 +129,7 @@
>>>

output {
String done = "true"
Boolean done = true
}

runtime {
8 changes: 5 additions & 3 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -20,6 +20,7 @@ workflow GvsExtractCallset {
Int? scatter_count
Int? extract_memory_override_gib
Int? disk_override
Boolean bgzip_output_vcfs = false
Boolean zero_pad_output_vcf_filenames = true

# set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes
@@ -74,7 +75,8 @@
Boolean emit_pls = false
Boolean emit_ads = true

String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.vcf.gz.interval_list' else '-scattered.interval_list'
String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.interval_list' else '-scattered.interval_list'
String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'

if (!defined(git_hash) || !defined(gatk_docker) || !defined(cloud_sdk_docker) || !defined(variants_docker)) {
call Utils.GetToolVersions {
@@ -201,7 +203,7 @@

scatter(i in range(length(SplitIntervals.interval_files))) {
String interval_filename = basename(SplitIntervals.interval_files[i])
String vcf_filename = if (zero_pad_output_vcf_filenames) then sub(interval_filename, ".interval_list", "") else "~{output_file_base_name}_${i}.vcf.gz"
String vcf_filename = if (zero_pad_output_vcf_filenames) then sub(interval_filename, ".interval_list", "") else "~{output_file_base_name}_${i}"

call ExtractTask {
input:
@@ -228,7 +230,7 @@
fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
drop_state = drop_state,
output_file = vcf_filename,
output_file = vcf_filename + vcf_extension,
output_gcs_dir = output_gcs_dir,
max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
extract_preemptible_override = extract_preemptible_override,
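To make the scatter naming above concrete, here is a self-contained sketch (the workflow name and the sample interval filename are hypothetical) of how an interval file name becomes an output VCF name once `vcf_extension` is appended:

```wdl
version 1.0

# Sketch only: reproduces the naming logic for a single interval file
# outside of the scatter.
workflow VcfNamingExample {
    input {
        Boolean bgzip_output_vcfs = true
        Boolean zero_pad_output_vcf_filenames = true
        String output_file_base_name = "cohort"
    }

    String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'
    # SplitIntervals would produce names like this when zero-padding is enabled.
    String interval_filename = "0001-" + output_file_base_name + ".interval_list"
    String vcf_filename = if (zero_pad_output_vcf_filenames)
                          then sub(interval_filename, ".interval_list", "")
                          else output_file_base_name + "_0"

    output {
        # "0001-cohort.vcf.bgz" with the defaults above
        String output_file = vcf_filename + vcf_extension
    }
}
```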
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
@@ -31,6 +31,7 @@ workflow GvsExtractCohortFromSampleNames {
String? output_gcs_dir
# set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes
String drop_state = "NONE"
Boolean bgzip_output_vcfs = false

File? interval_list
Int? extract_preemptible_override
@@ -142,6 +143,7 @@ workflow GvsExtractCohortFromSampleNames {
output_gcs_dir = output_gcs_dir,

drop_state = drop_state,
bgzip_output_vcfs = bgzip_output_vcfs,
extract_preemptible_override = extract_preemptible_override,
extract_maxretries_override = extract_maxretries_override,
split_intervals_disk_size_override = split_intervals_disk_size_override,
3 changes: 2 additions & 1 deletion scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -13,11 +13,11 @@ workflow GvsJointVariantCalling {
String dataset_name
String extract_output_gcs_dir
String project_id

String sample_id_column_name ## Note that a column WILL exist that is the <entity>_id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest
String vcf_files_column_name
String vcf_index_files_column_name

Boolean bgzip_output_vcfs = false
String drop_state = "FORTY"
Boolean use_classic_VQSR = false
Boolean use_compressed_references = false
@@ -226,6 +226,7 @@
split_intervals_mem_override = split_intervals_mem_override,
do_not_filter_override = extract_do_not_filter_override,
drop_state = drop_state,
bgzip_output_vcfs = bgzip_output_vcfs,
is_wgs = is_wgs,
maximum_alternate_alleles = maximum_alternate_alleles,
}
6 changes: 5 additions & 1 deletion scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
@@ -16,6 +16,7 @@ workflow GvsQuickstartHailIntegration {
Boolean extract_do_not_filter_override
String dataset_suffix = "hail"
Boolean use_default_dockers = false
Boolean bgzip_output_vcfs = false

String? basic_docker
String? cloud_sdk_docker
@@ -74,6 +75,7 @@
use_default_dockers = use_default_dockers,
check_expected_cost_and_table_size_outputs = false,
gatk_override = gatk_override,
bgzip_output_vcfs = bgzip_output_vcfs,
is_wgs = is_wgs,
interval_list = interval_list,
expected_output_prefix = expected_output_prefix,
@@ -132,6 +134,7 @@
vds_path = GvsExtractAvroFilesForHail.vds_output_path,
tieout_vcfs = GvsQuickstartVcfIntegration.output_vcfs,
tieout_vcf_indexes = GvsQuickstartVcfIntegration.output_vcf_indexes,
tieout_vcf_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz",
cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
hail_version = effective_hail_version,
}
@@ -156,6 +159,7 @@ task TieOutVds {
String vds_path
Array[File] tieout_vcfs
Array[File] tieout_vcf_indexes
String tieout_vcf_suffix
String cloud_sdk_slim_docker
String hail_version
}
@@ -224,7 +228,7 @@

export JOINED_MATRIX_TABLE_PATH=${WORK}/joined.mt

python3 ./hail_join_vds_vcfs.py --vds-path ${VDS_PATH} --joined-matrix-table-path ${JOINED_MATRIX_TABLE_PATH} *.vcf.gz
python3 ./hail_join_vds_vcfs.py --vds-path ${VDS_PATH} --joined-matrix-table-path ${JOINED_MATRIX_TABLE_PATH} *.vcf~{tieout_vcf_suffix}

pip install pytest
ln -s ${WORK}/joined.mt .
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
@@ -76,6 +76,7 @@ workflow GvsQuickstartIntegration {
# necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow,
# though in practice likely they are the same.
if (run_hail_integration) {
# This test workflow is probably the most representative of the AoU workflow; parameters used here should match those used for AoU callsets.
call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRLiteIntegration {
input:
git_branch_or_tag = git_branch_or_tag,
@@ -92,6 +93,7 @@
vcf_files_column_name = wgs_vcf_files_column_name,
vcf_index_files_column_name = wgs_vcf_index_files_column_name,
sample_set_name = select_first([wgs_sample_set_name, "wgs_integration_sample_set"]),
bgzip_output_vcfs = true,
basic_docker = effective_basic_docker,
cloud_sdk_docker = effective_cloud_sdk_docker,
cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
18 changes: 11 additions & 7 deletions scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
@@ -13,6 +13,7 @@ workflow GvsQuickstartVcfIntegration {
Boolean use_compressed_references = false
Boolean load_vcf_headers = false
String drop_state = "FORTY"
Boolean bgzip_output_vcfs = false
String dataset_suffix
Boolean is_wgs = true
File? interval_list
@@ -94,6 +95,7 @@
# (and the initial version of this integration test does not allow for inexact matching of actual and expected results.)
extract_do_not_filter_override = extract_do_not_filter_override,
drop_state = drop_state,
bgzip_output_vcfs = bgzip_output_vcfs,
is_wgs = is_wgs,
interval_list = interval_list,
sample_id_column_name = sample_id_column_name,
@@ -119,8 +121,9 @@
call AssertIdenticalOutputs {
input:
expected_output_prefix = expected_prefix,
expected_output_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz",
actual_vcfs = JointVariantCalling.output_vcfs,
cloud_sdk_docker = effective_cloud_sdk_docker,
gatk_docker = effective_gatk_docker
}

if (check_expected_cost_and_table_size_outputs) {
@@ -161,8 +164,9 @@
task AssertIdenticalOutputs {
input {
String expected_output_prefix
String expected_output_suffix
Array[File] actual_vcfs
String cloud_sdk_docker
String gatk_docker
}
parameter_meta {
actual_vcfs: {
@@ -184,8 +188,8 @@ task AssertIdenticalOutputs {
# Download all the expected data
mkdir expected
cd expected
gcloud storage cp -r "${expected_prefix}"'/*.vcf.gz' .
gzip -d *.gz
gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' .
gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix}
cd ..

mkdir actual
@@ -201,7 +205,7 @@

cat actual_manifest.txt | gcloud storage cp -I .
# Unzip actual result data.
ls -1 | grep -E '\.vcf\.gz$' | xargs gzip -d
ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d
cd ..

echo "Header Check"
@@ -257,7 +261,7 @@ task AssertIdenticalOutputs {
>>>

runtime {
docker: cloud_sdk_docker
docker: gatk_docker
disks: "local-disk 500 HDD"
}

@@ -349,7 +353,7 @@ task AssertCostIsTrackedAndExpected {
DIFF_FOUND=$(echo $EXP_BYTES $OBS_BYTES | awk '{print ($1-$2)/$1}')
fi

if ! awk "BEGIN{ exit ($DIFF_FOUND > $TOLERANCE) }"
if ! awk "BEGIN{ exit ($DIFF_FOUND -gt $TOLERANCE) }"
then
echo "FAIL!!! The relative difference between these is $DIFF_FOUND, which is greater than the allowed tolerance ($TOLERANCE)"
echo "1" > ret_val.txt
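Two details in the integration-test changes above are easy to miss. First, bgzip output (`.vcf.bgz`) is BGZF, which is a valid gzip stream, so plain `gzip` can decompress it as long as `-S` tells it which filename suffix to strip; that is what the suffix-aware `gcloud storage cp` and `gzip -S` lines rely on. Second, the cost-tolerance check runs inside `awk`, which uses C-style comparison operators such as `>` rather than the shell-test `-gt`. A hedged sketch of suffix-aware decompression (the task name, docker image, and copy-into-the-working-directory step are assumptions for illustration, not code from this commit):

```wdl
version 1.0

# Sketch: decompress VCFs whose suffix may be ".gz" or ".bgz".
task DecompressVcfs {
    input {
        Array[File] vcfs
        String suffix = ".bgz"
    }
    command <<<
        set -o errexit -o nounset -o pipefail
        for vcf in ~{sep=' ' vcfs}; do
            # Copy locally so the decompressed file lands in the working
            # directory, where glob() below can find it.
            cp "${vcf}" .
            # BGZF is gzip-compatible; -S names the suffix gzip should remove.
            gzip -S '~{suffix}' -d "$(basename "${vcf}")"
        done
    >>>
    runtime {
        docker: "ubuntu:22.04"
        disks: "local-disk 100 HDD"
    }
    output {
        Array[File] decompressed = glob("*.vcf")
    }
}
```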
