From fcb59111faac4ef1d27d5668f0f4416bdf133fd2 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 28 May 2024 11:50:50 -0400 Subject: [PATCH] even better error messages --- .../GvsCreateVATFilesFromBigQuery.wdl | 2 +- .../GvsValidateVAT.wdl | 32 +++++++++---------- scripts/variantstore/wdl/GvsAssignIds.wdl | 8 ++--- scripts/variantstore/wdl/GvsCallsetCost.wdl | 4 +-- .../variantstore/wdl/GvsCallsetStatistics.wdl | 14 ++++---- .../variantstore/wdl/GvsCreateVATfromVDS.wdl | 4 +-- .../variantstore/wdl/GvsExtractCallset.wdl | 2 +- .../wdl/GvsExtractCallsetPgen.wdl | 2 +- scripts/variantstore/wdl/GvsImportGenomes.wdl | 10 +++--- scripts/variantstore/wdl/GvsIngestTieout.wdl | 2 +- .../variantstore/wdl/GvsPopulateAltAllele.wdl | 4 +-- .../wdl/GvsQuickstartVcfIntegration.wdl | 4 +-- scripts/variantstore/wdl/GvsUtils.wdl | 16 +++++----- .../variantstore/wdl/GvsWithdrawSamples.wdl | 4 +-- 14 files changed, 54 insertions(+), 54 deletions(-) diff --git a/scripts/variantstore/variant_annotations_table/GvsCreateVATFilesFromBigQuery.wdl b/scripts/variantstore/variant_annotations_table/GvsCreateVATFilesFromBigQuery.wdl index a22fc195922..ab4ebc3c694 100644 --- a/scripts/variantstore/variant_annotations_table/GvsCreateVATFilesFromBigQuery.wdl +++ b/scripts/variantstore/variant_annotations_table/GvsCreateVATFilesFromBigQuery.wdl @@ -82,7 +82,7 @@ task BigQueryExportVat { echo "project_id = ~{project_id}" > ~/.bigqueryrc # note: tab delimiter and compression creates tsv.gz files - # bq query check: ok export + # bq query --max_rows check: ok export bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} \ 'EXPORT DATA OPTIONS( uri="~{export_path}", diff --git a/scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl b/scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl index 6622a9e0752..3b9e30958a1 100644 --- a/scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl +++ b/scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl @@ -315,7 +315,7 @@ task EnsureVatTableHasVariants { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{fq_vat_table}`' > bq_variant_count.csv NUMVARS=$(python3 -c "csvObj=open('bq_variant_count.csv','r');csvContents=csvObj.read();print(csvContents.split('\n')[1]);") @@ -370,7 +370,7 @@ task SpotCheckForExpectedTranscripts { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated + # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT contig, position, @@ -440,7 +440,7 @@ task SchemaNoNullRequiredFields { # non-nullable fields: vid, contig, position, ref_allele, alt_allele, gvs_all_ac, gvs_all_an, gvs_all_af, variant_type, genomic_location - # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated + # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv \ 'SELECT contig, @@ -520,7 +520,7 @@ task SchemaOnlyOneRowPerNullTranscript { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated + # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT vid, COUNT(vid) AS num_rows @@ -581,7 +581,7 @@ task SchemaPrimaryKey { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated + # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv \ 'SELECT vid, @@ -641,7 +641,7 @@ task SchemaEnsemblTranscripts { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated + # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT contig, position, @@ -702,7 +702,7 @@ task SchemaNonzeroAcAn { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated + # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT contig, position, @@ -765,7 +765,7 @@ task SchemaNullTranscriptsExist { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is fine; zero is the error case. + # bq query --max_rows check: may produce > 100 rows but anything > 0 is fine; zero is the error case, error message is fine. bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT vid FROM @@ -823,7 +823,7 @@ task SubpopulationMax { # gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"] - # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine + # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT vid FROM @@ -886,7 +886,7 @@ task SubpopulationAlleleCount { # gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"] - # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine + # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT vid FROM @@ -943,7 +943,7 @@ task SubpopulationAlleleNumber { # gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"] - # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine + # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT vid FROM @@ -999,7 +999,7 @@ task DuplicateAnnotations { echo "project_id = ~{query_project_id}" > ~/.bigqueryrc - # bq query check: may produce > 100 rows but anything > 0 is an error; updated error message and other fixes + # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine. bq --apilog=false query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv ' SELECT contig, position, gvs_all_an, COUNT(DISTINCT gvs_all_an) AS an_count FROM `~{fq_vat_table}` @@ -1007,7 +1007,7 @@ task DuplicateAnnotations { HAVING an_count > 1 ' > bq_an_output.csv - # bq query check: may produce > 100 rows but anything > 0 is an error; updated error message and other fixes + # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine. bq --apilog=false query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv ' SELECT contig, position, gvs_all_ac, COUNT(DISTINCT gvs_all_ac) AS ac_count FROM `~{fq_vat_table}` @@ -1086,7 +1086,7 @@ task ClinvarSignificance { # "other", # "not provided"] - # bq query check: we currently expect this to be at least 13 but it could be more, set --max_rows to a ridiculously high value. + # bq query --max_rows check: we currently expect this to be at least 13 but it could be more, set --max_rows to a ridiculously high value. bq --apilog=false query --max_rows 1000000 --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT distinct(unnested_clinvar_classification) FROM @@ -1161,7 +1161,7 @@ task SchemaAAChangeAndExonNumberConsistent { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok single row + # bq query --max_rows check: ok single row bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM ( @@ -1262,7 +1262,7 @@ task SpotCheckForAAChangeAndExonNumberConsistency { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok single row + # bq query --max_rows check: ok single row bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) FROM ( diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl index c7bce2459dc..795fc7d9699 100644 --- a/scripts/variantstore/wdl/GvsAssignIds.wdl +++ b/scripts/variantstore/wdl/GvsAssignIds.wdl @@ -203,23 +203,23 @@ task AssignIds { bq --apilog=false load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock ~{sample_names} "sample_name:STRING" # add sample_name to sample_info_table - # bq query check: ok insert + # bq query --max_rows check: ok insert bq --apilog=false --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \ 'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)' # get the current maximum id, or 0 if there are none - # bq query check: ok single row + # bq query --max_rows check: ok single row bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid offset=$(tail -1 maxid) # perform actual id assignment - # bq query check: ok update + # bq query --max_rows check: ok update bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \ 'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;' # retrieve the list of assigned ids and samples to update the datamodel echo "entity:sample_id,gvs_id" > update.tsv - # bq query check: ok num samples explicit + # bq query --max_rows check: ok num samples explicit bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \ 'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index f1e8500c49d..1d16e75357f 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -117,7 +117,7 @@ task CoreStorageModelSizes { local table_pattern="$1" local output_file_name="$2" - # bq query check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions + # bq query --max_rows check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions bq --apilog=false query --max_rows 10000000 --project_id='~{project_id}' --format=csv --use_legacy_sql=false \ 'SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` @@ -155,7 +155,7 @@ task ReadCostObservabilityTable { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # bq query check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions + # bq query --max_rows check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions bq --apilog=false query --max_rows 10000000 --project_id=~{project_id} --format=prettyjson --use_legacy_sql=false \ 'SELECT step, event_key, round(sum(event_bytes) / (1024*1024*1024), 2) AS sum_event_gibibytes FROM `~{project_id}.~{dataset_name}.cost_observability` diff --git a/scripts/variantstore/wdl/GvsCallsetStatistics.wdl b/scripts/variantstore/wdl/GvsCallsetStatistics.wdl index 1f7c1be2b97..5a40a30f36b 100644 --- a/scripts/variantstore/wdl/GvsCallsetStatistics.wdl +++ b/scripts/variantstore/wdl/GvsCallsetStatistics.wdl @@ -322,7 +322,7 @@ task CollectMetricsForChromosome { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{metrics_table}` WHERE chromosome = ~{chromosome}' | sed 1d > existing_row_count.txt @@ -333,7 +333,7 @@ task CollectMetricsForChromosome { exit 1 fi - # bq query check: ok insert (elaborate one) + # bq query --max_rows check: ok insert (elaborate one) bq --apilog=false query --project_id=~{project_id} --use_legacy_sql=false ' CREATE TEMPORARY FUNCTION titv(ref STRING, allele STRING) RETURNS STRING @@ -455,7 +455,7 @@ task AggregateMetricsAcrossChromosomes { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` ' | sed 1d > existing_row_count.txt @@ -467,7 +467,7 @@ task AggregateMetricsAcrossChromosomes { exit 1 fi - # bq query check: ok insert + # bq query --max_rows check: ok insert bq --apilog=false query --project_id=~{project_id} --use_legacy_sql=false ' INSERT `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` ( filter_set_name, @@ -532,7 +532,7 @@ task CollectStatistics { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{statistics_table}` ' | sed 1d > existing_row_count.txt @@ -544,7 +544,7 @@ task CollectStatistics { exit 1 fi - # bq query check: ok insert + # bq query --max_rows check: ok insert bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' INSERT `~{project_id}.~{dataset_name}.~{statistics_table}` ( sample_id, @@ -604,7 +604,7 @@ task ExportToCSV { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # bq query check: max rows set to at least the number of samples + # bq query --max_rows check: max rows set to at least the number of samples bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv --max_rows 1000000000 ' SELECT * FROM `~{project_id}.~{dataset_name}.~{statistics_table}` ORDER BY SAMPLE_NAME diff --git a/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl b/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl index 4356a256071..3df6e6ea6cc 100644 --- a/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl @@ -863,7 +863,7 @@ task BigQueryLoadJson { # We want the vat creation query to overwrite the destination table because if new data has been put into the pre-vat tables # and this workflow has been run an additional time, we dont want duplicates being appended from the original run - # bq query check: ok selecting into a table + # bq query --max_rows check: ok selecting into a table bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{vat_table_name} --replace --project_id=~{project_id} \ 'SELECT v.vid, @@ -1033,7 +1033,7 @@ task DeduplicateVatInBigQuery { # Now we query the original VAT table and recreate it, but remove any rows that appear twice. - # bq query check: ok selecting into a table + # bq query --max_rows check: ok selecting into a table bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{output_vat_table_name} --replace --project_id=~{project_id} \ ' SELECT * EXCEPT(row_number) FROM ( SELECT diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl index 8bd89d479b7..2a55e316c7c 100644 --- a/scripts/variantstore/wdl/GvsExtractCallset.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl @@ -557,7 +557,7 @@ task GenerateSampleListFile { echo "project_id = ~{query_project}" > ~/.bigqueryrc - # bq query check: max rows set to at least the number of samples + # bq query --max_rows check: max rows set to at least the number of samples bq --apilog=false --project_id=~{query_project} --format=csv query --max_rows 1000000000 --use_legacy_sql=false ~{bq_labels} \ 'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt diff --git a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl index a0f6517827e..cbbd90601b0 100644 --- a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl @@ -603,7 +603,7 @@ task GenerateSampleListFile { echo "project_id = ~{query_project}" > ~/.bigqueryrc - # bq query check: max rows set to at least the number of samples + # bq query --max_rows check: max rows set to at least the number of samples bq --apilog=false --project_id=~{query_project} --format=csv query --max_rows 1000000000 --use_legacy_sql=false ~{bq_labels} \ 'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 8a3cc60271d..7af8683244a 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -304,7 +304,7 @@ task LoadData { bq --apilog=false load --project_id=~{project_id} ~{temp_table} $NAMES_FILE "sample_name:STRING" # Get the current min/max id, or 0 if there are none. Withdrawn samples still have IDs so don't filter them out. - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} ' SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM `~{dataset_name}.~{table_name}` AS samples JOIN `~{temp_table}` AS temp ON samples.sample_name = temp.sample_name' > results.csv @@ -319,7 +319,7 @@ task LoadData { samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status = '$status') AND samples.withdrawn is NULL" > query.txt - # bq query check: ok sets max rows explicitly + # bq query --max_rows check: ok sets max rows explicitly cat query.txt | bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} > \ $status.status_bucket.csv @@ -456,7 +456,7 @@ task SetIsLoadedColumn { # an exponential backoff, but at the number of samples that are being loaded this would introduce significant delays # in workflow processing. So this method is used to set *all* of the saple_info.is_loaded flags at one time. - # bq query check: ok update + # bq query --max_rows check: ok update bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \ 'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) @@ -527,7 +527,7 @@ task GetUningestedSampleIds { bq --apilog=false load --project_id=~{project_id} ~{temp_table} ~{external_sample_names} "sample_name:STRING" # Get the current min/max id, or 0 if there are none. Withdrawn samples still have IDs so don't filter them out. - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} ' SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM `~{dataset_name}.~{table_name}` @@ -558,7 +558,7 @@ task GetUningestedSampleIds { samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status = '$status') AND samples.withdrawn is NULL" > query.txt - # bq query check: ok sets max rows explicitly + # bq query --max_rows check: ok sets max rows explicitly cat query.txt | bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} > \ $status.status_bucket.csv diff --git a/scripts/variantstore/wdl/GvsIngestTieout.wdl b/scripts/variantstore/wdl/GvsIngestTieout.wdl index c38047346cb..68d8d8c6c26 100644 --- a/scripts/variantstore/wdl/GvsIngestTieout.wdl +++ b/scripts/variantstore/wdl/GvsIngestTieout.wdl @@ -88,7 +88,7 @@ task IngestTieout { check_table() { local table_name=$1 - # bq query check: ok, anything > 0 is an error, error message explicit with 100 row limit. + # bq query --max_rows check: ok, anything > 0 is an error, error message explicit about 100 row limit. bq --apilog=false query --project_id=~{project} --format=csv --use_legacy_sql=false \ '(SELECT sample_id, count(*) AS count diff --git a/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl b/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl index 6d94a750cc1..89d6f67ae05 100644 --- a/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl +++ b/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl @@ -98,7 +98,7 @@ task GetMaxSampleId { set -o errexit -o nounset -o pipefail -o xtrace echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false \ 'SELECT IFNULL(MAX(sample_id), 0) AS max_sample_id FROM `~{dataset_name}.alt_allele`' > num_rows.csv @@ -206,7 +206,7 @@ task CreateAltAlleleTable { set -o errexit -o nounset -o pipefail -o xtrace echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok create table + # bq query --max_rows check: ok create table bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \ 'CREATE TABLE IF NOT EXISTS `~{project_id}.~{dataset_name}.alt_allele` ( location INT64, diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl index ef0c284ef25..81b7bcc7871 100644 --- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl @@ -296,7 +296,7 @@ task AssertCostIsTrackedAndExpected { echo "project_id = ~{project_id}" > ~/.bigqueryrc # Note that in this query we are using the ROW_NUMBER() functionality to ignore extra entries caused # by preemption (for instance there might be two rows for shard_identifier '*033') - # bq query check: max rows set >> 100 + # bq query --max_rows check: max_rows explicitly set to massive number bq --apilog=false query --max_rows 100000000 --project_id=~{project_id} --format=csv --use_legacy_sql=false \ 'SELECT call, step, event_key, sum(event_bytes) FROM ( SELECT *, ROW_NUMBER() @@ -403,7 +403,7 @@ task AssertTableSizesAreExpected { mkdir output echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: aggregating and unioning should produce two rows + # bq query --max_rows check: aggregating and unioning should produce two rows bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false \ "SELECT 'vet_total' AS total_name, sum(total_billable_bytes) AS total_bytes FROM \ \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` WHERE table_name LIKE 'vet_%' \ diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index dbd4b89953e..38195d27440 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -414,7 +414,7 @@ task GetBQTablesMaxLastModifiedTimestamp { echo "project_id = ~{query_project}" > ~/.bigqueryrc - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false --project_id=~{query_project} query --format=csv --use_legacy_sql=false \ 'SELECT UNIX_MICROS(MAX(last_modified_time)) last_modified_time FROM `~{data_project}`.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE table_name like "~{sep=" OR table_name like " table_patterns}"' > results.txt @@ -742,7 +742,7 @@ task GetNumSamplesLoaded { bash ~{monitoring_script} > monitoring.log & echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' SELECT COUNT(*) FROM `~{fq_sample_table}` WHERE @@ -787,7 +787,7 @@ task CountSuperpartitions { bash ~{monitoring_script} > monitoring.log & - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' SELECT COUNT(*) FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` @@ -831,7 +831,7 @@ task ValidateFilterSetName { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: enlarged max rows in case we get a lot of filter names + # bq query --max_rows check: enlarged max rows in case we get a lot of filter names OUTPUT=$(bq --apilog=false --project_id=~{project_id} --format=csv query --max_rows 1000000 --use_legacy_sql=false ~{bq_labels} 'SELECT filter_set_name as available_filter_set_names FROM `~{fq_filter_set_info_table}` GROUP BY filter_set_name') FILTERSETS=${OUTPUT#"available_filter_set_names"} @@ -880,7 +880,7 @@ task IsVQSRLite { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \ 'BEGIN SELECT COUNT(1) AS counted FROM `~{fq_filter_set_info_table}` WHERE filter_set_name = "~{filter_set_name}" @@ -891,7 +891,7 @@ task IsVQSRLite { LITE_COUNT=`cat lite_count_file.txt` - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \ 'SELECT COUNT(1) FROM `~{fq_filter_set_info_table}` WHERE filter_set_name = "~{filter_set_name}" AND vqslod IS NOT NULL' | tail -1 > classic_count_file.txt @@ -939,7 +939,7 @@ task IsUsingCompressedReferences { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ' SELECT column_name @@ -1000,7 +1000,7 @@ task GetExtractVetTableVersion { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # bq query check: ok one row + # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{query_project} --format=csv --use_legacy_sql=false ' SELECT count(1) diff --git a/scripts/variantstore/wdl/GvsWithdrawSamples.wdl b/scripts/variantstore/wdl/GvsWithdrawSamples.wdl index 19342605cc5..ea1e0f057b4 100644 --- a/scripts/variantstore/wdl/GvsWithdrawSamples.wdl +++ b/scripts/variantstore/wdl/GvsWithdrawSamples.wdl @@ -98,7 +98,7 @@ task WithdrawSamples { # Now, determine if there are any samples in the uploaded list that are NOT in sample_info and report this echo "Determining if there are any new samples that should be uploaded" - # bq query check: max rows for at least as many samples as we have + # bq query --max_rows check: max rows for at least as many samples as we have bq --apilog=false --project_id=~{project_id} query --max_rows 100000000 --format=csv --use_legacy_sql=false \ 'SELECT callset.sample_name FROM `~{project_id}.'"${TEMP_TABLE_NAME}"'` callset @@ -120,7 +120,7 @@ task WithdrawSamples { # Update sample_info.withdrawn by joining on the temp table to figure out which samples should be marked as withdrawn echo "Updating samples that should be withdrawn" - # bq query check: ok update + # bq query --max_rows check: ok update bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ 'UPDATE `~{dataset_name}.sample_info` AS samples SET withdrawn = "~{withdrawn_timestamp}" WHERE NOT EXISTS