even better error messages
mcovarr committed May 28, 2024
1 parent 3cfb543 commit fcb5911
Showing 14 changed files with 54 additions and 54 deletions.
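Context for the comment changes below: they audit each bq query call against the bq CLI's default row cap, where bq query returns at most 100 result rows unless --max_rows (or its -n shorthand) is raised, silently truncating larger result sets. A minimal sketch of the two patterns the comments distinguish, using a placeholder project and table rather than anything from this repository:

# Full result set matters: raise the cap well above the expected row count.
# PROJECT_ID and my_dataset.my_samples are placeholders for illustration only.
bq --apilog=false query --nouse_legacy_sql --project_id="${PROJECT_ID}" --format=csv \
    --max_rows 1000000000 \
    'SELECT sample_name FROM `my_dataset.my_samples`' | sed 1d > sample-name-list.txt

# Single-row aggregates, DML, and EXPORT DATA statements are unaffected by the cap.
bq --apilog=false query --nouse_legacy_sql --project_id="${PROJECT_ID}" --format=csv \
    'SELECT COUNT(*) AS num_samples FROM `my_dataset.my_samples`'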
@@ -82,7 +82,7 @@ task BigQueryExportVat {
echo "project_id = ~{project_id}" > ~/.bigqueryrc

# note: tab delimiter and compression creates tsv.gz files
- # bq query check: ok export
+ # bq query --max_rows check: ok export
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} \
'EXPORT DATA OPTIONS(
uri="~{export_path}",
32 changes: 16 additions & 16 deletions scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl
@@ -315,7 +315,7 @@ task EnsureVatTableHasVariants {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{fq_vat_table}`' > bq_variant_count.csv

NUMVARS=$(python3 -c "csvObj=open('bq_variant_count.csv','r');csvContents=csvObj.read();print(csvContents.split('\n')[1]);")
@@ -370,7 +370,7 @@ task SpotCheckForExpectedTranscripts {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
contig,
position,
@@ -440,7 +440,7 @@ task SchemaNoNullRequiredFields {

# non-nullable fields: vid, contig, position, ref_allele, alt_allele, gvs_all_ac, gvs_all_an, gvs_all_af, variant_type, genomic_location

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv \
'SELECT
contig,
@@ -520,7 +520,7 @@ task SchemaOnlyOneRowPerNullTranscript {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid,
COUNT(vid) AS num_rows
@@ -581,7 +581,7 @@ task SchemaPrimaryKey {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv \
'SELECT
vid,
@@ -641,7 +641,7 @@ task SchemaEnsemblTranscripts {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
contig,
position,
@@ -702,7 +702,7 @@ task SchemaNonzeroAcAn {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
contig,
position,
@@ -765,7 +765,7 @@ task SchemaNullTranscriptsExist {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is fine; zero is the error case.
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is fine; zero is the error case, error message is fine.
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -823,7 +823,7 @@ task SubpopulationMax {

# gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"]

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -886,7 +886,7 @@ task SubpopulationAlleleCount {

# gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"]

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -943,7 +943,7 @@ task SubpopulationAlleleNumber {

# gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"]

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -999,15 +999,15 @@ task DuplicateAnnotations {

echo "project_id = ~{query_project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; updated error message and other fixes
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine.
bq --apilog=false query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv '
SELECT contig, position, gvs_all_an, COUNT(DISTINCT gvs_all_an) AS an_count
FROM `~{fq_vat_table}`
GROUP BY contig, position, gvs_all_an
HAVING an_count > 1
' > bq_an_output.csv

- # bq query check: may produce > 100 rows but anything > 0 is an error; updated error message and other fixes
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine.
bq --apilog=false query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv '
SELECT contig, position, gvs_all_ac, COUNT(DISTINCT gvs_all_ac) AS ac_count
FROM `~{fq_vat_table}`
@@ -1086,7 +1086,7 @@ task ClinvarSignificance {
# "other",
# "not provided"]

- # bq query check: we currently expect this to be at least 13 but it could be more, set --max_rows to a ridiculously high value.
+ # bq query --max_rows check: we currently expect this to be at least 13 but it could be more, set --max_rows to a ridiculously high value.
bq --apilog=false query --max_rows 1000000 --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
distinct(unnested_clinvar_classification)
FROM
@@ -1161,7 +1161,7 @@ task SchemaAAChangeAndExonNumberConsistent {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: ok single row
+ # bq query --max_rows check: ok single row
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
COUNT (DISTINCT vid) AS count FROM
(
@@ -1262,7 +1262,7 @@ task SpotCheckForAAChangeAndExonNumberConsistency {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: ok single row
+ # bq query --max_rows check: ok single row
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
COUNT (DISTINCT vid) FROM
(
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -203,23 +203,23 @@ task AssignIds {
bq --apilog=false load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock ~{sample_names} "sample_name:STRING"

# add sample_name to sample_info_table
- # bq query check: ok insert
+ # bq query --max_rows check: ok insert
bq --apilog=false --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \
'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

# get the current maximum id, or 0 if there are none
- # bq query check: ok single row
+ # bq query --max_rows check: ok single row
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
offset=$(tail -1 maxid)

# perform actual id assignment
- # bq query check: ok update
+ # bq query --max_rows check: ok update
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \
'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

# retrieve the list of assigned ids and samples to update the datamodel
echo "entity:sample_id,gvs_id" > update.tsv
- # bq query check: ok num samples explicit
+ # bq query --max_rows check: ok num samples explicit
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \
'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCallsetCost.wdl
@@ -117,7 +117,7 @@ task CoreStorageModelSizes {
local table_pattern="$1"
local output_file_name="$2"

- # bq query check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
+ # bq query --max_rows check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
bq --apilog=false query --max_rows 10000000 --project_id='~{project_id}' --format=csv --use_legacy_sql=false \
'SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2)
FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS`
@@ -155,7 +155,7 @@ task ReadCostObservabilityTable {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

- # bq query check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
+ # bq query --max_rows check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
bq --apilog=false query --max_rows 10000000 --project_id=~{project_id} --format=prettyjson --use_legacy_sql=false \
'SELECT step, event_key, round(sum(event_bytes) / (1024*1024*1024), 2) AS sum_event_gibibytes
FROM `~{project_id}.~{dataset_name}.cost_observability`
14 changes: 7 additions & 7 deletions scripts/variantstore/wdl/GvsCallsetStatistics.wdl
@@ -322,7 +322,7 @@ task CollectMetricsForChromosome {
echo "project_id = ~{project_id}" > ~/.bigqueryrc
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{metrics_table}` WHERE chromosome = ~{chromosome}' | sed 1d > existing_row_count.txt
@@ -333,7 +333,7 @@ task CollectMetricsForChromosome {
exit 1
fi
- # bq query check: ok insert (elaborate one)
+ # bq query --max_rows check: ok insert (elaborate one)
bq --apilog=false query --project_id=~{project_id} --use_legacy_sql=false '
CREATE TEMPORARY FUNCTION titv(ref STRING, allele STRING)
RETURNS STRING
@@ -455,7 +455,7 @@ task AggregateMetricsAcrossChromosomes {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}`
' | sed 1d > existing_row_count.txt
@@ -467,7 +467,7 @@ task AggregateMetricsAcrossChromosomes {
exit 1
fi
- # bq query check: ok insert
+ # bq query --max_rows check: ok insert
bq --apilog=false query --project_id=~{project_id} --use_legacy_sql=false '
INSERT `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` (
filter_set_name,
@@ -532,7 +532,7 @@ task CollectStatistics {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{statistics_table}`
' | sed 1d > existing_row_count.txt
@@ -544,7 +544,7 @@ task CollectStatistics {
exit 1
fi
- # bq query check: ok insert
+ # bq query --max_rows check: ok insert
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
INSERT `~{project_id}.~{dataset_name}.~{statistics_table}` (
sample_id,
@@ -604,7 +604,7 @@ task ExportToCSV {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace
- # bq query check: max rows set to at least the number of samples
+ # bq query --max_rows check: max rows set to at least the number of samples
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv --max_rows 1000000000 '
SELECT * FROM `~{project_id}.~{dataset_name}.~{statistics_table}` ORDER BY SAMPLE_NAME
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
@@ -863,7 +863,7 @@ task BigQueryLoadJson {
# We want the vat creation query to overwrite the destination table because if new data has been put into the pre-vat tables
# and this workflow has been run an additional time, we dont want duplicates being appended from the original run
- # bq query check: ok selecting into a table
+ # bq query --max_rows check: ok selecting into a table
bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{vat_table_name} --replace --project_id=~{project_id} \
'SELECT
v.vid,
@@ -1033,7 +1033,7 @@ task DeduplicateVatInBigQuery {
# Now we query the original VAT table and recreate it, but remove any rows that appear twice.
- # bq query check: ok selecting into a table
+ # bq query --max_rows check: ok selecting into a table
bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{output_vat_table_name} --replace --project_id=~{project_id} \
' SELECT * EXCEPT(row_number) FROM (
SELECT
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -557,7 +557,7 @@ task GenerateSampleListFile {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

- # bq query check: max rows set to at least the number of samples
+ # bq query --max_rows check: max rows set to at least the number of samples
bq --apilog=false --project_id=~{query_project} --format=csv query --max_rows 1000000000 --use_legacy_sql=false ~{bq_labels} \
'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt

2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl
@@ -603,7 +603,7 @@ task GenerateSampleListFile {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

- # bq query check: max rows set to at least the number of samples
+ # bq query --max_rows check: max rows set to at least the number of samples
bq --apilog=false --project_id=~{query_project} --format=csv query --max_rows 1000000000 --use_legacy_sql=false ~{bq_labels} \
'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt

10 changes: 5 additions & 5 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -304,7 +304,7 @@ task LoadData {
bq --apilog=false load --project_id=~{project_id} ~{temp_table} $NAMES_FILE "sample_name:STRING"

# Get the current min/max id, or 0 if there are none. Withdrawn samples still have IDs so don't filter them out.
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} '
SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM `~{dataset_name}.~{table_name}`
AS samples JOIN `~{temp_table}` AS temp ON samples.sample_name = temp.sample_name' > results.csv
@@ -319,7 +319,7 @@ task LoadData {
samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status = '$status') AND
samples.withdrawn is NULL" > query.txt

- # bq query check: ok sets max rows explicitly
+ # bq query --max_rows check: ok sets max rows explicitly
cat query.txt |
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} > \
$status.status_bucket.csv
@@ -456,7 +456,7 @@ task SetIsLoadedColumn {
# an exponential backoff, but at the number of samples that are being loaded this would introduce significant delays
# in workflow processing. So this method is used to set *all* of the saple_info.is_loaded flags at one time.

- # bq query check: ok update
+ # bq query --max_rows check: ok update
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true
WHERE sample_id IN (SELECT CAST(partition_id AS INT64)
@@ -527,7 +527,7 @@ task GetUningestedSampleIds {
bq --apilog=false load --project_id=~{project_id} ~{temp_table} ~{external_sample_names} "sample_name:STRING"

# Get the current min/max id, or 0 if there are none. Withdrawn samples still have IDs so don't filter them out.
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} '
SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM `~{dataset_name}.~{table_name}`
@@ -558,7 +558,7 @@ task GetUningestedSampleIds {
samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status = '$status') AND
samples.withdrawn is NULL" > query.txt

- # bq query check: ok sets max rows explicitly
+ # bq query --max_rows check: ok sets max rows explicitly
cat query.txt |
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} > \
$status.status_bucket.csv
