even better error messages
mcovarr committed May 28, 2024
1 parent 3cfb543 commit fcb5911
Showing 14 changed files with 54 additions and 54 deletions.
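Context for the comment changes below: they audit each bq query call against the bq CLI's default row cap, where bq query returns at most 100 result rows unless --max_rows (or its -n shorthand) is raised, silently truncating larger result sets. A minimal sketch of the two patterns the comments distinguish, using a placeholder project and table rather than anything from this repository:

# Full result set matters: raise the cap well above the expected row count.
# PROJECT_ID and my_dataset.my_samples are placeholders for illustration only.
bq --apilog=false query --nouse_legacy_sql --project_id="${PROJECT_ID}" --format=csv \
    --max_rows 1000000000 \
    'SELECT sample_name FROM `my_dataset.my_samples`' | sed 1d > sample-name-list.txt

# Single-row aggregates, DML, and EXPORT DATA statements are unaffected by the cap.
bq --apilog=false query --nouse_legacy_sql --project_id="${PROJECT_ID}" --format=csv \
    'SELECT COUNT(*) AS num_samples FROM `my_dataset.my_samples`'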
@@ -82,7 +82,7 @@ task BigQueryExportVat {
echo "project_id = ~{project_id}" > ~/.bigqueryrc

# note: tab delimiter and compression creates tsv.gz files
- # bq query check: ok export
+ # bq query --max_rows check: ok export
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} \
'EXPORT DATA OPTIONS(
uri="~{export_path}",
32 changes: 16 additions & 16 deletions scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl
@@ -315,7 +315,7 @@ task EnsureVatTableHasVariants {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{fq_vat_table}`' > bq_variant_count.csv

NUMVARS=$(python3 -c "csvObj=open('bq_variant_count.csv','r');csvContents=csvObj.read();print(csvContents.split('\n')[1]);")
@@ -370,7 +370,7 @@ task SpotCheckForExpectedTranscripts {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
contig,
position,
@@ -440,7 +440,7 @@ task SchemaNoNullRequiredFields {

# non-nullable fields: vid, contig, position, ref_allele, alt_allele, gvs_all_ac, gvs_all_an, gvs_all_af, variant_type, genomic_location

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv \
'SELECT
contig,
@@ -520,7 +520,7 @@ task SchemaOnlyOneRowPerNullTranscript {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid,
COUNT(vid) AS num_rows
@@ -581,7 +581,7 @@ task SchemaPrimaryKey {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv \
'SELECT
vid,
@@ -641,7 +641,7 @@ task SchemaEnsemblTranscripts {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
contig,
position,
@@ -702,7 +702,7 @@ task SchemaNonzeroAcAn {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message updated
+ # bq query --max_rows check: ok may produce > 100 rows but anything > 0 is an error, error message explicit about row limit
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
contig,
position,
@@ -765,7 +765,7 @@ task SchemaNullTranscriptsExist {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is fine; zero is the error case.
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is fine; zero is the error case, error message is fine.
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -823,7 +823,7 @@ task SubpopulationMax {

# gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"]

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -886,7 +886,7 @@ task SubpopulationAlleleCount {

# gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"]

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -943,7 +943,7 @@ task SubpopulationAlleleNumber {

# gvs subpopulations: [ "afr", "amr", "eas", "eur", "mid", "oth", "sas"]

- # bq query check: may produce > 100 rows but anything > 0 is an error; error message is fine
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
vid
FROM
@@ -999,15 +999,15 @@ task DuplicateAnnotations {

echo "project_id = ~{query_project_id}" > ~/.bigqueryrc

- # bq query check: may produce > 100 rows but anything > 0 is an error; updated error message and other fixes
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine.
bq --apilog=false query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv '
SELECT contig, position, gvs_all_an, COUNT(DISTINCT gvs_all_an) AS an_count
FROM `~{fq_vat_table}`
GROUP BY contig, position, gvs_all_an
HAVING an_count > 1
' > bq_an_output.csv

- # bq query check: may produce > 100 rows but anything > 0 is an error; updated error message and other fixes
+ # bq query --max_rows check: may produce > 100 rows but anything > 0 is an error; error message is fine.
bq --apilog=false query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv '
SELECT contig, position, gvs_all_ac, COUNT(DISTINCT gvs_all_ac) AS ac_count
FROM `~{fq_vat_table}`
@@ -1086,7 +1086,7 @@ task ClinvarSignificance {
# "other",
# "not provided"]

- # bq query check: we currently expect this to be at least 13 but it could be more, set --max_rows to a ridiculously high value.
+ # bq query --max_rows check: we currently expect this to be at least 13 but it could be more, set --max_rows to a ridiculously high value.
bq --apilog=false query --max_rows 1000000 --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
distinct(unnested_clinvar_classification)
FROM
@@ -1161,7 +1161,7 @@ task SchemaAAChangeAndExonNumberConsistent {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: ok single row
+ # bq query --max_rows check: ok single row
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
COUNT (DISTINCT vid) AS count FROM
(
@@ -1262,7 +1262,7 @@ task SpotCheckForAAChangeAndExonNumberConsistency {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

- # bq query check: ok single row
+ # bq query --max_rows check: ok single row
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT
COUNT (DISTINCT vid) FROM
(
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -203,23 +203,23 @@ task AssignIds {
bq --apilog=false load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock ~{sample_names} "sample_name:STRING"

# add sample_name to sample_info_table
- # bq query check: ok insert
+ # bq query --max_rows check: ok insert
bq --apilog=false --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \
'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

# get the current maximum id, or 0 if there are none
- # bq query check: ok single row
+ # bq query --max_rows check: ok single row
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
offset=$(tail -1 maxid)

# perform actual id assignment
- # bq query check: ok update
+ # bq query --max_rows check: ok update
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \
'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

# retrieve the list of assigned ids and samples to update the datamodel
echo "entity:sample_id,gvs_id" > update.tsv
- # bq query check: ok num samples explicit
+ # bq query --max_rows check: ok num samples explicit
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \
'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCallsetCost.wdl
@@ -117,7 +117,7 @@ task CoreStorageModelSizes {
local table_pattern="$1"
local output_file_name="$2"

- # bq query check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
+ # bq query --max_rows check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
bq --apilog=false query --max_rows 10000000 --project_id='~{project_id}' --format=csv --use_legacy_sql=false \
'SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2)
FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS`
@@ -155,7 +155,7 @@ task ReadCostObservabilityTable {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

- # bq query check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
+ # bq query --max_rows check: explicitly set massive max rows as we expect there to be as many rows as there are superpartitions
bq --apilog=false query --max_rows 10000000 --project_id=~{project_id} --format=prettyjson --use_legacy_sql=false \
'SELECT step, event_key, round(sum(event_bytes) / (1024*1024*1024), 2) AS sum_event_gibibytes
FROM `~{project_id}.~{dataset_name}.cost_observability`
14 changes: 7 additions & 7 deletions scripts/variantstore/wdl/GvsCallsetStatistics.wdl
@@ -322,7 +322,7 @@ task CollectMetricsForChromosome {
echo "project_id = ~{project_id}" > ~/.bigqueryrc
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{metrics_table}` WHERE chromosome = ~{chromosome}' | sed 1d > existing_row_count.txt
@@ -333,7 +333,7 @@ task CollectMetricsForChromosome {
exit 1
fi
- # bq query check: ok insert (elaborate one)
+ # bq query --max_rows check: ok insert (elaborate one)
bq --apilog=false query --project_id=~{project_id} --use_legacy_sql=false '
CREATE TEMPORARY FUNCTION titv(ref STRING, allele STRING)
RETURNS STRING
@@ -455,7 +455,7 @@ task AggregateMetricsAcrossChromosomes {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}`
' | sed 1d > existing_row_count.txt
@@ -467,7 +467,7 @@ task AggregateMetricsAcrossChromosomes {
exit 1
fi
- # bq query check: ok insert
+ # bq query --max_rows check: ok insert
bq --apilog=false query --project_id=~{project_id} --use_legacy_sql=false '
INSERT `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` (
filter_set_name,
@@ -532,7 +532,7 @@ task CollectStatistics {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
SELECT COUNT(*) from `~{project_id}.~{dataset_name}.~{statistics_table}`
' | sed 1d > existing_row_count.txt
@@ -544,7 +544,7 @@ task CollectStatistics {
exit 1
fi
- # bq query check: ok insert
+ # bq query --max_rows check: ok insert
bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false '
INSERT `~{project_id}.~{dataset_name}.~{statistics_table}` (
sample_id,
@@ -604,7 +604,7 @@ task ExportToCSV {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace
- # bq query check: max rows set to at least the number of samples
+ # bq query --max_rows check: max rows set to at least the number of samples
bq --apilog=false query --nouse_legacy_sql --project_id=~{project_id} --format=csv --max_rows 1000000000 '
SELECT * FROM `~{project_id}.~{dataset_name}.~{statistics_table}` ORDER BY SAMPLE_NAME
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
@@ -863,7 +863,7 @@ task BigQueryLoadJson {
# We want the vat creation query to overwrite the destination table because if new data has been put into the pre-vat tables
# and this workflow has been run an additional time, we dont want duplicates being appended from the original run
- # bq query check: ok selecting into a table
+ # bq query --max_rows check: ok selecting into a table
bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{vat_table_name} --replace --project_id=~{project_id} \
'SELECT
v.vid,
@@ -1033,7 +1033,7 @@ task DeduplicateVatInBigQuery {
# Now we query the original VAT table and recreate it, but remove any rows that appear twice.
- # bq query check: ok selecting into a table
+ # bq query --max_rows check: ok selecting into a table
bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{output_vat_table_name} --replace --project_id=~{project_id} \
' SELECT * EXCEPT(row_number) FROM (
SELECT
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -557,7 +557,7 @@ task GenerateSampleListFile {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

- # bq query check: max rows set to at least the number of samples
+ # bq query --max_rows check: max rows set to at least the number of samples
bq --apilog=false --project_id=~{query_project} --format=csv query --max_rows 1000000000 --use_legacy_sql=false ~{bq_labels} \
'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt

2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl
@@ -603,7 +603,7 @@ task GenerateSampleListFile {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

- # bq query check: max rows set to at least the number of samples
+ # bq query --max_rows check: max rows set to at least the number of samples
bq --apilog=false --project_id=~{query_project} --format=csv query --max_rows 1000000000 --use_legacy_sql=false ~{bq_labels} \
'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt

10 changes: 5 additions & 5 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -304,7 +304,7 @@ task LoadData {
bq --apilog=false load --project_id=~{project_id} ~{temp_table} $NAMES_FILE "sample_name:STRING"

# Get the current min/max id, or 0 if there are none. Withdrawn samples still have IDs so don't filter them out.
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} '
SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM `~{dataset_name}.~{table_name}`
AS samples JOIN `~{temp_table}` AS temp ON samples.sample_name = temp.sample_name' > results.csv
@@ -319,7 +319,7 @@ task LoadData {
samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status = '$status') AND
samples.withdrawn is NULL" > query.txt

- # bq query check: ok sets max rows explicitly
+ # bq query --max_rows check: ok sets max rows explicitly
cat query.txt |
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} > \
$status.status_bucket.csv
@@ -456,7 +456,7 @@ task SetIsLoadedColumn {
# an exponential backoff, but at the number of samples that are being loaded this would introduce significant delays
# in workflow processing. So this method is used to set *all* of the saple_info.is_loaded flags at one time.

- # bq query check: ok update
+ # bq query --max_rows check: ok update
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true
WHERE sample_id IN (SELECT CAST(partition_id AS INT64)
@@ -527,7 +527,7 @@ task GetUningestedSampleIds {
bq --apilog=false load --project_id=~{project_id} ~{temp_table} ~{external_sample_names} "sample_name:STRING"

# Get the current min/max id, or 0 if there are none. Withdrawn samples still have IDs so don't filter them out.
- # bq query check: ok one row
+ # bq query --max_rows check: ok one row
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} '
SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM `~{dataset_name}.~{table_name}`
@@ -558,7 +558,7 @@ task GetUningestedSampleIds {
samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status = '$status') AND
samples.withdrawn is NULL" > query.txt

- # bq query check: ok sets max rows explicitly
+ # bq query --max_rows check: ok sets max rows explicitly
cat query.txt |
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} > \
$status.status_bucket.csv
