Add prior_versions_index.parquet to the idc_index_data package.

Summary of the changes in this patch:
  * CMakeLists.txt: list prior_versions_index.parquet as an OUTPUT of the
    idc_index_data_manager.py custom command, as a dependency of the
    run_idc_index_data_manager target, and as a file installed into
    "idc_index_data".
  * scripts/sql/prior_versions_index.sql (new): BigQuery script that unions
    dicom_all from idc_v1 .. idc_v<latest_idc_version>, keeps only series whose
    crdc_series_uuid is absent from the latest version, and reports the
    MIN/MAX idc_version in which each such series appears.
  * src/idc_index_data/__init__.py: export PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH.
  * tests/test_package.py: read the new parquet file when its path is not None.

NOTE(review): the generator-expression conditions are written here as
$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}> and
$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>; confirm both option names against
the option() declarations in CMakeLists.txt before applying.
NOTE(review): indentation of context lines was reconstructed; apply with
`git apply --ignore-whitespace` if the target files indent differently.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c839bb..413c530 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ add_custom_command(
   OUTPUT
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
   COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
@@ -28,10 +29,12 @@ add_custom_target(run_idc_index_data_manager ALL
   DEPENDS
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
   )
 
 install(
   FILES
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
   DESTINATION "idc_index_data")
diff --git a/scripts/sql/prior_versions_index.sql b/scripts/sql/prior_versions_index.sql
new file mode 100644
index 0000000..43ff81f
--- /dev/null
+++ b/scripts/sql/prior_versions_index.sql
@@ -0,0 +1,89 @@
+-- For details on the syntax, see
+-- https://cloud.google.com/bigquery/docs/reference/standard-sql/procedural-language
+--
+-- Step 1: Declare variables
+DECLARE idc_versions ARRAY<INT64>;
+DECLARE latest_idc_version INT64 DEFAULT 18;
+DECLARE union_all_query STRING;
+
+--Step 2
+--SET latest_idc_version = (
+--SELECT max(idc_version)
+--FROM
+--bigquery-public-data.idc_current.version_metadata
+--);
+
+-- Step 3: Get all idc_versions
+SET idc_versions = (
+  SELECT GENERATE_ARRAY(1, latest_idc_version)
+  -- SELECT [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
+  --SELECT ARRAY_AGG(idc_version)
+  --FROM
+  --`bigquery-public-data.idc_current.version_metadata`
+);
+
+-- Step 4: Generate the UNION ALL query dynamically
+SET union_all_query = (
+  SELECT STRING_AGG(
+    FORMAT("""
+    SELECT
+      %d AS idc_version,
+      collection_id,
+      PatientID,
+      SeriesInstanceUID,
+      StudyInstanceUID,
+      Modality,
+      regexp_extract(gcs_url, 'gs://([^/]+)/') as gcs_bucket,
+      crdc_series_uuid,
+      ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
+    FROM
+      `bigquery-public-data.idc_v%d.dicom_all` AS dicom_all
+      where crdc_series_uuid not in (select distinct crdc_series_uuid from `bigquery-public-data.idc_v%d.dicom_all`)
+    GROUP BY
+      1,2,3,4,5,6,7,8
+
+    """,
+    version, version, latest_idc_version),
+    " UNION ALL "
+  )
+  FROM UNNEST(idc_versions) AS version
+);
+
+-- Step 5: Execute the complete query
+EXECUTE IMMEDIATE FORMAT("""
+WITH all_versions AS (
+  %s
+)
+SELECT
+  collection_id,
+  PatientID,
+  SeriesInstanceUID,
+  StudyInstanceUID,
+  Modality,
+  gcs_bucket,
+  crdc_series_uuid,
+  series_size_MB,
+  CASE
+
+    # map GCS bucket to AWS bucket, since for idc-index we prefer AWS
+    # if new buckets are included in IDC, this will need to be updated!
+
+    WHEN gcs_bucket='public-datasets-idc' THEN CONCAT('s3://','idc-open-data/',crdc_series_uuid, '/*')
+    WHEN gcs_bucket='idc-open-idc1' THEN CONCAT('s3://','idc-open-data-two/',crdc_series_uuid, '/*')
+    WHEN gcs_bucket='idc-open-cr' THEN CONCAT('s3://','idc-open-data-cr/',crdc_series_uuid, '/*')
+  END AS series_aws_url,
+  MIN(idc_version) AS min_idc_version,
+  MAX(idc_version) AS max_idc_version
+FROM all_versions
+
+where gcs_bucket not in ('idc-open-idc')
+
+#per @bcli4d:idc-open-idc was our public bucket before we moved most data to the Google owned public-datasets-idc.
+#We decided at the time to not touch BQ. To deal with this and other cases where some metadata can change (Licences),
+#we include the mutable_metadata table which maps crdc_instance_uuid to current gcs_url, aws_url, license, doi.
+
+GROUP BY
+  1,2,3,4,5,6,7,8
+  """,
+  union_all_query
+);
diff --git a/src/idc_index_data/__init__.py b/src/idc_index_data/__init__.py
index 6ba42e1..f2fcd4c 100644
--- a/src/idc_index_data/__init__.py
+++ b/src/idc_index_data/__init__.py
@@ -15,6 +15,7 @@
     "__version__",
     "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
     "IDC_INDEX_PARQUET_FILEPATH",
+    "PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
 ]
 
 
@@ -36,3 +37,6 @@ def _lookup(path: str, optional: bool = False) -> Path | None:
     "idc_index_data/idc_index.csv.zip", optional=True
 )
 IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup("idc_index_data/idc_index.parquet")
+PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
+    "idc_index_data/prior_versions_index.parquet"
+)
diff --git a/tests/test_package.py b/tests/test_package.py
index 66c7a46..1a35706 100644
--- a/tests/test_package.py
+++ b/tests/test_package.py
@@ -38,3 +38,8 @@ def test_reading_index():
         assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
         df_parquet = pd.read_parquet(m.IDC_INDEX_PARQUET_FILEPATH)
         assert not df_parquet.empty
+
+    if m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH is not None:
+        assert m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH.is_file()
+        df_parquet = pd.read_parquet(m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH)
+        assert not df_parquet.empty