ENH: add metadata necessary to download from previous idc-versions #32

Merged · 9 commits · Aug 2, 2024
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -19,6 +19,7 @@ add_custom_command(
OUTPUT
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
@@ -28,10 +29,12 @@ add_custom_target(run_idc_index_data_manager ALL
DEPENDS
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
)

install(
FILES
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
$<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
DESTINATION "idc_index_data")
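
For orientation, the custom command above reduces to a single script invocation; here is a minimal sketch in Python, where the script path and flags come from the diff and the wrapper, repository root, and working directory are assumptions:

import subprocess
from pathlib import Path

# Sketch of what CMake's add_custom_command runs when both options are ON.
repo_root = Path(".")  # assumed checkout root
subprocess.run(
    [
        "python",
        str(repo_root / "scripts/python/idc_index_data_manager.py"),
        "--generate-csv-archive",  # -> <download_dir>/idc_index.csv.zip
        "--generate-parquet",      # -> idc_index.parquet and, with this PR,
                                   #    prior_versions_index.parquet
    ],
    check=True,
)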
89 changes: 89 additions & 0 deletions scripts/sql/prior_versions_index.sql
@@ -0,0 +1,89 @@
-- For details on the syntax, see
-- https://cloud.google.com/bigquery/docs/reference/standard-sql/procedural-language
--
-- Step 1: Declare variables
DECLARE idc_versions ARRAY<INT64>;
DECLARE latest_idc_version INT64 DEFAULT 18;
DECLARE union_all_query STRING;

-- Step 2 (disabled): derive latest_idc_version dynamically instead of
-- using the hardcoded DEFAULT 18 declared above
-- SET latest_idc_version = (
--   SELECT MAX(idc_version)
--   FROM `bigquery-public-data.idc_current.version_metadata`
-- );

-- Step 3: Get all idc_versions
SET idc_versions = (
  SELECT GENERATE_ARRAY(1, latest_idc_version)
  -- alternatively:
  -- SELECT [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
  -- SELECT ARRAY_AGG(idc_version)
  -- FROM `bigquery-public-data.idc_current.version_metadata`
);
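
-- With latest_idc_version = 18, GENERATE_ARRAY(1, latest_idc_version)
-- produces [1, 2, 3, ..., 18], matching the literal array in the
-- commented-out alternative above.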

-- Step 4: Generate the UNION ALL query dynamically
SET union_all_query = (
SELECT STRING_AGG(
FORMAT("""
SELECT
%d AS idc_version,
collection_id,
PatientID,
SeriesInstanceUID,
StudyInstanceUID,
Modality,
REGEXP_EXTRACT(gcs_url, 'gs://([^/]+)/') AS gcs_bucket,
crdc_series_uuid,
ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
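-- instance_size is reported in bytes; dividing by 1000000 yields
-- decimal megabytes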
FROM
`bigquery-public-data.idc_v%d.dicom_all` AS dicom_all
WHERE crdc_series_uuid NOT IN (SELECT DISTINCT crdc_series_uuid FROM `bigquery-public-data.idc_v%d.dicom_all`)
GROUP BY
1,2,3,4,5,6,7,8

""",
version, version, latest_idc_version),
" UNION ALL "
)
FROM UNNEST(idc_versions) AS version
);
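
-- Each array element yields one fragment of the form (shown for version 3,
-- with latest_idc_version = 18):
--   SELECT 3 AS idc_version, ... FROM `bigquery-public-data.idc_v3.dicom_all`
--   WHERE crdc_series_uuid NOT IN (
--     SELECT DISTINCT crdc_series_uuid
--     FROM `bigquery-public-data.idc_v18.dicom_all`)
--   GROUP BY 1,2,3,4,5,6,7,8
-- and STRING_AGG joins the fragments with " UNION ALL ".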

-- Step 5: Execute the complete query
EXECUTE IMMEDIATE FORMAT("""
WITH all_versions AS (
%s
)
SELECT
collection_id,
PatientID,
SeriesInstanceUID,
StudyInstanceUID,
Modality,
gcs_bucket,
crdc_series_uuid,
series_size_MB,
CASE
  -- map GCS bucket to AWS bucket, since for idc-index we prefer AWS;
  -- if new buckets are included in IDC, this mapping will need to be updated!
  WHEN gcs_bucket='public-datasets-idc' THEN CONCAT('s3://','idc-open-data/',crdc_series_uuid, '/*')
  WHEN gcs_bucket='idc-open-idc1' THEN CONCAT('s3://','idc-open-data-two/',crdc_series_uuid, '/*')
  WHEN gcs_bucket='idc-open-cr' THEN CONCAT('s3://','idc-open-data-cr/',crdc_series_uuid, '/*')
END AS series_aws_url,
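-- The MIN/MAX below give the first and last IDC release in which this
-- crdc_series_uuid appears, i.e. the version range from which the series
-- can still be retrieved.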
MIN(idc_version) AS min_idc_version,
MAX(idc_version) AS max_idc_version
FROM all_versions

WHERE gcs_bucket NOT IN ('idc-open-idc')
-- per @bcli4d: idc-open-idc was our public bucket before we moved most data
-- to the Google-owned public-datasets-idc. We decided at the time not to
-- touch BQ. To deal with this and other cases where some metadata can change
-- (licenses), we include the mutable_metadata table, which maps
-- crdc_instance_uuid to the current gcs_url, aws_url, license, and doi.

GROUP BY
1,2,3,4,5,6,7,8
""",
union_all_query
);
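
For readers tracing a row of the resulting index, the CASE expression above corresponds to the following small lookup; this is an illustrative sketch only, with hypothetical names, not part of the package:

# Sketch of the GCS-to-AWS bucket mapping encoded in the SQL CASE expression.
GCS_TO_AWS_BUCKET = {
    "public-datasets-idc": "idc-open-data",
    "idc-open-idc1": "idc-open-data-two",
    "idc-open-cr": "idc-open-data-cr",
}

def series_aws_url(gcs_bucket: str, crdc_series_uuid: str) -> str | None:
    # Mirrors the SQL CASE: unmapped buckets fall through to NULL/None.
    aws_bucket = GCS_TO_AWS_BUCKET.get(gcs_bucket)
    if aws_bucket is None:
        return None
    return f"s3://{aws_bucket}/{crdc_series_uuid}/*"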
4 changes: 4 additions & 0 deletions src/idc_index_data/__init__.py
@@ -15,6 +15,7 @@
"__version__",
"IDC_INDEX_CSV_ARCHIVE_FILEPATH",
"IDC_INDEX_PARQUET_FILEPATH",
"PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
]


@@ -36,3 +37,6 @@ def _lookup(path: str, optional: bool = False) -> Path | None:
"idc_index_data/idc_index.csv.zip", optional=True
)
IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup("idc_index_data/idc_index.parquet")
PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
"idc_index_data/prior_versions_index.parquet"
)
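
Downstream code can locate the new index the same way as the existing ones; a minimal usage sketch, assuming pandas is available (as in the tests below):

import pandas as pd

import idc_index_data

# The constant is None when the parquet file was not generated/installed.
if idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH is not None:
    df = pd.read_parquet(idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH)
    # Columns follow the SQL above, e.g. crdc_series_uuid, series_aws_url,
    # min_idc_version, max_idc_version.
    print(df[["crdc_series_uuid", "min_idc_version", "max_idc_version"]].head())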
5 changes: 5 additions & 0 deletions tests/test_package.py
@@ -38,3 +38,8 @@ def test_reading_index():
assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
df_parquet = pd.read_parquet(m.IDC_INDEX_PARQUET_FILEPATH)
assert not df_parquet.empty

if m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH is not None:
assert m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH.is_file()
df_parquet = pd.read_parquet(m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH)
assert not df_parquet.empty