Skip to content

Commit

Permalink
routing merge"
Browse files Browse the repository at this point in the history
  • Loading branch information
Akaud committed Sep 19, 2024
1 parent 7452996 commit fbe20ef
Show file tree
Hide file tree
Showing 4 changed files with 583 additions and 22 deletions.
20 changes: 0 additions & 20 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,6 @@
GNOMAD_PATH,
)

# DATA COLLECTION IMPORT
from .collection import (
# Custom exceptions
BadResponseException,
DownloadError,

# Custom utility functions
get_file_from_url,

# Functions for downloading databases
download_lovd_database_for_eys_gene,
download_genes_lovd,
download_database_for_eys_gene,
download_data_from_gnomad_eys,

# Functions for storing databases
store_database_for_eys_gene

)

# DATA REFACTORING IMPORT
from .refactoring import (
# Functions for refactoring data
Expand Down
2 changes: 2 additions & 0 deletions api/data/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
# Per-source data directories, all located under DATA_PATH (trailing slash kept).
LOVD_PATH = os.path.join(DATA_PATH, "lovd/")
GNOMAD_PATH = os.path.join(DATA_PATH, "gnomad/")
CLINVAR_PATH = os.path.join(DATA_PATH, "clinvar/")
# Directory (note the trailing slash) under DATA_PATH intended for merged output.
DEFAULT_SAVE_PATH = os.path.join(DATA_PATH, "merged_data/")
# CSV file for the merged LOVD + gnomAD table.
# NOTE(review): unlike the constants above this is a relative path, so it is
# resolved against the current working directory at the time of use -- confirm
# this is intended rather than a DATA_PATH-based location.
SAVE_LOVD_GNOMAD = "../data/merged_lovd_gnomad/lovd_gnomad.csv"

# variable data types
LOVD_TABLES_DATA_TYPES = {
Expand Down
62 changes: 60 additions & 2 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from pyliftover import LiftOver

from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH

from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH, DEFAULT_SAVE_PATH, \
SAVE_LOVD_GNOMAD


def set_lovd_dtypes(df_dict):
Expand Down Expand Up @@ -348,3 +348,61 @@ def find_popmax_in_gnomad(data):
data.loc[i, 'Popmax'] = max_pop
data.loc[i, 'Popmax population'] = population_mapping[max_id]


def routing_merge(lovd_path: str = LOVD_PATH, gnomad_path: str = GNOMAD_PATH,
                  save_path: str = DEFAULT_SAVE_PATH, overwrite: bool = False) -> None:
    """
    Merge the LOVD and gnomAD data found in the given directories and save the result as CSV.

    Reads ``lovd_data.txt`` from ``lovd_path`` and ``gnomad_data.csv`` from
    ``gnomad_path``, joins the LOVD "Variants_On_Transcripts" table with the
    genomic DNA columns of "Variants_On_Genome" on ``id``, merges the result
    with the gnomAD table via ``merge_gnomad_lovd`` and writes it to disk.

    :param lovd_path: directory containing ``lovd_data.txt``
    :param gnomad_path: directory containing ``gnomad_data.csv``
    :param save_path: where to save the merged data. Either the path of the CSV
        file itself, or a directory (a path with a trailing separator, or an
        existing directory); in the latter case the file name of
        ``SAVE_LOVD_GNOMAD`` is used inside that directory
    :param overwrite: if False (default) and the output file already exists,
        nothing is done; if True the output file is regenerated and replaced
    :return: None
    :raises FileNotFoundError: if one of the two input files does not exist
    """

    # A trailing separator or an existing directory means "place the file in
    # here"; anything else is taken as the full path of the output file.
    if save_path.endswith(("/", os.sep)) or os.path.isdir(save_path):
        save_to = os.path.join(save_path, os.path.basename(SAVE_LOVD_GNOMAD))
    else:
        save_to = save_path

    # Keep an existing result unless the caller explicitly asked to replace it.
    if os.path.exists(save_to) and not overwrite:
        return

    lovd_file = os.path.join(lovd_path, "lovd_data.txt")
    gnomad_file = os.path.join(gnomad_path, "gnomad_data.csv")

    if not os.path.exists(lovd_file):
        raise FileNotFoundError(f"LOVD data file not found at: {lovd_file}")

    if not os.path.exists(gnomad_file):
        raise FileNotFoundError(f"gnomAD data file not found at: {gnomad_file}")

    lovd_data = parse_lovd(lovd_file)
    gnomad_data = parse_gnomad(gnomad_file)

    set_lovd_dtypes(lovd_data)
    set_gnomad_dtypes(gnomad_data)

    # Attach the genomic DNA columns to every transcript-level variant; a left
    # join keeps transcript rows that have no matching genome entry.
    genome_columns = ['id', 'VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38']
    lovd_data = pd.merge(
        lovd_data["Variants_On_Transcripts"],
        lovd_data["Variants_On_Genome"][genome_columns],
        on='id',
        how='left'
    )

    final_data = merge_gnomad_lovd(lovd_data, gnomad_data)

    # The target directory has to exist before writing; dirname is empty for a
    # bare file name, in which case there is nothing to create.
    save_to_dir = os.path.dirname(save_to)
    if save_to_dir:
        os.makedirs(save_to_dir, exist_ok=True)

    try:
        final_data.to_csv(save_to)
    except OSError as e:
        # Best-effort save, as before: report the failure instead of raising.
        print(f"Error saving file: {e}")
    else:
        print(f"Merged data saved to {save_to}")

Loading

0 comments on commit fbe20ef

Please sign in to comment.