-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiles.py
138 lines (107 loc) · 5.02 KB
/
files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Copyright (C) 2018 Ahmad A. A. (https://github.com/bbpgrs/)
import requests
import json
import os
import common as cmn
import logging
import gzip
import shutil
"""
A script to download GDC RNA-seq or miRNA-seq files using the list generated by
the accompanying file_lists.py
"""
# GDC API data endpoint URL (downloading files).
# Built by appending 'data' to cmn.BASE_URL; _download_files() requests
# "<URL>/<file_id>" to fetch one file.
# NOTE(review): assumes cmn.BASE_URL already ends with '/' - confirm in common.py
URL = cmn.BASE_URL+'data'
# Number of bytes pulled from the HTTP response per write to disk
_CHUNK_SIZE = 64 * 1024


def _manifest_attr(manifest, key_path, label):
    """
    Look up a nested attribute of one file manifest
    :param manifest: Object the json object describing a single file
    :param key_path: Sequence of keys/indices to follow from the manifest root
    :param label: String attribute name used in the warning message
    :return: the attribute value, or "unknown" if any step of the path is missing
    """
    value = manifest
    try:
        for key in key_path:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        # KeyError: missing key; IndexError: empty list; TypeError: null sub-object
        logging.warning("File manifest did not provide attribute '%s' for tissue sample." % label)
        return "unknown"
    return value


def _fetch_to_path(file_id, dest_path):
    """
    Stream a single file from the data endpoint to disk
    :param file_id: String UUID of the file to request
    :param dest_path: String path the downloaded file is written to
    :return: True if the file was written to dest_path, False if the request failed
    """
    # Download into a side file first so that an interrupted transfer never
    # leaves a truncated file at dest_path (it would be taken for a complete
    # download by the "already exists" check on the next run).
    part_path = dest_path + '.part'
    try:
        # stream=True keeps the body out of memory until iter_content reads it
        r = requests.get("%s/%s" % (URL, file_id), stream=True)
        try:
            logging.info(r)
            if r.status_code != requests.codes.ok:
                logging.error("Request failed, skipping file.\n")
                return False
            logging.info("Request ok, downloading file ...")
            logging.info("Writing file to '%s' ..." % dest_path)
            # write HTTP response to the file as a byte stream
            with open(part_path, 'wb') as file_out:
                for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                    file_out.write(chunk)
        finally:
            # a streamed response holds its connection until explicitly closed
            r.close()
    except requests.exceptions.RequestException as err:
        logging.error("Request failed (%s), skipping file.\n" % err)
        if os.path.isfile(part_path):
            os.remove(part_path)
        return False
    shutil.move(part_path, dest_path)
    return True


def _extract_gzip(archive_path, text_path):
    """
    Decompress a gzip archive into a text file
    Errors raised while reading the archive are not caught here.
    :param archive_path: String path of the gzip archive to read
    :param text_path: String path the decompressed content ends up at
    """
    # Same side-file approach as the download: text_path only appears once
    # the archive has been decompressed completely.
    part_path = text_path + '.part'
    with gzip.open(archive_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    shutil.move(part_path, text_path)


def _download_files(file_list, file_number):
    """
    Download files listed in one manifest list object
    Files that already exist on disk are skipped; a failed request is logged
    and the file is skipped. Gzip archives are extracted to '<name>.txt'
    (with the '.gz' suffix dropped) and the archive is removed afterwards.
    :param file_list: Object a json object of the list from one manifest-list file
    :param file_number: List contains 1 element that is the current number/index of the file being downloaded
    """
    # 'manifest' here is a file manifest, or information of a file
    for manifest in file_list["data"]["hits"]:
        logging.info("Processing file #%s ..." % file_number[0])
        # This is where the organizational structure of the directories for
        # the downloaded files is defined, using sample_manifest.json as a
        # guideline: <DL_DIR>/<primary_site>/<gender>/<sample_id>. The
        # directory is created if necessary.
        file_dir = os.path.join(
            cmn.DL_DIR,
            _manifest_attr(manifest, ("cases", 0, "project", "primary_site"), "primary_site"),
            _manifest_attr(manifest, ("cases", 0, "demographic", "gender"), "gender"),
            _manifest_attr(manifest, ("cases", 0, "samples", 0, "sample_id"), "sample_id"),
        )
        cmn.make_dir(file_dir)
        file_path = os.path.join(file_dir, manifest["file_name"])

        gzip_archive = file_path.endswith(('.gz', '.GZ'))
        if gzip_archive:
            logging.info("File is a gzip archive.")
            # the extracted text file is what remains on disk after a download
            target_path = '%s.txt' % file_path[:-3]
        else:
            target_path = file_path

        if os.path.isfile(target_path):
            logging.info("File '%s' already exists.\n" % target_path)
        else:
            logging.info("Requesting file '%s'..." % manifest["file_name"])
            # send a GET request for this single file using its UUID
            if _fetch_to_path(manifest["file_id"], file_path):
                if gzip_archive:
                    logging.info("Extracting archive to '%s' ..." % target_path)
                    _extract_gzip(file_path, target_path)
                    logging.info("Text file extracted, removing gzip archive ...")
                    os.remove(file_path)
                    logging.info("Gzip archive removed.")
                logging.info("File downloaded.\n")
        file_number[0] += 1
def download_files(file_number):
    """
    Read the manifest lists previously acquired and download
    the files listed in them.
    Every regular entry of cmn.FILE_LIST_DIR is parsed as a json manifest list;
    sub-directories are skipped. Returns early (after logging) if the
    directory does not exist.
    :param file_number: List contains 1 element that is the current number/index of the file being downloaded
    """
    if not os.path.exists(cmn.FILE_LIST_DIR):
        logging.info("Directory '%s' could not be found. Terminating ...\n" % cmn.FILE_LIST_DIR)
        return
    logging.info("Scanning directory '%s' for manifest-list files ...\n" % cmn.FILE_LIST_DIR)
    for lf in os.listdir(cmn.FILE_LIST_DIR):
        list_file = os.path.join(cmn.FILE_LIST_DIR, lf)
        # skip sub-directories and entries that vanished since the listing
        if os.path.isdir(list_file) or not os.path.exists(list_file):
            continue
        logging.info("Downloading files listed in manifest-list file '%s' ...\n" % list_file)
        # close the manifest-list file as soon as it is parsed
        with open(list_file) as manifest_list:
            file_list = json.load(manifest_list)
        _download_files(file_list, file_number)
    # file_number holds the index of the next file, hence the -1
    logging.info("%s files processed.\n" % (file_number[0]-1))
def run():
    """
    Entry point of the downloading script: make sure the download
    directory exists, then fetch every file listed in the previously
    acquired manifest lists.
    """
    cmn.make_dir(cmn.DL_DIR)
    # The running file number starts at 1 and is passed as a one-element
    # list so the callees can advance it in place.
    download_files([1])