-
Notifications
You must be signed in to change notification settings - Fork 0
/
nanotime_v2.py
92 lines (73 loc) · 3.05 KB
/
nanotime_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
#coding=utf-8
from argparse import ArgumentParser
import os, gzip, pathlib
# import numpy as np
# import pandas as pd
# from dateutil.parser import parse as dparse
# from dateutil.relativedelta import relativedelta
"""
To obtain the pool start time, nanotime.py first scans all fastq/signal files
and finds the earliest start_time among them.
"""
def get_args(argv=None):
    """Parse the Nanotime v2 command-line options.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. The default (``None``) makes argparse read
            ``sys.argv[1:]``, exactly as before.

    Returns:
        The parsed ``argparse.Namespace`` with ``fqz``, ``ses`` and ``out`` as
        strings and ``len`` as a float (``downsampling_fastq_creator``
        multiplies it by 3600, i.e. treats it as hours).
    """
    parser = ArgumentParser(description="Nanotime v2. Nanopore reads extraction based on the start time of sequencing; The process on fast5(signal) file is still under development")
    parser.add_argument('-q', "--fqz", help="the path of compressed fastq file (e.g. *.fastq.gz)", required=True, type=str)
    parser.add_argument('-s', "--ses", help="the path to sequencing_summary.txt", required=True, type=str)
    # Parsed as a float so that a non-numeric value is rejected by argparse
    # with a usage message instead of failing later in the pipeline. The
    # option is required, so no default is declared (it could never apply).
    parser.add_argument('-l', "--len", help="the length of run of sequencing (e.g. 1)", required=True, type=float)
    parser.add_argument('-o', "--out", help="the path of output", required=True, type=str)
    return parser.parse_args(argv)
def main():
    """Script entry point: parse the command line, then build the down-sampled FASTQ."""
    downsampling_fastq_creator(get_args())
def downsampling_fastq_creator(input_args):
    """Select the FASTQ records whose reads started early enough and save them.

    The per-read start times are loaded from the sequencing summary
    (``input_args.ses``). The gzipped FASTQ (``input_args.fqz``) is then read
    in four-line records, and a record is kept when its read's start time is
    at most ``input_args.len`` hours (converted to seconds). The kept records
    are passed to ``fastq_saver`` together with ``input_args``.

    Args:
        input_args: Parsed options providing ``ses``, ``fqz`` and ``len``
            (plus whatever ``fastq_saver`` reads from it).

    Raises:
        KeyError: If a read id found in the FASTQ has no entry in the
            sequencing summary.
    """
    start_times = readids_dict_collector(input_args.ses)  # {read_id: start_time}
    max_start = float(input_args.len) * 3600  # convert hours to seconds

    kept = {}  # header line -> list of that record's lines, in file order
    current = None  # lines of the record being kept; None while skipping one
    with gzip.open(input_args.fqz, 'rb') as file:
        for lcn, raw_line in enumerate(file):
            line = raw_line.decode('utf-8')
            if lcn % 4 == 0 and line.startswith('@'):
                # The read id is the header text up to the first whitespace.
                # Splitting on any whitespace also removes the trailing
                # newline of a header that carries no description.
                fields = line[1:].split(None, 1)
                readid = fields[0] if fields else ''
                try:
                    start_time = start_times[readid]
                except KeyError:
                    raise KeyError(
                        "read id {!r} from {} is not listed in {}".format(
                            readid, input_args.fqz, input_args.ses)
                    ) from None
                if start_time <= max_start:
                    current = kept[line] = [line]
                else:
                    current = None
            elif current is not None:
                # Lines are stored verbatim: stripping them would erase the
                # newline of an empty sequence/quality line and merge lines.
                current.append(line)

    fq_dict = {header: ''.join(lines) for header, lines in kept.items()}
    fastq_saver(fq_dict, input_args)
def readids_dict_collector(summary_path):
    """Map every read id in a sequencing summary file to its start time.

    The first line of the file is the header; it must contain the columns
    ``read_id`` and ``start_time``. When the header contains a tab the file is
    parsed as tab-separated, so empty fields and values containing spaces keep
    their column position. Otherwise rows are split on runs of whitespace.
    Blank lines are ignored.

    Args:
        summary_path: Path to the sequencing summary text file.

    Returns:
        A dict ``{read_id: start_time}`` with ``start_time`` converted to
        float. An empty file yields an empty dict.

    Raises:
        ValueError: If the header lacks ``read_id`` or ``start_time``, or a
            start time cannot be converted to float.
    """
    readids_time_dict = {}
    with open(summary_path, 'r') as file:
        header = file.readline()
        if not header:
            return readids_time_dict

        # None makes str.split() fall back to splitting on any whitespace.
        delimiter = '\t' if '\t' in header else None
        columns = [name.strip() for name in header.rstrip('\r\n').split(delimiter)]
        column_index = {name: ind for ind, name in enumerate(columns)}
        missing = [name for name in ('read_id', 'start_time') if name not in column_index]
        if missing:
            raise ValueError(
                "{}: header is missing column(s): {}".format(summary_path, ", ".join(missing))
            )
        read_id_col = column_index['read_id']
        start_time_col = column_index['start_time']

        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = line.rstrip('\r\n').split(delimiter)
            readids_time_dict[fields[read_id_col].strip()] = float(fields[start_time_col])
    return readids_time_dict
def fastq_saver(fq_dict, input_args):
    """Write the selected FASTQ records to a gzip-compressed file.

    Args:
        fq_dict: Mapping whose values are the record texts to write; they are
            written in the mapping's iteration order, UTF-8 encoded. The keys
            themselves are not written.
        input_args: Object whose ``out`` attribute is the output file path.
            Missing parent directories are created.
    """
    output_path = pathlib.Path(input_args.out)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # The context manager closes the gzip stream even if a write fails
    # part-way through, so the file handle is never leaked.
    with gzip.open(output_path, 'wb') as output:
        for record in fq_dict.values():
            output.write(record.encode('utf-8'))
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()