-
Notifications
You must be signed in to change notification settings - Fork 0
/
cLikeFastaStats.py
153 lines (118 loc) · 5.34 KB
/
cLikeFastaStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/python
import sys
import os
import os.path
import argparse
import re
import string
import logging
import warnings
## simpleFastaStats.py takes a single fasta-formatted DNA/RNA text file and
## outputs contig count, average contig length, N50 contig lengths, maximum contig length, and cumulative contig length
## Function: A closure for file extension checking
def ext_check(expected_ext, openner):
def extension(filename):
if not filename.lower().endswith(expected_ext):
raise ValueError()
return openner(filename)
return extension
## Function: Filename extractor from filepath
def getIsolateID(filePathString):
splitStr = re.split(pattern='/', string=filePathString)
fileNameIdx = len(splitStr) - 1
isolateString = re.split(pattern='\.', string=splitStr[fileNameIdx])
if(len(isolateString[0]) < 10):
isolateString = re.split(pattern='\.', string=splitStr[0])
return isolateString[0]
parser = argparse.ArgumentParser(description='output contig count, average contig length, N50 contig, and maximum contig length for input.assembly.fasta', usage="simpleFastaStats.py filepath/input.assembly.fasta --minLength 500(default) --format [brief(default)|verbose|tsv|csv]")
parser.add_argument("filename",type=ext_check('.fasta', argparse.FileType('r')))
parser.add_argument("--minLength", '-min', default='500', type=int)
parser.add_argument("--format", default='brief', type = lambda s : s.lower(), choices=['tsv', 'csv', 'brief', 'verbose', 'clcstyle', 'c', 's', 'b', 'v', 'l'])
args = parser.parse_args()
## call function to extract filename/isolate from inFilePath
inFileName = getIsolateID(args.filename.name)
## Open input file
filehandle = open(args.filename.name, 'r')
intMinLen = args.minLength
draftContigs = []
draftGenome = {}
contigLengths = {}
idCount = 0
contigID = ""
contigStr = ""
contigCount = 0
contigMax = 0
contigN75 = 0
contigN50 = 0
contigN25 = 0
draftLength = 0
avgContig = 0
for line in filehandle:
if(re.search(r'^>', line)):
if(idCount > 0):
draftGenome[contigID] = contigStr
contigID = ""
contigStr = ""
contigID = line.strip()
if(re.search(r'\(paired\)', contigID)):
contigID = contigID.replace('_(paired)', '')
if(re.search('R1_001_', contigID)):
contigID = contigID.replace('R1_001_', '')
draftContigs.append(contigID)
idCount = idCount + 1
#print(contigID)
elif(re.search(r'^(A|T|G|C|U|N)+', line)):
contigStr = contigStr + line.strip()
draftGenome[contigID] = contigStr
## Close input file
args.filename.close()
for contigKey in draftGenome:
if( len(draftGenome[contigKey]) > (intMinLen - 1) ):
contigLengths[contigKey] = len(draftGenome[contigKey])
##print(contigKey + " => " + str(contigLengths[contigKey]))
## new dictionary, sorted from longest contigs to shortest
#contigSortGenome = {}
#sortContigLengths = {}
#sortContigLengths = sorted(contigLengths, reverse=True)
#contigSortGenome = sorted(draftGenome, key=contigLengths.__getitem__, reverse=True)
count = 0
### Loop to find longest contig and contig count given length > intMinLen
for contigID in sorted(contigLengths, key=contigLengths.__getitem__, reverse=True):
if( contigLengths[contigID] > (intMinLen - 1) ):
if(count == 0):
contigMax = contigLengths[contigID]
top = 1
count = count + 1
draftLength = draftLength + contigLengths[contigID]
contigCount = count
avgContig = draftLength/contigCount
### to compute N50, find the contig that 'resides' at 1/2 of draftLength
#contigSortGenome = sorted(draftGenome, key=contigLengths.__getitem__)
cumulativeLength = 0;
foundN75 = 0
foundN50 = 0
for contigID in sorted(contigLengths, key=contigLengths.__getitem__):
if( contigLengths[contigID] > (intMinLen - 1) ):
cumulativeLength = cumulativeLength + contigLengths[contigID]
if(( (cumulativeLength > (draftLength)/4)) and (foundN75 == 0)):
contigN75 = contigLengths[contigID]
foundN75 = 1
if( (cumulativeLength > (draftLength/2)) and (foundN50 == 0) ):
contigN50 = contigLengths[contigID]
foundN50 = 1
if( cumulativeLength > 3*(draftLength/4) ):
contigN25 = contigLengths[contigID]
break
if ( args.format == 'verbose' or args.format == 'v' ):
print("Assembly File\tMinimum Contig Length:\tcontigCount\tavgContig\tN75\tN50\tN25\tmaxContig\tdraftLength")
print("{}\t".format(inFileName), ">", intMinLen - 1 ,"bp:\t", contigCount, "\t", "%.0f" % avgContig, "\t", contigN75, "\t", contigN50, "\t", contigN25, "\t", contigMax, "\t", draftLength)
elif( args.format == 'brief' or args.format == 'b' ):
print("Assembly\tcontigCount\tN75\tN50\tN25\tmaxContig\tdraftLength")
print(inFileName + "\t" + str(contigCount) + "\t" + str(contigN75) + "\t" + str(contigN50) + "\t" + str(contigN25) + "\t" + str(contigMax) + "\t" + str(draftLength))
elif ( args.format == 'tsv' or args.format == 't'):
print(str(contigCount) + "\t" + str(contigN75) + "\t" + str(contigN50) + "\t" + str(contigN25) + "\t" + str(contigMax))
elif ( args.format == 'clcstyle' or args.format == 'l'):
print("Assembly_File\tN75\tN50\tN25\tmaxContig\tavgContig\tcontigCount\tdraftLength")
print("{}\t".format(inFileName), "\t", contigN75, "\t", contigN50, "\t", contigN25, "\t", contigMax, "\t", "%.0f" % avgContig, "\t", contigCount ,"\t", draftLength)
elif ( args.format == 'csv' or args.format == 'c' ):
print(inFileName + "," + str(contigCount) + "," + str(contigN75) + "," + str(contigN50) + "," + str(contigMax))