forked from sjspence/epicPCR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filterDSRb.py
executable file
·78 lines (72 loc) · 2.69 KB
/
filterDSRb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# filterDSRb.py
# 10/20/2014 Sarah J. Spencer, Alm Lab, MIT
# Filter Illumina sequence reads for matches to a section of the dsrB
# gene and known bridge primer sequences. Export 16S sequences trimmed to a
# common length.
from optparse import OptionParser
import re
# Import data from fasta file and save in header and sequence variables
def importData(inputFileName):
rawHeaders = []
rawSeqs = []
inputFile = open(inputFileName,'r')
rawHeaders = []
rawSeqs = []
track = 'header'
for line in inputFile:
if track == 'header':
rawHeaders.append(line)
track = 'seq'
else:
rawSeqs.append(line)
track = 'header'
inputFile.close()
return rawHeaders, rawSeqs
# Parse the joined fasta reads for designed primer sequence structure
# Output: 16S sequences from fusions that match the designed structure
def fwdparse(rawHeaders, rawSeqs, outputLength):
usableHeaders = []
usableSeqs = []
primerfwd = '[CT][AG][CT][AG][ACG]AG[ACG]AT[GC]GCGAT[AG]TCGGA'
primerbridge = 'C[AG]CC[AG]CACAT[AG]TT[GC]AGGCACAGC[AC]GCCGCGGTA'
for i,rawSeq in enumerate(rawSeqs):
if ((re.search(primerfwd, rawSeq[0:22])) and
(re.search(primerbridge, rawSeq))):
pass
else:
continue
splitSeq = re.split(primerbridge,rawSeq)
if len(splitSeq[1]) < outputLength+1:
continue
if splitSeq[1][outputLength] == '\n':
continue
usableHeaders.append(rawHeaders[i])
usableSeqs.append(splitSeq[1][1:outputLength+1] + '\n')
return usableHeaders, usableSeqs
# This wrapper function takes user-defined input and output files as well as
# the common length to trim 16S sequences. These variables are used to parse
# and export 16S fasta sequences from fusions that match the dsrB fusion
# structure.
def main():
parser = OptionParser()
parser.add_option("-i", "--input", dest="i",
help="Input fasta file of non-chimeric sequences.",
metavar="INPUT")
parser.add_option("-l", "--length", dest="l",
help="Length to trim output sequences.",
metavar="LENGTH")
parser.add_option("-o", "--output", dest="o",
help="Output fasta file of structure-filtered sequences.",
metavar="OUTPUT")
(options, args) = parser.parse_args()
#Filter each sample for sequence structure and save filtered sequences
rawHeaders, rawSeqs = importData(options.i)
usableHeaders, usableSeqs = fwdparse(rawHeaders, rawSeqs, int(options.l))
outFile = open(options.o, 'w')
for i, header in enumerate(usableHeaders):
outFile.write(usableHeaders[i])
outFile.write(usableSeqs[i])
outFile.close()
if __name__ == "__main__":
main()