-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrlcsa_grep.cpp
155 lines (138 loc) · 3.46 KB
/
rlcsa_grep.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include <algorithm>
#include <iostream>
#include <string>

#include "rlcsa.h"
#include "misc/utils.h"
using namespace CSA;
// Output mode of the tool, selected by the optional first command-line flag.
enum mode_type
{
  COUNT,    // -c: print the number of matching sequences
  TOTAL,    // -t: print the total number of occurrences
  START,    // -s: print the start positions of matches
  RELATIVE, // -r: print (sequence, position) pairs for the matches
  DISPLAY,  // no flag: print every matching sequence in full
  CONTEXT   // -NUM: print each match with NUM characters of context
};
// Writes the command-line help text to standard output, one line at a time.
// Each line is terminated with std::endl, so the stream is flushed after
// every line.
void printUsage()
{
  static const char* const usage_lines[] =
  {
    "Usage: rlcsa_grep [-c|-t|-s|-r|-NUM] pattern base_name",
    " -c print the number of matching sequences",
    " -t print the total number of occurrences",
    " -s print the start positions of matches",
    " -r print the relative start positions of matches (sequence, position)",
    " -NUM display NUM characters of leading and trailing context instead of",
    " the entire line"
  };
  const unsigned line_count = sizeof(usage_lines) / sizeof(usage_lines[0]);

  for(unsigned line = 0; line < line_count; line++)
  {
    std::cout << usage_lines[line] << std::endl;
  }
}
int main(int argc, char** argv)
{
int base_arg = 2, pattern_arg = 1;
mode_type mode = DISPLAY;
usint context = 0;
if(argc < base_arg + 1)
{
printUsage();
return 1;
}
if(argv[1][0] == '-')
{
base_arg++; pattern_arg++;
if(std::string("-c").compare(argv[1]) == 0)
{
mode = COUNT;
}
else if(std::string("-t").compare(argv[1]) == 0)
{
mode = TOTAL;
}
else if(std::string("-s").compare(argv[1]) == 0)
{
mode = START;
}
else if(std::string("-r").compare(argv[1]) == 0)
{
mode = RELATIVE;
}
else
{
mode = CONTEXT;
context = atoi(&(argv[1][1]));
}
if(argc < base_arg + 1)
{
printUsage();
return 2;
}
}
RLCSA rlcsa(argv[base_arg], false);
if(!rlcsa.isOk())
{
return 3;
}
usint len = std::string(argv[pattern_arg]).length();
pair_type result_range = rlcsa.count(argv[pattern_arg]);
usint occurrences = length(result_range);
if(mode == TOTAL)
{
std::cout << occurrences << std::endl;
return 0;
}
if(occurrences == 0)
{
if(mode == COUNT)
{
std::cout << 0 << std::endl;
}
return 0;
}
usint last_row = 0;
usint* results = rlcsa.locate(result_range);
if(mode == COUNT || mode == DISPLAY)
{
// Make results hold text numbers instead of positions.
rlcsa.getSequenceForPosition(results, occurrences);
}
std::sort(results, results + occurrences);
if(mode == COUNT || mode == DISPLAY)
{
// Find each text only once, even if it matches multiple times.
for(usint i = 1; i < occurrences; i++)
{
if(results[i] != results[last_row])
{
last_row++; results[last_row] = results[i];
}
}
}
if(mode == COUNT)
{
std::cout << (last_row + 1) << std::endl;
}
else if(mode == DISPLAY)
{
for(usint i = 0; i <= last_row; i++)
{
uchar* row = rlcsa.display(results[i]);
std::cout.write((char*)row, length(rlcsa.getSequenceRange(results[i])));
std::cout << std::endl;
delete[] row;
}
}
else if(mode == START)
{
for(usint i = 0; i < occurrences; i++)
{
std::cout << results[i] << std::endl;
}
}
else if(mode == RELATIVE)
{
for(usint i = 0; i < occurrences; i++)
{
pair_type relative = rlcsa.getRelativePosition(results[i]);
std::cout << relative.first << ", " << relative.second << std::endl;
}
}
else if(mode == CONTEXT)
{
usint result_length = 0;
for(usint i = 0; i < occurrences; i++)
{
uchar* text = rlcsa.display(results[i], len, context, result_length);
std::cout.write((char*)text, result_length);
std::cout << std::endl;
delete[] text;
}
}
delete[] results;
return 0;
}