-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsally.cfg
34 lines (31 loc) · 906 Bytes
/
sally.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Configuration of input
input = {
    # Format of input. Supported types: "dir", "arc", "lines"
    input_format = "dir";
    # Number of strings to process in one chunk
    chunk_size = 256;
    # File listing stop tokens (presumably tokens to be ignored during
    # feature extraction -- confirm against the tool's manual).
    # The path is given relative; how it is resolved depends on the tool.
    stoptoken_file = "stoptokens.txt";
};
# Configuration of feature extraction
features = {
    # Length of n-grams.
    ngram_len = 3;
    # Granularity of n-grams.
    granularity = "tokens";
    # Delimiters for tokens. The %XX sequences appear to be percent-encoded
    # characters (%0a = line feed, %0d = carriage return, %20 = space,
    # %22 = double quote), followed by literal punctuation -- confirm the
    # decoding rules against the tool's manual.
    token_delim = "%0a%0d%20%22.,:;!?";
    # Embedding mode for vectors. Supported types: "cnt", "bin", "tfidf"
    vect_embed = "cnt";
    # Normalization mode for vectors. Supported types: "l1", "l2", "none".
    vect_norm = "none";
    # Number of hash bits to use.
    hash_bits = 22;
    # Lower threshold (presumably a minimum value a feature must reach to be
    # kept -- confirm exact semantics against the tool's manual).
    thres_low = 3;
    # TFIDF normalization file. Disabled here; presumably only needed when
    # vect_embed is set to "tfidf".
    # tfidf_file = "tfidf.fv";
};
# Configuration of output
output = {
    # Output format; one of "libsvm", "text", "matlab".
    output_format = "libsvm";
};