forked from idhmc-tamu/eMOP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtess-script.sh
executable file
·150 lines (119 loc) · 4.64 KB
/
tess-script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env bash
### tess-script #############################################
#
# Written by Matt Christy
# for The Early-Modern OCR Project
#
# Copyright 2013 The Early-Modern OCR Project
#
# Interpreter note: the script body uses bash-only constructs (arrays,
# C-style for loops, substring expansion), so it is run with bash
# rather than a plain POSIX /bin/sh.
#
#########################################################

# Wipe the terminal, then print the start-up banner (one output line per argument).
clear
printf '%s\n' \
  " " \
  " " \
  "### tess-script ############################################" \
  "# " \
  "# An tesseract training utiility for " \
  "# The Early-Modern OCR Project (eMOP) " \
  "# " \
  "# Copyright 2013 - eMOP " \
  "# " \
  "#######################################################" \
  " " \
  " "
#### ------------------mjc: 05022013----------------------------------------------------
# The script takes one or more input file names (without an extension, so something like "emop.mfle.exp18") and runs all the necessary commands to build the training files.
#
# The command looks like this:
## ==============================================================================
## bash ./tess-script.sh <inputfile(s)>
## ==============================================================================
# Where:
# <inputfile(s)>: input file(s) with path --(relative from the folder where your XSLT is running)
# NOTE: the input file is the common prefix of the name of a set of tiff/box file pairs. i.e. leave off the .tif/.box
#### ------------------mjc: 050213----------------------------------------------------
#Capture the passed params in the global 'infile' array; 'len' is how many there are.
#"$@" is quoted so a name containing spaces or glob characters stays one entry.
infile=("$@")
len=${#infile[@]}
#Without at least one tif/box prefix there is nothing to train on; stop here
#instead of letting the later steps run with empty names (the concat step's
#'cat' would otherwise sit waiting on stdin).
if ((len == 0)); then
  echo "usage: ${0##*/} <inputfile(s)>   (common prefix of each tif/box pair, no extension)" >&2
  exit 2
fi
for ((i = 0; i < len; i++)); do
  echo " "
  echo "#####################################"
  echo "tesseract ${infile[$i]}.tif ${infile[$i]} nobatch box.train"
  echo "#####################################"
  #run tesseract on the passed in tifs to create training files for each
  #(a failure is reported but does not stop the remaining inputs)
  if ! tesseract "${infile[$i]}.tif" "${infile[$i]}" nobatch box.train; then
    echo "warning: tesseract failed for ${infile[$i]}" >&2
  fi
  #NOTE: 'infile' keeps the bare prefixes; the '.tr' / '.box' names are built
  #later from these prefixes rather than by renaming the entries here.
done
#create the training file name for the whole set of passed in files: take the
#first name, drop its trailing exp #, then append a '.tr' extension
#(e.g. emop.mfle.exp18 -> emop.mfle.exp.tr).
#Every trailing digit is removed, however many there are; a fixed two-character
#cut gives the wrong prefix for names such as exp5 or exp100.
outprefix=${infile[0]}
while [[ $outprefix == *[0-9] ]]; do
  outprefix=${outprefix%?}
done
if [[ $outprefix == "${infile[0]}" ]]; then
  #no exp # found: fall back to the old two-character cut so the combined file
  #does not end up with the same name as the first input's .tr file
  outprefix=${outprefix%??}
fi
outfile=${outprefix}.tr
#concat all created training files into one
#'trin' holds every input prefix with '.tr' appended
trin=()
for prefix in "${infile[@]}"; do
  trin+=("${prefix}.tr")
done
echo " "
echo "#####################################"
echo "concat all training files into one"
echo "cat ${trin[*]} > $outfile"
echo "#####################################"
#quoted so each training file (and the output name) is passed as exactly one word
cat "${trin[@]}" > "$outfile"
#extract unicharset from all related box files
#'boxin' holds every input prefix with '.box' appended
boxin=()
for prefix in "${infile[@]}"; do
  boxin+=("${prefix}.box")
done
echo " "
echo "#####################################"
echo "extract unicharset from all related box files"
echo "unicharset_extractor ${boxin[*]}"
echo "#####################################"
#quoted so each box file name reaches unicharset_extractor as exactly one argument
unicharset_extractor "${boxin[@]}"
#Run the three training tools, in order, on the combined training file.
#"$outfile" is quoted in every call so a name containing spaces or glob
#characters is still passed as a single argument.
echo " "
echo "#####################################"
echo "shapeclustering -F emop.font_properties -U unicharset $outfile"
echo "#####################################"
shapeclustering -F emop.font_properties -U unicharset "$outfile"

#-O emop.unicharset: this is the unicharset file name the wordlist2dawg calls
#later in the script are given.
echo " "
echo "#####################################"
echo "mftraining -F emop.font_properties -U unicharset -O emop.unicharset $outfile"
echo "#####################################"
mftraining -F emop.font_properties -U unicharset -O emop.unicharset "$outfile"

echo " "
echo "#####################################"
echo "cntraining $outfile"
echo "#####################################"
cntraining "$outfile"
#Give the four training output files the 'emop.' prefix: announce every rename
#inside one banner first, then perform the renames in the same order.
trained_parts=(inttemp normproto pffmtable shapetable)
printf '%s\n' \
  " " \
  "#####################################" \
  "change output filenames"
for part in "${trained_parts[@]}"; do
  printf '%s\n' "mv ${part} emop.${part}"
done
printf '%s\n' "#####################################"
for part in "${trained_parts[@]}"; do
  mv "${part}" "emop.${part}"
done
#Build the two DAWG files against emop.unicharset.
#Each command is announced inside its own banner and run after the banner
#closes, and each announcement names the command that is actually executed.
echo " "
echo "#####################################"
echo "Create DAWG files using unicharset from training files"
echo "wordlist2dawg frequent-words-list.txt emop.freq-dawg emop.unicharset"
echo "#####################################"
wordlist2dawg frequent-words-list.txt emop.freq-dawg emop.unicharset
echo " "
echo "#####################################"
echo "wordlist2dawg words-list.txt emop.word-dawg emop.unicharset"
echo "#####################################"
wordlist2dawg words-list.txt emop.word-dawg emop.unicharset
#Announce, then run, combine_tessdata on the 'emop.' file prefix.
#(The step after this one copies emop.traineddata, presumably produced here.)
printf '%s\n' \
  " " \
  "#####################################" \
  "combine_tessdata emop." \
  "#####################################"
combine_tessdata emop.
#Copy the finished emop.traineddata into a tessdata folder.
#The destination defaults to the original author's folder; set the
#EMOP_TESSDATA_DIR environment variable to install somewhere else without
#editing the script.
tessdata_dir=${EMOP_TESSDATA_DIR:-/Users/matthewchristy/tesseract-ocr/tessdata}
echo " "
echo "#####################################"
echo "cp emop.traineddata $tessdata_dir"
echo "#####################################"
#Refuse to copy when the destination is not a directory: cp would otherwise
#create a plain file with that name instead of installing into the folder.
if [[ ! -d $tessdata_dir ]]; then
  echo "error: tessdata directory not found: $tessdata_dir" >&2
  exit 1
fi
cp emop.traineddata "$tessdata_dir"
#example of using the installed data afterwards:
#tesseract emop.mfle.exp17.tif out.a4.m94-17.mfle17 -l emop <path>config.txt