-
-
Notifications
You must be signed in to change notification settings - Fork 73
/
reader.go
278 lines (256 loc) · 9.21 KB
/
reader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
package fixedlength
import (
"bufio"
"errors"
"fmt"
"io"
"github.com/antchfx/xpath"
"github.com/jf-tech/go-corelib/ios"
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat/flatfile"
"github.com/jf-tech/omniparser/idr"
)
// line is a single buffered input line together with its position in the input.
type line struct {
	// lineNum is the 1-based line number of this line in the input.
	lineNum int
	// b is the line content: either a private copy, or a slice pointing directly into
	// the bufio.Reader's internal buffer.
	b []byte
	// copied tells which of the two cases above applies to b; see notes in
	// reader.readLine().
	copied bool
}
// reader reads fixed-length input line by line, keeping the lines that have been read
// but not yet processed in linesBuf.
type reader struct {
	// inputName is the name of the input; it is included in formatted error messages.
	inputName string
	// r is the buffered source that lines are read from.
	r *bufio.Reader
	// hr is the hierarchy reader created in NewReader; Read and Release delegate to it.
	hr *flatfile.HierarchyReader
	// linesRead is the total number of lines read in so far.
	linesRead int
	// linesBuf contains all the unprocessed lines.
	linesBuf []line
}
// NewReader creates a FormatReader for fixed-length file format. The input r is wrapped
// in a bufio.Reader, and the envelopes declared in decl are handed to a
// flatfile.HierarchyReader, which uses the returned reader as its record reader.
func NewReader(
	inputName string, r io.Reader, decl *FileDecl, targetXPathExpr *xpath.Expr) *reader {
	rd := &reader{inputName: inputName, r: bufio.NewReader(r)}
	recDecls := toFlatFileRecDecls(decl.Envelopes)
	rd.hr = flatfile.NewHierarchyReader(recDecls, rd, targetXPathExpr)
	return rd
}
// Read implements fileformat.FormatReader interface, reading in data from input and returns
// target IDR node.
//
// A flatfile.ErrFewerThanMinOccurs or unexpected-data error coming from the hierarchy
// reader is converted into an ErrInvalidFixedLength carrying the input name and the line
// number of the first unprocessed line; any other error is returned unchanged.
func (r *reader) Read() (*idr.Node, error) {
	node, err := r.hr.Read()
	if err == nil {
		return node, nil
	}
	if flatfile.IsErrFewerThanMinOccurs(err) {
		minOccursErr := err.(flatfile.ErrFewerThanMinOccurs)
		envelopeDecl := minOccursErr.RecDecl.(*EnvelopeDecl)
		msg := r.fmtErrStr(r.unprocessedLineNum(),
			"envelope/envelope_group '%s' needs min occur %d, but only got %d",
			envelopeDecl.fqdn, envelopeDecl.MinOccurs(), minOccursErr.ActualOcccurs)
		return nil, ErrInvalidFixedLength(msg)
	}
	if flatfile.IsErrUnexpectedData(err) {
		msg := r.fmtErrStr(r.unprocessedLineNum(), "unexpected data")
		return nil, ErrInvalidFixedLength(msg)
	}
	return nil, err
}
// MoreUnprocessedData implements flatfile.RecReader, telling whether there is still unprocessed
// data or not. When the line buffer is empty it tries to read one more line in; hitting
// io.EOF during that read is not an error, it simply means there is no more data.
func (r *reader) MoreUnprocessedData() (bool, error) {
	if len(r.linesBuf) == 0 {
		err := r.readLine()
		if err != nil && err != io.EOF {
			return false, err
		}
	}
	return len(r.linesBuf) > 0, nil
}
// ReadAndMatch implements flatfile.RecReader, reading unprocessed data (from buffer or from IO),
// trying to match against the given non-group typed record decl, and converting the data into IDR
// node if asked to. The decl must be an *EnvelopeDecl; the work is dispatched to the rows
// based or the header/footer based matcher depending on the envelope kind.
func (r *reader) ReadAndMatch(
	decl flatfile.RecDecl, createIDR bool) (matched bool, node *idr.Node, err error) {
	envelopeDecl := decl.(*EnvelopeDecl)
	if !envelopeDecl.rowsBased() {
		return r.readAndMatchHeaderFooterBasedEnvelope(envelopeDecl, createIDR)
	}
	return r.readAndMatchRowsBasedEnvelope(envelopeDecl, createIDR)
}
// readAndMatchRowsBasedEnvelope matches an envelope that is made of a fixed number of
// rows. It buffers lines until decl.rows() lines are available, then reports a match.
// If createNode is true the matched lines are converted into an IDR node and removed
// from the line buffer; otherwise the lines stay in the buffer.
func (r *reader) readAndMatchRowsBasedEnvelope(
	decl *EnvelopeDecl, createNode bool) (bool, *idr.Node, error) {
	for len(r.linesBuf) < decl.rows() {
		err := r.readLine()
		if err == nil {
			continue
		}
		if err == io.EOF && len(r.linesBuf) > 0 {
			// End of input but there are still buffered lines: report "no match" with
			// "no error", hoping the non-empty line buf will be matched by subsequent
			// calls with different decls.
			return false, nil, nil
		}
		// Either a critical (non io.EOF) error, or io.EOF with an empty line buf, i.e.
		// everything has been processed. Both cases: "not matched" plus the err.
		return false, nil, err
	}
	if !createNode {
		return true, nil, nil
	}
	node := r.linesToNode(decl, decl.rows())
	// The rows have been turned into an IDR node, so they are done and can be dropped
	// from the unprocessed line buffer.
	r.popFrontLinesBuf(decl.rows())
	return true, node, nil
}
// readAndMatchHeaderFooterBasedEnvelope matches an envelope delimited by a header line
// and a footer line. The first unprocessed line must match the header; lines are then
// scanned (and read in as needed) until one matches the footer. If createNode is true the
// lines from header through footer are converted into an IDR node and removed from the
// line buffer; otherwise they stay in the buffer.
func (r *reader) readAndMatchHeaderFooterBasedEnvelope(
	decl *EnvelopeDecl, createNode bool) (bool, *idr.Node, error) {
	if len(r.linesBuf) == 0 {
		if err := r.readLine(); err != nil {
			// io.EOF or not, since r.linesBuf is empty, we can directly return err.
			return false, nil, err
		}
	}
	if !decl.matchHeader(r.linesBuf[0].b) {
		return false, nil, nil
	}
	// Footer matching starts on the very same line as the header.
	for i := 0; ; i++ {
		if decl.matchFooter(r.linesBuf[i].b) {
			if !createNode {
				return true, nil, nil
			}
			lineCount := i + 1
			node := r.linesToNode(decl, lineCount)
			r.popFrontLinesBuf(lineCount)
			return true, node, nil
		}
		if i < len(r.linesBuf)-1 {
			// There are still buffered lines to try the footer match against.
			continue
		}
		// The end of r.linesBuf is reached without a footer match: read one more line in.
		switch err := r.readLine(); err {
		case nil:
		case io.EOF:
			// io.EOF encountered and r.linesBuf isn't empty: "no match" with nil error
			// (io.EOF is only returned when r.linesBuf is empty).
			return false, nil, nil
		default:
			// io reading error, directly return err.
			return false, nil, err
		}
	}
}
// readLine reads the next non-empty line from the input and appends it to r.linesBuf.
// Empty lines are counted in r.linesRead but skipped. It returns io.EOF when the
// underlying read reports io.EOF, an ErrInvalidFixedLength for any other read error, and
// nil once a non-empty line has been buffered.
func (r *reader) readLine() error {
	// https://github.com/jf-tech/omniparser/issues/213
	//
	// If we're dealing with multi-lined envelope (either by rows or by header/footer), this
	// readLine() will be called several times, thus whatever ios.ByteReadLine, which uses
	// bufio.Reader underneath, returned in a previous call may potentially be invalidated due
	// to bufio.Reader's internal buf rollover. If we read the previous line directly, it would
	// cause corruption.
	//
	// To fix the problem the easiest solution would be simply copying the returned []byte from
	// ios.ByteReadLine every single time. But for files with single-line envelope, which are the
	// vast majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to
	// have a flag on reader.linesBuf's last element to tell if it contains a reference into the
	// bufio.Reader's internal buffer, or it's a copy. Every time before we call bufio.Reader read,
	// we check reader.linesBuf's last element flag, if it is not a copy, then we will turn it into
	// a copy.
	//
	// This way, we optimize for the vast majority cases without
	// needing allocations, and avoid any potential corruptions in the multi-lined envelope cases.
	linesBufLen := len(r.linesBuf)
	if linesBufLen > 0 && !r.linesBuf[linesBufLen-1].copied {
		cp := make([]byte, len(r.linesBuf[linesBufLen-1].b))
		copy(cp, r.linesBuf[linesBufLen-1].b)
		r.linesBuf[linesBufLen-1].b = cp
		r.linesBuf[linesBufLen-1].copied = true
	}
	for {
		// note1: ios.ByteReadLine returns a line with trailing '\n' (and/or '\r') dropped.
		// note2: ios.ByteReadLine won't return io.EOF if line returned isn't empty.
		// note3: ios.ByteReadLine's returned []byte is merely pointing into the bufio.Reader's
		// internal buffer, thus the content will be invalidated if ios.ByteReadLine is called
		// again. Caution!
		b, err := ios.ByteReadLine(r.r)
		switch {
		case err == io.EOF:
			return io.EOF
		case err != nil:
			// The error text must be passed as an argument, not as the format string:
			// fmtErrStr runs it through fmt.Sprintf, so any '%' in the message would
			// otherwise be interpreted as a formatting verb and garble the output.
			return ErrInvalidFixedLength(r.fmtErrStr(r.linesRead+1, "%s", err.Error()))
		}
		r.linesRead++
		if len(b) > 0 {
			// copied is left false: b still references the bufio.Reader's buffer.
			r.linesBuf = append(r.linesBuf, line{lineNum: r.linesRead, b: b})
			return nil
		}
	}
}
// linesToNode converts the first n lines of r.linesBuf into an IDR element node named
// after decl. For each column of decl, the first of those n lines that the column
// matches (per lineMatch) contributes a child element named after the column, holding a
// text node with the column value; a column matching none of the lines is omitted.
// It panics if fewer than n lines are buffered.
func (r *reader) linesToNode(decl *EnvelopeDecl, n int) *idr.Node {
	if buffered := len(r.linesBuf); buffered < n {
		panic(
			fmt.Sprintf("linesBuf has %d lines but requested %d lines to convert",
				buffered, n))
	}
	envelopeNode := idr.CreateNode(idr.ElementNode, decl.Name)
	lines := r.linesBuf[:n]
	for _, colDecl := range decl.Columns {
		for i := range lines {
			content := lines[i].b
			if colDecl.lineMatch(i, content) {
				colNode := idr.CreateNode(idr.ElementNode, colDecl.Name)
				idr.AddChild(envelopeNode, colNode)
				valueNode := idr.CreateNode(idr.TextNode, colDecl.lineToColumnValue(content))
				idr.AddChild(colNode, valueNode)
				// Only the first matching line is used for a column.
				break
			}
		}
	}
	return envelopeNode
}
// popFrontLinesBuf removes the first n lines from r.linesBuf, shifting the remaining
// lines to the front so the backing array is reused. It panics if n exceeds the number
// of buffered lines.
func (r *reader) popFrontLinesBuf(n int) {
	buffered := len(r.linesBuf)
	if n > buffered {
		panic(fmt.Sprintf(
			"less lines (%d) in r.linesBuf than requested pop front count (%d)",
			buffered, n))
	}
	// copy handles the overlapping source/destination and returns the number of
	// elements moved, which is exactly the new length.
	remaining := copy(r.linesBuf, r.linesBuf[n:])
	r.linesBuf = r.linesBuf[:remaining]
}
// unprocessedLineNum returns the line number of the first unprocessed line: the line
// number of the first buffered line, or, when nothing is buffered, the number the next
// line to be read would get.
func (r *reader) unprocessedLineNum() int {
	if len(r.linesBuf) == 0 {
		return r.linesRead + 1
	}
	return r.linesBuf[0].lineNum
}
// Release implements fileformat.FormatReader interface, releasing a finished IDR target node.
// The call is forwarded to the underlying hierarchy reader.
func (r *reader) Release(n *idr.Node) {
	r.hr.Release(n)
}
// IsContinuableError implements fileformat.FormatReader interface, checking if an error is
// fatal or not. io.EOF and any error recognized by IsErrInvalidFixedLength are not
// continuable; everything else is.
func (r *reader) IsContinuableError(err error) bool {
	if err == io.EOF {
		return false
	}
	return !IsErrInvalidFixedLength(err)
}
// FmtErr implements errs.CtxAwareErr embedded in fileformat.FormatReader, formatting an error
// with line info. The line reported is the first unprocessed line.
func (r *reader) FmtErr(format string, args ...interface{}) error {
	msg := r.fmtErrStr(r.unprocessedLineNum(), format, args...)
	return errors.New(msg)
}
// fmtErrStr formats format/args with fmt.Sprintf and prefixes the result with the input
// name and the given line number.
func (r *reader) fmtErrStr(line int, format string, args ...interface{}) string {
	detail := fmt.Sprintf(format, args...)
	return fmt.Sprintf("input '%s' line %d: %s", r.inputName, line, detail)
}
// ErrInvalidFixedLength indicates the fixed-length content is corrupted or IO failure.
// This is a fatal, non-continuable error.
type ErrInvalidFixedLength string

// Error implements error interface.
func (e ErrInvalidFixedLength) Error() string { return string(e) }

// IsErrInvalidFixedLength checks if the `err` is of ErrInvalidFixedLength type, either
// directly or anywhere in its chain of wrapped errors. It returns false for a nil err.
func IsErrInvalidFixedLength(err error) bool {
	// errors.As walks the Unwrap chain, so an ErrInvalidFixedLength that a caller has
	// wrapped with extra context is still recognized as fatal.
	var target ErrInvalidFixedLength
	return errors.As(err, &target)
}