-
Notifications
You must be signed in to change notification settings - Fork 2
/
nq-extract.py
154 lines (124 loc) · 3.43 KB
/
nq-extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import simplejson
import pprint
import re
from scraper.scraperlib import TreeScraper,TextEditor,TextParser
# Normalisation rules applied in order by stringstring() to every scraped
# string.  Each rule is a dict: a compiled pattern ('re') and a replacement
# ('repl') -- a string or a callable, as accepted by re.sub().
# NOTE(review): the original mixed [pattern, repl] list entries with dicts,
# but stringstring() only understood the dict form; the list entries were
# silently skipped by its blanket exception handler.  All rules are now in
# dict form so every one of them actually runs.
unifier = [
    # strip thousands separators from pure numbers ("1,234" -> "1234");
    # the callable receives the match object, whose .string is the input
    {'re': re.compile(r"^[0-9,]+$"),
     'repl': lambda m: m.string.replace(",", "")},
    # collapse whitespace runs, then trim both ends
    {'re': re.compile(r"\s\s+"), 'repl': " "},
    {'re': re.compile(r"^\s+"), 'repl': ""},
    {'re': re.compile(r"\s+$"), 'repl': ""},
    # "Date of reporting period: DD/MM/YYYY" -> "YYYY-MM-DD"
    # (assumes day/month order in the source filings -- TODO confirm;
    # US-style MM/DD input would come out as YYYY-DD-MM)
    {'re': re.compile(r"Date of reporting period: (\d\d)/(\d\d)/(\d\d\d\d)"),
     'repl': r"\3-\2-\1"},
    # drop "(concluded)" / "(continued)" table-header suffixes
    {'re': re.compile(r"\s+\(concluded\)"), 'repl': ""},
    {'re': re.compile(r"\s+\(continued\)"), 'repl': ""},
    # cp1252 em/en dash: drop the dash and everything after it
    {'re': re.compile("\\s*\x97.*", re.DOTALL), 'repl': ""},
    {'re': re.compile("\\s*\x96.*", re.DOTALL), 'repl': ""},
    # company-name noise (dot escaped -- the original ", Inc.:?" pattern
    # let '.' match any character)
    {'re': re.compile(r", Inc\.:?"), 'repl': ""},
    # leading non-breaking space / dollar sign
    {'re': re.compile("^\xa0"), 'repl': ""},
    {'re': re.compile(r"^\$"), 'repl': ""},
]
# Input N-Q filing HTML documents to scrape, plus the output CSV path.
files={ "in" : [
        "data/d/d538216dnq.htm",
        "data/d/i00449_if-nq.htm",
        "data/d/i00222_if-nq.htm",
        "data/d/i00499_if-nq.htm",
        "data/d/i00242_ifi-nq.htm",
        "data/d/i00001_indexfunds-nqa.htm",
    ],
    "csv" : "extract.csv" }
# Declarative scrape recipe:
#   container  -- XPath selecting each table to walk
#   properties -- document-level XPaths (a list means fallbacks; the first
#                 expression that matches anything wins -- see extract())
#   row/columns-- per-row extraction inside a container
#   ok         -- '|'-joined column sets that make a row worth emitting
#   table      -- output CSV column order
parse={
    "container" : "//table",
    "properties" : {
        "date" : "//p/font[contains(text(),'Date of reporting period')]/text()",
        "fund" : [ "//table[1]/tr/td//p/font[contains(text(),'Name of Fund')]/../../../../tr[1]/td[2]/p/font/text()",
            "//p/font[contains(text(),'Name of Fund:')]/text()" ]
    },
    "row" : ".//tr",
    "columns" : {
        "category" : "./td[1]//font[@size='2']/b/text()",
        "name" : "./td[1]//font[@size='2']/text()",
        "shares" : "./td[4]//font[@size='2']/text()",
        # the value column position varies between filings -- try both
        "value" : ["./td[7]//font[@size='2']/text()", "./td[8]//font[@size='2']/text()"] ,
    },
    "ok" : [
        "name|shares|value",
    ],
    "table" : [ "fund","date","category","name","shares","value"]
}
def extract(e, p):
    """Evaluate one or more XPath expressions against element/tree `e`.

    `p` may be a single expression or a list of fallback expressions; the
    first one that matches anything wins.  Returns the non-empty result
    list from e.xpath(), or None when no expression matched.
    """
    if isinstance(p, str):
        p = [p]
    for expr in p:
        try:
            found = e.xpath(expr)
        except Exception:
            # Bad/unsupported expression for this document: report it and
            # try the next fallback.  (The original bound the exception to
            # `e`, clobbering the element argument so every later fallback
            # called .xpath on the exception object -- fixed here.)
            print("pp %s - unmatched" % expr)
            continue
        if found:
            return found
    return None
def stringstring(a):
    """Flatten an lxml node, list of strings, or plain string into one
    cleaned string, applying every `unifier` rule in order.

    NOTE(review): `etree` is never imported in this file; catching
    NameError below lets non-element inputs still work when it is absent
    -- confirm where etree is meant to come from.
    """
    try:
        s = etree.tostring(a)
    except (TypeError, NameError):
        # not serialisable as an element: join lists, pass strings through
        if isinstance(a, list):
            s = "".join(a)
        else:
            s = a
    for rule in unifier:
        try:
            if isinstance(rule, dict):
                s = rule['re'].sub(rule["repl"], s)
            else:
                # legacy [pattern, replacement] form -- previously these
                # raised inside the old blanket handler and were skipped
                s = re.sub(rule[0], rule[1], s)
        except Exception:
            # best-effort cleaning: a rule that cannot apply (e.g. to a
            # bytes result) is skipped rather than aborting the extraction
            pass
    return s
def output_ok(h, fields):
    """Return True when the truthy keys of `h` are exactly the
    '|'-separated field names listed in `fields`."""
    wanted = set(fields.split("|"))
    present = set(k for k in h if h[k])
    return wanted == present
def output_row(h, row):
    """Project dict `h` onto the ordered column list `row`; columns
    missing from `h` become empty strings."""
    return [h.get(col, "") for col in row]
# --- main extraction loop -------------------------------------------------
# NOTE(review): `UnicodeWriter` and `etree` are used below but never
# imported in this file; presumably `from lxml import etree` and a csv
# UnicodeWriter helper are expected to be in scope -- confirm before running.
ofile=UnicodeWriter(open(files["csv"],"w"))
for fn in files["in"] :
    # parse each filing with the forgiving HTML parser (real-world SEC HTML)
    tree=etree.parse(open(fn),etree.HTMLParser())
    data=[]
    props={}
    # document-level properties (reporting date, fund name), shared by rows
    for (k,v) in parse["properties"].items() :
        props[k]=stringstring(extract(tree,v))
    pprint.pprint(props)
    for container in tree.xpath(parse["container"]) :
        cdata=[]
        for row in container.xpath(parse["row"]) :
            rd={}
            # print "row: %s" % (etree.tostring(row)[:10])
            for (k,x) in parse["columns"].items() :
                # each column may have one XPath or a list of fallbacks;
                # results of all fallbacks are concatenated
                if type(x) != type([]) :
                    x=[x]
                a=[]
                for xx in x :
                    a.extend(row.xpath(xx))
                if a:
                    # print "matched"
                    rd[k]="".join([stringstring(ss) for ss in a])
                    # NOTE(review): rd[k] is a string here, so this branch
                    # only fires on single-character values and rd[k][0] is
                    # the same value -- looks like a leftover from when
                    # rd[k] was a list; confirm before removing
                    if len(rd[k]) == 1 :
                        rd[k]=rd[k][0]
            of=False
            # accumulate into props so values set only on header rows
            # (e.g. "category") carry forward to the holding rows below
            props.update(rd)
            for o in parse["ok"] :
                # emit a row only when exactly the required columns matched
                if output_ok(rd,o) :
                    cdata.append(output_row(props,parse["table"]))
                    of=True
                    break
            # if not of:
            #     rd.update({"file" : "o"})
            #     cdata.append(output_row(rd,parse["table"]))
        data.append(cdata)
    # pprint.pprint(map(lambda a: { "a" : a.get("reaction","-"), "b" : a.get("color",""), "c" : a.get("amendment","") },data));
    print "%s containers\n" % len(data)
    # write only containers that produced at least one accepted row
    for container in data :
        if len(container)>0 :
            pprint.pprint(container)
            ofile.writerows(container)