main.py
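# Overview: a gevent-based crawler built as a two-stage producer/consumer
# pipeline.
#   1. Pre_fetch() enumerates the paginated forum index and fills Page_list.
#   2. forum_page_fetch() workers pull index pages, resolve thread links via
#      cl_spider.scan_forum(), and push them onto Forum_list.
#   3. Image_fetcher() workers download each thread, extract image URLs with
#      web_parse.img_list(), and store them through db_handler.
# The behaviour of the project-local modules (cl_spider, web_parse,
# db_handler) is assumed from their call sites below; their source is not
# part of this file.  The code targets Python 2 (print statements, urlparse,
# urllib.urlencode).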
import gevent
from gevent import monkey
monkey.patch_all()          # patch the stdlib before importing requests so its sockets are gevent-friendly

import requests
import re
import time
import urllib
import urlparse             # Python 2 module (urllib.parse in Python 3)

import cl_spider            # project-local: forum index scanner
import web_parse            # project-local: HTML parsing helpers
import db_handler           # project-local: database access

from gevent.lock import Semaphore   # gevent.coros is the deprecated older name for this
from gevent.queue import Queue

Lock = Semaphore()          # serialises writes to the database
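# forum_page_fetch: consumer of index-page URLs (P_que) and producer of
# per-thread dicts (F_que).  cl_spider.scan_forum() is assumed to return a
# list of dicts with at least "link", "name" and "tag" keys, where "link" is
# a site-relative path that gets rebuilt into an absolute URL below.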
def forum_page_fetch(db, P_que, F_que, sleep_len, sess):
    time.sleep(sleep_len)                     # stagger worker start-up
    while not P_que.empty():
        # Back off while the downstream queue is already well stocked.
        if F_que.qsize() > 60:
            gevent.sleep()
            continue
        url = P_que.get()
        print "Forum Queue len = %s, Page Queue len = %s" % (F_que.qsize(), P_que.qsize())
        Base = urlparse.urlsplit(url)
        print "Start Catch %s" % url
        FL = cl_spider.scan_forum(db, url, sess)
        if not FL:
            # Scan failed: drain the page queue so every worker can exit.
            while not P_que.empty():
                P_que.get()
            print "P_que is empty!"
            return
        for i in FL:
            # Rebuild the relative thread path into an absolute URL.
            i['link'] = urlparse.urlunparse(urlparse.ParseResult(
                scheme = Base.scheme,
                netloc = Base.netloc,
                path = i["link"],
                params = '',
                query = '',
                fragment = ''
            ))
            F_que.put(i)
        gevent.sleep(1)
    return
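# Image_fetcher: consumer of thread dicts from F_que.  Downloads each thread
# page, extracts image URLs with web_parse.img_list(), and writes the result
# through db.data_in() while holding Lock so only one greenlet touches the
# database at a time.  web_parse.url_to_Mark() is assumed to derive a stable
# identifier from the thread's URL path.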
def Image_fetcher(db, P_que, F_que, sleep_len, sess):
    time.sleep(sleep_len)                     # stagger worker start-up
    while (not F_que.empty()) or (not P_que.empty()):
        try:
            dic = F_que.get(timeout = 2)
        except:
            continue                          # queue empty for now; poll again
        print "Catch %s, last %s" % (dic["link"], F_que.qsize())
        try:
            con = sess.get(dic["link"]).content
        except:
            continue                          # network error; skip this thread
        lst = web_parse.img_list(con)
        Lock.acquire()
        try:
            db.data_in(lst, dic["name"], web_parse.url_to_Mark(urlparse.urlsplit(dic["link"]).path[1:]), dic["tag"])
        except:
            # Log the failing payload before re-raising.
            print "Insert Data Failed @ %s" % dic["link"]
            print lst, dic["name"], web_parse.url_to_Mark(urlparse.urlsplit(dic["link"]).path[1:]), dic["tag"]
            raise
        finally:
            Lock.release()
    return
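# Pre_fetch: generator that yields one index-page URL per page of the forum.
# web_parse.max_page_index() is assumed to parse the pagination widget on the
# first page and return the highest page number.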
def Pre_fetch(HomePage_url, sess):
    # parse_qs() returns lists; keep only the first value of each parameter.
    query_dict = dict([(k, v[0]) for k, v in urlparse.parse_qs(urlparse.urlsplit(HomePage_url).query).items()])
    Base = urlparse.urlsplit(HomePage_url)
    for i in range(1, 1 + web_parse.max_page_index(sess.get(HomePage_url).content)):
        query_dict["page"] = i
        yield urlparse.urlunparse(urlparse.ParseResult(
            scheme = Base.scheme,
            netloc = Base.netloc,
            path = Base.path,
            params = "",
            query = urllib.urlencode(query_dict),
            fragment = ""
        ))
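# Entry point: fill the page queue up front, then run 5 index-page workers
# and 20 image workers concurrently, each with a staggered start delay.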
if __name__ == "__main__":
    Page_list = Queue()
    Forum_list = Queue()
    DB = db_handler.db_proc()
    Sess = requests.session()
    for i in Pre_fetch("https://www.t66y.com/thread0806.php?fid=8&search=&page=1", Sess):
        Page_list.put(i)
    print "Catch First Page Complete"
    gevent.joinall(
        [gevent.spawn(forum_page_fetch, DB, Page_list, Forum_list, 0.5 * i, Sess) for i in range(5)] +
        [gevent.spawn(Image_fetcher, DB, Page_list, Forum_list, 0.5 * i, Sess) for i in range(20)]
    )