generated from Quang7hong81/cafef.vn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_full_detail.py
51 lines (40 loc) · 1.32 KB
/
get_full_detail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import numpy as np
# internet packages: raw HTTP pooling + HTML parsing
import urllib3
from bs4 import BeautifulSoup
http = urllib3.PoolManager()  # shared connection pool (NOTE(review): unused in this chunk — verify before removing)
# functional packages: article extraction + progress reporting
import logging
import requests
from newspaper import Article
from newspaper import fulltext
from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook
# Failures during crawling are appended to this log file.
logging.basicConfig(filename="crawlfulltext.log",level=logging.ERROR)
# Input listing of articles; usecols skips the CSV's leading index column.
# presumably columns include 'newsdate' and 'url' — required by get_details_from_url below.
news_url = pd.read_csv("news_url.csv", usecols = range(1,5))
def get_details_from_url(df_to_get):
    """Download, parse and summarize every article listed in *df_to_get*.

    Parameters:
        df_to_get: DataFrame with 'newsdate' and 'url' columns.

    Returns:
        A list of dicts, one per successfully crawled article, with keys
        'newsdate', 'fulltext', 'summnary' (NOTE: historical typo kept
        byte-for-byte so existing consumers of the CSV column name keep
        working) and 'keyword'. URLs that fail to download or parse are
        logged to crawlfulltext.log and skipped.
    """
    data_dicts = []
    # total= gives tqdm a real progress bar; a bare zip() has no length.
    for newsdate, url in tqdm(zip(df_to_get['newsdate'].values,
                                  df_to_get['url'].values),
                              total=len(df_to_get)):
        try:
            article = Article(url, language='vi')
            article.download()
            article.parse()
            article.nlp()  # required before .summary / .keywords are populated
        except Exception:
            # Record the failure instead of silently dropping the URL
            # (logging was configured for exactly this but never called).
            logging.exception("failed to crawl %s", url)
            continue
        data_dicts.append({
            'newsdate': newsdate,
            'fulltext': article.text,
            'summnary': article.summary,  # sic — see docstring
            'keyword': article.keywords,
        })
    return data_dicts
# Sketch of a threaded variant (unused). Note: pool.map takes the function
# and an iterable of chunks — the original sketch called the function instead.
# from multiprocessing.dummy import Pool as ThreadPool
# pool = ThreadPool(32)
# results = pool.map(get_details_from_url, np.array_split(news_url, 32))
# pool.close()
# pool.join()
# Driver: crawl every listed URL and persist the scraped details
# (date, full text, summary, keywords) to newfull.csv.
newfull = pd.DataFrame.from_records(get_details_from_url(news_url))
newfull.to_csv('newfull.csv')