# RSS Feed Harvest: Python script that parses the news feeds listed in news_feeds.md
# and saves new entries to a Markdown file (news_{current_datetime}.md) in RSS/news on the desktop.
import feedparser
import os
from datetime import datetime
from dateutil import parser
import re
import logging
# Function to read RSS feeds from a file
def read_rss_feeds_from_file(file_path):
    rss_feeds = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            rss_feeds.append(line.strip())
    return rss_feeds
# Function to get the group name from a line starting with "##"
def get_group_name(line):
    return line.strip(" #")
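# Illustrative example: get_group_name("## Technology") returns "Technology"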
# Function to parse the news feeds and group them
def parse_news_feeds(file_path):
    grouped_feeds = {}
    current_group = None
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line.startswith("##"):
                current_group = get_group_name(line)
                grouped_feeds[current_group] = []
            elif line.startswith("http"):
                if current_group:
                    grouped_feeds[current_group].append(line)
    return grouped_feeds
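# For reference, parse_news_feeds expects news_feeds.md to look roughly like this
# (the group names and URLs below are placeholders, not real feeds):
#
#   ## Technology
#   https://example.com/tech/feed.xml
#   https://example.org/tech/rss
#
#   ## World
#   https://example.net/world/atom.xml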
# Function to get the current date and time as a formatted string
def get_current_datetime():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Function to create a folder if it doesn't exist
def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
# Function to remove soft hyphens from text
def remove_soft_hyphens(text):
    return text.replace("\xad", "")
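# Note: "\xad" is the Unicode soft hyphen (U+00AD), which some feeds embed inside words.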
# Function to clean the title text
def clean_title(title):
    # Remove new lines
    title = title.replace("\n", "")
    # Replace "*in" with ":in"
    title = title.replace('*in', ':in')
    # Replace non-breaking space characters with a regular space
    title = title.replace("\xa0", " ")
    # Truncate to 300 characters and append "[...]" if longer
    title = title[:300] + "[...]" if len(title) > 300 else title
    return title
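# Illustrative example (hypothetical input): clean_title("Lehrer*innen\xa0streiken")
# returns "Lehrer:innen streiken".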
# Function to clean the description text
def clean_description(description):
    # Remove HTML tags and other unwanted HTML entities
    # (the literal "[...]" marker is escaped so arbitrary bracketed text is kept)
    description = re.sub(r'<[^>]*>|\[link\]|\[comments\]|\[\.\.\.\]|&#\d+;|&[^;]+;', '', description)
    # Replace "*in" with ":in"
    description = description.replace('*in', ':in')
    # Collapse consecutive whitespace into a single space
    description = re.sub(r'\s+', ' ', description)
    # Replace non-breaking space characters with a regular space
    description = description.replace("\xa0", " ")
    # Remove leading and trailing spaces
    description = description.strip()
    return description
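# Illustrative example (hypothetical input):
# clean_description("<p>Breaking   news</p> [link]") returns "Breaking news".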
# Print a welcome message in the terminal
print("\nWelcome to RSS Feed Harvest.\nParsing RSS feeds:")
# Function to read existing URLs from MD files in the output folder
def read_existing_urls(output_folder):
    existing_urls = set()
    for root, _, files in os.walk(output_folder):
        for file in files:
            if file.endswith(".md"):
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as md_file:
                    text = md_file.read()
                    urls = re.findall(r'URL:\s*([^\n]+)', text)
                    existing_urls.update(urls)
    return existing_urls
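# read_existing_urls matches the "URL: ..." lines this script writes below, so
# entries already saved in earlier runs are skipped.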
# Get the news feeds from the "news_feeds.md" file
news_feeds = parse_news_feeds("news_feeds.md")
# Get the current date and time for the output
current_datetime = get_current_datetime()
# Extract the current year from the current date
current_year = datetime.now().year
# Desktop path
desktop_path = os.path.expanduser("~/Desktop")
# Subfolder "RSS/news" within the desktop folder
subfolder = "RSS/news"
subfolder_path = os.path.join(desktop_path, subfolder)
# Ensure the subfolder exists, creating it if necessary
create_folder_if_not_exists(subfolder_path)
# Output file name within the subfolder
output_file = os.path.join(subfolder_path, f"news_{current_datetime}.md")
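# Note: the timestamp in the file name contains ":" characters, which is fine
# on macOS/Linux but not allowed in Windows file names.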
# Set to store unique URLs from new matches
unique_urls = set()
# Read existing URLs from MD files in the output folder
existing_urls = read_existing_urls(subfolder_path)
# Open the output file for writing
with open(output_file, "w", encoding="utf-8") as file:
    # Initialize a count for new entries
    new_entries_count = 0
    # Write a padded placeholder for the total so it can be rewritten in place later
    file.write("Total New Entries:        ")
    # Write the date at the beginning of the file
    file.write(f"\nCurrent Date and Time: {current_datetime}\n\n")
    # Iterate through the grouped news feeds
    for group, feeds in news_feeds.items():
        # Write the group name as a subheading
        file.write(f"## {group}\n\n")
        print(f"\n{group}")
        # Iterate through the RSS feeds in the current group
        for rss_feed in feeds:
            # Extract and print only the domain from the URL
            domain = re.search(r"https?://(?:www\.)?(.+?)/", rss_feed)
            if domain:
                print(f"... {domain.group(1)}")
            try:
                # Parse the feed
                feed = feedparser.parse(rss_feed)
                # Not every result carries an HTTP status (e.g. connection failures)
                status_code = feed.get("status", 200)
                if 400 <= status_code < 600:
                    # Print a warning message for 4xx and 5xx status codes
                    print(f"\n{rss_feed} returned HTTP status code {status_code}\n")
                else:
                    # Parsing succeeded (or was a 3xx redirection); process the entries
                    for entry in feed.entries:
                        entry_link = entry.get("link", "")
                        # Remove tracking parameters from URLs
                        entry_link = re.sub(r'\?utm_source=[^&]+&utm_medium=[^&]+&utm_campaign=[^&]+|\?wt_mc=rss.red.unbekannt.unbekannt.atom.beitrag.beitrag|#ref=rss', '', entry_link)
                        # Keep the URL only if it is new in this run and absent from existing files
                        if entry_link not in unique_urls and entry_link not in existing_urls:
                            unique_urls.add(entry_link)
                            # Try to parse the date using dateutil.parser
                            try:
                                entry_date = parser.parse(entry.published)
                            except (ValueError, AttributeError):
                                entry_date = None
                            # Use "N/A" if no date is available, otherwise format it as a string
                            entry_date_str = "N/A" if entry_date is None else entry_date.strftime("%Y-%m-%d %H:%M:%S")
                            # Skip entries dated before the current year
                            if entry_date is not None and entry_date.year < current_year:
                                continue
                            # Get the title and description of the feed entry
                            description = entry.get("description", "")
                            title = entry.get("title", "")
                            # Increment the count of new entries
                            new_entries_count += 1
                            # Clean the title and description text
                            title = clean_title(title)
                            description = clean_description(description)
                            # Truncate the description to 500 characters or less
                            description = description[:500] + "[...]" if len(description) > 500 else description
                            # Add three asterisks before each entry in the output file
                            file.write("***\n\n")
                            # Write entry details
                            file.write(f"{remove_soft_hyphens(title)}\n")
                            file.write(f"URL: {remove_soft_hyphens(entry_link)}\n")
                            file.write(f"Date: {entry_date_str}\n")
                            file.write(f"Description: {remove_soft_hyphens(description)}\n\n")
            except Exception as e:
                # Handle other exceptions (e.g., network issues)
                print(f"\nError parsing {rss_feed}: {e}\n")
    # Go back to the beginning of the file and overwrite the padded placeholder
    file.seek(0)
    file.write(f"Total New Entries: {new_entries_count}")
# Count the total number of feed URLs in news_feeds.md
with open("news_feeds.md", "r", encoding="utf-8") as file:
    rss_feeds = [line.strip() for line in file if line.strip().startswith("http")]
total_feeds = len(rss_feeds)
# Configure logging
logging.basicConfig(filename='rss_parser.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s: %(message)s')
# Function to log errors
def log_error(feed_url, error):
    logging.error(f"Error parsing {feed_url}: {error}")
# For reference: to log errors instead of only printing them, the try/except
# inside the main loop above can call log_error, along these lines:
#
#     try:
#         feed = feedparser.parse(rss_feed)
#         status_code = feed.get("status", 200)
#         if 400 <= status_code < 600:
#             log_error(rss_feed, f"HTTP status code {status_code}")
#         else:
#             for entry in feed.entries:
#                 ...  # parse and process the entries as above
#     except Exception as e:
#         log_error(rss_feed, e)
# Report how many feeds were checked and how many new entries were saved
print(f"\n[{total_feeds}] feeds checked.")
print(f"\n[{new_entries_count}] new entries found and saved to:\n{output_file}\n")