archive_comments.py
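"""Archive and delete a Reddit account's comment history.

Exports the account's newest comments (up to Reddit's ~1000-item listing
limit) to a JSON file, then overwrites each comment body with "[deleted]"
and deletes it via PRAW.
"""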
# Standard-library packages
import argparse
import json
import logging
import sys
import time
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor

# External packages
import praw
import yaml

# Set up logger
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger("COMMENT_ARCHIVER")

# Load configs (safe_load avoids executing arbitrary tags in the YAML)
with open("config.yaml", "r") as cfg_file:
    cfg = yaml.safe_load(cfg_file)
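
# The script expects a config.yaml next to it. A minimal example with
# placeholder values (these keys are the only ones the script reads):
#
#   client_id: "your-reddit-app-client-id"
#   client_secret: "your-reddit-app-client-secret"
#   user_agent: "comment-archiver by u/your-username"
#   username: "your-username"
#   pw: "your-password"
#   default_file_name: "comments.json"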

def connect():
    # Authentication
    return praw.Reddit(
        client_id=cfg["client_id"],
        client_secret=cfg["client_secret"],
        user_agent=cfg["user_agent"],
        username=cfg["username"],
        password=cfg["pw"],
    )

def get_file_path():
    # Command-line options parser
    parser = argparse.ArgumentParser(description="Archives Reddit comments.")
    parser.add_argument("-p", "--path", dest="path", help="Set the path for the exported file.")
    args = parser.parse_args()
    if args.path is not None:
        path = args.path
        # Append a trailing separator, matching whichever style the path already uses
        if not path.endswith("/") and not path.endswith("\\"):
            if "\\" in path:
                path += "\\"
            else:
                path += "/"
        return path + cfg["default_file_name"]
    else:
        return cfg["default_file_name"]

def export_comments(reddit):
    file_path = get_file_path()
    logger.info(f"Saving all comments to file {file_path}...")
    try:
        # "w" rather than append, so each run writes one valid JSON document
        file = open(file_path, "w")
    except OSError:
        logger.exception(f"Unable to create file {file_path}. Please check that the file path is valid.")
        raise
    with file:
        fields = (
            "subreddit_name_prefixed",
            "link_title",
            "link_id",
            "link_url",
            "name",
            "id",
            "parent_id",
            "created",
            "created_utc",
            "permalink",
            "score",
            "body",
        )
        comment_list = []
        # Reddit's listing API returns at most the newest ~1000 comments, even with limit=None
        for comment in reddit.redditor(cfg["username"]).comments.new(limit=None):
            if comment.body == "[deleted]":
                continue
            comment_dict = vars(comment)
            sub_dict = OrderedDict()
            for field in fields:
                sub_dict[field] = comment_dict[field]
            sub_dict["permalink"] = "https://www.reddit.com" + sub_dict["permalink"]
            # Convert the epoch time to a readable local timestamp
            sub_dict["local_time"] = time.strftime("%Y-%m-%d %I:%M:%S %p EST", time.localtime(sub_dict["created_utc"]))
            comment_list.append(sub_dict)
        json_string = json.dumps(comment_list, indent=4)
        file.write(json_string)
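
# Each archived comment is serialized with the fields above. An illustrative,
# made-up record (not real data):
#   {
#       "subreddit_name_prefixed": "r/learnpython",
#       "link_title": "How do I parse YAML?",
#       "permalink": "https://www.reddit.com/r/learnpython/comments/abc123/...",
#       "score": 12,
#       "body": "Use yaml.safe_load.",
#       "local_time": "2020-01-01 09:30:00 AM EST"
#   }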

def overwrite_and_delete_comments(reddit):
    logger.info("Overwriting all comments (this may take a few minutes)...")
    with ThreadPoolExecutor(max_workers=5) as executor:
        deleted_comment_count = 0
        for comment in reddit.redditor(cfg["username"]).comments.new(limit=1000):
            deleted_comment_count += 1
            executor.submit(delete_comment_worker, comment, deleted_comment_count)
            # Throttle submissions: Reddit rate-limits rapid requests, and the
            # resulting backoff slows everything to a crawl
            time.sleep(1)

def delete_comment_worker(comment, deleted_comment_count: int):
    if comment.body != "[deleted]":
        comment.edit("[deleted]")
    comment.delete()
    # Log lines can appear out of order because workers run concurrently
    logger.info(f"Deleted comment #{deleted_comment_count}")

def main():
    reddit = connect()
    logger.info(f"Logged in as user u/{cfg['username']}")
    export_comments(reddit)
    overwrite_and_delete_comments(reddit)
    logger.info("Finished successfully!")


if __name__ == "__main__":
    main()