-
Notifications
You must be signed in to change notification settings - Fork 2
/
gsheet2shortlinks.py
252 lines (209 loc) · 9.45 KB
/
gsheet2shortlinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
import click
import csv
from html.parser import HTMLParser
import json
import pprint
import re
import requests
import textwrap
import urllib
from commands.common import common_params
from commands.utils.rebrandly import Rebrandly, AmbiguousCustomDomainError, NoCustomDomainsExistError
CONTEXT_SETTINGS = dict(help_option_names=['--help', '-h'])
GSHEET_URL_RE = re.compile('https://docs.google.com/spreadsheets/d/([\w_-]+)/(?:edit|view)(?:#gid=([0-9]+))?')
def parse_gsheet_url(url):
matches = GSHEET_URL_RE.match(url)
# Raise error if key not parseable.
spreadsheet_key = matches.group(1)
if spreadsheet_key == None:
raise 'Could not parse key from spreadsheet url'
# Assume first worksheet if not specified.
worksheet_id = matches.group(2)
if worksheet_id == None:
worksheet_id = 0
return spreadsheet_key, worksheet_id
def lookup_link(links=[], slashtag=''):
matched_link = [l for l in links if l['slashtag'] == slashtag]
if matched_link:
return matched_link.pop()
else:
return None
# See: https://stackoverflow.com/a/36650753/504018
class TitleParser(HTMLParser):
# Customized: self.found_once ensures we only grab the first tag, as some
# modern single-page application framewords seem to have multiple.
# (whatever Meetup.com uses)
def __init__(self):
HTMLParser.__init__(self)
self.match = False
self.found_already = False
self.title = ''
def handle_starttag(self, tag, attributes):
self.match = True if tag == 'title' else False
def handle_data(self, data):
if self.match and not self.found_already:
self.found_already = True
self.title = data
self.match = False
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('--gsheet',
required=True,
envvar='CTTO_SHORTLINK_GSHEET',
help='URL to publicly readable Google Spreadsheet, including sheet ID gid',
metavar='<url>')
@click.option('--rebrandly-api-key',
required=True,
help='API key for Rebrandly',
metavar='<string>')
@click.option('--domain-name', '-d',
envvar='CTTO_SHORTLINK_DOMAIN',
help='Shortlink domain on Rebrandly [required if multiple domains on account]',
metavar='<example.com>')
@common_params
# TODO: Accomodate --verbose flag.
def gsheet2rebrandly(rebrandly_api_key, gsheet, domain_name, yes, verbose, debug, noop):
"""Create/update Rebrandly shortlinks from a Google Docs spreadsheet.
Here are some notes on spreadsheet columns:
* slashtag: the shortlink component of path.
* destination_url: where the shortlink points to. Leaving blank will delete on next run.
* If the following columns exist and a --google-creds option is passed, they will be updated:
* Note: These features are not yet implemented.
* created: date and time when the link was created and tracking began.
* clicks: number of click-through since creation.
* Extra columns will have no effect.
Sample Spreadsheet:
https://docs.google.com/spreadsheets/d/12VUXPCpActC77wy6Q8Khyb-iZ_nlNwshO8XswYRj5XE/edit#gid=776462093
"""
if debug: click.echo('>>> Debug mode: enabled')
if noop: click.echo('>>> No-op mode: enabled (No operations affecting data will be run)')
### Fetch spreadsheet
spreadsheet_key, worksheet_id = parse_gsheet_url(gsheet)
CSV_URL_TEMPLATE = 'https://docs.google.com/spreadsheets/d/{key}/export?format=csv&id={key}&gid={id}'
csv_url = CSV_URL_TEMPLATE.format(key=spreadsheet_key, id=worksheet_id)
# Fetch and parse shortlink CSV.
r = requests.get(csv_url)
if r.status_code != requests.codes.ok:
raise click.Abort()
csv_content = r.content.decode('utf-8')
csv_content = csv_content.split('\r\n')
### Confirm spreadsheet title
cd_header = r.headers.get('Content-Disposition')
# See: https://tools.ietf.org/html/rfc5987#section-3.2.1 (ext-value definition)
m = re.search("filename\*=(?P<charset>.+)'(?P<language>.*)'(?P<filename>.+)", cd_header)
filename = m.group('filename')
filename = urllib.parse.unquote(filename)
# Remove csv filename suffix.
filename = filename[:-len('.csv')]
### Confirm domain
rebrandly = Rebrandly(rebrandly_api_key)
if domain_name:
# If --domain-name provided, check it.
rebrandly.set_domain_by_name(domain_name)
if not rebrandly.default_domain:
click.echo('Provided domain not attached to account. Exitting...')
raise click.Abort()
else:
try:
rebrandly.autodetect_domain()
except AmbiguousCustomDomainError:
click.echo('More than one domain found. Please specify one via --domain option:', err=True)
except NoCustomDomainsExistError:
click.echo('No custom domains attached to account. Exiting...', err=True)
domain_name = rebrandly.default_domain['fullName']
### Output confirmation to user
if not yes:
confirmation_details = """\
We are using the following configuration:
* Shortlink Domain: {domain}
* Spreadsheet - Worksheet: {name}
* Spreadsheet URL: {url}"""
# TODO: Find and display spreadsheet title
# Get from the file download name.
confirmation_details = confirmation_details.format(domain=domain_name, url=gsheet, name=filename)
click.echo(textwrap.dedent(confirmation_details))
click.confirm('Do you want to continue?', abort=True)
# TODO: Move pagination into rebrandly client class.
all_links = []
first = True
last_links = None
while first or last_links:
payload = {}
if last_links:
payload.update({'last': last_links[-1]['id']})
r = rebrandly.get('/links',
data=payload)
this_links = r.json()
all_links += this_links
last_links = this_links
first = False
# Iterate through CSV content and perform actions on data
reader = csv.DictReader(csv_content, delimiter=',')
for row in reader:
link = lookup_link(all_links, row['slashtag'])
if debug: click.echo(link, err=True)
# If destination_url empty, delete link.
if not row['destination_url']:
if not link:
click.echo('Non-existent shortlink: {} (already deleted)'.format(row['slashtag']))
continue
# NOTE: Not possible to "trash", only to fully delete, as per support chat question.
r = requests.delete('https://api.rebrandly.com/v1/links/'+link['id'],
headers={'apikey': rebrandly_api_key})
if debug: click.echo(pprint.pformat(r))
click.echo('Deleted shortlink: '+row['slashtag'])
continue
# Sometimes requests gets blocked
try:
r = requests.get(row['destination_url'], allow_redirects=True)
if 'text/html' in r.headers['Content-Type']:
# Extract page title after redirects.
parser = TitleParser()
# FIXME: Title parser. Not working.
title = parser.feed(r.content.decode('utf-8'))
else:
title = 'File: '+r.headers['Content-Type']
except requests.ConnectionError:
title = ''
payload = {
'slashtag': row['slashtag'],
# Don't use url with redirect resolution, because permissioned
# pages (like Google Docs) will redirect to login page.
'destination': row['destination_url'],
'title': title,
}
if debug: click.echo('>>> resolved as: ' + pprint.pformat(payload))
if link:
if noop:
pass
else:
r = requests.post('https://api.rebrandly.com/v1/links/'+link['id'],
data=json.dumps(payload),
headers={
'apikey': rebrandly_api_key,
'Content-Type': 'application/json',
})
if debug: click.echo('>>> ' + pprint.pformat(r.json()))
if r.status_code != requests.codes.ok:
click.echo(pprint.pformat(r.__dict__))
raise click.Abort()
click.echo('Updated shortlink: '+row['slashtag'])
else:
if noop:
pass
else:
payload['domain'] = {'fullName': domain_name}
payload['slashtag'] = row['slashtag']
r = requests.post('https://api.rebrandly.com/v1/links',
data=json.dumps(payload),
headers={
'apikey': rebrandly_api_key,
'Content-Type': 'application/json',
})
if debug: click.echo('>>> ' + pprint.pformat(r.json()))
if r.status_code != requests.codes.ok:
click.echo(pprint.pformat(r))
raise click.Abort()
click.echo('Created shortlink: '+row['slashtag'])
if noop: click.echo('Command exited no-op mode without creating/updating any data.')
if __name__ == '__main__':
gsheet2rebrandly(auto_envvar_prefix='CTTO')