-
Notifications
You must be signed in to change notification settings - Fork 0
/
mklist-creative-commons
executable file
·111 lines (101 loc) · 3.45 KB
/
mklist-creative-commons
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fetch list of movies with Creative Commons licensing.
"""
import argparse
import json
import lxml.html
import re
import urllib2
import urlparse
import movielib
def fetch_wikipedia_list(list, url):
    """
    Collect every IMDb title link found on the page at 'url'.

    Each link whose address contains 'imdb.com/title/' is stored in the
    dictionary passed in as 'list'.  The key is the link address with
    'https://' rewritten to 'http://'; the value marks the entry as
    free and records 'url' as the source of that claim.  Every accepted
    address is also printed to standard output.

    Returns the updated dictionary.  Returns it unchanged if the fetch
    produced no content, and returns None if the fetch raised
    urllib2.HTTPError.
    """
    # NOTE: the first parameter shadows the builtin 'list'; the name is
    # kept so existing callers passing it by keyword keep working.
    try:
        html = movielib.http_get_read(url)
    except urllib2.HTTPError:
        return None
    if not html:
        # Nothing to parse; handing an empty body to lxml would raise.
        return list
    root = lxml.html.fromstring(html)
    for a in root.cssselect("a[href]"):
        entryurl = a.attrib['href'].replace('https://', 'http://')
        # Only pick imdb.com links
        if 'imdb.com/title/' not in entryurl:
            continue
        print(entryurl)
        # No title or year is recorded here, only the freeness status.
        list[entryurl] = {
            'status': 'free',
            'freenessurl': url,
        }
    return list
def fetch_wikipedia_category(list, url, baseinfo=None):
    """
    Collect the pages linked from the category page at 'url'.

    Links are taken from the 'div#mw-pages div.mw-content-ltr' part of
    the page and resolved relative to 'url'.  Only addresses containing
    '/wiki/' are kept.  Each kept page is looked up with
    movielib.wikipedia_lookup() and stored in the dictionary passed in
    as 'list', keyed by the 'imdb' value from the lookup when there is
    one and by the page address otherwise.

    The stored entry starts as a copy of 'baseinfo' (if given), then
    gets 'status', 'freenessurl' and 'wp' set, and finally 'title' and
    'year' from the lookup when present, so looked-up values win over
    those in 'baseinfo'.  Every kept address is printed together with
    its key.

    Returns the updated dictionary.  Returns it unchanged if the fetch
    produced no content, and returns None if the fetch raised
    urllib2.HTTPError.
    """
    # NOTE: the first parameter shadows the builtin 'list'; the name is
    # kept so existing callers passing it by keyword keep working.
    try:
        html = movielib.http_get_read(url)
    except urllib2.HTTPError:
        return None
    if not html:
        return list
    if baseinfo is None:
        baseinfo = {}
    root = lxml.html.fromstring(html)
    for a in root.cssselect("div#mw-pages div.mw-content-ltr a[href]"):
        entryurl = urlparse.urljoin(url, a.attrib['href'])
        # Check the address first so no lookup is spent on links that
        # are skipped anyway.
        if '/wiki/' not in entryurl:
            continue
        # NOTE(review): wikipedia_lookup() is assumed to always return
        # a mapping - confirm against movielib.
        wpinfo = movielib.wikipedia_lookup(entryurl)
        ref = wpinfo['imdb'] if 'imdb' in wpinfo else entryurl
        print('%s %s' % (entryurl, ref))
        # Copy so the caller's baseinfo is never modified.
        info = dict(baseinfo)
        info['status'] = 'free'
        info['freenessurl'] = url
        info['wp'] = entryurl
        for field in ('title', 'year'):
            if field in wpinfo:
                info[field] = wpinfo[field]
        list[ref] = info
    return list
def oldfilms(firstyear=1874, cutoffyear=1923,
             name='free-movies-wikipedia-oldfilms.json'):
    """
    Fetch the English Wikipedia '<year> films' categories of old films.

    According to
    https://en.wikipedia.org/wiki/List_of_films_in_the_public_domain_in_the_United_States,
    every movie published in the USA before 1923 is now in the public
    domain in USA.

    One category page is fetched for every year from 'firstyear' up to,
    but not including, 'cutoffyear'.  The year is passed along as base
    information for every entry found.  The collected entries are
    saved with movielib.savelist() under the file name 'name'.

    NOTE(review): the cutoff year above is the one stated in this
    file; check the page above before changing the default.
    """
    l = {}
    for y in range(firstyear, cutoffyear):
        url = 'https://en.wikipedia.org/wiki/Category:%d_films' % y
        fetch_wikipedia_category(l, url, {'year': y})
    movielib.savelist(l, name=name)
def main():
    """
    Command line entry point.

    With --old, only oldfilms() is run and its result returned; the
    --output value is not used in that mode.  Otherwise the list of
    open-source films and a set of Creative Commons category pages are
    fetched into one dictionary, which is saved under the name given
    with --output.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        '--output', default='free-movies-creative-commons.json')
    argparser.add_argument(
        '--old', action='store_true', default=False,
        help='fetch list of old movies listed on wikipedia')
    options = argparser.parse_args()

    if options.old:
        return oldfilms()

    movies = {}

    # List from English Wikipedia
    fetch_wikipedia_list(
        movies, "https://en.wikipedia.org/wiki/List_of_open-source_films")

    # Category pages, fetched in this order.
    categories = (
        # List of CC movies on CC wiki
        "https://wiki.creativecommons.org/wiki/Category:Film",
        # List of Creative Commons licenced movies on English Wikipedia
        "https://en.wikipedia.org/wiki/Category:Creative_Commons-licensed_films",
        "https://en.wikipedia.org/wiki/Category:Creative_Commons-licensed_documentary_films",
        "https://en.wikipedia.org/wiki/Category:Creative_Commons-licensed_science_fiction_films",
    )
    for category in categories:
        fetch_wikipedia_category(movies, category)

    movielib.savelist(movies, name=options.output)
# Run the fetcher only when executed as a script, not when imported.
if "__main__" == __name__:
    main()