mklist-creative-commons

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Fetch list of movies with Creative Commons licensing.
"""

import argparse
import json
import lxml.html
import re
import urllib2
import urlparse
import movielib

def fetch_wikipedia_list(list, url):
    try:
        root = lxml.html.fromstring(movielib.http_get_read(url))
    except urllib2.HTTPError as e:
        return None
    for a in root.cssselect("a[href]"):
        entryurl = a.attrib['href']
        entryurl = entryurl.replace('https://', 'http://')
        # Only pick imdb.com links
        if -1 != entryurl.find('imdb.com/title/'):
            print entryurl
            list[entryurl] = {
                'status' : 'free',
                'freenessurl' : url,
                #'title' : info['title'],
                #'year' : info['year'],
                }
    return list

def fetch_wikipedia_category(list, url, baseinfo = {}):
    try:
        html = movielib.http_get_read(url)
        if html:
            root = lxml.html.fromstring(html)
        else:
            return list
    except urllib2.HTTPError as e:
        return None
    for a in root.cssselect("div#mw-pages div.mw-content-ltr a[href]"):
        entryurl = urlparse.urljoin(url, a.attrib['href'])
        wpinfo =  movielib.wikipedia_lookup(entryurl)
        ref = entryurl
        if 'imdb' in wpinfo:
            ref = wpinfo['imdb']
        #print list, ref, entryurl
        if -1 != entryurl.find('/wiki/'):
            print  entryurl, ref
            info = {}
            for k, v in baseinfo.items():
                info[k] = v
            info['status'] = 'free'
            info['freenessurl'] = url
            info['wp'] = entryurl
            if 'title' in wpinfo:
                info['title'] = wpinfo['title']
            if 'year' in wpinfo:
                info['year'] = wpinfo['year']
            list[ref] = info
    return list

def oldfilms():
    """
    Harvest the English Wikipedia year categories for old films.

    According to
    https://en.wikipedia.org/wiki/List_of_films_in_the_public_domain_in_the_United_States,
    every movie published in the USA before 1923 is now in the public
    domain in USA.

    The categories for the years 1874 up to and including 1922 are
    fetched, each entry is tagged with the year of its category, and
    the result is saved as 'free-movies-wikipedia-oldfilms.json'.
    """
    category_template = 'https://en.wikipedia.org/wiki/Category:%d_films'
    first_year = 1874
    cutoff_year = 1923  # exclusive upper bound
    movies = {}
    for year in xrange(first_year, cutoff_year):
        defaults = {'year' : year}
        fetch_wikipedia_category(movies, category_template % year, defaults)
    movielib.savelist(movies, name='free-movies-wikipedia-oldfilms.json')

def main():
    """
    Command line entry point.

    With --old only oldfilms() is run and its result returned.
    Otherwise the IMDb links from the English Wikipedia list of
    open-source films are collected, followed by the members of the
    Creative Commons wiki film category and of the Creative Commons
    film categories on English Wikipedia.  The combined result is
    saved under the name given with --output.
    """
    argp = argparse.ArgumentParser()
    argp.add_argument('--output', default='free-movies-creative-commons.json')
    argp.add_argument('--old', action='store_true', default=False,
                      help='fetch list of old movies listed on wikipedia')
    options = argp.parse_args()
    if options.old:
        return oldfilms()

    movies = {}

    # List from English Wikipedia
    fetch_wikipedia_list(
        movies, "https://en.wikipedia.org/wiki/List_of_open-source_films")

    # Category pages, processed in the order listed.
    category_pages = (
        # List of CC movies on CC wiki
        "https://wiki.creativecommons.org/wiki/Category:Film",
        # List of Creative Commons licenced movies on English Wikipedia
        "https://en.wikipedia.org/wiki/Category:Creative_Commons-licensed_films",
        "https://en.wikipedia.org/wiki/Category:Creative_Commons-licensed_documentary_films",
        "https://en.wikipedia.org/wiki/Category:Creative_Commons-licensed_science_fiction_films",
    )
    for page in category_pages:
        fetch_wikipedia_category(movies, page)

    movielib.savelist(movies, name=options.output)

# Run main() only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()