diff --git a/.gitignore b/.gitignore
index c724ba7..99b69ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@
 dist
 build
 *.sublime*
+.cache
+__pycache__
diff --git a/.travis.yml b/.travis.yml
index 9fd187d..30fca7a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,4 +29,16 @@ install:
   - conda install -c conda-forge netCDF4 # For optional DAP file size calculation
   - conda install -c conda-forge pytest
 
-script: py.test -rx -v
+script:
+  - py.test -rx -v
+  - conda install -n root conda-build anaconda-client
+  - conda build conda-recipe --python $TRAVIS_PYTHON_VERSION
+  - conda install thredds_crawler --use-local
+
+deploy:
+  provider: releases
+  api_key:
+    secure: XAx2aeocMQWn2acXcQ5LevsO977glpvPKOnk/2yafHTMd+VROVy8jZjsVTTwOEhzag2xOYgTyDYbX5PRT2uG2Uz/RPwJA0PbB+9NIiT1gvHZ/sfFEm7AfOQ257I2IL72ZGUuSZoa0I1pZnIFaew84FZGQ/jsNtfWZzo1veXI6A0=
+  on:
+    tags: true
+    repo: ioos/thredds_crawler
diff --git a/README.md b/README.md
index 0e47eb4..7793932 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-thredds_crawler
-===============
+# thredds_crawler
 
 [![Build Status](https://travis-ci.org/ioos/thredds_crawler.svg?branch=master)](https://travis-ci.org/ioos/thredds_crawler)
 
@@ -26,7 +25,7 @@ You can select datasets based on their THREDDS ID using the 'select' parameter.
 
 ```python
 from thredds_crawler.crawl import Crawl
-c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
+c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=[".*-Agg"])
 print c.datasets
 [
  ,
@@ -74,7 +73,11 @@ If you need to remove or add a new `skip`, it is **strongly** encouraged you use
 ```python
 from thredds_crawler.crawl import Crawl
 skips = Crawl.SKIPS + [".*-Day-Aggregation"]
-c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], skip=skips)
+c = Crawl(
+    'http://tds.maracoos.org/thredds/MODIS.xml',
+    select=[".*-Agg"],
+    skip=skips
+)
 print c.datasets
 [
 
@@ -128,24 +131,42 @@ You can select data by the THREDDS `modified_time` by using a the `before` and `
 import pytz
 from thredds_crawler.crawl import Crawl
 
-# after
+bf = datetime(2016, 1, 5, 0, 0)
 af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc)
-c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af)
+url = 'http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml'
+
+# after
+c = Crawl(url, after=af)
 assert len(c.datasets) == 3
 
 # before
-bf = datetime(2016, 1, 5, 0, 0)
-c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf)
+c = Crawl(url, before=bf)
 assert len(c.datasets) == 3
 
 # both
 af = datetime(2016, 1, 20, 0, 0)
 bf = datetime(2016, 2, 1, 0, 0)
-c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af)
+c = Crawl(url, before=bf, after=af)
 assert len(c.datasets) == 11
 ```
 
 
+### Authentication
+
+You can pass an auth parameter as needed. It needs to be a [requests compatible auth object](http://docs.python-requests.org/en/latest/user/authentication/).
+
+```python
+from thredds_crawler.crawl import Crawl
+auth = ('user', 'password')
+c = Crawl(
+    'http://tds.maracoos.org/thredds/MODIS.xml',
+    select=['.*-Agg'],
+    skip=Crawl.SKIPS,
+    auth=auth
+)
+```
+
+
 ### Debugging
 
 You can pass in a `debug=True` parameter to Crawl to log to STDOUT what is actually happening.
@@ -153,7 +174,12 @@ You can pass in a `debug=True` parameter to Crawl to log to STDOUT what is actua
 ```python
 from thredds_crawler.crawl import Crawl
 skips = Crawl.SKIPS + [".*-Day-Aggregation"]
-c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], skip=skips, debug=True)
+c = Crawl(
+    'http://tds.maracoos.org/thredds/MODIS.xml',
+    select=['.*-Agg'],
+    skip=skips,
+    debug=True
+)
 
 Crawling: http://tds.maracoos.org/thredds/MODIS.xml
 Skipping catalogRef based on 'skips'. Title: MODIS Individual Files
@@ -189,7 +215,7 @@ You can get some basic information about a LeafDataset, including the services a
 
 ```python
 from thredds_crawler.crawl import Crawl
-c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
+c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg'])
 dataset = c.datasets[0]
 print dataset.id
 MODIS-Agg
@@ -214,7 +240,7 @@ If you have a list of datasets you can easily return all endpoints of a certain
 
 ```python
 from thredds_crawler.crawl import Crawl
-c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
+c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg'])
 urls = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "opendap"]
 print urls
 [
@@ -236,7 +262,10 @@ This isn't necessarialy the size on disk, because it does not account for `missi
 
 ```python
 from thredds_crawler.crawl import Crawl
-c = Crawl("http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html", select=["MB_.*"])
+c = Crawl(
+    'http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html',
+    select=['MB_.*']
+)
 sizes = [d.size for d in c.datasets]
 print sizes
 [29247.410283999998, 72166.289680000002]
@@ -249,7 +278,7 @@ The entire THREDDS catalog metadata record is saved along with the dataset objec
 
 ```python
 from thredds_crawler.crawl import Crawl
-c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
+c = Crawl('http://tds.maracoos.org/thredds/MODIS.xml', select=['.*-Agg'])
 dataset = c.datasets[0]
 print dataset.metadata.find("{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}documentation").text
 Ocean Color data are provided as a service to the broader community, and can be
diff --git a/VERSION b/VERSION
index 26ca594..4cda8f1 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.5.1
+1.5.2
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
new file mode 100644
index 0000000..90dbcd6
--- /dev/null
+++ b/conda-recipe/meta.yaml
@@ -0,0 +1,33 @@
+package:
+  name: thredds_crawler
+  version: "1.5.2"
+
+source:
+  path: ../
+
+build:
+  number: 0
+  script: python setup.py install --single-version-externally-managed --record=record.txt
+
+requirements:
+  build:
+    - python
+    - setuptools
+    - requests
+    - lxml
+    - pytz
+  run:
+    - python
+    - requests
+    - lxml
+    - netcdf4
+    - pytz
+
+test:
+  imports:
+    - thredds_crawler
+
+about:
+  home: https://github.com/ioos/thredds_crawler
+  license: MIT License
+  summary: 'A Python library for crawling THREDDS servers'
diff --git a/thredds_crawler/__init__.py b/thredds_crawler/__init__.py
index 77f1c8e..c3b3841 100644
--- a/thredds_crawler/__init__.py
+++ b/thredds_crawler/__init__.py
@@ -1 +1 @@
-__version__ = '1.5.0'
+__version__ = '1.5.2'
diff --git a/thredds_crawler/crawl.py b/thredds_crawler/crawl.py
index 7efc75e..c491ca4 100644
--- a/thredds_crawler/crawl.py
+++ b/thredds_crawler/crawl.py
@@ -1,4 +1,3 @@
-from thredds_crawler.etree import etree
 try:
     import urlparse
     from urllib import quote_plus
@@ -6,19 +5,22 @@
     from urllib import parse as urlparse
     from urllib.parse import quote_plus
 import requests
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
 import os
 import sys
 import re
+import logging
 from datetime import datetime
 import pytz
+from lxml import etree
 from thredds_crawler.utils import construct_url
 from dateutil.parser import parse
 import multiprocessing as mp
 
 INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
 XLINK_NS = "http://www.w3.org/1999/xlink"
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
-import logging
 try:
     # Python >= 2.7
     from logging import NullHandler
@@ -27,36 +29,37 @@ class NullHandler(logging.Handler):
         def emit(self, record):
             pass
 
-logger = logging.getLogger("thredds_crawler")
-logger.addHandler(NullHandler())
+logger = logging.getLogger(__name__)
 
 
-def request_xml(url):
+def request_xml(url, auth=None):
     '''
     Returns an etree.XMLRoot object loaded from the url
 
     :param str url: URL for the resource to load as an XML
     '''
     try:
-        r = requests.get(url, verify=False)
+        r = requests.get(url, auth=auth, verify=False)
         return r.text.encode('utf-8')
     except BaseException:
         logger.error("Skipping %s (error parsing the XML)" % url)
         return
 
 
-def make_leaf(url):
-    return LeafDataset(url)
+def make_leaf(url, auth):
+    return LeafDataset(url, auth=auth)
 
 
 class Crawl(object):
 
     SKIPS = [".*files.*", ".*Individual Files.*", ".*File_Access.*", ".*Forecast Model Run.*", ".*Constant Forecast Offset.*", ".*Constant Forecast Date.*"]
 
-    def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=4):
+    def __init__(self, catalog_url, select=None, skip=None, before=None, after=None, debug=None, workers=None, auth=None):
         """
-        select: a list of dataset IDs. Python regex supported.
-        skip: list of dataset names and/or a catalogRef titles. Python regex supported.
+        :param select list: Dataset IDs. Python regex supported.
+        :param list skip: Dataset names and/or a catalogRef titles. Python regex supported.
+        :param requests.auth.AuthBase auth: requets auth object to use
         """
+        workers = workers or 4
         self.pool = mp.Pool(processes=workers)
         if debug is True:
@@ -66,6 +69,8 @@ def __init__(self, catalog_url, select=None, skip=None, before=None, after=None,
             formatter = logging.Formatter('%(asctime)s - [%(levelname)s] %(message)s')
             ch.setFormatter(formatter)
             logger.addHandler(ch)
+        else:
+            logger.addHandler(NullHandler())
 
         # Only process these dataset IDs
         if select is not None:
@@ -102,9 +107,9 @@ def __init__(self, catalog_url, select=None, skip=None, before=None, after=None,
         self.visited = []
         datasets = []
 
-        urls = list(self._run(url=catalog_url))
+        urls = list(self._run(url=catalog_url, auth=auth))
 
-        jobs = [self.pool.apply_async(make_leaf, args=(url,)) for url in urls]
+        jobs = [self.pool.apply_async(make_leaf, args=(url, auth)) for url in urls]
         datasets = [j.get() for j in jobs]
 
         self.datasets = [ x for x in datasets if x.id is not None ]
@@ -182,11 +187,12 @@ def _compile_references(self, url, tree):
             references.append(construct_url(url, ref.get("{%s}href" % XLINK_NS)))
         return references
 
-    def _run(self, url):
+    def _run(self, url, auth):
         '''
         Performs a multiprocess depth-first-search of the catalog references and yields a URL for each leaf dataset found
 
         :param str url: URL for the current catalog
+        :param requests.auth.AuthBase auth: requets auth object to use
         '''
         if url in self.visited:
             logger.debug("Skipping %s (already crawled)" % url)
@@ -197,7 +203,7 @@ def _run(self, url):
         url = self._get_catalog_url(url)
 
         # Get an etree object
-        xml_content = request_xml(url)
+        xml_content = request_xml(url, auth)
 
         for ds in self._build_catalog(url, xml_content):
             yield ds
@@ -229,7 +235,7 @@ def _build_catalog(self, url, xml_content):
 
 
 class LeafDataset(object):
-    def __init__(self, dataset_url):
+    def __init__(self, dataset_url, auth=None):
         self.services = []
         self.id = None
 
@@ -239,55 +245,60 @@ def __init__(self, dataset_url):
         self.data_size = None
 
         # Get an etree object
-        r = requests.get(dataset_url, verify=False)
+        r = requests.get(dataset_url, auth=auth, verify=False)
         try:
             tree = etree.XML(r.text.encode('utf-8'))
         except etree.XMLSyntaxError:
             logger.error("Error procesing %s, invalid XML" % dataset_url)
         else:
-            dataset = tree.find("{%s}dataset" % INV_NS)
-            self.id = dataset.get("ID")
-            self.name = dataset.get("name")
-            self.metadata = dataset.find("{%s}metadata" % INV_NS)
-            self.catalog_url = dataset_url.split("?")[0]
-
-            # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
-            data_size = dataset.find("{%s}dataSize" % INV_NS)
-            if data_size is not None:
-                self.data_size = float(data_size.text)
-                data_units = data_size.get('units')
-                # Convert to MB
-                if data_units == "bytes":
-                    self.data_size *= 1e-6
-                elif data_units == "Kbytes":
-                    self.data_size *= 0.001
-                elif data_units == "Gbytes":
-                    self.data_size /= 0.001
-                elif data_units == "Tbytes":
-                    self.data_size /= 1e-6
-
-            # Services
-            service_tag = dataset.find("{%s}serviceName" % INV_NS)
-            if service_tag is None:
-                service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
-            service_name = service_tag.text
-
-            for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
-                if service.get("serviceType") == "Compound":
-                    for s in service.findall("{%s}service" % INV_NS):
-                        url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
-                        if s.get("suffix") is not None:
-                            url += s.get("suffix")
+            try:
+                dataset = tree.find("{%s}dataset" % INV_NS)
+                self.id = dataset.get("ID")
+                self.name = dataset.get("name")
+                self.metadata = dataset.find("{%s}metadata" % INV_NS)
+                self.catalog_url = dataset_url.split("?")[0]
+
+                # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
+                data_size = dataset.find("{%s}dataSize" % INV_NS)
+                if data_size is not None:
+                    self.data_size = float(data_size.text)
+                    data_units = data_size.get('units')
+                    # Convert to MB
+                    if data_units == "bytes":
+                        self.data_size *= 1e-6
+                    elif data_units == "Kbytes":
+                        self.data_size *= 0.001
+                    elif data_units == "Gbytes":
+                        self.data_size /= 0.001
+                    elif data_units == "Tbytes":
+                        self.data_size /= 1e-6
+
+                # Services
+                service_tag = dataset.find("{%s}serviceName" % INV_NS)
+                if service_tag is None:
+                    service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
+                if service_tag is None:
+                    raise ValueError("No serviceName definition found!")
+                service_name = service_tag.text
+
+                for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
+                    if service.get("serviceType") == "Compound":
+                        for s in service.findall("{%s}service" % INV_NS):
+                            url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
+                            if s.get("suffix") is not None:
+                                url += s.get("suffix")
+                            # ISO like services need additional parameters
+                            if s.get('name') in ["iso", "ncml", "uddc"]:
+                                url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
+                            self.services.append( {'name' : s.get('name'), 'service' : s.get('serviceType'), 'url' : url } )
+                    else:
+                        url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                         # ISO like services need additional parameters
-                        if s.get('name') in ["iso", "ncml", "uddc"]:
+                        if service.get('name') in ["iso", "ncml", "uddc"]:
                             url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
-                        self.services.append( {'name' : s.get('name'), 'service' : s.get('serviceType'), 'url' : url } )
-                else:
-                    url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
-                    # ISO like services need additional parameters
-                    if service.get('name') in ["iso", "ncml", "uddc"]:
-                        url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
-                    self.services.append( {'name' : service.get('name'), 'service' : service.get('serviceType'), 'url' : url } )
+                        self.services.append( {'name' : service.get('name'), 'service' : service.get('serviceType'), 'url' : url } )
+            except BaseException as e:
+                logger.error('Could not process {}. {}.'.format(dataset_url, e))
 
     @property
     def size(self):
diff --git a/thredds_crawler/etree.py b/thredds_crawler/etree.py
deleted file mode 100644
index 4dfea6d..0000000
--- a/thredds_crawler/etree.py
+++ /dev/null
@@ -1,20 +0,0 @@
-try:
-    from lxml import etree
-except ImportError:
-    try:
-        # Python 2.5
-        import xml.etree.cElementTree as etree
-    except ImportError:
-        try:
-            # Python 2.5
-            import xml.etree.ElementTree as etree
-        except ImportError:
-            try:
-                # normal cElementTree install
-                import cElementTree as etree
-            except ImportError:
-                try:
-                    # normal ElementTree install
-                    import elementtree.ElementTree as etree
-                except ImportError:
-                    raise RuntimeError('You need either lxml or ElementTree')
diff --git a/tests/__init__.py b/thredds_crawler/tests/__init__.py
similarity index 100%
rename from tests/__init__.py
rename to thredds_crawler/tests/__init__.py
diff --git a/tests/test_crawler.py b/thredds_crawler/tests/test_crawler.py
similarity index 66%
rename from tests/test_crawler.py
rename to thredds_crawler/tests/test_crawler.py
index 7ad1c08..f5e8530 100644
--- a/tests/test_crawler.py
+++ b/thredds_crawler/tests/test_crawler.py
@@ -2,8 +2,12 @@
 from datetime import datetime, timedelta
 import pytz
+import logging
 
 from thredds_crawler.crawl import Crawl
 
+logger = logging.getLogger('thredds_crawler')
+logger.setLevel(logging.DEBUG)
+logger.handlers = [logging.StreamHandler()]
 
 
 class CrawlerTest(unittest.TestCase):
@@ -33,41 +37,60 @@ def test_regex_skips(self):
         assert len(c.datasets) == 0
 
     def test_iso_links(self):
-        c = Crawl("http://thredds.axiomalaska.com/thredds/catalogs/global.html", debug=True)
+        c = Crawl("http://thredds.axiomdatascience.com/thredds/global.html")
         isos = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "iso"]
         assert "?dataset=" in isos[0]
         assert "&catalog=" in isos[0]
 
     def test_dataset_size_using_xml(self):
-        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Composites-1Day/2014/catalog.xml", debug=True)
+        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Composites-1Day/2014/catalog.xml")
         self.assertIsNotNone(c.datasets[0].size)
 
     def test_dataset_size_using_dap(self):
-        c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=["MODIS-One-Agg"], debug=True)
+        c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=["MODIS-One-Agg"])
         self.assertIsNotNone(c.datasets[0].size)
 
     def test_modified_time(self):
         # after with timezone
         af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc)
-        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af, debug=True)
+        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af)
         assert len(c.datasets) == 3
 
         # after without timezone
         af = datetime(2015, 12, 30, 0, 0)
-        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af, debug=True)
+        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml", after=af)
        assert len(c.datasets) == 3
 
         # before
         bf = datetime(2016, 1, 5, 0, 0)
-        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, debug=True)
+        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf)
         assert len(c.datasets) == 3
 
         # both
         af = datetime(2016, 1, 20, 0, 0)
         bf = datetime(2016, 2, 1, 0, 0)
-        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af, debug=True)
+        c = Crawl("http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml", before=bf, after=af)
         assert len(c.datasets) == 11
 
     def test_ssl(self):
-        c = Crawl("https://opendap.co-ops.nos.noaa.gov/thredds/catalog/NOAA/DBOFS/MODELS/201501/catalog.xml", debug=True)
+        c = Crawl("https://opendap.co-ops.nos.noaa.gov/thredds/catalog/NOAA/DBOFS/MODELS/201501/catalog.xml")
         assert len(c.datasets) > 0
+
+    def test_unidata_parse(self):
+        selects = [".*Best.*"]
+        skips = Crawl.SKIPS + [".*grib2", ".*grib1", ".*GrbF.*", ".*ncx2",
+                               "Radar Data", "Station Data",
+                               "Point Feature Collections", "Satellite Data",
+                               "Unidata NEXRAD Composites \(GINI\)",
+                               "Unidata case studies",
+                               ".*Reflectivity-[0-9]{8}"]
+        c = Crawl(
+            'http://thredds.ucar.edu/thredds/catalog.xml',
+            select=selects,
+            skip=skips
+        )
+
+        assert len(c.datasets) > 0
+
+        isos = [(d.id, s.get("url")) for d in c.datasets for s in d.services if s.get("service").lower() == "iso"]
+        assert len(isos) > 0