diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 00000000..48226e55
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,7 @@
+[run]
+branch = True
+
+[report]
+show_missing = True
+omit =
+    textile/tests/*
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 09c1722b..d4599882 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,9 @@
 *~
 *.pyo
 *.egg-info
+.cache/
 .coverage
+.eggs/
 .noseids*
 docs/build
 docs/coverage
@@ -12,6 +14,7 @@ build
 bin
 dist
 eggs
+htmlcov
 parts
 develop-eggs
 .DS_Store
diff --git a/.travis.yml b/.travis.yml
index dec1c730..7f40ce9b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,15 +1,25 @@
 language: python
+env:
+  - REQUIREMENTS=true
+  - REQUIREMENTS=false
 python:
   - "2.6"
   - "2.7"
   - "3.2"
   - "3.3"
   - "3.4"
+  - "3.5"
   - "pypy"
 # command to install dependencies
 install:
-  - pip install -r requirements.txt
-  - python setup.py -q install
-  - if [[ ! $TRAVIS_PYTHON_VERSION == pypy ]] ; then pip install regex; fi
+  - if [[ $REQUIREMENTS == true ]] ; then pip install -r requirements.txt ; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == '3.2' ]] ; then pip install coverage==3.7.1; fi
+  - pip install coveralls pytest pytest-cov coverage codecov
+  - pip install -e .
+  - if [[ ! $TRAVIS_PYTHON_VERSION == 'pypy' ]] ; then pip install regex; fi
 # command to run tests
-script: nosetests
+script: py.test
+sudo: false
+after_success:
+  - coveralls
+  - codecov
diff --git a/CHANGELOG.textile b/CHANGELOG.textile
index a8cafbea..c9466fce 100644
--- a/CHANGELOG.textile
+++ b/CHANGELOG.textile
@@ -1,19 +1,38 @@
 h1. Textile Changelog
 
-h1. Version 2.2.1
+h2. Version 2.3.1
+* Regression bugfix: empty string input returns empty string again.
+
+h2. Version 2.3.0
+
+* Bugfixes:
+** Support data URIs in img tags
+** Fix autolink urls with image references ("#17":https://github.com/textile/python-textile/issues/17)
+** Fix textile links containing parentheses ("#20":https://github.com/textile/python-textile/issues/20)
+** Fix double-encoding of code blocks ("#21":https://github.com/textile/python-textile/issues/21)
+** Fix handling of scheme in self-linked URLs ("#16":https://github.com/textile/python-textile/issues/16)
+** Fix markup not being parsed if followed by certain characters ("#22":https://github.com/textile/python-textile/issues/22)
+* Convert testing over to "py.test":http://pytest.org/, improving unicode testing
+* Update functionality for tables, notelists, and footnotes. This involved a major reworking of parts of the code, but it should now match php-textile and txstyle.org precisely. Please file an issue for any bugs you come across.
+
+h2. Version 2.2.2
+
+* bugfix: "regex":https://pypi.python.org/pypi/regex is now an optional dependency
+
+h2. Version 2.2.1
 
 * drop textilefactory support for html.
 * Various development-related bugfixes.
 * Added this changelog.
 
-h1. Version 2.2.0
+h2. Version 2.2.0
 
 * Started refactoring the code to be less repetitive. @textile.Textile().parse()@ is a little more friendly than @textile.Textile().textile()@ There may be more work to be done on this front to make the flow a little smoother.
 * We now support versions 2.6 - 3.4 (including 3.2) using the same codebase. Many thanks to Radek Czajka for this.
 * Drop support for html4. We now only output xhtml or html5.
 * Various development-related bugfixes.
 
-h1. Version 2.1.8
+h2. Version 2.1.8
 
 * Add support for html5 output.
 * Lots of new functionality added bringing us in line with the official Textile 2.4
diff --git a/README.textile b/README.textile
index b2cc5372..e645e341 100644
--- a/README.textile
+++ b/README.textile
@@ -1,14 +1,16 @@
-!https://travis-ci.org/textile/python-textile.svg?branch=develop!:https://travis-ci.org/textile/python-textile
+!https://travis-ci.org/textile/python-textile.svg!:https://travis-ci.org/textile/python-textile
 !https://coveralls.io/repos/github/textile/python-textile/badge.svg!:https://coveralls.io/github/textile/python-textile?branch=master
 !https://codecov.io/github/textile/python-textile/coverage.svg!:https://codecov.io/github/textile/python-textile
 
 h1. python-textile
 
-python-textile is a Python port of Textile, Dean Allen's humane web text generator.
+python-textile is a Python port of "Textile":http://txstyle.org/, Dean Allen's humane web text generator.
 
 h2. Installation
 
-Install the 'textile' folder on your python path, or @pip install textile@.
-Optional dependencies include PIL/Pillow (for checking images size)
-and regex (for faster unicode-aware string matching).
+@pip install textile@
+
+Optional dependencies include:
+* "PIL/Pillow":http://python-pillow.github.io/ (for checking image sizes)
+* "regex":https://pypi.python.org/pypi/regex (for faster unicode-aware string matching).
 h2. Usage
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..882527b9
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+addopts = --cov=textile --cov-report=html --cov-append --cov-report=term-missing
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 42633e26..5cfb4428 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,2 @@
-nose==1.3.4
-coverage==3.7.1
 html5lib==0.999
-Pillow==2.6.0
+Pillow==3.0.0
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 5544feea..3a3405b0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,9 +1,4 @@
-[nosetests]
-detailed-errors=1
-with-coverage=1
-cover-package=textile
-cover-erase=1
-with-doctest=1
-with-id = 1
+[aliases]
+test=pytest
 [bdist_wheel]
 universal=1
diff --git a/setup.py b/setup.py
index 386c7744..897f07dd 100644
--- a/setup.py
+++ b/setup.py
@@ -2,14 +2,6 @@
 import os
 import sys
 
-install_requires = []
-
-
-if 'develop' in sys.argv:
-    install_requires.extend([
-        'tox',
-    ])
-
 def get_version():
     basedir = os.path.dirname(__file__)
     with open(os.path.join(basedir, 'textile/version.py')) as f:
@@ -32,14 +24,24 @@ def get_version():
         'Operating System :: OS Independent',
         'Programming Language :: Python',
         'Topic :: Software Development :: Libraries :: Python Modules',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
     ],
-    keywords='textile,text',
-    install_requires=install_requires,
+    keywords='textile,text,html markup',
+    install_requires=['six',],
    extras_require={
         ':python_version=="2.6"': ['ordereddict>=1.1'],
+        'develop': ['regex', 'pytest', 'pytest-cov'],
     },
-    test_suite='nose.collector',
-    tests_require=['nose'],
+    setup_requires=['pytest-runner'],
+    tests_require=['pytest', 'pytest-cov'],
     include_package_data=True,
     zip_safe=False,
 )
diff --git a/tests/test_attributes.py b/tests/test_attributes.py
new file mode 100644
index 00000000..70da8422 --- /dev/null +++ b/tests/test_attributes.py @@ -0,0 +1,15 @@ +from textile.utils import parse_attributes +import re + +def test_parse_attributes(): + assert parse_attributes('\\1', element='td') == {'colspan': '1'} + assert parse_attributes('/1', element='td') == {'rowspan': '1'} + assert parse_attributes('^', element='td') == {'style': 'vertical-align:top;'} + assert parse_attributes('{color: blue}') == {'style': 'color: blue;'} + assert parse_attributes('[en]') == {'lang': 'en'} + assert parse_attributes('(cssclass)') == {'class': 'cssclass'} + assert parse_attributes('(') == {'style': 'padding-left:1em;'} + assert parse_attributes(')') == {'style': 'padding-right:1em;'} + assert parse_attributes('<') == {'style': 'text-align:left;'} + assert parse_attributes('(c#i)') == {'class': 'c', 'id': 'i'} + assert parse_attributes('\\2 100', element='col') == {'span': '2', 'width': '100'} diff --git a/tests/test_block.py b/tests/test_block.py new file mode 100644 index 00000000..16873006 --- /dev/null +++ b/tests/test_block.py @@ -0,0 +1,49 @@ +from __future__ import unicode_literals + +from textile import Textile +from textile.objects import Block + +try: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict + +def test_block(): + t = Textile() + result = t.block('h1. foobar baby') + expect = '\t
google.com google.com blackhole@sun.comet
' + assert result == expect + +def test_github_issue_17(): + result = textile.textile('!http://www.ox.ac.uk/favicon.ico!') + expect = '\t' + assert result == expect + +def test_github_issue_20(): + text = 'This is a link to a ["Wikipedia article about Textile":http://en.wikipedia.org/wiki/Textile_(markup_language)].' + result = textile.textile(text) + expect = '\tThis is a link to a Wikipedia article about Textile.
' + assert result == expect + +def test_github_issue_21(): + text = '''h1. xml example + +bc. +\n<foo>\n bar\n</foo>\n
'
+ assert result == expect
+
+def test_github_issue_22():
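A note on the property the test above pins down: issue #21 was a double-encoding bug, where markup inside a bc. (block code) section came out as &amp;lt; instead of &lt;. A minimal standalone sketch of the single-escaping guarantee, using substring checks so it does not depend on the exact whitespace of the rendered block:

import textile

# Angle brackets inside a "bc." block should be HTML-escaped exactly once.
html = textile.textile('bc. <foo>bar</foo>')
assert '&lt;foo&gt;' in html     # escaped once
assert '&amp;lt;' not in html    # never double-encoded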
+ text = '''_(artist-name)Ty Segall_’s'''
+ result = textile.textile(text)
+ expect = '\tTy Segall’s
' + assert result == expect + +def test_github_issue_26(): + text = '' + result = textile.textile(text) + expect = '' + assert result == expect diff --git a/tests/test_glyphs.py b/tests/test_glyphs.py new file mode 100644 index 00000000..fcf2636d --- /dev/null +++ b/tests/test_glyphs.py @@ -0,0 +1,32 @@ +from textile import Textile + +def test_glyphs(): + t = Textile() + + result = t.glyphs("apostrophe's") + expect = 'apostrophe’s' + assert result == expect + + result = t.glyphs("back in '88") + expect = 'back in ’88' + assert result == expect + + result = t.glyphs('foo ...') + expect = 'foo …' + assert result == expect + + result = t.glyphs('--') + expect = '—' + assert result == expect + + result = t.glyphs('FooBar[tm]') + expect = 'FooBar™' + assert result == expect + + result = t.glyphs("Cat's Cradle by Vonnegut
") + expect = 'Cat’s Cradle by Vonnegut
' + assert result == expect + + result = t.glyphs('test"') + expect = 'test” ' + assert result == expect diff --git a/tests/test_image.py b/tests/test_image.py new file mode 100644 index 00000000..aad39e29 --- /dev/null +++ b/tests/test_image.py @@ -0,0 +1,21 @@ +from textile import Textile + +def test_image(): + t = Textile() + result = t.image('!/imgs/myphoto.jpg!:http://jsamsa.com') + expect = (''.format( + t.uid)) + assert result == expect + assert t.refCache[1] == 'http://jsamsa.com' + assert t.refCache[2] == '/imgs/myphoto.jpg' + + result = t.image('!'.format(t.uid) + assert result == expect + assert t.refCache[3] == '/imgs/myphoto.jpg' + + t = Textile(rel='nofollow') + result = t.image('!/imgs/myphoto.jpg!:http://jsamsa.com') + expect = (''.format(t.uid)) + assert result == expect diff --git a/tests/test_imagesize.py b/tests/test_imagesize.py new file mode 100644 index 00000000..112989e1 --- /dev/null +++ b/tests/test_imagesize.py @@ -0,0 +1,13 @@ +import textile + +def test_imagesize(): + imgurl = 'http://www.google.com/intl/en_ALL/images/srpr/logo1w.png' + result = textile.tools.imagesize.getimagesize(imgurl) + try: + import PIL + + expect = (275, 95) + assert result == expect + except ImportError: + expect = '' + assert result == expect diff --git a/tests/test_lists.py b/tests/test_lists.py new file mode 100644 index 00000000..4e85f4c8 --- /dev/null +++ b/tests/test_lists.py @@ -0,0 +1,7 @@ +from textile import Textile + +def test_lists(): + t = Textile() + result = t.textileLists("* one\n* two\n* three") + expect = '\tTest „quotes”.
' + result = TextilePL().parse(test) + assert expect == result + + # Base Textile is unchanged. + expect = '\tTest “quotes”.
' + result = textile.textile(test) + assert expect == result diff --git a/tests/test_table.py b/tests/test_table.py new file mode 100644 index 00000000..0a3cb0d6 --- /dev/null +++ b/tests/test_table.py @@ -0,0 +1,12 @@ +from textile import Textile + +def test_table(): + t = Textile() + result = t.table('(rowclass). |one|two|three|\n|a|b|c|') + expect = '\tone | \n\t\t\ttwo | \n\t\t\tthree | \n\t\t
a | \n\t\t\tb | \n\t\t\tc | \n\t\t
one | \n\t\t\ttwo | \n\t\t\tthree | \n\t\t
a | \n\t\t\tb | \n\t\t\tc | \n\t\t
* test\n* test | \n\t\t\t1 | \n\t\t\t2 | \n\t\t
YACC1
', html) is not None + +def test_Footnote(): + html = textile.textile('This is covered elsewhere[1].\n\nfn1. Down here, in fact.\n\nfn2. Here is another footnote.') + assert re.search(r'^\tThis is covered elsewhere1.
\n\n\t1 Down here, in fact.
\n\n\t2 Here is another footnote.
$', html) is not None + + html = textile.textile('''See[1] for details -- or perhaps[100] at a push.\n\nfn1. Here are the details.\n\nfn100(footy#otherid). A totally unrelated footnote.''') + assert re.search(r'^\tSee1 for details — or perhaps100 at a push.
\n\n\t1 Here are the details.
\n\n\t100 A totally unrelated footnote.
$', html) is not None + + html = textile.textile('''See[2] for details, and later, reference it again[2].\n\nfn2^(footy#otherid)[en]. Here are the details.''') + assert re.search(r'^\tSee2 for details, and later, reference it again2.
\n\n\t2 Here are the details.
$', html) is not None + + html = textile.textile('''See[3!] for details.\n\nfn3. Here are the details.''') + assert re.search(r'^\tSee3 for details.
\n\n\t3 Here are the details.
$', html) is not None + + html = textile.textile('''See[4!] for details.\n\nfn4^. Here are the details.''') + assert re.search(r'^\tSee4 for details.
\n\n\t4 Here are the details.
$', html) is not None + +def test_issue_35(): + result = textile.textile('"z"') + expect = '\t“z”
' + assert result == expect + + result = textile.textile('" z"') + expect = '\t“ z”
' + assert result == expect + +def test_restricted(): + #Note that the HTML is escaped, thus rendering the " + result = textile.textile_restricted(test) + expect = "\tHere is some text.
\n<script>alert(‘hello world’)</script>
Here’s some <!— commented out —> text.
" + + assert result == expect + + test = "p[fr]. Partir, c'est toujours mourir un peu." + result = textile.textile_restricted(test) + expect = '\tPartir, c’est toujours mourir un peu.
' + + assert result == expect + +def test_unicode_footnote(): + html = textile.textile('текст[1]') + assert re.compile(r'^\tтекст1
$', re.U).search(html) is not None + +def test_autolinking(): + test = """some text "test":http://www.google.com http://www.google.com "$":http://www.google.com""" + result = """\tsome text test http://www.google.com www.google.com
""" + expect = textile.textile(test) + + assert result == expect + +def test_sanitize(): + test = "a paragraph of benign text" + result = "\ta paragraph of benign text
" + try: + expect = textile.Textile().parse(test, sanitize=True) + assert result == expect + + test = """a paragraph of evil text
""" + result = 'a paragraph of evil text
' + expect = textile.Textile().parse(test, sanitize=True) + assert result == expect + + test = """a paragraph of benign text
and more text
a paragraph of benign text
\nand more text
Scientists say the moon is slowly shrinking1.
\n\n\tTim Berners-Lee is one of the pioneer voices in favour of Net Neutrality1 and has expressed the view that ISPs should supply “connectivity with no strings attached”1 2
\n\n\tBerners-Lee admitted that the forward slashes \(“//”\) in a web address were actually unnecessary. He told the newspaper that he could easily have designed URLs not to have the forward slashes. “… it seemed like a good idea at the time,”3
\n\n\tScientists say1 the moon is quite small. But I, for one, don’t believe them. Others claim it to be made of cheese2. If this proves true I suspect we are in for troubled times3 as people argue over their “share” of the moon’s cheese. In the end, its limited size1 may prove problematic.
\n\n\tScientists say1 the moon is quite small. But I, for one, don’t believe them. Others claim it to be made of cheese2. If this proves true I suspect we are in for troubled times3 as people argue over their “share” of the moon’s cheese. In the end, its limited size1 may prove problematic.
\n\n\tScientists say the moon is slowly shrinking1.
\n\n\tSee2 for details, and later, reference it again2.
\n\n\t2 Here are the details.
$' + assert re.compile(searchstring).search(html) is not None + +def test_footnote_without_reflink(): + html = textile.textile('''See[3!] for details.\n\nfn3. Here are the details.''') + searchstring = r'^\tSee3 for details.
\n\n\t3 Here are the details.
$' + assert re.compile(searchstring).search(html) is not None + +def testSquareBrackets(): + html = textile.textile("""1[^st^], 2[^nd^], 3[^rd^]. 2 log[~n~]\n\nA close[!http://textpattern.com/favicon.ico!]image.\nA tight["text":http://textpattern.com/]link.\nA ["footnoted link":http://textpattern.com/][182].""") + searchstring = r'^\t1st, 2nd, 3rd. 2 logn
\n\n\tA closeimage.
\nA tighttextlink.
\nA footnoted link182.
We use CSS.
' + expect = textile.textile(test, html_type="html5") + assert result == expect + +def test_relURL(): + t = textile.Textile() + t.restricted = True + assert t.relURL("gopher://gopher.com/") == '#' diff --git a/tests/test_textilefactory.py b/tests/test_textilefactory.py new file mode 100644 index 00000000..846b9275 --- /dev/null +++ b/tests/test_textilefactory.py @@ -0,0 +1,28 @@ +from textile import textilefactory +import pytest + +def test_TextileFactory(): + f = textilefactory.TextileFactory() + result = f.process("some text here") + expect = '\tsome text here
' + assert result == expect + + f = textilefactory.TextileFactory(restricted=True) + result = f.process("more text here") + expect = '\tmore text here
' + assert result == expect + + f = textilefactory.TextileFactory(noimage=True) + result = f.process("this covers a partial branch.") + expect = '\tthis covers a partial branch.
' + assert result == expect + + # Certain parameter values are not permitted because they are illogical: + + with pytest.raises(ValueError) as ve: + f = textilefactory.TextileFactory(lite=True) + assert 'lite can only be enabled in restricted mode' in str(ve.value) + + with pytest.raises(ValueError) as ve: + f = textilefactory.TextileFactory(html_type='invalid') + assert "html_type must be 'xhtml' or 'html5'" in str(ve.value) diff --git a/tests/test_urls.py b/tests/test_urls.py new file mode 100644 index 00000000..69bae4f5 --- /dev/null +++ b/tests/test_urls.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +from textile import Textile +import re + +def test_urls(): + t = Textile() + assert t.relURL("http://www.google.com/") == 'http://www.google.com/' + + result = t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') + expect = 'fooobar {0}2:shelve and hello world {0}4:shelve '.format(t.uid) + assert result == expect + + result = t.links('""Open the door, HAL!"":https://xkcd.com/375/') + expect = '{0}6:shelve'.format(t.uid) + assert result == expect + + result = t.links('"$":http://domain.tld/test_[brackets]') + expect = '{0}8:shelve'.format(t.uid) + assert result == expect + + result = t.links('"$":http://domain.tld/test_') + expect = '{0}10:shelve'.format(t.uid) + assert result == expect + + expect = '"":test' + result = t.links(expect) + assert result == expect + + expect = '"$":htt://domain.tld' + result = t.links(expect) + assert result == expect + + result = t.shelveURL('') + expect = '' + assert result == expect + + result = t.retrieveURLs('{0}2:url'.format(t.uid)) + expect = '' + assert result == expect + + result = t.encode_url('http://domain.tld/übermensch') + expect = 'http://domain.tld/%C3%BCbermensch' + assert result == expect + + result = t.parse('A link that starts with an h is "handled":/test/ incorrectly.') + expect = '\tA link that starts with an h is handled incorrectly.
' + assert result == expect + +def test_rel_attribute(): + t = Textile(rel='nofollow') + result = t.parse('"$":http://domain.tld') + expect = '\t' + assert result == expect diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..9acb3dc8 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from textile import utils + +def test_encode_html(): + result = utils.encode_html('''this is a "test" of text that's safe to ''' + 'put in an attribute.') + expect = ('this is a "test" of text that's safe to put in an ' + '<html> attribute.') + assert result == expect + +def test_has_raw_text(): + assert utils.has_raw_text('foo bar biz baz
') is False + assert utils.has_raw_text(' why yes, yes it does') is True + +def test_is_rel_url(): + assert utils.is_rel_url("http://www.google.com/") is False + assert utils.is_rel_url("/foo") is True + +def test_generate_tag(): + result = utils.generate_tag('span', 'inner text', {'class': 'test'}) + expect = 'inner text' + assert result == expect + + text = 'Übermensch' + attributes = {'href': 'http://de.wikipedia.org/wiki/%C3%C9bermensch'} + expect = 'Übermensch' + result = utils.generate_tag('a', text, attributes) + assert result == expect diff --git a/tests/test_values.py b/tests/test_values.py new file mode 100644 index 00000000..4a6fe606 --- /dev/null +++ b/tests/test_values.py @@ -0,0 +1,322 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +import textile +import pytest + +xhtml_known_values = ( + ('hello, world', '\thello, world
'), + + ('A single paragraph.\n\nFollowed by another.', + '\tA single paragraph.
\n\n\tFollowed by another.
'), + + ('I am very serious.\n\n\nI am very serious.\n', + '\t
I am very serious.
\n\n\nI am <b>very</b> serious.\n'), + + ('I spoke.\nAnd none replied.', '\t
I spoke.
\nAnd none replied.
“Observe!”
'), + + ('Observe -- very nice!', '\tObserve — very nice!
'), + + ('Observe - tiny and brief.', '\tObserve – tiny and brief.
'), + + ('Observe...', '\tObserve…
'), + + ('Observe ...', '\tObserve …
'), + + ('Observe: 2 x 2.', '\tObserve: 2 × 2.
'), + + ('one(TM), two(R), three(C).', '\tone™, two®, three©.
'), + + ('h1. Header 1', '\tAn old text
\n\n\t\n\t\t\n\n\tA block quotation.
\n\t
Any old text
'), + + ('I _believe_ every word.', '\tI believe every word.
'), + + ('And then? She *fell*!', '\tAnd then? She fell!
'), + + ('I __know__.\nI **really** __know__.', '\tI know.
\nI really know.
Cat’s Cradle by Vonnegut
'), + + ('Convert with @str(foo)@', '\tConvert with str(foo)
I’m sure not sure.
You are a pleasant child.
'), + + ('a ^2^ + b ^2^ = c ^2^', '\ta 2 + b 2 = c 2
'), + + ('log ~2~ x', '\tlog 2 x
'), + + ('I\'m %unaware% of most soft drinks.', '\tI’m unaware of most soft drinks.
'), + + ("I'm %{color:red}unaware%\nof most soft drinks.", '\tI’m unaware
\nof most soft drinks.
An example
'), + + ('p(#big-red). Red here', '\tRed here
'), + + ('p(example1#big-red2). Red here', '\tRed here
'), + + ('p{color:blue;margin:30px}. Spacey blue', '\tSpacey blue
'), + + ('p[fr]. rouge', '\trouge
'), + + ('I seriously *{color:red}blushed*\nwhen I _(big)sprouted_ that\ncorn stalk from my\n%[es]cabeza%.', + '\tI seriously blushed
\nwhen I sprouted'
+ ' that
\ncorn stalk from my
\ncabeza.
align left
'), + + ('p>. align right', '\talign right
'), + + ('p=. centered', '\tcentered
'), + + ('p<>. justified', '\tjustified
'), + + ('p(. left ident 1em', '\tleft ident 1em
'), + + ('p((. left ident 2em', '\tleft ident 2em
'), + + ('p))). right ident 3em', '\tright ident 3em
'), + + ('h2()>. Bingo.', '\t\n\na.gsub!( /, "" )\n
\n
',
+ '\n\na.gsub!( /</, "" )\n
\n
'),
+
+ ('The main text of the
\n'
+ 'page goes here and will
\nstay to the left of the
\nsidebar.
I searched Google.
'), + + ('I searched "a search engine (Google)":http://google.com.', '\tI searched a search engine.
'), + + ('I am crazy about "Hobix":hobix\nand "it\'s":hobix "all":hobix I ever\n"link to":hobix!\n\n[hobix]http://hobix.com', + '\tI am crazy about Hobix
\nand it’s '
+ 'all I ever
\nlink to!
And others sat all round the small
\nmachine and paid it to sing to them.
We use CSS.
'), + + ('|one|two|three|\n|a|b|c|', + '\tone | \n\t\t\ttwo | \n\t\t\tthree | \n\t\t
a | \n\t\t\tb | \n\t\t\tc | \n\t\t
name | \n\t\t\tage | \n\t\t\tsex | \n\t\t
joan | \n\t\t\t24 | \n\t\t\tf | \n\t\t
archie | \n\t\t\t29 | \n\t\t\tm | \n\t\t
bella | \n\t\t\t45 | \n\t\t\tf | \n\t\t
name | \n\t\t\tage | \n\t\t\tsex | \n\t\t
---|---|---|
joan | \n\t\t\t24 | \n\t\t\tf | \n\t\t
archie | \n\t\t\t29 | \n\t\t\tm | \n\t\t
bella | \n\t\t\t45 | \n\t\t\tf | \n\t\t
Hello\n\nHello Again\n\n\n\t
normal text
'), + + ('this is in a pre tag', '
this is in a pre tag'), + + ('"test1":http://foo.com/bar--baz\n\n"test2":http://foo.com/bar---baz\n\n"test3":http://foo.com/bar-17-18-baz', + '\t\n\n\t' + '\n\n\t' + ''), + + ('"foo ==(bar)==":#foobar', '\t'), + + ('!http://render.mathim.com/A%5EtAx%20%3D%20A%5Et%28Ax%29.!', + '\t'), + + ('* Point one\n* Point two\n## Step 1\n## Step 2\n## Step 3\n* Point three\n** Sub point 1\n** Sub point 2', + '\t
array[4] = 8
Links (like this), are now mangled in 2.1.0, whereas 2.0 parsed them correctly.
'), + + ('@monospaced text@, followed by text', + '\tmonospaced text
, followed by text
some text
'), + + ('pre.. foo bar baz\nquux', 'foo bar baz\nquux\n'), + + ('line of text\n\n leading spaces', + '\t
line of text
\n\n leading spaces'), + + ('"some text":http://www.example.com/?q=foo%20bar and more text', + '\tsome text and more text
'), + + ('(??some text??)', '\t(some text)
'), + + ('(*bold text*)', '\t(bold text)
'), + + ('H[~2~]O', '\tH2O
'), + + ("p=. Où est l'école, l'église s'il vous plaît?", + """\tOù est l’école, l’église s’il vous plaît?
"""), + + ("p=. *_The_* _*Prisoner*_", + """\tThe Prisoner
"""), + + ("""p=. "An emphasised _word._" & "*A spanned phrase.*" """, + """\t“An emphasised word.” & “A spanned phrase.”
"""), + + ("""p=. "*Here*'s a word!" """, + """\t“Here’s a word!”
"""), + + ("""p=. "Please visit our "Textile Test Page":http://textile.sitemonks.com" """, + """\t“Please visit our Textile Test Page”
"""), + ("""| Foreign EXPÓŅÉNTIAL |""", + """\tForeign EXPÓŅÉNTIAL | \n\t\t
Piękne ŹDŹBŁO
"""), + + ("""p=. Tell me, what is AJAX(Asynchronous Javascript and XML), please?""", + """\tTell me, what is AJAX, please?
"""), + ('p{font-size:0.8em}. *TxStyle* is a documentation project of Textile 2.4 for "Textpattern CMS":http://texpattern.com.', + '\tTxStyle is a documentation project of Textile 2.4 for Textpattern CMS.
'), + (""""Übermensch":http://de.wikipedia.org/wiki/Übermensch""", """\t"""), + ("""Here is some text with a block.\n\n\n\n\n\nbc. """, + """\tHere is some text with a block.
\n\n\t\n\n\t\n\n<!-- Here is a comment block in a code block. -->\n
"""),
+ (""""Textile(c)" is a registered(r) 'trademark' of Textpattern(tm) -- or TXP(That's textpattern!) -- at least it was - back in '88 when 2x4 was (+/-)5(o)C ... QED!\n\np{font-size: 200%;}. 2(1/4) 3(1/2) 4(3/4)""",
+ """\t“Textile©” is a registered® ‘trademark’ of Textpattern™ — or TXP — at least it was – back in ’88 when 2×4 was ±5°C … QED!
\n\n\t2¼ 3½ 4¾
"""), + ("""|=. Testing colgroup and col syntax\n|:\\5. 80\n|a|b|c|d|e|\n\n|=. Testing colgroup and col syntax|\n|:\\5. 80|\n|a|b|c|d|e|""", """\ta | \n\t\t\tb | \n\t\t\tc | \n\t\t\td | \n\t\t\te | \n\t\t
a | \n\t\t\tb | \n\t\t\tc | \n\t\t\td | \n\t\t\te | \n\t\t
Title | \n\t\t\tStarring | \n\t\t\tDirector | \n\t\t\tWriter | \n\t\t\tNotes | \n\t\t
---|---|---|---|---|
This is the tfoot, centred | \n\t\t||||
The Usual Suspects | \n\t\t\tBenicio Del Toro, Gabriel Byrne, Stephen Baldwin, Kevin Spacey | \n\t\t\tBryan Singer | \n\t\t\tChris McQaurrie | \n\t\t\tOne of the finest films ever made | \n\t\t
Se7en | \n\t\t\tMorgan Freeman, Brad Pitt, Kevin Spacey | \n\t\t\tDavid Fincher | \n\t\t\tAndrew Kevin Walker | \n\t\t\tGreat psychological thriller | \n\t\t
Primer | \n\t\t\tDavid Sullivan, Shane Carruth | \n\t\t\tShane Carruth | \n\t\t\tShane Carruth | \n\t\t\t Amazing insight into trust and human psychology \nrather than science fiction. Terrific! | \n\t\t
District 9 | \n\t\t\tSharlto Copley, Jason Cope | \n\t\t\tNeill Blomkamp | \n\t\t\tNeill Blomkamp, Terri Tatchell | \n\t\t\t Social commentary layered on thick, \nbut boy is it done well | \n\t\t
Arlington Road | \n\t\t\tTim Robbins, Jeff Bridges | \n\t\t\tMark Pellington | \n\t\t\tEhren Kruger | \n\t\t\tAwesome study in neighbourly relations | \n\t\t
Phone Booth | \n\t\t\tColin Farrell, Kiefer Sutherland, Forest Whitaker | \n\t\t\tJoel Schumacher | \n\t\t\tLarry Cohen | \n\t\t\t Edge-of-the-seat stuff in this \nshort but brilliantly executed thriller | \n\t\t
Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.
Here is a comment
\n\n\tHere is a comment
\n\n\tHere is a class that is a little extended and is
\nfollowed by a strong word!
; Content-type: text/javascript\n; Cache-Control: no-store, no-cache, must-revalidate, pre-check=0, post-check=0, max-age=0\n; Expires: Sat, 24 Jul 2003 05:00:00 GMT\n; Last-Modified: Wed, 1 Jan 2025 05:00:00 GMT\n; Pragma: no-cache\n
\n\n\t123 test
\n\n\ttest 123
\n\n\t123 test
\n\n\ttest 123
"""), + ("""#_(first#list) one\n# two\n# three\n\ntest\n\n#(ordered#list2).\n# one\n# two\n# three\n\ntest\n\n#_(class_4).\n# four\n# five\n# six\n\ntest\n\n#_ seven\n# eight\n# nine\n\ntest\n\n# one\n# two\n# three\n\ntest\n\n#22 22\n# 23\n# 24""", + """\ttest
\n\n\ttest
\n\n\ttest
\n\n\ttest
\n\n\ttest
\n\n\ttest
\n\n\ttest
\n\n\t\t
| \n\t\t
\t
| \n\t\t
\n | \n\t\t
table | \n\t\t\tmore | \n\t\t\tbadass | \n\t\t
---|---|---|
Horizontal span of 3 | \n\t\t||
first | \n\t\t\tHAL | \n\t\t\t1 | \n\t\t
some | \n\t\t\tstyled | \n\t\t\tcontent | \n\t\t
spans 2 rows | \n\t\t\tthis is | \n\t\t\tquite a | \n\t\t
deep test | \n\t\t\tdon’t you think? | \n\t\t|
fifth | \n\t\t\tI’m a lumberjack | \n\t\t\t5 | \n\t\t
sixth | \n\t\t\tbold italics | \n\t\t\t6 | \n\t\t
strong | \n\t\t
em | \n\t\t
Inter-word | \n\t\t\tZIP-codes are 5- or 9-digit codes | \n\t\t
attribute list | \n\t\t
---|
align left | \n\t\t
align right | \n\t\t
center | \n\t\t
justify me | \n\t\t
valign top | \n\t\t
bottom | \n\t\t
Goodbye.
"""), + ("""h2. A Definition list which covers the instance where a new definition list is created with a term without a definition\n\n- term :=\n- term2 := def""", """\t& test
'), +) + +# A few extra cases for HTML4 +html_known_values = ( + ('I spoke.\nAnd none replied.', '\tI spoke.
\nAnd none replied.
I know.
\nI really know.
I’m unaware
\nof most soft drinks.
I seriously blushed
\nwhen I sprouted'
+ ' that
\ncorn stalk from my
\ncabeza.
\n\na.gsub!( /, "" )\n
\n
',
+ '\n\na.gsub!( /</, "" )\n
\n
'),
+ ('The main text of the
\n'
+ 'page goes here and will
\nstay to the left of the
\nsidebar.
I am crazy about Hobix
\nand it’s '
+ 'all I ever
\nlink to!
And others sat all round the small
\nmachine and paid it to sing to them.
quux
'), + ('"foo":http://google.com/one--two', '\t'), + # issue 24 colspan + ('|\\2. spans two cols |\n| col 1 | col 2 |', '\tspans two cols | \n\t\t|
col 1 | \n\t\t\tcol 2 | \n\t\t
Hello\n\nAgain\n\n\n\t
normal text
'), + # url with parentheses + ('"python":http://en.wikipedia.org/wiki/Python_(programming_language)', '\t'), + # table with hyphen styles + ('table(linkblog-thumbnail).\n|(linkblog-thumbnail-cell). apple|bear|', '\tapple | \n\t\t\tbear | \n\t\t
thing | \n\t\t\t\n\t\t\t | \n\t\t\t | otherthing | \n\t\t
test text
"), + ("_*test text*_", "\ttest text
"), + # quotes in code block + ("'quoted string'
", "\t'quoted string'
some preformatted textother text", "\t
some preformatted textother text"), + # at sign and notextile in table + ("|@
<A1> | \n\t\t\t<A2> <A3> | \n\t\t
*B1* | \n\t\t\t*B2* *B3* | \n\t\t
\n\t\t'), + ('Hello ["(Mum) & dad"]', '\tText…
\n\t
Hello [“(Mum) & dad”]
'), +) + +@pytest.mark.parametrize("input, expected_output", xhtml_known_values) +def test_KnownValuesXHTML(input, expected_output): + # XHTML + output = textile.textile(input, html_type='xhtml') + assert output == expected_output + +@pytest.mark.parametrize("input, expected_output", html_known_values) +def test_KnownValuesHTML(input, expected_output): + # HTML5 + output = textile.textile(input, html_type='html5') + assert output == expected_output diff --git a/textile/core.py b/textile/core.py index d580bc6f..dc5b13ac 100644 --- a/textile/core.py +++ b/textile/core.py @@ -20,87 +20,41 @@ """ import uuid +import six from textile.tools import sanitizer, imagesize +from textile.regex_strings import (align_re_s, cls_re_s, halign_re_s, + pnct_re_s, regex_snippets, syms_re_s, table_span_re_s, valign_re_s) +from textile.utils import (decode_high, encode_high, encode_html, generate_tag, + has_raw_text, is_rel_url, is_valid_url, list_type, normalize_newlines, + parse_attributes, pba) +from textile.objects import Block, Table -# We're going to use the Python 2.7+ OrderedDict data type. Import it if it's -# available, otherwise, use the included tool. try: from collections import OrderedDict except ImportError: from ordereddict import OrderedDict +from six.moves import urllib +urlparse, urlsplit, urlunsplit, quote, unquote = (urllib.parse.urlparse, + urllib.parse.urlsplit, urllib.parse.urlunsplit, urllib.parse.quote, + urllib.parse.unquote) try: - # Python 3 - from urllib.parse import urlparse, urlsplit, urlunsplit, quote, unquote - from html.parser import HTMLParser - xrange = range - unichr = chr - unicode = str -except (ImportError): - # Python 2 - from urllib import quote, unquote - from urlparse import urlparse, urlsplit, urlunsplit - from HTMLParser import HTMLParser - - -try: - # Use regex module for matching uppercase characters if installed, - # otherwise fall back to finding all the uppercase chars in a loop. import regex as re - upper_re_s = r'\p{Lu}' except ImportError: import re - from sys import maxunicode - upper_re_s = "".join([unichr(c) for c in - xrange(maxunicode) if unichr(c).isupper()]) - - -def _normalize_newlines(string): - out = string.strip() - out = re.sub(r'\r\n', '\n', out) - out = re.sub(r'\n{3,}', '\n\n', out) - out = re.sub(r'\n\s*\n', '\n\n', out) - out = re.sub(r'"$', '" ', out) - return out class Textile(object): - halign_re_s = r'(?:\<(?!>)|(?|\<\>|\=|[()]+(?! 
))' - valign_re_s = r'[\-^~]' - class_re_s = r'(?:\([^)\n]+\))' # Don't allow classes/ids, - language_re_s = r'(?:\[[^\]\n]+\])' # languages, - style_re_s = r'(?:\{[^}\n]+\})' # or styles to span across newlines - colspan_re_s = r'(?:\\\d+)' - rowspan_re_s = r'(?:\/\d+)' - align_re_s = r'(?:%s|%s)*' % (halign_re_s, valign_re_s) - table_span_re_s = r'(?:%s|%s)*' % (colspan_re_s, rowspan_re_s) - # regex string to match class, style, language and horizontal alignment - # attributes - cslh_re_s = r'(?:%s)*' % '|'.join([class_re_s, style_re_s, language_re_s, - halign_re_s]) - # regex string to match class, style and language attributes - csl_re_s = r'(?:%s)*' % '|'.join([class_re_s, style_re_s, language_re_s]) - - pnct_re_s = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]' - urlchar_re_s = r'[\w"$\-_.+!*\'(),";\/?:@=&%#{}|\\^~\[\]`]' - syms_re_s = '¤§µ¶†‡•∗∴◊♠♣♥♦' - restricted_url_schemes = ('http', 'https', 'ftp', 'mailto') unrestricted_url_schemes = restricted_url_schemes + ('file', 'tel', - 'callto', 'sftp') + 'callto', 'sftp', 'data') btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p', '###') btag_lite = ('bq', 'bc', 'p') - iAlign = {'<': 'float: left;', - '>': 'float: right;', - '=': 'display: block; margin: 0 auto;'} - vAlign = {'^': 'top', '-': 'middle', '~': 'bottom'} - hAlign = {'<': 'left', '=': 'center', '>': 'right', '<>': 'justify'} - note_index = 1 doctype_whitelist = ['xhtml', 'html5'] @@ -126,26 +80,35 @@ class Textile(object): 'threequarters': '¾', 'degrees': '°', 'plusminus': '±', - 'fn_ref_pattern': '%(marker)s', - 'fn_foot_pattern': '%(marker)s', - 'nl_ref_pattern': '%(marker)s', } def __init__(self, restricted=False, lite=False, noimage=False, - auto_link=False, get_sizes=False, html_type='xhtml'): + get_sizes=False, html_type='xhtml', rel='', block_tags=True): """Textile properties that are common to regular textile and textile_restricted""" self.restricted = restricted self.lite = lite self.noimage = noimage self.get_sizes = get_sizes - self.auto_link = auto_link self.fn = {} self.urlrefs = {} self.shelf = {} - self.rel = '' + self.rel = rel self.html_type = html_type self.max_span_depth = 5 + self.span_depth = 0 + uid = uuid.uuid4().hex + self.uid = 'textileRef:{0}:'.format(uid) + self.linkPrefix = '{0}-'.format(uid) + self.linkIndex = 0 + self.refCache = {} + self.refIndex = 0 + self.block_tags = block_tags + + cur = r'' + if regex_snippets['cur']: # pragma: no branch + cur = r'(?:[{0}]{1}*)?'.format(regex_snippets['cur'], + regex_snippets['space']) # We'll be searching for characters that need to be HTML-encoded to # produce properly valid html. These are the defaults that work in @@ -153,457 +116,189 @@ def __init__(self, restricted=False, lite=False, noimage=False, # to make it work for characters at the beginning of the string. self.glyph_search = [ # apostrophe's - re.compile(r"(^|\w)'(\w)", re.U), + re.compile(r"(^|{0}|\))'({0})".format(regex_snippets['wrd']), + flags=re.U), # back in '88 - re.compile(r"(\s)'(\d+\w?)\b(?!')", re.U), + re.compile(r"({0})'(\d+{1}?)\b(?![.]?[{1}]*?')".format( + regex_snippets['space'], regex_snippets['wrd']), + flags=re.U), + # single opening following an open bracket. + re.compile(r"([([{])'(?=\S)", flags=re.U), # single closing - re.compile(r"(^|\S)'(?=\s|%s|$)" % self.pnct_re_s, re.U), + re.compile(r"(^|\S)'(?={0}|{1}|<|$)".format( + regex_snippets['space'], pnct_re_s), flags=re.U), # single opening re.compile(r"'", re.U), + # double opening following an open bracket. 
Allows things like + # Hello ["(Mum) & dad"] + re.compile(r'([([{])"(?=\S)', flags=re.U), # double closing - re.compile(r'(^|\S)"(?=\s|%s|$)' % self.pnct_re_s, re.U), + re.compile(r'(^|\S)"(?={0}|{1}|<|$)'.format( + regex_snippets['space'], pnct_re_s), re.U), # double opening re.compile(r'"'), # ellipsis - re.compile(r'([^.]?)\.{3}', re.U), + re.compile(r'([^.]?)\.{3}'), # ampersand - re.compile(r'(\s)&(\s)', re.U), + re.compile(r'(\s?)&(\s)', re.U), # em dash - re.compile(r'(\s?)--(\s?)', re.U), + re.compile(r'(\s?)--(\s?)'), # en dash - re.compile(r'\s-(?:\s|$)', re.U), + re.compile(r' - '), # dimension sign - re.compile(r'(\d+)( ?)x( ?)(?=\d+)', re.U), + re.compile(r'([0-9]+[\])]?[\'"]? ?)[x]( ?[\[(]?)' + r'(?=[+-]?{0}[0-9]*\.?[0-9]+)'.format(cur), flags=re.I | re.U), # trademark - re.compile(r'\b ?[([]TM[])]', re.I | re.U), + re.compile(r'(\b ?|{0}|^)[([]TM[])]'.format(regex_snippets['space'] + ), flags=re.I | re.U), # registered - re.compile(r'\b ?[([]R[])]', re.I | re.U), + re.compile(r'(\b ?|{0}|^)[([]R[])]'.format(regex_snippets['space'] + ), flags=re.I | re.U), # copyright - re.compile(r'\b ?[([]C[])]', re.I | re.U), + re.compile(r'(\b ?|{0}|^)[([]C[])]'.format(regex_snippets['space'] + ), flags=re.I | re.U), # 1/2 - re.compile(r'[([]1\/2[])]', re.I | re.U), + re.compile(r'[([]1\/2[])]'), # 1/4 - re.compile(r'[([]1\/4[])]', re.I | re.U), + re.compile(r'[([]1\/4[])]'), # 3/4 - re.compile(r'[([]3\/4[])]', re.I | re.U), + re.compile(r'[([]3\/4[])]'), # degrees - re.compile(r'[([]o[])]', re.I | re.U), + re.compile(r'[([]o[])]'), # plus/minus - re.compile(r'[([]\+\/-[])]', re.I | re.U), + re.compile(r'[([]\+\/-[])]'), # 3+ uppercase acronym - re.compile(r'\b([%s][%s0-9]{2,})\b(?:[(]([^)]*)[)])' % (upper_re_s, upper_re_s)), + re.compile(r'\b([{0}][{1}]{{2,}})\b(?:[(]([^)]*)[)])'.format( + regex_snippets['abr'], regex_snippets['acr']), flags=re.U), # 3+ uppercase - re.compile(r"""(?:(?<=^)|(?<=\s)|(?<=[>\(;-]))([%s]{3,})(\w*)(?=\s|%s|$)(?=[^">]*?(<|$))""" % - (upper_re_s, self.pnct_re_s)), + re.compile(r'({space}|^|[>(;-])([{abr}]{{3,}})([{nab}]*)' + '(?={space}|{pnct}|<|$)(?=[^">]*?(<|$))'.format(**{ 'space': + regex_snippets['space'], 'abr': regex_snippets['abr'], + 'nab': regex_snippets['nab'], 'pnct': pnct_re_s}), re.U), ] # These are the changes that need to be made for characters that occur # at the beginning of the string. 
self.glyph_search_initial = list(self.glyph_search) # apostrophe's - self.glyph_search_initial[0] = re.compile(r"(\w)'(\w)", re.U) + self.glyph_search_initial[0] = re.compile(r"({0}|\))'({0})".format( + regex_snippets['wrd']), flags=re.U) # single closing - self.glyph_search_initial[2] = re.compile(r"(\S)'(?=\s|%s|$)" % - self.pnct_re_s, re.U) + self.glyph_search_initial[3] = re.compile(r"(\S)'(?={0}|{1}|$)".format( + regex_snippets['space'], pnct_re_s), re.U) # double closing - self.glyph_search_initial[4] = re.compile(r'(\S)"(?=\s|%s|$)' % - self.pnct_re_s, re.U) - - self.glyph_replace = [x % self.glyph_definitions for x in ( - r'\1%(apostrophe)s\2', # apostrophe's - r'\1%(apostrophe)s\2', # back in '88 - r'\1%(quote_single_close)s', # single closing - r'%(quote_single_open)s', # single opening - r'\1%(quote_double_close)s', # double closing - r'%(quote_double_open)s', # double opening - r'\1%(ellipsis)s', # ellipsis - r'\1%(ampersand)s\2', # ampersand - r'\1%(emdash)s\2', # em dash - r' %(endash)s ', # en dash - r'\1\2%(dimension)s\3', # dimension sign - r'%(trademark)s', # trademark - r'%(registered)s', # registered - r'%(copyright)s', # copyright - r'%(half)s', # 1/2 - r'%(quarter)s', # 1/4 - r'%(threequarters)s', # 3/4 - r'%(degrees)s', # degrees - r'%(plusminus)s', # plus/minus + self.glyph_search_initial[6] = re.compile(r'(\S)"(?={0}|{1}|<|$)'.format( + regex_snippets['space'], pnct_re_s), re.U) + + self.glyph_replace = [x.format(**self.glyph_definitions) for x in ( + r'\1{apostrophe}\2', # apostrophe's + r'\1{apostrophe}\2', # back in '88 + r'\1{quote_single_open}', # single opening after bracket + r'\1{quote_single_close}', # single closing + r'{quote_single_open}', # single opening + r'\1{quote_double_open}', # double opening after bracket + r'\1{quote_double_close}', # double closing + r'{quote_double_open}', # double opening + r'\1{ellipsis}', # ellipsis + r'\1{ampersand}\2', # ampersand + r'\1{emdash}\2', # em dash + r' {endash} ', # en dash + r'\1{dimension}\2', # dimension sign + r'\1{trademark}', # trademark + r'\1{registered}', # registered + r'\1{copyright}', # copyright + r'{half}', # 1/2 + r'{quarter}', # 1/4 + r'{threequarters}', # 3/4 + r'{degrees}', # degrees + r'{plusminus}', # plus/minus r'\1', # 3+ uppercase acronym - r'\1\2', # 3+ uppercase + r'\1{0}:glyph:\2' # 3+ uppercase + r'\3'.format(self.uid), )] if self.html_type == 'html5': - self.glyph_replace[19] = r'\1' + self.glyph_replace[21] = r'\1' if self.restricted is True: self.url_schemes = self.restricted_url_schemes else: self.url_schemes = self.unrestricted_url_schemes - - def parse(self, text, rel=None, head_offset=0, sanitize=False): - """ - >>> import textile - >>> Py3 << textile.textile('some textile') - '\\tsome textile
' - """ + def parse(self, text, rel=None, sanitize=False): + """Parse the input text as textile and return html output.""" self.notes = OrderedDict() self.unreferencedNotes = OrderedDict() self.notelist_cache = OrderedDict() - text = _normalize_newlines(text) + if text == '': + return text if self.restricted: - text = self.encode_html(text, quotes=False) + text = encode_html(text, quotes=False) + + text = normalize_newlines(text) + text = text.replace(self.uid, '') + + if self.block_tags: + if self.lite: + self.blocktag_whitelist = ['bq', 'p'] + text = self.block(text) + else: + self.blocktag_whitelist = [ 'bq', 'p', 'bc', 'notextile', + 'pre', 'h[1-6]', + 'fn{0}+'.format(regex_snippets['digit']), '###'] + text = self.block(text) + text = self.placeNoteLists(text) + else: + # Inline markup (em, strong, sup, sub, del etc). + text = self.span(text) + + # Glyph level substitutions (mainly typographic -- " & ' => curly + # quotes, -- => em-dash etc. + text = self.glyphs(text) if rel: - self.rel = ' rel="%s"' % rel + self.rel = ' rel="{0}"'.format(rel) text = self.getRefs(text) - # The original php puts the below within an if not self.lite, but our - # block function handles self.lite itself. - text = self.block(text, int(head_offset)) - if not self.lite: text = self.placeNoteLists(text) text = self.retrieve(text) + text = text.replace('{0}:glyph:'.format(self.uid), '') if sanitize: text = sanitizer.sanitize(text) + text = self.retrieveURLs(text) + # if the text contains a break tag (foo bar biz baz
') - False - - >>> t.hasRawText(' why yes, yes it does') - True - - """ - r = re.compile(r'<(p|blockquote|div|form|table|ul|ol|dl|pre|h\d)[^>]*?>.*\1>', - re.S).sub('', text.strip()).strip() - r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r) - return '' != r - def table(self, text): - r""" - >>> t = Textile() - >>> Py3 << t.table('(rowclass). |one|two|three|\n|a|b|c|') - '\tone | \n\t\t\ttwo | \n\t\t\tthree | \n\t\t
a | \n\t\t\tb | \n\t\t\tc | \n\t\t
\\n', '\\t\\t', False) - - >>> Py3 << t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote") - ('\\t', 'Hello BlockQuote', '
', '\\n\\t
\\n', '\\t\\t', False) - - >>> Py3 << t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS - ('', 'Hello BlockQuote', '
', '\\n\\t
', '', ..., '
', '
', False)
-
- >>> Py3 << t.fBlock("h1", "", None, "", "foobar")
- ('', '\\t\n" % (cite, atts) - o2 = "\t\t" - - elif tag == 'bc': - o1 = "" % atts - c2 = "
" - c1 = "\n\t
" % atts
- o2 = "" % atts
- c2 = "
"
- c1 = "
"
- content = self.shelve(self.encode_html(content.rstrip("\n") +
- "\n"))
-
- elif tag == 'notextile':
- content = self.shelve(content)
- o1 = o2 = ''
- c1 = c2 = ''
-
- elif tag == 'pre':
- content = self.shelve(self.encode_html(content.rstrip("\n") +
- "\n"))
- o1 = "" % atts - o2 = c2 = '' - c1 = '' - - elif tag == '###': - eat = True - - else: - o2 = "\t<%s%s>" % (tag, atts) - c2 = "%s>" % tag - - if not eat: - content = self.graf(content) - else: - content = '' - return o1, o2, content, c2, c1, eat - - def formatFootnote(self, marker, atts='', anchor=True): - if anchor: - pattern = self.glyph_definitions['fn_foot_pattern'] - else: - pattern = self.glyph_definitions['fn_ref_pattern'] - return pattern % {'atts': atts, 'marker': marker} - def footnoteRef(self, text): - """ - >>> t = Textile() - >>> Py3 << t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS - 'foo1 ' - """ - return re.compile(r'(?<=\S)\[(\d+)(!?)\](\s)?', re.U).sub( - self.footnoteID, text - ) - - def footnoteID(self, match): - footnoteNum, nolink, space = match.groups() - if not space: - space = '' - backref = ' class="footnote"' - if footnoteNum not in self.fn: - a = uuid.uuid4().hex - self.fn[footnoteNum] = a - backref = '%s id="fnrev%s"' % (backref, a) - footnoteID = self.fn[footnoteNum] - footref = '!' == nolink and footnoteNum or '%s' % ( - footnoteID, footnoteNum - ) - footref = self.formatFootnote(footref, backref, False) - return footref + space + # somehow php-textile gets away with not capturing the space. + return re.compile(r'(?<=\S)\[(?P
Cat's Cradle by Vonnegut
") - 'Cat’s Cradle by Vonnegut
' - """ # fix: hackish - text = re.sub(r'"\Z', r'" ', text) + if text.endswith('"'): + text = '{0} '.format(text) + text = text.rstrip('\n') result = [] searchlist = self.glyph_search_initial - for i, line in enumerate(re.compile(r'(<[\w\/!?].*?>)', - re.U).split(text)): + # split the text by any angle-bracketed tags + for i, line in enumerate(re.compile(r'(<[\w\/!?].*?>)', re.U).split( + text)): if not i % 2: for s, r in zip(searchlist, self.glyph_replace): line = s.sub(r, line) @@ -999,16 +539,7 @@ def glyphs(self, text): return ''.join(result) def getRefs(self, text): - """ - Capture and store URL references in self.urlrefs. - - >>> t = Textile() - >>> Py3 << t.getRefs("some text [Google]http://www.google.com") - 'some text ' - >>> Py3 << t.urlrefs - {'Google': 'http://www.google.com'} - - """ + """Capture and store URL references in self.urlrefs.""" pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U) text = pattern.sub(self.refs, text) @@ -1019,50 +550,19 @@ def refs(self, match): self.urlrefs[flag] = url return '' - def checkRefs(self, url): - return self.urlrefs.get(url, url) - - def isRelURL(self, url): - """ - Identify relative urls. - - >>> t = Textile() - >>> t.isRelURL("http://www.google.com/") - False - >>> t.isRelURL("/foo") - True - - """ - (scheme, netloc) = urlparse(url)[0:2] - return not scheme and not netloc - def relURL(self, url): - """ - >>> t = Textile() - >>> Py3 << t.relURL("http://www.google.com/") - 'http://www.google.com/' - >>> t.restricted = True - >>> Py3 << t.relURL("gopher://gopher.com/") - '#' - - """ scheme = urlparse(url)[0] if scheme and scheme not in self.url_schemes: return '#' return url def shelve(self, text): - itemID = uuid.uuid4().hex + self.refIndex = self.refIndex + 1 + itemID = '{0}{1}:shelve'.format(self.uid, self.refIndex) self.shelf[itemID] = text return itemID def retrieve(self, text): - """ - >>> t = Textile() - >>> id = t.shelve("foobar") - >>> Py3 << t.retrieve(id) - 'foobar' - """ while True: old = text for k, v in self.shelf.items(): @@ -1071,25 +571,6 @@ def retrieve(self, text): break return text - def encode_html(self, text, quotes=True): - """Return text that's safe for an HTML attribute. - >>> t = Textile() - >>> Py3 << t.encode_html('this is a "test" of text that\\\'s safe to put in an attribute.') - 'this is a "test" of text that's safe to put in an <html> attribute.' 
- """ - a = ( - ('&', '&'), - ('<', '<'), - ('>', '>')) - - if quotes: - a = a + (("'", '''), - ('"', '"')) - - for k, v in a: - text = text.replace(k, v) - return text - def graf(self, text): if not self.lite: text = self.noTextile(text) @@ -1099,9 +580,6 @@ def graf(self, text): text = self.getRefs(text) text = self.links(text) - if self.auto_link: - text = self.autoLink(text) - text = self.links(text) if not self.noimage: text = self.image(text) @@ -1109,7 +587,7 @@ def graf(self, text): if not self.lite: text = self.table(text) text = self.redcloth_list(text) - text = self.lists(text) + text = self.textileLists(text) text = self.span(text) text = self.footnoteRef(text) @@ -1118,89 +596,288 @@ def graf(self, text): return text.rstrip('\n') - def autoLink(self, text): - """ - >>> t = Textile() - >>> Py3 << t.autoLink("http://www.ya.ru") - '"$":http://www.ya.ru' - """ - - pattern = re.compile(r"""\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""", - re.U | re.I) - return pattern.sub(r'"$":\1', text) - def links(self, text): - """ - >>> t = Textile() - >>> Py3 << t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS - 'fooobar ... and hello world ...' - """ + """For some reason, the part of the regex below that matches the url + does not match a trailing parenthesis. It gets caught by tail, and + we check later to see if it should be included as part of the url.""" + text = self.markStartOfLinks(text) + + return self.replaceLinks(text) + + def markStartOfLinks(self, text): + """Finds and marks the start of well formed links in the input text.""" + # Slice text on '":^|(?<=[\s>.\(\|])|[{[])? # leading text - " # opening quote - (?P%s) # block attributes - (?P [^"]+?) # link text - \s? # optional space - (?:\((?P [^)]+?)\)(?="))? # optional title - ": # closing quote, colon - (?P %s+?) # URL - (?P \/)? # slash - (?P [^\w\/]*?) # trailing text - (?P [\]})]|(?=\s|$|\|)) # tail - ''' % (self.cslh_re_s, self.urlchar_re_s) - - text = re.compile(pattern, re.X | re.U).sub(self.fLink, text) + return text + def replaceLinks(self, text): + """Replaces links with tokens and stores them on the shelf.""" + stopchars = r"\s|^'\"*" + pattern = r""" + (?P \[)? # Optionally open with a square bracket eg. Look ["here":url] + {0}linkStartMarker:" # marks start of the link + (?P(?:.|\n)*?) # grab the content of the inner "..." part of the link, can be anything but + # do not worry about matching class, id, lang or title yet + ": # literal ": marks end of atts + text + title block + (?P [^{1}]*) # url upto a stopchar + """.format(self.uid, stopchars) + text = re.compile(pattern, flags=re.X | re.U).sub(self.fLink, text) return text - def fLink(self, match): - pre, atts, text, title, url, slash, post, tail = match.groups() + def fLink(self, m): + in_ = m.group() + pre, inner, url = m.groups() + pre = pre or '' + + if inner == '': + return '{0}"{1}":{2}'.format(pre, inner, url) + + m = re.search(r'''^ + (?P {0}) # $atts (if any) + {1}* # any optional spaces + (?P # $text is... + (!.+!) # an image + | # else... + .+? # link text + ) # end of $text + (?:\((?P [^)]+?)\))? 
# $title (if any) + $'''.format(cls_re_s, regex_snippets['space']), inner, + flags=re.X | re.U) + + atts = m.group('atts') or '' + text = m.group('text') or '' or inner + title = m.group('title') or '' + + pop, tight = '', '' + counts = { '[': None, ']': url.count(']'), '(': None, ')': None } + + # Look for footnotes or other square-bracket delimited stuff at the end + # of the url... + # + # eg. "text":url][otherstuff... will have "[otherstuff" popped back + # out. + # + # "text":url?q[]=x][123] will have "[123]" popped off the back, the + # remaining closing square brackets will later be tested for balance + if (counts[']']): + m = re.search('(?P ^.*\])(?P \[.*?)$', url, flags=re.U) + if m: + url, tight = m.groups() + + # Split off any trailing text that isn't part of an array assignment. + # eg. "text":...?q[]=value1&q[]=value2 ... is ok + # "text":...?q[]=value1]following ... would have "following" popped + # back out and the remaining square bracket will later be tested for + # balance + if (counts[']']): + m = re.search(r'(?P ^.*\])(?!=)(?P .*?)$', url, flags=re.U) + url = m.group('url') + tight = '{0}{1}'.format(m.group('end'), tight) + + # Now we have the array of all the multi-byte chars in the url we will + # parse the uri backwards and pop off any chars that don't belong + # there (like . or , or unmatched brackets of various kinds). + first = True + popped = True + + counts[']'] = url.count(']') + url_chars = list(url) + + def _endchar(c, pop, popped, url_chars, counts, pre): + """Textile URL shouldn't end in these characters, we pop them off + the end and push them out the back of the url again.""" + pop = '{0}{1}'.format(c, pop) + url_chars.pop() + popped = True + return pop, popped, url_chars, counts, pre + + def _rightanglebracket(c, pop, popped, url_chars, counts, pre): + url_chars.pop() + urlLeft = ''.join(url_chars) + + m = re.search(r'(?P .*)(?P <\/[a-z]+)$', urlLeft) + url_chars = m.group('url_chars') + pop = '{0}{1}{2}'.format(m.group('tag'), c, pop) + popped = True + return pop, popped, url_chars, counts, pre + + def _closingsquarebracket(c, pop, popped, url_chars, counts, pre): + """If we find a closing square bracket we are going to see if it is + balanced. 
If it is balanced with matching opening bracket then it + is part of the URL else we spit it back out of the URL.""" + # If counts['['] is None, count the occurrences of '[' + counts['['] = counts['['] or url.count('[') + + if counts['['] == counts[']']: + # It is balanced, so keep it + url_chars.append(c) + else: + # In the case of un-matched closing square brackets we just eat + # it + popped = True + url_chars.pop() + counts[']'] = counts[']'] - 1; + if first: # pragma: no branch + pre = '' + return pop, popped, url_chars, counts, pre + + def _closingparenthesis(c, pop, popped, url_chars, counts, pre): + if counts[')'] is None: # pragma: no branch + counts['('] = url.count('(') + counts[')'] = url.count(')') + + if counts['('] != counts[')']: + # Unbalanced so spit it out the back end + popped = True + pop = '{0}{1}'.format(url_chars.pop(), pop) + counts[')'] = counts[')'] - 1 + return pop, popped, url_chars, counts, pre + + def _casesdefault(c, pop, popped, url_chars, counts, pre): + return pop, popped, url_chars, counts, pre + + cases = { + '!': _endchar, + '?': _endchar, + ':': _endchar, + ';': _endchar, + '.': _endchar, + ',': _endchar, + '>': _rightanglebracket, + ']': _closingsquarebracket, + ')': _closingparenthesis, + } + for c in url_chars[-1::-1]: # pragma: no branch + popped = False + pop, popped, url_chars, counts, pre = cases.get(c, + _casesdefault)(c, pop, popped, url_chars, counts, pre) + first = False + if popped is False: + break - if not pre: - pre = '' + url = ''.join(url_chars) + uri_parts = urlsplit(url) - if not slash: - slash = '' + scheme_in_list = uri_parts.scheme in self.url_schemes + valid_scheme = (uri_parts.scheme and scheme_in_list) + if not is_valid_url(url) and not valid_scheme: + return in_.replace('{0}linkStartMarker:'.format(self.uid), '') if text == '$': - text = re.sub(r'^\w+://(.+)', r'\1', url) - - # assume ) at the end of the url is not actually part of the url - # unless the url also contains a ( - if tail == ')' and url.find('(') > -1: - url = url + tail - tail = None - - url = self.checkRefs(url) - try: - url = self.encode_url(url) - except: - pass + text = url + if "://" in text: + text = text.split("://")[1] + else: + text = text.split(":")[1] - atts = self.pba(atts) - if title: - atts = atts + ' title="%s"' % self.encode_html(title) + text = text.strip() + title = encode_html(title) - if not self.noimage: + if not self.noimage: # pragma: no branch text = self.image(text) - text = self.span(text) text = self.glyphs(text) + url = self.shelveURL(self.encode_url(urlunsplit(uri_parts))) + attributes = parse_attributes(atts) + if title: + attributes['title'] = title + attributes['href'] = url + if self.rel: + attributes['rel'] = self.rel + a_text = generate_tag('a', text, attributes) + a_shelf_id = self.shelve(a_text) - url = self.relURL(url) + slash - out = '%s' % (self.encode_html(url), atts, - self.rel, text) - - if (pre and not tail) or (tail and not pre): - out = ''.join([pre, out, post, tail]) - post = '' + out = '{0}{1}{2}{3}'.format(pre, a_shelf_id, pop, tight) - out = self.shelve(out) - return ''.join([out, post]) + return out def encode_url(self, url): """ @@ -1211,19 +888,23 @@ def encode_url(self, url): http://stackoverflow.com/a/804380/72656 """ # turn string into unicode - if not isinstance(url, unicode): + if not isinstance(url, six.text_type): url = url.decode('utf8') # parse it parsed = urlsplit(url) - # divide the netloc further - netloc_pattern = re.compile(r""" - (?:(?P [^:@]+)(?::(?P [^:@]+))?@)? 
 
     def encode_url(self, url):
         """
@@ -1211,19 +888,23 @@ def encode_url(self, url):
         http://stackoverflow.com/a/804380/72656
         """
         # turn string into unicode
-        if not isinstance(url, unicode):
+        if not isinstance(url, six.text_type):
             url = url.decode('utf8')
 
         # parse it
         parsed = urlsplit(url)
 
-        # divide the netloc further
-        netloc_pattern = re.compile(r"""
-            (?:(?P<user>[^:@]+)(?::(?P<password>[^:@]+))?@)?
-            (?P<host>[^:]+)
-            (?::(?P<port>[0-9]+))?
-        """, re.X | re.U)
-        netloc_parsed = netloc_pattern.match(parsed.netloc).groupdict()
+        if parsed.netloc:
+            # divide the netloc further
+            netloc_pattern = re.compile(r"""
+                (?:(?P<user>[^:@]+)(?::(?P<password>[^:@]+))?@)?
+                (?P<host>[^:]+)
+                (?::(?P<port>[0-9]+))?
+            """, re.X | re.U)
+            netloc_parsed = netloc_pattern.match(parsed.netloc).groupdict()
+        else:
+            netloc_parsed = {'user': '', 'password': '', 'host': '', 'port':
+                    ''}
 
         # encode each component
         scheme = parsed.scheme
@@ -1242,42 +923,41 @@ def encode_url(self, url):
         # put it back together
         netloc = ''
         if user:
-            netloc += user
+            netloc = '{0}{1}'.format(netloc, user)
             if password:
-                netloc += ':' + password
-            netloc += '@'
-        netloc += host
+                netloc = '{0}:{1}'.format(netloc, password)
+            netloc = '{0}@'.format(netloc)
+        netloc = '{0}{1}'.format(netloc, host)
         if port:
-            netloc += ':'+port
+            netloc = '{0}:{1}'.format(netloc, port)
 
         return urlunsplit((scheme, netloc, path, query, fragment))
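What encode_url does, per the Stack Overflow answer cited in its docstring, is IRI-to-URI conversion: IDNA-encode the host and percent-encode the remaining components. A rough Python 3 standard-library sketch of the same idea (the patched method additionally handles user, password and port, and stays Python 2 compatible via six):

    from urllib.parse import quote, urlsplit, urlunsplit

    def iri_to_uri(iri):
        # Sketch only: no user/password/port handling.
        parts = urlsplit(iri)
        host = parts.hostname.encode('idna').decode('ascii')
        return urlunsplit((parts.scheme, host,
                           quote(parts.path, safe='/%'),
                           quote(parts.query, safe='=&?/%'),
                           quote(parts.fragment, safe='%')))

    print(iri_to_uri(u'http://www.überweisung.de/zahlungs verkehr'))
    # the host comes back as an xn-- punycode label and the space in
    # the path is %20-escaped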
 
     def span(self, text):
-        """
-        >>> t = Textile()
-        >>> Py3 << t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
-        'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
-        """
         qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+',
                  r'~', r'\^')
-        pnct = ".,\"'?!;:("
-
-        for qtag in qtags:
-            pattern = re.compile(r"""
-                (?:^|(?<=[\s>%(pnct)s])|([\[{]))
-                (%(qtag)s)(?!%(qtag)s)
-                (%(c)s)
-                (?::\(([^)]+?)\))?
-                ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
-                ([%(pnct)s]*)
-                %(qtag)s
-                (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
-            """ % {'qtag': qtag, 'c': self.cslh_re_s, 'pnct': pnct,
-                   'selfpnct': self.pnct_re_s}, re.X)
-            text = pattern.sub(self.fSpan, text)
+        pnct = r""".,"'?!;:‹›«»„“”‚‘’"""
+        self.span_depth = self.span_depth + 1
+
+        if self.span_depth <= self.max_span_depth:
+            for tag in qtags:
+                pattern = re.compile(r"""
+                    (?P<pre>^|(?<=[\s>{pnct}\(])|[{{[])
+                    (?P<tag>{tag})(?!{tag})
+                    (?P<atts>{cls})
+                    (?!{tag})
+                    (?::(?P<cite>\S+[^{tag}]{space}))?
+                    (?P<content>[^{space}{tag}]+|\S.*?[^\s{tag}\n])
+                    (?P<end>[{pnct}]*)
+                    {tag}
+                    (?P<tail>$|[\[\]}}<]|(?=[{pnct}]{{1,2}}[^0-9]|\s|\)))
+                    """.format(**{'tag': tag, 'cls': cls_re_s, 'pnct': pnct,
+                        'space': regex_snippets['space']}), flags=re.X | re.U)
+                text = pattern.sub(self.fSpan, text)
+        self.span_depth = self.span_depth - 1
         return text
 
     def fSpan(self, match):
-        _, tag, atts, cite, content, end, _ = match.groups()
+        pre, tag, atts, cite, content, end, tail = match.groups()
 
         qtags = {
             '*': 'strong',
@@ -1293,42 +973,37 @@ def fSpan(self, match):
         }
 
         tag = qtags[tag]
-        atts = self.pba(atts)
+        atts = pba(atts)
         if cite:
-            atts = atts + ' cite="%s"' % cite
+            atts = '{0} cite="{1}"'.format(atts, cite.rstrip())
 
         content = self.span(content)
 
-        out = "<%s%s>%s%s</%s>" % (tag, atts, content, end, tag)
+        out = "<{0}{1}>{2}{3}</{4}>".format(tag, atts, content, end, tag)
+        if pre and not tail or tail and not pre:
+            out = '{0}{1}{2}'.format(pre, out, tail)
         return out
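The qtags mapping in fSpan is the whole inline-markup story: * becomes strong, ** becomes b, _ becomes em, __ becomes i, ?? becomes cite, - del, + ins, ~ sub, ^ sup, and % span. Expected behaviour, roughly (output shown for this version of the library):

    import textile

    print(textile.textile('A *strong* word and an _emphasized_ one.'))
    # '\t<p>A <strong>strong</strong> word and an <em>emphasized</em> one.</p>'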
 
     def image(self, text):
-        """
-        >>> t = Textile()
-        >>> Py3 << t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
-        '<a href="http://jsamsa.com" class="img"><img src="/imgs/myphoto.jpg" alt="" /></a>'
-        >>> Py3 << t.image('!<img.gif!')
-        '<img align="left" src="img.gif" alt="" />'
-        """
         pattern = re.compile(r"""
-            (?:[\[{])?          # pre
+            (?:[\[{{])?         # pre
             \!                  # opening !
             (\<|\=|\>)?         # optional alignment atts
-            (%s)                # optional style,class atts
+            ({0})               # optional style,class atts
             (?:\. )?            # optional dot-space
             ([^\s(!]+)          # presume this is the src
             \s?                 # optional space
             (?:\(([^\)]+)\))?   # optional title
             \!                  # closing
             (?::(\S+))?         # optional href
-            (?:[\]}]|(?=\s|$))  # lookahead: space or end of string
-        """ % self.cslh_re_s, re.U | re.X)
+            (?:[\]}}]|(?=\s|$)) # lookahead: space or end of string
+        """.format(cls_re_s), re.U | re.X)
         return pattern.sub(self.fImage, text)
 
     def fImage(self, match):
         # (None, '', '/imgs/myphoto.jpg', None, None)
-        align, atts, url, title, href = match.groups()
-        atts = self.pba(atts)
+        align, attributes, url, title, href = match.groups()
+        atts = OrderedDict()
         size = None
 
         alignments = {'<': 'left', '=': 'center', '>': 'right'}
@@ -1336,36 +1011,33 @@ def fImage(self, match):
         if not title:
             title = ''
 
-        if not self.isRelURL(url) and self.get_sizes:
+        if not is_rel_url(url) and self.get_sizes:
             size = imagesize.getimagesize(url)
 
         if href:
-            href = self.checkRefs(href)
+            href = self.shelveURL(href)
 
-        url = self.checkRefs(url)
-        url = self.relURL(url)
+        url = self.shelveURL(url)
 
-        out = []
-        if href:
-            out.append('<a href="%s" class="img">' % href)
-        out.append('<img src="%s"%s' % (url, atts))
-        if size:
-            out.append(' height="%s" width="%s"' % (size[1], size[0]))
-        out.append(' alt="%s" />' % title)
+        if align:
+            atts.update(align=alignments[align])
+        atts.update(alt=title)
+        if size:
+            atts.update(height=six.text_type(size[1]))
+        atts.update(src=url)
+        if attributes:
+            atts.update(parse_attributes(attributes))
+        if title:
+            atts.update(title=title)
+        if size:
+            atts.update(width=six.text_type(size[0]))
+        img = generate_tag('img', ' /', atts)
         if href:
-            out.append('</a>')
-
-        return ''.join(out)
+            a_atts = OrderedDict(href=href)
+            if self.rel:
+                a_atts.update(rel=self.rel)
+            img = generate_tag('a', img, a_atts)
+        return img
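fImage now accumulates attributes in an OrderedDict and renders through generate_tag instead of pasting strings, which keeps attribute output deterministic. An approximate round trip, assuming a relative image path so no size lookup happens:

    import textile

    print(textile.textile('!/img/logo.png(Our logo)!:http://example.com/'))
    # Expected shape (exact attribute order depends on generate_tag):
    # '\t<p><a href="http://example.com/"><img alt="Our logo"
    # src="/img/logo.png" title="Our logo" /></a></p>'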
 
     def code(self, text):
         text = self.doSpecial(text, '<code>', '</code>', self.fCode)
@@ -1375,25 +1047,22 @@ def code(self, text):
 
     def fCode(self, match):
         before, text, after = match.groups()
-        if after is None:
-            after = ''
+        after = after or ''
         # text needs to be escaped
-        if not self.restricted:
-            text = self.encode_html(text, quotes=False)
-        return ''.join([before, self.shelve('<code>%s</code>' % text), after])
+        text = encode_html(text, quotes=False)
+        return ''.join([before, self.shelve('<code>{0}</code>'.format(text)), after])
 
     def fPre(self, match):
         before, text, after = match.groups()
         if after is None:
             after = ''
         # text needs to be escaped
-        if not self.restricted:
-            text = self.encode_html(text)
+        text = encode_html(text)
         return ''.join([before, '<pre>', self.shelve(text), '</pre>', after])
 
     def doSpecial(self, text, start, end, method):
-        pattern = re.compile(r'(^|\s|[\[({>|])%s(.*?)%s($|[\])}])?'
-                             % (re.escape(start), re.escape(end)), re.M | re.S)
+        pattern = re.compile(r'(^|\s|[\[({{>|]){0}(.*?){1}($|[\])}}])?'.format(
+            re.escape(start), re.escape(end)), re.M | re.S)
         return pattern.sub(method, text)
 
     def noTextile(self, text):
@@ -1403,7 +1072,7 @@ def fTextile(self, match):
         before, notextile, after = match.groups()
-        if after is None:
+        if after is None:  # pragma: no branch
             after = ''
         return ''.join([before, self.shelve(notextile), after])
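doSpecial is the shared scaffold here: it captures a start...end delimited chunk at a line or bracket boundary and hands it to the given formatter, which escapes and shelves the contents so no later pass rewrites them. That is what keeps markup inside code blocks literal, for instance:

    import textile

    # fCode escapes the body, so markup inside @...@ stays literal:
    print(textile.textile('Use the @<b>@ tag sparingly.'))
    # '\t<p>Use the <code>&lt;b&gt;</code> tag sparingly.</p>'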
 
@@ -1423,31 +1092,28 @@ def fParseHTMLComments(self, match):
         """
         before, commenttext, after = match.groups()
         commenttext = self.shelve(commenttext)
-        return '<!--%s-->' % commenttext
+        return '{0}<!--{1}-->'.format(before, commenttext)
 
     def redcloth_list(self, text):
         """Parse the text for definition lists and send them to be
         formatted."""
-        pattern = re.compile(r"^([-]+%s[ .].*:=.*)$(?![^-])" % self.csl_re_s,
-                             re.M | re.U | re.S)
+        pattern = re.compile(r"^([-]+{0}[ .].*:=.*)$(?![^-])".format(cls_re_s),
+                re.M | re.U | re.S)
         return pattern.sub(self.fRCList, text)
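redcloth_list and fRCList (next hunk) turn RedCloth-style '- term := definition' lines into a definition list. Roughly, for this version:

    import textile

    print(textile.textile('- one := the first\n- two := the second'))
    # Expected shape:
    # <dl>
    #   <dt>one</dt>
    #   <dd>the first</dd>
    #   <dt>two</dt>
    #   <dd>the second</dd>
    # </dl>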
".format(atts) else: dltag = "
" out.append(dltag) if definition != '' and term != '': if definition.startswith('\n'): - definition = '
') out = '\n'.join(out) @@ -1492,12 +1158,12 @@ def placeNoteLists(self, text): else: self.unreferencedNotes[label] = info - if o: + if o: # pragma: no branch # sort o by key o = OrderedDict(sorted(o.items(), key=lambda t: t[0])) self.notes = o - text_re = re.compile('%s
' % definition.lstrip() + definition = '{0}
'.format(definition.lstrip()) definition = definition.replace('\n', '
').strip() term = self.graf(term) definition = self.graf(definition) - out.extend(['\t- %s
' % (atts, term), '\t- %s
' % - definition]) + out.extend(['\t- {1}
'.format(atts, term), + '\t- {0}
'.format(definition)]) out.append('notelist(%s)(?:\:([\w|%s]))?([\^!]?)(\+?)\.?[\s]*
' - % (self.cslh_re_s, self.syms_re_s), re.U) + text_re = re.compile('notelist({0})(?:\:([\w|{1}]))?([\^!]?)(\+?)' + '\.?[\s]*
'.format(cls_re_s, syms_re_s), re.U) text = text_re.sub(self.fNoteLists, text) return text @@ -1505,12 +1171,12 @@ def fNoteLists(self, match): """Given the text that matches as a note, format it into HTML.""" att, start_char, g_links, extras = match.groups() start_char = start_char or 'a' - index = '%s%s%s' % (g_links, extras, start_char) + index = '{0}{1}{2}'.format(g_links, extras, start_char) result = '' - if index not in self.notelist_cache: + if index not in self.notelist_cache: # pragma: no branch o = [] - if self.notes: + if self.notes: # pragma: no branch for seq, info in self.notes.items(): links = self.makeBackrefLink(info, g_links, start_char) atts = '' @@ -1518,24 +1184,23 @@ def fNoteLists(self, match): infoid = info['id'] atts = info['def']['atts'] content = info['def']['content'] - li = ("""\t%s %s """ - % (atts, links, infoid, content)) + li = ('\t\t{1} ' + '{3} ').format(atts, links, infoid, + content) else: - li = ("""\t%s Undefined Note [#%s]. """ % - (atts, links, info['seq'])) + li = ('\t\t {1} Undefined Note [#{2}]. ' + ).format(atts, links, info['seq']) o.append(li) if '+' == extras and self.unreferencedNotes: for seq, info in self.unreferencedNotes.items(): - if info['def']: - atts = info['def']['atts'] - content = info['def']['content'] - li = """\t %s """ % (atts, content) + atts = info['def']['atts'] + content = info['def']['content'] + li = '\t\t{1} '.format(atts, content) o.append(li) self.notelist_cache[index] = "\n".join(o) result = self.notelist_cache[index] - if result: - list_atts = self.pba(att) - result = """\n%s\n
""" % (list_atts, result) + list_atts = pba(att) + result = '\n{1}\n\t
 
     def fParseNoteDefs(self, m):
         """Parse the note definitions and format them as HTML"""
-        label, link, att, content = m.groups()
+        label = m.group('label')
+        link = m.group('link')
+        att = m.group('att')
+        content = m.group('content')
 
         # Assign an id if the note reference parse hasn't found the label yet.
         if label not in self.notes:
-            self.notes[label] = {'id': uuid.uuid4().hex}
+            self.notes[label] = {'id': '{0}{1}'.format(self.linkPrefix,
+                self._increment_link_index())}
 
         # Ignores subsequent defs using the same label
-        if 'def' not in self.notes[label]:
-            self.notes[label]['def'] = {'atts': self.pba(att), 'content':
-                                        self.graf(content), 'link': link}
+        if 'def' not in self.notes[label]:  # pragma: no branch
+            self.notes[label]['def'] = {'atts': pba(att), 'content':
+                    self.graf(content), 'link': link}
         return ''
 
     def noteRef(self, text):
         """Search the text looking for note references."""
         text_re = re.compile(r"""
             \[                   # start
-            (%s)                 # !atts
+            ({0})                # !atts
             \#
-            ([^\]!]+)            # !label
+            ([^\]!]+)            # !label
             ([!]?)               # !nolink
-            \]""" % self.cslh_re_s, re.X)
+            \]""".format(cls_re_s), re.X)
         text = text_re.sub(self.fParseNoteRefs, text)
         return text
@@ -1597,7 +1266,7 @@ def fParseNoteRefs(self, match):
         processed into the notes array. So now we can resolve the link numbers
         in the order we process the refs..."""
         atts, label, nolink = match.groups()
-        atts = self.pba(atts)
+        atts = pba(atts)
         nolink = nolink == '!'
 
         # Assign a sequence number to this reference if there isn't one already
@@ -1608,57 +1277,70 @@ def fParseNoteRefs(self, match):
                 'seq': self.note_index, 'refids': [], 'id': ''
             }
             num = self.note_index
-            self.note_index += 1
+            self.note_index = self.note_index + 1
 
         # Make our anchor point and stash it for possible use in backlinks
         # when the note list is generated later...
-        refid = uuid.uuid4().hex
+        refid = '{0}{1}'.format(self.linkPrefix, self._increment_link_index())
         self.notes[label]['refids'].append(refid)
 
         # If we are referencing a note that hasn't had the definition parsed
         # yet, then assign it an ID...
         if not self.notes[label]['id']:
-            self.notes[label]['id'] = uuid.uuid4().hex
+            self.notes[label]['id'] = '{0}{1}'.format(self.linkPrefix,
+                    self._increment_link_index())
         labelid = self.notes[label]['id']
 
         # Build the link (if any)...
-        result = '<span id="noteref%s">%s</span>' % (refid, num)
+        result = '<span id="noteref{0}">{1}</span>'.format(refid, num)
         if not nolink:
-            result = """<a href="#note%s">%s</a>""" % (labelid, result)
+            result = '<a href="#note{0}">{1}</a>'.format(labelid, result)
 
         # Build the reference...
-        result = '<sup%s>%s</sup>' % (atts, result)
+        result = '<sup{0}>{1}</sup>'.format(atts, result)
 
         return result
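End to end, the notes machinery works like this: noteRef/fParseNoteRefs number [#label] references in reading order, fParseNoteDefs stores the note#label. definitions, and placeNoteLists/fNoteLists expand a notelist. block into an ordered list with backreference links. A hedged example (ids now come from linkPrefix plus a counter rather than uuid4, so exact values vary):

    import textile

    text = ('Some claim.[#src]\n\n'
            'note#src. Where the claim comes from.\n\n'
            'notelist.')
    print(textile.textile(text))
    # The reference renders as <sup><a href="#note..."><span
    # id="noteref...">1</span></a></sup>, and notelist. becomes an
    # <ol> whose <li> holds the definition plus a backref <sup> link.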
 
-    def encode_high(self, text):
-        """Encode the text so that it is an appropriate HTML entity."""
-        return ord(text)
+    def shelveURL(self, text):
+        if text == '':
+            return ''
+        self.refIndex = self.refIndex + 1
+        self.refCache[self.refIndex] = text
+        output = '{0}{1}{2}'.format(self.uid, self.refIndex, ':url')
+        return output
+
+    def retrieveURLs(self, text):
+        return re.sub(r'{0}(?P<token>[0-9]+):url'.format(self.uid), self.retrieveURL, text)
+
+    def retrieveURL(self, match):
+        url = self.refCache.get(int(match.group('token')), '')
+        if url == '':
+            return url
+
+        if url in self.urlrefs:
+            url = self.urlrefs[url]
+
+        return url
 
-    def decode_high(self, text):
-        """Decode encoded HTML entities."""
-        h = HTMLParser()
-        text = '&#%s;' % text
-        return h.unescape(text)
+    def _increment_link_index(self):
+        """The self.linkIndex property needs to be incremented in various
+        places. Don't Repeat Yourself."""
+        self.linkIndex = self.linkIndex + 1
+        return self.linkIndex
 
 
-def textile(text, head_offset=0, html_type='xhtml', auto_link=False,
-            encoding=None, output=None):
+def textile(text, html_type='xhtml', encoding=None, output=None):
     """
     Apply Textile to a block of text.
 
     This function takes the following additional parameters:
 
-    auto_link - enable automatic linking of URLs (default: False)
-    head_offset - offset to apply to heading levels (default: 0)
     html_type - 'xhtml' or 'html5' style tags (default: 'xhtml')
     """
-    return Textile(auto_link=auto_link, html_type=html_type).parse(text,
-        head_offset=head_offset)
+    return Textile(html_type=html_type).parse(text)
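The public wrapper shrinks accordingly: head_offset and auto_link are gone, leaving html_type as the main knob. Typical use, assuming the patched package:

    import textile

    print(textile.textile('h2. A heading'))
    # '\t<h2>A heading</h2>'

    print(textile.textile('"link":http://example.com/', html_type='html5'))
    # '\t<p><a href="http://example.com/">link</a></p>'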
 
 
-def textile_restricted(text, lite=True, noimage=True, html_type='xhtml',
-                       auto_link=False):
+def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
     """
     Apply Textile to a block of text, with restrictions designed for weblog
     comments and other untrusted input.  Raw HTML is escaped, style attributes
@@ -1666,22 +1348,11 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml',
 
     This function takes the following additional parameters:
 
-    auto_link - enable automatic linking of URLs (default: False)
     html_type - 'xhtml' or 'html5' style tags (default: 'xhtml')
     lite - restrict block tags to p, bq, and bc, disable tables (default: True)
     noimage - disable image tags (default: True)
     """
     return Textile(restricted=True, lite=lite, noimage=noimage,
-                   auto_link=auto_link, html_type=html_type).parse(
-        text, rel='nofollow')
-
-
-def setup_module(mod):
-    """Inject Py3 to builtins for doctests."""
-    try:
-        import builtins
-    except ImportError:
-        import __builtin__ as builtins
-    from textile.tools.doctest_utils import Py3
-    builtins.Py3 = Py3
+            html_type=html_type, rel='nofollow').parse(
+            text)
diff --git a/textile/objects/__init__.py b/textile/objects/__init__.py
new file mode 100644
index 00000000..b2373809
--- /dev/null
+++ b/textile/objects/__init__.py
@@ -0,0 +1,4 @@
+from .block import Block
+from .table import Table
+
+__all__ = ['Block', 'Table']
diff --git a/textile/objects/block.py b/textile/objects/block.py
new file mode 100644
index 00000000..7b46bc1f
--- /dev/null
+++ b/textile/objects/block.py
@@ -0,0 +1,123 @@
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+try:
+    import regex as re
+except ImportError:
+    import re
+
+from textile.regex_strings import cls_re_s, regex_snippets
+from textile.utils import encode_html, generate_tag, parse_attributes
+
+
+class Block(object):
+    def __init__(self, textile, tag, atts, ext, cite, content):
+        self.textile = textile
+        self.tag = tag
+        self.atts = atts
+        self.ext = ext
+        self.cite = cite
+        self.content = content
+
+        self.attributes = parse_attributes(atts)
+        self.outer_tag = ''
+        self.inner_tag = ''
+        self.outer_atts = OrderedDict()
+        self.inner_atts = OrderedDict()
+        self.eat = False
+        self.process()
+
+    def process(self):
+        if self.tag == 'p':
+            # is this an anonymous block with a note definition?
+            notedef_re = re.compile(r"""
+                ^note\#               # start of note def marker
+                (?P