Skip to content

Commit

Permalink
Merge pull request #45 from seomoz/dan/url-cpp-psl
Browse files Browse the repository at this point in the history
BIG-3884 - Use PSL support from url-cpp.
  • Loading branch information
Dan Lecocq authored Aug 26, 2016
2 parents 0d58ce7 + aeede32 commit 216e0a9
Show file tree
Hide file tree
Showing 10 changed files with 12,580 additions and 888 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,24 @@ Not all functions are chainable -- some return a value other than a `URL` object

- `encode(...)` -- return a version of the url in an arbitrary encoding

Public Suffix List
==================
This library comes bundled with a version of the public suffix list. However, it may not
suit your needs (whether you need to stay pinned to an old list, or need to update to a
new list). As such, you can provide the PSL you'd like to use, as a `UTF-8` string:

```python
import url

# Read it from a file
with open('path/to/my/psl') as fin:
    url.set_psl(fin.read())

# Grab it from the PSL site
import requests
url.set_psl(requests.get('https://publicsuffix.org/list/public_suffix_list.dat').content)
```

Properties
==========
Many attributes are available on URL objects:
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ coverage==4.1
Cython==0.24.1
nose==1.3.7
nose-timer==0.6.0
publicsuffix==1.1.0
python-termstyle==0.1.10
rednose==1.2.1
termcolor==1.1.0
11 changes: 6 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
ext_files = [
'url/url-cpp/src/url.cpp',
'url/url-cpp/src/utf8.cpp',
'url/url-cpp/src/punycode.cpp'
'url/url-cpp/src/punycode.cpp',
'url/url-cpp/src/psl.cpp'
]

kwargs = {}
Expand All @@ -50,7 +51,7 @@

setup(
name = 'url',
version = '0.3.0rc1',
version = '0.3.0rc2',
description = 'URL Parsing',
long_description = '''
Some helper functions for parsing URLs, sanitizing them, normalizing them.
Expand All @@ -77,9 +78,9 @@
package_dir = {
'url': 'url'
},
install_requires = [
'publicsuffix'
],
package_data = {
'url': ['psl/*']
},
tests_require = [
'coverage',
'nose'
Expand Down
43 changes: 38 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import url
import pkgutil
import unittest

from nose.tools import assert_equal, assert_not_equal, assert_raises

import url


def test_bad_port():
def test(example):
Expand Down Expand Up @@ -574,9 +576,15 @@ def test(query, result):
assert_equal(url.parse(query).pld, result)

examples = [
('http://foo.com/bar' , 'foo.com'),
('http://bar.foo.com/bar', 'foo.com'),
('/foo' , '')
('http://foo.com/bar' , 'foo.com'),
('http://bar.foo.com/bar' , 'foo.com'),
('/foo' , ''),
('http://com/bar' , ''),
('http://foo.გე' , 'foo.გე'),
('http://bar.foo.გე' , 'foo.გე'),
('http://foo.xn--node' , 'foo.xn--node'),
('http://bar.foo.xn--node', 'foo.xn--node'),
('http://foo.co.uk' , 'foo.co.uk')
]
for query, result in examples:
yield test, query, result
Expand All @@ -589,7 +597,13 @@ def test(query, result):
examples = [
('http://foo.com/bar' , 'com'),
('http://bar.foo.com/bar', 'com'),
('/foo' , '')
('/foo' , ''),
('http://com/bar' , 'com'),
('http://foo.გე' , 'გე'),
('http://bar.foo.გე' , 'გე'),
('http://foo.xn--node' , 'xn--node'),
('http://bar.foo.xn--node', 'xn--node'),
('http://foo.co.uk' , 'co.uk')
]
for query, result in examples:
yield test, query, result
Expand Down Expand Up @@ -630,3 +644,22 @@ def test(example):
]
for example in examples:
yield test, example

def test_set_psl():
    '''Can set the PSL to use.

    Generator-style test: each yielded case installs a custom rule set with
    url.set_psl, checks the pld and tld reported for one URL, and then puts
    the bundled list back.
    '''

    def test(rules, example, pld, tld):
        try:
            url.set_psl(rules)
            # Parse separately for each property, pld first and then tld.
            observed_pld = url.parse(example).pld
            assert_equal(observed_pld, pld)
            observed_tld = url.parse(example).tld
            assert_equal(observed_tld, tld)
        finally:
            # Runs even when an assertion fails, so the custom rules are
            # replaced by the packaged list before the next case.
            bundled = pkgutil.get_data('url', 'psl/2016-08-16.psl')
            url.set_psl(bundled)

    # (rules, url to parse, expected pld, expected tld)
    cases = (
        ('uk', 'http://foo.co.uk/', 'co.uk', 'uk'),
        ('co.uk', 'http://foo.co.uk/', 'foo.co.uk', 'co.uk'),
    )

    for case in cases:
        yield (test,) + case
2 changes: 1 addition & 1 deletion url/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
'''This is a module for dealing with urls. In particular, sanitizing them.'''


from .url import URL
from .url import URL, set_psl

def parse(url, encoding='utf-8'):
'''Parse the provided url string and return an URL object'''
Expand Down
Loading

0 comments on commit 216e0a9

Please sign in to comment.