import urllib3

This commit is contained in:
Kenneth Reitz
2011-09-17 20:30:08 -04:00
parent 784b671559
commit 84434fddee
5 changed files with 727 additions and 0 deletions
+22
View File
@@ -0,0 +1,22 @@
"""
urllib3 - Thread-safe connection pooling and re-using.
"""
from connectionpool import (
connection_from_url,
get_host,
HTTPConnectionPool,
HTTPSConnectionPool,
make_headers)
# Possible exceptions
from connectionpool import (
HTTPError,
MaxRetryError,
SSLError,
TimeoutError)
from filepost import encode_multipart_formdata
__author__ = "Andrey Petrov (andrey.petrov@shazow.net)"
__license__ = "MIT"
__version__ = "$Rev$"
+544
View File
@@ -0,0 +1,544 @@
import gzip
import zlib
import logging
import socket
from urllib import urlencode
from httplib import HTTPConnection, HTTPSConnection, HTTPException
from Queue import Queue, Empty, Full
from select import select
from socket import error as SocketError, timeout as SocketTimeout
try:
import ssl
BaseSSLError = ssl.SSLError
except ImportError, e:
ssl = None
BaseSSLError = None
try:
from cStringIO import StringIO
except ImportError, e:
from StringIO import StringIO
from filepost import encode_multipart_formdata
log = logging.getLogger(__name__)
## Exceptions
class HTTPError(Exception):
"Base exception used by this module."
pass
class SSLError(Exception):
"Raised when SSL certificate fails in an HTTPS connection."
pass
class MaxRetryError(HTTPError):
"Raised when the maximum number of retries is exceeded."
pass
class TimeoutError(HTTPError):
"Raised when a socket timeout occurs."
pass
class HostChangedError(HTTPError):
"Raised when an existing pool gets a request for a foreign host."
pass
## Response objects
class HTTPResponse(object):
"""
HTTP Response container.
Similar to httplib's HTTPResponse but the data is pre-loaded.
"""
def __init__(self, data='', headers=None, status=0, version=0, reason=None,
strict=0):
self.data = data
self.headers = headers or {}
self.status = status
self.version = version
self.reason = reason
self.strict = strict
@staticmethod
def from_httplib(r):
"""
Given an httplib.HTTPResponse instance, return a corresponding
urllib3.HTTPResponse object.
NOTE: This method will perform r.read() which will have side effects
on the original http.HTTPResponse object.
"""
tmp_data = r.read()
try:
if r.getheader('content-encoding') == 'gzip':
log.debug("Received response with content-encoding: gzip, "
"decompressing with gzip.")
gzipper = gzip.GzipFile(fileobj=StringIO(tmp_data))
data = gzipper.read()
elif r.getheader('content-encoding') == 'deflate':
log.debug("Received response with content-encoding: deflate, "
"decompressing with zlib.")
try:
data = zlib.decompress(tmp_data)
except zlib.error, e:
data = zlib.decompress(tmp_data, -zlib.MAX_WBITS)
else:
data = tmp_data
except IOError:
raise HTTPError("Received response with content-encoding: %s, "
"but failed to decompress it." %
(r.getheader('content-encoding')))
return HTTPResponse(data=data,
headers=dict(r.getheaders()),
status=r.status,
version=r.version,
reason=r.reason,
strict=r.strict)
# Backwards-compatibility methods for httplib.HTTPResponse
def getheaders(self):
return self.headers
def getheader(self, name, default=None):
return self.headers.get(name, default)
## Connection objects
class VerifiedHTTPSConnection(HTTPSConnection):
"""
Based on httplib.HTTPSConnection but wraps the socket with
SSL certification.
"""
def set_cert(self, key_file=None, cert_file=None, cert_reqs='CERT_NONE',
ca_certs=None):
ssl_req_scheme = {
'CERT_NONE': ssl.CERT_NONE,
'CERT_OPTIONAL': ssl.CERT_OPTIONAL,
'CERT_REQUIRED': ssl.CERT_REQUIRED
}
self.key_file = key_file
self.cert_file = cert_file
self.cert_reqs = ssl_req_scheme.get(cert_reqs) or ssl.CERT_NONE
self.ca_certs = ca_certs
def connect(self):
# Add certificate verification
sock = socket.create_connection((self.host, self.port), self.timeout)
# Wrap socket using verification with the root certs in
# trusted_root_certs
self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
cert_reqs=self.cert_reqs,
ca_certs=self.ca_certs)
## Pool objects
class HTTPConnectionPool(object):
"""
Thread-safe connection pool for one host.
host
Host used for this HTTP Connection (e.g. "localhost"), passed into
httplib.HTTPConnection()
port
Port used for this HTTP Connection (None is equivalent to 80), passed
into httplib.HTTPConnection()
strict
Causes BadStatusLine to be raised if the status line can't be parsed
as a valid HTTP/1.0 or 1.1 status line, passed into
httplib.HTTPConnection()
timeout
Socket timeout for each individual connection, can be a float. None
disables timeout.
maxsize
Number of connections to save that can be reused. More than 1 is useful
in multithreaded situations. If ``block`` is set to false, more
connections will be created but they will not be saved once they've
been used.
block
If set to True, no more than ``maxsize`` connections will be used at
a time. When no free connections are available, the call will block
until a connection has been released. This is a useful side effect for
particular multithreaded situations where one does not want to use more
than maxsize connections per host to prevent flooding.
headers
Headers to include with all requests, unless other headers are given
explicitly.
"""
scheme = 'http'
def __init__(self, host, port=None, strict=False, timeout=None, maxsize=1,
block=False, headers=None):
self.host = host
self.port = port
self.strict = strict
self.timeout = timeout
self.pool = Queue(maxsize)
self.block = block
self.headers = headers or {}
# Fill the queue up so that doing get() on it will block properly
[self.pool.put(None) for i in xrange(maxsize)]
self.num_connections = 0
self.num_requests = 0
def _new_conn(self):
"""
Return a fresh HTTPConnection.
"""
self.num_connections += 1
log.info("Starting new HTTP connection (%d): %s" %
(self.num_connections, self.host))
return HTTPConnection(host=self.host, port=self.port)
def _get_conn(self, timeout=None):
"""
Get a connection. Will return a pooled connection if one is available.
Otherwise, a fresh connection is returned.
"""
conn = None
try:
conn = self.pool.get(block=self.block, timeout=timeout)
# If this is a persistent connection, check if it got disconnected
if conn and conn.sock and select([conn.sock], [], [], 0.0)[0]:
# Either data is buffered (bad), or the connection is dropped.
log.warning("Connection pool detected dropped "
"connection, resetting: %s" % self.host)
conn.close()
except Empty, e:
pass # Oh well, we'll create a new connection then
return conn or self._new_conn()
def _put_conn(self, conn):
"""
Put a connection back into the pool.
If the pool is already full, the connection is discarded because we
exceeded maxsize. If connections are discarded frequently, then maxsize
should be increased.
"""
try:
self.pool.put(conn, block=False)
except Full, e:
# This should never happen if self.block == True
log.warning("HttpConnectionPool is full, discarding connection: %s"
% self.host)
def is_same_host(self, url):
return (url.startswith('/') or
get_host(url) == (self.scheme, self.host, self.port))
def urlopen(self, method, url, body=None, headers=None, retries=3,
redirect=True, assert_same_host=True):
"""
Get a connection from the pool and perform an HTTP request.
method
HTTP request method (such as GET, POST, PUT, etc.)
body
Data to send in the request body (useful for creating
POST requests, see HTTPConnectionPool.post_url for
more convenience).
headers
Dictionary of custom headers to send, such as User-Agent,
If-None-Match, etc. If None, pool headers are used. If provided,
these headers completely replace any pool-specific headers.
retries
Number of retries to allow before raising
a MaxRetryError exception.
redirect
Automatically handle redirects (status codes 301, 302, 303, 307),
each redirect counts as a retry.
assert_same_host
If True, will make sure that the host of the pool requests is
consistent else will raise HostChangedError. When False, you can
use the pool on an HTTP proxy and request foreign hosts.
"""
if headers == None:
headers = self.headers
if retries < 0:
raise MaxRetryError("Max retries exceeded for url: %s" % url)
# Check host
if assert_same_host and not self.is_same_host(url):
host = "%s://%s" % (self.scheme, self.host)
if self.port:
host = "%s:%d" % (host, self.port)
raise HostChangedError("Connection pool with host '%s' tried to "
"open a foreign host: %s" % (host, url))
try:
# Request a connection from the queue
conn = self._get_conn()
# Make the request
self.num_requests += 1
conn.request(method, url, body=body, headers=headers)
conn.sock.settimeout(self.timeout)
httplib_response = conn.getresponse()
log.debug("\"%s %s %s\" %s %s" %
(method, url, conn._http_vsn_str,
httplib_response.status, httplib_response.length))
# from_httplib will perform httplib_response.read() which will have
# the side effect of letting us use this connection for another
# request.
response = HTTPResponse.from_httplib(httplib_response)
# Put the connection back to be reused
self._put_conn(conn)
except (SocketTimeout, Empty), e:
# Timed out either by socket or queue
raise TimeoutError("Request timed out after %f seconds" %
self.timeout)
except (BaseSSLError), e:
# SSL certificate error
raise SSLError(e)
except (HTTPException, SocketError), e:
log.warn("Retrying (%d attempts remain) after connection "
"broken by '%r': %s" % (retries, e, url))
return self.urlopen(method, url, body, headers, retries - 1,
redirect, assert_same_host) # Try again
# Handle redirection
if (redirect and
response.status in [301, 302, 303, 307] and
'location' in response.headers): # Redirect, retry
log.info("Redirecting %s -> %s" %
(url, response.headers.get('location')))
return self.urlopen(method, response.headers.get('location'), body,
headers, retries - 1, redirect,
assert_same_host)
return response
def get_url(self, url, fields=None, headers=None, retries=3,
redirect=True):
"""
Wrapper for performing GET with urlopen (see urlopen for more details).
Supports an optional ``fields`` dictionary parameter key/value strings.
If provided, they will be added to the url.
"""
if fields:
url += '?' + urlencode(fields)
return self.urlopen('GET', url, headers=headers, retries=retries,
redirect=redirect)
def post_url(self, url, fields=None, headers=None, retries=3,
redirect=True, encode_multipart=True):
"""
Wrapper for performing POST with urlopen (see urlopen
for more details).
Supports an optional ``fields`` parameter of key/value strings AND
key/filetuple. A filetuple is a (filename, data) tuple. For example:
fields = {
'foo': 'bar',
'foofile': ('foofile.txt', 'contents of foofile'),
}
If encode_multipart=True (default), then
``urllib3.filepost.encode_multipart_formdata`` is used to encode the
payload with the appropriate content type. Otherwise
``urllib.urlencode`` is used with 'application/x-www-form-urlencoded'
content type.
Multipart encoding must be used when posting files, and it's reasonably
safe to use it other times too. It may break request signing, such as
OAuth.
NOTE: If ``headers`` are supplied, the 'Content-Type' value will be
overwritten because it depends on the dynamic random boundary string
which is used to compose the body of the request.
"""
if encode_multipart:
body, content_type = encode_multipart_formdata(fields or {})
else:
body, content_type = (
urlencode(fields or {}),
'application/x-www-form-urlencoded')
headers = headers or {}
headers.update({'Content-Type': content_type})
return self.urlopen('POST', url, body, headers=headers,
retries=retries, redirect=redirect)
class HTTPSConnectionPool(HTTPConnectionPool):
"""
Same as HTTPConnectionPool, but HTTPS.
"""
scheme = 'https'
def __init__(self, host, port=None, strict=False, timeout=None, maxsize=1,
block=False, headers=None, key_file=None,
cert_file=None, cert_reqs='CERT_NONE', ca_certs=None):
self.host = host
self.port = port
self.strict = strict
self.timeout = timeout
self.pool = Queue(maxsize)
self.block = block
self.headers = headers or {}
self.key_file = key_file
self.cert_file = cert_file
self.cert_reqs = cert_reqs
self.ca_certs = ca_certs
# Fill the queue up so that doing get() on it will block properly
[self.pool.put(None) for i in xrange(maxsize)]
self.num_connections = 0
self.num_requests = 0
def _new_conn(self):
"""
Return a fresh HTTPSConnection.
"""
self.num_connections += 1
log.info("Starting new HTTPS connection (%d): %s"
% (self.num_connections, self.host))
if not ssl:
return HTTPSConnection(host=self.host, port=self.port)
connection = VerifiedHTTPSConnection(host=self.host, port=self.port)
connection.set_cert(key_file=self.key_file, cert_file=self.cert_file,
cert_reqs=self.cert_reqs, ca_certs=self.ca_certs)
return connection
## Helpers
def make_headers(keep_alive=None, accept_encoding=None, user_agent=None,
basic_auth=None):
"""
Shortcuts for generating request headers.
keep_alive
If true, adds 'connection: keep-alive' header.
accept_encoding
Can be a boolean, list, or string.
True translates to 'gzip,deflate'.
List will get joined by comma.
String will be used as provided.
user_agent
String representing the user-agent you want, such as
"python-urllib3/0.6"
basic_auth
Colon-separated username:password string for 'authorization: basic ...'
auth header.
"""
headers = {}
if accept_encoding:
if isinstance(accept_encoding, str):
pass
elif isinstance(accept_encoding, list):
accept_encoding = ','.join(accept_encoding)
else:
accept_encoding = 'gzip,deflate'
headers['accept-encoding'] = accept_encoding
if user_agent:
headers['user-agent'] = user_agent
if keep_alive:
headers['connection'] = 'keep-alive'
if basic_auth:
headers['authorization'] = 'Basic ' + \
basic_auth.encode('base64').strip()
return headers
def get_host(url):
"""
Given a url, return its scheme, host and port (None if it's not there).
For example:
>>> get_host('http://google.com/mail/')
http, google.com, None
>>> get_host('google.com:80')
http, google.com, 80
"""
# This code is actually similar to urlparse.urlsplit, but much
# simplified for our needs.
port = None
scheme = 'http'
if '//' in url:
scheme, url = url.split('://', 1)
if '/' in url:
url, path = url.split('/', 1)
if ':' in url:
url, port = url.split(':', 1)
port = int(port)
return scheme, url, port
def connection_from_url(url, **kw):
"""
Given a url, return an HTTP(S)ConnectionPool instance of its host.
This is a shortcut for not having to determine the host of the url
before creating an HTTP(S)ConnectionPool instance.
Passes on whatever kw arguments to the constructor of
HTTP(S)ConnectionPool. (e.g. timeout, maxsize, block)
"""
scheme, host, port = get_host(url)
if scheme == 'https':
return HTTPSConnectionPool(host, port=port, **kw)
else:
return HTTPConnectionPool(host, port=port, **kw)
@@ -0,0 +1,111 @@
"""
NTLM authenticating pool, contributed by erikcederstran
Issue #10, see: http://code.google.com/p/urllib3/issues/detail?id=10
"""
import httplib
from logging import getLogger
from ntlm import ntlm
from urllib3 import HTTPSConnectionPool
log = getLogger(__name__)
class NTLMConnectionPool(HTTPSConnectionPool):
"""
Implements an NTLM authentication version of an urllib3 connection pool
"""
scheme = 'https'
def __init__(self, user, pw, authurl, *args, **kwargs):
"""
authurl is a random URL on the server that is protected by NTLM.
user is the Windows user, probably in the DOMAIN\username format.
pw is the password for the user.
"""
super(NTLMConnectionPool, self).__init__(*args, **kwargs)
self.authurl = authurl
self.rawuser = user
user_parts = user.split('\\', 1)
self.domain = user_parts[0].upper()
self.user = user_parts[1]
self.pw = pw
def _new_conn(self):
# Performs the NTLM handshake that secures the connection. The socket
# must be kept open while requests are performed.
self.num_connections += 1
log.debug('Starting NTLM HTTPS connection no. %d: https://%s%s' %
(self.num_connections, self.host, self.authurl))
headers = {}
headers['Connection'] = 'Keep-Alive'
req_header = 'Authorization'
resp_header = 'www-authenticate'
conn = httplib.HTTPSConnection(host=self.host, port=self.port)
# Send negotiation message
headers[req_header] = (
'NTLM %s' % ntlm.create_NTLM_NEGOTIATE_MESSAGE(self.rawuser))
log.debug('Request headers: %s' % headers)
conn.request('GET', self.authurl, None, headers)
res = conn.getresponse()
reshdr = dict(res.getheaders())
log.debug('Response status: %s %s' % (res.status, res.reason))
log.debug('Response headers: %s' % reshdr)
log.debug('Response data: %s [...]' % res.read(100))
# Remove the reference to the socket, so that it can not be closed by
# the response object (we want to keep the socket open)
res.fp = None
# Server should respond with a challenge message
auth_header_values = reshdr[resp_header].split(', ')
auth_header_value = None
for s in auth_header_values:
if s[:5] == 'NTLM ':
auth_header_value = s[5:]
if auth_header_value is None:
raise Exception('Unexpected %s response header: %s' %
(resp_header, reshdr[resp_header]))
# Send authentication message
ServerChallenge, NegotiateFlags = \
ntlm.parse_NTLM_CHALLENGE_MESSAGE(auth_header_value)
auth_msg = ntlm.create_NTLM_AUTHENTICATE_MESSAGE(ServerChallenge,
self.user,
self.domain,
self.pw,
NegotiateFlags)
headers[req_header] = 'NTLM %s' % auth_msg
log.debug('Request headers: %s' % headers)
conn.request('GET', self.authurl, None, headers)
res = conn.getresponse()
log.debug('Response status: %s %s' % (res.status, res.reason))
log.debug('Response headers: %s' % dict(res.getheaders()))
log.debug('Response data: %s [...]' % res.read()[:100])
if res.status != 200:
if res.status == 401:
raise Exception('Server rejected request: wrong '
'username or password')
raise Exception('Wrong server response: %s %s' %
(res.status, res.reason))
res.fp = None
log.debug('Connection established')
return conn
def urlopen(self, method, url, body=None, headers=None, retries=3,
redirect=True, assert_same_host=True):
if headers is None:
headers = {}
headers['Connection'] = 'Keep-Alive'
return super(NTLMConnectionPool, self).urlopen(method, url, body,
headers, retries,
redirect,
assert_same_host)
+50
View File
@@ -0,0 +1,50 @@
import codecs
import mimetools
import mimetypes
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
writer = codecs.lookup('utf-8')[3]
def get_content_type(filename):
return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
def encode_multipart_formdata(fields):
body = StringIO()
BOUNDARY = mimetools.choose_boundary()
for fieldname, value in fields.iteritems():
body.write('--%s\r\n' % (BOUNDARY))
if isinstance(value, tuple):
filename, data = value
writer(body).write('Content-Disposition: form-data; name="%s"; '
'filename="%s"\r\n' % (fieldname, filename))
body.write('Content-Type: %s\r\n\r\n' %
(get_content_type(filename)))
else:
data = value
writer(body).write('Content-Disposition: form-data; name="%s"\r\n'
% (fieldname))
body.write('Content-Type: text/plain\r\n\r\n')
if isinstance(data, int):
data = str(data) # Backwards compatibility
if isinstance(data, unicode):
writer(body).write(data)
else:
body.write(data)
body.write('\r\n')
body.write('--%s--\r\n' % (BOUNDARY))
content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
return body.getvalue(), content_type