Hello community,

here is the log from the commit of package python-urlgrabber
checked in at Fri May 26 15:06:49 CEST 2006.

--------
--- python-urlgrabber/python-urlgrabber.changes 2006-02-28 16:46:03.000000000 +0100
+++ python-urlgrabber/python-urlgrabber.changes 2006-05-25 14:27:40.000000000 +0200
@@ -1,0 +2,14 @@
+Thu May 25 14:19:34 CEST 2006 - cthiel@suse.de
+
+- update to version 2.9.9
+  * Added tests to make sure that the "quote" option works as advertised
+  * Significant improvement to URL parsing. Parsing is now broken out into
+    a separate class (URLParser). It will now (by default) guess whether a
+    URL is already quoted, properly handle local files and URLs on windows,
+    and display un-quoted versions of the filename in the progress meter.
+  * Added a reget progress bar patch from Menno, and fixed the annoying
+    _next IndexError bug.
+- added urlgrabber-read-error.patch (from Fedora)
+- removed python-urlgrabber-2.9.7-reget.patch (included upstream)
+
+-------------------------------------------------------------------
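For readers skimming the large diff below: the new quote-guessing described in
the changelog boils down to "treat a path as already quoted only if it contains
no literal spaces and every '%' starts a valid %XX escape". Here is a minimal,
self-contained sketch of that rule for illustration only; the shipped logic is
URLParser.guess_should_quote in the grabber.py hunk further down, and this
re-implementation is not the code from the tarball:

    import re

    def guess_should_quote(path):
        """Return True if `path` still needs quoting, False if it
        already looks %XX-quoted."""
        if ' ' in path:
            return True      # a literal space never survives quoting
        if '%' not in path:
            return True      # no escapes at all; quoting is a harmless no-op
        # already quoted only if every '%' begins a valid %XX escape
        return not all(re.match(r'%[0-9a-fA-F]{2}', path[i:])
                       for i in range(len(path)) if path[i] == '%')

    # consistent with the vectors in the new URLParserTestCase below:
    assert guess_should_quote('/Should Be Quoted')
    assert not guess_should_quote('/Already%20Quoted')

Setting the new "quote" option to 1 or 0 overrides this guess in either
direction.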
Old:
----
  python-urlgrabber-2.9.7-reget.diff
  python-urlgrabber-2.9.7.diff
  urlgrabber-2.9.7.tar.bz2

New:
----
  python-urlgrabber-2.9.9.patch
  urlgrabber-2.9.9.tar.bz2
  urlgrabber-read-error.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-urlgrabber.spec ++++++
--- /var/tmp/diff_new_pack.dGSdOG/_old  2006-05-26 15:04:41.000000000 +0200
+++ /var/tmp/diff_new_pack.dGSdOG/_new  2006-05-26 15:04:41.000000000 +0200
@@ -1,5 +1,5 @@
 #
-# spec file for package python-urlgrabber (Version 2.9.7)
+# spec file for package python-urlgrabber (Version 2.9.9)
 #
 # Copyright (c) 2006 SUSE LINUX Products GmbH, Nuernberg, Germany.
 # This file and all modifications and additions to the pristine
@@ -12,15 +12,15 @@
 Name:          python-urlgrabber
 BuildRequires: python-devel
-Version:       2.9.7
-Release:       6
+Version:       2.9.9
+Release:       1
 Summary:       A high-level cross-protocol url-grabber
 Group:         Development/Libraries/Python
 License:       LGPL
 URL:           http://linux.duke.edu/projects/urlgrabber/
 Source:        urlgrabber-%{version}.tar.bz2
-Patch:         %{name}-%{version}-reget.diff
-Patch1:        %{name}-%{version}.diff
+Patch:         %{name}-%{version}.patch
+Patch1:        urlgrabber-read-error.patch
 BuildRoot:     %{_tmppath}/%{name}-%{version}-build
 %py_requires
@@ -39,7 +39,7 @@

 %prep
 %setup -q -n urlgrabber-%{version}
-%patch -p1
+%patch
 %patch1

 %build
@@ -59,6 +59,17 @@
 %{py_sitedir}/urlgrabber

 %changelog -n python-urlgrabber
+* Thu May 25 2006 - cthiel@suse.de
+- update to version 2.9.9
+  * Added tests to make sure that the "quote" option works as advertised
+  * Significant improvement to URL parsing. Parsing is now broken out into
+    a separate class (URLParser). It will now (by default) guess whether a
+    URL is already quoted, properly handle local files and URLs on windows,
+    and display un-quoted versions of the filename in the progress meter.
+  * Added a reget progress bar patch from Menno, and fixed the annoying
+    _next IndexError bug.
+- added urlgrabber-read-error.patch (from Fedora)
+- removed python-urlgrabber-2.9.7-reget.patch (included upstream)
 * Tue Feb 28 2006 - jmatejek@suse.cz
 - updated to reflect python changes due to #149809
 * Wed Jan 25 2006 - mls@suse.de

++++++ python-urlgrabber-2.9.9.patch ++++++
--- setup.py
+++ setup.py
@@ -15,8 +15,6 @@
 packages = ['urlgrabber']
 package_dir = {'urlgrabber':'urlgrabber'}
 scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
-               ['README','LICENSE', 'TODO', 'ChangeLog'])]
 options = { 'clean' : { 'all' : 1 } }
 classifiers = [
    'Development Status :: 4 - Beta',

++++++ urlgrabber-2.9.7.tar.bz2 -> urlgrabber-2.9.9.tar.bz2 ++++++
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/ChangeLog new/urlgrabber-2.9.9/ChangeLog
--- old/urlgrabber-2.9.7/ChangeLog      2005-10-23 00:05:26.000000000 +0200
+++ new/urlgrabber-2.9.9/ChangeLog      2006-03-02 22:06:42.000000000 +0100
@@ -1,3 +1,52 @@
+2006-03-02  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * urlgrabber/__init__.py:
+
+        release 2.9.9
+
+2006-03-02  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * test/test_grabber.py:
+
+        Added tests to make sure that the "quote" option works as
+        advertised.
+
+2006-03-02  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * scripts/urlgrabber, test/test_grabber.py, urlgrabber/grabber.py:
+
+        Significant improvement to URL parsing.  Parsing is now broken out
+        into a separate class (URLParser).  It will now (by default) guess
+        whether a URL is already quoted, properly handle local files and
+        URLs on windows, and display un-quoted versions of the filename in
+        the progress meter.
+
+2006-02-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * ChangeLog:
+
+        updated ChangeLog
+
+2006-02-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * urlgrabber/__init__.py:
+
+        release 2.9.8
+
+2006-02-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * urlgrabber/: grabber.py, mirror.py:
+
+        Added a reget progress bar patch from Menno, and fixed the annoying
+        _next IndexError bug.  Thanks to Edinelson Keiji Shimokawa for
+        getting me looking in the right direction.
+
+2005-10-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * ChangeLog:
+
+        updated ChangeLog
+
 2005-10-22  Michael D. Stenner <mstenner@linux.duke.edu>

        * ChangeLog, TODO, urlgrabber/__init__.py:
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/PKG-INFO new/urlgrabber-2.9.9/PKG-INFO
--- old/urlgrabber-2.9.7/PKG-INFO       2005-10-23 00:05:37.000000000 +0200
+++ new/urlgrabber-2.9.9/PKG-INFO       2006-03-02 22:06:52.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: urlgrabber
-Version: 2.9.7
+Version: 2.9.9
 Summary: A high-level cross-protocol url-grabber
 Home-page: http://linux.duke.edu/projects/urlgrabber/
 Author: Michael D. Stenner, Ryan Tomayko
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/scripts/urlgrabber new/urlgrabber-2.9.9/scripts/urlgrabber
--- old/urlgrabber-2.9.7/scripts/urlgrabber    2004-09-07 23:19:54.000000000 +0200
+++ new/urlgrabber-2.9.9/scripts/urlgrabber    2006-03-02 21:56:56.000000000 +0100
@@ -62,7 +62,7 @@
   explicitly.

 """
-# $Id: urlgrabber,v 1.4 2004/09/07 21:19:54 mstenner Exp $
+# $Id: urlgrabber,v 1.5 2006/03/02 20:56:56 mstenner Exp $

 import sys
 import urlgrabber.grabber
@@ -112,7 +112,7 @@
     print 'URL: ', url
     print 'FILE: ', file

-    try: from progress import text_progress_meter
+    try: from urlgrabber.progress import text_progress_meter
     except ImportError, e: pass
     else: kwargs['progress_obj'] = text_progress_meter()
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/test/test_grabber.py new/urlgrabber-2.9.9/test/test_grabber.py
--- old/urlgrabber-2.9.7/test/test_grabber.py  2005-10-22 23:57:27.000000000 +0200
+++ new/urlgrabber-2.9.9/test/test_grabber.py  2006-03-02 22:06:00.000000000 +0100
@@ -21,7 +21,7 @@

 """grabber.py tests"""

-# $Id: test_grabber.py,v 1.28 2005/10/22 21:57:27 mstenner Exp $
+# $Id: test_grabber.py,v 1.30 2006/03/02 21:06:00 mstenner Exp $

 import sys
 import os
@@ -33,30 +33,34 @@

 import urlgrabber
 import urlgrabber.grabber as grabber
-from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject
+from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject, \
+     URLParser
 from urlgrabber.progress import text_progress_meter

 class FileObjectTests(TestCase):
     def setUp(self):
         self.filename = tempfile.mktemp()
-        fo = open(self.filename, 'w')
+        fo = file(self.filename, 'wb')
         fo.write(reference_data)
         fo.close()

         self.fo_input = cStringIO.StringIO(reference_data)
         self.fo_output = cStringIO.StringIO()
-        self.wrapper = grabber.URLGrabberFileObject('file://' + self.filename, self.fo_output,
-                                                    grabber.default_grabber.opts)
+        (url, parts) = grabber.default_grabber.opts.urlparser.parse(
+            self.filename, grabber.default_grabber.opts)
+        self.wrapper = grabber.URLGrabberFileObject(
+            url, self.fo_output, grabber.default_grabber.opts)

     def tearDown(self):
+        self.wrapper.close()
         os.unlink(self.filename)

     def test_readall(self):
         "URLGrabberFileObject .read() method"
         s = self.wrapper.read()
         self.fo_output.write(s)
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

     def test_readline(self):
         "URLGrabberFileObject .readline() method"
@@ -64,13 +68,13 @@
             s = self.wrapper.readline()
             self.fo_output.write(s)
             if not s: break
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

     def test_readlines(self):
         "URLGrabberFileObject .readlines() method"
         li = self.wrapper.readlines()
         self.fo_output.write(string.join(li, ''))
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

     def test_smallread(self):
         "URLGrabberFileObject .read(N) with small N"
@@ -78,7 +82,7 @@
             s = self.wrapper.read(23)
             self.fo_output.write(s)
             if not s: break
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

 class HTTPTests(TestCase):
     def test_reference_file(self):
@@ -86,11 +90,11 @@
         "download reference file via HTTP"
         filename = tempfile.mktemp()
         grabber.urlgrab(ref_http, filename)

-        fo = open(filename)
+        fo = file(filename, 'rb')
         contents = fo.read()
         fo.close()

-        self.assertEqual(contents, reference_data)
+        self.assert_(contents == reference_data)

     def test_post(self):
         "do an HTTP post"
@@ -130,7 +134,8 @@
     """Test grabber.URLGrabber class"""

     def setUp(self):
-        self.meter = text_progress_meter( fo=open('/dev/null', 'w') )
+
+        self.meter = text_progress_meter( fo=cStringIO.StringIO() )
         pass

     def tearDown(self):
@@ -180,49 +185,110 @@
         nopts.opener = None
         self.assertEquals( nopts.opener, None )

-    def test_parse_url(self):
-        """grabber.URLGrabber._parse_url()"""
-        g = URLGrabber()
-        (url, parts) = g._parse_url('http://user:pass@host.com/path/part/basename.ext?arg1=val1&arg2=val2#hash')
-        (scheme, host, path, parm, query, frag) = parts
-        self.assertEquals('http://host.com/path/part/basename.ext?arg1=val1&arg2=val2#hash',url)
-        self.assertEquals('http', scheme)
-        self.assertEquals('host.com', host)
-        self.assertEquals('/path/part/basename.ext', path)
-        self.assertEquals('arg1=val1&arg2=val2', query)
-        self.assertEquals('hash', frag)
-
-    def test_parse_url_local_filename(self):
-        """grabber.URLGrabber._parse_url('/local/file/path') """
+    def test_make_callback(self):
+        """grabber.URLGrabber._make_callback() tests"""
+        def cb(e): pass
+        tup_cb = (cb, ('stuff'), {'some': 'dict'})
         g = URLGrabber()
-        (url, parts) = g._parse_url('/etc/redhat-release')
-        (scheme, host, path, parm, query, frag) = parts
-        self.assertEquals('file:///etc/redhat-release',url)
-        self.assertEquals('file', scheme)
-        self.assertEquals('', host)
-        self.assertEquals('/etc/redhat-release', path)
-        self.assertEquals('', query)
-        self.assertEquals('', frag)
+        self.assertEquals(g._make_callback(cb), (cb, (), {}))
+        self.assertEquals(g._make_callback(tup_cb), tup_cb)
+
+class URLParserTestCase(TestCase):
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass

     def test_parse_url_with_prefix(self):
-        """grabber.URLGrabber._parse_url() with .prefix"""
+        """grabber.URLParser.parse() with opts.prefix"""
         base = 'http://foo.com/dir'
         bases = [base, base+'/']
-        file = 'bar/baz'
-        target = base + '/' + file
+        filename = 'bar/baz'
+        target = base + '/' + filename

         for b in bases:
             g = URLGrabber(prefix=b)
-            (url, parts) = g._parse_url(file)
+            (url, parts) = g.opts.urlparser.parse(filename, g.opts)
             self.assertEquals(url, target)

-    def test_make_callback(self):
-        """grabber.URLGrabber._make_callback() tests"""
-        def cb(e): pass
-        tup_cb = (cb, ('stuff'), {'some': 'dict'})
+    def _test_url(self, urllist):
         g = URLGrabber()
-        self.assertEquals(g._make_callback(cb), (cb, (), {}))
-        self.assertEquals(g._make_callback(tup_cb), tup_cb)
+        try: quote = urllist[3]
+        except IndexError: quote = None
+        g.opts.quote = quote
+        (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts)
+
+        if 1:
+            self.assertEquals(url, urllist[1])
+            self.assertEquals(parts, urllist[2])
+        else:
+            if url == urllist[1] and parts == urllist[2]:
+                print 'OK: %s' % urllist[0]
+            else:
+                print 'ERROR: %s' % urllist[0]
+                print '  ' + urllist[1]
+                print '  ' + url
+                print '  ' + urllist[2]
+                print '  ' + parts
+
+    url_tests_all = (
+        ['http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash',
+         'http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash',
+         ('http', 'host.com', '/path/basename.ext', '',
+          'arg1=val1&arg2=val2', 'hash')],
+        ['http://host.com/Path With Spaces/',
+         'http://host.com/Path%20With%20Spaces/',
+         ('http', 'host.com', '/Path%20With%20Spaces/', '', '', '')],
+        ['http://user:pass@host.com:80/',
+         'http://host.com:80/',
+         ('http', 'host.com:80', '/', '', '', '')],
+        ['http://host.com/Already%20Quoted',
+         'http://host.com/Already%20Quoted',
+         ('http', 'host.com', '/Already%20Quoted', '', '', '')],
+        ['http://host.com/Should Be Quoted',
+         'http://host.com/Should Be Quoted',
+         ('http', 'host.com', '/Should Be Quoted', '', '', ''), 0],
+        ['http://host.com/Should%20Not',
+         'http://host.com/Should%2520Not',
+         ('http', 'host.com', '/Should%2520Not', '', '', ''), 1],
+        )
+
+    url_tests_posix = (
+        ['/etc/passwd',
+         'file:///etc/passwd',
+         ('file', '', '/etc/passwd', '', '', '')],
+        )
+
+    url_tests_nt = (
+        [r'\\foo.com\path\file.ext',
+         'file://foo.com/path/file.ext',
+         ('file', '', '//foo.com/path/file.ext', '', '', '')],
+        [r'C:\path\file.ext',
+         'file:///C|/path/file.ext',
+         ('file', '', '/C|/path/file.ext', '', '', '')],
+        )
+
+    def test_url_parser_all_os(self):
+        """test url parsing common to all OSs"""
+        for f in self.url_tests_all:
+            self._test_url(f)
+
+    def test_url_parser_posix(self):
+        """test url parsing on posix systems"""
+        if not os.name == 'posix':
+            self.skip()
+        for f in self.url_tests_posix:
+            self._test_url(f)
+
+    def test_url_parser_nt(self):
+        """test url parsing on windows systems"""
+        if not os.name == 'nt':
+            self.skip()
+        for f in self.url_tests_nt:
+            self._test_url(f)

 class FailureTestCase(TestCase):
     """Test failure behavior"""
@@ -383,12 +449,12 @@
         except: pass

     def _make_half_zero_file(self):
-        fo = open(self.filename, 'w')
+        fo = file(self.filename, 'wb')
         fo.write('0'*self.hl)
         fo.close()

     def _read_file(self):
-        fo = open(self.filename, 'r')
+        fo = file(self.filename, 'rb')
         data = fo.read()
         fo.close()
         return data
@@ -451,12 +517,14 @@
     def setUp(self):
         self.ref = short_reference_data
         tmp = tempfile.mktemp()
-        tmpfo = open(tmp, 'w')
+        tmpfo = file(tmp, 'wb')
         tmpfo.write(self.ref)
         tmpfo.close()
         self.tmp = tmp
-        self.url = 'file://' + tmp
+        (url, parts) = grabber.default_grabber.opts.urlparser.parse(
+            tmp, grabber.default_grabber.opts)
+        self.url = url
         self.grabber = grabber.URLGrabber(reget='check_timestamp',
                                           copy_local=1)
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/urlgrabber/__init__.py new/urlgrabber-2.9.9/urlgrabber/__init__.py
--- old/urlgrabber-2.9.7/urlgrabber/__init__.py        2005-10-23 00:05:12.000000000 +0200
+++ new/urlgrabber-2.9.9/urlgrabber/__init__.py        2006-03-02 22:06:35.000000000 +0100
@@ -12,9 +12,9 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko

-# $Id: __init__.py,v 1.15 2005/10/22 22:05:12 mstenner Exp $
+# $Id: __init__.py,v 1.17 2006/03/02 21:06:35 mstenner Exp $

 """A high-level cross-protocol url-grabber.

@@ -44,8 +44,8 @@
   automatically switching mirrors if there is a failure.
 """

-__version__ = '2.9.7'
-__date__ = '2005/10/22'
+__version__ = '2.9.9'
+__date__ = '2006/03/02'
 __author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
              'Ryan Tomayko <rtomayko@naeblis.cx>'
 __url__ = 'http://linux.duke.edu/projects/urlgrabber/'
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/urlgrabber/grabber.py new/urlgrabber-2.9.9/urlgrabber/grabber.py
--- old/urlgrabber-2.9.7/urlgrabber/grabber.py 2005-10-22 23:57:28.000000000 +0200
+++ new/urlgrabber-2.9.9/urlgrabber/grabber.py 2006-03-02 21:56:57.000000000 +0100
@@ -283,6 +283,28 @@
     passed the same arguments, so you could use the same function for
     both.

+  urlparser = URLParser()
+
+    The URLParser class handles pre-processing of URLs, including
+    auth-handling for user/pass encoded in http urls, file handling
+    (that is, filenames not sent as a URL), and URL quoting.  If you
+    want to override any of this behavior, you can pass in a
+    replacement instance.  See also the 'quote' option.
+
+  quote = None
+
+    Whether or not to quote the path portion of a url.
+      quote = 1 -> quote the URLs (they're not quoted yet)
+      quote = 0 -> do not quote them (they're already quoted)
+      quote = None -> guess what to do
+
+    This option only affects proper urls like 'file:///etc/passwd'; it
+    does not affect 'raw' filenames like '/etc/passwd'.  The latter
+    will always be quoted as they are converted to URLs.  Also, only
+    the path part of a url is quoted.  If you need more fine-grained
+    control, you should probably subclass URLParser and pass it in via
+    the 'urlparser' option.
+
 BANDWIDTH THROTTLING

   urlgrabber supports throttling via two values: throttle and
@@ -342,7 +364,7 @@

 """

-# $Id: grabber.py,v 1.43 2005/10/22 21:57:28 mstenner Exp $
+# $Id: grabber.py,v 1.45 2006/03/02 20:56:57 mstenner Exp $

 import os
 import os.path
@@ -593,6 +615,123 @@
     return default_grabber.urlread(url, limit, **kwargs)

+class URLParser:
+    """Process the URLs before passing them to urllib2.
+
+    This class does several things:
+
+      * add any prefix
+      * translate a "raw" file to a proper file: url
+      * handle any http or https auth that's encoded within the url
+      * quote the url
+
+    Only the "parse" method is called directly, and it calls sub-methods.
+
+    An instance of this class is held in the options object, which
+    means that it's easy to change the behavior by sub-classing and
+    passing the replacement in.  It need only have a method like:
+
+        url, parts = urlparser.parse(url, opts)
+    """
+
+    def parse(self, url, opts):
+        """parse the url and return the (modified) url and its parts
+
+        Note: a raw file WILL be quoted when it's converted to a URL.
+        However, other urls (ones which come with a proper scheme) may
+        or may not be quoted according to opts.quote
+
+          opts.quote = 1     --> quote it
+          opts.quote = 0     --> do not quote it
+          opts.quote = None  --> guess
+        """
+        quote = opts.quote
+
+        if opts.prefix:
+            url = self.add_prefix(url, opts.prefix)
+
+        parts = urlparse.urlparse(url)
+        (scheme, host, path, parm, query, frag) = parts
+
+        if not scheme or (len(scheme) == 1 and scheme in string.letters):
+            # if a scheme isn't specified, we guess that it's "file:"
+            if url[0] not in '/\\': url = os.path.abspath(url)
+            url = 'file:' + urllib.pathname2url(url)
+            parts = urlparse.urlparse(url)
+            quote = 0 # pathname2url quotes, so we won't do it again
+
+        if scheme in ['http', 'https']:
+            parts = self.process_http(parts)
+
+        if quote is None:
+            quote = self.guess_should_quote(parts)
+        if quote:
+            parts = self.quote(parts)
+
+        url = urlparse.urlunparse(parts)
+        return url, parts
+
+    def add_prefix(self, url, prefix):
+        if prefix[-1] == '/' or url[0] == '/':
+            url = prefix + url
+        else:
+            url = prefix + '/' + url
+        return url
+
+    def process_http(self, parts):
+        (scheme, host, path, parm, query, frag) = parts
+
+        if '@' in host and auth_handler:
+            try:
+                user_pass, host = host.split('@', 1)
+                if ':' in user_pass:
+                    user, password = user_pass.split(':', 1)
+            except ValueError, e:
+                raise URLGrabError(1, _('Bad URL: %s') % url)
+            if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password)
+            auth_handler.add_password(None, host, user, password)
+
+        return (scheme, host, path, parm, query, frag)
+
+    def quote(self, parts):
+        """quote the URL
+
+        This method quotes ONLY the path part.  If you need to quote
+        other parts, you should override this and pass in your derived
+        class.  The other alternative is to quote other parts before
+        passing into urlgrabber.
+        """
+        (scheme, host, path, parm, query, frag) = parts
+        path = urllib.quote(path)
+        return (scheme, host, path, parm, query, frag)
+
+    hexvals = '0123456789ABCDEF'
+    def guess_should_quote(self, parts):
+        """
+        Guess whether we should quote a path.  This amounts to
+        guessing whether it's already quoted.
+
+          find ' '   ->  1
+          find '%'   ->  1
+          find '%XX' ->  0
+          else       ->  1
+        """
+        (scheme, host, path, parm, query, frag) = parts
+        if ' ' in path:
+            return 1
+        ind = string.find(path, '%')
+        if ind > -1:
+            while ind > -1:
+                if len(path) < ind+3:
+                    return 1
+                code = path[ind+1:ind+3].upper()
+                if code[0] not in self.hexvals or \
+                   code[1] not in self.hexvals:
+                    return 1
+                ind = string.find(path, '%', ind+1)
+            return 0
+        return 1
+
 class URLGrabberOptions:
     """Class to ease kwargs handling."""
@@ -667,6 +806,8 @@
         self.http_headers = None
         self.ftp_headers = None
         self.data = None
+        self.urlparser = URLParser()
+        self.quote = None

 class URLGrabber:
     """Provides easy opening of URLs with a variety of options.
@@ -735,7 +876,7 @@
         like any other file object.
         """
         opts = self.opts.derive(**kwargs)
-        (url,parts) = self._parse_url(url)
+        (url,parts) = opts.urlparser.parse(url, opts)
         def retryfunc(opts, url):
             return URLGrabberFileObject(url, filename=None, opts=opts)
         return self._retry(opts, retryfunc, url)
@@ -747,16 +888,16 @@
         different from the passed-in filename if copy_local == 0.
         """
         opts = self.opts.derive(**kwargs)
-        (url, parts) = self._parse_url(url)
+        (url,parts) = opts.urlparser.parse(url, opts)
         (scheme, host, path, parm, query, frag) = parts
         if filename is None:
-            if scheme in [ 'http', 'https' ]:
-                filename = os.path.basename( urllib.unquote(path) )
-            else:
-                filename = os.path.basename( path )
+            filename = os.path.basename( urllib.unquote(path) )
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a
             # copy currently
+            path = urllib.url2pathname(path)
+            if host:
+                path = os.path.normpath('//' + host + path)
             if not os.path.exists(path):
                 raise URLGrabError(2,
                       _('Local file does not exist: %s') % (path, ))
@@ -791,7 +932,7 @@
         into memory, but don't use too much'
         """
         opts = self.opts.derive(**kwargs)
-        (url, parts) = self._parse_url(url)
+        (url,parts) = opts.urlparser.parse(url, opts)

         if limit is not None:
             limit = limit + 1
@@ -823,41 +964,6 @@
                 _('Exceeded limit (%i): %s') % (limit, url))
         return s

-    def _parse_url(self,url):
-        """break up the url into its component parts
-
-        This function disassembles a url and
-        1) "normalizes" it, tidying it up a bit
-        2) does any authentication stuff it needs to do
-
-        it returns the (cleaned) url and a tuple of component parts
-        """
-        if self.opts.prefix:
-            p = self.opts.prefix
-            if p[-1] == '/' or url[0] == '/': url = p + url
-            else: url = p + '/' + url
-
-        (scheme, host, path, parm, query, frag) = \
-            urlparse.urlparse(url)
-        if not scheme:
-            if not url[0] == '/': url = os.path.abspath(url)
-            url = 'file:' + url
-            (scheme, host, path, parm, query, frag) = \
-                urlparse.urlparse(url)
-        path = os.path.normpath(path)
-        if scheme in ['http', 'https']: path = urllib.quote(path)
-        if '@' in host and auth_handler and scheme in ['http', 'https']:
-            try:
-                user_pass, host = host.split('@', 1)
-                if ':' in user_pass: user, password = user_pass.split(':', 1)
-            except ValueError, e:
-                raise URLGrabError(1, _('Bad URL: %s') % url)
-            if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password)
-            auth_handler.add_password(None, host, user, password)
-        parts = (scheme, host, path, parm, query, frag)
-        url = urlparse.urlunparse(parts)
-        return url, parts
-
     def _make_callback(self, callback_obj):
         if callable(callback_obj):
             return callback_obj, (), {}
@@ -980,6 +1086,7 @@
         fo, hdr = self._make_request(req, opener)

         (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
+        path = urllib.unquote(path)
         if not (self.opts.progress_obj or self.opts.raw_throttle() \
                 or self.opts.timeout):
             # if we're not using the progress_obj, throttling, or timeout
@@ -989,12 +1096,16 @@
             if hasattr(fo, 'readline'):
                 self.readline = fo.readline
         elif self.opts.progress_obj:
-            try:    length = int(hdr['Content-Length'])
-            except: length = None
-            self.opts.progress_obj.start(str(self.filename), self.url,
+            try:
+                length = int(hdr['Content-Length'])
+                length = length + self._amount_read # Account for regets
+            except (KeyError, ValueError, TypeError):
+                length = None
+
+            self.opts.progress_obj.start(str(self.filename),
+                                         urllib.unquote(self.url),
                                          os.path.basename(path),
-                                         length,
-                                         text=self.opts.text)
+                                         length, text=self.opts.text)
             self.opts.progress_obj.update(0)

         (self.fo, self.hdr) = (fo, hdr)
@@ -1024,6 +1135,10 @@
             else:
                 self.reget_time = s[ST_MTIME]
                 reget_length = s[ST_SIZE]
+
+                # Set initial length when regetting
+                self._amount_read = reget_length
+
                 rt = reget_length, ''
                 self.append = 1
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/urlgrabber/mirror.py new/urlgrabber-2.9.9/urlgrabber/mirror.py
--- old/urlgrabber-2.9.7/urlgrabber/mirror.py  2005-10-22 23:57:28.000000000 +0200
+++ new/urlgrabber-2.9.9/urlgrabber/mirror.py  2006-02-22 19:26:46.000000000 +0100
@@ -86,7 +86,7 @@

 """

-# $Id: mirror.py,v 1.13 2005/10/22 21:57:28 mstenner Exp $
+# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $

 import random
 import thread  # needed for locking to make this threadsafe
@@ -346,14 +346,14 @@
             del self.mirrors[ind]
         elif self._next == ind and action.get('increment_master', 1):
             self._next += 1
-            if self._next >= len(self.mirrors): self._next = 0
+        if self._next >= len(self.mirrors): self._next = 0
         self._lock.release()

         if action.get('remove', 1):
             del gr.mirrors[gr._next]
         elif action.get('increment', 1):
             gr._next += 1
-            if gr._next >= len(gr.mirrors): gr._next = 0
+        if gr._next >= len(gr.mirrors): gr._next = 0

         if DEBUG:
             grm = [m['mirror'] for m in gr.mirrors]

++++++ urlgrabber-read-error.patch ++++++
--- urlgrabber/grabber.py
+++ urlgrabber/grabber.py
@@ -1140,6 +1140,8 @@
             raise URLGrabError(4, _('Socket Error: %s') % (e, ))
         except TimeoutError, e:
             raise URLGrabError(12, _('Timeout: %s') % (e, ))
+        except IOError, e:
+            raise URLGrabError(4, _('IOError: %s') %(e,))
         newsize = len(new)
         if not newsize: break # no more to read

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
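The urlgrabber-read-error.patch applied above extends grabber.py's existing
practice of converting low-level read failures (socket errors, timeouts) into
URLGrabError, so retry and mirror-failover code only has to catch one exception
type. A minimal sketch of that pattern follows; GrabError and read_block are
hypothetical stand-in names for illustration, not the patched module itself:

    class GrabError(Exception):
        """Single exception type for callers to catch (a stand-in for
        URLGrabError)."""
        def __init__(self, errno, message):
            Exception.__init__(self, message)
            self.errno = errno

    def read_block(fo, size=8192):
        # Normalize transport-level failures: with the patch applied, a
        # plain IOError from the underlying file object is reported the
        # same way as a socket error (errno 4), instead of escaping and
        # aborting the caller mid-download.
        try:
            return fo.read(size)
        except IOError as e:
            raise GrabError(4, 'IOError: %s' % (e,))

A mirror loop can then handle GrabError uniformly and move on to the next
mirror, which is exactly the failure mode the Fedora patch addresses.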
Remember to have fun...