Hello community,

here is the log from the commit of package python-urlgrabber
checked in at Fri May 26 15:06:49 CEST 2006.

--------
--- python-urlgrabber/python-urlgrabber.changes 2006-02-28 16:46:03.000000000 +0100
+++ python-urlgrabber/python-urlgrabber.changes 2006-05-25 14:27:40.000000000 +0200
@@ -1,0 +2,14 @@
+Thu May 25 14:19:34 CEST 2006 - cthiel@suse.de
+
+- update to version 2.9.9
+  * Added tests to make sure that the "quote" option works as advertised
+  * Significant improvement to URL parsing. Parsing is now broken out into
+    a separate class (URLParser). It will now (by default) guess whether a
+    URL is already quoted, properly handle local files and URLs on windows,
+    and display un-quoted versions of the filename in the progress meter.
+  * Added a reget progress bar patch from Menno, and fixed the annoying
+    _next IndexError bug.
+- added urlgrabber-read-error.patch (from Fedora)
+- removed python-urlgrabber-2.9.7-reget.patch (included upstream)
+
+-------------------------------------------------------------------
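For readers skimming the large diff below: the new quote-guessing described in
the changelog boils down to "treat a path as already quoted only if it contains
no literal spaces and every '%' starts a valid %XX escape". Here is a minimal,
self-contained sketch of that rule for illustration only; the shipped logic is
URLParser.guess_should_quote in the grabber.py hunk further down, and this
re-implementation is not the code from the tarball:

    import re

    def guess_should_quote(path):
        """Return True if `path` still needs quoting, False if it
        already looks %XX-quoted."""
        if ' ' in path:
            return True      # a literal space never survives quoting
        if '%' not in path:
            return True      # no escapes at all; quoting is a harmless no-op
        # already quoted only if every '%' begins a valid %XX escape
        return not all(re.match(r'%[0-9a-fA-F]{2}', path[i:])
                       for i in range(len(path)) if path[i] == '%')

    # consistent with the vectors in the new URLParserTestCase below:
    assert guess_should_quote('/Should Be Quoted')
    assert not guess_should_quote('/Already%20Quoted')

Setting the new "quote" option to 1 or 0 overrides this guess in either
direction.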
Old:
----
  python-urlgrabber-2.9.7-reget.diff
  python-urlgrabber-2.9.7.diff
  urlgrabber-2.9.7.tar.bz2

New:
----
  python-urlgrabber-2.9.9.patch
  urlgrabber-2.9.9.tar.bz2
  urlgrabber-read-error.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-urlgrabber.spec ++++++
--- /var/tmp/diff_new_pack.dGSdOG/_old  2006-05-26 15:04:41.000000000 +0200
+++ /var/tmp/diff_new_pack.dGSdOG/_new  2006-05-26 15:04:41.000000000 +0200
@@ -1,5 +1,5 @@
 #
-# spec file for package python-urlgrabber (Version 2.9.7)
+# spec file for package python-urlgrabber (Version 2.9.9)
 #
 # Copyright (c) 2006 SUSE LINUX Products GmbH, Nuernberg, Germany.
 # This file and all modifications and additions to the pristine
@@ -12,15 +12,15 @@
 Name:          python-urlgrabber
 BuildRequires: python-devel
-Version:       2.9.7
-Release:       6
+Version:       2.9.9
+Release:       1
 Summary:       A high-level cross-protocol url-grabber
 Group:         Development/Libraries/Python
 License:       LGPL
 URL:           http://linux.duke.edu/projects/urlgrabber/
 Source:        urlgrabber-%{version}.tar.bz2
-Patch:         %{name}-%{version}-reget.diff
-Patch1:        %{name}-%{version}.diff
+Patch:         %{name}-%{version}.patch
+Patch1:        urlgrabber-read-error.patch
 BuildRoot:     %{_tmppath}/%{name}-%{version}-build
 %py_requires
@@ -39,7 +39,7 @@

 %prep
 %setup -q -n urlgrabber-%{version}
-%patch -p1
+%patch
 %patch1

 %build
@@ -59,6 +59,17 @@
 %{py_sitedir}/urlgrabber

 %changelog -n python-urlgrabber
+* Thu May 25 2006 - cthiel@suse.de
+- update to version 2.9.9
+  * Added tests to make sure that the "quote" option works as advertised
+  * Significant improvement to URL parsing. Parsing is now broken out into
+    a separate class (URLParser). It will now (by default) guess whether a
+    URL is already quoted, properly handle local files and URLs on windows,
+    and display un-quoted versions of the filename in the progress meter.
+  * Added a reget progress bar patch from Menno, and fixed the annoying
+    _next IndexError bug.
+- added urlgrabber-read-error.patch (from Fedora)
+- removed python-urlgrabber-2.9.7-reget.patch (included upstream)
 * Tue Feb 28 2006 - jmatejek@suse.cz
 - updated to reflect python changes due to #149809
 * Wed Jan 25 2006 - mls@suse.de

++++++ python-urlgrabber-2.9.9.patch ++++++
--- setup.py
+++ setup.py
@@ -15,8 +15,6 @@
 packages = ['urlgrabber']
 package_dir = {'urlgrabber':'urlgrabber'}
 scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
-               ['README','LICENSE', 'TODO', 'ChangeLog'])]
 options = { 'clean' : { 'all' : 1 } }
 classifiers = [
    'Development Status :: 4 - Beta',

++++++ urlgrabber-2.9.7.tar.bz2 -> urlgrabber-2.9.9.tar.bz2 ++++++
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/ChangeLog new/urlgrabber-2.9.9/ChangeLog
--- old/urlgrabber-2.9.7/ChangeLog      2005-10-23 00:05:26.000000000 +0200
+++ new/urlgrabber-2.9.9/ChangeLog      2006-03-02 22:06:42.000000000 +0100
@@ -1,3 +1,52 @@
+2006-03-02  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * urlgrabber/__init__.py:
+
+        release 2.9.9
+
+2006-03-02  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * test/test_grabber.py:
+
+        Added tests to make sure that the "quote" option works as
+        advertised.
+
+2006-03-02  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * scripts/urlgrabber, test/test_grabber.py, urlgrabber/grabber.py:
+
+        Significant improvement to URL parsing.  Parsing is now broken out
+        into a separate class (URLParser).  It will now (by default) guess
+        whether a URL is already quoted, properly handle local files and
+        URLs on windows, and display un-quoted versions of the filename in
+        the progress meter.
+
+2006-02-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * ChangeLog:
+
+        updated ChangeLog
+
+2006-02-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * urlgrabber/__init__.py:
+
+        release 2.9.8
+
+2006-02-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * urlgrabber/: grabber.py, mirror.py:
+
+        Added a reget progress bar patch from Menno, and fixed the annoying
+        _next IndexError bug.  Thanks to Edinelson Keiji Shimokawa for
+        getting me looking in the right direction.
+
+2005-10-22  Michael D. Stenner <mstenner@linux.duke.edu>
+
+        * ChangeLog:
+
+        updated ChangeLog
+
 2005-10-22  Michael D. Stenner <mstenner@linux.duke.edu>

        * ChangeLog, TODO, urlgrabber/__init__.py:
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/PKG-INFO new/urlgrabber-2.9.9/PKG-INFO
--- old/urlgrabber-2.9.7/PKG-INFO       2005-10-23 00:05:37.000000000 +0200
+++ new/urlgrabber-2.9.9/PKG-INFO       2006-03-02 22:06:52.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: urlgrabber
-Version: 2.9.7
+Version: 2.9.9
 Summary: A high-level cross-protocol url-grabber
 Home-page: http://linux.duke.edu/projects/urlgrabber/
 Author: Michael D. Stenner, Ryan Tomayko
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/scripts/urlgrabber new/urlgrabber-2.9.9/scripts/urlgrabber
--- old/urlgrabber-2.9.7/scripts/urlgrabber    2004-09-07 23:19:54.000000000 +0200
+++ new/urlgrabber-2.9.9/scripts/urlgrabber    2006-03-02 21:56:56.000000000 +0100
@@ -62,7 +62,7 @@
   explicitly.

 """
-# $Id: urlgrabber,v 1.4 2004/09/07 21:19:54 mstenner Exp $
+# $Id: urlgrabber,v 1.5 2006/03/02 20:56:56 mstenner Exp $

 import sys
 import urlgrabber.grabber
@@ -112,7 +112,7 @@
     print 'URL: ', url
     print 'FILE: ', file

-    try: from progress import text_progress_meter
+    try: from urlgrabber.progress import text_progress_meter
     except ImportError, e: pass
     else: kwargs['progress_obj'] = text_progress_meter()
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/test/test_grabber.py new/urlgrabber-2.9.9/test/test_grabber.py
--- old/urlgrabber-2.9.7/test/test_grabber.py  2005-10-22 23:57:27.000000000 +0200
+++ new/urlgrabber-2.9.9/test/test_grabber.py  2006-03-02 22:06:00.000000000 +0100
@@ -21,7 +21,7 @@

 """grabber.py tests"""

-# $Id: test_grabber.py,v 1.28 2005/10/22 21:57:27 mstenner Exp $
+# $Id: test_grabber.py,v 1.30 2006/03/02 21:06:00 mstenner Exp $

 import sys
 import os
@@ -33,30 +33,34 @@

 import urlgrabber
 import urlgrabber.grabber as grabber
-from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject
+from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject, \
+     URLParser
 from urlgrabber.progress import text_progress_meter

 class FileObjectTests(TestCase):
     def setUp(self):
         self.filename = tempfile.mktemp()
-        fo = open(self.filename, 'w')
+        fo = file(self.filename, 'wb')
         fo.write(reference_data)
         fo.close()

         self.fo_input = cStringIO.StringIO(reference_data)
         self.fo_output = cStringIO.StringIO()
-        self.wrapper = grabber.URLGrabberFileObject('file://' + self.filename, self.fo_output,
-                                                    grabber.default_grabber.opts)
+        (url, parts) = grabber.default_grabber.opts.urlparser.parse(
+            self.filename, grabber.default_grabber.opts)
+        self.wrapper = grabber.URLGrabberFileObject(
+            url, self.fo_output, grabber.default_grabber.opts)

     def tearDown(self):
+        self.wrapper.close()
         os.unlink(self.filename)

     def test_readall(self):
         "URLGrabberFileObject .read() method"
         s = self.wrapper.read()
         self.fo_output.write(s)
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

     def test_readline(self):
         "URLGrabberFileObject .readline() method"
@@ -64,13 +68,13 @@
             s = self.wrapper.readline()
             self.fo_output.write(s)
             if not s: break
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

     def test_readlines(self):
         "URLGrabberFileObject .readlines() method"
         li = self.wrapper.readlines()
         self.fo_output.write(string.join(li, ''))
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

     def test_smallread(self):
         "URLGrabberFileObject .read(N) with small N"
@@ -78,7 +82,7 @@
             s = self.wrapper.read(23)
             self.fo_output.write(s)
             if not s: break
-        self.assertEqual(reference_data, self.fo_output.getvalue())
+        self.assert_(reference_data == self.fo_output.getvalue())

 class HTTPTests(TestCase):
     def test_reference_file(self):
@@ -86,11 +90,11 @@
         "download reference file via HTTP"
         filename = tempfile.mktemp()
         grabber.urlgrab(ref_http, filename)

-        fo = open(filename)
+        fo = file(filename, 'rb')
         contents = fo.read()
         fo.close()

-        self.assertEqual(contents, reference_data)
+        self.assert_(contents == reference_data)

     def test_post(self):
         "do an HTTP post"
@@ -130,7 +134,8 @@
     """Test grabber.URLGrabber class"""

     def setUp(self):
-        self.meter = text_progress_meter( fo=open('/dev/null', 'w') )
+
+        self.meter = text_progress_meter( fo=cStringIO.StringIO() )
         pass

     def tearDown(self):
@@ -180,49 +185,110 @@
         nopts.opener = None
         self.assertEquals( nopts.opener, None )

-    def test_parse_url(self):
-        """grabber.URLGrabber._parse_url()"""
-        g = URLGrabber()
-        (url, parts) = g._parse_url('http://user:pass@host.com/path/part/basename.ext?arg1=val1&arg2=val2#hash')
-        (scheme, host, path, parm, query, frag) = parts
-        self.assertEquals('http://host.com/path/part/basename.ext?arg1=val1&arg2=val2#hash',url)
-        self.assertEquals('http', scheme)
-        self.assertEquals('host.com', host)
-        self.assertEquals('/path/part/basename.ext', path)
-        self.assertEquals('arg1=val1&arg2=val2', query)
-        self.assertEquals('hash', frag)
-
-    def test_parse_url_local_filename(self):
-        """grabber.URLGrabber._parse_url('/local/file/path') """
+    def test_make_callback(self):
+        """grabber.URLGrabber._make_callback() tests"""
+        def cb(e): pass
+        tup_cb = (cb, ('stuff'), {'some': 'dict'})
         g = URLGrabber()
-        (url, parts) = g._parse_url('/etc/redhat-release')
-        (scheme, host, path, parm, query, frag) = parts
-        self.assertEquals('file:///etc/redhat-release',url)
-        self.assertEquals('file', scheme)
-        self.assertEquals('', host)
-        self.assertEquals('/etc/redhat-release', path)
-        self.assertEquals('', query)
-        self.assertEquals('', frag)
+        self.assertEquals(g._make_callback(cb), (cb, (), {}))
+        self.assertEquals(g._make_callback(tup_cb), tup_cb)
+
+class URLParserTestCase(TestCase):
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass

     def test_parse_url_with_prefix(self):
-        """grabber.URLGrabber._parse_url() with .prefix"""
+        """grabber.URLParser.parse() with opts.prefix"""
         base = 'http://foo.com/dir'
         bases = [base, base+'/']
-        file = 'bar/baz'
-        target = base + '/' + file
+        filename = 'bar/baz'
+        target = base + '/' + filename

         for b in bases:
             g = URLGrabber(prefix=b)
-            (url, parts) = g._parse_url(file)
+            (url, parts) = g.opts.urlparser.parse(filename, g.opts)
             self.assertEquals(url, target)

-    def test_make_callback(self):
-        """grabber.URLGrabber._make_callback() tests"""
-        def cb(e): pass
-        tup_cb = (cb, ('stuff'), {'some': 'dict'})
+    def _test_url(self, urllist):
         g = URLGrabber()
-        self.assertEquals(g._make_callback(cb), (cb, (), {}))
-        self.assertEquals(g._make_callback(tup_cb), tup_cb)
+        try: quote = urllist[3]
+        except IndexError: quote = None
+        g.opts.quote = quote
+        (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts)
+
+        if 1:
+            self.assertEquals(url, urllist[1])
+            self.assertEquals(parts, urllist[2])
+        else:
+            if url == urllist[1] and parts == urllist[2]:
+                print 'OK: %s' % urllist[0]
+            else:
+                print 'ERROR: %s' % urllist[0]
+                print '  ' + urllist[1]
+                print '  ' + url
+                print '  ' + urllist[2]
+                print '  ' + parts
+
+    url_tests_all = (
+        ['http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash',
+         'http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash',
+         ('http', 'host.com', '/path/basename.ext', '',
+          'arg1=val1&arg2=val2', 'hash')],
+        ['http://host.com/Path With Spaces/',
+         'http://host.com/Path%20With%20Spaces/',
+         ('http', 'host.com', '/Path%20With%20Spaces/', '', '', '')],
+        ['http://user:pass@host.com:80/',
+         'http://host.com:80/',
+         ('http', 'host.com:80', '/', '', '', '')],
+        ['http://host.com/Already%20Quoted',
+         'http://host.com/Already%20Quoted',
+         ('http', 'host.com', '/Already%20Quoted', '', '', '')],
+        ['http://host.com/Should Be Quoted',
+         'http://host.com/Should Be Quoted',
+         ('http', 'host.com', '/Should Be Quoted', '', '', ''), 0],
+        ['http://host.com/Should%20Not',
+         'http://host.com/Should%2520Not',
+         ('http', 'host.com', '/Should%2520Not', '', '', ''), 1],
+        )
+
+    url_tests_posix = (
+        ['/etc/passwd',
+         'file:///etc/passwd',
+         ('file', '', '/etc/passwd', '', '', '')],
+        )
+
+    url_tests_nt = (
+        [r'\\foo.com\path\file.ext',
+         'file://foo.com/path/file.ext',
+         ('file', '', '//foo.com/path/file.ext', '', '', '')],
+        [r'C:\path\file.ext',
+         'file:///C|/path/file.ext',
+         ('file', '', '/C|/path/file.ext', '', '', '')],
+        )
+
+    def test_url_parser_all_os(self):
+        """test url parsing common to all OSs"""
+        for f in self.url_tests_all:
+            self._test_url(f)
+
+    def test_url_parser_posix(self):
+        """test url parsing on posix systems"""
+        if not os.name == 'posix':
+            self.skip()
+        for f in self.url_tests_posix:
+            self._test_url(f)
+
+    def test_url_parser_nt(self):
+        """test url parsing on windows systems"""
+        if not os.name == 'nt':
+            self.skip()
+        for f in self.url_tests_nt:
+            self._test_url(f)

 class FailureTestCase(TestCase):
     """Test failure behavior"""
@@ -383,12 +449,12 @@
         except: pass

     def _make_half_zero_file(self):
-        fo = open(self.filename, 'w')
+        fo = file(self.filename, 'wb')
         fo.write('0'*self.hl)
         fo.close()

     def _read_file(self):
-        fo = open(self.filename, 'r')
+        fo = file(self.filename, 'rb')
         data = fo.read()
         fo.close()
         return data
@@ -451,12 +517,14 @@
     def setUp(self):
         self.ref = short_reference_data
         tmp = tempfile.mktemp()
-        tmpfo = open(tmp, 'w')
+        tmpfo = file(tmp, 'wb')
         tmpfo.write(self.ref)
         tmpfo.close()
         self.tmp = tmp
-        self.url = 'file://' + tmp
+        (url, parts) = grabber.default_grabber.opts.urlparser.parse(
+            tmp, grabber.default_grabber.opts)
+        self.url = url
         self.grabber = grabber.URLGrabber(reget='check_timestamp',
                                           copy_local=1)
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/urlgrabber/__init__.py new/urlgrabber-2.9.9/urlgrabber/__init__.py
--- old/urlgrabber-2.9.7/urlgrabber/__init__.py        2005-10-23 00:05:12.000000000 +0200
+++ new/urlgrabber-2.9.9/urlgrabber/__init__.py        2006-03-02 22:06:35.000000000 +0100
@@ -12,9 +12,9 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko

-# $Id: __init__.py,v 1.15 2005/10/22 22:05:12 mstenner Exp $
+# $Id: __init__.py,v 1.17 2006/03/02 21:06:35 mstenner Exp $

 """A high-level cross-protocol url-grabber.

@@ -44,8 +44,8 @@
   automatically switching mirrors if there is a failure.
 """

-__version__ = '2.9.7'
-__date__ = '2005/10/22'
+__version__ = '2.9.9'
+__date__ = '2006/03/02'
 __author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
              'Ryan Tomayko <rtomayko@naeblis.cx>'
 __url__ = 'http://linux.duke.edu/projects/urlgrabber/'
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/urlgrabber/grabber.py new/urlgrabber-2.9.9/urlgrabber/grabber.py
--- old/urlgrabber-2.9.7/urlgrabber/grabber.py 2005-10-22 23:57:28.000000000 +0200
+++ new/urlgrabber-2.9.9/urlgrabber/grabber.py 2006-03-02 21:56:57.000000000 +0100
@@ -283,6 +283,28 @@
     passed the same arguments, so you could use the same function for
     both.

+  urlparser = URLParser()
+
+    The URLParser class handles pre-processing of URLs, including
+    auth-handling for user/pass encoded in http urls, file handling
+    (that is, filenames not sent as a URL), and URL quoting.  If you
+    want to override any of this behavior, you can pass in a
+    replacement instance.  See also the 'quote' option.
+
+  quote = None
+
+    Whether or not to quote the path portion of a url.
+      quote = 1 -> quote the URLs (they're not quoted yet)
+      quote = 0 -> do not quote them (they're already quoted)
+      quote = None -> guess what to do
+
+    This option only affects proper urls like 'file:///etc/passwd'; it
+    does not affect 'raw' filenames like '/etc/passwd'.  The latter
+    will always be quoted as they are converted to URLs.  Also, only
+    the path part of a url is quoted.  If you need more fine-grained
+    control, you should probably subclass URLParser and pass it in via
+    the 'urlparser' option.
+
 BANDWIDTH THROTTLING

   urlgrabber supports throttling via two values: throttle and
@@ -342,7 +364,7 @@

 """

-# $Id: grabber.py,v 1.43 2005/10/22 21:57:28 mstenner Exp $
+# $Id: grabber.py,v 1.45 2006/03/02 20:56:57 mstenner Exp $

 import os
 import os.path
@@ -593,6 +615,123 @@
     return default_grabber.urlread(url, limit, **kwargs)

+class URLParser:
+    """Process the URLs before passing them to urllib2.
+
+    This class does several things:
+
+      * add any prefix
+      * translate a "raw" file to a proper file: url
+      * handle any http or https auth that's encoded within the url
+      * quote the url
+
+    Only the "parse" method is called directly, and it calls sub-methods.
+
+    An instance of this class is held in the options object, which
+    means that it's easy to change the behavior by sub-classing and
+    passing the replacement in.  It need only have a method like:
+
+        url, parts = urlparser.parse(url, opts)
+    """
+
+    def parse(self, url, opts):
+        """parse the url and return the (modified) url and its parts
+
+        Note: a raw file WILL be quoted when it's converted to a URL.
+        However, other urls (ones which come with a proper scheme) may
+        or may not be quoted according to opts.quote
+
+          opts.quote = 1     --> quote it
+          opts.quote = 0     --> do not quote it
+          opts.quote = None  --> guess
+        """
+        quote = opts.quote
+
+        if opts.prefix:
+            url = self.add_prefix(url, opts.prefix)
+
+        parts = urlparse.urlparse(url)
+        (scheme, host, path, parm, query, frag) = parts
+
+        if not scheme or (len(scheme) == 1 and scheme in string.letters):
+            # if a scheme isn't specified, we guess that it's "file:"
+            if url[0] not in '/\\': url = os.path.abspath(url)
+            url = 'file:' + urllib.pathname2url(url)
+            parts = urlparse.urlparse(url)
+            quote = 0 # pathname2url quotes, so we won't do it again
+
+        if scheme in ['http', 'https']:
+            parts = self.process_http(parts)
+
+        if quote is None:
+            quote = self.guess_should_quote(parts)
+        if quote:
+            parts = self.quote(parts)
+
+        url = urlparse.urlunparse(parts)
+        return url, parts
+
+    def add_prefix(self, url, prefix):
+        if prefix[-1] == '/' or url[0] == '/':
+            url = prefix + url
+        else:
+            url = prefix + '/' + url
+        return url
+
+    def process_http(self, parts):
+        (scheme, host, path, parm, query, frag) = parts
+
+        if '@' in host and auth_handler:
+            try:
+                user_pass, host = host.split('@', 1)
+                if ':' in user_pass:
+                    user, password = user_pass.split(':', 1)
+            except ValueError, e:
+                raise URLGrabError(1, _('Bad URL: %s') % url)
+            if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password)
+            auth_handler.add_password(None, host, user, password)
+
+        return (scheme, host, path, parm, query, frag)
+
+    def quote(self, parts):
+        """quote the URL
+
+        This method quotes ONLY the path part.  If you need to quote
+        other parts, you should override this and pass in your derived
+        class.  The other alternative is to quote other parts before
+        passing into urlgrabber.
+        """
+        (scheme, host, path, parm, query, frag) = parts
+        path = urllib.quote(path)
+        return (scheme, host, path, parm, query, frag)
+
+    hexvals = '0123456789ABCDEF'
+    def guess_should_quote(self, parts):
+        """
+        Guess whether we should quote a path.  This amounts to
+        guessing whether it's already quoted.
+
+          find ' '   ->  1
+          find '%'   ->  1
+          find '%XX' ->  0
+          else       ->  1
+        """
+        (scheme, host, path, parm, query, frag) = parts
+        if ' ' in path:
+            return 1
+        ind = string.find(path, '%')
+        if ind > -1:
+            while ind > -1:
+                if len(path) < ind+3:
+                    return 1
+                code = path[ind+1:ind+3].upper()
+                if code[0] not in self.hexvals or \
+                   code[1] not in self.hexvals:
+                    return 1
+                ind = string.find(path, '%', ind+1)
+            return 0
+        return 1
+
 class URLGrabberOptions:
     """Class to ease kwargs handling."""
@@ -667,6 +806,8 @@
         self.http_headers = None
         self.ftp_headers = None
         self.data = None
+        self.urlparser = URLParser()
+        self.quote = None

 class URLGrabber:
     """Provides easy opening of URLs with a variety of options.
@@ -735,7 +876,7 @@
         like any other file object.
         """
         opts = self.opts.derive(**kwargs)
-        (url,parts) = self._parse_url(url)
+        (url,parts) = opts.urlparser.parse(url, opts)
         def retryfunc(opts, url):
             return URLGrabberFileObject(url, filename=None, opts=opts)
         return self._retry(opts, retryfunc, url)
@@ -747,16 +888,16 @@
         different from the passed-in filename if copy_local == 0.
         """
         opts = self.opts.derive(**kwargs)
-        (url, parts) = self._parse_url(url)
+        (url,parts) = opts.urlparser.parse(url, opts)
         (scheme, host, path, parm, query, frag) = parts
         if filename is None:
-            if scheme in [ 'http', 'https' ]:
-                filename = os.path.basename( urllib.unquote(path) )
-            else:
-                filename = os.path.basename( path )
+            filename = os.path.basename( urllib.unquote(path) )
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a
             # copy currently
+            path = urllib.url2pathname(path)
+            if host:
+                path = os.path.normpath('//' + host + path)
             if not os.path.exists(path):
                 raise URLGrabError(2,
                       _('Local file does not exist: %s') % (path, ))
@@ -791,7 +932,7 @@
         into memory, but don't use too much'
         """
         opts = self.opts.derive(**kwargs)
-        (url, parts) = self._parse_url(url)
+        (url,parts) = opts.urlparser.parse(url, opts)

         if limit is not None:
             limit = limit + 1
@@ -823,41 +964,6 @@
                 _('Exceeded limit (%i): %s') % (limit, url))
         return s

-    def _parse_url(self,url):
-        """break up the url into its component parts
-
-        This function disassembles a url and
-        1) "normalizes" it, tidying it up a bit
-        2) does any authentication stuff it needs to do
-
-        it returns the (cleaned) url and a tuple of component parts
-        """
-        if self.opts.prefix:
-            p = self.opts.prefix
-            if p[-1] == '/' or url[0] == '/': url = p + url
-            else: url = p + '/' + url
-
-        (scheme, host, path, parm, query, frag) = \
-            urlparse.urlparse(url)
-        if not scheme:
-            if not url[0] == '/': url = os.path.abspath(url)
-            url = 'file:' + url
-            (scheme, host, path, parm, query, frag) = \
-                urlparse.urlparse(url)
-        path = os.path.normpath(path)
-        if scheme in ['http', 'https']: path = urllib.quote(path)
-        if '@' in host and auth_handler and scheme in ['http', 'https']:
-            try:
-                user_pass, host = host.split('@', 1)
-                if ':' in user_pass: user, password = user_pass.split(':', 1)
-            except ValueError, e:
-                raise URLGrabError(1, _('Bad URL: %s') % url)
-            if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password)
-            auth_handler.add_password(None, host, user, password)
-        parts = (scheme, host, path, parm, query, frag)
-        url = urlparse.urlunparse(parts)
-        return url, parts
-
     def _make_callback(self, callback_obj):
         if callable(callback_obj):
             return callback_obj, (), {}
@@ -980,6 +1086,7 @@
         fo, hdr = self._make_request(req, opener)

         (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
+        path = urllib.unquote(path)
         if not (self.opts.progress_obj or self.opts.raw_throttle() \
                 or self.opts.timeout):
             # if we're not using the progress_obj, throttling, or timeout
@@ -989,12 +1096,16 @@
             if hasattr(fo, 'readline'):
                 self.readline = fo.readline
         elif self.opts.progress_obj:
-            try:    length = int(hdr['Content-Length'])
-            except: length = None
-            self.opts.progress_obj.start(str(self.filename), self.url,
+            try:
+                length = int(hdr['Content-Length'])
+                length = length + self._amount_read # Account for regets
+            except (KeyError, ValueError, TypeError):
+                length = None
+
+            self.opts.progress_obj.start(str(self.filename),
+                                         urllib.unquote(self.url),
                                          os.path.basename(path),
-                                         length,
-                                         text=self.opts.text)
+                                         length, text=self.opts.text)
             self.opts.progress_obj.update(0)

         (self.fo, self.hdr) = (fo, hdr)
@@ -1024,6 +1135,10 @@
             else:
                 self.reget_time = s[ST_MTIME]
                 reget_length = s[ST_SIZE]
+
+                # Set initial length when regetting
+                self._amount_read = reget_length
+
                 rt = reget_length, ''
                 self.append = 1
diff -urN --exclude=CVS --exclude=.cvsignore --exclude=.svn --exclude=.svnignore old/urlgrabber-2.9.7/urlgrabber/mirror.py new/urlgrabber-2.9.9/urlgrabber/mirror.py
--- old/urlgrabber-2.9.7/urlgrabber/mirror.py  2005-10-22 23:57:28.000000000 +0200
+++ new/urlgrabber-2.9.9/urlgrabber/mirror.py  2006-02-22 19:26:46.000000000 +0100
@@ -86,7 +86,7 @@

 """

-# $Id: mirror.py,v 1.13 2005/10/22 21:57:28 mstenner Exp $
+# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $

 import random
 import thread  # needed for locking to make this threadsafe
@@ -346,14 +346,14 @@
             del self.mirrors[ind]
         elif self._next == ind and action.get('increment_master', 1):
             self._next += 1
-            if self._next >= len(self.mirrors): self._next = 0
+        if self._next >= len(self.mirrors): self._next = 0
         self._lock.release()

         if action.get('remove', 1):
             del gr.mirrors[gr._next]
         elif action.get('increment', 1):
             gr._next += 1
-            if gr._next >= len(gr.mirrors): gr._next = 0
+        if gr._next >= len(gr.mirrors): gr._next = 0

         if DEBUG:
             grm = [m['mirror'] for m in gr.mirrors]

++++++ urlgrabber-read-error.patch ++++++
--- urlgrabber/grabber.py
+++ urlgrabber/grabber.py
@@ -1140,6 +1140,8 @@
             raise URLGrabError(4, _('Socket Error: %s') % (e, ))
         except TimeoutError, e:
             raise URLGrabError(12, _('Timeout: %s') % (e, ))
+        except IOError, e:
+            raise URLGrabError(4, _('IOError: %s') %(e,))
         newsize = len(new)
         if not newsize: break # no more to read

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
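The urlgrabber-read-error.patch applied above extends grabber.py's existing
practice of converting low-level read failures (socket errors, timeouts) into
URLGrabError, so retry and mirror-failover code only has to catch one exception
type. A minimal sketch of that pattern follows; GrabError and read_block are
hypothetical stand-in names for illustration, not the patched module itself:

    class GrabError(Exception):
        """Single exception type for callers to catch (a stand-in for
        URLGrabError)."""
        def __init__(self, errno, message):
            Exception.__init__(self, message)
            self.errno = errno

    def read_block(fo, size=8192):
        # Normalize transport-level failures: with the patch applied, a
        # plain IOError from the underlying file object is reported the
        # same way as a socket error (errno 4), instead of escaping and
        # aborting the caller mid-download.
        try:
            return fo.read(size)
        except IOError as e:
            raise GrabError(4, 'IOError: %s' % (e,))

A mirror loop can then handle GrabError uniformly and move on to the next
mirror, which is exactly the failure mode the Fedora patch addresses.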
Remember to have fun...