def get_html_source(url, referer='', data=0, cj=0, retry_counter=0):
    """Fetch ``url`` and return its decoded body, retrying on urllib2 errors.

    Args:
        url: Address handed to the urllib2 opener.
        referer: Value sent in the ``Referer`` request header.
        data: Request body. When truthy it is passed to ``opener.open`` so
            the request is sent as an HTTP POST; otherwise a GET is issued.
        cj: Cookie jar. When truthy the opener is built with an
            ``HTTPCookieProcessor`` wrapping it.
        retry_counter: Number of attempts that have already failed. Callers
            normally leave this at 0; it is incremented on each retry.

    Returns:
        A ``(content, cj)`` tuple on success, where ``content`` is whatever
        ``decode`` returns for the response. Once ``retry_counter`` exceeds
        3 the function gives up and returns ``('', '')``.
    """
    if retry_counter > 0:
        print('Trying Again...')
    if retry_counter > 3:
        print('Could not get source from url: %s' % (url,))
        return '', ''

    try:
        if cj:
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        else:
            opener = urllib2.build_opener()
        opener.addheaders = [
            ('Referer', referer),
            ('Content-Type', 'application/x-www-form-urlencoded'),
            ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14'),
            ('Accept-Encoding', 'gzip,deflate'),
        ]
        if data:
            # HTTP POST
            usock = opener.open(url, data)
        else:
            # HTTP GET
            usock = opener.open(url)
        try:
            # decode() is defined outside this block. Presumably it undoes
            # the gzip/deflate encoding requested above -- TODO confirm.
            content = decode(usock)
        finally:
            # Release the connection even when decoding fails.
            usock.close()
        return content, cj
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print('The server couldn\'t fulfill the request. for url:  %s' % (url,))
        print('Error code:  %s' % (e.code,))
        # Retry by calling this function itself; the original called the
        # mismatched name get_source here.
        return get_html_source(url, referer, data, cj, retry_counter + 1)
    except urllib2.URLError as e:
        print('We failed to reach a server.')
        print('Reason:  %s' % (e.reason,))
        return get_html_source(url, referer, data, cj, retry_counter + 1)
Please suggest any necessary updates or modifications to this code.
Xem đầy đủ bài viết tại http://love-python.blogspot.com/2009/03/updated-python-code-for-get-html-source.html
No comments:
Post a Comment