Introduction
A crawler that downloads web pages.
Code
Simple download
#!/usr/bin/env python
#coding=utf-8
import urllib2

def download(url):
    print 'Download:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

if __name__ == '__main__':
    download('http://www.baidu.com')
This does not seem to actually pull down Baidu's HTML (note also that the __main__ call discards the return value, so nothing is printed either way).
Retrying the download on 5XX server errors, and setting a User Agent
Many websites dislike being visited by crawlers, but since they cannot block them completely, they set up anti-scraping measures. One example is the User Agent (UA for short). The User Agent is carried in the request Headers, and the server inspects the User Agent in the Headers to decide who is making the request.
Different browsers send different User Agents. If a crawler does not set one, it is easily identified and gets its access restricted. The usual approach is to collect many different User-Agent strings and pick one at random for each request, as in the sketch below.
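A minimal sketch of that idea (the USER_AGENTS pool and random_user_agent() helper below are made up for illustration; the download() function used in the rest of this note keeps a single fixed agent):

import random

# hypothetical pool of User-Agent strings; in practice collect real browser UAs
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def random_user_agent():
    # pick a different UA at random for each request
    return random.choice(USER_AGENTS)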
def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html
Using the sitemap to download the relevant pages
import re

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        print link
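A hedged usage example (assuming the example site publishes its sitemap at /sitemap.xml; substitute the real sitemap URL of the site being crawled):

if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')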
The site may ignore the leading string slug in the URL, so the crawler can rely on the trailing numeric ID alone.
import itertools

def crawl_string():
    # iterate over the numeric IDs until a download fails
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            break
        else:
            pass  # success - the result can be scraped here
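One possible refinement, not part of the snippet above: a single deleted record would stop the whole crawl, so the loop could tolerate a few consecutive misses before giving up (max_errors is an assumed parameter):

import itertools

def crawl_ids(max_errors=5):
    num_errors = 0
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                # too many consecutive missing pages - assume we reached the end
                break
        else:
            num_errors = 0  # reset the error count on success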
Crawling the site by following the links on each page
import re
import urlparse

def get_links(html):
    # return a list of links found in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        print "getlinks", get_links(html)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/(index|view)')
Adding support for parsing robots.txt
import robotparser

def link_crawler(seed_url, link_regex):
    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print "getlinks", get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Proxy
def link_crawler(seed_url, link_regex, proxy=False):
    if proxy:  # the proxy is not working for now
        proxy_info = {
            'host': '106.12.38.133',
            'port': 22
        }
        # we create a handler for the proxy
        proxy_support = urllib2.ProxyHandler({"http": "http://%(host)s:%(port)d" % proxy_info})
        # we create an opener which uses this handler
        opener = urllib2.build_opener(proxy_support)
        # then we install this opener as the default opener for urllib2
        urllib2.install_opener(opener)

        # if the proxy requires authentication
        proxy_info = {
            'host': '106.12.38.133',
            'port': 20,
            'user': 'root',
            'pass': 'Woaini7758258!'
        }
        proxy_support = urllib2.ProxyHandler({"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # htmlpage = urllib2.urlopen("http://sebsauvage.net/").read(200000)

    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print "getlinks", get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Throttling downloads
import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain; call wait() before each download."""
    def __init__(self, delay):
        self.delay = delay
        # timestamp of the last access to each domain
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
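A minimal sketch of how the throttle might be wired in front of the downloads above (the 5-second delay and the throttled_download() wrapper are just illustrative assumptions):

throttle = Throttle(5)  # at least ~5 seconds between requests to the same domain

def throttled_download(url, user_agent='wswp', num_retries=2):
    # sleep if this domain was accessed too recently, then download as before
    throttle.wait(url)
    return download(url, user_agent, num_retries)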
References
https://tieba.baidu.com/p/5832236970