Introduction
A crawler that downloads web pages.
Code
Simple download
#!/usr/bin/env python
#coding=utf-8
import urllib2

def download(url):
    print 'Download:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

if __name__ == '__main__':
    download('http://www.baidu.com')
This does not seem to actually pull down Baidu's HTML (note also that the __main__ call discards the return value, so nothing is printed either way).
Retrying the download on 5XX server errors, and setting a User Agent
Many websites dislike being visited by crawlers, but since they cannot block them completely, they set up anti-scraping measures. One example is the User Agent (UA for short). The User Agent is carried in the request Headers, and the server inspects the User Agent in the Headers to decide who is making the request.
Different browsers send different User Agents. If a crawler does not set one, it is easily identified and gets its access restricted. The usual approach is to collect many different User-Agent strings and pick one at random for each request, as in the sketch below.
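A minimal sketch of that idea (the USER_AGENTS pool and random_user_agent() helper below are made up for illustration; the download() function used in the rest of this note keeps a single fixed agent):

import random

# hypothetical pool of User-Agent strings; in practice collect real browser UAs
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def random_user_agent():
    # pick a different UA at random for each request
    return random.choice(USER_AGENTS)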
def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html
Using the sitemap to download the relevant pages
import re

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        print link
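A hedged usage example (assuming the example site publishes its sitemap at /sitemap.xml; substitute the real sitemap URL of the site being crawled):

if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')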
The site may ignore the leading string slug in the URL, so the crawler can rely on the trailing numeric ID alone.
import itertools

def crawl_string():
    # iterate over the numeric IDs until a download fails
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            break
        else:
            pass  # success - the result can be scraped here
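One possible refinement, not part of the snippet above: a single deleted record would stop the whole crawl, so the loop could tolerate a few consecutive misses before giving up (max_errors is an assumed parameter):

import itertools

def crawl_ids(max_errors=5):
    num_errors = 0
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                # too many consecutive missing pages - assume we reached the end
                break
        else:
            num_errors = 0  # reset the error count on success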
Crawling the site by following the links on each page
import re
import urlparse

def get_links(html):
    # return a list of links found in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        print "getlinks", get_links(html)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/(index|view)')
Adding support for parsing robots.txt
import robotparser

def link_crawler(seed_url, link_regex):
    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print "getlinks", get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Proxy
def link_crawler(seed_url, link_regex, proxy=False):
    if proxy:  # the proxy is not working for now
        proxy_info = {
            'host': '106.12.38.133',
            'port': 22
        }
        # we create a handler for the proxy
        proxy_support = urllib2.ProxyHandler({"http": "http://%(host)s:%(port)d" % proxy_info})
        # we create an opener which uses this handler
        opener = urllib2.build_opener(proxy_support)
        # then we install this opener as the default opener for urllib2
        urllib2.install_opener(opener)

        # if the proxy requires authentication
        proxy_info = {
            'host': '106.12.38.133',
            'port': 20,
            'user': 'root',
            'pass': 'Woaini7758258!'
        }
        proxy_support = urllib2.ProxyHandler({"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # htmlpage = urllib2.urlopen("http://sebsauvage.net/").read(200000)

    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print "getlinks", get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Throttling downloads
import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain; call wait() before each download."""
    def __init__(self, delay):
        self.delay = delay
        # timestamp of the last access to each domain
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
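A minimal sketch of how the throttle might be wired in front of the downloads above (the 5-second delay and the throttled_download() wrapper are just illustrative assumptions):

throttle = Throttle(5)  # at least ~5 seconds between requests to the same domain

def throttled_download(url, user_agent='wswp', num_retries=2):
    # sleep if this domain was accessed too recently, then download as before
    throttle.wait(url)
    return download(url, user_agent, num_retries)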
References
https://tieba.baidu.com/p/5832236970