The script exits without ever executing the function passed to apply_async
from bs4 import BeautifulSoup
import random
import requests
import pymongo
import time
from multiprocessing import Pool
user_agents = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
    'Mobile/13B143 Safari/601.1',
    'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.23 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.23 Mobile Safari/537.36']

# Note the header name must be 'User-Agent'; an underscore would be sent
# verbatim and the real User-Agent would stay at the requests default.
heads = {
    'User-Agent': random.choice(user_agents)
}
# Headers presumably used elsewhere to scrape the xicidaili proxy list;
# they are not referenced in this snippet.
ipHeads = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.xicidaili.com/nn/',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
class douban():
    def __init__(self):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.db = self.client['books']
        self.tool = self.client['tool']
        self.collectIp = self.tool['ip']

    # Fetch one stored proxy from MongoDB and shape it into the dict
    # format requests expects, e.g. {'http': 'http://1.2.3.4:8080'}.
    def getFromSQL(self):
        item = self.collectIp.find_one({'http': 'http'})
        proxies = {item['http']: 'http://' + item['ip'] + ':' + item['port']}
        return proxies
    def getAllTag(self):
        url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        proxies = self.getFromSQL()
        s = requests.get(url, headers=heads, proxies=proxies)
        if s.status_code == 403:
            # Banned proxy: drop it from the proxy pool and retry with a fresh one.
            values = list(proxies.values())[0]
            ip = values.split('//')[1].split(':')[0]
            self.collectIp.remove({'ip': ip})
            proxies = self.getFromSQL()
            s = requests.get(url, headers=heads, proxies=proxies)
        soup = BeautifulSoup(s.text, 'lxml')
        titleTags = soup.find_all('a', class_='tag-title-wrapper')
        tagList = soup.find_all('table', class_='tagCol')
        href = {}
        titleList = []
        # Each heading <a> pairs with the tagCol table that follows it.
        for titleTag, table in zip(titleTags, tagList):
            title = titleTag['name']
            titleList.append(title)
            href[title] = [tr.td.a['href'] for tr in table.find_all('tr')]
        return titleList, href
    def getAllBookUrl(self, title, hrefDic):
        collect = self.db[title]
        for href in hrefDic[title]:
            index = 0
            while True:
                url = 'https://book.douban.com' + href + '?start=' + str(index) + '&type=T'
                proxies = self.getFromSQL()
                s = requests.get(url, headers=heads, proxies=proxies)
                if s.status_code == 403:
                    # Banned proxy: remove it from the proxy pool (not from
                    # the book collection) and retry with a fresh one.
                    values = list(proxies.values())[0]
                    ip = values.split('//')[1].split(':')[0]
                    self.collectIp.remove({'ip': ip})
                    proxies = self.getFromSQL()
                    s = requests.get(url, headers=heads, proxies=proxies)
                soup = BeautifulSoup(s.text, 'lxml')
                liList = soup.find_all('li', class_='subject-item')
                if len(liList):
                    for li in liList:
                        # The book id sits between the fixed URL prefix
                        # and the trailing slash.
                        id = li.find('a')['href'][32:-1]
                        collect.insert({'bookId': id})
                    # Douban lists 20 books per page.
                    index += 20
                    time.sleep(3)
                else:
                    break
if __name__ == '__main__':
    p = Pool(4)
    a = douban()
    titleList, hrefDic = a.getAllTag()
    results = []
    for title in titleList:
        print('Start crawling %s' % title)
        # apply_async never raises in the parent: it returns an AsyncResult,
        # and any exception is stored there until .get() is called.
        results.append(p.apply_async(a.getAllBookUrl, args=(title, hrefDic)))
    p.close()
    p.join()
    for r in results:
        r.get()  # surfaces any exception that made a task fail silently
# Single-process version, kept for comparison:
# a = douban()
# titleList, hrefDic = a.getAllTag()
# a.getAllBookUrl(titleList[0], hrefDic)
# print('done')
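
A minimal sketch of what seems to be going on (the class below is made up for illustration, it is not from the script above): the Pool has to pickle the task function and its arguments to ship them to the worker processes. A bound method like a.getAllBookUrl drags the whole douban instance along with it, including its MongoClient, and MongoClient objects cannot be pickled (they hold live sockets and locks). apply_async does not raise in the parent; it stashes the failure on the AsyncResult, so without .get() the script simply exits as if the function had never been scheduled.

from multiprocessing import Pool

class Unpicklable:
    """Stands in for an object holding a MongoClient."""
    def __getstate__(self):
        # Simulate pymongo's behaviour: pickling the object fails.
        raise TypeError('cannot pickle this object')

    def work(self, tag):
        print('crawling', tag)

if __name__ == '__main__':
    obj = Unpicklable()
    p = Pool(2)
    res = p.apply_async(obj.work, args=('novel',))
    p.close()
    p.join()
    # Without this line the script ends quietly and work() never runs;
    # with it, the hidden TypeError is re-raised in the parent.
    res.get()

The usual fix is to pass only picklable data (e.g. the tag name and its href list) to the worker and create the MongoClient inside getAllBookUrl, or in a Pool initializer, so no connection object has to cross the process boundary.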