'''
Crawling approaches:
 1. requests(url)
 2. requests + json
 3. requests + XPath
 4. requests + BeautifulSoup
 5. selenium
 6. the scrapy framework
 7. scrapy-redis and distributed crawling
===============================================
 OS:
 import os
 os.system("C: && p.txt")      # run a shell command: switch to drive C: and open p.txt
 os.system("ping 127.0.0.1")   # run ping against localhost
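 If you also need the command's exit status or its output, the standard library's os.popen works too; a minimal sketch (the ping target is just the example above):
 import os
 status = os.system("ping 127.0.0.1")        # returns the shell's exit status (0 on success)
 output = os.popen("ping 127.0.0.1").read()  # capture the command's stdout as a string
 print(status, output)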
===============================================
 requests:
 requests.get(url, headers=headers, params={'': ''}, proxies=proxies)   # for GET, query-string values go in params, not data
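 A fuller GET sketch, assuming url, headers and proxies are defined elsewhere; the params values, timeout and status check are only illustrative additions:
 import requests
 response = requests.get(url, headers=headers,
                         params={'page': 1},        # example query-string parameters
                         proxies=proxies, timeout=10)
 if response.status_code == 200:                    # only use the body on success
     print(response.text)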
===============================================
 Proxies:
 proxies = {'http': 'http://124.207.82.166:8008'}   # 47.98.129.198
 response = requests.get(request_url, proxies=proxies)   # send the request through the proxy
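 A sketch for checking that the proxy is actually used; the proxy address above is only an example, and httpbin.org/ip simply echoes the requesting IP:
 import requests
 proxies = {'http': 'http://124.207.82.166:8008',
            'https': 'http://124.207.82.166:8008'}   # same example proxy for both schemes
 response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
 print(response.text)                                # should show the proxy's IP, not yours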
===============================================
 File:
 with open(path, 'w', encoding='utf-8') as f:   # explicit encoding avoids garbled Chinese text
     f.write(text)
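 A small sketch tying this to crawling: save a fetched page to disk and read it back (url is assumed to be defined as in the requests section, and the file name is arbitrary):
 import requests
 html = requests.get(url).text
 with open('page.html', 'w', encoding='utf-8') as f:
     f.write(html)
 with open('page.html', 'r', encoding='utf-8') as f:
     print(len(f.read()))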
===============================================
 Threading:
 import threading
 threading.Thread(target=fun, kwargs={'list_url': list_url, 'path_order': path_order1}).start()
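 A runnable sketch of the same pattern: one worker function crawling several list pages in parallel (the URLs and the worker body are placeholders):
 import threading
 import requests

 def fun(list_url, path_order):
     response = requests.get(list_url)               # placeholder work: just fetch the page
     print(list_url, response.status_code, path_order)

 threads = []
 for i, list_url in enumerate(['http://example.com/page/1', 'http://example.com/page/2']):
     t = threading.Thread(target=fun, kwargs={'list_url': list_url, 'path_order': i})
     t.start()
     threads.append(t)
 for t in threads:
     t.join()                                        # wait for all workers to finish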
===============================================
 requests + json:
 1. data = json.load(open("package1.json", encoding="utf-8"))   # load JSON from a local file
    response = requests.get(url, headers=headers)
    print(response.text)
 2. response = requests.get(url)
    data = response.text
    obj = json.loads(data)                                      # parse the JSON string in the response
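 A sketch combining the two variants, assuming the target returns JSON (httpbin.org/get is used here only as an example endpoint):
 import json
 import requests
 response = requests.get('http://httpbin.org/get', params={'q': 'spider'})
 obj = json.loads(response.text)      # parse the JSON body by hand...
 same = response.json()               # ...or let requests do it
 print(obj['args'], same['args'])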
===============================================
 requests + XPath:
 from lxml import etree
 response = requests.get(list_url, headers=headers)
 content = response.content
 selector = etree.HTML(content)                        # load the page into an etree document
 items = selector.xpath(path_order)                    # query the tree with XPath; returns a list to iterate over
 title = item.xpath("./div/p[1]/a/text()")[0].strip()  # each item can be queried further with a relative XPath
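 A self-contained sketch of the same flow; the URL and XPath expressions are placeholders for whatever list page you are scraping:
 import requests
 from lxml import etree

 list_url = 'http://example.com/list'                  # placeholder list page
 headers = {'User-Agent': 'Mozilla/5.0'}
 response = requests.get(list_url, headers=headers)
 selector = etree.HTML(response.content)               # build the etree document
 items = selector.xpath('//div[@class="item"]')        # placeholder XPath for each entry
 for item in items:
     titles = item.xpath('./div/p[1]/a/text()')        # relative XPath inside one entry
     if titles:
         print(titles[0].strip())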
===============================================
 requests + BeautifulSoup:
 from bs4 import BeautifulSoup
 response = requests.get(url)
 html = response.text
 soup = BeautifulSoup(html, 'lxml')
 soup_str = soup.prettify()   # pretty-print / normalize the HTML
 tag = soup.b
 # the tag object then supports a range of operations
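 Typical operations on the tag object from the snippet above (all standard bs4 calls); the page structure is hypothetical and assumes a <b> tag exists:
 print(tag.name)                      # tag name, e.g. 'b'
 print(tag.attrs)                     # dict of the tag's attributes
 print(tag.get_text())                # text content of the tag
 for a in soup.find_all('a'):         # iterate over every <a> on the page
     print(a.get('href'))             # attribute access with .get() avoids KeyError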
===============================================
 selenium: install the ChromeDriver that matches your Chrome version: https://www.cnblogs.com/JHblogs/p/7699951.html
 and install the dependency: pip install selenium
 from selenium import webdriver
 chromedriver = "G:/4Anaconda/chromedriver.exe"   # this step can be skipped if the driver is already on the Python path
 browser = webdriver.Chrome(chromedriver)
 # open a page
 browser.get("http://www.baidu.com")
 browser.find_element_by_id("kw").send_keys("selenium")
 browser.find_element_by_id("su").click()
 browser.title
 browser.set_window_size(480, 800)   # arguments are in pixels
 browser.back()
 browser.forward()
 # quit: closes every window and shuts down the driver
 browser.quit()
 # close only the current window
 # browser.close()
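 For crawling you usually want Chrome to run without a visible window; a headless sketch using the standard ChromeOptions (driver path as above):
 from selenium import webdriver
 options = webdriver.ChromeOptions()
 options.add_argument('--headless')       # no browser window
 options.add_argument('--disable-gpu')    # commonly recommended on Windows
 browser = webdriver.Chrome(chromedriver, options=options)
 browser.get('http://www.baidu.com')
 print(browser.title)
 browser.quit()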
 
 Implicit wait
 from selenium import webdriver
 browser = webdriver.Chrome()
 browser.implicitly_wait(10)   # implicitly_wait() makes element lookups poll for up to 10 seconds
 browser.get('https://www.zhihu.com/explore')
 input = browser.find_element_by_class_name('zu-top-add-question')
 print(input)
 
 Explicit wait
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 browser = webdriver.Chrome()
 browser.get('https://www.taobao.com/')
 wait = WebDriverWait(browser, 10)
 input = wait.until(EC.presence_of_element_located((By.ID, 'q')))                     # wait until the element is in the DOM
 button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))    # wait until the element is clickable
 print(input, button)
'''
'''
 # 1. Create a scrapy project (cmd):
 scrapy startproject weibospider
 cd weibospider
 # 2. Create a spider (cmd):
 scrapy genspider WeiboSpider image.baidu.com
 #    Ignore the robots.txt protocol: ROBOTSTXT_OBEY = False
 #    Run the spider: scrapy crawl baiduimg
 # 3. Define the item fields:
 name = scrapy.Field()
 # 4. Import the item:
 from hotnewsSpider.items import WeiboSpiderItem
 #    Use it:
 weiboitem = WeiboSpiderItem()
 weiboitem['name'] = '123'
 #    Return it:
 yield weiboitem
 # 5. Send a request (inside parse):
 yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.clickFindMore)
 #    Send a request with a different callback:
 yield scrapy.Request(link, callback=self.parse_detail)
 # 6. Override the initial requests:
 def start_requests(self):
     for url in self.urls:
         yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse)
 # 7. Receive the response:
 def parse(self, response):
     pass
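 Putting steps 2-7 together, a minimal spider sketch; the item and module names follow the notes above, while the start URL and XPath are placeholders:
 import scrapy
 from hotnewsSpider.items import WeiboSpiderItem

 class WeiboSpider(scrapy.Spider):
     name = 'weibo'
     urls = ['http://example.com/list']               # placeholder start pages

     def start_requests(self):
         for url in self.urls:
             yield scrapy.Request(url=url, callback=self.parse)

     def parse(self, response):
         for title in response.xpath('//h2/a/text()').getall():   # placeholder XPath
             weiboitem = WeiboSpiderItem()
             weiboitem['name'] = title
             yield weiboitem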
'''