8.1 Basic Usage
- Create a project
scrapy startproject 'project_name'
- Create a spider file
cd 'project_name'
scrapy genspider 'spider_name' www.xxx.com
- Run the project
scrapy crawl 'spider_name'
8.2 Basic Configuration
- settings.py
# UA spoofing
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'
# robots.txt protocol
ROBOTSTXT_OBEY = False
# log level
LOG_LEVEL = 'ERROR'
- The spider file
import scrapy

class TxtSpider(scrapy.Spider):
    name = 'txt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://xxx.com']  # initial URLs to crawl
8.3 Storage
- Command
# Store by command: write the scraped data to output files in different formats
scrapy crawl 'spider_name' -o 'xxx.json'
scrapy crawl 'spider_name' -o 'xxx.xml'
scrapy crawl 'spider_name' -o 'xxx.csv'
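Scrapy infers the output format from the file extension. As a sketch of an alternative (assuming Scrapy 2.1 or newer), the same exports can be configured once in settings.py through the FEEDS setting instead of passing -o every time:

# settings.py
FEEDS = {
    'data.json': {'format': 'json', 'encoding': 'utf8'},
    'data.csv': {'format': 'csv'},
}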
- Pipeline
# items.py:
# 1. Define the data fields.
import scrapy

class XxxItem(scrapy.Item):
    name = scrapy.Field()


# The spider file:
# 1. Pack the scraped data into an item object.
# 2. Use yield to hand the item object to the pipelines.
def parse(self, response):
    item = XxxItem()
    item['name'] = name  # value extracted from the response
    yield item


# pipelines.py:
# 1. process_item receives the item object.
# 2. Persist the data carried by the item object.
class XxxPipeline(object):
    def __init__(self):
        self.fp = None

    # overridden hook: runs once when the spider starts
    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./data.txt', 'w')

    # overridden hook: called once for every item
    def process_item(self, item, spider):
        # persist the item submitted by the spider
        self.fp.write(item['name'] + '\n')
        return item

    # overridden hook: runs once when the spider closes
    def close_spider(self, spider):
        self.fp.close()
        print('spider finished')


class XxxPipeline_2(object):
    pass


# settings.py:
# 1. Enable the pipelines
ITEM_PIPELINES = {
    'xxx.pipelines.XxxPipeline': 300,    # 300 is the priority; the smaller the value, the higher the priority
    'xxx.pipelines.XxxPipeline_2': 200,
}
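XxxPipeline_2 above is only a placeholder. A minimal sketch of what a second pipeline could do (the DropItem filtering rule is an illustrative assumption, not part of the original notes): because its priority value 200 is lower, it runs before XxxPipeline and must return the item for the next pipeline to receive it.

from scrapy.exceptions import DropItem

class XxxPipeline_2(object):
    # priority 200 < 300, so this pipeline runs before XxxPipeline
    def process_item(self, item, spider):
        # hypothetical filtering rule: discard items with no name
        if not item.get('name'):
            raise DropItem('item has no name')
        return item  # hand the item on to the next pipeline (XxxPipeline)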
8.4 Pagination
- The spider file
import scrapy

class TxtSpider(scrapy.Spider):
    name = 'txt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://xxx.com']  # initial URL to crawl

    # URL template for the paginated pages
    url = 'https://www.qiushibaike.com/text/page/%s/'
    pageNumMax = 100  # last page number
    pageNum = 1       # starting page number

    def parse(self, response):
        # ... parse the current page here ...

        # crawl every page
        if self.pageNum <= self.pageNumMax:
            self.pageNum += 1
            url = self.url % self.pageNum
            # recursive crawl: callback is the callback function that parses the next page
            yield scrapy.Request(url=url, callback=self.parse)
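If the site exposes a "next page" link, the manual page counter can be replaced; a minimal sketch using response.follow (the CSS selector below is a hypothetical placeholder):

def parse(self, response):
    # ... parse the current page ...
    next_href = response.css('a.next::attr(href)').get()  # hypothetical selector for the next-page link
    if next_href:
        # response.follow resolves relative URLs against the current page
        yield response.follow(next_href, callback=self.parse)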
8.5 Detail Pages
- The spider file
import scrapy
from xxx.items import XxxItem

class TxtSpider(scrapy.Spider):
    name = 'txt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://xxx.com']  # initial URL to crawl

    # detail page
    def parse_detail(self, response):
        # read the passed-in item
        item = response.meta['item']
        # ... fill in the remaining fields from the detail page ...
        yield item

    # current (listing) page
    def parse(self, response, **kwargs):
        article_url_list = response.xpath('//div[@class="Volume"]//dd/a/@href').extract()
        for article_url in article_url_list:
            item = XxxItem()  # build an item for this entry
            # request passing: meta={} hands this dict to the callback of the request
            yield scrapy.Request(article_url, callback=self.parse_detail, meta={'item': item})
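Besides meta, Scrapy 1.7+ also supports cb_kwargs for passing data into a callback; a minimal sketch of the same hand-off (same hypothetical XxxItem as above):

def parse(self, response, **kwargs):
    for article_url in response.xpath('//div[@class="Volume"]//dd/a/@href').extract():
        item = XxxItem()
        # cb_kwargs entries arrive as keyword arguments of the callback
        yield scrapy.Request(article_url, callback=self.parse_detail, cb_kwargs={'item': item})

def parse_detail(self, response, item):
    # ... fill in the remaining fields, then yield the item ...
    yield item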
8.6 Improving Efficiency
- Increase concurrency
# Scrapy handles requests asynchronously; CONCURRENT_REQUESTS defaults to 16, raise it for more concurrency
CONCURRENT_REQUESTS = 100
- Lower the log level
# Scrapy produces a lot of log output while running; to reduce CPU usage, restrict logging to INFO or ERROR
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'ERROR'
- Disable cookies
# If cookies are not actually needed, disable them to reduce CPU usage and speed up the crawl
COOKIES_ENABLED = False
- Disable retries
# Re-requesting failed HTTP requests (retries) slows the crawl down, so retries can be disabled
RETRY_ENABLED = False
- Reduce the download timeout
# For very slow links, a shorter download timeout lets stuck requests be abandoned quickly, improving throughput
DOWNLOAD_TIMEOUT = 10  # timeout of 10 s
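Taken together, the five settings above form a small efficiency block in settings.py (values copied from the snippets above; tune them per project):

# settings.py — efficiency-related settings
CONCURRENT_REQUESTS = 100   # more concurrent requests
LOG_LEVEL = 'ERROR'         # less log output
COOKIES_ENABLED = False     # no cookie handling
RETRY_ENABLED = False       # no retries for failed requests
DOWNLOAD_TIMEOUT = 10       # give up on slow downloads after 10 s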
8.7 Images
- settings.py
# directory where downloaded images are stored
IMAGES_STORE = './img'
- pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ImgsPipeline(ImagesPipeline):
    # issue a request for each image URL
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_url'])

    # return the file name (relative to IMAGES_STORE) used to save the image
    def file_path(self, request, response=None, info=None, *, item=None):
        imgName = request.url.split('/')[-1].split('?')[0]
        print('download finished!')
        return imgName

    def item_completed(self, results, item, info):
        # pass the item on to the next pipeline
        return item
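The custom image pipeline still needs to be registered in ITEM_PIPELINES, following the same pattern as in 8.3 (the project package name xxx is a placeholder; ImagesPipeline also requires the Pillow package to be installed):

# settings.py
IMAGES_STORE = './img'
ITEM_PIPELINES = {
    'xxx.pipelines.ImgsPipeline': 300,
}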
8.8 Middleware (Proxy IPs)
- settings.py
DOWNLOADER_MIDDLEWARES = {
    'xxx.middlewares.XxxDownloaderMiddleware': 543,
}
- middlewares.py
import random

class XxxDownloaderMiddleware(object):
    # pool of User-Agent strings, one is picked at random per request
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # pools of proxies, one is picked at random on retry
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # intercept requests
    def process_request(self, request, spider):
        # UA spoofing
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # fixed proxy, only here to verify that the proxy setup works
        request.meta['proxy'] = 'http://183.146.213.198:80'
        return None

    # intercept every response
    def process_response(self, request, response, spider):
        return response

    # intercept requests whose download raised an exception
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':
            # switch to a random proxy
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request  # resend the corrected request
8.9 Selenium Browser Objects
- The spider file
import scrapy
from selenium import webdriver

class TxtSpider(scrapy.Spider):
    name = 'txt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://xxx.com']  # initial URL to crawl
    models_urls = []  # URLs whose content is loaded dynamically

    # instantiate one browser object for the whole spider
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bro = webdriver.Chrome(executable_path='./path/to/chromedriver')

    # overridden hook: called when the spider finishes; close the browser object
    def closed(self, reason):
        self.bro.quit()
- middlewares.py
from time import sleep
from scrapy.http import HtmlResponse

class XxxDownloaderMiddleware(object):
    # intercept requests
    def process_request(self, request, spider):
        return None

    # intercept every response
    def process_response(self, request, response, spider):
        # fetch the browser object defined on the spider
        bro = spider.bro
        # pick out the responses that need to be replaced
        if request.url in spider.models_urls:
            bro.get(request.url)
            sleep(3)
            # the page source now contains the dynamically loaded news data
            page_text = bro.page_source
            # build the replacement response object
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            # all other responses are returned unchanged
            return response

    # intercept requests whose download raised an exception
    def process_exception(self, request, exception, spider):
        return request
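The notes do not show how models_urls is filled. One hedged, hypothetical sketch (selector and callback name are invented for illustration): collect the dynamically loaded section URLs in the spider's parse() so the middleware can recognise them later.

def parse(self, response):
    # hypothetical selector for the section links that need Selenium rendering
    for href in response.xpath('//div[@class="menu"]//a/@href').extract():
        self.models_urls.append(href)
        yield scrapy.Request(href, callback=self.parse_model)

def parse_model(self, response):
    # this response body is the Selenium-rendered page produced by the middleware
    pass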
8.10 The CrawlSpider Class
- Create a project
scrapy startproject 'project_name'
- Create a CrawlSpider spider file
cd 'project_name'
scrapy genspider -t crawl 'spider_name' www.xxx.com
- Run the project
scrapy crawl 'spider_name'
- The spider file
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, DetailItem

# Goal: scrape the post number, news title and news content from the "sun" site
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Link extractors: extract links matching the given rule (allow = regular expression)
    # pagination links
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # detail-page links
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    # Rules: parse the links collected by the link extractors with the given callback
    rules = (
        # follow=True: apply the link extractor to the pages reached through the extracted links as well,
        # i.e. discover every page of the pagination
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail'),
    )

    # The two callbacks below cannot pass data to each other via request meta,
    # so the data is stored in two separate item types.

    # parse the news number and title
    def parse_item(self, response):
        # note: the XPath expression must not contain the tbody tag
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            new_num = tr.xpath('./td[1]/text()').extract_first()
            new_title = tr.xpath('./td[2]/a[2]/@title').extract_first()
            item = SunproItem()
            item['title'] = new_title
            item['new_num'] = new_num
            yield item

    # parse the news content and number
    def parse_detail(self, response):
        new_id = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        new_content = response.xpath('/html/body/div[9]/table[2]//tr[1]//text()').extract()
        new_content = ''.join(new_content)
        item = DetailItem()
        item['content'] = new_content
        item['new_id'] = new_id
        yield item
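The item classes imported from sunPro.items are not shown in the original notes; inferred from the fields used above, they would look roughly like this:

# sunPro/items.py
import scrapy

class SunproItem(scrapy.Item):
    title = scrapy.Field()
    new_num = scrapy.Field()

class DetailItem(scrapy.Item):
    content = scrapy.Field()
    new_id = scrapy.Field()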