8. The Scrapy Framework



8.1 Basic usage

  • Create a project (the generated layout is sketched after this list)

    scrapy startproject 'project_name'
    
  • Create a spider file

    cd 'project_name'
    scrapy genspider 'spider_name' www.xxx.com
    
  • Run the spider

    scrapy crawl 'spider_name'
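
  • Generated project layout

    For reference, a rough sketch of the directory tree these commands produce (the actual names depend on the project and spider names you pass in):

    project_name/
        scrapy.cfg            # deployment configuration
        project_name/
            __init__.py
            items.py          # item (data model) definitions
            middlewares.py    # downloader / spider middlewares
            pipelines.py      # item pipelines for persistence
            settings.py       # project settings
            spiders/
                __init__.py
                spider_name.py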
    

8.2 Basic configuration

  • settings.py

    # UA spoofing: default User-Agent for all requests
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'
    
    # ignore the site's robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # log level
    LOG_LEVEL = 'ERROR'
    
    
  • spider_name.py

    import scrapy
    
    class TxtSpider(scrapy.Spider):
        name = 'txt'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['http://xxx.com']  # URLs the crawl starts from
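
  • Minimal parse sketch

    A minimal parse method to go with the skeleton above; Scrapy calls it with the response for each start URL. The XPath is a placeholder, not a selector for any particular site:

    def parse(self, response):
        # .extract() returns a list of strings, .extract_first() the first match
        for text in response.xpath('//div[@class="content"]//text()').extract():
            print(text)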
    
    

8.3 Storing the data

  • Command-line export

    # Export in a given format: write the scraped items to files of different formats
    scrapy crawl 'spider_name' -o 'xxx.json'
    scrapy crawl 'spider_name' -o 'xxx.xml'
    scrapy crawl 'spider_name' -o 'xxx.csv'
    
  • Pipelines

    # items.py:
    # 1. Define the data fields.
    import scrapy
    class XxxItem(scrapy.Item):
        name = scrapy.Field()
    
        
    # spider_name.py:
    # 1. Pack the scraped data into an item object.
    # 2. Use yield to hand the item object over to the pipelines.
    def parse(self, response):
        item = XxxItem()
        item['name'] = name  # `name` is the value parsed from the response (parsing omitted here)
        yield item
    
        
    # pipelines.py:
    # 1. Receive the item object in process_item.
    # 2. Persist the data stored in the item object.
        
    class XxxPipeline(object):
        def __init__(self):
            self.fp = None
        # overridden hook: runs once when the spider starts
        def open_spider(self, spider):
            print('spider started')
            self.fp = open('./data.txt', 'w', encoding='utf-8')
        # overridden hook: called once per yielded item
        def process_item(self, item, spider):
            # persist the item submitted by the spider
            self.fp.write(item['name'] + '\n')  # write the fields defined in items.py
            return item
        # overridden hook: runs once when the spider closes
        def close_spider(self, spider):
            self.fp.close()
            print('spider finished')
    
    class XxxPipeline_2(object):
        pass  # a second pipeline; see the sketch after this block
            
    # settings.py:
    # 1. Enable the pipelines.
    ITEM_PIPELINES = {
        'xxx.pipelines.XxxPipeline': 300,  # 300 is the priority; lower values run first
        'xxx.pipelines.XxxPipeline_2': 200, 
    }
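
  • Example: a second pipeline (sketch)

    A minimal sketch of what a second pipeline such as XxxPipeline_2 might look like, persisting the same items to a JSON Lines file (the file name is an assumption for illustration):

    import json
    
    class XxxPipeline_2(object):
        def open_spider(self, spider):
            self.fp = open('./data.jsonl', 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            # one JSON object per line
            self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item  # returning the item passes it on to the next pipeline in priority order
    
        def close_spider(self, spider):
            self.fp.close()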
    
    

8.4 Pagination

  • spider_name.py

    import scrapy
    
    class TxtSpider(scrapy.Spider):
        name = 'txt'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['http://xxx.com']  # URLs the crawl starts from
        
        # URL template for the paginated pages
        url = 'https://www.qiushibaike.com/text/page/%s/'
        pageNumMax = 100  # last page number
        pageNum = 1  # current page number
        
        def parse(self, response):
            pass  # parse the current page here
            # crawl every remaining page
            if self.pageNum <= self.pageNumMax:
                self.pageNum += 1
                url = self.url % self.pageNum
                # recurse: callback is the function that will parse the new response
                yield scrapy.Request(url=url, callback=self.parse)
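
  • Alternative: following a "next page" link (sketch)

    When the site exposes a "next page" link, the page counter can be dropped entirely; a minimal sketch using response.follow, with a placeholder XPath for the link:

    def parse(self, response):
        # ... parse the current page ...
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            # response.follow resolves relative URLs against the current page
            yield response.follow(next_href, callback=self.parse)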
       
    

8.5 Detail pages

  • spider_name.py

    import scrapy
    
    class TxtSpider(scrapy.Spider):
        name = 'txt'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['http://xxx.com']  # URLs the crawl starts from
        
        # detail-page callback
        def parse_detail(self, response):
            # read the item passed along with the request
            item = response.meta['item']
            pass  # parse the detail page into the item here
            yield item
        
        # listing-page callback
        def parse(self, response, **kwargs):
            article_url_list = response.xpath('//div[@class="Volume"]//dd/a/@href').extract()
            for article_url in article_url_list:
                pass  # build the item from the listing entry here (item = XxxItem(), ...)
                # pass data to the callback: the meta dict is attached to the request
                # and handed to the callback via response.meta
                yield scrapy.Request(article_url, callback=self.parse_detail, meta={'item': item})
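
  • Alternative: cb_kwargs (sketch)

    Newer Scrapy versions (1.7+) also support cb_kwargs for passing values to a callback, which then arrive as plain keyword arguments; a minimal sketch under the same assumptions as above (XxxItem is the hypothetical item class):

    def parse(self, response, **kwargs):
        for article_url in response.xpath('//div[@class="Volume"]//dd/a/@href').extract():
            item = XxxItem()
            yield scrapy.Request(article_url, callback=self.parse_detail, cb_kwargs={'item': item})

    def parse_detail(self, response, item):
        # the value passed via cb_kwargs arrives directly as a parameter
        yield item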
    
    

8.6 Improving crawl efficiency

  • Increase concurrency

    # Scrapy's default is 16 concurrent requests; raise it to fetch more pages in parallel
    CONCURRENT_REQUESTS = 100
    
  • Lower the log level

    # Scrapy prints a lot of log output while running; to reduce CPU usage, restrict it to INFO or ERROR
    LOG_LEVEL = 'INFO'   # or:
    LOG_LEVEL = 'ERROR'
    
  • Disable cookies

    # If cookies are not actually needed, disabling them reduces CPU usage and speeds up the crawl
    COOKIES_ENABLED = False
    
  • Disable retries

    # Re-requesting failed HTTP requests (retries) slows the crawl down, so retries can be disabled
    RETRY_ENABLED = False
    
  • Reduce the download timeout

    # When crawling very slow links, a shorter download timeout lets stuck requests be abandoned quickly, improving throughput
    DOWNLOAD_TIMEOUT = 10  # time out after 10 s
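
  • Combined example

    Putting the options above together, a settings.py fragment might look like this (the values are examples; tune them per target site):

    CONCURRENT_REQUESTS = 100
    LOG_LEVEL = 'ERROR'
    COOKIES_ENABLED = False
    RETRY_ENABLED = False
    DOWNLOAD_TIMEOUT = 10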
    

8.7 Images

  • settings.py

    # directory where downloaded images are stored
    IMAGES_STORE = './img'
    
  • pipelines.py

    from scrapy.pipelines.images import ImagesPipeline
    import scrapy
    
    
    class ImgsPipeline(ImagesPipeline):
    
        # request the image data for each image URL on the item
        def get_media_requests(self, item, info):
            yield scrapy.Request(item['img_url'])
    
        # return the file name; the file is stored under IMAGES_STORE
        def file_path(self, request, response=None, info=None, *, item=None):
            imgName = request.url.split('/')[-1].split('?')[0]
            print('download finished!')
            return imgName
    
        def item_completed(self, results, item, info):
            # hand the item on to the next pipeline
            return item
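
  • Enabling the image pipeline

    The custom image pipeline also has to be registered in settings.py (the module path below assumes the placeholder project name used in 8.3), and ImagesPipeline requires the Pillow package to be installed:

    ITEM_PIPELINES = {
        'xxx.pipelines.ImgsPipeline': 300,
    }
    IMAGES_STORE = './img'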
          
    

8.8 Downloader middleware (proxy IPs)

  • settings.py

    DOWNLOADER_MIDDLEWARES = {
       'xxx.middlewares.XxxDownloaderMiddleware': 543,
    }
    
  • middlewares.py

    import random
    
    class XxxDownloaderMiddleware(object):
      
        # pool of User-Agent strings, picked at random per request
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        
        # proxy pools (http / https), picked at random when a request fails
        PROXY_http = [
            '153.180.102.104:80',
            '195.208.131.189:56055',
        ]
        PROXY_https = [
            '120.83.49.90:9000',
            '95.189.112.214:35508',
        ]
    
        # intercept every outgoing request
        def process_request(self, request, spider):
            # UA spoofing: set a random User-Agent
            request.headers['User-Agent'] = random.choice(self.user_agent_list)
            # fixed proxy, used here only to verify that the proxy mechanism works
            request.meta['proxy'] = 'http://183.146.213.198:80'
            return None  # returning None lets the request continue through the middleware chain
    
        # intercept every response
        def process_response(self, request, response, spider):
            pass
            return response
    
        # intercept requests that raised an exception
        def process_exception(self, request, exception, spider):
            if request.url.split(':')[0] == 'http':
                # plain-http proxy
                request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
            else:
                request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
            return request  # re-schedule the corrected request
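
  • Variation: random proxy for every request (sketch)

    If every request (not only failed ones) should go out through a random proxy, process_request can pick one itself; a sketch reusing the pools defined above:

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        if request.url.startswith('https:'):
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        else:
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        return None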
    
    

8.9 Using a Selenium browser object

  • spider_name.py

    import scrapy
    from selenium import webdriver
    
    class TxtSpider(scrapy.Spider):
        name = 'txt'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['http://xxx.com']  # URLs the crawl starts from
        
        models_urls = []  # URLs of pages that are rendered dynamically (JavaScript)
        
        # create a single browser instance for the whole spider
        def __init__(self):
            self.bro = webdriver.Chrome(executable_path='./path/to/chromedriver')
            
        pass
        
        # called when the spider closes (shortcut for the spider_closed signal); quit the browser
        def closed(self, reason):
            self.bro.quit()
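
  • Note: Selenium 4 (sketch)

    executable_path is deprecated in Selenium 4; a sketch of the same __init__ using the Service object (the driver path is a placeholder):

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    
    def __init__(self):
        self.bro = webdriver.Chrome(service=Service('./path/to/chromedriver'))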
    
    
  • middlewares.py

    from scrapy.http import HtmlResponse
    from time import sleep
    
    class XxxDownloaderMiddleware(object):
      
        # intercept requests
        def process_request(self, request, spider):
            pass
            return None
    
        # intercept every response
        def process_response(self, request, response, spider):
            # grab the browser object defined on the spider
            bro = spider.bro
    
            # only tamper with the responses that need dynamic rendering
            if request.url in spider.models_urls:
                bro.get(request.url)
                sleep(3)
                # the page source now contains the dynamically loaded data
                page_text = bro.page_source
                # build a new response object from the rendered page
                new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
                return new_response
            else:
                # every other response passes through unchanged
                return response
    
        # intercept requests that raised an exception
        def process_exception(self, request, exception, spider):
            pass
            return request
    
    

8.10 The CrawlSpider class

  • Create a project

    scrapy startproject 'project_name'
    
  • Create a CrawlSpider spider file

    cd 'project_name'
    scrapy genspider -t crawl 'spider_name' www.xxx.com
    
  • Run the spider

    scrapy crawl 'spider_name'
    
  • spider_name.py

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from sunPro.items import SunproItem, DetailItem
    
    
    # Goal: crawl the sun site for the post number, news title and news content
    class SunSpider(CrawlSpider):
        name = 'sun'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    
        # link extractors: extract links matching the given rule (allow is a regex)
        # pagination links
        link = LinkExtractor(allow=r'type=4&page=\d+')
        # detail-page links
        link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')
        
        # rules: parse the pages behind the extracted links with the given callback
        rules = (
            # follow=True: also apply the link extractor to the pages it discovers, so every pagination page is reached
            Rule(link, callback='parse_item', follow=True),
            Rule(link_detail, callback='parse_detail')
        )
    
        # The two callbacks below cannot pass data to each other, so the data is stored in two separate item classes
        # (a pipeline can tell them apart; see the sketch after this block)
        # parse the post number and news title
        def parse_item(self, response):
            # Note: do not include the tbody tag in XPath expressions
            tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
            for tr in tr_list:
                new_num = tr.xpath('./td[1]/text()').extract_first()
                new_title = tr.xpath('./td[2]/a[2]/@title').extract_first()
                item = SunproItem()
                item['title'] = new_title
                item['new_num'] = new_num
    
                yield item
    
        # parse the news content and post number
        def parse_detail(self, response):
            new_id = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
            new_content = response.xpath('/html/body/div[9]/table[2]//tr[1]//text()').extract()
            new_content = ''.join(new_content)
            item = DetailItem()
            item['content'] = new_content
            item['new_id'] = new_id
    
            yield item
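
  • Distinguishing the two item types in a pipeline (sketch)

    Because parse_item and parse_detail yield different item classes, a pipeline can tell them apart by class name; a minimal sketch assuming the SunproItem / DetailItem definitions imported above (the persistence itself is omitted):

    class SunproPipeline(object):
        def process_item(self, item, spider):
            if item.__class__.__name__ == 'SunproItem':
                title = item['title']
                new_num = item['new_num']
                # ... persist the number / title pair ...
            else:  # DetailItem
                content = item['content']
                new_id = item['new_id']
                # ... persist the id / content pair ...
            return item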
            
    