Episode 155: Writing Scrapy Spiders

1 Crawling Automatically with CrawlSpider

1.1 What Is CrawlSpider?

CrawlSpider is a higher-level spider class provided by Scrapy. By defining a set of rules, it automatically extracts links and follows them, which makes it well suited to crawls that need to traverse an entire site.

1.2 Creating a CrawlSpider

scrapy genspider -t crawl mycrawler example.com

1.3 Writing the CrawlSpider Code

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from myspider.items import ArticleItem

class MyCrawlerSpider(CrawlSpider):
    name = 'mycrawler'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    # Define the crawling rules
    rules = (
        # Extract links matching the regular expression and parse them with parse_item
        Rule(LinkExtractor(allow=r'articles/\d+'), callback='parse_item', follow=True),
        # Extract "next page" links and follow them, without a callback
        Rule(LinkExtractor(restrict_css='a.next-page'), follow=True),
    )

    def parse_item(self, response):
        item = ArticleItem()
        item['title'] = response.css('h1.article-title::text').get()
        item['content'] = response.css('div.article-content::text').getall()
        item['author'] = response.css('span.author::text').get()
        item['publish_date'] = response.css('time.publish-date::attr(datetime)').get()
        yield item

1.4 LinkExtractor in Detail

LinkExtractor extracts links from pages and offers several ways to filter which links are kept:

# Extract all links (no filtering)
LinkExtractor(allow=(), deny=())

# Keep only links matching a regular expression
LinkExtractor(allow=r'articles/\d+')

# Exclude links matching a pattern
LinkExtractor(deny=r'/login|/admin')

# Only look for links inside areas matched by CSS selectors
LinkExtractor(restrict_css='div.content')

# Only look for links inside areas matched by XPath expressions
LinkExtractor(restrict_xpaths='//div[@class="content"]')

# Note: crawl depth is not a LinkExtractor option;
# limit it with the DEPTH_LIMIT setting in settings.py instead

# Restrict which tags links are extracted from (defaults to 'a' and 'area')
LinkExtractor(tags=['a', 'area'])

# Restrict which attributes are scanned for links (defaults to 'href')
LinkExtractor(attrs=['href'])
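
To check quickly what a given extractor would match, it can also be called directly on a response (for example inside scrapy shell); a minimal sketch:

from scrapy.linkextractors import LinkExtractor

# Assuming `response` is an already-fetched page, e.g. inside `scrapy shell <url>`
extractor = LinkExtractor(allow=r'articles/\d+')
for link in extractor.extract_links(response):
    # Each Link object exposes the absolute URL and the anchor text
    print(link.url, link.text)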

2 Simulating Form Submission with FormRequest

2.1 Basic Usage

import scrapy

class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['http://example.com/login']

    def parse(self, response):
        # Extract the CSRF token from the login page
        csrf_token = response.css('input[name="csrf_token"]::attr(value)').get()
        
        # Submit the login form
        return scrapy.FormRequest(
            url='http://example.com/login',
            formdata={
                'username': 'myusername',
                'password': 'mypassword',
                'csrf_token': csrf_token
            },
            callback=self.after_login
        )

    def after_login(self, response):
        # Check whether the login succeeded
        if 'Welcome, myusername' in response.text:
            self.logger.info('Login successful!')
            # Continue crawling now that we are authenticated
            yield scrapy.Request('http://example.com/dashboard', callback=self.parse_dashboard)
        else:
            self.logger.error('Login failed!')

    def parse_dashboard(self, response):
        # Parse the dashboard data
        # ...
        pass

2.2 Using FormRequest.from_response

def parse(self, response):
    # Let from_response pre-fill the form fields from the page automatically
    return scrapy.FormRequest.from_response(
        response,
        formxpath='//form[@id="login-form"]',
        formdata={
            'username': 'myusername',
            'password': 'mypassword'
        },
        callback=self.after_login
    )
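
Because from_response copies the values of the fields already present in the form (including hidden inputs such as CSRF tokens), only the fields being overridden, here the username and password, need to be supplied in formdata.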

3 Custom Middleware

3.1 Downloader Middleware

Downloader middleware processes requests and responses between the engine and the downloader; it is where features such as proxy rotation, User-Agent switching, and extra request headers are typically implemented.

# middlewares.py
import random

class ProxyMiddleware:
    def process_request(self, request, spider):
        # Route the request through a proxy server
        request.meta['proxy'] = 'http://proxy.example.com:8080'
        return None

class UserAgentMiddleware:
    def process_request(self, request, spider):
        # Pick a random User-Agent for each request
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
        request.headers['User-Agent'] = random.choice(user_agents)
        return None

class CustomDownloaderMiddleware:
    def process_request(self, request, spider):
        # Called before the request is sent to the downloader
        spider.logger.info(f'Sending request to {request.url}')
        return None

    def process_response(self, request, response, spider):
        # Called after the response is received
        if response.status == 403:
            spider.logger.error(f'Forbidden: {request.url}')
            # A new Request or a modified Response could be returned here
        return response

    def process_exception(self, request, exception, spider):
        # Called when the download raises an exception
        spider.logger.error(f'Exception: {exception} for {request.url}')
        # A new Request could be returned here, or None to let other middleware handle it
        return None

3.2 Enabling the Middleware

Enable the middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.ProxyMiddleware': 543,
    'myspider.middlewares.UserAgentMiddleware': 544,
    'myspider.middlewares.CustomDownloaderMiddleware': 545,
}
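
If the custom UserAgentMiddleware above should fully replace Scrapy's built-in one (which sets the header from the USER_AGENT setting), the built-in class can be disabled in the same setting; a small sketch:

DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.UserAgentMiddleware': 544,
    # Mapping a middleware to None disables it
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}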

3.3 Spider Middleware

Spider middleware processes the spider's input (responses) and output (requests and items).

# middlewares.py
class CustomSpiderMiddleware:
    def process_spider_input(self, response, spider):
        # Process the response before it reaches the spider
        return None

    def process_spider_output(self, response, result, spider):
        # Process the requests and items produced by the spider
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Handle exceptions raised inside the spider callbacks
        spider.logger.error(f'Spider exception: {exception}')
        return None

    def process_start_requests(self, start_requests, spider):
        # Process the start requests
        for r in start_requests:
            yield r
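
Spider middleware is enabled the same way as downloader middleware, but under the SPIDER_MIDDLEWARES setting; for the class above:

# settings.py
SPIDER_MIDDLEWARES = {
    'myspider.middlewares.CustomSpiderMiddleware': 543,
}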

4 Custom Extensions

4.1 A Basic Extension

# extensions.py
from scrapy import signals

class CustomExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        self.items_scraped = 0

        # Connect handlers to Scrapy's signals
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s' % spider.name)
        spider.logger.info('Total items scraped: %d' % self.items_scraped)

    def item_scraped(self, item, spider):
        self.items_scraped += 1
        if self.items_scraped % 100 == 0:
            spider.logger.info('Scraped %d items' % self.items_scraped)

4.2 Enabling the Extension

Enable the extension in settings.py:

EXTENSIONS = {
    'myspider.extensions.CustomExtension': 500,
}
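
In practice, from_crawler often checks the settings and raises NotConfigured so the extension can be switched off without editing code; a minimal sketch, assuming a hypothetical MYEXT_ENABLED setting:

from scrapy.exceptions import NotConfigured

class ToggleableExtension:
    """Sketch: an extension that can be disabled from settings."""

    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        # 'MYEXT_ENABLED' is a made-up setting name used only in this sketch
        if not crawler.settings.getbool('MYEXT_ENABLED', True):
            raise NotConfigured
        return cls(crawler)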

5 Handling JavaScript-Rendered Pages

5.1 Using Selenium

# middlewares.py
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

class SeleniumMiddleware:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome without a visible window
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Quit the browser when the spider closes; this is more reliable than __del__
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        # Only handle requests that explicitly ask for JavaScript rendering
        if request.meta.get('render_js', False):
            self.driver.get(request.url)
            time.sleep(2)  # crude wait for the page to finish loading

            # Grab the HTML after JavaScript has executed
            body = self.driver.page_source

            # Wrap the rendered page in a new Scrapy response
            from scrapy.http import HtmlResponse
            return HtmlResponse(
                url=request.url,
                body=body,
                encoding='utf-8',
                request=request
            )
        return None

    def spider_closed(self, spider):
        self.driver.quit()
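
As with any downloader middleware, this class only takes effect once it is registered in settings.py (assuming the same myspider.middlewares module as above; the priority value is arbitrary):

DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.SeleniumMiddleware': 600,
}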

5.2 Usage

def parse(self, response):
    # Regular requests
    # ...

    # Request that needs JavaScript rendering
    yield scrapy.Request(
        url='http://example.com/dynamic-page',
        callback=self.parse_dynamic,
        meta={'render_js': True}
    )

def parse_dynamic(self, response):
    # Parse the page after it has been rendered
    # ...
    pass

6 Simplifying Data Extraction with ItemLoader

6.1 Basic Usage

from scrapy.loader import ItemLoader
# In older Scrapy versions these processors lived in scrapy.loader.processors
from itemloaders.processors import TakeFirst, MapCompose, Join
from myspider.items import ArticleItem

class ArticleLoader(ItemLoader):
    default_item_class = ArticleItem
    default_output_processor = TakeFirst()

    # Custom input/output processors per field
    title_in = MapCompose(str.strip)
    content_out = Join(' ')
    author_in = MapCompose(str.strip)

# inside the spider:
def parse_item(self, response):
    loader = ArticleLoader(item=ArticleItem(), response=response)

    # Add fields from CSS selectors
    loader.add_css('title', 'h1.article-title::text')
    loader.add_css('content', 'div.article-content p::text')
    loader.add_css('author', 'span.author::text')
    loader.add_css('publish_date', 'time.publish-date::attr(datetime)')
    # ArticleItem must also declare a 'url' field for this to work
    loader.add_value('url', response.url)

    # Build and return the item
    yield loader.load_item()

6.2 Processors in Detail

# In older Scrapy versions these lived in scrapy.loader.processors
from itemloaders.processors import (
    TakeFirst, MapCompose, Join, Compose, Identity, SelectJmes
)

# TakeFirst: return the first non-null, non-empty value in the list
TakeFirst()

# MapCompose: apply a chain of functions to each element of the list
MapCompose(str.strip, str.upper)

# Join: join the elements of the list into a single string
Join(' ')

# Compose: apply a chain of functions to the list as a whole
Compose(lambda x: x[0].strip())

# Identity: return the values unchanged
Identity()

# SelectJmes: extract data with a JMESPath expression (requires the jmespath package)
SelectJmes('name')
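
Processors are plain callables that receive the list of extracted values, so their behaviour is easy to try out in an interpreter:

from itemloaders.processors import TakeFirst, MapCompose, Join

MapCompose(str.strip)(['  Scrapy  ', ' tutorial '])  # ['Scrapy', 'tutorial']
TakeFirst()(['', None, 'first non-empty'])           # 'first non-empty'
Join(', ')(['a', 'b', 'c'])                          # 'a, b, c'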

7 Worked Example: Scraping Movie Information

7.1 Creating the Project and Spider

scrapy startproject movie_spider
cd movie_spider
scrapy genspider -t crawl movie_crawler example.com

7.2 Defining the Item

# items.py
import scrapy

class MovieItem(scrapy.Item):
    title = scrapy.Field()
    director = scrapy.Field()
    actors = scrapy.Field()
    genre = scrapy.Field()
    release_date = scrapy.Field()
    rating = scrapy.Field()
    description = scrapy.Field()
    poster_url = scrapy.Field()
    url = scrapy.Field()  # needed because the spider stores response.url

7.3 Writing the Spider

# spiders/movie_crawler.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from movie_spider.items import MovieItem
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, Join  # older Scrapy: scrapy.loader.processors

class MovieLoader(ItemLoader):
    default_item_class = MovieItem
    default_output_processor = TakeFirst()
    actors_out = Join(', ')
    genre_out = Join(', ')

class MovieCrawlerSpider(CrawlSpider):
    name = 'movie_crawler'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/movies']

    rules = (
        # Extract links to movie detail pages
        Rule(LinkExtractor(allow=r'movies/\d+'), callback='parse_movie', follow=True),
        # Extract and follow "next page" links
        Rule(LinkExtractor(restrict_css='a.next-page'), follow=True),
    )

    def parse_movie(self, response):
        loader = MovieLoader(item=MovieItem(), response=response)

        # Extract data with CSS selectors
        loader.add_css('title', 'h1.movie-title::text')
        loader.add_css('director', 'div.director::text')
        loader.add_css('actors', 'div.actors a::text')
        loader.add_css('genre', 'div.genre span::text')
        loader.add_css('release_date', 'time.release-date::attr(datetime)')
        loader.add_css('rating', 'div.rating span.value::text')
        loader.add_css('description', 'div.description p::text')
        loader.add_css('poster_url', 'div.poster img::attr(src)')

        # XPath selectors can be mixed in as well; with TakeFirst, the first non-empty value wins
        loader.add_xpath('title', '//h1[@class="movie-title"]/text()')

        # Record the page URL alongside the scraped fields
        loader.add_value('url', response.url)

        yield loader.load_item()

7.4 Writing the Pipeline

# pipelines.py
import json

class MovieSpiderPipeline:
    def __init__(self):
        self.file = None
        self.first_item = True

    def open_spider(self, spider):
        # Open the output file when the spider starts
        self.file = open('movies.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.first_item = True

    def process_item(self, item, spider):
        # Normalize the rating field (it may be missing or empty)
        rating = item.get('rating')
        item['rating'] = float(rating) if rating else 0.0

        # Write the item as one JSON object in the array
        if not self.first_item:
            self.file.write(',\n')
        else:
            self.first_item = False

        line = json.dumps(dict(item), ensure_ascii=False, indent=2)
        self.file.write(line)

        return item

    def close_spider(self, spider):
        # Close the output file when the spider finishes
        self.file.write('\n]')
        self.file.close()

7.5 Configuring the Project

# settings.py
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1
COOKIES_ENABLED = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

ITEM_PIPELINES = {
    'movie_spider.pipelines.MovieSpiderPipeline': 300,
}

7.6 Running the Spider

scrapy crawl movie_crawler
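
For a simple JSON dump like this one, Scrapy's built-in feed exports can replace the custom pipeline entirely; in recent Scrapy versions:

scrapy crawl movie_crawler -O movies.json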

8 Summary

In this episode we covered the more advanced techniques of writing Scrapy spiders, including:

  1. Crawling sites automatically with CrawlSpider
  2. Simulating form submission with FormRequest
  3. Writing custom middleware (downloader middleware and spider middleware)
  4. Writing custom extensions
  5. Handling JavaScript-rendered pages
  6. Simplifying data extraction with ItemLoader
  7. A worked example: scraping movie information

These techniques make it much easier to write complex spiders and handle a wide range of crawling scenarios.
