Episode 155: Writing Scrapy Spiders

1 Automatic crawling with CrawlSpider

1.1 What is CrawlSpider?

CrawlSpider is a higher-level spider class provided by Scrapy. By defining a set of rules it automatically extracts links and follows them, which makes it well suited to crawls that need to traverse an entire site.

1.2 Creating a CrawlSpider

scrapy genspider -t crawl mycrawler example.com

1.3 Writing the CrawlSpider code
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from myspider.items import ArticleItem
class MyCrawlerSpider(CrawlSpider):
    name = 'mycrawler'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    # Define the crawling rules
    rules = (
        # Extract links matching the regex and parse them with parse_item
        Rule(LinkExtractor(allow=r'articles/\d+'), callback='parse_item', follow=True),
        # Follow "next page" links without a callback
        Rule(LinkExtractor(restrict_css='a.next-page'), follow=True),
    )

    def parse_item(self, response):
        item = ArticleItem()
        item['title'] = response.css('h1.article-title::text').get()
        item['content'] = response.css('div.article-content::text').getall()
        item['author'] = response.css('span.author::text').get()
        item['publish_date'] = response.css('time.publish-date::attr(datetime)').get()
        yield item
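The spider imports ArticleItem from myspider.items. For reference, a minimal items.py that matches the fields used here might look like the sketch below (the field list is inferred from the spider code above):

# items.py (minimal sketch matching the fields used above)
import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    author = scrapy.Field()
    publish_date = scrapy.Field()
    url = scrapy.Field()  # used later by the ItemLoader example in section 6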
1.4 LinkExtractor in detail

LinkExtractor extracts links from responses and can filter them in several ways:
# Extract all links (no filtering)
LinkExtractor(allow=(), deny=())
# Keep only links whose URL matches a regular expression
LinkExtractor(allow=r'articles/\d+')
# Exclude specific links
LinkExtractor(deny=r'/login|/admin')
# Only extract links from regions selected by a CSS selector
LinkExtractor(restrict_css='div.content')
# Only extract links from regions selected by an XPath expression
LinkExtractor(restrict_xpaths='//div[@class="content"]')
# Note: crawl depth is not a LinkExtractor argument; limit it with the
# DEPTH_LIMIT setting in settings.py instead, e.g. DEPTH_LIMIT = 1
# Restrict which tags are scanned for links
LinkExtractor(tags=['a', 'area'])
# Restrict which attributes are scanned for links
LinkExtractor(attrs=['href'])
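To check what a given LinkExtractor actually matches, you can call its extract_links() method on a response, for example inside scrapy shell (a small sketch; the URL is a placeholder):

# In scrapy shell: scrapy shell 'http://example.com/'
from scrapy.linkextractors import LinkExtractor

extractor = LinkExtractor(allow=r'articles/\d+')
for link in extractor.extract_links(response):  # 'response' is provided by scrapy shell
    print(link.url, link.text)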
2 Simulating form submissions with FormRequest

2.1 Basic usage
import scrapy
class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['http://example.com/login']

    def parse(self, response):
        # Extract the CSRF token from the login form
        csrf_token = response.css('input[name="csrf_token"]::attr(value)').get()
        # Submit the login form
        return scrapy.FormRequest(
            url='http://example.com/login',
            formdata={
                'username': 'myusername',
                'password': 'mypassword',
                'csrf_token': csrf_token
            },
            callback=self.after_login
        )

    def after_login(self, response):
        # Check whether the login succeeded
        if 'Welcome, myusername' in response.text:
            self.logger.info('Login successful!')
            # Continue crawling after a successful login
            yield scrapy.Request('http://example.com/dashboard', callback=self.parse_dashboard)
        else:
            self.logger.error('Login failed!')

    def parse_dashboard(self, response):
        # Parse the dashboard data
        # ...

2.2 Using FormRequest.from_response
def parse(self, response):
    # from_response pre-fills the form fields found on the page (including
    # hidden inputs such as CSRF tokens); we only override the credentials
    return scrapy.FormRequest.from_response(
        response,
        formxpath='//form[@id="login-form"]',
        formdata={
            'username': 'myusername',
            'password': 'mypassword'
        },
        callback=self.after_login
    )

3 Custom middlewares
3.1 Downloader middlewares
Downloader middlewares process requests and responses, and are the place to implement features such as proxy rotation, User-Agent switching, and adding request headers.
# middlewares.py
import random


class ProxyMiddleware:
    def process_request(self, request, spider):
        # Route the request through a proxy
        request.meta['proxy'] = 'http://proxy.example.com:8080'
        return None


class UserAgentMiddleware:
    def process_request(self, request, spider):
        # Pick a random User-Agent for each request
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
        request.headers['User-Agent'] = random.choice(user_agents)
        return None


class CustomDownloaderMiddleware:
    def process_request(self, request, spider):
        # Called before the request is sent
        spider.logger.info(f'Sending request to {request.url}')
        return None

    def process_response(self, request, response, spider):
        # Called after the response is received
        if response.status == 403:
            spider.logger.error(f'Forbidden: {request.url}')
        # May return a new request or a modified response
        return response

    def process_exception(self, request, exception, spider):
        # Called when the download raises an exception
        spider.logger.error(f'Exception: {exception} for {request.url}')
        # May return a new request here, or None to let other middlewares handle it
        return None

3.2 Enabling the middlewares
Enable the middlewares in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.ProxyMiddleware': 543,
    'myspider.middlewares.UserAgentMiddleware': 544,
    'myspider.middlewares.CustomDownloaderMiddleware': 545,
}
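The numeric values determine the order in which the middlewares are applied (lower values run closer to the engine). A built-in middleware can be switched off by mapping it to None; for example, since the custom class above sets the User-Agent itself, the default middleware could be disabled (a sketch of the same setting with one extra entry):

DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.ProxyMiddleware': 543,
    'myspider.middlewares.UserAgentMiddleware': 544,
    'myspider.middlewares.CustomDownloaderMiddleware': 545,
    # Disable Scrapy's built-in User-Agent middleware so the custom one takes over
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}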
3.3 Spider middlewares

Spider middlewares process the spider's input (responses) and output (requests and items).
# middlewares.py
class CustomSpiderMiddleware:
    def process_spider_input(self, response, spider):
        # Process the spider's input (the response); return None to continue
        return None

    def process_spider_output(self, response, result, spider):
        # Process the spider's output (requests and items)
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Handle exceptions raised inside the spider
        spider.logger.error(f'Spider exception: {exception}')
        return None

    def process_start_requests(self, start_requests, spider):
        # Process the start requests
        for r in start_requests:
            yield r
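Like downloader middlewares, spider middlewares have to be enabled in settings.py, just under a different setting; a minimal sketch for the class above:

# settings.py
SPIDER_MIDDLEWARES = {
    'myspider.middlewares.CustomSpiderMiddleware': 543,
}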
4 Custom extensions

4.1 A basic extension
# extensions.py
from scrapy import signals


class CustomExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        self.items_scraped = 0
        # Connect handlers to the crawler's signals
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s' % spider.name)
        spider.logger.info('Total items scraped: %d' % self.items_scraped)

    def item_scraped(self, item, spider):
        self.items_scraped += 1
        if self.items_scraped % 100 == 0:
            spider.logger.info('Scraped %d items' % self.items_scraped)

4.2 Enabling the extension
Enable the extension in settings.py:
EXTENSIONS = {
    'myspider.extensions.CustomExtension': 500,
}
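A common refinement (not part of the example above) is to let the extension turn itself off via a setting by raising NotConfigured in from_crawler; a minimal sketch, assuming a custom MYEXT_ENABLED setting name:

# extensions.py (a variant of from_crawler; MYEXT_ENABLED is an assumed setting name)
from scrapy.exceptions import NotConfigured

class CustomExtension:
    # ... signal handlers as above ...

    @classmethod
    def from_crawler(cls, crawler):
        # Disable the extension unless the setting is switched on
        if not crawler.settings.getbool('MYEXT_ENABLED', True):
            raise NotConfigured
        return cls(crawler)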
5 Handling JavaScript-rendered pages

5.1 Using Selenium
# middlewares.py
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
class SeleniumMiddleware:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run Chrome without a visible window
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

    def process_request(self, request, spider):
        # Only handle requests that explicitly ask for JavaScript rendering
        if request.meta.get('render_js', False):
            self.driver.get(request.url)
            time.sleep(2)  # Crude wait for the page to finish loading
            # Grab the rendered HTML
            body = self.driver.page_source
            # Return a response built from the rendered page
            from scrapy.http import HtmlResponse
            return HtmlResponse(
                url=request.url,
                body=body,
                encoding='utf-8',
                request=request
            )
        return None

    def __del__(self):
        self.driver.quit()
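The middleware still has to be enabled in settings.py before Scrapy will use it; a minimal sketch (the priority value is an arbitrary choice):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.SeleniumMiddleware': 600,
}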
5.2 Usage

def parse(self, response):
    # Regular requests
    # ...
    # Requests that need JavaScript rendering
    yield scrapy.Request(
        url='http://example.com/dynamic-page',
        callback=self.parse_dynamic,
        meta={'render_js': True}
    )

def parse_dynamic(self, response):
    # Parse the dynamically rendered page
    # ...

6 Simplifying data extraction with ItemLoader
6.1 Basic usage
from scrapy.loader import ItemLoader
# In recent Scrapy versions these processors are provided by the itemloaders
# package (from itemloaders.processors import ...); the import below matches
# older releases where scrapy.loader.processors is still available.
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from myspider.items import ArticleItem


class ArticleLoader(ItemLoader):
    default_item_class = ArticleItem
    default_output_processor = TakeFirst()
    # Field-specific processors
    title_in = MapCompose(str.strip)
    content_out = Join(' ')
    author_in = MapCompose(str.strip)


# A spider callback that uses the loader
def parse_item(self, response):
    loader = ArticleLoader(item=ArticleItem(), response=response)
    # Populate the fields
    loader.add_css('title', 'h1.article-title::text')
    loader.add_css('content', 'div.article-content p::text')
    loader.add_css('author', 'span.author::text')
    loader.add_css('publish_date', 'time.publish-date::attr(datetime)')
    loader.add_value('url', response.url)
    # Build and return the item
    yield loader.load_item()

6.2 Processors in detail
from scrapy.loader.processors import (
    TakeFirst, MapCompose, Join, Compose, Identity, SelectJmes
)

# TakeFirst: return the first non-empty value in the list
TakeFirst()
# MapCompose: apply a chain of functions to each element of the list
MapCompose(str.strip, str.upper)
# Join: join the elements of the list into a single string
Join(' ')
# Compose: apply a chain of functions to the list as a whole
Compose(lambda x: x[0].strip())
# Identity: return the values unchanged
Identity()
# SelectJmes: extract data from JSON with a JMESPath expression
SelectJmes('name')
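As a quick illustration of how input and output processors combine, here is a small sketch using the ArticleLoader defined in 6.1 (the input values are made up):

loader = ArticleLoader()
loader.add_value('title', ['  Hello World  '])   # title_in strips, TakeFirst picks 'Hello World'
loader.add_value('content', ['First line.', 'Second line.'])  # content_out joins with a space
item = loader.load_item()
# item['title']   == 'Hello World'
# item['content'] == 'First line. Second line.'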
7 Worked example: scraping movie information

7.1 Creating the project and spider
scrapy startproject movie_spider
cd movie_spider
scrapy genspider -t crawl movie_crawler example.com

7.2 Defining the Item
# items.py
import scrapy
class MovieItem(scrapy.Item):
    title = scrapy.Field()
    director = scrapy.Field()
    actors = scrapy.Field()
    genre = scrapy.Field()
    release_date = scrapy.Field()
    rating = scrapy.Field()
    description = scrapy.Field()
    poster_url = scrapy.Field()
    url = scrapy.Field()  # needed because the spider below calls loader.add_value('url', ...)

7.3 Writing the spider
# spiders/movie_crawler.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from movie_spider.items import MovieItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join
class MovieLoader(ItemLoader):
    default_item_class = MovieItem
    default_output_processor = TakeFirst()
    actors_out = Join(', ')
    genre_out = Join(', ')


class MovieCrawlerSpider(CrawlSpider):
    name = 'movie_crawler'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/movies']

    rules = (
        # Extract links to movie detail pages
        Rule(LinkExtractor(allow=r'movies/\d+'), callback='parse_movie', follow=True),
        # Follow "next page" links
        Rule(LinkExtractor(restrict_css='a.next-page'), follow=True),
    )

    def parse_movie(self, response):
        loader = MovieLoader(item=MovieItem(), response=response)
        # Extract data with CSS selectors
        loader.add_css('title', 'h1.movie-title::text')
        loader.add_css('director', 'div.director::text')
        loader.add_css('actors', 'div.actors a::text')
        loader.add_css('genre', 'div.genre span::text')
        loader.add_css('release_date', 'time.release-date::attr(datetime)')
        loader.add_css('rating', 'div.rating span.value::text')
        loader.add_css('description', 'div.description p::text')
        loader.add_css('poster_url', 'div.poster img::attr(src)')
        # The same field can also be populated with an XPath selector
        loader.add_xpath('title', '//h1[@class="movie-title"]/text()')
        # Add extra fields directly
        loader.add_value('url', response.url)
        yield loader.load_item()

7.4 Writing the pipeline
# pipelines.py
import json
import os
class MovieSpiderPipeline:
    def __init__(self):
        self.file = None

    def open_spider(self, spider):
        # Open the output file when the spider starts
        self.file = open('movies.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.first_item = True

    def process_item(self, item, spider):
        # Normalise the rating field
        item['rating'] = float(item['rating']) if item.get('rating') else 0.0
        # Write the item to the file
        if not self.first_item:
            self.file.write(',\n')
        else:
            self.first_item = False
        line = json.dumps(dict(item), ensure_ascii=False, indent=2)
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Close the file when the spider finishes
        self.file.write('\n]')
        self.file.close()
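For straightforward JSON output like this, Scrapy's built-in feed exports can replace the hand-written pipeline entirely; a minimal sketch using the FEEDS setting:

# settings.py
FEEDS = {
    'movies.json': {
        'format': 'json',
        'encoding': 'utf8',
        'indent': 2,
    },
}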
7.5 Configuring the project

# settings.py
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1
COOKIES_ENABLED = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
ITEM_PIPELINES = {
    'movie_spider.pipelines.MovieSpiderPipeline': 300,
}

7.6 Running the spider
scrapy crawl movie_crawler

8 Summary
In this episode we covered the more advanced techniques of writing Scrapy spiders, including:
- Automatic crawling with CrawlSpider
- Simulating form submissions with FormRequest
- Writing custom middlewares (downloader and spider middlewares)
- Writing custom extensions
- Handling JavaScript-rendered pages
- Simplifying data extraction with ItemLoader
- A worked example: scraping movie information
These techniques help us write complex spiders more efficiently and handle a wide range of crawling scenarios.