scrapy利用selenium爬取豆瓣阅读的全步骤


Posted in Python on September 20, 2020

首先创建scrapy项目

命令:scrapy startproject douban_read

创建spider

命令:scrapy genspider douban_spider url

网址:https://read.douban.com/charts

关键注释代码中有,若有不足,请多指教

scrapy项目目录结构如下

scrapy利用selenium爬取豆瓣阅读的全步骤

douban_spider.py文件代码

爬虫文件

import scrapy
import re, json

from ..items import DoubanReadItem


class DoubanSpiderSpider(scrapy.Spider):
    """Spider that collects book records from the Douban Read charts.

    The charts landing page is parsed for the per-category navigation links;
    each link is translated into the JSON endpoint that backs that category,
    and every entry of the returned list becomes one ``DoubanReadItem``.
    """

    name = 'douban_spider'
    start_urls = ['https://read.douban.com/charts']

    # Captures the query parameters that sit between "charts?" and "&dcs" in a
    # navigation link such as
    # /charts?type=unfinished_column&index=featured&dcs=charts&dcm=charts-nav
    _chart_params_re = re.compile(r'charts\?(.*?)&dcs')

    # JSON endpoint for one chart category, e.g.
    # https://read.douban.com/j/index//charts?type=intermediate_finalized&index=science_fiction&verbose=1
    _ajax_url_template = 'https://read.douban.com/j/index//charts?{}&verbose=1'

    def parse(self, response):
        """Yield one JSON request per chart category found on the landing page.

        :param response: response for the charts landing page
        :return: generator of ``scrapy.Request`` objects handled by ``parse_ajax``
        """
        # Category links from the rankings navigation bar; the first anchor is skipped.
        type_urls = response.xpath('//div[@class="rankings-nav"]/a[position()>1]/@href').extract()
        for type_url in type_urls:
            match = self._chart_params_re.search(type_url)
            if match is None:
                # A link without the expected query layout cannot be mapped to
                # the JSON endpoint; skip it instead of crashing the whole crawl.
                self.logger.warning('Skipping chart link with unexpected format: %s', type_url)
                continue
            ajax_url = self._ajax_url_template.format(match.group(1))
            # request_type=ajax is read by the downloader middleware to decide
            # how this request is downloaded.
            yield scrapy.Request(ajax_url, callback=self.parse_ajax, encoding='utf-8', meta={'request_type': 'ajax'})

    def parse_ajax(self, response):
        """Turn the JSON payload of one chart category into items.

        :param response: response whose body is the category's JSON document
        :return: generator of ``DoubanReadItem`` objects for the item pipeline
        """
        json_data = json.loads(response.text)
        # A payload without a "list" key simply produces no items.
        for data in json_data.get('list', []):
            works = data['works']
            item = DoubanReadItem()
            item['book_id'] = works['id']
            item['book_url'] = works['url']
            item['book_title'] = works['title']
            item['book_author'] = works['author']
            item['book_cover_image'] = works['cover']
            item['book_abstract'] = works['abstract']
            item['book_wordCount'] = works['wordCount']
            item['book_kinds'] = works['kinds']
            # Hand the item over to the item pipeline.
            yield item

item.py文件代码

项目的目标文件

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanReadItem(scrapy.Item):
    """One book record scraped from the Douban Read charts.

    Each field is filled by the spider from the same-purpose key of a chart
    entry's "works" object.
    """

    # Identifier and link of the book.
    book_id = scrapy.Field()
    book_url = scrapy.Field()

    # Descriptive metadata.
    book_title = scrapy.Field()
    book_author = scrapy.Field()
    book_cover_image = scrapy.Field()
    book_abstract = scrapy.Field()

    # Size and classification.
    book_wordCount = scrapy.Field()
    book_kinds = scrapy.Field()

my_download_middle.py文件代码

所有request都会经过下载中间件,可以通过定制中间件,来完成设置代理,动态设置请求头,自定义下载等操作

import random
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from scrapy.http.response.html import HtmlResponse


class MymiddleWares(object):
    """Downloader middleware that chooses how each request is downloaded.

    Requests without ``meta['request_type']`` are rendered in a Selenium-driven
    Chrome instance and answered directly with the rendered HTML. Requests
    tagged by the spider (``request_type`` set, e.g. ``'ajax'``) are left to
    Scrapy's own downloader after receiving a random User-Agent and an
    ``X-Requested-With`` header.
    """

    # Seconds to wait after loading a page so that client-side scripts can
    # finish building the DOM before the page source is read.
    RENDER_WAIT_SECONDS = 3

    def __init__(self):
        # Pool of User-Agent strings; one is picked at random per AJAX request.
        self.USER_AGENT_LIST = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

    def process_request(self, request, spider):
        """Handle a request before it reaches the downloader.

        :param request: the request that is about to be downloaded
        :param spider: the spider that issued the request (unused)
        :return: an ``HtmlResponse`` with the Selenium-rendered page for
            untagged requests, otherwise ``None`` so that Scrapy's downloader
            fetches the request itself
        """
        # The spider tags JSON requests with meta['request_type'] = 'ajax';
        # anything untagged is an HTML page that has to be rendered.
        if not request.meta.get('request_type'):
            return self._render_with_selenium(request)

        # Tagged (AJAX) requests return JSON directly and are downloaded by
        # Scrapy. The User-Agent is assigned rather than set as a default:
        # Scrapy's built-in header middlewares have already populated it by the
        # time this middleware runs, so setdefault() would never apply the
        # random choice.
        request.headers['User-Agent'] = random.choice(self.USER_AGENT_LIST)
        request.headers.setdefault('X-Requested-With', 'XMLHttpRequest')
        return None

    def _render_with_selenium(self, request):
        """Load ``request.url`` in Chrome and return the rendered page as a response."""
        print('in middler')
        driver = webdriver.Chrome()
        try:
            driver.get(request.url)
            # Give client-side scripts time to finish rendering.
            time.sleep(self.RENDER_WAIT_SECONDS)
            html_str = driver.page_source
        finally:
            # Always shut the browser down; otherwise every rendered request
            # leaves a Chrome process behind.
            driver.quit()
        # Returning a response here means the downloader never fetches this
        # request, which is how the custom download is achieved.
        return HtmlResponse(url=request.url, body=html_str, request=request, encoding='utf-8')

pipeline.py文件代码

项目的管道文件

import pymongo
from itemadapter import ItemAdapter


class MongoPipeline:
    """Item pipeline that upserts every scraped book into MongoDB.

    Connection details come from the ``MONGO_URI`` and ``MONGO_DATABASE``
    settings; documents are stored in the ``book`` collection and keyed by
    ``book_id``.
    """

    # Name of the collection the items are written to.
    collection_name = 'book'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        :param mongo_uri: value passed to ``pymongo.MongoClient``
        :param mongo_db: name of the database to write to
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        :param crawler: object whose ``settings`` provides ``MONGO_URI`` and
            ``MONGO_DATABASE`` (the latter defaults to ``'items'``)
        :return: a new pipeline instance
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
        )

    def open_spider(self, spider):
        """Open the MongoDB connection when the spider starts.

        :param spider: the spider being opened (unused)
        """
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes.

        :param spider: the spider being closed (unused)
        """
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item, or update the stored document with the same ``book_id``.

        :param item: the scraped item; must contain ``book_id``
        :param spider: the spider that produced the item (unused)
        :return: the unchanged item, so later pipelines can process it
        """
        # update_one(..., upsert=True): modify the matching document if it
        # exists, insert a new one otherwise. Collection.update() performed the
        # same single-document upsert but no longer exists in PyMongo 4.
        self.db[self.collection_name].update_one(
            {'book_id': item['book_id']},
            {'$set': dict(item)},
            upsert=True,
        )
        print(item)
        return item

settings.py文件代码

配置信息

# Scrapy settings for the douban_read project.
#
# Only the settings that are important or commonly used are listed here.
# The full reference is available at:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_read'

# Package in which new spiders are created; it is also the only package
# searched for spiders.
NEWSPIDER_MODULE = 'douban_read.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban_read (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    ),
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'douban_read.middlewares.DoubanReadSpiderMiddleware': 543,
#}

# Downloader middlewares: enable the project's custom middleware.
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {'douban_read.my_download_middle.MymiddleWares': 543}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines: enable the MongoDB pipeline.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {'douban_read.pipelines.MongoPipeline': 300}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection used by the item pipeline.
MONGO_URI = 'localhost'
# Database the scraped books are stored in.
MONGO_DATABASE = 'douban_read'

最后启动该项目即可

scrapy crawl douban_spider

数据就保存到mongo数据库了

scrapy利用selenium爬取豆瓣阅读的全步骤

总结

到此这篇关于scrapy利用selenium爬取豆瓣阅读的文章就介绍到这了,更多相关scrapy用selenium爬取豆瓣阅读内容请搜索三水点靠木以前的文章或继续浏览下面的相关文章,希望大家以后多多支持三水点靠木!

Python 相关文章推荐
Python中利用函数装饰器实现备忘功能
Mar 30 Python
Python的组合模式与责任链模式编程示例
Feb 02 Python
python+opencv识别图片中的圆形
Mar 25 Python
python和opencv实现抠图
Jul 18 Python
Python中logging.NullHandler 的使用教程
Nov 29 Python
对Python中DataFrame选择某列值为XX的行实例详解
Jan 29 Python
python实现坦克大战
Apr 24 Python
部署Django到阿里云服务器教程示例
Jun 03 Python
如何在mac版pycharm选择python版本
Jul 21 Python
详解python内置模块urllib
Sep 09 Python
自己搭建resnet18网络并加载torchvision自带权重的操作
May 13 Python
只用40行Python代码就能写出pdf转word小工具
May 31 Python
Python操作dict时避免出现KeyError的几种解决方法
Sep 20 #Python
python中random.randint和random.randrange的区别详解
Sep 20 #Python
详解如何在pyqt中通过OpenCV实现对窗口的透视变换
Sep 20 #Python
Python Pillow(PIL)库的用法详解
Sep 19 #Python
Python自动化xpath实现自动抢票抢货
Sep 19 #Python
python 贪心算法的实现
Sep 18 #Python
详解KMP算法以及python如何实现
Sep 18 #Python
You might like
PHP 出现乱码和Sessions验证问题的解决方法!
2008/12/06 PHP
PHP 删除文件与文件夹操作 unlink()与rmdir()这两个函数的使用
2011/07/17 PHP
PHP取二进制文件头快速判断文件类型的实现代码
2013/08/05 PHP
php中this关键字用法分析
2016/12/07 PHP
thinkPHP5.0框架独立配置与动态配置方法
2017/03/17 PHP
PHP实现redis限制单ip、单用户的访问次数功能示例
2018/06/16 PHP
如何解决手机浏览器页面点击不跳转浏览器双击放大网页
2016/07/01 Javascript
javascript实现图片左右滚动效果【可自动滚动,有左右按钮】
2016/09/19 Javascript
node.js 中间件express-session使用详解
2017/05/20 Javascript
在vue中使用Autoprefixed的方法
2018/07/27 Javascript
layui获取选中行数据的实例讲解
2018/08/19 Javascript
vuex2中使用mapGetters/mapActions报错的解决方法
2018/10/20 Javascript
微信小程序Page中data数据操作和函数调用方法
2019/05/08 Javascript
vue实现登录页面的验证码以及验证过程解析(面向新手)
2019/08/02 Javascript
[06:24]DOTA2 2015国际邀请赛中国区预选赛第二日TOP10
2015/05/27 DOTA
简明 Python 基础学习教程
2007/02/08 Python
python将html转成PDF的实现代码(包含中文)
2013/03/04 Python
Python二叉树的镜像转换实现方法示例
2019/03/06 Python
将python文件打包成EXE应用程序的方法
2019/05/22 Python
详解解决Python memory error的问题(四种解决方案)
2019/08/08 Python
python3的数据类型及数据类型转换实例详解
2019/08/20 Python
python正则表达式的懒惰匹配和贪婪匹配说明
2020/07/13 Python
使用BeautifulSoup4解析XML的方法小结
2020/12/07 Python
北美三大旅游网站之一:Travelocity
2017/08/12 全球购物
捷克时尚网上商店:OTTO
2018/03/15 全球购物
美国椅子和沙发制造商:La-Z-Boy
2020/10/25 全球购物
介绍java中初始化块的使用
2012/09/11 面试题
js实现弹框效果
2021/03/24 Javascript
会计自荐书
2013/12/02 职场文书
社区居务公开实施方案
2014/03/27 职场文书
应聘教师自荐书
2014/06/16 职场文书
先进基层党组织事迹材料
2014/12/25 职场文书
2015幼儿园新学期寄语
2015/02/27 职场文书
2015新生加入学生会自荐书
2015/03/24 职场文书
2019西餐厅创业计划书范文!
2019/07/12 职场文书
Vue项目中如何封装axios(统一管理http请求)
2021/05/02 Vue.js