Scrapy项目实战之爬取某社区用户详情


Posted in Python on September 17, 2020

本文介绍了Scrapy项目实战之爬取某社区用户详情,分享给大家,具体如下:

get_cookies.py

from selenium import webdriver
from pymongo import MongoClient
from scrapy.crawler import overridden_settings
# from segmentfault import settings
import time
import settings

class GetCookies(object):
    """Log in to SegmentFault with Selenium and store the session cookies in MongoDB.

    Accounts are read from ``settings.USER_LIST``. Each successful login
    produces one document in the ``cookies`` collection of the database
    named by ``settings.MONGO_DB``.
    """

    # Path of the chromedriver executable handed to Selenium.
    CHROMEDRIVER_PATH = "/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver"
    # Login form URL.
    LOGIN_URL = "https://segmentfault.com/user/login"
    # Page opened after submitting the login form, before cookies are read.
    AFTER_LOGIN_URL = "https://segmentfault.com/u/luwangmeilun/users/following"

    def __init__(self):
        """Prepare the webdriver options, the account list and the MongoDB handles."""
        # Webdriver options (headless mode is left disabled).
        self.opt = webdriver.ChromeOptions()
        # self.opt.add_argument("--headless")
        # Accounts to log in with: iterable of (username, password) pairs.
        self.user_list = settings.USER_LIST
        # MongoDB connection and target collection.
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """Log in with one account and return the browser's cookies.

        :param username: account name typed into the ``username`` field
        :param password: password typed into the ``password`` field
        :return: the list of cookie dicts reported by ``driver.get_cookies()``
        """
        driver = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH,
                                  options=self.opt)
        # The browser must be shut down even when a lookup or navigation
        # raises; otherwise every failed login leaves a Chrome process behind.
        try:
            driver.get(self.LOGIN_URL)
            driver.find_element_by_name("username").send_keys(username)
            driver.find_element_by_name("password").send_keys(password)
            driver.find_element_by_xpath("//button[@type='submit']").click()
            # Give the login request time to complete before navigating on.
            time.sleep(2)
            driver.get(self.AFTER_LOGIN_URL)
            # Cookies are read after the logged-in page has been requested.
            return driver.get_cookies()
        finally:
            driver.quit()

    def format_cookies(self, cookies):
        """Reduce Selenium cookie dicts to a flat ``{name: value}`` mapping.

        ``driver.get_cookies()`` returns a list such as::

            [{'domain': 'segmentfault.com', 'httpOnly': False,
              'name': 'PHPSESSID', 'path': '/', 'secure': False,
              'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
             {'domain': '.segmentfault.com', 'expiry': 1550067000,
              'httpOnly': False, 'name': '_gat', 'path': '/',
              'secure': False, 'value': '1'}]

        Only ``name`` and ``value`` of each entry are kept.

        :param cookies: iterable of cookie dicts with ``name`` and ``value`` keys
        :return: dict mapping cookie name to cookie value
        """
        return {entry['name']: entry['value'] for entry in cookies}

    def save(self):
        """Log in with every configured account and insert its cookies into MongoDB."""
        print("开始获取Cookies....")
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # Store the flattened cookies as one document.
            self.collection.insert_one(f_cookies)


if __name__ == '__main__':
    # Run the login-and-store pass 20 times in a row; every pass inserts
    # one cookie document per configured account.
    cookies = GetCookies()
    for _ in range(20):
        cookies.save()

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SegmentfaultItem(scrapy.Item):
 """Fields scraped for one SegmentFault user."""
 # --- Personal profile ---
 # Display name
 name = scrapy.Field()
 # Reputation
 rank = scrapy.Field()
 # School
 school = scrapy.Field()
 # Major
 majors = scrapy.Field()
 # Company
 company = scrapy.Field()
 # Job title
 job = scrapy.Field()
 # Personal blog URL
 blog = scrapy.Field()
 # --- Social activity counters ---
 # Number of users followed
 following = scrapy.Field()
 # Number of followers
 fans = scrapy.Field()
 # Number of answers
 answers = scrapy.Field()
 # Number of questions asked
 questions = scrapy.Field()
 # Number of articles
 articles = scrapy.Field()
 # Number of lives (talks)
 lives = scrapy.Field()
 # Number of badges
 badges = scrapy.Field()
 # --- Skills ---
 # Number of likes received
 like = scrapy.Field()
 # Skill tags
 skills = scrapy.Field()
 # Registration date
 register_date = scrapy.Field()
 # --- Q&A statistics ---
 # Highest vote count among the user's answers
 answers_top_score = scrapy.Field()
 # Title of the question the top-voted answer belongs to
 answers_top_title = scrapy.Field()
 # Tags of the question the top-voted answer belongs to
 answers_top_tags = scrapy.Field()
 # Body of the question the top-voted answer belongs to
 answers_top_question = scrapy.Field()
 # Body of the top-voted answer itself
 answers_top_content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class SegmentfaultPipeline(object):
    """Item pipeline that writes every scraped item into a MongoDB collection."""

    # Name of the MongoDB collection receiving the items.
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection URI and database name for ``open_spider``."""
        self.mongo_uri, self.mongo_db = mongo_uri, mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        ``MONGO_URI`` is read as-is; ``MONGO_DB`` falls back to
        ``'segmentfault'`` when it is not configured.
        """
        conf = crawler.settings
        uri = conf.get('MONGO_URI')
        db_name = conf.get('MONGO_DB', 'segmentfault')
        return cls(mongo_uri=uri, mongo_db=db_name)

    def open_spider(self, spider):
        """Connect to MongoDB when the spider starts."""
        connection = pymongo.MongoClient(self.mongo_uri)
        self.client = connection
        self.db = connection[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a plain-dict copy of *item* and hand the item on unchanged."""
        target = self.db[self.collection_name]
        target.insert_one(dict(item))
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#  https://doc.scrapy.org/en/latest/topics/settings.html
#  https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#  https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'segmentfault'

# Packages searched for spiders / used for newly generated spiders.
SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'


# Default User-Agent string (a desktop Chrome signature).
# NOTE(review): DOWNLOADER_MIDDLEWARES disables Scrapy's UserAgentMiddleware and
# SegmentfaultUserAgentMiddleware sets the header per request from
# USER_AGENT_LIST, so this value is presumably never sent - confirm.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Failed requests are not retried.
RETRY_ENABLED = False

# Redirects are not followed automatically. SegmentfaultCookiesMiddleware
# inspects 301/302 responses itself to detect redirects to the login page.
REDIRECT_ENABLED = False

# Downloader timeout (Scrapy's DOWNLOAD_TIMEOUT setting; in seconds per the Scrapy docs).
DOWNLOAD_TIMEOUT = 5

# HTTPALLOW

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}


# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # The random-proxy middleware is currently switched off.
    # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
    'segmentfault.middlewares.SegmentfaultUserAgentMiddleware': 643,
    # Must run BEFORE Scrapy's built-in CookiesMiddleware (order 700):
    # process_request hooks run in ascending order, and the built-in
    # middleware is what turns ``request.cookies`` into the Cookie header.
    # At the previous order (743) the cookies were assigned too late to be
    # sent with the request.
    'segmentfault.middlewares.SegmentfaultCookiesMiddleware': 650,
    # Built-in proxy and user-agent middlewares are disabled.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection settings (read by the pipeline, the spider, the cookies
# middleware and get_cookies.py).
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Accounts used by get_cookies.py to log in: (username, password) pairs.
# NOTE(review): these are plaintext credentials checked into the settings
# file - consider loading them from the environment or a secrets store.
USER_LIST = [
 ("798549150@qq.com","guoqing1010"),
 ("learnscrapy@163.com","guoqing1010"),
]

# Proxy pool read by SegmentfaultHttpProxyMiddleware (one entry is picked at
# random per request). That middleware is commented out in
# DOWNLOADER_MIDDLEWARES, so the list is currently unused.
PROXY_LIST = [
 'http://115.182.212.169:8080',
 'http://121.61.25.149:9999',
 'http://180.118.247.189:9000',
 'http://115.151.3.12:9999',
 'http://183.154.213.160:9000',
 'http://113.128.9.106:9999',
 'http://124.42.68.152:90',
 'http://49.70.48.50:9999',
 'http://113.128.11.172:9999',
 'http://111.177.177.40:9999',
 'http://59.62.83.253:9999',
 'http://39.107.84.185:8123',
 'http://124.94.195.107:9999',
 'http://111.177.160.132:9999',
 'http://120.25.203.182:7777'
]

# User-Agent pool read by SegmentfaultUserAgentMiddleware, which picks one
# entry at random for every outgoing request.
USER_AGENT_LIST = [
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
 'Opera/8.0 (Windows NT 5.1; U; en)',
 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]

userinfo.py

# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem


class UserinfoSpider(CrawlSpider):
    """Crawl SegmentFault user profiles starting from one "following" list.

    Profile pages are parsed by :meth:`parse_item`. The item is then carried
    through the top answer's question page (:meth:`parse_answer`) and the
    badge page (:meth:`parse_badge`) before it is emitted.
    """

    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

    rules = (
        # User profile pages: parse them and keep following their links.
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # "Following" list pages: followed only to discover more profiles.
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
    )

    # Cookie whose value is the current time as a 10-digit Unix timestamp.
    _VISIT_TIME_COOKIE = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'

    def start_requests(self):
        """Open the site with one stored cookie set, then continue in ``after_login``.

        :return: a list with the single bootstrap request, or an empty list
            when the ``cookies`` collection holds no document
        """
        conf = self.crawler.settings
        client = MongoClient(conf['MONGO_URI'])
        try:
            cookies = client[conf['MONGO_DB']].cookies.find_one()
        finally:
            # The connection is only needed for this single lookup.
            client.close()

        if cookies is None:
            # Without this guard the timestamp assignment below would fail
            # with a TypeError on None.
            self.logger.error("No cookies stored in MongoDB; run get_cookies.py first")
            return []

        # MongoDB's document id is not a site cookie and must not be sent.
        cookies.pop('_id', None)
        # Refresh the timestamp cookie with the current time.
        cookies[self._VISIT_TIME_COOKIE] = str(int(time.time()))

        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    def after_login(self, response):
        """Start the rule-driven crawl from every URL in ``start_urls``."""
        for url in self.start_urls:
            # No callback: the request is handled by CrawlSpider's default
            # parsing so that ``rules`` are applied to the response.
            yield Request(url, dont_filter=True)

    @staticmethod
    def _first_two(texts):
        """Return ``(texts[0], texts[1].strip())``, using '' for missing entries."""
        first = texts[0] if texts else ''
        second = texts[1].strip() if len(texts) > 1 else ''
        return first, second

    def parse_item(self, response):
        """Extract the profile fields and schedule the follow-up pages.

        :param response: response of a user profile page
        :return: generator yielding the next request, or the item itself
            when the profile links neither an answer nor a badge page
        """
        item = SegmentfaultItem()

        # --- Profile header ---
        profile_head = response.css('.profile__heading')
        item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School / major and company / job each come as up to two text nodes.
        item['school'], item['majors'] = self._first_two(
            profile_head.css('.profile__school::text').extract())
        item['company'], item['job'] = self._first_two(
            profile_head.css('.profile__company::text').extract())
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # --- Statistics panel ---
        profile_active = response.xpath("//div[@class='col-md-2']")
        # First number is the following count, second the follower count.
        follow_counts = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')
        item['following'] = follow_counts[0] if len(follow_counts) > 0 else None
        item['fans'] = follow_counts[1] if len(follow_counts) > 1 else None
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # --- Skill panel ---
        profile_skill = response.xpath("//div[@class='col-md-3']")
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
        # Raw text; presumably normalised later by the spider middleware.
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()

        # --- Output panel: top-voted answer ---
        profile_work = response.xpath("//div[@class='col-md-7']")
        item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
        item['answers_top_title'] = profile_work.css(
            '#navAnswer div[class*=title-warp] > a::text').extract_first()
        answer_url = profile_work.css(
            '#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

        # Absolute badge URL, or None when the profile links no badge page.
        badge_url = response.urljoin(badge_url) if badge_url else None

        if answer_url:
            yield Request(
                url=response.urljoin(answer_url),
                meta={'item': item, 'badge_url': badge_url},
                callback=self.parse_answer,
                # The request carries a half-filled item. If the duplicate
                # filter dropped it (e.g. two users whose top answers share a
                # question page) the item would be lost silently.
                dont_filter=True)
            return

        # No answer to visit. urljoin(None) would resolve to the profile URL
        # itself, which the duplicate filter discards together with the item.
        item['answers_top_tags'] = []
        item['answers_top_question'] = ''
        item['answers_top_content'] = ''
        if badge_url:
            yield Request(url=badge_url,
                          meta={'item': item},
                          callback=self.parse_badge,
                          dont_filter=True)
        else:
            yield item

    def parse_answer(self, response):
        """Add the question's tags, the question text and the answer text to the item."""
        item = response.meta['item']
        badge_url = response.meta['badge_url']
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Text fragments between tags, concatenated into one string.
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        item['answers_top_question'] = ''.join(question_content)
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        item['answers_top_content'] = ''.join(answer_content)

        if badge_url:
            # Continue with the badge page, passing the updated item along.
            yield Request(url=badge_url,
                          meta={'item': item},
                          callback=self.parse_badge,
                          dont_filter=True)
        else:
            yield item

    def parse_badge(self, response):
        """Replace ``item['badges']`` with a ``{badge name: count}`` mapping and emit the item."""
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        # zip() stops at the shorter list, so unequal lengths no longer raise.
        item['badges'] = dict(zip(badge_name, badge_count))
        yield item

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo
logger = logging.getLogger(__name__)


class SegmentfaultSpiderMiddleware(object):
    """Normalise the ``register_date`` of scraped items to ``YYYYMMDD``.

    Three forms of registration text are handled:

    1. ``注册于 2015年12月12日`` - an absolute date
    2. ``注册于 3 天前`` - a number of days ago
    3. ``注册于 5 小时前`` - a number of hours ago
    """

    @staticmethod
    def _normalize_register_date(register_date, now):
        """Convert a registration text to ``YYYYMMDD``.

        :param register_date: raw text scraped from the profile, or None
        :param now: ``datetime.datetime`` that relative forms are counted back from
        :return: the date as an 8-digit string, or None when the text is
            empty or contains no digits
        """
        if not register_date:
            return None
        # Absolute date: zero-pad month and day, otherwise '2015年1月2日'
        # would collapse to the ambiguous '201512'.
        full = re.search(r'(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日', register_date)
        if full:
            return '{}{:0>2}{:0>2}'.format(*full.groups())
        digits = ''.join(re.findall(r'\d+', register_date))
        if not digits:
            return None
        # More than four digits in another layout: keep the concatenated
        # digits, as before.
        if len(digits) > 4:
            return digits
        # '时' marks the "N hours ago" form; anything else is "N days ago".
        if '时' in register_date:
            delta = datetime.timedelta(hours=int(digits))
        else:
            delta = datetime.timedelta(days=int(digits))
        return (now - delta).strftime("%Y%m%d")

    def process_spider_output(self, response, result, spider):
        """Rewrite ``register_date`` on every item passing through.

        :param response: the response that produced *result* (unused)
        :param result: iterable of requests and items returned by the spider
        :param spider: the running spider (unused)
        :return: generator over the same objects, items having a normalised date
        """
        for item in result:
            # Only items are touched; requests pass through untouched.
            if isinstance(item, scrapy.Item):
                register_date = item.get('register_date')
                logger.info("获取注册日志格式为{}".format(register_date))
                date = self._normalize_register_date(register_date, datetime.datetime.now())
                if date is not None:
                    item['register_date'] = date
                else:
                    # Keep the raw value rather than aborting the whole
                    # output on an unexpected format.
                    logger.warning("Unrecognised register_date: %r", register_date)
            yield item


class SegmentfaultHttpProxyMiddleware(object):
    """Downloader middleware that assigns a randomly chosen proxy to each request.

    Only ``process_request`` is defined; Scrapy treats the missing hooks as
    leaving the passed objects unmodified.
    """

    def __init__(self):
        # Proxy URLs come from the PROXY_LIST project setting.
        self.proxy_list = settings['PROXY_LIST']

    def process_request(self, request, spider):
        """Pick one proxy at random, log it and store it in ``request.meta['proxy']``."""
        chosen = random.choice(self.proxy_list)
        message = '使用代理:{}'.format(chosen)
        logger.info(message)
        request.meta['proxy'] = chosen


class SegmentfaultUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent header on each request."""

    def __init__(self):
        # Candidate User-Agent strings come from the USER_AGENT_LIST setting.
        self.useragent_list = settings['USER_AGENT_LIST']

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pool entry."""
        request.headers['User-Agent'] = random.choice(self.useragent_list)



class SegmentfaultCookiesMiddleware(object):
    """Attach a random stored cookie set to each request and replace stale ones.

    Cookie sets are documents of the ``cookies`` MongoDB collection. When a
    response redirects to the login page, the cookies used for that request
    are deleted from the collection and the request is retried.
    """

    # Connection is opened once, when the class body is executed.
    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    # Cookie that is refreshed with the current Unix time on every use, so
    # its value never equals the one stored in MongoDB.
    _VISIT_TIME_KEY = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'

    def get_cookies(self):
        """Return a randomly chosen cookie set from the collection.

        :return: dict of cookie name to value, or an empty dict when the
            collection holds no cookies
        """
        stored = list(self.collection.find())
        if not stored:
            # random.choice() raises IndexError on an empty sequence.
            return {}
        cookies = random.choice(stored)
        # "_id" (MongoDB's document id) and "_gat" are not sent; either may
        # be absent, so a missing key must not raise.
        cookies.pop('_id', None)
        cookies.pop('_gat', None)
        # Fill the timestamp cookie with the current time.
        cookies[self._VISIT_TIME_KEY] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """Delete the stored document(s) matching the given, no longer valid cookies.

        :param cookies: cookie dict as previously returned by ``get_cookies``
        :return: None
        """
        # Match on every pair except the rewritten timestamp. The argument is
        # left untouched (it is the request's own cookie dict).
        query = {key: value for key, value in cookies.items()
                 if key != self._VISIT_TIME_KEY}
        if not query:
            # An empty filter would match - and delete - every document.
            return
        try:
            logger.info("删除cookies{}".format(cookies))
            self.collection.delete_many(query)
        except pymongo.errors.PyMongoError:
            # Best effort: a failed delete must not break the crawl.
            logger.exception("Failed to remove cookies: %s", cookies)

    def process_request(self, request, spider):
        """Give the request a random cookie set (left unchanged when none is stored).

        :param request: outgoing request
        :param spider: the running spider (unused)
        :return: None, so that processing of the request continues
        """
        cookies = self.get_cookies()
        if cookies:
            request.cookies = cookies

    def process_response(self, request, response, spider):
        """Retry requests that were redirected to the login page.

        A redirect to ``/user/login`` means the cookies are no longer valid:
        they are removed from the pool and the request is re-queued with a
        fresh set. Every other response is passed on unchanged.

        :param request: the request that produced *response*
        :param response: the downloaded response
        :param spider: the running spider (unused)
        :return: a new request to schedule, or *response*
        """
        if response.status in (301, 302):
            logger.info("Redirect response:{}".format(response))
            # A missing Location header is treated as "not a login redirect".
            redirect_url = response.headers.get('location') or b''
            if b'/user/login' in redirect_url:
                logger.info("Cookies失效")
                # Drop the stale cookies first so they cannot be picked again.
                self.remove_cookies(request.cookies)
                new_cookie = self.get_cookies()
                if not new_cookie:
                    logger.warning("Cookie pool is empty; cannot retry %s", request)
                    return response
                logger.info("获取新cookie:{}".format(new_cookie))
                # dont_filter: the URL has already been seen, so the duplicate
                # filter would otherwise discard the retry.
                return request.replace(cookies=new_cookie, dont_filter=True)
        return response

run.py

from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
    # Refresh the stored cookies first, then launch the spider through
    # Scrapy's command-line entry point.
    cookies = GetCookies()
    cookies.save()
    name = 'userinfo'
    argv = ['scrapy', 'crawl', name]
    cmdline.execute(argv)

到此这篇关于Scrapy项目实战之爬取某社区用户详情的文章就介绍到这了,更多相关Scrapy 爬取某社区用户内容请搜索三水点靠木以前的文章或继续浏览下面的相关文章,希望大家以后多多支持三水点靠木!

Python 相关文章推荐
理解python多线程(python多线程简明教程)
Jun 09 Python
Python实现截屏的函数
Jul 25 Python
python开发之thread线程基础实例入门
Nov 11 Python
Python中Class类用法实例分析
Nov 12 Python
使用Python的Scrapy框架十分钟爬取美女图
Dec 26 Python
python方向键控制上下左右代码
Jan 20 Python
python 字典中文key处理,读取,比较方法
Jul 06 Python
python字符串中匹配数字的正则表达式
Jul 03 Python
python3 实现的对象与json相互转换操作示例
Aug 17 Python
Django密码存储策略分析
Jan 09 Python
Python可以用来做什么
Nov 23 Python
python 如何获取页面所有a标签下href的值
May 06 Python
django跳转页面传参的实现
Sep 17 #Python
解决Ubuntu18中的pycharm不能调用tensorflow-gpu的问题
Sep 17 #Python
Django mysqlclient安装和使用详解
Sep 17 #Python
Pycharm2020最新激活码|永久激活(附最新激活码和插件的详细教程)
Sep 29 #Python
Django返回HTML文件的实现方法
Sep 17 #Python
Pycharm新手使用教程(图文详解)
Sep 17 #Python
Django修改app名称和数据表迁移方案实现
Sep 17 #Python
You might like
PHP 开源AJAX框架14种
2009/08/24 PHP
php实现猴子选大王问题算法实例
2015/04/20 PHP
PHP 二维数组和三维数组的过滤
2016/03/16 PHP
thinkPHP5分页功能实现方法分析
2017/10/25 PHP
PHP实现搜索时记住状态的方法示例
2018/05/11 PHP
Yii 框架控制器创建使用及控制器响应操作示例
2019/10/14 PHP
js实现ASP分页函数 HTML分页函数
2006/09/22 Javascript
jQuery使用手册之三 CSS操作
2007/03/24 Javascript
jQuery CSS()方法改变现有的CSS样式表
2014/09/09 Javascript
node.js中Socket.IO的进阶使用技巧
2014/11/04 Javascript
jQuery实现强制cookie过期方法汇总
2015/05/22 Javascript
jQuery采用连缀写法实现的折叠菜单效果
2015/09/18 Javascript
jQuery实现鼠标滑过链接控制图片的滑动展开与隐藏效果
2015/10/28 Javascript
nodejs获取微信小程序带参数二维码实现代码
2017/04/12 NodeJs
vue使用stompjs实现mqtt消息推送通知
2017/06/22 Javascript
官方推荐react-navigation的具体使用详解
2018/05/08 Javascript
微信小程序实现之手势锁功能实例代码
2018/07/19 Javascript
记录微信小程序 height: calc(xx - xx);无效问题
2019/12/30 Javascript
微信小程序自定义navigationBar顶部导航栏适配所有机型(附完整案例)
2020/04/26 Javascript
Vue生命周期activated之返回上一页不重新请求数据操作
2020/07/26 Javascript
基于ajax实现上传图片代码示例解析
2020/12/03 Javascript
[01:27:30]LGD vs Newbee 2019国际邀请赛小组赛 BO2 第二场 8.16
2019/08/19 DOTA
简单的抓取淘宝图片的Python爬虫
2014/12/25 Python
Python实现EXCEL表格的排序功能示例
2019/06/25 Python
用于ETL的Python数据转换工具详解
2020/07/21 Python
Python面向对象实现方法总结
2020/08/12 Python
css3通过scale()、rotate()实现放大、旋转
2020/03/19 HTML / CSS
英国轻奢珠宝品牌:Astley Clarke
2016/12/18 全球购物
美国女士时尚珠宝及配饰购物网站:Icing
2018/07/02 全球购物
在C++ 程序中调用被C 编译器编译后的函数,为什么要加extern "C"
2014/08/09 面试题
聚美优品的广告词
2014/03/14 职场文书
双方协议书
2014/04/22 职场文书
优秀班集体申报材料
2014/12/25 职场文书
初中军训感言
2015/08/01 职场文书
2016年优秀团支部事迹材料
2016/02/26 职场文书
详解CSS故障艺术
2021/05/25 HTML / CSS