Scrapy项目实战之爬取某社区用户详情


Posted in Python on September 17, 2020

本文介绍了Scrapy项目实战之爬取某社区用户详情,分享给大家,具体如下:

get_cookies.py

from selenium import webdriver
from pymongo import MongoClient
from scrapy.crawler import overridden_settings
# from segmentfault import settings
import time
import settings

class GetCookies(object):
    """Log in to SegmentFault with Selenium and store each account's cookies.

    The account list and the MongoDB location come from the ``settings``
    module (``USER_LIST``, ``MONGO_URI``, ``MONGO_DB``). Every successful
    login is written as one document to the ``cookies`` collection.
    """

    # Location of the chromedriver binary handed to Selenium.
    CHROMEDRIVER_PATH = "/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver"

    def __init__(self):
        # Chrome options shared by every driver created in get_cookies().
        self.opt = webdriver.ChromeOptions()
        # self.opt.add_argument("--headless")
        # (username, password) pairs to log in with.
        self.user_list = settings.USER_LIST
        # MongoDB handles; cookies are stored in the "cookies" collection.
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """Log in with one account and return the browser's cookies.

        :param username: login name typed into the ``username`` field
        :param password: password typed into the ``password`` field
        :return: the list of cookie dicts reported by ``driver.get_cookies()``
        """
        driver = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH,
                                  options=self.opt)
        try:
            driver.get("https://segmentfault.com/user/login")
            driver.find_element_by_name("username").send_keys(username)
            driver.find_element_by_name("password").send_keys(password)
            driver.find_element_by_xpath("//button[@type='submit']").click()
            # Give the login request time to complete before navigating on.
            time.sleep(2)
            driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
            # Read the cookies once the logged-in page has been opened.
            return driver.get_cookies()
        finally:
            # Always shut the browser down, even when a step above raised,
            # so failed logins do not leave Chrome processes behind.
            driver.quit()

    def format_cookies(self, cookies):
        """Reduce Selenium cookie dicts to a flat ``{name: value}`` mapping.

        ``driver.get_cookies()`` returns a list such as::

            [{'domain': 'segmentfault.com', 'httpOnly': False,
              'name': 'PHPSESSID', 'path': '/', 'secure': False,
              'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
             {'domain': '.segmentfault.com', 'expiry': 1550067000,
              'httpOnly': False, 'name': '_gat', 'path': '/',
              'secure': False, 'value': '1'},
             ...]

        Only the ``name`` and ``value`` of every entry are kept.

        :param cookies: iterable of cookie dicts with 'name' and 'value' keys
        :return: dict mapping cookie name to cookie value
        """
        return {entry['name']: entry['value'] for entry in cookies}

    def save(self):
        """Log in with every configured account and insert its cookies into MongoDB."""
        print("开始获取Cookies....")
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # Store the flattened cookies as one document.
            self.collection.insert_one(f_cookies)

  # s = db[self.collection].find()
  # for i in s:
  #  print(i)


if __name__ == '__main__':
    # Repeat the full login-and-store pass 20 times; each pass inserts one
    # cookie document per configured account.
    collector = GetCookies()
    rounds_left = 20
    while rounds_left > 0:
        collector.save()
        rounds_left -= 1

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SegmentfaultItem(scrapy.Item):
 """One SegmentFault user profile as collected by the ``userinfo`` spider."""
 # --- Personal attributes ---
 # Display name
 name = scrapy.Field()
 # Reputation score
 rank = scrapy.Field()
 # School
 school = scrapy.Field()
 # Major
 majors = scrapy.Field()
 # Company
 company = scrapy.Field()
 # Job title
 job = scrapy.Field()
 # Personal blog URL
 blog = scrapy.Field()
 # --- Social activity ---
 # Number of users this user follows
 following = scrapy.Field()
 # Number of followers
 fans = scrapy.Field()
 # Number of answers
 answers = scrapy.Field()
 # Number of questions asked
 questions = scrapy.Field()
 # Number of articles
 articles = scrapy.Field()
 # Number of live talks
 lives = scrapy.Field()
 # Badges: first the badge count from the profile page; the spider's
 # parse_badge later replaces it with a {badge name: count} dict
 badges = scrapy.Field()
 # --- Skills ---
 # Number of likes received
 like = scrapy.Field()
 # Skill tags
 skills = scrapy.Field()
 # Registration date
 register_date = scrapy.Field()
 # --- Q&A statistics ---
 # Highest vote count among the user's answers
 answers_top_score = scrapy.Field()
 # Title of the question that the top-voted answer belongs to
 answers_top_title = scrapy.Field()
 # Tags of the question that the top-voted answer belongs to
 answers_top_tags = scrapy.Field()
 # Body text of the question that the top-voted answer belongs to
 answers_top_question = scrapy.Field()
 # Body text of the top-voted answer itself
 answers_top_content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class SegmentfaultPipeline(object):
    """Item pipeline that writes every scraped item into a MongoDB collection."""

    # Name of the MongoDB collection that receives the items.
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        """Remember where to connect; the connection itself is opened in open_spider."""
        self.mongo_uri, self.mongo_db = mongo_uri, mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``MONGO_URI`` / ``MONGO_DB`` settings."""
        conf = crawler.settings
        uri = conf.get('MONGO_URI')
        database = conf.get('MONGO_DB', 'segmentfault')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        """Connect to MongoDB when the spider starts."""
        connection = pymongo.MongoClient(self.mongo_uri)
        self.client = connection
        self.db = connection[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a plain-dict copy of *item* and hand the item on unchanged."""
        target = self.db[self.collection_name]
        target.insert_one(dict(item))
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#  https://doc.scrapy.org/en/latest/topics/settings.html
#  https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#  https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Project (bot) name.
BOT_NAME = 'segmentfault'

# Package that holds the spiders, and the package new spiders are generated into.
SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'


# Default User-Agent: a desktop Chrome 71 browser string rather than a bot identifier.
# NOTE(review): DOWNLOADER_MIDDLEWARES enables SegmentfaultUserAgentMiddleware, which
# sets a random USER_AGENT_LIST entry on each request, so this value looks like a fallback only.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (default: 16); raised to 100 here.
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Failed requests are not retried.
RETRY_ENABLED = False

# Redirects are not followed automatically.
# NOTE(review): presumably so SegmentfaultCookiesMiddleware.process_response can see
# 301/302 responses that point at the login page - confirm.
REDIRECT_ENABLED = False

# Download timeout, in seconds.
DOWNLOAD_TIMEOUT = 5

# HTTPALLOW

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}


# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SegmentfaultSpiderMiddleware rewrites each item's 'register_date' field
# after the spider callbacks have produced it.
SPIDER_MIDDLEWARES = {
 'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
    'segmentfault.middlewares.SegmentfaultUserAgentMiddleware': 643,
    # process_request() hooks run in increasing priority order and Scrapy's
    # built-in CookiesMiddleware sits at 700. This middleware assigns
    # request.cookies, so it must run BEFORE 700; otherwise the Cookie header
    # has already been built and the assigned cookies are never sent.
    'segmentfault.middlewares.SegmentfaultCookiesMiddleware': 650,
    # The built-in proxy and user-agent middlewares are switched off.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# SegmentfaultPipeline is the only enabled pipeline.
ITEM_PIPELINES = {
 'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection: server address and database name.
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Accounts used to obtain login cookies, as (username, password) pairs.
# NOTE(review): these credentials are stored in plain text in the source file;
# consider loading them from the environment or an untracked file instead.
USER_LIST = [
 ("798549150@qq.com","guoqing1010"),
 ("learnscrapy@163.com","guoqing1010"),
]

# HTTP proxy pool. Read by SegmentfaultHttpProxyMiddleware, which picks one
# entry at random per request; that middleware is commented out in
# DOWNLOADER_MIDDLEWARES, so the list is unused unless it is re-enabled.
PROXY_LIST = [
 'http://115.182.212.169:8080',
 'http://121.61.25.149:9999',
 'http://180.118.247.189:9000',
 'http://115.151.3.12:9999',
 'http://183.154.213.160:9000',
 'http://113.128.9.106:9999',
 'http://124.42.68.152:90',
 'http://49.70.48.50:9999',
 'http://113.128.11.172:9999',
 'http://111.177.177.40:9999',
 'http://59.62.83.253:9999',
 'http://39.107.84.185:8123',
 'http://124.94.195.107:9999',
 'http://111.177.160.132:9999',
 'http://120.25.203.182:7777'
]

# User-Agent pool. SegmentfaultUserAgentMiddleware sets a randomly chosen
# entry as the 'User-Agent' header of every request.
USER_AGENT_LIST = [
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
 'Opera/8.0 (Windows NT 5.1; U; en)',
 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]

userinfo.py

# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem


class UserinfoSpider(CrawlSpider):
    """Crawl SegmentFault user profiles reachable through "following" lists.

    Flow: ``start_requests`` opens the home page with a cookie document read
    from MongoDB, ``after_login`` then requests ``start_urls``, and ``rules``
    send every ``/u/<name>`` page to ``parse_item``. A profile item is
    completed by up to two follow-up requests (``parse_answer`` for the
    top-voted answer, ``parse_badge`` for the badge list) before it is yielded.
    """

    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

    # Cookie that the site expects to hold the current Unix time in seconds.
    _VISIT_TIME_COOKIE = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'

    rules = (
        # User home pages: parse them and keep following their links.
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # "followed" list pages (disabled).
        # Rule(LinkExtractor(allow=r'/users/followed$'),follow=True),
        # "following" list pages: follow them to discover more user home pages.
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
        # Paginated list pages (disabled).
        # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'),follow=True),
    )

    def start_requests(self):
        """Open the home page with one stored cookie set from MongoDB.

        :return: a single-element list with the initial request
        """
        client = MongoClient(self.crawler.settings['MONGO_URI'])
        try:
            db = client[self.crawler.settings['MONGO_DB']]
            cookies = db.cookies.find_one()
        finally:
            # The connection is only needed for this one lookup.
            client.close()

        if cookies is None:
            # No stored login yet: continue anonymously instead of crashing
            # on item assignment to None.
            self.logger.warning("No cookies found in MongoDB; starting without login cookies")
            cookies = {}
        else:
            # '_id' is MongoDB bookkeeping, not a browser cookie.
            cookies.pop('_id', None)
            # This cookie carries the current time as a 10-digit timestamp,
            # so refresh it before use.
            cookies[self._VISIT_TIME_COOKIE] = str(int(time.time()))

        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    def after_login(self, response):
        """Start the real crawl from every URL in ``start_urls``.

        Requests are created without a callback, so CrawlSpider's default
        parsing (and therefore ``rules``) handles the responses.
        ``dont_filter=True`` matches what the deprecated
        ``make_requests_from_url`` helper produced.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True)
    # def after_login(self,response):
    #  yield Request(self.start_urls[0],
    #     meta={'cookiejar':response.meta['cookiejar']},
    #     callback=self.parse_item)

    @staticmethod
    def _split_pair(texts):
        """Return ``(first, stripped second)`` of *texts*, with '' for missing parts."""
        first = texts[0] if texts else ''
        second = texts[1].strip() if len(texts) > 1 else ''
        return first, second

    def parse_item(self, response):
        """Extract the profile page fields and schedule the follow-up pages.

        :param response: response of a ``/u/<name>`` profile page
        :return: generator yielding either a follow-up request that carries
                 the item in ``meta`` or, when there is nothing left to
                 fetch, the item itself
        """
        item = SegmentfaultItem()

        # --- Profile header ---
        profile_head = response.css('.profile__heading')
        item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School / major and company / job each come as up to two text nodes;
        # a missing second node no longer raises IndexError.
        item['school'], item['majors'] = self._split_pair(
            profile_head.css('.profile__school::text').extract())
        item['company'], item['job'] = self._split_pair(
            profile_head.css('.profile__company::text').extract())
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # --- Statistics panel ---
        profile_active = response.xpath("//div[@class='col-md-2']")
        # First number is the following count, second the fan count.
        counts = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')
        item['following'] = counts[0] if len(counts) > 0 else None
        item['fans'] = counts[1] if len(counts) > 1 else None
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        # Link to the badge detail page (may be absent).
        badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # --- Skill panel ---
        profile_skill = response.xpath("//div[@class='col-md-3']")
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
        # Raw text; normalised later by the spider middleware.
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()

        # --- Output panel ---
        profile_work = response.xpath("//div[@class='col-md-7']")
        item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
        item['answers_top_title'] = profile_work.css(
            '#navAnswer div[class*=title-warp] > a::text').extract_first()
        # Link to the question of the top-voted answer (may be absent).
        answer_url = profile_work.css(
            '#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

        absolute_badge_url = response.urljoin(badge_url) if badge_url else None

        if answer_url:
            # Continue on the question page, carrying the item and badge URL.
            yield scrapy.Request(
                url=response.urljoin(answer_url),
                meta={'item': item, 'badge_url': absolute_badge_url},
                callback=self.parse_answer)
            return

        # No answers: joining a missing URL would just re-request this page
        # (dropped by the dupefilter, losing the item). Fill the answer
        # fields with empty values and move on.
        item['answers_top_tags'] = []
        item['answers_top_question'] = ''
        item['answers_top_content'] = ''
        if absolute_badge_url:
            yield scrapy.Request(url=absolute_badge_url,
                                 meta={'item': item},
                                 callback=self.parse_badge)
        else:
            yield item

    def parse_answer(self, response):
        """Add the top-voted answer's question tags, question text and answer text.

        :param response: question page; ``meta`` carries 'item' and 'badge_url'
        :return: generator yielding the badge-page request, or the item when
                 no badge URL is known
        """
        item = response.meta['item']
        badge_url = response.meta['badge_url']
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Collect the text fragments between tags and join them.
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        item['answers_top_question'] = ''.join(question_content)
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        item['answers_top_content'] = ''.join(answer_content)

        if badge_url:
            # Finish with the badge page, passing the updated item along.
            yield scrapy.Request(url=badge_url,
                                 meta={'item': item},
                                 callback=self.parse_badge)
        else:
            yield item

    def parse_badge(self, response):
        """Replace ``item['badges']`` with a ``{badge name: count}`` dict and emit the item.

        :param response: badge detail page; ``meta`` carries 'item'
        """
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        # zip() pairs names with counts and stops at the shorter list, so a
        # mismatch between the two selectors cannot raise IndexError.
        item['badges'] = dict(zip(badge_name, badge_count))
        yield item

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo
logger = logging.getLogger(__name__)


class SegmentfaultSpiderMiddleware(object):
    """Normalise the ``register_date`` field of scraped items to ``YYYYMMDD``.

    The site shows the registration date in several forms:

    1. 注册于 2015年12月12日   (absolute date)
    2. 注册于 3 天前           (N days ago)
    3. 注册于 5 小时前         (N hours ago)
    4. 注册于 30 分钟前        (N minutes ago)
    """

    # Absolute date such as "2015年1月2日"; groups are year, month, day.
    _FULL_DATE = re.compile(r'(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日')

    @classmethod
    def _normalize_register_date(cls, register_date, now):
        """Convert the raw registration text to a ``YYYYMMDD`` string.

        :param register_date: raw text from the profile page (may be None)
        :param now: ``datetime.datetime`` that relative forms are measured from
        :return: the ``YYYYMMDD`` string, or None when the text is missing
                 or contains no digits
        """
        if not isinstance(register_date, str):
            return None

        match = cls._FULL_DATE.search(register_date)
        if match:
            year, month, day = (int(part) for part in match.groups())
            # Zero-pad so "2015年1月2日" becomes 20150102, not 201512.
            return '{:04d}{:02d}{:02d}'.format(year, month, day)

        digits = ''.join(re.findall(r'\d+', register_date))
        if not digits:
            return None
        # More than four digits in another layout: keep the digit string.
        if len(digits) > 4:
            return digits

        amount = int(digits)
        if '时' in register_date:
            delta = datetime.timedelta(hours=amount)
        elif '分钟' in register_date:
            delta = datetime.timedelta(minutes=amount)
        else:
            # Remaining form is "N 天前" (N days ago).
            delta = datetime.timedelta(days=amount)
        return (now - delta).strftime("%Y%m%d")

    def process_spider_output(self, response, result, spider):
        """Rewrite ``register_date`` on every item produced by the spider.

        :param response: response the spider callback processed
        :param result: iterable of requests and items returned by the callback
        :param spider: the running spider
        :return: generator yielding the same objects; items get a normalised
                 ``register_date`` when the raw value could be parsed
        """
        for item in result:
            # Requests and other non-Item results pass through untouched.
            if isinstance(item, scrapy.Item):
                register_date = item.get('register_date')
                logger.info("获取注册日志格式为{}".format(register_date))
                date = self._normalize_register_date(register_date,
                                                     datetime.datetime.now())
                # Leave the original value in place when it cannot be parsed.
                if date is not None:
                    item['register_date'] = date
            yield item


class SegmentfaultHttpProxyMiddleware(object):
    """Downloader middleware that routes each request through a random proxy."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # Proxy pool taken from the PROXY_LIST setting.
        self.proxy_list = settings['PROXY_LIST']

    def process_request(self, request, spider):
        """Pick one proxy from the pool and attach it to the request's meta."""
        chosen = random.choice(self.proxy_list)
        logger.info('使用代理:{}'.format(chosen))
        request.meta['proxy'] = chosen


class SegmentfaultUserAgentMiddleware(object):
    """Downloader middleware that sends a random User-Agent with every request."""

    def __init__(self):
        # User-Agent pool taken from the USER_AGENT_LIST setting.
        self.useragent_list = settings['USER_AGENT_LIST']

    def process_request(self, request, spider):
        """Overwrite the request's 'User-Agent' header with a random pool entry."""
        # logger.info('使用的USE USER-AGENT:{}'.format(user_agent))
        request.headers['User-Agent'] = random.choice(self.useragent_list)



class SegmentfaultCookiesMiddleware(object):
    """Attach a stored login cookie set to every request and rotate expired ones.

    Cookie documents are read from the ``cookies`` collection of the MongoDB
    database named by the ``MONGO_URI`` / ``MONGO_DB`` settings.
    """

    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    # Cookie that is refreshed with the current Unix time on every use.
    _VISIT_TIME_KEY = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'
    # Session cookie; used to identify a stored document for removal.
    _SESSION_KEY = 'PHPSESSID'

    def get_cookies(self):
        """Return a randomly chosen stored cookie set, ready to send.

        :return: dict of cookie name to value; an empty dict when the
                 collection holds no cookies
        """
        stored = list(self.collection.find())
        if not stored:
            logger.warning("No cookies available in MongoDB")
            return {}
        cookies = random.choice(stored)
        # Drop MongoDB's '_id' and the short-lived '_gat' cookie; either may
        # be absent, so do not fail when it is.
        cookies.pop('_id', None)
        cookies.pop('_gat', None)
        # Refresh the visit-time cookie with the current timestamp.
        cookies[self._VISIT_TIME_KEY] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """Delete the stored document that matches an expired cookie set.

        The argument is not modified. Matching uses the session cookie when
        present; otherwise every cookie except the visit-time one, whose
        value was overwritten at send time and no longer equals the stored one.

        :param cookies: the cookie dict that was sent with the failed request
        """
        if not isinstance(cookies, dict) or not cookies:
            return
        if self._SESSION_KEY in cookies:
            query = {self._SESSION_KEY: cookies[self._SESSION_KEY]}
        else:
            query = {key: value for key, value in cookies.items()
                     if key != self._VISIT_TIME_KEY}
        # An empty filter would match (and delete) every document.
        if not query:
            return
        try:
            logger.info("删除cookies{}".format(cookies))
            self.collection.delete_many(query)
        except pymongo.errors.PyMongoError:
            # Best effort: a failed delete must not stop the crawl.
            logger.exception("No this cookies:{}".format(cookies))

    def process_request(self, request, spider):
        """Give every outgoing request a cookie set from the pool.

        :param request: the request about to be downloaded
        :param spider: the running spider
        :return: None, so processing of the request continues
        """
        request.cookies = self.get_cookies()

    def process_response(self, request, response, spider):
        """Retry with fresh cookies when the site redirects to the login page.

        :param request: the request that produced *response*
        :param response: the downloaded response
        :param spider: the running spider
        :return: a copy of the request with new cookies when the login
                 expired and another cookie set is available; otherwise the
                 response unchanged
        """
        if response.status in (301, 302):
            logger.info("Redirect response:{}".format(response))
            redirect_url = response.headers.get('location', b'')
            if b'/user/login' in redirect_url:
                logger.info("Cookies失效")
                # Forget the expired cookies first so they cannot be re-picked.
                self.remove_cookies(request.cookies)
                new_cookie = self.get_cookies()
                logger.info("获取新cookie:{}".format(new_cookie))
                if new_cookie:
                    # dont_filter=True is required: this URL has already been
                    # seen, so the scheduler's dupefilter would otherwise
                    # drop the retry.
                    return request.replace(cookies=new_cookie, dont_filter=True)
        # Other redirects and all non-redirect responses pass through.
        return response

run.py

from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
    # Refresh the stored login cookies, then start the 'userinfo' spider
    # through Scrapy's command-line entry point.
    GetCookies().save()
    spider_name = 'userinfo'
    cmdline.execute(['scrapy', 'crawl', spider_name])

到此这篇关于Scrapy项目实战之爬取某社区用户详情的文章就介绍到这了。更多相关Scrapy爬取某社区用户的内容,请搜索三水点靠木以前的文章或继续浏览下面的相关文章,希望大家以后多多支持三水点靠木!

Python 相关文章推荐
Python设计模式编程中解释器模式的简单程序示例分享
Mar 02 Python
Python实现二维数组输出为图片
Apr 03 Python
深入浅析Python中list的复制及深拷贝与浅拷贝
Sep 03 Python
利用arcgis的python读取要素的X,Y方法
Dec 22 Python
Python3 pip3 list 出现 DEPRECATION 警告的解决方法
Feb 16 Python
Python中字符串String的基本内置函数与过滤字符模块函数的基本用法
May 27 Python
python多线程共享变量的使用和效率方法
Jul 16 Python
Python 使用type来定义类的实现
Nov 19 Python
Python全面分析系统的时域特性和频率域特性
Feb 26 Python
python中rc1什么意思
Jun 19 Python
python subprocess pipe 实时输出日志的操作
Dec 05 Python
python保存图片的四个常用方法
Feb 28 Python
django跳转页面传参的实现
Sep 17 #Python
解决Ubuntu18中的pycharm不能调用tensorflow-gpu的问题
Sep 17 #Python
Django mysqlclient安装和使用详解
Sep 17 #Python
Pycharm2020最新激活码|永久激活(附最新激活码和插件的详细教程)
Sep 29 #Python
Django返回HTML文件的实现方法
Sep 17 #Python
Pycharm新手使用教程(图文详解)
Sep 17 #Python
Django修改app名称和数据表迁移方案实现
Sep 17 #Python
You might like
php 图片上传类代码
2009/07/17 PHP
PHP Stream_*系列函数
2010/08/01 PHP
PHP实现对文本数据库的常用操作方法实例演示
2014/07/04 PHP
ThinkPHP中Session用法详解
2014/11/29 PHP
php实现复制移动文件的方法
2015/07/29 PHP
PHP模拟http请求的方法详解
2016/11/09 PHP
php使用Jpgraph创建柱状图展示年度收支表效果示例
2017/02/15 PHP
thinkphp3.2框架中where条件查询用法总结
2019/08/13 PHP
window.location和document.location的区别分析
2008/12/23 Javascript
锋利的jQuery 要点归纳(二) jQuery中的DOM操作(下)
2010/03/23 Javascript
jqPlot 图表中文API使用文档及源码和在线示例
2012/02/07 Javascript
JavaScript 高级篇之DOM文档,简单封装及调用、动态添加、删除样式(六)
2012/04/07 Javascript
JavaScript获取页面中第一个锚定文本的方法
2015/04/03 Javascript
jQuery实现平滑滚动页面到指定锚点链接的方法
2015/07/15 Javascript
JavaScript字符集编码与解码详谈
2017/02/02 Javascript
Bootstrap警告(Alerts)的实现方法
2017/03/22 Javascript
Vue.js 单页面多路由区域操作的实例详解
2017/07/17 Javascript
nuxt+axios解决前后端分离SSR的示例代码
2017/10/24 Javascript
使用 Javascript 实现浏览器推送提醒功能的示例
2017/11/03 Javascript
vue axios登录请求拦截器
2018/04/02 Javascript
详解js删除数组中的指定元素
2018/10/31 Javascript
JavaScript学习笔记之数组基本操作示例
2019/01/09 Javascript
JavaScript正则表达式验证登录实例
2020/03/18 Javascript
python实现红包裂变算法
2016/02/16 Python
python unittest实现api自动化测试
2018/04/04 Python
在matplotlib中改变figure的布局和大小实例
2020/04/23 Python
岗位职责的定义
2013/11/10 职场文书
护士自荐信范文
2013/12/15 职场文书
民族团结先进个人材料
2014/02/05 职场文书
大学信息公开实施方案
2014/03/09 职场文书
师德师风自我评价范文
2014/09/11 职场文书
2014年维修工作总结
2014/11/22 职场文书
公司保洁员岗位职责
2015/02/13 职场文书
人力资源部工作计划
2019/05/14 职场文书
python基础之文件操作
2021/10/24 Python
win10电脑双屏显示一个黑屏怎么办?win10电脑双屏显示一个黑屏解决方法
2022/07/15 数码科技