Scrapy Project in Practice: Crawling User Details from a Community Site


Posted in Python on September 17, 2020

This article walks through a hands-on Scrapy project that crawls user details from a community site (SegmentFault). The full source is shared below for reference.
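
For orientation, the files below are assumed to sit in a standard Scrapy project layout roughly like the sketch here; the exact paths (including the chromedriver binary referenced in get_cookies.py) are guesses based on the imports and may differ from the author's setup:

segmentfault/                  # project root, contains scrapy.cfg
└── segmentfault/
    ├── get_cookies.py         # standalone cookie harvester (run before crawling)
    ├── run.py                 # entry script that refreshes cookies and starts the spider
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    ├── chromedriver           # driver binary used by get_cookies.py
    └── spiders/
        └── userinfo.py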

get_cookies.py

from selenium import webdriver
from pymongo import MongoClient
# from segmentfault import settings
import time
import settings

class GetCookies(object):
 def __init__(self):
  # Initialize components
  # Set up webdriver options
  self.opt = webdriver.ChromeOptions()
  # self.opt.add_argument("--headless")
  # Initialize the user list
  self.user_list = settings.USER_LIST
  # Initialize the MongoDB connection parameters
  self.client = MongoClient(settings.MONGO_URI)
  self.db = self.client[settings.MONGO_DB]
  self.collection = self.db["cookies"]

 def get_cookies(self,username,password):
  """

  :param username:
  :param password:
  :return: cookies
  """
  # 使用webdriver选项创建driver
  driver = webdriver.Chrome(executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver",options=self.opt)
  driver.get("https://segmentfault.com/user/login")
  driver.find_element_by_name("username").send_keys(username)
  driver.find_element_by_name("password").send_keys(password)
  driver.find_element_by_xpath("//button[@type='submit']").click()
  time.sleep(2)
  driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
  # After logging in, grab the page cookies
  cookies = driver.get_cookies()
  driver.quit()

  return cookies

 def format_cookies(self,cookies):
  """

  :param cookies:
  cookies as returned by driver.get_cookies() look like:
  [{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
  'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
  {'domain': '.segmentfault.com', 'expiry': 1581602940, 'httpOnly': False,
  'name': 'Hm_lvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/', 'secure': False,
  'value': '1550066940'},
  {'domain': '.segmentfault.com', 'httpOnly': False,
  'name': 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26',
  'path': '/', 'secure': False, 'value': '1550066940'},
  {'domain': '.segmentfault.com', 'expiry': 1550067000, 'httpOnly': False,
  'name': '_gat', 'path': '/', 'secure': False, 'value': '1'},
  {'domain': '.segmentfault.com', 'expiry': 1550153340, 'httpOnly': False,
  'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.783265084.1550066940'},
  {'domain': '.segmentfault.com', 'expiry': 1613138940, 'httpOnly': False, 'name': '_ga',
  'path': '/', 'secure': False, 'value': 'GA1.2.1119166665.1550066940'}]
  Only the name and value of each entry need to be extracted.

  :return:
  """
  c = dict()
  for item in cookies:
   c[item['name']] = item['value']

  return c

 def save(self):
  print("开始获取Cookies....")
  # 从用户列表中获取用户名与密码,分别登陆获取cookies
  for username,password in self.user_list:
   cookies = self.get_cookies(username,password)
   f_cookies = self.format_cookies(cookies)
   print("insert cookie:{}".format(f_cookies))
   # 将格式整理后的cookies插入MongoDB数据库
   self.collection.insert_one(f_cookies)

  # s = db[self.collection].find()
  # for i in s:
  #  print(i)


if __name__ == '__main__':

 cookies = GetCookies()
 for i in range(20):
  cookies.save()
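
As a quick sanity check (not part of the original project), the saved cookie documents can be inspected directly with pymongo; this sketch assumes MongoDB is running locally with the MONGO_URI and MONGO_DB values shown later in settings.py:

# hypothetical helper for inspecting the cookie pool
from pymongo import MongoClient

client = MongoClient('localhost:27017')          # MONGO_URI from settings.py
collection = client['segmentfault']['cookies']   # MONGO_DB and the collection used above
print("cookie documents:", collection.count_documents({}))
for doc in collection.find().limit(1):
    # each document stores name/value pairs such as PHPSESSID, plus MongoDB's own _id
    print(doc)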

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SegmentfaultItem(scrapy.Item):
 # define the fields for your item here like:
 # Personal profile
 # Name
 name = scrapy.Field()
 # Reputation
 rank = scrapy.Field()
 # School
 school = scrapy.Field()
 # Major
 majors = scrapy.Field()
 # Company
 company = scrapy.Field()
 # Job title
 job = scrapy.Field()
 # Blog
 blog = scrapy.Field()
 # Social activity data
 # Number of users followed
 following = scrapy.Field()
 # Number of followers
 fans = scrapy.Field()
 # Number of answers
 answers = scrapy.Field()
 # Number of questions
 questions = scrapy.Field()
 # Number of articles
 articles = scrapy.Field()
 # Number of lives (talks)
 lives = scrapy.Field()
 # Number of badges
 badges = scrapy.Field()
 # Skill attributes
 # Number of likes received
 like = scrapy.Field()
 # Skill tags
 skills = scrapy.Field()
 # Registration date
 register_date = scrapy.Field()
 # Q&A statistics
 # Highest score among the user's answers
 answers_top_score = scrapy.Field()
 # Title of the question with the top-voted answer
 answers_top_title = scrapy.Field()
 # Tags of the question with the top-voted answer
 answers_top_tags = scrapy.Field()
 # Body of the question with the top-voted answer
 answers_top_question = scrapy.Field()
 # Body of the top-voted answer itself
 answers_top_content = scrapy.Field()
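
For illustration only (this snippet is not part of the project), a SegmentfaultItem behaves like a dict restricted to the fields declared above, which is how the spider and pipeline below use it:

from segmentfault.items import SegmentfaultItem

item = SegmentfaultItem()
item['name'] = 'some_user'
item['fans'] = '42'
item['skills'] = ['python', 'scrapy']
# item['nickname'] = 'x' would raise KeyError because that field is not declared
print(dict(item))  # the pipeline stores exactly this dict in MongoDB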

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class SegmentfaultPipeline(object):
 # MongoDB collection name
 collection_name = 'userinfo'

 def __init__(self,mongo_uri,mongo_db):
  self.mongo_uri = mongo_uri
  self.mongo_db = mongo_db

 # Read the MongoDB connection info defined in settings.py via the crawler
 @classmethod
 def from_crawler(cls,crawler):
  return cls(
   mongo_uri = crawler.settings.get('MONGO_URI'),
   mongo_db = crawler.settings.get('MONGO_DB','segmentfault')
  )

 # Connect to MongoDB when the spider opens
 def open_spider(self,spider):
  self.client = pymongo.MongoClient(self.mongo_uri)
  self.db = self.client[self.mongo_db]

 # Close the MongoDB connection when the spider closes
 def close_spider(self,spider):
  self.client.close()

 # Insert the item into the database
 def process_item(self, item, spider):
  self.db[self.collection_name].insert_one(dict(item))
  return item
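
A minimal smoke test of the pipeline outside of a crawl might look like this; it bypasses from_crawler by passing the connection info directly and assumes a local MongoDB:

# hypothetical smoke test, not part of the project
from segmentfault.pipelines import SegmentfaultPipeline

pipeline = SegmentfaultPipeline(mongo_uri='localhost:27017', mongo_db='segmentfault')
pipeline.open_spider(spider=None)                           # connects to MongoDB
pipeline.process_item({'name': 'test_user'}, spider=None)   # a plain dict works because process_item casts to dict()
pipeline.close_spider(spider=None)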

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#  https://doc.scrapy.org/en/latest/topics/settings.html
#  https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#  https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'segmentfault'

SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

RETRY_ENABLED = False

REDIRECT_ENABLED = False

DOWNLOAD_TIMEOUT = 5

# HTTPALLOW

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}


# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
 'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
 # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
 'segmentfault.middlewares.SegmentfaultUserAgentMiddleware':643,
 'segmentfault.middlewares.SegmentfaultCookiesMiddleware':743,
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
 # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,

}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
 'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB configuration
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Account list (username, password) used for logging in and harvesting cookies
USER_LIST = [
 ("798549150@qq.com","guoqing1010"),
 ("learnscrapy@163.com","guoqing1010"),
]

# Proxy list
PROXY_LIST = [
 'http://115.182.212.169:8080',
 'http://121.61.25.149:9999',
 'http://180.118.247.189:9000',
 'http://115.151.3.12:9999',
 'http://183.154.213.160:9000',
 'http://113.128.9.106:9999',
 'http://124.42.68.152:90',
 'http://49.70.48.50:9999',
 'http://113.128.11.172:9999',
 'http://111.177.177.40:9999',
 'http://59.62.83.253:9999',
 'http://39.107.84.185:8123',
 'http://124.94.195.107:9999',
 'http://111.177.160.132:9999',
 'http://120.25.203.182:7777'
]

USER_AGENT_LIST = [
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
 'Opera/8.0 (Windows NT 5.1; U; en)',
 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]
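
Any of these values can also be overridden per run with Scrapy's -s command-line option, which is convenient while tuning concurrency and timeouts; for example:

scrapy crawl userinfo -s CONCURRENT_REQUESTS=16 -s DOWNLOAD_DELAY=1
scrapy crawl userinfo -s DOWNLOAD_TIMEOUT=10 -s LOG_LEVEL=INFO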

userinfo.py

# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem


class UserinfoSpider(CrawlSpider):
 name = 'userinfo'
 allowed_domains = ['segmentfault.com']
 start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

 rules = (
  # User profile pages: follow them and parse with parse_item
  Rule(LinkExtractor(allow=r'/u/\w+$'),callback='parse_item',follow=True),
  # Lists of users this user follows: follow the list pages to collect profile URLs
  # Rule(LinkExtractor(allow=r'/users/followed$'),follow=True),
  # Follower lists: follow the list pages to collect profile URLs
  Rule(LinkExtractor(allow=r'/users/following$'),follow=True),
  # Follow other paginated list pages
  # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'),follow=True),
 )

 def start_requests(self):
  # Fetch one cookie document from MongoDB and attach it to the initial request
  client = MongoClient(self.crawler.settings['MONGO_URI'])
  db = client[self.crawler.settings['MONGO_DB']]
  cookies_collection = db.cookies
  # Take a single cookie document
  cookies = cookies_collection.find_one()
  # The 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26' cookie is a 10-digit unix timestamp, so refresh it with the current time
  cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))

  return [Request("https://segmentfault.com",
      cookies=cookies,
      meta={'cookiejar':1},
      callback=self.after_login)]

 # After the login request completes, start crawling from start_urls
 def after_login(self,response):
  for url in self.start_urls:
   return self.make_requests_from_url(url)
 # def after_login(self,response):
 #  yield Request(self.start_urls[0],
 #     meta={'cookiejar':response.meta['cookiejar']},
 #     callback=self.parse_item)

 def parse_item(self, response):
  """
  :param response:
  :return:
  """
  item = SegmentfaultItem()
  # Personal profile section
  profile_head = response.css('.profile__heading')
  # Name
  item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
  # Reputation
  item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
  # School and major
  school_info = profile_head.css('.profile__school::text').extract()
  if school_info:
   # School
   item['school'] = school_info[0]
   # Major
   item['majors'] = school_info[1].strip()
  else:
   item['school'] = ''
   item['majors'] = ''
  # Company and job title
  company_info = profile_head.css('.profile__company::text').extract()
  if company_info:
   # Company
   item['company'] = company_info[0]
   # Job title
   item['job'] = company_info[1].strip()
  else:
   item['company'] = ''
   item['job'] = ''
  # Personal blog
  item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

  # Activity stats panel
  profile_active = response.xpath("//div[@class='col-md-2']")
  # Number of users followed
  item['following'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[0]
  # Number of followers
  item['fans'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[1]
  # Number of answers
  item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
  # Number of questions
  item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
  # Number of articles
  item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
  # Number of lives (talks)
  item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
  # Number of badges
  item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
  # URL of the badge detail page
  badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

  # Skills panel
  profile_skill = response.xpath("//div[@class='col-md-3']")
  # Skill tag list
  item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
  # Number of likes received
  item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
  # Registration date
  item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()
  # if register_time:
  #  item['register_date'] = ''.join(re.findall(r'\d+',register_time))
  # else:
  #  item['register_date'] = ''

  # Contribution data panel
  profile_work = response.xpath("//div[@class='col-md-7']")
  # Highest score among the user's answers
  item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
  # Title of the question with the top-voted answer
  item['answers_top_title'] = profile_work.css('#navAnswer div[class*=title-warp] > a::text').extract_first()
  # URL of the question with the top-voted answer
  answer_url = profile_work.css('#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

  # Hand the item plus the URLs that still need crawling to the follow-up callback
  request = scrapy.Request(
   # Question detail page URL
   url=response.urljoin(answer_url),
   meta={
   # The item is passed along in meta
   'item':item,
   # Badge page URL
   'badge_url':response.urljoin(badge_url)},
   # Continue processing in parse_answer
   callback=self.parse_answer)
  yield request

 def parse_answer(self,response):
  # Retrieve the item passed along in meta
  item = response.meta['item']
  # Retrieve the badge detail page URL passed along in meta
  badge_url = response.meta['badge_url']
  # Tag list of the question
  item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
  # First collect the list of strings that make up the question body
  question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
  # Join them and store in the item
  item['answers_top_question'] = ''.join(question_content)
  # First collect the list of strings that make up the answer body
  answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
  # Join them and store in the item
  item['answers_top_content'] = ''.join(answer_content)

  # After the question page, continue to the badge page, passing the updated item along
  request = scrapy.Request(url=badge_url,
         meta={'item':item},
         callback=self.parse_badge)
  yield request

 def parse_badge(self,response):
  item = response.meta['item']
  badge_name = response.css('span.badge span::text').extract()
  badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
  name_count = {}
  for i in range(len(badge_count)):
   name_count[badge_name[i]] = badge_count[i]
  item['badges'] = name_count
  yield item
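
The CSS and XPath selectors above are the pieces most likely to break if the site's markup changes; they can be tried out interactively with scrapy shell before running a full crawl (the profile URL below is just an example, and a logged-in cookie may be needed to see the full page):

scrapy shell "https://segmentfault.com/u/mybigbigcat"
>>> response.css('.profile__heading h2[class*=name]::text').re_first(r'\w+')
>>> response.xpath("//div[@class='col-md-2']").css('a[href*=answer] .count::text').re_first(r'\d+')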

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.utils.project import get_project_settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo

# scrapy.conf has been removed in recent Scrapy releases; load the project settings explicitly instead
settings = get_project_settings()
logger = logging.getLogger(__name__)


class SegmentfaultSpiderMiddleware(object):
 """
 处理Item中保存的三种类型注册日期数据:
 1. 注册于 2015年12月12日
 2. 注册于 3 天前
 3. 注册于 5 小时前
 """

 def process_spider_output(self,response,result,spider):

  """
  输出response时调用此方法处理item中register_date
  :param response:
  :param result: 包含item
  :param spider:
  :return:处理过注册日期的item
  """
  for item in result:
   # 判断获取的数据是否是scrapy.item类型
   if isinstance(item,scrapy.Item):
    # 获取当前时间
    now = datetime.datetime.now()
    register_date = item['register_date']
    logger.info("获取注册日志格式为{}".format(register_date))
    # 提取注册日期字符串,如'注册于2015年12月12日' => '20151212'
    day = ''.join(re.findall(r'\d+',register_date))
    # 如果提取数字字符串长度大于4位,则为'注册于2015年12月12日'形式
    if len(day) > 4:
     date = day
    # 如果‘时'在提取的字符串中,则为'注册于8小时前'形式
    elif '时' in register_date:
     d = now - datetime.timedelta(hours=int(day))
     date = d.strftime("%Y%m%d")
    # 最后一种情况就是'注册于3天前'形式
    else:
     d = now - datetime.timedelta(days=int(day))
     date = d.strftime("%Y%m%d")

    # 更新register_date值
    item['register_date'] = date
   yield item


class SegmentfaultHttpProxyMiddleware(object):
 # Not all methods need to be defined. If a method is not defined,
 # scrapy acts as if the downloader middleware does not modify the
 # passed objects.
 def __init__(self):
  self.proxy_list = settings['PROXY_LIST']

 def process_request(self, request, spider):
  proxy = random.choice(self.proxy_list)
  logger.info('Using proxy: {}'.format(proxy))
  request.meta['proxy'] = proxy


class SegmentfaultUserAgentMiddleware(object):
 def __init__(self):
  self.useragent_list = settings['USER_AGENT_LIST']

 def process_request(self,request,spider):
  user_agent = random.choice(self.useragent_list)

  # logger.info('Using User-Agent: {}'.format(user_agent))
  request.headers['User-Agent'] = user_agent



class SegmentfaultCookiesMiddleware(object):
 client = MongoClient(settings['MONGO_URI'])
 db = client[settings['MONGO_DB']]
 collection = db['cookies']

 def get_cookies(self):
  """
  随机获取cookies
  :return:
  """
  cookies = random.choice([cookie for cookie in self.collection.find()])
  # 将不需要的"_id"与"_gat"参数删除
  cookies.pop('_id')
  cookies.pop('_gat')
  # 将"Hm_lpvt_e23800c454aa573c0ccb16b52665ac26"填充当前时间
  cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
  return cookies

 def remove_cookies(self,cookies):
  """
  删除已失效的cookies
  :param cookies:
  :return:
  """
  # 随机获取cookies中的一对键值,返回结果是一个元祖
  i = cookies.popitem()
  # 删除cookies
  try:
   logger.info("删除cookies{}".format(cookies))
   self.collection.remove({i[0]:i[1]})
  except Exception as e:
   logger.info("No this cookies:{}".format(cookies))

 def process_request(self,request,spider):
  """
  为每一个request添加一个cookie
  :param request:
  :param spider:
  :return:
  """
  cookies = self.get_cookies()
  request.cookies = cookies

 def process_response(self,request,response,spider):
  """
  对于登录失效的情况,可能会重定向到登录页面,这时添加新的cookies继续,将请求放回调度器
  :param request:
  :param response:
  :param spider:
  :return:
  """
  if response.status in [301,302]:
   logger.info("Redirect response:{}".format(response))
   redirect_url = response.headers['location']
   if b'/user/login' in redirect_url:
    logger.info("Cookies失效")

    # 请求失败,重新获取一个cookie,添加到request,并停止后续中间件处理此request,将此request放入调度器
    new_cookie = self.get_cookies()
    logger.info("获取新cookie:{}".format(new_cookie))
    # 删除旧cookies
    self.remove_cookies(request.cookies)
    request.cookies = new_cookie
   return request
  #
  return response
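
To see the register_date normalization in isolation, the same branching logic can be exercised as a standalone function; this is a restatement of SegmentfaultSpiderMiddleware above for experimentation, not extra project code:

import re
import datetime

def normalize_register_date(register_date, now=None):
    # Handles the same three formats as SegmentfaultSpiderMiddleware
    now = now or datetime.datetime.now()
    day = ''.join(re.findall(r'\d+', register_date))
    if len(day) > 4:                 # '注册于2015年12月12日' -> '20151212'
        return day
    elif '时' in register_date:      # '注册于5小时前' -> now minus 5 hours
        return (now - datetime.timedelta(hours=int(day))).strftime("%Y%m%d")
    else:                            # '注册于3天前' -> now minus 3 days
        return (now - datetime.timedelta(days=int(day))).strftime("%Y%m%d")

for s in ['注册于 2015年12月12日', '注册于 3 天前', '注册于 5 小时前']:
    print(s, '->', normalize_register_date(s))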

run.py

from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
 cookies = GetCookies()
 cookies.save()
 name = 'userinfo'
 ""
 cmd = 'scrapy crawl {}'.format(name)
 cmdline.execute(cmd.split())
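
An alternative to shelling out through cmdline is Scrapy's CrawlerProcess API, which runs the spider in the same Python process; a minimal sketch, assuming it sits next to run.py so that both the get_cookies module and the project's scrapy.cfg (in a parent directory) can be found:

# sketch of an alternative entry point
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from get_cookies import GetCookies

if __name__ == '__main__':
    GetCookies().save()                       # refresh the cookie pool first, as run.py does
    process = CrawlerProcess(get_project_settings())
    process.crawl('userinfo')                 # spider name defined on UserinfoSpider
    process.start()                           # blocks until the crawl finishes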

That concludes this walkthrough of a Scrapy project for crawling community user details. Hopefully it serves as a useful reference for similar Scrapy crawling work.
