Python Multithreaded and Async + Multiprocess Crawler Implementations


Posted in Python on February 17, 2016

Install Tornado
To save effort you could simply use the grequests library instead (a minimal grequests sketch follows the install command below). The code here uses Tornado's asynchronous HTTP client: adapting the example from the official documentation yields a simple asynchronous spider class. Refer to the latest docs to learn more.
pip install tornado
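
If you do take the grequests shortcut (pip install grequests), a minimal sketch might look like the following; the URLs are placeholders, and map() sends the whole batch concurrently on gevent:

import grequests

urls = ['http://example.com/page/%d' % i for i in range(1, 11)]
reqs = (grequests.get(u, timeout=5) for u in urls)
for resp in grequests.map(reqs, size=10):  # size caps the concurrency
  if resp is not None:  # failed requests come back as None
    print(resp.url, resp.status_code)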

Asynchronous crawler

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency=10, **kwargs):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def fetch(self, url, **kwargs):
    """Wraps AsyncHTTPClient.fetch; override to add headers, cookies, etc."""
    return httpclient.AsyncHTTPClient().fetch(url, **kwargs)

  def handle_html(self, url, html):
    """handle html page"""
    print(url)

  def handle_response(self, url, response):
    """Subclass and override this method to handle responses."""
    if response.code == 200:
      self.handle_html(url, response.body)

    elif response.code == 599:  # connection/timeout error; re-queue for retry
      self._fetching.remove(url)
      self._q.put(url)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield self.fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return(e)
    raise gen.Return(response)

  @gen.coroutine
  def _run(self):
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return

        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)

        response = yield self.get_page(current_url)
        self.handle_response(current_url, response)  # handle the response

        self._fetched.add(current_url)

        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())

      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())  # add first url

    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()

    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


class MySpider(AsySpider):

  def fetch(self, url, **kwargs):
    """Override the parent fetch to add cookies, headers, timeouts, etc."""
    cookies_str = "PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=%7B%22id%22%3A%2233153%22%2C%22pswd%22%3A%228835d2c1351d221b4ab016fbf9e8253f%22%2C%22_code%22%3A%22f779dcd011f4e2581c716d1e1b945861%22%7D; key=%E9%87%8D%E5%BA%86%E5%95%84%E6%9C%A8%E9%B8%9F%E7%BD%91%E7%BB%9C%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8; think_language=zh-cn; SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; CNZZDATA1254842228=1433864393-1442810831-%7C1444972138"  # cookie string copied from the browser
    headers = {
      'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
      'cookie': cookies_str
    }
    return super(MySpider, self).fetch(  # see the Tornado docs for the full parameter list
      url, headers=headers, request_timeout=1
    )

  def handle_html(self, url, html):
    print(url, html)


def main():
  urls = []
  for page in range(1, 100):
    urls.append('http://www.baidu.com?page=%s' % page)
  s = MySpider(urls)
  s.run()


if __name__ == '__main__':
  main()

To use it, subclass this class, feed in some URLs, and override handle_html (or handle_response) to process the pages you get back, as MySpider does above and as in the sketch below.
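
For instance, a minimal hypothetical subclass that scrapes outgoing links with a regex (the class name and pattern are illustrative only):

import re

class LinkSpider(AsySpider):
  def handle_html(self, url, html):
    # html is response.body, i.e. bytes; decode before matching
    for link in re.findall(r'href="(http[^"]+)"', html.decode('utf-8', 'ignore')):
      print(url, '->', link)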

Async + multiprocess crawler
To push things further, add a process pool on top using the multiprocessing module. The throughput really flies.

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def handle_page(self, url, html):
    filename = url.rsplit('/', 1)[1]
    with open(filename, 'wb') as f:  # the body is bytes, so write in binary mode
      f.write(html)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield httpclient.AsyncHTTPClient().fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return(b'')  # keep the return type consistent (bytes)
    raise gen.Return(response.body)

  @gen.coroutine
  def _run(self):

    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return

        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
        html = yield self.get_page(current_url)
        self._fetched.add(current_url)

        self.handle_page(current_url, html)

        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())

      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())

    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


def run_spider(beg, end):
  urls = []
  for page in range(beg, end):
    urls.append('http://127.0.0.1/%s.htm' % page)
  s = AsySpider(urls, 10)
  s.run()


def main():
  _st = time.time()
  p = Pool()
  all_num = 73000
  num = 4  # number of cpu cores
  per_num, left = divmod(all_num, num)
  s = range(0, all_num, per_num)
  res = []
  for i in range(len(s)-1):
    res.append((s[i], s[i+1]))
  res.append((s[len(s)-1], all_num))
  print(res)

  for i in res:
    p.apply_async(run_spider, args=(i[0], i[1],))
  p.close()
  p.join()

  print(time.time() - _st)


if __name__ == '__main__':
  main()
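
The manual range-splitting plus Pool bookkeeping in main() can also be expressed with the standard library's concurrent.futures (Python 3.2+); a sketch under the same assumptions (73000 pages, 4 workers, the run_spider above; main_alt is just an illustrative name):

from concurrent.futures import ProcessPoolExecutor

def main_alt():
  all_num, workers = 73000, 4
  bounds = list(range(0, all_num, all_num // workers)) + [all_num]
  with ProcessPoolExecutor(max_workers=workers) as pool:  # waits for all tasks on exit
    for beg, end in zip(bounds, bounds[1:]):
      pool.submit(run_spider, beg, end)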

Multithreaded crawler
A thread-pool implementation.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import queue  # named Queue on Python 2
import sys
import threading
import time

import requests

class Worker(threading.Thread):  # processes work requests
  def __init__(self, workQueue, resultQueue, **kwds):
    threading.Thread.__init__(self, **kwds)
    self.daemon = True  # don't block interpreter exit
    self.workQueue = workQueue
    self.resultQueue = resultQueue


  def run(self):
    while 1:
      try:
        func, args, kwds = self.workQueue.get(False)  # get a task
        res = func(*args, **kwds)
        self.resultQueue.put(res)  # store the result
      except queue.Empty:
        break

class WorkManager:  # creates and manages the thread pool
  def __init__(self, num_of_workers=10):
    self.workQueue = queue.Queue()  # task queue
    self.resultQueue = queue.Queue()  # result queue
    self.workers = []
    self._recruitThreads(num_of_workers)

  def _recruitThreads(self, num_of_workers):
    for i in range(num_of_workers):
      worker = Worker(self.workQueue, self.resultQueue)  # create a worker thread
      self.workers.append(worker)  # add it to the pool


  def start(self):
    for w in self.workers:
      w.start()

  def wait_for_complete(self):
    while len(self.workers):
      worker = self.workers.pop()  # take a thread out of the pool
      worker.join()
      if worker.is_alive() and not self.workQueue.empty():
        self.workers.append(worker)  # still working; put it back into the pool
    print('All jobs were complete.')


  def add_job(self, func, *args, **kwds):
    self.workQueue.put((func, args, kwds))  # enqueue a task

  def get_result(self, *args, **kwds):
    return self.resultQueue.get(*args, **kwds)


def download_file(url):
  # print('beg download', url)
  return requests.get(url).text  # return the body so it lands in resultQueue


def main():
  try:
    num_of_threads = int(sys.argv[1])
  except (IndexError, ValueError):
    num_of_threads = 10
  _st = time.time()
  wm = WorkManager(num_of_threads)
  print(num_of_threads)
  urls = ['http://www.baidu.com'] * 1000
  for i in urls:
    wm.add_job(download_file, i)
  wm.start()
  wm.wait_for_complete()
  print(time.time() - _st)

if __name__ == '__main__':
  main()
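
For comparison, the standard library's concurrent.futures.ThreadPoolExecutor (Python 3.2+) replaces most of this boilerplate; a minimal sketch reusing the download_file above (main_pool is just an illustrative name):

from concurrent.futures import ThreadPoolExecutor

def main_pool(num_of_threads=10):
  urls = ['http://www.baidu.com'] * 1000
  with ThreadPoolExecutor(max_workers=num_of_threads) as pool:
    results = list(pool.map(download_file, urls))  # blocks until all are done
  print('fetched %d pages' % len(results))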

Any of these three approaches is highly efficient, but running them flat out puts considerable pressure on the target site's servers, especially small ones; it's worth showing some restraint.
