Python多线程、异步+多进程爬虫实现代码


Posted in Python onFebruary 17, 2016

安装Tornado
省事点可以直接用grequests库,下面用的是tornado的异步client。 异步用到了tornado,根据官方文档的例子修改得到一个简单的异步爬虫类。可以参考下最新的文档学习下。
pip install tornado

异步爬虫

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency=10, **kwargs):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def fetch(self, url, **kwargs):
    fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
    return fetch(url, **kwargs)

  def handle_html(self, url, html):
    """handle html page"""
    print(url)

  def handle_response(self, url, response):
    """inherit and rewrite this method"""
    if response.code == 200:
      self.handle_html(url, response.body)

    elif response.code == 599:  # retry
      self._fetching.remove(url)
      self._q.put(url)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield self.fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return(e)
    raise gen.Return(response)

  @gen.coroutine
  def _run(self):
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return

        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)

        response = yield self.get_page(current_url)
        self.handle_response(current_url, response)  # handle reponse

        self._fetched.add(current_url)

        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())

      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())  # add first url

    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()

    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


class MySpider(AsySpider):

  def fetch(self, url, **kwargs):
    """重写父类fetch方法可以添加cookies,headers,timeout等信息"""
    cookies_str = "PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=%7B%22id%22%3A%2233153%22%2C%22pswd%22%3A%228835d2c1351d221b4ab016fbf9e8253f%22%2C%22_code%22%3A%22f779dcd011f4e2581c716d1e1b945861%22%7D; key=%E9%87%8D%E5%BA%86%E5%95%84%E6%9C%A8%E9%B8%9F%E7%BD%91%E7%BB%9C%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8; think_language=zh-cn; SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; CNZZDATA1254842228=1433864393-1442810831-%7C1444972138"  # 从浏览器拷贝cookie字符串
    headers = {
      'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
      'cookie': cookies_str
    }
    return super(MySpider, self).fetch(  # 参数参考tornado文档
      url, headers=headers, request_timeout=1
    )

  def handle_html(self, url, html):
    print(url, html)


def main():
  urls = []
  for page in range(1, 100):
    urls.append('http://www.baidu.com?page=%s' % page)
  s = MySpider(urls)
  s.run()


if __name__ == '__main__':
  main()

可以继承这个类,塞一些url进去,然后重写handle_page处理得到的页面。

异步+多进程爬虫
还可以再变态点,加个进程池,使用了multiprocessing模块。效率飕飕的,

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def handle_page(self, url, html):
    filename = url.rsplit('/', 1)[1]
    with open(filename, 'w+') as f:
      f.write(html)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield httpclient.AsyncHTTPClient().fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return('')
    raise gen.Return(response.body)

  @gen.coroutine
  def _run(self):

    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return

        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
        html = yield self.get_page(current_url)
        self._fetched.add(current_url)

        self.handle_page(current_url, html)

        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())

      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())

    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


def run_spider(beg, end):
  urls = []
  for page in range(beg, end):
    urls.append('http://127.0.0.1/%s.htm' % page)
  s = AsySpider(urls, 10)
  s.run()


def main():
  _st = time.time()
  p = Pool()
  all_num = 73000
  num = 4  # number of cpu cores
  per_num, left = divmod(all_num, num)
  s = range(0, all_num, per_num)
  res = []
  for i in range(len(s)-1):
    res.append((s[i], s[i+1]))
  res.append((s[len(s)-1], all_num))
  print res

  for i in res:
    p.apply_async(run_spider, args=(i[0], i[1],))
  p.close()
  p.join()

  print time.time()-_st


if __name__ == '__main__':
  main()

多线程爬虫
线程池实现.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import Queue
import sys
import requests
import os
import threading
import time

class Worker(threading.Thread):  # 处理工作请求
  def __init__(self, workQueue, resultQueue, **kwds):
    threading.Thread.__init__(self, **kwds)
    self.setDaemon(True)
    self.workQueue = workQueue
    self.resultQueue = resultQueue


  def run(self):
    while 1:
      try:
        callable, args, kwds = self.workQueue.get(False)  # get task
        res = callable(*args, **kwds)
        self.resultQueue.put(res)  # put result
      except Queue.Empty:
        break

class WorkManager:  # 线程池管理,创建
  def __init__(self, num_of_workers=10):
    self.workQueue = Queue.Queue()  # 请求队列
    self.resultQueue = Queue.Queue()  # 输出结果的队列
    self.workers = []
    self._recruitThreads(num_of_workers)

  def _recruitThreads(self, num_of_workers):
    for i in range(num_of_workers):
      worker = Worker(self.workQueue, self.resultQueue)  # 创建工作线程
      self.workers.append(worker)  # 加入到线程队列


  def start(self):
    for w in self.workers:
      w.start()

  def wait_for_complete(self):
    while len(self.workers):
      worker = self.workers.pop()  # 从池中取出一个线程处理请求
      worker.join()
      if worker.isAlive() and not self.workQueue.empty():
        self.workers.append(worker)  # 重新加入线程池中
    print 'All jobs were complete.'


  def add_job(self, callable, *args, **kwds):
    self.workQueue.put((callable, args, kwds))  # 向工作队列中加入请求

  def get_result(self, *args, **kwds):
    return self.resultQueue.get(*args, **kwds)


def download_file(url):
  #print 'beg download', url
  requests.get(url).text


def main():
  try:
    num_of_threads = int(sys.argv[1])
  except:
    num_of_threads = 10
  _st = time.time()
  wm = WorkManager(num_of_threads)
  print num_of_threads
  urls = ['http://www.baidu.com'] * 1000
  for i in urls:
    wm.add_job(download_file, i)
  wm.start()
  wm.wait_for_complete()
  print time.time() - _st

if __name__ == '__main__':
  main()

这三种随便一种都有很高的效率,但是这么跑会给网站服务器不小的压力,尤其是小站点,还是有点节操为好。

Python 相关文章推荐
python利用matplotlib库绘制饼图的方法示例
Dec 18 Python
更改Ubuntu默认python版本的两种方法python-> Anaconda
Dec 18 Python
Python学习pygal绘制线图代码分享
Dec 09 Python
TensorFlow实现创建分类器
Feb 06 Python
Python实现从log日志中提取ip的方法【正则提取】
Mar 31 Python
python取数作为临时极大值(极小值)的方法
Oct 15 Python
解决pycharm工程启动卡住没反应的问题
Jan 19 Python
Python eval的常见错误封装及利用原理详解
Mar 26 Python
不到20行代码用Python做一个智能聊天机器人
Apr 19 Python
pycharm第三方库安装失败的问题及解决经验分享
May 09 Python
基于Python编写一个计算器程序,实现简单的加减乘除和取余二元运算
Aug 05 Python
pygame面向对象的飞行小鸟实现(Flappy bird)
Apr 01 Python
玩转python爬虫之爬取糗事百科段子
Feb 17 #Python
玩转python爬虫之正则表达式
Feb 17 #Python
玩转python爬虫之URLError异常处理
Feb 17 #Python
玩转python爬虫之cookie使用方法
Feb 17 #Python
Python 爬虫爬取指定博客的所有文章
Feb 17 #Python
Using Django with GAE Python 后台抓取多个网站的页面全文
Feb 17 #Python
python实现RSA加密(解密)算法
Feb 17 #Python
You might like
PHP 服务器配置(使用Apache及IIS两种方法)
2009/06/01 PHP
php for 循环语句使用方法详细说明
2010/05/09 PHP
php时间计算相关问题小结
2016/05/09 PHP
PHP中利用sleep函数实现定时执行功能实现代码
2016/08/25 PHP
php组合排序简单实现方法
2016/10/15 PHP
PHP有序表查找之插值查找算法示例
2018/02/10 PHP
围观tangram js库
2010/12/28 Javascript
js表单中选择框值的获取及表单的序列化
2015/12/17 Javascript
JavaScript实现简洁的俄罗斯方块完整实例
2016/03/01 Javascript
Markdown+Bootstrap图片自适应属性详解
2016/05/21 Javascript
JS查找字符串中出现次数最多的字符
2016/09/05 Javascript
jQuery给表格添加分页效果
2017/03/02 Javascript
详解vue表单验证组件 v-verify-plugin
2017/04/19 Javascript
node.js操作mysql简单实例
2017/05/25 Javascript
JavaScript Drum Kit 指南(纯 JS 模拟敲鼓效果)
2017/07/23 Javascript
js实现控制文件拖拽并获取拖拽内容功能
2018/02/17 Javascript
vue的diff算法知识点总结
2018/03/29 Javascript
深入理解Vue父子组件生命周期执行顺序及钩子函数
2018/08/12 Javascript
javascript中的闭包概念与用法实践分析
2019/07/26 Javascript
Vue基于localStorage存储信息代码实例
2020/11/16 Javascript
[02:29]大剑、皮鞭、女装,这届DOTA2勇士令状里都有
2020/07/17 DOTA
使用Python对MySQL数据操作
2017/04/06 Python
Python实现找出数组中第2大数字的方法示例
2018/03/26 Python
python实现将文件夹下面的不是以py文件结尾的文件都过滤掉的方法
2018/10/21 Python
django drf框架中的user验证以及JWT拓展的介绍
2019/08/12 Python
python 追踪except信息方式
2020/04/25 Python
解决python中import文件夹下面py文件报错问题
2020/06/01 Python
Python通过yagmail实现发送邮件代码解析
2020/10/27 Python
Matplotlib配色之Colormap详解
2021/01/05 Python
Css3+Js制作漂亮时钟(附源码)
2013/04/24 HTML / CSS
微软台湾官方网站:Microsoft台湾
2018/08/15 全球购物
vue路由实现登录拦截
2021/03/24 Vue.js
党员实事承诺书
2014/03/26 职场文书
电子商务助理求职自荐信
2014/04/10 职场文书
企业优秀员工事迹材料
2014/05/28 职场文书
观后感开头
2015/06/19 职场文书