php与python实现的线程池多线程爬虫功能示例


Posted in PHP on October 12, 2016

本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:

多线程爬虫可以用来抓取内容,并且能够明显提升抓取性能。这里我们来看php与python线程池多线程爬虫的例子,代码如下:

php例子

<?php
/**
 * Pool worker that lazily creates one cURL handle and hands it out to the
 * tasks it executes.
 */
class Connect extends Worker // worker mode
{
    /**
     * Note that the handle is stored statically, which for pthreads means
     * thread local.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Return the cURL handle, creating and configuring it on first use.
     *
     * @return mixed The handle produced by curl_init().
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);          // abort a transfer after 2 seconds
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);   // curl_exec() returns the body instead of printing it
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);           // keep response headers out of the returned body
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);      // do not use signals for timeouts
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);   // follow HTTP redirects
        }
        /* do some exception/error stuff here maybe */
        return self::$ch;
    }

    /**
     * Close the handle if one was opened, then forget it.
     *
     * Clearing the static lets a later getConnection() build a fresh handle
     * instead of returning the closed one, and the guard avoids passing an
     * unset value to curl_close().
     */
    public function closeConnection()
    {
        if (self::$ch) {
            curl_close(self::$ch);
        }
        self::$ch = null;
    }
}
/**
 * One crawl task: fetches a single URL through the worker's cURL handle,
 * prints a one-line status and keeps the response body.
 */
class Query extends Threaded
{
    /** URL this task fetches. */
    protected $url;

    /** Value returned by curl_exec() for the URL (false when the transfer failed). */
    protected $result;

    /**
     * @param string $url URL to fetch.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the URL, report the outcome and store the page.
     */
    public function run()
    {
        // $this->worker is presumably the Connect worker executing this task
        // (set by pthreads) -- confirm against the pthreads version in use.
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Print "OK" for an HTTP 200 response, otherwise the failure reason.
     *
     * @param string $url   URL that was fetched.
     * @param mixed  $page  Response body (unused here).
     * @param array  $info  Result of curl_getinfo().
     * @param string $error Result of curl_error().
     */
    function deal_data($url, $page, $info, $error)
    {
        // The label is the second dot-separated piece of the URL; fall back
        // to the whole URL when it contains no "." at all.
        $parts = explode(".", $url);
        $id = isset($parts[1]) ? $parts[1] : $url;
        $code = isset($info['http_code']) ? $info['http_code'] : 0;
        if ($code != 200) {
            // A 4xx/5xx response leaves curl_error() empty; show the status
            // code then, so the line is never blank.
            $reason = ((string) $error !== '') ? $error : "HTTP $code";
            $this->show_msg($id, $reason);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Echo one tab-separated status line.
     */
    function show_msg($id, $msg)
    {
        echo $id."\t$msg\n";
    }

    /**
     * @return mixed The value stored by run().
     */
    public function getResult()
    {
        return $this->result;
    }
}
/**
 * Submit one Query per configured URL to a pool of Connect workers, then
 * shut the pool down.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // the URLs to crawl, keyed by URL
    $check_urls = array( 'http://xxx.com' => "xx网",);
    $pool = new Pool(10, "Connect", array()); // pool sized for 10 Connect workers
    foreach (array_keys($check_urls) as $url) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();
python 多线程
def handle(sid):
    """Process the crawl task identified by ``sid`` (placeholder).

    The crawler's data handling is meant to be implemented in this function.
    """
    pass
from threading import Thread


class MyThread(Thread):
    """Thread that calls ``handle`` with a single task id."""

    def __init__(self, sid):
        """Remember the task id ``sid`` for ``run``."""
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        """Thread body: called on the instance by ``Thread.start()``."""
        handle(self.sid)
# Start ten workers (ids 1..10), then wait for every one of them to finish.
threads = []
for i in range(1, 11):  # range() exists on Python 2 and 3; xrange() is Python 2 only
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()

python 线程池爬虫:

from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time
# Paths discovered so far; starts with the root path already recorded.
seen_urls = {'/'}
# Taken by Fetcher threads while they read and update seen_urls.
lock = Lock()
class Fetcher(Thread):
    """Daemon worker thread that crawls URL paths taken from a shared queue.

    Each task is a path on ``HOST:PORT``. The worker downloads the page,
    extracts the links that stay on that host and enqueues every path that
    has not been seen before. It relies on the module-level ``seen_urls``
    set and ``lock``.
    """

    # Address of the single site being crawled.
    HOST = 'localhost'
    PORT = 3000

    def __init__(self, tasks):
        """Store the task queue and start the thread immediately.

        Args:
            tasks: queue of URL paths shared by all workers.
        """
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        """Process tasks forever, marking each one done even when it fails."""
        while True:
            url = self.tasks.get()
            try:
                print(url)
                links = self.parse_links(url, self._fetch(url))
                with lock:
                    new_links = links - seen_urls
                    seen_urls.update(new_links)
                    # Enqueue before task_done() so Queue.join() cannot
                    # return while follow-up work is still pending.
                    for link in new_links:
                        self.tasks.put(link)
            except (OSError, ValueError) as exc:
                # Network failures and non-ASCII paths must not kill the
                # worker: a dead worker would leave join() waiting forever.
                print('error: {} ({})'.format(url, exc))
            finally:
                self.tasks.task_done()

    def _fetch(self, url):
        """Return the raw HTTP/1.0 response bytes for ``url``.

        Raises:
            OSError: if connecting, sending or receiving fails.
            UnicodeEncodeError: if ``url`` contains non-ASCII characters.
        """
        request = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'.format(url, self.HOST)
        chunks = []
        # The context manager closes the socket on every exit path.
        with socket.socket() as sock:
            sock.connect((self.HOST, self.PORT))
            sock.sendall(request.encode('ascii'))
            while True:
                chunk = sock.recv(4096)
                if not chunk:  # HTTP/1.0: the server closes when it is done
                    break
                chunks.append(chunk)
        return b''.join(chunks)

    def parse_links(self, fetched_url, response):
        """Return the set of same-host paths linked from an HTML response.

        Args:
            fetched_url: path the response was fetched from; relative links
                are resolved against it.
            response: raw response bytes (status line, headers and body).

        Returns:
            A set of URL paths. It is empty for an empty response, a
            non-HTML response, or a page without usable links.
        """
        if not response:
            print('error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        hrefs = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                               self.body(response)))
        own_host = self.HOST.lower()
        links = set()
        for href in hrefs:
            try:
                parts = urllib.parse.urlparse(
                    urllib.parse.urljoin(fetched_url, href))
            except ValueError:
                # e.g. a malformed IPv6 literal in the link; skip it
                continue
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # ``hostname`` is already lower-cased and stripped of the port.
            # Compare for equality: ``in ('localhost')`` would be a substring
            # test against a plain string.
            host = parts.hostname
            if host and host != own_host:
                continue
            # The parsed path never contains a fragment. An empty path (link
            # to the bare host) means the root.
            links.add(parts.path or '/')
        return links

    def body(self, response):
        """Return the decoded body of ``response``.

        Returns '' when there is no header terminator. Bytes that are not
        valid UTF-8 are replaced rather than raising.
        """
        payload = response.partition(b'\r\n\r\n')[2]
        return payload.decode('utf-8', errors='replace')

    def _is_html(self, response):
        """Return True when the Content-Type header starts with text/html.

        Header names are matched case-insensitively and a missing space
        after the colon is tolerated.
        """
        head = response.partition(b'\r\n\r\n')[0]
        # iso-8859-1 maps every byte, so decoding the header block cannot fail.
        for line in head.decode('iso-8859-1').split('\r\n')[1:]:
            name, sep, value = line.partition(':')
            if sep and name.strip().lower() == 'content-type':
                return value.strip().lower().startswith('text/html')
        return False
class ThreadPool:
    """Group of Fetcher workers that all consume one shared task queue."""

    def __init__(self, num_threads):
        """Create the queue and start ``num_threads`` Fetcher workers on it."""
        shared_queue = Queue()
        self.tasks = shared_queue
        for _worker_no in range(num_threads):
            # Fetcher starts itself on construction; no reference is kept.
            Fetcher(shared_queue)

    def add_task(self, url):
        """Queue ``url`` for the workers."""
        pending = self.tasks
        pending.put(url)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        pending = self.tasks
        pending.join()
if __name__ == '__main__':
    # Crawl from the root path with four workers, then report the totals.
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    total = len(seen_urls)
    elapsed = time.time() - start
    summary = '{} URLs fetched in {:.1f} seconds'.format(total, elapsed)
    print(summary)

希望本文所述对大家的PHP与Python程序设计有所帮助。

PHP 相关文章推荐
require(),include(),require_once()和include_once()的异同
Jan 02 PHP
php下过滤html代码的函数 提高程序安全性
Mar 02 PHP
php中global和$GLOBALS[]的分析之一
Feb 02 PHP
PHP将XML转数组过程详解
Nov 13 PHP
php curl请求信息和返回信息设置代码实例
Apr 27 PHP
PHP7常量数组用法分析
Sep 26 PHP
PHP yii实现model添加默认值的方法(两种方法)
Nov 10 PHP
详解Yii2 定制表单输入字段的标签和样式
Jan 04 PHP
PHP自定义函数实现数组比较功能示例
Oct 19 PHP
JS(jQuery)实现聊天接收到消息语言自动提醒功能详解【提示“您有新的消息请注意查收”】
Apr 16 PHP
调试php程序的简单步骤
Oct 04 PHP
laravel框架如何设置公共头和公共尾
Oct 22 PHP
php实现的SSO单点登录系统接入功能示例分析
Oct 12 #PHP
php用户密码加密算法分析【Discuz加密算法】
Oct 12 #PHP
基于php实现的php代码加密解密类完整实例
Oct 12 #PHP
php fseek函数读取大文件两种方法
Oct 12 #PHP
PHP从二维数组得到N层分类树的实现代码
Oct 11 #PHP
php 无限分类 树形数据格式化代码
Oct 11 #PHP
PHP简单判断iPhone、iPad、Android及PC设备的方法
Oct 11 #PHP
You might like
PHP 引用是个坏习惯
2010/03/12 PHP
浅谈php serialize()与unserialize()的用法
2013/06/05 PHP
Javascript 加载和执行-性能提高篇
2012/12/28 Javascript
JS中setInterval、setTimeout不能传递带参数的函数的解决方案
2013/04/28 Javascript
AngularJS身份验证的方法
2016/02/17 Javascript
JS组件Bootstrap按钮组与下拉按钮详解
2016/05/10 Javascript
JavaScript对象数组排序实例方法浅析
2016/06/15 Javascript
图文详解JavaScript的原型对象及原型链
2016/08/02 Javascript
js图片上传的封装代码
2017/08/01 Javascript
JS中appendChild追加子节点无效的解决方法
2018/10/14 Javascript
Vue.js路由实现选项卡简单实例
2019/07/24 Javascript
使用layui定义一个模块并使用的例子
2019/09/14 Javascript
解决vue 表格table列求和的问题
2019/11/06 Javascript
[55:56]NB vs Infamous 2019国际邀请赛淘汰赛 败者组 BO3 第二场 8.22
2019/09/05 DOTA
Python实现过滤单个Android程序日志脚本分享
2015/01/16 Python
Python中的ctime()方法使用教程
2015/05/22 Python
Python中对象迭代与反迭代的技巧总结
2016/09/17 Python
Windows下的Jupyter Notebook 安装与自定义启动(图文详解)
2018/02/21 Python
Python numpy 提取矩阵的某一行或某一列的实例
2018/04/03 Python
聊聊python里如何用Borg pattern实现的单例模式
2019/06/06 Python
django中账号密码验证登陆功能的实现方法
2019/07/15 Python
利用Python复制文件的9种方法总结
2019/09/02 Python
pytorch实现mnist分类的示例讲解
2020/01/10 Python
python tkinter 设置窗口大小不可缩放实例
2020/03/04 Python
在Mac中PyCharm配置python Anaconda环境过程图解
2020/03/11 Python
详解Django中views数据查询使用locals()函数进行优化
2020/08/24 Python
互斥锁解决 Python 中多线程共享全局变量的问题(推荐)
2020/09/28 Python
美国最大的香水出口:FragranceX.com
2017/11/04 全球购物
盛大二次面试题
2016/11/18 面试题
简历自我评价模版
2014/01/31 职场文书
总经理文秘岗位职责
2014/02/03 职场文书
学生会招新策划书
2014/02/14 职场文书
国培计划培训感言
2014/03/11 职场文书
汉语拼音教学反思
2016/02/22 职场文书
关于JavaScript轮播图的实现
2021/11/20 Javascript
python数字图像处理之图像自动阈值分割示例
2022/06/28 Python