python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
python smtplib模块自动收发邮件功能(一)
May 22 Python
django框架模型层功能、组成与用法分析
Jul 30 Python
python OpenCV GrabCut使用实例解析
Nov 11 Python
python pygame实现挡板弹球游戏
Nov 25 Python
解决Tensorboard 不显示计算图graph的问题
Feb 15 Python
PyCharm2020最新激活码+激活码补丁(亲测最新版PyCharm2020.2激活成功)
Nov 25 Python
Python中Yield的基本用法
Oct 18 Python
Python实现壁纸下载与轮换
Oct 19 Python
用python批量下载apk
Dec 29 Python
如何用 Python 子进程关闭 Excel 自动化中的弹窗
May 07 Python
Python超简单容易上手的画图工具库推荐
May 10 Python
Python中for后接else的语法使用
May 18 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
PHP命名空间(Namespace)的使用详解
2013/05/04 PHP
php中调用其他系统http接口的方法说明
2014/02/28 PHP
排序算法之PHP版快速排序、冒泡排序
2014/04/09 PHP
php 反斜杠处理函数addslashes()和stripslashes()实例详解
2016/12/25 PHP
PHP实现将标点符号正则替换为空格的方法
2017/08/09 PHP
PHP实现登录验证码校验功能
2018/05/17 PHP
jQuery each()方法的使用方法
2010/03/18 Javascript
分享一个自定义的console类 让你不再纠结JS中的调试代码的兼容
2012/04/20 Javascript
js时间戳格式化成日期格式的多种方法
2013/11/11 Javascript
js正则表达式中test,exec,match方法的区别说明
2014/01/29 Javascript
js表单验证实例讲解
2016/03/31 Javascript
vue解决使用webpack打包后keep-alive不生效的方法
2018/09/01 Javascript
layui给下拉框、按钮状态、时间赋初始值的方法
2019/09/10 Javascript
小程序实现锚点滑动效果
2019/09/23 Javascript
JavaScript indexOf()原理及使用方法详解
2020/07/09 Javascript
Openlayers绘制地图标注
2020/09/28 Javascript
[01:03:59]2018DOTA2亚洲邀请赛3月30日 小组赛B组VGJ.T VS Secret
2018/03/31 DOTA
Python中pillow知识点学习
2018/04/30 Python
Python全局变量与局部变量区别及用法分析
2018/09/03 Python
Python安装whl文件过程图解
2020/02/18 Python
详解django使用include无法跳转的解决方法
2020/03/19 Python
使用python无账号无限制获取企查查信息的实例代码
2020/04/17 Python
python实现npy格式文件转换为txt文件操作
2020/07/01 Python
python openCV实现摄像头获取人脸图片
2020/08/20 Python
突破canvas语法限制 让他支持链式语法
2012/12/24 HTML / CSS
Sneaker Studio乌克兰:购买运动鞋
2018/03/26 全球购物
Laura官网:加拿大女性的顶级时尚目的地
2019/09/20 全球购物
Ruby如何定义一个类
2012/10/08 面试题
对祖国的寄语大全
2014/04/11 职场文书
护士求职信范文
2014/05/24 职场文书
80后婚前协议书范本
2014/10/24 职场文书
党员民主生活会材料
2014/12/15 职场文书
2015年全国保险公众宣传日活动方案
2015/05/06 职场文书
平凡的世界读书笔记
2015/06/25 职场文书
党员廉政准则心得体会
2016/01/20 职场文书
Python中的变量与常量
2021/11/11 Python