python实现多线程采集的2个代码例子


Posted in Python on July 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# Variable settings
#
THREAD_LIMIT = 3  # number of spider threads started by workerbee()
jobs = Queue.Queue(5)  # bounded queue (maxsize 5) of jobs waiting for a spider thread
singlelock = threading.Lock()  # lock shared by workerbee() and the spider threads
info = Queue.Queue()  # unbounded queue of [id, title] results, drained by the main block
 
def workerbee(inputlist):
    """Start the spider threads, queue every job, and wait until all are done.

    Args:
        inputlist: iterable of jobs. Each item is put on the module-level
            ``jobs`` queue unchanged and is consumed by a ``spider`` thread.

    Raises:
        RuntimeError: if the queue stays full and no spider thread is alive
            to drain it (waiting any longer could never succeed).

    A job is never dropped just because the bounded queue was full for five
    seconds: the put is retried for as long as at least one worker is alive.
    """
    workers = []
    for x in xrange(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        t = spider()
        t.start()
        workers.append(t)

    for i in inputlist:
        while True:
            try:
                jobs.put(i, block=True, timeout=5)
                break
            except Queue.Full:
                # Only a full queue is expected here; anything else should
                # propagate instead of being swallowed.
                with singlelock:
                    print("The queue is full !")
                if not any(t.is_alive() for t in workers):
                    # Nobody is left to consume the queue, so retrying (or
                    # calling jobs.join() below) would block forever.
                    raise RuntimeError(
                        'all worker threads exited while jobs were still queued')

    # Hold the lock while printing so the message does not interleave with
    # output from other threads that print under the same lock.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until task_done() has been called once for every queued job.
    jobs.join()
 
def getTitle(url,time=10):
    """Download ``url`` and return the text of its first ``<title>`` element.

    Args:
        url: address handed to ``urllib2.urlopen``.
        time: timeout in seconds passed to ``urllib2.urlopen`` (default 10).

    Returns:
        The title exactly as it appears in the response body (raw, undecoded
        bytes), or ``''`` when the page contains no ``<title>`` element.

    Raises:
        Any error raised by ``urllib2.urlopen`` or by reading the response
        (for example ``urllib2.URLError``).
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        html = response.read()
    finally:
        # Close the connection even when read() fails part-way through.
        response.close()
    # IGNORECASE: accept <TITLE>; DOTALL: accept titles that span lines;
    # the optional group accepts attributes such as <title id="x">.
    match = re.search(r'<title(?:\s[^>]*)?>(.*?)</title>', html, re.I | re.S)
    if match is None:
        return ''
    # NOTE: callers that need UTF-8 must transcode the result themselves,
    # e.g. title.decode('gb2312', 'replace').encode('utf-8').
    return match.group(1)
 
class spider(threading.Thread):
    """Worker thread that turns queued ``[id, url]`` jobs into ``[id, title]`` results.

    Jobs are taken from the module-level ``jobs`` queue; each result is put on
    the module-level ``info`` queue. The thread exits once ``jobs`` has been
    empty for one second.
    """

    def run(self):
        """Process jobs until the queue stays empty for one second.

        A job whose download or parsing fails is reported and skipped; the
        thread keeps running. ``jobs.task_done()`` is called for every job
        taken, whether it succeeded or not, so ``jobs.join()`` cannot hang on
        a failed job.
        """
        while True:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived within the timeout: treat the work as done.
                break
            try:
                # The download runs without holding singlelock so the worker
                # threads can fetch in parallel; both queues do their own
                # locking.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as err:
                # Thread boundary: report the failure instead of letting it
                # kill the worker. repr() keeps the message safe to print
                # whatever bytes the URL or the error contain.
                with singlelock:
                    print('Failed to fetch {0!r}: {1!r}'.format(job[1], err))
            finally:
                jobs.task_done()
 
if __name__ == '__main__':
    # Script entry point: load up to ten unprocessed rows, crawl their URLs,
    # then print every collected [id, title] pair.
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        # Each job is an [id, url] list built from the first two columns.
        urls.extend([record[0], record[1]] for record in cur.fetchall())
        workerbee(urls)
        while not info.empty():
            print(info.get())
    finally:
        # con is still None when connect() itself failed.
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Variable settings
#
THREAD_LIMIT = 3        # number of threads
jobs = Queue.Queue(5)      # queue length (at most 5 pending jobs)
singlelock = threading.Lock()    # a thread lock, intended to avoid duplicate calls
 
# Article pages handed to workerbee() when the script is run directly.
urls = [
  'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
  'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
 
def workerbee(inputlist):
  """Start the spider threads, queue every job, and wait until all are done.

  Args:
    inputlist: iterable of jobs. Each item is put on the module-level
      ``jobs`` queue unchanged and is consumed by a ``spider`` thread.

  Raises:
    RuntimeError: if the queue stays full and no spider thread is alive
      to drain it (waiting any longer could never succeed).

  A job is never dropped just because the bounded queue was full for five
  seconds: the put is retried for as long as at least one worker is alive.
  """
  workers = []
  for x in xrange(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    t = spider()
    t.start()
    workers.append(t)

  for i in inputlist:
    while True:
      try:
        jobs.put(i, block=True, timeout=5)
        break
      except Queue.Full:
        # Only a full queue is expected here; anything else should
        # propagate instead of being swallowed.
        with singlelock:
          print("The queue is full !")
        if not any(t.is_alive() for t in workers):
          # Nobody is left to consume the queue, so retrying (or calling
          # jobs.join() below) would block forever.
          raise RuntimeError(
            'all worker threads exited while jobs were still queued')

  # Hold the lock while printing so the message does not interleave with
  # output from other threads that print under the same lock.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until task_done() has been called once for every queued job.
  jobs.join()
 
def getTitle(url,time=10):
  """Download ``url`` and return its first ``<title>`` text re-encoded as UTF-8.

  Args:
    url: address handed to ``urllib2.urlopen``.
    time: timeout in seconds passed to ``urllib2.urlopen`` (default 10).

  Returns:
    The title decoded as gb2312 (undecodable bytes replaced) and encoded
    as UTF-8, or ``''`` when the page contains no ``<title>`` element.

  Raises:
    Any error raised by ``urllib2.urlopen`` or by reading the response
    (for example ``urllib2.URLError``).
  """
  response = urllib2.urlopen(url, timeout=time)
  try:
    html = response.read()
  finally:
    # Close the connection even when read() fails part-way through.
    response.close()
  # IGNORECASE: accept <TITLE>; DOTALL: accept titles that span lines;
  # the optional group accepts attributes such as <title id="x">.
  match = re.search(r'<title(?:\s[^>]*)?>(.*?)</title>', html, re.I | re.S)
  if match is None:
    return ''
  return match.group(1).decode('gb2312', 'replace').encode('utf-8')
 
class spider(threading.Thread):
  """Worker thread that fetches the title of every URL taken from ``jobs``.

  Each job is a URL taken from the module-level ``jobs`` queue; the URL and
  its title are printed. The thread exits once ``jobs`` has been empty for
  one second.
  """

  def run(self):
    """Process jobs until the queue stays empty for one second.

    A job whose download or parsing fails is reported and skipped; the
    thread keeps running. ``jobs.task_done()`` is called for every job
    taken, whether it succeeded or not, so ``jobs.join()`` cannot hang on a
    failed job.
    """
    while True:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived within the timeout: treat the work as done.
        break
      try:
        # The download runs without holding singlelock so the worker threads
        # can fetch in parallel; the lock only guards the print below.
        title = getTitle(job)
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as err:
        # Thread boundary: report the failure instead of letting it kill the
        # worker. repr() keeps the message safe to print whatever bytes the
        # URL or the error contain.
        with singlelock:
          print('Failed to fetch {0!r}: {1!r}'.format(job, err))
      finally:
        jobs.task_done()
 
if __name__ == "__main__":
  # Script entry point: crawl the module-level list of sample URLs.
  workerbee(inputlist=urls)
Python 相关文章推荐
Python字符转换
Sep 06 Python
python操作MySQL数据库的方法分享
May 29 Python
基于Python实现通过微信搜索功能查看谁把你删除了
Jan 27 Python
Python在Console下显示文本进度条的方法
Feb 14 Python
Java编程迭代地删除文件夹及其下的所有文件实例
Feb 10 Python
matplotlib 输出保存指定尺寸的图片方法
May 24 Python
Python函数装饰器实现方法详解
Dec 22 Python
TensorFlow绘制loss/accuracy曲线的实例
Jan 21 Python
Python 通过监听端口实现唯一脚本运行方式
May 05 Python
django rest framework使用django-filter用法
Jul 15 Python
Python爬虫scrapy框架Cookie池(微博Cookie池)的使用
Jan 13 Python
linux系统下pip升级报错的解决方法
Jan 31 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
分享PHP入门的学习方法
2007/01/02 PHP
令PHP初学者头疼十四条问题大总结
2008/11/12 PHP
PHP时间戳 strtotime()使用方法和技巧
2013/10/29 PHP
Yii框架引用插件和ckeditor中body与P标签去除的方法
2017/01/19 PHP
php-app开发接口加密详解
2018/04/18 PHP
如何在PHP中读写文件
2020/09/07 PHP
jQuery 性能优化指南(2)
2009/05/21 Javascript
javascript的console.log()用法小结
2012/05/31 Javascript
浅析Cookie中的Path与domain
2013/12/18 Javascript
jQuery的缓存机制浅析
2014/06/07 Javascript
jQuery异步加载数据并添加事件示例
2014/08/24 Javascript
jQuery遍历之next()、nextAll()方法使用实例
2014/11/08 Javascript
《JavaScript DOM 编程艺术》读书笔记之JavaScript 图片库
2015/01/09 Javascript
谈谈我对JavaScript DOM事件的理解
2015/12/18 Javascript
jQuery中$.each()函数的用法引申实例
2016/05/12 Javascript
详解webpack打包vue时提取css
2017/05/26 Javascript
Windows下使用Nodejs运行js的方法
2017/09/02 NodeJs
vue2里面ref的具体使用方法
2017/10/27 Javascript
基于游标的分页接口实现代码示例
2018/11/12 Javascript
解决vue移动端适配问题
2018/12/12 Javascript
js实现时钟定时器
2020/03/26 Javascript
ES6 async、await的基本使用方法示例
2020/06/06 Javascript
Python中endswith()函数的基本使用
2015/04/07 Python
Python使用multiprocessing创建进程的方法
2015/06/04 Python
pyqt5的QComboBox 使用模板的具体方法
2018/09/06 Python
Python使用random.shuffle()打乱列表顺序的方法
2018/11/08 Python
在Pandas中给多层索引降级的方法
2018/11/16 Python
Python Pandas中根据列的值选取多行数据
2019/07/08 Python
Django 导出项目依赖库到 requirements.txt过程解析
2019/08/23 Python
Python和Anaconda和Pycharm安装教程图文详解
2020/02/04 Python
Python使用ElementTree美化XML格式的操作
2020/03/06 Python
Django-simple-captcha验证码包使用方法详解
2020/11/28 Python
捷克电器和DJ设备网上商店:Electronic-star
2017/07/18 全球购物
寒假家长评语大全
2014/04/16 职场文书
升职自荐信怎么写
2015/03/05 职场文书
PHP设计模式(观察者模式)
2021/07/07 PHP