python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
Python三级目录展示的实现方法
Sep 28 Python
利用Python实现网络测试的脚本分享
May 26 Python
python中使用xlrd读excel使用xlwt写excel的实例代码
Jan 31 Python
python GUI实现小球满屏乱跑效果
May 09 Python
django框架模板中定义变量(set variable in django template)的方法分析
Jun 24 Python
python plotly绘制直方图实例详解
Jul 22 Python
python虚拟环境完美部署教程
Aug 06 Python
利用pyecharts实现地图可视化的例子
Aug 12 Python
Python GUI学习之登录系统界面篇
Aug 21 Python
Python socket聊天脚本代码实例
Jan 02 Python
Python实现遗传算法(二进制编码)求函数最优值方式
Feb 11 Python
python实现简易名片管理系统
Apr 11 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
一个程序下载的管理程序(三)
2006/10/09 PHP
php mssql 数据库分页SQL语句
2008/12/16 PHP
php数组函数序列之array_intersect() 返回两个或多个数组的交集数组
2011/11/10 PHP
PHP中关键字interface和implements详解
2017/06/14 PHP
PHP基于timestamp和nonce实现的防止重放攻击方案分析
2019/07/26 PHP
PHP dirname(__FILE__)原理及用法解析
2020/10/28 PHP
40个有创意的jQuery图片、内容滑动及弹出插件收藏集之一
2011/12/31 Javascript
一个奇葩的最短的 IE 版本判断JS脚本
2014/05/28 Javascript
Node.js中创建和管理外部进程详解
2014/08/16 Javascript
20个实用的JavaScript技巧分享
2014/11/28 Javascript
jQuery表格插件datatables用法详解
2020/11/23 Javascript
JavaScript中数组添加值和访问值常见问题
2016/02/06 Javascript
jQuery实现拖拽页面元素并将其保存到cookie的方法
2016/06/12 Javascript
JS实现一次性弹窗的方法【刷新后不弹出】
2016/12/26 Javascript
JavaScript 栈的详解及实例代码
2017/01/22 Javascript
VUE实现日历组件功能
2017/03/13 Javascript
JavaScript订单操作小程序完整版
2017/06/23 Javascript
js字符串类型String常用操作实例总结
2019/07/05 Javascript
layer弹出层自定义提交取消按钮的例子
2019/09/10 Javascript
layui table 获取分页 limit的方法
2019/09/20 Javascript
微信小程序上传帖子的实例代码(含有文字图片的微信验证)
2020/07/11 Javascript
[01:36:19]Secret vs NB 2018国际邀请赛小组赛BO2 第一场 8.19
2018/08/21 DOTA
[42:32]VP vs RNG 2019国际邀请赛淘汰赛 败者组 BO3 第一场 8.21.mp4
2020/07/19 DOTA
Python中if __name__ == '__main__'作用解析
2015/06/29 Python
5款非常棒的Python工具
2018/01/05 Python
Python模块文件结构代码详解
2018/02/03 Python
Python实现爬虫从网络上下载文档的实例代码
2018/06/13 Python
pandas.DataFrame选取/排除特定行的方法
2018/07/03 Python
python实现简单登陆系统
2018/10/18 Python
Python Django简单实现session登录注销过程详解
2019/08/06 Python
PyCharm2019安装教程及其使用(图文教程)
2019/09/29 Python
肯尼亚网上商城:Kilimall
2016/08/20 全球购物
说出ArrayList,Vector, LinkedList的存储性能和特性
2015/01/04 面试题
农行实习自我鉴定
2013/09/22 职场文书
社区禁毒工作方案
2014/06/02 职场文书
感情真挚的毕业生求职信
2014/07/19 职场文书