python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
Python中的pass语句使用方法讲解
May 14 Python
Python初学时购物车程序练习实例(推荐)
Aug 08 Python
Python数字图像处理之霍夫线变换实现详解
Jan 12 Python
python实现对求解最长回文子串的动态规划算法
Jun 02 Python
python 字典修改键(key)的几种方法
Aug 10 Python
Python中的 enum 模块源码详析
Jan 09 Python
python 实现提取某个索引中某个时间段的数据方法
Feb 01 Python
python 获得任意路径下的文件及其根目录的方法
Feb 16 Python
使用Python轻松完成垃圾分类(基于图像识别)
Jul 09 Python
找Python安装目录,设置环境路径以及在命令行运行python脚本实例
Mar 09 Python
python异步Web框架sanic的实现
Apr 27 Python
解决Pytorch自定义层出现多Variable共享内存错误问题
Jun 28 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
PHP操作Memcache实例介绍
2013/06/14 PHP
Thinkphp中数据按分类嵌套循环实现方法
2014/10/30 PHP
php计算title标题相似比的方法
2015/07/29 PHP
PHP命名空间namespace用法实例分析
2016/09/27 PHP
IE与FireFox的兼容性问题分析
2007/04/22 Javascript
window.onbeforeunload方法在IE下无法正常工作的解决办法
2010/01/23 Javascript
Javascript 实用小技巧
2010/04/07 Javascript
js读写(删除)Cookie实例详解
2013/04/17 Javascript
javascript字符串函数汇总
2015/12/06 Javascript
jquery实现拖动效果(代码分享)
2017/01/25 Javascript
Bootstrap实现基于carousel.js框架的轮播图效果
2017/05/02 Javascript
lhgcalendar时间插件限制只能选择三个月的实现方法
2017/07/03 Javascript
JS如何实现在页面上快速定位(锚点跳转问题)
2017/08/14 Javascript
BootStrap模态框和select2合用时input无法获取焦点的解决方法
2017/09/01 Javascript
JS简单实现点击跳转登陆邮箱功能的方法
2017/10/31 Javascript
微信小程序tabBar模板用法实例分析【附demo源码下载】
2017/11/28 Javascript
关于Vue的路由权限管理的示例代码
2018/03/06 Javascript
Nodejs实现WebSocket代码实例
2020/05/19 NodeJs
[43:32]Winstrike vs VGJ.S 2018国际邀请赛淘汰赛BO3 第一场 8.23
2018/08/24 DOTA
[00:10]DOTA2 TI9勇士令状明日上线
2019/05/07 DOTA
举例讲解Python的Tornado框架实现数据可视化的教程
2015/05/02 Python
Python 常用string函数详解
2016/05/30 Python
python3新特性函数注释Function Annotations用法分析
2016/07/28 Python
深入理解Python爬虫代理池服务
2018/02/28 Python
Python使用win32 COM实现Excel的写入与保存功能示例
2018/05/03 Python
Django中STATIC_ROOT和STATIC_URL及STATICFILES_DIRS浅析
2018/05/08 Python
python十进制和二进制的转换方法(含浮点数)
2018/07/07 Python
Python numpy线性代数用法实例解析
2019/11/15 Python
Python 根据数据模板创建shapefile的实现
2019/11/26 Python
Django 再谈一谈json序列化
2020/03/16 Python
keras训练曲线,混淆矩阵,CNN层输出可视化实例
2020/06/15 Python
Python实例方法、类方法、静态方法区别详解
2020/09/05 Python
Python实现像awk一样分割字符串
2020/09/15 Python
优质有机椰子产品:Dr. Goerg
2019/09/24 全球购物
广告艺术设计专业自荐书
2014/07/08 职场文书
银行客户经理岗位职责
2015/04/09 职场文书