python实现多线程采集的2个代码例子


Posted in Python on July 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"

#
# Variable settings
#
THREAD_LIMIT = 3  # number of spider threads started by workerbee()
jobs = Queue.Queue(5)  # bounded queue (at most 5 items) of pending jobs
singlelock = threading.Lock()  # lock shared by workerbee() and spider.run()
info = Queue.Queue()  # unbounded queue collecting the [id, title] results
 
def workerbee(inputlist):
    """Start the spider threads, feed them jobs and wait for completion.

    THREAD_LIMIT ``spider`` threads are started first, then every item of
    *inputlist* is pushed onto the bounded ``jobs`` queue. Finally the call
    blocks in ``jobs.join()`` until every queued job has been marked done
    with ``task_done()``.

    Args:
        inputlist: iterable of jobs; each item is put on the queue unchanged.

    Note:
        A job that cannot be queued within 5 seconds is dropped after the
        "queue is full" message has been printed.
    """
    for x in xrange(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        spider().start()

    for job in inputlist:
        try:
            jobs.put(job, block=True, timeout=5)
        except Queue.Full:
            # Only a full queue is expected here; any other error (or a
            # Ctrl-C) must propagate instead of being reported as "full".
            with singlelock:
                print("The queue is full !")

    # Hold the lock while printing so the message is not interleaved with
    # output from the worker threads; ``with`` releases it even on error.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until every queued job has been acknowledged via task_done().
    jobs.join()
 
def getTitle(url,time=10):
    """Fetch *url* and return the text of its first ``<title>`` element.

    Args:
        url: address passed to ``urllib2.urlopen``.
        time: timeout in seconds for the request (default 10).

    Returns:
        The text between the first ``<title>`` and ``</title>`` tags, exactly
        as received (no charset conversion). The pattern is case-sensitive
        and does not match a title that spans several lines.

    Raises:
        IndexError: if the page contains no matching ``<title>`` element.
        Any exception raised by ``urllib2.urlopen`` or ``read`` is propagated.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        html = response.read()
    finally:
        # Close the connection even when reading the body fails.
        response.close()

    match = re.search(r'<title>(.*?)</title>', html)
    if match is None:
        # Same exception type the old ``findall(...)[0]`` produced, but with
        # a message that says which page was at fault.
        raise IndexError('no <title> element found in {0!r}'.format(url))
    return match.group(1)
 
class spider(threading.Thread):
    """Worker thread that turns queued ``[id, url]`` jobs into page titles."""

    def run(self):
        """Process jobs until the queue has stayed empty for one second.

        For every job the title of ``job[1]`` is fetched with ``getTitle``
        and ``[job[0], title]`` is put on the ``info`` queue. A job that
        fails is reported on stderr and skipped; the thread keeps running.
        """
        while True:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived for a second: treat the work as finished.
                break

            try:
                # The fetch runs without holding ``singlelock`` so that the
                # threads can download concurrently; both queues are
                # thread-safe on their own.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as exc:
                # The lock only keeps error lines from different threads
                # from interleaving.
                with singlelock:
                    sys.stderr.write(
                        'Failed to process {0!r}: {1!r}\n'.format(job, exc))
            finally:
                # Always acknowledge the job, otherwise jobs.join() in
                # workerbee() would wait forever after a failed fetch.
                jobs.task_done()
 
if __name__ == '__main__':
    # Read up to ten rows whose status is 0, crawl their URLs with the
    # spider threads and print every collected [id, title] result.
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cursor = con.cursor()
        cursor.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        # Each job is an [id, url] pair, the shape spider.run() indexes into.
        urls.extend([record[0], record[1]] for record in cursor.fetchall())
        workerbee(urls)
        # workerbee() has returned, so drain whatever the workers collected.
        while not info.empty():
            print(info.get())
    finally:
        # Close the connection whether or not the crawl succeeded.
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Variable settings
#
THREAD_LIMIT = 3        # number of worker threads
jobs = Queue.Queue(5)      # maximum queue length
singlelock = threading.Lock()    # lock shared by workerbee() and spider.run()
 
# Pages to crawl, one URL per line.
urls = [
    'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
    'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
    'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
    'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
 
def workerbee(inputlist):
  """Start the spider threads, feed them jobs and wait for completion.

  THREAD_LIMIT ``spider`` threads are started first, then every item of
  *inputlist* is pushed onto the bounded ``jobs`` queue. Finally the call
  blocks in ``jobs.join()`` until every queued job has been marked done
  with ``task_done()``.

  Args:
    inputlist: iterable of jobs; each item is put on the queue unchanged.

  Note:
    A job that cannot be queued within 5 seconds is dropped after the
    "queue is full" message has been printed.
  """
  for x in xrange(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    spider().start()

  for job in inputlist:
    try:
      jobs.put(job, block=True, timeout=5)
    except Queue.Full:
      # Only a full queue is expected here; any other error (or a
      # Ctrl-C) must propagate instead of being reported as "full".
      with singlelock:
        print("The queue is full !")

  # Hold the lock while printing so the message is not interleaved with
  # output from the worker threads; ``with`` releases it even on error.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until every queued job has been acknowledged via task_done().
  jobs.join()
 
def getTitle(url,time=10):
  """Fetch *url* and return its first ``<title>`` text encoded as UTF-8.

  Args:
    url: address passed to ``urllib2.urlopen``.
    time: timeout in seconds for the request (default 10).

  Returns:
    The text between the first ``<title>`` and ``</title>`` tags, decoded
    as gb2312 (undecodable bytes are replaced) and re-encoded as UTF-8.
    The pattern is case-sensitive and does not match a title that spans
    several lines.

  Raises:
    IndexError: if the page contains no matching ``<title>`` element.
    Any exception raised by ``urllib2.urlopen`` or ``read`` is propagated.
  """
  response = urllib2.urlopen(url, timeout=time)
  try:
    html = response.read()
  finally:
    # Close the connection even when reading the body fails.
    response.close()

  match = re.search(r'<title>(.*?)</title>', html)
  if match is None:
    # Same exception type the old ``findall(...)[0]`` produced, but with
    # a message that says which page was at fault.
    raise IndexError('no <title> element found in {0!r}'.format(url))
  return match.group(1).decode('gb2312','replace').encode('utf-8')
 
class spider(threading.Thread):
  """Worker thread that fetches and prints the title of each queued URL."""

  def run(self):
    """Process jobs until the queue has stayed empty for one second.

    For every job the title is fetched with ``getTitle`` and printed. A job
    that fails is reported on stderr and skipped; the thread keeps running.
    """
    while True:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived for a second: treat the work as finished.
        break

      try:
        # Fetch without holding ``singlelock`` so that the threads can
        # download concurrently; the lock is only needed for printing.
        title = getTitle(job)
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as exc:
        with singlelock:
          sys.stderr.write(
            'Failed to process {0!r}: {1!r}\n'.format(job, exc))
      finally:
        # Always acknowledge the job, otherwise jobs.join() in
        # workerbee() would wait forever after a failed fetch.
        jobs.task_done()
 
if __name__ == '__main__':
  # When run as a script, crawl the hard-coded URL list defined above.
  workerbee(urls)
Python 相关文章推荐
详细解析Python中__init__()方法的高级应用
May 11 Python
Python实现PS滤镜中马赛克效果示例
Jan 20 Python
1分钟快速生成用于网页内容提取的xslt
Feb 23 Python
Python统计单词出现的次数
Apr 04 Python
Python HTML解析模块HTMLParser用法分析【爬虫工具】
Apr 05 Python
Python数据类型之List列表实例详解
May 08 Python
详解如何用python实现一个简单下载器的服务端和客户端
Oct 28 Python
pycharm激活码有效到2020年11月底
Sep 18 Python
后端开发使用pycharm的技巧(推荐)
Mar 27 Python
pyinstaller打包找不到文件的问题解决
Apr 15 Python
Python numpy矩阵处理运算工具用法汇总
Jul 13 Python
详解python的异常捕获
Mar 03 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
模拟OICQ的实现思路和核心程序(一)
2006/10/09 PHP
php 进度条实现代码
2009/03/10 PHP
php获取QQ头像并显示的方法
2014/12/23 PHP
PHP+Mysql无刷新问答评论系统(源码)
2016/12/20 PHP
PHP文件管理之实现网盘及压缩包的功能操作
2017/09/20 PHP
ejs v9 javascript模板系统
2012/03/21 Javascript
JavaScript 实现类的多种方法实例
2013/05/01 Javascript
判断在css加载完毕后执行后续代码示例
2014/09/03 Javascript
javasript实现密码的隐藏与显示
2015/05/08 Javascript
js下拉选择框与输入框联动实现添加选中值到输入框的方法
2015/08/17 Javascript
js+canvas简单绘制圆圈的方法
2016/01/28 Javascript
Angular2使用jQuery的方法教程
2017/05/28 jQuery
JS+H5 Canvas实现时钟效果
2018/07/20 Javascript
详解Vue 动态组件与全局事件绑定总结
2018/11/11 Javascript
Javascript读写cookie的实例源码
2019/03/16 Javascript
微信小程序实现语音识别转文字功能及遇到的坑
2019/08/02 Javascript
ES6学习笔记之字符串、数组、对象、函数新增知识点实例分析
2020/01/22 Javascript
使用go和python递归删除.ds store文件的方法
2014/01/22 Python
从零学python系列之浅谈pickle模块封装和拆封数据对象的方法
2014/05/23 Python
Python字符串切片操作知识详解
2016/03/28 Python
Python 含参构造函数实例详解
2017/05/25 Python
Python入门之三角函数全解【收藏】
2017/11/08 Python
Python读取excel中的图片完美解决方法
2018/07/27 Python
浅谈Python脚本开头及导包注释自动添加方法
2018/10/27 Python
利用Python实现原创工具的Logo与Help
2018/12/03 Python
Python使用crontab模块设置和清除定时任务操作详解
2019/04/09 Python
如何使用selenium和requests组合实现登录页面
2020/02/03 Python
web字体加载方案优化小结
2019/11/29 HTML / CSS
英语系本科生个人求职信
2013/09/21 职场文书
暑期政治学习心得体会
2014/09/02 职场文书
2015年父亲节寄语
2015/03/23 职场文书
2015年基层党支部工作总结
2015/05/21 职场文书
六一儿童节致辞
2015/07/31 职场文书
2016年教师学习廉政准则心得体会
2016/01/20 职场文书
Redis+Lua脚本实现计数器接口防刷功能(升级版)
2022/02/12 Redis
Ubuntu安装Mysql+启用远程连接的完整过程
2022/06/21 Servers