python实现多线程采集的2个代码例子


Posted in Python on July 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings, passed to MySQLdb.connect() by the script
# entry point. The user/password/name values shown look like placeholders
# to be replaced before running.
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# Crawler settings shared by the producer (workerbee) and the worker threads
#
# Number of spider threads started by workerbee().
THREAD_LIMIT = 3
# Pending jobs; bounded to 5 items, so put() blocks while it is full.
jobs = Queue.Queue(5)
# Lock taken around console output (and by the workers while they run a job).
singlelock = threading.Lock()
# Unbounded queue of [id, title] results produced by the workers.
info = Queue.Queue()
 
def workerbee(inputlist):
    """Start the worker threads, enqueue every job and wait for completion.

    Args:
        inputlist: iterable of job items. Each item is put on the shared
            ``jobs`` queue unchanged for a ``spider`` thread to pick up.

    A job that cannot be enqueued within 5 seconds because the queue is
    still full is reported and skipped, so a stalled pool cannot block the
    producer forever. Any other error raised while enqueueing propagates.
    """
    for x in xrange(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            # Only a genuinely full queue is reported here; a bare except
            # would mislabel unrelated failures (and KeyboardInterrupt).
            with singlelock:
                print("The queue is full !")

    # Take the lock so the message is not interleaved with worker output.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until task_done() has been called for every enqueued job.
    jobs.join()
 
def getTitle(url,time=10):
    """Download *url* and return the text of its first ``<title>`` element.

    Args:
        url: address of the page to fetch.
        time: socket timeout in seconds, passed to ``urllib2.urlopen``.

    Returns:
        The title as the raw byte string served by the page (no charset
        conversion is applied), with surrounding whitespace removed, or an
        empty string when the page contains no ``<title>`` element.

    Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the caller.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        html = response.read()
    finally:
        # Close the connection even when reading the body fails.
        response.close()
    # Tag names are case-insensitive and a title may span several lines.
    match = re.search(br'<title>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    if match is None:
        return b''
    return match.group(1).strip()
 
class spider(threading.Thread):
    """Worker thread that resolves queued ``[id, url]`` jobs to page titles.

    Each job is taken from the shared ``jobs`` queue, its URL (``job[1]``)
    is fetched with ``getTitle`` and ``[job[0], title]`` is put on ``info``.
    """

    def run(self):
        """Process jobs until the queue has been empty for one second."""
        while True:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived within the timeout: treat it as end of work.
                break
            try:
                # The fetch runs without holding singlelock so the workers
                # download in parallel; Queue objects do their own locking.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as exc:
                # A failed job must not kill the worker or leave a lock held.
                with singlelock:
                    print('Failed to process job {0!r}: {1!r}'.format(job, exc))
            finally:
                # Always acknowledge the job, otherwise jobs.join() never returns.
                jobs.task_done()
 
if __name__ == '__main__':
    # Entry point: read up to ten (id, url) rows whose status is 0, hand them
    # to the worker pool, then print every collected [id, title] result.
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        urls.extend([record[0], record[1]] for record in rows)
        workerbee(urls)
        # Drain the result queue once all jobs have been processed.
        while True:
            if info.empty():
                break
            print(info.get())
    finally:
        # Close the connection only if it was actually opened.
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Crawler settings
#
THREAD_LIMIT = 3        # number of worker threads to start
jobs = Queue.Queue(5)      # job queue, bounded to 5 pending items
singlelock = threading.Lock()    # lock shared by all threads to avoid overlapping calls/output
 
# Article pages handed to workerbee() by the script entry point, one per line.
urls = [
  'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
  'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
 
def workerbee(inputlist):
  """Start the worker threads, enqueue every job and wait for completion.

  Args:
    inputlist: iterable of job items. Each item is put on the shared
      ``jobs`` queue unchanged for a ``spider`` thread to pick up.

  A job that cannot be enqueued within 5 seconds because the queue is
  still full is reported and skipped, so a stalled pool cannot block the
  producer forever. Any other error raised while enqueueing propagates.
  """
  for x in xrange(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except Queue.Full:
      # Only a genuinely full queue is reported here; a bare except
      # would mislabel unrelated failures (and KeyboardInterrupt).
      with singlelock:
        print("The queue is full !")

  # Take the lock so the message is not interleaved with worker output.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until task_done() has been called for every enqueued job.
  jobs.join()
 
def getTitle(url,time=10):
  """Download *url* and return its first ``<title>`` text as UTF-8 bytes.

  Args:
    url: address of the page to fetch.
    time: socket timeout in seconds, passed to ``urllib2.urlopen``.

  Returns:
    The title decoded as GB2312 (undecodable bytes are replaced), stripped
    of surrounding whitespace and re-encoded as UTF-8, or an empty string
    when the page contains no ``<title>`` element.

  Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the caller.
  """
  response = urllib2.urlopen(url, timeout=time)
  try:
    html = response.read()
  finally:
    # Close the connection even when reading the body fails.
    response.close()
  # Tag names are case-insensitive and a title may span several lines.
  match = re.search(br'<title>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
  if match is None:
    return b''
  # Strip after decoding so only real whitespace characters are removed.
  return match.group(1).decode('gb2312', 'replace').strip().encode('utf-8')
 
class spider(threading.Thread):
  """Worker thread that fetches the title of each URL taken from ``jobs``."""

  def run(self):
    """Process jobs until the queue has been empty for one second."""
    while True:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived within the timeout: treat it as end of work.
        break
      try:
        # The fetch runs without holding singlelock so the workers download
        # in parallel; the lock only guards the console output.
        title = getTitle(job)
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as exc:
        # A failed job must not kill the worker or leave the lock held.
        with singlelock:
          print('Failed to process job {0!r}: {1!r}'.format(job, exc))
      finally:
        # Always acknowledge the job, otherwise jobs.join() never returns.
        jobs.task_done()
 
if __name__ == '__main__':
  # Script entry point: crawl the module-level `urls` list with the worker pool.
  workerbee(urls)
Python 相关文章推荐
python计算最小优先级队列代码分享
Dec 18 Python
对Python新手编程过程中如何规避一些常见问题的建议
Apr 01 Python
探索Python3.4中新引入的asyncio模块
Apr 08 Python
在Python中处理XML的教程
Apr 29 Python
python http接口自动化脚本详解
Jan 02 Python
tensorflow实现softmax识别MNIST
Mar 12 Python
python人民币小写转大写辅助工具
Jun 20 Python
Python实现查找数组中任意第k大的数字算法示例
Jan 23 Python
python自动保存百度盘资源到百度盘中的实例代码
Aug 26 Python
使用Python给头像加上圣诞帽或圣诞老人小图标附源码
Dec 25 Python
python3光学字符识别模块tesserocr与pytesseract的使用详解
Feb 26 Python
PyCharm 在Windows的有用快捷键详解
Apr 07 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
PHP的SQL注入过程分析
2012/01/06 PHP
解析PHP高效率写法(详解原因)
2013/06/20 PHP
php基于curl扩展制作跨平台的restful 接口
2015/05/11 PHP
Laravel中前端js上传图片到七牛云的示例代码
2017/09/04 PHP
推荐6款基于jQuery实现图片效果插件
2014/12/07 Javascript
轻松创建nodejs服务器(10):处理上传图片
2014/12/18 NodeJs
js鼠标悬浮出现遮罩层的方法
2015/01/28 Javascript
Js实现自定义右键行为
2015/03/26 Javascript
在JavaScript中操作时间之setYear()方法的使用
2015/06/12 Javascript
jQuery实现自动与手动切换的滚动新闻特效代码分享
2015/08/27 Javascript
jQuery AJAX timeout 超时问题详解
2016/06/21 Javascript
AngularJS基础 ng-mouseenter 指令示例代码
2016/08/02 Javascript
JavaScript实现DOM对象选择器
2016/09/24 Javascript
jQuery实现动态生成表格并为行绑定单击变色动作的方法
2017/04/17 jQuery
JavaScript基础之this详解
2017/06/04 Javascript
tween.js缓动补间动画算法示例
2018/02/13 Javascript
nodejs对mongodb数据库的增删改查实例代码
2020/01/05 NodeJs
Vue通过getAction的finally来最大程度避免影响主数据呈现问题
2020/04/24 Javascript
微信小程序上传帖子的实例代码(含有文字图片的微信验证)
2020/07/11 Javascript
vue实现前端列表多条件筛选
2020/10/26 Javascript
简单介绍Ruby中的CGI编程
2015/04/10 Python
实例讲解Python中函数的调用与定义
2016/03/14 Python
Python if语句知识点用法总结
2018/06/10 Python
Python实现括号匹配方法详解
2020/02/10 Python
关于HTML5你必须知道的28个新特性,新技巧以及新技术
2012/05/28 HTML / CSS
html5移动端价格输入键盘的实现
2019/09/16 HTML / CSS
世界领先的在线地板和建筑材料批发商:BuildDirect
2017/02/26 全球购物
德国综合购物网站:OTTO
2018/11/13 全球购物
创意婚礼策划方案
2014/05/18 职场文书
乡镇党建工作汇报材料
2014/08/14 职场文书
代领学位证书毕业证书委托书
2014/09/30 职场文书
2014年幼儿园学期工作总结
2014/12/05 职场文书
高三英语教学计划
2015/01/23 职场文书
倡议书的格式写法
2015/04/28 职场文书
CSS使用伪类控制边框长度的方法
2022/01/18 HTML / CSS
Javascript中async与await的捕捉错误详解
2022/03/03 Javascript