python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
Python开发实例分享bt种子爬虫程序和种子解析
May 21 Python
Python单例模式的两种实现方法
Aug 14 Python
Python3使用PyQt5制作简单的画板/手写板实例
Oct 19 Python
python交互式图形编程实例(一)
Nov 17 Python
django中的HTML控件及参数传递方法
Mar 20 Python
Python实现多线程的两种方式分析
Aug 29 Python
Python输出\u编码将其转换成中文的实例
Dec 15 Python
python对数组进行排序,并输出排序后对应的索引值方式
Feb 28 Python
基于python实现把json数据转换成Excel表格
May 07 Python
Python3自定义json逐层解析器代码
May 11 Python
Python pexpect模块及shell脚本except原理解析
Aug 03 Python
Django contrib auth authenticate函数源码解析
Nov 12 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
php在数组中查找指定值的方法
2015/03/17 PHP
PHP实现恶意DDOS攻击避免带宽占用问题方法
2015/05/27 PHP
关于PHP中字符串与多进制转换函数的实例代码
2016/11/03 PHP
Yii2.0框架模型添加/修改/删除数据操作示例
2019/07/18 PHP
PHP实现单条sql执行多个数据的insert语句方法
2019/10/11 PHP
js常用排序实现代码
2010/12/28 Javascript
Jquery 模板数据绑定插件的使用方法详解
2013/07/08 Javascript
nodejs实现黑名单中间件设计
2014/06/17 NodeJs
jQuery 移动端artEditor富文本编辑器
2016/01/11 Javascript
AngularJS服务service用法总结
2016/12/13 Javascript
js继承实现方法详解
2016/12/16 Javascript
bootstrap折叠调用collapse()后data-parent不生效的快速解决办法
2017/02/23 Javascript
使用use注册Vue全局组件和全局指令的方法
2018/03/08 Javascript
js实现无缝轮播图
2020/03/09 Javascript
原生JS实现汇率转换功能代码实例
2020/05/13 Javascript
python django框架中使用FastDFS分布式文件系统的安装方法
2019/06/10 Python
用Python去除图像的黑色或白色背景实例
2019/12/12 Python
Jupyter 无法下载文件夹如何实现曲线救国
2020/04/22 Python
浅谈TensorFlow中读取图像数据的三种方式
2020/06/30 Python
Jmeter调用Python脚本实现参数互相传递的实现
2021/01/22 Python
斯凯奇新西兰官网:SKECHERS新西兰
2018/02/22 全球购物
凯撒娱乐:Caesars Entertainment
2018/02/23 全球购物
IRO美国官网:法国服装品牌
2018/03/06 全球购物
POS解决方案:MUNBYN(热敏打印机、条形码扫描仪)
2020/06/09 全球购物
最新远光软件笔试题面试题内容
2013/11/08 面试题
既然说Ruby中一切都是对象,那么Ruby中类也是对象吗
2013/01/26 面试题
消防安全员岗位职责
2014/03/10 职场文书
《九寨沟》教学反思
2014/04/08 职场文书
个人反四风对照检查材料思想汇报
2014/09/23 职场文书
学校食堂食品安全承诺书
2015/04/29 职场文书
2015年出纳年终工作总结
2015/05/14 职场文书
诗词赏析-(浣溪沙)
2019/08/13 职场文书
详解Django中 render() 函数的使用方法
2021/04/22 Python
Golang 语言控制并发 Goroutine的方法
2021/06/30 Golang
Android Studio实现带三角函数对数运算功能的高级计算器
2022/05/20 Java/Android
LeetCode189轮转数组python示例
2022/08/05 Python