python实现多线程采集的2个代码例子


Posted in Python on July 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings (passed to MySQLdb.connect in the __main__ block)
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# Runtime settings
#
THREAD_LIMIT = 3  # number of spider threads started by workerbee()
jobs = Queue.Queue(5)  # bounded job queue (maxsize 5) feeding the spider threads
singlelock = threading.Lock()  # lock shared by workerbee() and the spider threads
info = Queue.Queue()  # unbounded queue collecting the [id, title] results
 
def workerbee(inputlist):
    """Start the spider threads, queue every job and wait until all are done.

    Args:
        inputlist: iterable of jobs; each item is put on the shared ``jobs``
            queue unchanged and later consumed by a ``spider`` thread.

    A job that cannot be queued within 5 seconds is reported as
    "The queue is full !" and skipped.
    """
    for x in xrange(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            # Only a put() timeout means "queue full"; any other error must
            # propagate instead of being misreported by a bare except.
            with singlelock:
                print("The queue is full !")

    # Take the lock so the message is not interleaved with worker output.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until task_done() has been called for every queued job.
    jobs.join()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    """Worker thread that turns ``(id, url)`` jobs into ``[id, title]`` results.

    Jobs are read from the shared ``jobs`` queue and results are written to
    the shared ``info`` queue.
    """

    def run(self):
        """Process jobs until the queue has been empty for one second."""
        while True:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived within the timeout: treat the queue as drained.
                break
            try:
                # The fetch runs without the lock so that the threads really
                # download in parallel; Queue objects do their own locking.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as e:
                # A failed job must not kill the worker, otherwise the
                # remaining jobs are never processed.
                with singlelock:
                    print('Failed to process job {0!r}: {1!r}'.format(job, e))
            finally:
                # Always mark the job done so that jobs.join() can return.
                jobs.task_done()
 
if __name__ == '__main__':
    # Script entry point: read up to 10 pending (id, url) rows from MySQL,
    # fetch their titles with the worker threads and print every result.
    urls = []
    con = None
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        # Each job is an [id, url] pair.
        urls.extend([record[0], record[1]] for record in rows)
        workerbee(urls)
        # Drain the collected [id, title] results.
        while not info.empty():
            print(info.get())
    finally:
        # Close the connection only if connect() succeeded.
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Settings
#
THREAD_LIMIT = 3        # number of worker threads
jobs = Queue.Queue(5)      # job queue, maximum length 5
singlelock = threading.Lock()    # shared thread lock (original note: "avoid duplicate calls")
 
# Article pages used as the crawl input for workerbee().
urls = [
  'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
  'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
 
def workerbee(inputlist):
  """Start the spider threads, queue every job and wait until all are done.

  Args:
    inputlist: iterable of jobs (URLs here); each item is put on the shared
      ``jobs`` queue unchanged and later consumed by a ``spider`` thread.

  A job that cannot be queued within 5 seconds is reported as
  "The queue is full !" and skipped.
  """
  for x in xrange(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except Queue.Full:
      # Only a put() timeout means "queue full"; any other error must
      # propagate instead of being misreported by a bare except.
      with singlelock:
        print("The queue is full !")

  # Take the lock so the message is not interleaved with worker output.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until task_done() has been called for every queued job.
  jobs.join()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  """Worker thread that fetches the title of every URL taken from ``jobs``."""

  def run(self):
    """Process jobs until the queue has been empty for one second."""
    while True:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived within the timeout: treat the queue as drained.
        break
      try:
        # The fetch runs without the lock so that the threads really
        # download in parallel; the lock only guards the print below.
        title = getTitle(job)
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as e:
        # A failed job must not kill the worker, otherwise the remaining
        # jobs are never processed.
        with singlelock:
          print('Failed to process job {0!r}: {1!r}'.format(job, e))
      finally:
        # Always mark the job done so that jobs.join() can return.
        jobs.task_done()
 
if __name__ == '__main__':
  # Script entry point: crawl the module-level ``urls`` list.
  workerbee(urls)
Python 相关文章推荐
使用python在校内发人人网状态(人人网看状态)
Feb 19 Python
python类和函数中使用静态变量的方法
May 09 Python
Python使用urllib2模块实现断点续传下载的方法
Jun 17 Python
利用numpy+matplotlib绘图的基本操作教程
May 03 Python
wxpython实现图书管理系统
Mar 12 Python
pandas.DataFrame删除/选取含有特定数值的行或列实例
Nov 07 Python
Python中输入和输出(打印)数据实例方法
Oct 13 Python
Python 实现取多维数组第n维的前几位
Nov 26 Python
Python对称的二叉树多种思路实现方法
Feb 28 Python
python求解汉诺塔游戏
Jul 09 Python
pycharm 使用anaconda为默认环境的操作
Feb 05 Python
python数据处理之Pandas类型转换
Apr 28 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
PHP-CGI进程CPU 100% 与 file_get_contents 函数的关系分析
2011/08/15 PHP
第七章 php自定义函数实现代码
2011/12/30 PHP
smarty模板引擎中内建函数if、elseif和else的使用方法
2015/01/22 PHP
ThinkPHP开发--使用七牛云储存
2017/09/14 PHP
PHP7 list() 函数修改
2021/03/09 PHP
5个javascript的数字格式化函数分享
2011/12/07 Javascript
js iframe跨域访问(同主域/非同主域)分别深入介绍
2013/01/24 Javascript
jQuery ReferenceError: $ is not defined 错误的处理办法
2013/05/10 Javascript
js获取select默认选中的Option并不是当前选中值
2014/05/07 Javascript
AngularJS 自定义过滤器详解及实例代码
2016/09/14 Javascript
基于JavaScript实现微信抢红包功能
2017/07/20 Javascript
浅谈React Native 中组件的生命周期
2017/09/08 Javascript
微信小程序视图template模板引用的实例详解
2017/09/20 Javascript
vuejs使用递归组件实现树形目录的方法
2017/09/30 Javascript
Angular弹出模态框的两种方式
2017/10/19 Javascript
分析javascript原型及原型链
2018/03/18 Javascript
r.js来合并压缩css文件的示例
2018/04/26 Javascript
node 使用 async 控制并发的方法
2018/05/07 Javascript
jQuery实现表格隔行换色
2018/09/01 jQuery
用vscode开发vue应用的方法步骤
2019/05/06 Javascript
vue移动端使用canvas签名的实现
2020/01/15 Javascript
Python中使用select模块实现非阻塞的IO
2015/02/03 Python
python内置函数:lambda、map、filter简单介绍
2017/11/16 Python
PyCharm代码回滚,恢复历史版本的解决方法
2018/10/22 Python
Python3多线程版TCP端口扫描器
2019/08/31 Python
python super的使用方法及实例详解
2019/09/25 Python
解决torch.autograd.backward中的参数问题
2020/01/07 Python
keras 特征图可视化实例(中间层)
2020/01/24 Python
python使用信号量动态更新配置文件的操作
2020/04/01 Python
python requests.get带header
2020/05/05 Python
python中sklearn的pipeline模块实例详解
2020/05/21 Python
Python+Selenium实现自动化的环境搭建的步骤(图文)
2020/09/01 Python
Pytorch实验常用代码段汇总
2020/11/19 Python
python实现MySQL指定表增量同步数据到clickhouse的脚本
2021/02/26 Python
影视广告专业求职信
2014/09/02 职场文书
Oracle笔记
2021/04/05 Oracle