python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
连接Python程序与MySQL的教程
Apr 29 Python
在Mac OS上部署Nginx和FastCGI以及Flask框架的教程
May 02 Python
Django中URLconf和include()的协同工作方法
Jul 20 Python
Python连接mysql数据库的正确姿势
Feb 03 Python
Python实现并行抓取整站40万条房价数据(可更换抓取城市)
Dec 14 Python
python中的闭包函数
Feb 09 Python
python在回调函数中获取返回值的方法
Feb 22 Python
Python Django 封装分页成通用的模块详解
Aug 21 Python
python实现翻译word表格小程序
Feb 27 Python
详解Windows下PyCharm安装Numpy包及无法安装问题解决方案
Jun 18 Python
Python __slots__的使用方法
Nov 15 Python
OpenCV实现常见的四种图像几何变换
Apr 01 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
在Mac OS上编译安装Nginx+PHP+MariaDB开发环境的教程
2016/02/23 PHP
PHP 年月日的三级联动实例代码
2017/05/24 PHP
让 JavaScript 轻松支持函数重载 (Part 2 - 实现)
2009/08/04 Javascript
JS+XML 省份和城市之间的联动实现代码
2009/10/14 Javascript
JQuery 拾色器插件发布-jquery.icolor.js
2010/10/20 Javascript
jQuery学习笔记之Helloworld
2010/12/22 Javascript
jQuery遍历Form示例代码
2013/09/03 Javascript
jquery scroll()区分横向纵向滚动条的方法
2014/04/04 Javascript
jQuery内置的AJAX功能和JSON的使用实例
2014/07/27 Javascript
14款经典网页图片和文字特效的jQuery插件-前端开发必备
2015/08/25 Javascript
jQuery Validate初步体验(一)
2015/12/12 Javascript
javascript使用闭包模拟对象的私有属性和方法
2016/10/05 Javascript
使用jquery.qrcode.js生成二维码插件
2016/10/17 Javascript
前端实现文件的断点续传(前端文件提交+后端PHP文件接收)
2016/11/04 Javascript
jQuery插件HighCharts绘制2D带Label的折线图效果示例【附demo源码下载】
2017/03/08 Javascript
JavaScript方法_动力节点Java学院整理
2017/06/28 Javascript
关于laydate.js加载laydate.css路径错误问题解决
2017/12/27 Javascript
JavaScript 中的 this 工作原理
2018/06/20 Javascript
微信小程序bindinput与bindsubmit的区别实例分析
2019/04/17 Javascript
vue项目前端错误收集之sentry教程详解
2019/05/27 Javascript
Smartour 让网页导览变得更简单(推荐)
2019/07/19 Javascript
[02:29]大剑、皮鞭、女装,这届DOTA2勇士令状里都有
2020/07/17 DOTA
python提取字典key列表的方法
2015/07/11 Python
python九九乘法表的实例
2017/09/26 Python
Python实现返回数组中第i小元素的方法示例
2017/12/04 Python
python实现简单的单变量线性回归方法
2018/11/08 Python
支持IE8的纯css3开发的响应式设计动画菜单教程
2014/11/05 HTML / CSS
中国医药集团国药在线:国药网
2017/02/06 全球购物
乌克兰机票、铁路和巴士票、酒店搜索、保险:Tickets.ua
2020/01/11 全球购物
RealTek面试题
2016/06/28 面试题
中层领导干部群众路线对照检查材料思想汇报
2014/10/02 职场文书
初中作文评语集锦
2014/12/25 职场文书
2016年国庆节67周年活动总结
2016/04/01 职场文书
自荐信范文
2019/05/20 职场文书
MySQL事务的隔离级别详情
2022/07/15 MySQL
Shell中的单中括号和双中括号的用法详解
2022/12/24 Servers