python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
Python Queue模块详解
Nov 30 Python
Python命令行参数解析模块getopt使用实例
Apr 13 Python
详解Python中heapq模块的用法
Jun 28 Python
python获取list下标及其值的简单方法
Sep 12 Python
Django的信号机制详解
May 05 Python
python负载均衡的简单实现方法
Feb 04 Python
Python实现的连接mssql数据库操作示例
Aug 17 Python
python使用matplotlib模块绘制多条折线图、散点图
Apr 26 Python
centos7之Python3.74安装教程
Aug 15 Python
基于keras 模型、结构、权重保存的实现
Jan 24 Python
python求最大公约数和最小公倍数的简单方法
Feb 13 Python
使用Python发现隐藏的wifi
Mar 04 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
PHP中“简单工厂模式”实例代码讲解
2012/09/04 PHP
ThinkPHP中__initialize()和类的构造函数__construct()用法分析
2014/11/29 PHP
php中动态变量用法实例
2015/06/10 PHP
thinkphp实现163、QQ邮箱收发邮件的方法
2015/12/18 PHP
php 后端实现JWT认证方法示例
2018/09/04 PHP
用JTrackBar实现的模拟苹果风格的滚动条
2007/08/06 Javascript
Javascript Tab 导航插件 (23个)
2009/06/11 Javascript
Jquery中Ajax 缓存带来的影响的解决方法
2011/05/19 Javascript
javascript-简单的日历实现及Date对象语法介绍(附图)
2013/05/30 Javascript
jquery专业的导航菜单特效代码分享
2015/08/29 Javascript
关于Javascript中defer和async的区别总结
2016/09/20 Javascript
jquery实现百叶窗效果
2017/01/12 Javascript
微信小程序 详解页面跳转与返回并回传数据
2017/02/13 Javascript
关于vue-router路径计算问题
2017/05/10 Javascript
node express使用HTML模板的方法示例
2019/08/22 Javascript
TypeScript之调用栈的实现
2019/12/31 Javascript
JavaScript canvas基于数组生成柱状图代码实例
2020/03/06 Javascript
[01:14:34]DOTA2上海特级锦标赛C组资格赛#2 LGD VS Newbee第一局
2016/02/28 DOTA
[00:09]DOTA2全国高校联赛 精彩活动引爆全场
2018/05/30 DOTA
python实现监控windows服务并自动启动服务示例
2014/04/17 Python
python妹子图简单爬虫实例
2015/07/07 Python
python版简单工厂模式
2017/10/16 Python
利用python 更新ssh 远程代码 操作远程服务器的实现代码
2018/02/08 Python
用python实现将数组元素按从小到大的顺序排列方法
2018/07/02 Python
对python同一个文件夹里面不同.py文件的交叉引用方法详解
2018/12/15 Python
基于python3抓取pinpoint应用信息入库
2020/01/08 Python
Python importlib动态导入模块实现代码
2020/04/16 Python
Clarks鞋法国官方网站:英国其乐鞋品牌
2018/02/11 全球购物
餐饮业的创业计划书范文
2013/12/26 职场文书
单位领导证婚词
2014/01/14 职场文书
小学生中国梦演讲稿
2014/04/23 职场文书
大学生就业自荐书
2014/06/16 职场文书
自我介绍演讲稿范文
2014/08/21 职场文书
六年级语文教学反思
2016/03/03 职场文书
星际争霸:毕姥爷vs解冻03
2022/04/01 星际争霸
Java存储没有重复元素的数组
2022/04/29 Java/Android