Two Code Examples of Multi-threaded Scraping in Python


Posted in Python on July 07, 2014

Code example 1 reads the id and url of up to 10 unprocessed rows from a MySQL table, fetches each page title with a small pool of worker threads, and collects the [id, title] pairs in a result queue:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import threading
import Queue
import sys
import urllib2
import re
import MySQLdb

#
# Database connection settings
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"

#
# General settings
#
THREAD_LIMIT = 3                 # number of worker threads
jobs = Queue.Queue(5)            # job queue, at most 5 pending URLs
singlelock = threading.Lock()    # lock used to keep print output from interleaving
info = Queue.Queue()             # result queue holding [id, title] pairs
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thread {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            singlelock.acquire()
            print "The queue is full!"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while True:
            try:
                # Exit the thread once the queue stays empty for 1 second.
                job = jobs.get(True, 1)
            except Queue.Empty:
                break
            try:
                # Fetch outside the lock so downloads actually run in parallel;
                # the info Queue is already thread-safe.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1], title)
            except Exception, e:
                singlelock.acquire()
                print 'Failed to fetch {0}: {1}'.format(job[1], e)
                singlelock.release()
            finally:
                # Always mark the job done so jobs.join() can return.
                jobs.task_done()
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()
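
The main block above only prints whatever ends up in the info queue. If you also want to store the titles and flag the rows as processed, a minimal follow-up sketch could look like the snippet below; it assumes `table_name` also has a `title` column and that `status`=1 means "done", neither of which is spelled out in the original query:

# Hypothetical follow-up, run inside the same try block after workerbee(urls);
# assumes a `title` column exists on `table_name` -- adjust to your schema.
while not info.empty():
    row_id, title = info.get()
    cur.execute('UPDATE `table_name` SET `title`=%s, `status`=1 WHERE `id`=%s',
                (title, row_id))
con.commit()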

Code example 2 uses the same worker/queue pattern, but takes its URLs from a hard-coded list and prints each title (decoded from gb2312) instead of storing it:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: robot.py

import threading
import Queue
import sys
import urllib2
import re
#
# Settings
#
THREAD_LIMIT = 3                 # number of worker threads
jobs = Queue.Queue(5)            # job queue, at most 5 pending URLs
singlelock = threading.Lock()    # lock used to keep print output from interleaving
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thread {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except Queue.Full:
      singlelock.acquire()
      print "The queue is full!"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while True:
      try:
        # Exit the thread once the queue stays empty for 1 second.
        job = jobs.get(True, 1)
      except Queue.Empty:
        break
      try:
        title = getTitle(job)
        # Hold the lock only while printing so output lines don't interleave.
        singlelock.acquire()
        print 'This {0} is {1}'.format(job, title)
        singlelock.release()
      except Exception, e:
        singlelock.acquire()
        print 'Failed to fetch {0}: {1}'.format(job, e)
        singlelock.release()
      finally:
        # Always mark the job done so jobs.join() can return.
        jobs.task_done()
 
if __name__ == '__main__':
  workerbee(urls)
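
Both examples are Python 2 code (Queue, urllib2, print statements). As a rough, standard-library-only sketch of the same "fetch titles in parallel" idea in Python 3, a thread pool from concurrent.futures keeps the bookkeeping much shorter; the URLs and the title regex below are placeholders, not taken from the article:

#!/usr/bin/env python3
# Minimal Python 3 sketch of fetching page titles with a thread pool.
# The URL list and regex are illustrative placeholders.
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlopen

URLS = ['http://example.com/', 'http://example.org/']

def get_title(url, timeout=10):
    html = urlopen(url, timeout=timeout).read().decode('utf-8', 'replace')
    match = re.search(r'<title>(.*?)</title>', html, re.S)
    return match.group(1).strip() if match else ''

with ThreadPoolExecutor(max_workers=3) as pool:
    futures = {pool.submit(get_title, url): url for url in URLS}
    for future in as_completed(futures):
        url = futures[future]
        try:
            print('{0} -> {1}'.format(url, future.result()))
        except Exception as exc:
            print('Failed to fetch {0}: {1}'.format(url, exc))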