python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Global lock shared by every thread: _worker holds it while it seeks/writes
# the shared output file and advances a block's offset, and _monitor holds it
# while it reads the block list and saves it to the resume file.
lock = threading.Lock()

# Default tuning parameters (also used as the command-line defaults):
#   thread_count - number of worker threads in the pool
#   buffer_size  - bytes requested per read() on a response (10 KiB)
#   block_size   - bytes per download block (1000 KiB)
defaults = dict(thread_count=10,
    buffer_size=10*1024,
    block_size=1000*1024)

def progress(percent, width=50):
    """Draw a one-line text progress bar on stdout.

    The bar is ``width`` characters wide, filled with ``=`` in proportion to
    ``percent`` and followed by the numeric percentage.  The line ends with a
    carriage return so the next call overwrites it; once ``percent`` reaches
    100 a newline is emitted to finish the line.

    The stream is flushed on every call.  The line ends in ``\\r`` rather than
    a newline, so a line-buffered terminal would otherwise show nothing until
    the download is complete.
    """
    filled = int(width * percent // 100)
    bar = ('=' * filled).ljust(width)
    sys.stdout.write('%s %d%%\r' % (bar, percent))
    if percent >= 100:
        sys.stdout.write('\n')
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle ``data`` and store it in ``filepath``, replacing any old content."""
    with open(filepath, 'wb') as handle:
        serialized = cPickle.dumps(data)
        handle.write(serialized)

def read_data(filepath):
    """Return the object pickled in ``filepath``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files this program wrote itself.
    """
    with open(filepath, 'rb') as handle:
        serialized = handle.read()
    return cPickle.loads(serialized)

# Metadata of the remote file: the requested url, the local file name to use,
# the size in bytes (0 when the server sent no Content-Length) and the raw
# Last-Modified header value ('' when absent).
FileInfo = namedtuple('FileInfo', 'url name size lastmodified')


def _filename_from_disposition(disposition):
    """Return the ``filename`` parameter of a Content-Disposition value.

    Handles quoted and unquoted values and ignores any parameters that follow.
    Returns '' when there is no plain ``filename=`` parameter (for example a
    bare ``inline`` or only the extended ``filename*=`` form).
    """
    import re  # local import so this block does not depend on file-level imports
    match = re.search(
        r'(?:^|[;\s])filename\s*=\s*("[^"]*"|\'[^\']*\'|[^;]*)',
        disposition, re.IGNORECASE)
    if match is None:
        return ''
    value = match.group(1).strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
        value = value[1:-1]
    return value


def get_file_info(url):
    """Issue a HEAD request for ``url`` and describe the remote file.

    Returns a ``FileInfo``.  The name comes from the Content-Disposition
    header when it carries a usable filename, otherwise from the last path
    segment of the url.  Errors from ``urllib2.urlopen`` propagate, and a
    non-numeric Content-Length raises ValueError.
    """
    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        res.read()
        # The header object does case-insensitive lookups itself.
        headers = res.headers
        size = int(headers.get('content-length', 0))
        lastmodified = headers.get('last-modified', '')
        disposition = headers.get('content-disposition', '')
    finally:
        res.close()

    name = ''
    if disposition:
        name = _filename_from_disposition(disposition)
        # The header is server-controlled: keep only the final path component
        # so the name cannot point outside the working directory.
        name = os.path.basename(name.replace('\\', '/'))
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def _split_blocks(size, block_size):
    """Split ``size`` bytes into ``[start, offset, end]`` blocks.

    ``start`` and ``end`` are the first and last byte of the block (``end`` is
    inclusive, like an HTTP Range), and ``offset`` is the next byte still to
    be fetched.  Bytes that do not fill a whole block are merged into the
    last block.
    """
    block_count = max(1, size // block_size)
    blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
              for i in range(block_count)]
    blocks[-1][2] = size - 1
    return blocks


def download(url, output,
        thread_count=defaults['thread_count'],
        buffer_size=defaults['buffer_size'],
        block_size=defaults['block_size']):
    """Download ``url`` to ``output`` with several threads, resuming if possible.

    Data is written to ``<output>.ing`` and progress is saved to
    ``<output>.inf``.  If both files exist and the saved file info still
    matches the server's, only the missing parts are fetched.  On success the
    work file is renamed to ``output`` and the resume file is removed.

    Parameters:
        url: address of the file to fetch.
        output: destination path; ``None`` means use the name reported by
            ``get_file_info``.
        thread_count: maximum number of worker threads.
        buffer_size: bytes read from a response per call.
        block_size: bytes per block; must be at least 1.

    Raises:
        ValueError: if ``block_size`` is below 1 or the server reports no size.
        IOError: if a block is still incomplete after all workers finished;
            the work and resume files are kept so the download can be resumed.
    """
    if block_size < 1:
        raise ValueError('block_size must be at least 1')
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        # Without a known size there is nothing to split into ranges.
        raise ValueError('server reported no file size for %s' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output

    # Every block is a list [start, offset, end] (end inclusive).  Each worker
    # thread downloads one block and advances its offset as data arrives.
    blocks = []
    if os.path.exists(infopath) and os.path.exists(workpath):
        saved_info, blocks = read_data(infopath)
        if (saved_info.url != url or
                saved_info.name != file_info.name or
                saved_info.size != file_info.size or
                saved_info.lastmodified != file_info.lastmodified):
            # The remote file changed: the partial data is useless.
            blocks = []
        else:
            # Older resume files may store end == size for a single block;
            # clamp so that end is always the last valid byte index.
            for block in blocks:
                block[2] = min(block[2], file_info.size - 1)
    if not blocks:
        blocks = _split_blocks(file_info.size, block_size)
        # create new blank workpath
        with open(workpath, 'wb'):
            pass

    print('Downloading %s' % url)
    # Daemon thread, so a failed download cannot keep the process alive;
    # _monitor only stops by itself once progress reaches 100%.
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()

    with open(workpath, 'rb+') as fobj:
        # A block is pending while its offset has not passed its last byte.
        tasks = [(url, block, fobj, buffer_size)
                 for block in blocks if block[1] <= block[2]]
        if tasks:
            pool = ThreadPool(max(1, min(thread_count, len(tasks))))
            try:
                pool.map(_worker, tasks)
            finally:
                # Wait for every worker before the file is closed, even when
                # one of them failed.
                pool.close()
                pool.join()

    # Verify before touching the result, so a short download is never
    # presented as complete and can still be resumed.
    if not all(block[1] > block[2] for block in blocks):
        raise IOError('download incomplete: %s (partial data kept in %s)'
                      % (url, workpath))

    # All blocks are complete, so the monitor reports 100% and exits; waiting
    # for it guarantees it cannot rewrite the resume file after removal.
    monitor.join()

    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(task):
    """Fetch the missing part of one block and write it into the shared file.

    ``task`` is the tuple ``(url, block, fobj, buffer_size)``: ``block`` is a
    mutable ``[start, offset, end]`` list whose offset is advanced as bytes
    are written, ``fobj`` is the output file shared by all workers, and
    ``buffer_size`` is the number of bytes to read per call.

    Raises IOError if the server does not answer with a partial response
    (status 206) for a block that does not start at byte 0, because the body
    would then not begin at the requested offset.
    """
    url, block, fobj, buffer_size = task
    req = urllib2.Request(url)
    # HTTP byte ranges include both the first and the last position.
    req.add_header('Range', 'bytes=%s-%s' % (block[1], block[2]))
    res = urllib2.urlopen(req)
    try:
        if res.getcode() != 206 and block[1] > 0:
            raise IOError('Range request not honoured for %s (status %s)'
                          % (url, res.getcode()))
        while block[1] <= block[2]:
            # Never take more than the block still needs, so a server that
            # sends extra data cannot overwrite a neighbouring block.
            remaining = block[2] - block[1] + 1
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # The file object is shared, so seek+write must be atomic.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and save resume state until the download hits 100%.

    Every two seconds, while holding the global lock, the downloaded byte
    count is summed over ``blocks`` (offset minus start), shown through
    ``progress`` and, while still below 100%, the pair ``(file_info, blocks)``
    is written to ``infopath``.  The loop ends right after 100% is displayed.
    """
    finished = False
    while not finished:
        with lock:
            downloaded = sum(block[1] - block[0] for block in blocks)
            percent = downloaded * 100 // file_info.size
            progress(percent)
            finished = percent >= 100
            if not finished:
                write_data(infopath, (file_info, blocks))
        if not finished:
            time.sleep(2)

if __name__ == '__main__':
    # Command-line entry point: parse the options, run the download and
    # print how long it took.
    import argparse

    parser = argparse.ArgumentParser(description='Download file by multi-threads.')
    parser.add_argument('url', type=str, help='url of the download file')
    parser.add_argument('-o', type=str, default=None, dest="output", help='output file')
    # The three numeric options share one shape: flag, destination, help text.
    for flag, dest, text in (
            ('-t', 'thread_count', 'thread counts to downloading'),
            ('-b', 'buffer_size', 'buffer size'),
            ('-s', 'block_size', 'block size')):
        parser.add_argument(flag, type=int, default=defaults[dest], dest=dest, help=text)

    # With no arguments at all, fall back to a sample url.
    argv = sys.argv[1:] or ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    args = parser.parse_args(argv)

    start_time = time.time()
    download(args.url, args.output, args.thread_count, args.buffer_size, args.block_size)
    elapsed = int(time.time() - start_time)
    print('times: %ds' % elapsed)
Python 相关文章推荐
linux系统使用python监测网络接口获取网络的输入输出
Jan 15 Python
Python中的默认参数详解
Jun 24 Python
python读取LMDB中图像的方法
Jul 02 Python
对python中的乘法dot和对应分量相乘multiply详解
Nov 14 Python
Python实现的对本地host127.0.0.1主机进行扫描端口功能示例
Feb 15 Python
python3射线法判断点是否在多边形内
Jun 28 Python
Python实现 PS 图像调整中的亮度调整
Jun 28 Python
win10环境下配置vscode python开发环境的教程详解
Oct 16 Python
Python数据可视化:泊松分布详解
Dec 07 Python
python GUI库图形界面开发之PyQt5状态栏控件QStatusBar详细使用方法实例
Feb 28 Python
详解python 内存优化
Aug 17 Python
如何在python中实现ECDSA你知道吗
Nov 23 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
在PHP中养成7个面向对象的好习惯
2010/01/28 PHP
递归删除一个节点以及该节点下的所有节点示例
2014/03/19 PHP
php简单获取目录列表的方法
2015/03/24 PHP
WordPress自定义时间显示格式
2015/03/27 PHP
PHP的swoole扩展安装方法详细教程
2016/05/18 PHP
Thinkphp整合阿里云OSS图片上传实例代码
2019/04/28 PHP
在线编辑器中换行与内容自动提取
2009/04/24 Javascript
firefox插件Firebug的使用教程
2010/01/02 Javascript
JavaScript 错误处理与调试经验总结
2010/08/10 Javascript
jquery中获取元素的几种方式小结
2011/07/05 Javascript
javascript制作幻灯片(360度全景图片)
2015/07/28 Javascript
相册展示PhotoSwipe.js插件实现
2016/08/25 Javascript
vue的基本用法与常见指令
2017/08/15 Javascript
基于bootstrap写的一点localStorage本地储存
2017/11/21 Javascript
JavaScript进阶(一)变量声明提升实例分析
2020/05/09 Javascript
[01:03]DOTA2新的征程 你的脚印值得踏上
2014/08/13 DOTA
python定时器(Timer)用法简单实例
2015/06/04 Python
理解python正则表达式
2016/01/15 Python
Python3常用内置方法代码实例
2019/11/18 Python
Python中使用gflags实例及原理解析
2019/12/13 Python
Python如何省略括号方法详解
2020/03/21 Python
Python自动化测试中yaml文件读取操作
2020/08/20 Python
Python中Pyspider爬虫框架的基本使用详解
2021/01/27 Python
详解CSS3媒体查询响应式布局bootstrap 框架原理实战(推荐)
2020/11/16 HTML / CSS
新西兰领先的鞋类和靴子网上商城:Merchant 1948
2017/09/08 全球购物
巴基斯坦电子产品购物网站:Home Shopping
2017/09/14 全球购物
EJB面试题
2015/07/28 面试题
计算机大学生职业生涯规划书范文
2014/02/19 职场文书
环境工程专业自荐信范文
2014/03/18 职场文书
学习党代会心得体会
2014/09/05 职场文书
考试作弊检讨书怎么写?
2014/12/21 职场文书
cf战队宣传语
2015/07/13 职场文书
小学毕业感言100字
2015/07/30 职场文书
Django展示可视化图表的多种方式
2021/04/08 Python
使用qt quick-ListView仿微信好友列表和聊天列表的示例代码
2021/06/13 Python
Redis基本数据类型哈希Hash常用操作命令
2022/06/01 Redis