python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Global lock shared by the download workers and the progress monitor.
# Workers hold it while seeking/writing the shared output file object and
# advancing a block's offset; the monitor holds it while reading the offsets.
lock = threading.Lock()

# Default parameters, used both as the defaults of download() and as the
# defaults of the command-line options:
#   thread_count - size of the worker thread pool
#   buffer_size  - number of bytes requested per read of an HTTP response
#   block_size   - number of bytes covered by each ranged block of the file
defaults = dict(thread_count=10,
    buffer_size=10*1024,
    block_size=1000*1024)

def progress(percent, width=50):
    """Draw a one-line text progress bar on stdout.

    The bar is ``width`` characters wide, left-filled with ``=`` in
    proportion to ``percent``, followed by the numeric percentage and a
    carriage return so the next call overwrites the same line.  Once
    ``percent`` reaches 100 a newline is emitted to finish the line.

    Args:
        percent: integer completion percentage.
        width: width of the bar in characters.
    """
    filled = width * percent // 100
    bar = ('%%-%ds' % width) % (filled * '=')
    sys.stdout.write("%s %d%%\r" % (bar, percent))
    if percent >= 100:
        sys.stdout.write("\n")
    # Flush on every update: the line ends with '\r' rather than '\n', so a
    # buffered stdout would otherwise show nothing until the very end.
    sys.stdout.flush()

def write_data(filepath, data):
    """Serialize ``data`` with cPickle into the file at ``filepath``.

    The file is opened in binary write mode, so any previous content is
    replaced.
    """
    with open(filepath, 'wb') as sink:
        sink.write(cPickle.dumps(data))

def read_data(filepath):
    """Return the object unpickled from the file at ``filepath``."""
    with open(filepath, 'rb') as source:
        restored = cPickle.load(source)
    return restored

# Immutable description of a remote file: its URL, suggested local name,
# size in bytes, and the server's Last-Modified value.
FileInfo = namedtuple('FileInfo', ['url', 'name', 'size', 'lastmodified'])

def get_file_info(url):
    """Issue a HEAD request for ``url`` and describe the remote file.

    Args:
        url: address of the file to inspect.

    Returns:
        A ``FileInfo`` with the url, a local file name, the size taken from
        Content-Length (0 when the header is absent) and the Last-Modified
        header value ('' when absent).  The name comes from the
        Content-Disposition ``filename=`` parameter when there is one,
        otherwise from the last component of the URL path, and finally
        falls back to 'download'.

    Raises:
        urllib2.URLError: if the request fails.
    """
    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        res.read()
        headers = dict(res.headers)
    finally:
        # Release the connection even if reading the response fails.
        res.close()

    size = int(headers.get('content-length', 0))
    lastmodified = headers.get('last-modified', '')

    name = ''
    disposition = headers.get('content-disposition', '')
    if 'filename=' in disposition:
        value = disposition.split('filename=', 1)[1].strip()
        if value[:1] in ('"', "'"):
            # Quoted value: take everything up to the matching quote.
            closing = value.find(value[0], 1)
            name = value[1:closing] if closing != -1 else value[1:]
        else:
            # Bare token: stop at the next header parameter.
            name = value.split(';', 1)[0].strip()
    # The header is server-controlled; never let it choose a directory.
    name = os.path.basename(name)
    if name in ('', '.', '..'):
        name = os.path.basename(urlsplit(url)[2])
    if name in ('', '.', '..'):
        # e.g. the URL path ends with '/': still return a usable file name.
        name = 'download'
    return FileInfo(url, name, size, lastmodified)

def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    """Download ``url`` to ``output`` using a pool of threads, with resume.

    Data is written to ``<output>.ing`` while progress is periodically
    saved to ``<output>.inf``.  If both files exist from an earlier run and
    the remote file is unchanged, the download continues where it stopped.
    When every block is complete the work file is renamed to ``output``
    (replacing an existing file) and the progress file is removed.

    Args:
        url: address of the file to download.
        output: destination path; ``None`` uses the name reported by
            ``get_file_info``.
        thread_count: maximum number of worker threads.
        buffer_size: number of bytes requested per read of a response.
        block_size: number of bytes covered by each ranged block.

    Raises:
        ValueError: if the server reports no (or a zero) content length,
            since the file cannot then be split into ranged blocks.
        IOError: if the transfer ends with blocks still incomplete; the
            work and progress files are kept so a later call can resume.
    """
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        raise ValueError('cannot download %s: server reported no content length' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output
    # Split the file into blocks.  Every block is a list [start, offset, end]
    # where end is the index of the block's last byte (inclusive, like an
    # HTTP Range).  Each worker thread downloads one block and advances its
    # offset as data is written.
    blocks = []
    # Resuming needs both the saved progress and the partial data.
    if os.path.exists(infopath) and os.path.exists(workpath):
        # load blocks
        _x, blocks = read_data(infopath)
        if (_x.url != url or
                _x.name != file_info.name or
                _x.size != file_info.size or
                _x.lastmodified != file_info.lastmodified):
            blocks = []
        for block in blocks:
            # Progress files written by earlier versions stored end == size
            # for a single-block download; normalise to an inclusive end.
            block[2] = min(block[2], file_info.size - 1)
    if not blocks:
        # set blocks; the last block absorbs the remainder of the file
        block_count = max(1, file_info.size // block_size)
        blocks = [[i*block_size, i*block_size, (i+1)*block_size-1] for i in range(block_count)]
        blocks[-1][2] = file_info.size - 1
        # create new blank workpath
        with open(workpath, 'wb'):
            pass
    print('Downloading %s' % url)
    # Start the monitor as a daemon so that a failed download cannot keep
    # the process alive forever waiting for 100%.
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()
    # start downloading
    completed = False
    try:
        with open(workpath, 'rb+') as fobj:
            # end is inclusive, so a block is pending while offset <= end
            args = [(url, block, fobj, buffer_size) for block in blocks if block[1] <= block[2]]
            if args:
                pool = ThreadPool(min(thread_count, len(args)))
                try:
                    pool.map(_worker, args)
                finally:
                    # Wait for running workers before the file is closed.
                    pool.close()
                    pool.join()
        completed = all(block[1] > block[2] for block in blocks)
    finally:
        if not completed:
            # Save the latest offsets so the next run resumes from here.
            with lock:
                write_data(infopath, (file_info, blocks))
    if not completed:
        # A server may close a connection early; never publish a partial file.
        raise IOError('download of %s is incomplete; run again to resume' % url)

    # Let the monitor draw the final 100% line before cleaning up.
    monitor.join()
    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(args):
    """Download one block of the file into the shared output file.

    Args:
        args: a ``(url, block, fobj, buffer_size)`` tuple, as passed by
            ``pool.map``.  ``block`` is a mutable ``[start, offset, end]``
            list whose offset is advanced as data is written; ``fobj`` is
            the output file object shared by all workers.

    Raises:
        IOError: if the block does not start at byte 0 and the server does
            not answer with 206 Partial Content.
        urllib2.URLError: if the request fails.
    """
    url, block, fobj, buffer_size = args
    req = urllib2.Request(url)
    req.add_header('Range', 'bytes=%s-%s' % (block[1], block[2]))
    res = urllib2.urlopen(req)
    try:
        # A server that ignores Range replies 200 with the whole file;
        # writing that at a non-zero offset would corrupt the output.
        if block[1] > 0 and res.getcode() != 206:
            raise IOError('server ignored the Range request for %s' % url)
        while True:
            chunk = res.read(buffer_size)
            if not chunk:
                break
            # The file object and its position are shared between workers,
            # so seek, write and the offset update must happen atomically.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and persist resume information every two seconds.

    Runs until the blocks account for 100% of ``file_info.size``.  On each
    round it draws the progress bar and, while the download is unfinished,
    saves ``(file_info, blocks)`` to ``infopath``.

    Args:
        infopath: path of the progress file to write.
        file_info: ``FileInfo`` of the file being downloaded.
        blocks: list of ``[start, offset, end]`` lists updated by workers.
    """
    while 1:
        # Hold the lock so the offsets are read and saved consistently.
        with lock:
            received = sum(block[1] - block[0] for block in blocks)
            if file_info.size > 0:
                percent = received * 100 // file_info.size
            else:
                # Nothing to wait for; also avoids dividing by zero.
                percent = 100
            progress(percent)
            if percent >= 100:
                break
            write_data(infopath, (file_info, blocks))
        time.sleep(2)

if __name__ == '__main__':
    import argparse

    # Command-line front end: one positional URL plus optional overrides
    # for the output path, thread count, buffer size and block size.
    cli = argparse.ArgumentParser(description='Download file by multi-threads.')
    cli.add_argument('url', type=str, help='url of the download file')
    for flag, kind, fallback, target, text in (
            ('-o', str, None, "output", 'output file'),
            ('-t', int, defaults['thread_count'], "thread_count", 'thread counts to downloading'),
            ('-b', int, defaults['buffer_size'], "buffer_size", 'buffer size'),
            ('-s', int, defaults['block_size'], "block_size", 'block size')):
        cli.add_argument(flag, type=kind, default=fallback, dest=target, help=text)

    # With no arguments at all, fall back to a built-in sample URL.
    cli_args = sys.argv[1:] or ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    options = cli.parse_args(cli_args)

    started = time.time()
    download(options.url, options.output, options.thread_count,
             options.buffer_size, options.block_size)
    print('times: %ds' % int(time.time() - started))
Python 相关文章推荐
Python列表计数及插入实例
Dec 17 Python
利用Python生成文件md5校验值函数的方法
Jan 10 Python
Python之re操作方法(详解)
Jun 14 Python
Python实现的中国剩余定理算法示例
Aug 05 Python
Python unittest单元测试框架总结
Sep 08 Python
python游戏开发的五个案例分享
Mar 09 Python
python程序输出无内容的解决方式
Apr 09 Python
调整Jupyter notebook的启动目录操作
Apr 10 Python
TensorFlow打印输出tensor的值
Apr 19 Python
python中if及if-else如何使用
Jun 02 Python
解决python 虚拟环境删除包无法加载的问题
Jul 13 Python
Python rabbitMQ如何实现生产消费者模式
Aug 24 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
兼容性最强的PHP生成缩略图的函数代码(修改版)
2011/01/18 PHP
php将gd生成的图片缓存到memcache的小例子
2013/06/05 PHP
Windows下Apache + PHP SESSION丢失的解决过程全纪录
2015/04/07 PHP
PHP不使用递归的无限级分类简单实例
2016/11/05 PHP
PHP实现的简单适配器模式示例
2017/06/22 PHP
javascript (用setTimeout而非setInterval)
2011/12/28 Javascript
js 判断计算字符串长度/判断空的简单方法
2013/08/05 Javascript
用JS将搜索的关键字高亮显示实现代码
2013/11/08 Javascript
JavaScript的setAttribute兼容性问题解决方法
2013/11/11 Javascript
jquery获取form表单input元素值的简单实例
2016/05/30 Javascript
js添加千分位的实现代码(超简单)
2016/08/01 Javascript
vue 监听 Treeselect 选择项的改变操作
2020/08/31 Javascript
js实现简易点击切换显示或隐藏
2020/11/29 Javascript
[01:33]一分钟玩转DOTA2第三弹:DOTA2&DotA快捷操作大对比
2014/06/04 DOTA
python中使用enumerate函数遍历元素实例
2014/06/16 Python
python编写网页爬虫脚本并实现APScheduler调度
2014/07/28 Python
Python NumPy库安装使用笔记
2015/05/18 Python
Django项目中model的数据处理以及页面交互方法
2018/05/30 Python
Python基于opencv实现的简单画板功能示例
2019/03/04 Python
详解Python time库的使用
2019/10/10 Python
matlab中imadjust函数的作用及应用举例
2020/02/27 Python
基于Python共轭梯度法与最速下降法之间的对比
2020/04/02 Python
pycharm 激活码及使用方式的详细教程
2020/05/12 Python
使用PyCharm官方中文语言包汉化PyCharm
2020/11/18 Python
详解CSS3的box-shadow属性制作边框阴影效果的方法
2016/05/10 HTML / CSS
Html5 postMessage实现跨域消息传递
2016/03/11 HTML / CSS
巴西最大的玩具连锁店:Ri Happy
2020/06/17 全球购物
测试时代收集的软件测试面试题
2013/09/25 面试题
remote接口和home接口主要作用
2013/05/15 面试题
2014优秀党员事迹材料
2014/08/14 职场文书
捐款通知怎么写
2015/04/24 职场文书
三八红旗手主要事迹材料
2015/11/04 职场文书
2016年教师政治思想表现评语
2015/12/02 职场文书
Windows10下安装MySQL8
2021/04/06 MySQL
分析SQL窗口函数之取值窗口函数
2022/04/21 Oracle
Python matplotlib 利用随机函数生成变化图形
2022/04/26 Python