python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Module-wide lock: the worker and monitor threads acquire it around file
# writes and around reads/updates of the block offsets.
lock = threading.Lock()

# Default tuning parameters, used both as keyword defaults and as the
# command-line defaults.
defaults = dict(
    thread_count=10,           # number of download threads
    buffer_size=10 * 1024,     # bytes requested per read() call
    block_size=1000 * 1024,    # bytes per file block
)

def progress(percent, width=50):
    """Draw a one-line text progress bar on standard output.

    The bar is ``width`` characters wide, filled with ``=`` in proportion to
    ``percent`` and followed by the numeric percentage.  The line ends with a
    carriage return so the next call overwrites it; once ``percent`` reaches
    100 a newline is emitted to finish the line.

    :param percent: completion percentage (normally 0-100).
    :param width: width of the bar in characters.
    """
    # Explicit floor division keeps the fill count an integer regardless of
    # the interpreter's "/" semantics.
    filled = int(width * percent // 100)
    bar = ('%%-%ds' % width) % ('=' * filled)
    sys.stdout.write('%s %d%%\r' % (bar, percent))
    if percent >= 100:
        sys.stdout.write('\n')
    # Flush on every call: the line has no trailing newline, so a buffered
    # stdout would otherwise hide the intermediate updates.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle ``data`` into the file at ``filepath``, overwriting it."""
    with open(filepath, 'wb') as sink:
        cPickle.dump(data, sink)

def read_data(filepath):
    """Load and return the pickled object stored in the file at ``filepath``."""
    # NOTE: unpickling can execute arbitrary code; only use this on files
    # written by this program.
    with open(filepath, 'rb') as source:
        payload = cPickle.load(source)
    return payload

# Immutable record of the remote file's metadata, in field order:
# url, name, size, lastmodified.
FileInfo = namedtuple('FileInfo', ['url', 'name', 'size', 'lastmodified'])

def get_file_info(url):
    """Issue a HEAD request for ``url`` and describe the remote file.

    :param url: address of the file to inspect.
    :returns: a ``FileInfo`` whose ``size`` comes from the Content-Length
        header (0 when absent), ``lastmodified`` from Last-Modified ('' when
        absent) and ``name`` from the Content-Disposition ``filename=``
        parameter, falling back to the last path component of ``url``.
    :raises urllib2.URLError: if the request fails.
    """
    class HeadRequest(urllib2.Request):
        # urllib2 picks the HTTP verb through get_method().
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        res.read()
        headers = dict(res.headers)
    finally:
        # Release the connection even if reading the response fails.
        res.close()

    size = int(headers.get('content-length', 0))
    lastmodified = headers.get('last-modified', '')

    name = ''
    disposition = headers.get('content-disposition', '')
    if 'filename=' in disposition:
        value = disposition.split('filename=', 1)[1].strip()
        if value[:1] in ('"', "'"):
            # Quoted value: take everything up to the matching closing quote.
            closing = value.find(value[0], 1)
            name = value[1:closing] if closing != -1 else value[1:]
        else:
            # Unquoted value: it ends at the next parameter separator.
            name = value.split(';', 1)[0].strip()
        # The name is server-supplied; drop any directory components so it
        # cannot point outside the working directory.
        name = os.path.basename(name)
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def download(url, output,
        thread_count=defaults['thread_count'],
        buffer_size=defaults['buffer_size'],
        block_size=defaults['block_size']):
    """Download ``url`` to ``output`` with several threads, resuming if possible.

    Data is written to ``<output>.ing`` while per-block progress is kept in
    ``<output>.inf``.  When both files exist and the saved metadata matches
    the server's current answer, the download continues from the saved
    offsets.  On success the work file is renamed to ``output`` and the info
    file is removed.

    :param url: address of the file to download.
    :param output: destination path; ``None`` means use the name reported by
        ``get_file_info``.
    :param thread_count: maximum number of worker threads.
    :param buffer_size: bytes requested per read by each worker.
    :param block_size: size in bytes of the blocks the file is split into.
    :raises ValueError: if the server does not report a positive size.
    :raises IOError: if some block is still incomplete after all workers
        returned; the work and info files are kept so a later call can resume.
    """
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        # Ranged requests and the progress computation both need the size.
        raise ValueError('cannot download %s: server reported no content length' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output
    # Split the file into blocks.  Every block is a list [start, offset, end]
    # where end is the index of the block's last byte (inclusive, matching
    # the HTTP Range header) and offset is the next byte to fetch, so a block
    # is complete once offset > end.
    blocks = []
    if os.path.exists(infopath) and os.path.exists(workpath):
        # load blocks (the info file is a pickle written by this program)
        saved_info, blocks = read_data(infopath)
        if (saved_info.url != url or
                saved_info.name != file_info.name or
                saved_info.size != file_info.size or
                saved_info.lastmodified != file_info.lastmodified):
            blocks = []
        for block in blocks:
            # Info files from older runs may hold an exclusive end for a
            # single-block download; clamp to the last valid byte index.
            block[2] = min(block[2], file_info.size - 1)
    if not blocks:
        # set blocks
        if block_size > file_info.size:
            blocks = [[0, 0, file_info.size - 1]]
        else:
            block_count, remain = divmod(file_info.size, block_size)
            blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
                      for i in range(block_count)]
            # the last block absorbs the remainder
            blocks[-1][-1] += remain
        # create new blank workpath
        with open(workpath, 'wb'):
            pass
    print('Downloading %s' % url)
    # Start the monitor as a daemon so that a failed download cannot keep
    # the process alive forever.
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()
    # start downloading
    with open(workpath, 'rb+') as fobj:
        # "<=": a block whose offset equals its inclusive end still has one
        # byte left to fetch.
        args = [(url, block, fobj, buffer_size)
                for block in blocks if block[1] <= block[2]]
        if args:
            pool = ThreadPool(max(1, min(thread_count, len(args))))
            try:
                pool.map(_worker, args)
            finally:
                # Wait for every worker before the file is closed, even when
                # one of them failed.
                pool.close()
                pool.join()

    # Verify before touching the files, so an interrupted transfer keeps its
    # resume state instead of being published as a finished file.
    if any(block[1] <= block[2] for block in blocks):
        raise IOError('download of %s is incomplete; run again to resume' % url)
    # All blocks are complete, so the monitor reports 100% and exits on its
    # next pass; wait for it so it cannot rewrite the info file afterwards.
    monitor.join()

    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(args):
    """Download the remaining bytes of one block into the shared file.

    :param args: a ``(url, block, fobj, buffer_size)`` tuple.  ``block`` is a
        mutable ``[start, offset, end]`` list; its offset is advanced as data
        arrives.  ``fobj`` is the shared, seekable output file and
        ``buffer_size`` is the number of bytes requested per read.
    :raises IOError: if the block does not start at byte 0 and the server
        did not answer with 206 Partial Content.
    """
    # Unpack in the body: tuple parameters in the signature are a
    # Python-2-only syntax.
    url, block, fobj, buffer_size = args
    req = urllib2.Request(url)
    req.add_header('Range', 'bytes=%s-%s' % (block[1], block[2]))
    res = urllib2.urlopen(req)
    try:
        if block[1] > 0 and res.getcode() != 206:
            # A full-body reply would be written at the wrong offset and
            # silently corrupt the file.
            raise IOError('server ignored the Range request for %s' % url)
        while True:
            # Never write past the block's (inclusive) end, even if the
            # server sends more than was requested.
            remaining = block[2] - block[1] + 1
            if remaining <= 0:
                break
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # The file object is shared between workers: seek + write +
            # offset update must happen as one step.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and save the block state until the download is done.

    Every two seconds, while holding the global lock, this computes the
    completed percentage from the blocks' offsets, draws the progress bar
    and, unless the download has reached 100%, pickles ``(file_info, blocks)``
    to ``infopath``.

    :param infopath: path of the info file that receives the saved state.
    :param file_info: record whose ``size`` is the total number of bytes.
    :param blocks: list of mutable ``[start, offset, end]`` lists.
    """
    while True:
        with lock:
            done = sum(block[1] - block[0] for block in blocks)
            if file_info.size > 0:
                percent = done * 100 // file_info.size
            else:
                # No total to measure against: report completion instead of
                # dividing by zero.
                percent = 100
            progress(percent)
            if percent >= 100:
                break
            write_data(infopath, (file_info, blocks))
        time.sleep(2)

if __name__ == '__main__':
    import argparse

    # Sample URL used when the script is started without any arguments.
    fallback_argv = ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']

    cli = argparse.ArgumentParser(description='Download file by multi-threads.')
    cli.add_argument('url', type=str, help='url of the download file')
    cli.add_argument('-o', dest="output", type=str, default=None,
                     help='output file')
    cli.add_argument('-t', dest="thread_count", type=int,
                     default=defaults['thread_count'],
                     help='thread counts to downloading')
    cli.add_argument('-b', dest="buffer_size", type=int,
                     default=defaults['buffer_size'],
                     help='buffer size')
    cli.add_argument('-s', dest="block_size", type=int,
                     default=defaults['block_size'],
                     help='block size')

    options = cli.parse_args(sys.argv[1:] or fallback_argv)

    # Time the whole download and report the elapsed seconds.
    began = time.time()
    download(options.url, options.output, options.thread_count,
             options.buffer_size, options.block_size)
    print('times: %ds' % int(time.time() - began))
Python 相关文章推荐
python使用calendar输出指定年份全年日历的方法
Apr 04 Python
Django中对数据查询结果进行排序的方法
Jul 17 Python
Django URL传递参数的方法总结
Aug 28 Python
用pandas按列合并两个文件的实例
Apr 12 Python
python寻找list中最大值、最小值并返回其所在位置的方法
Jun 27 Python
浅谈python连续赋值可能引发的错误
Nov 10 Python
Python FTP文件定时自动下载实现过程解析
Nov 12 Python
pycharm 2019 最新激活方式(pycharm破解、激活)
Sep 22 Python
python图形界面开发之wxPython树控件使用方法详解
Feb 24 Python
python对文件的操作方法汇总
Feb 28 Python
用python自动生成日历
Apr 24 Python
Python中使用subprocess库创建附加进程
May 11 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
Laravel5.1自定义500错误页面示例
2016/10/09 PHP
YII2框架中使用yii.js实现的post请求
2017/04/09 PHP
JavaScript的目的分析
2007/01/05 Javascript
关于scrollLeft,scrollTop的浏览器兼容性测试
2013/03/19 Javascript
Javascript控制页面链接在新窗口打开具体方法
2013/08/16 Javascript
javascript定时器完整实例
2015/02/10 Javascript
Node.js中常规的文件操作总结
2016/10/13 Javascript
Vue.js实现一个SPA登录页面的过程【推荐】
2017/04/29 Javascript
es6学习笔记之Async函数的使用示例
2017/05/11 Javascript
Angular2 之 路由与导航详细介绍
2017/05/26 Javascript
Vue.js结合Ueditor富文本编辑器的实例代码
2017/07/11 Javascript
基于Cookie常用操作以及属性介绍
2017/09/07 Javascript
vue获取input输入值的问题解决办法
2017/10/17 Javascript
jQuery+CSS实现的table表格行列转置功能示例
2018/01/08 jQuery
原生js实现拖拽功能基本思路详解
2018/04/18 Javascript
详解JS函数stack size计算方法
2018/06/18 Javascript
深入理解JavaScript的值传递和引用传递
2018/10/24 Javascript
layer.prompt输入层的例子
2019/09/24 Javascript
解决Vue在Tomcat8下部署页面不加载的问题
2019/11/12 Javascript
[16:04]DOTA2海涛带你玩炸弹 9月5日更新内容详解
2014/09/05 DOTA
[36:02]DOTA2上海特级锦标赛D组小组赛#2 Liquid VS VP第一局
2016/02/28 DOTA
[39:52]2018DOTA2亚洲邀请赛 4.3 突围赛 EG vs Newbee 第一场
2018/04/04 DOTA
Python中断言Assertion的一些改进方案
2016/10/27 Python
python使用标准库根据进程名如何获取进程的pid详解
2017/10/31 Python
Python中协程用法代码详解
2018/02/10 Python
Python Requests模拟登录实现图书馆座位自动预约
2018/04/27 Python
Flask框架各种常见装饰器示例
2018/07/17 Python
python2.7和NLTK安装详细教程
2018/09/19 Python
对Python+opencv将图片生成视频的实例详解
2019/01/08 Python
Python秒算24点实现及原理详解
2019/07/29 Python
matplotlib图例legend语法及设置的方法
2020/07/28 Python
Python字符串对齐、删除字符串不需要的内容以及格式化打印字符
2021/01/23 Python
2015年物流客服工作总结
2015/07/27 职场文书
教学反思怎么写
2016/02/24 职场文书
2019秋季运动会口号
2019/06/25 职场文书
详解Python魔法方法之描述符类
2021/05/26 Python