python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# One lock shared by every thread in this module: the download workers take
# it around each seek+write on the shared file object, and the monitor takes
# it while it reads the block offsets and saves them.
lock = threading.Lock()

# Fallback tuning values, used both as download()'s keyword defaults and as
# the command-line option defaults.
defaults = dict(
    block_size=1000 * 1024,   # bytes per block when a download is split
    buffer_size=10 * 1024,    # bytes requested per read from the response
    thread_count=10,          # maximum number of worker threads
)

def progress(percent, width=50):
    """Redraw a one-line text progress bar on stdout.

    percent -- completion as an integer percentage. Values above 100 are
               printed as given, but the bar itself never grows past *width*.
    width   -- number of character cells used for the bar.

    The line ends with a carriage return so that the next call overwrites
    it; a newline is written once *percent* reaches 100.
    """
    filled = max(0, min(width, int(width * percent // 100)))
    bar = '=' * filled + ' ' * (width - filled)
    sys.stdout.write("%s %d%%\r" % (bar, percent))
    if percent >= 100:
        sys.stdout.write("\n")
    # Flush on every call: the bar line contains no newline, so a
    # line-buffered stdout would otherwise show nothing until completion.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle *data* and store it at *filepath*, replacing any old content."""
    serialized = cPickle.dumps(data)
    with open(filepath, 'wb') as sink:
        sink.write(serialized)

def read_data(filepath):
    """Return the object that was pickled into the file at *filepath*.

    NOTE(review): unpickling can execute arbitrary code, so this must only
    be pointed at files this program wrote itself.
    """
    with open(filepath, 'rb') as source:
        raw = source.read()
    return cPickle.loads(raw)

# Immutable description of the remote file: the requested url, the file name
# to save under, the size in bytes and the Last-Modified header value.
FileInfo = namedtuple('FileInfo', ['url', 'name', 'size', 'lastmodified'])

def _disposition_filename(disposition):
    """Pull the file name out of a Content-Disposition header value.

    Returns None when the value has no usable ``filename=`` parameter.
    """
    marker = 'filename='
    start = disposition.lower().find(marker)
    if start < 0:
        return None
    value = disposition[start + len(marker):].strip()
    if value[:1] in ('"', "'"):
        # Quoted form: take everything up to the matching closing quote.
        closing = value.find(value[0], 1)
        value = value[1:closing] if closing > 0 else value[1:]
    else:
        # Bare token: it ends at the next parameter separator.
        value = value.split(';', 1)[0].strip()
    # The name is chosen by the server, so keep only the last path component
    # to stop it from pointing outside the download directory.
    value = os.path.basename(value.replace('\\', '/'))
    return value or None


def get_file_info(url):
    """Send a HEAD request to *url* and describe the remote file.

    Returns a FileInfo whose fields are:
      url          -- the url that was passed in
      name         -- file name from Content-Disposition if present, else the
                      last component of the url path, else 'download'
      size         -- Content-Length as an int, 0 when the header is missing
      lastmodified -- Last-Modified header value, '' when missing

    Errors raised by urllib2.urlopen propagate to the caller.
    """
    class HeadRequest(urllib2.Request):
        # urllib2.Request has no method argument; overriding get_method is
        # how the request is turned into a HEAD.
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        res.read()
        # Headers are looked up by lower-case name below.
        headers = dict(res.headers)
    finally:
        res.close()
    size = int(headers.get('content-length', 0))
    lastmodified = headers.get('last-modified', '')
    name = None
    if 'content-disposition' in headers:
        name = _disposition_filename(headers['content-disposition'])
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    if not name:
        # The url path ends with '/', so it offers no file name either.
        name = 'download'
    return FileInfo(url, name, size, lastmodified)

def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    # get latest file info
    file_info = get_file_info(url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output
    # split file to blocks. every block is a array [start, offset, end],
    # then each greenlet download filepart according to a block, and
    # update the block' offset.
    blocks = []
    if os.path.exists(infopath):
        # load blocks
        _x, blocks = read_data(infopath)
        if (_x.url != url or
                _x.name != file_info.name or
                _x.lastmodified != file_info.lastmodified):
            blocks = []
    if len(blocks) == 0:
        # set blocks
        if block_size > file_info.size:
            blocks = [[0, 0, file_info.size]]
        else:
            block_count, remain = divmod(file_info.size, block_size)
            blocks = [[i*block_size, i*block_size, (i+1)*block_size-1] for i in range(block_count)]
            blocks[-1][-1] += remain
        # create new blank workpath
        with open(workpath, 'wb') as fobj:
            fobj.write('')
    print 'Downloading %s' % url
    # start monitor
    threading.Thread(target=_monitor, args=(infopath, file_info, blocks)).start()
    # start downloading
    with open(workpath, 'rb+') as fobj:
        args = [(url, blocks[i], fobj, buffer_size) for i in range(len(blocks)) if blocks[i][1] < blocks[i][2]]
        if thread_count > len(args):
            thread_count = len(args)
        pool = ThreadPool(thread_count)
        pool.map(_worker, args)
        pool.close()
        pool.join()

    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)
    assert all([block[1]>=block[2] for block in blocks]) is True

def _worker(task):
    """Download one block of the file and write it into the shared file.

    task -- a single tuple ``(url, block, fobj, buffer_size)`` (one argument,
            because the thread pool's map() passes one item per call):
        url         -- address of the remote file
        block       -- mutable list [start, offset, end]; bytes offset..end
                       (inclusive) are requested and offset is advanced in
                       place as data is written
        fobj        -- file object shared by all workers, opened for update
        buffer_size -- maximum number of bytes per read

    Raises IOError when the server ignores the Range header for a block that
    does not start at byte 0, since the reply would then hold the wrong bytes.
    """
    url, block, fobj, buffer_size = task
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (block[1], block[2])
    res = urllib2.urlopen(req)
    try:
        # 206 Partial Content means the range was honoured. Any other reply
        # carries the body from byte 0, which is only usable when this block
        # starts there.
        if res.getcode() != 206 and block[1] != 0:
            raise IOError('server ignored the Range request for %s' % url)
        while 1:
            # Never write past the block end, whatever the server sends.
            remaining = block[2] - block[1] + 1
            if remaining <= 0:
                break
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # All workers share one file object, so seek+write must not be
            # interleaved with another thread's seek+write.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and save resume data until the download is complete.

    infopath  -- path of the info file that receives (file_info, blocks)
    file_info -- FileInfo of the remote file; its size is the 100% mark
    blocks    -- list of [start, offset, end] lists updated by the workers

    Every 2 seconds the downloaded byte count is turned into a percentage and
    drawn with progress(). The loop ends once that reaches 100; until then
    the current state is written to *infopath*.
    """
    total = file_info.size
    while 1:
        with lock:
            done = sum(block[1] - block[0] for block in blocks)
            # A size of 0 leaves nothing to wait for; report it as complete
            # instead of dividing by zero.
            percent = done * 100 // total if total > 0 else 100
            progress(percent)
            if percent >= 100:
                break
            # Written while the lock is held: workers cannot finish the last
            # chunk meanwhile, so this write can never land after download()
            # has already deleted the info file.
            write_data(infopath, (file_info, blocks))
        time.sleep(2)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Download file by multi-threads.')
    parser.add_argument('url', type=str, help='url of the download file')
    parser.add_argument('-o', type=str, default=None, dest="output", help='output file')
    parser.add_argument('-t', type=int, default=defaults['thread_count'], dest="thread_count", help='thread counts to downloading')
    parser.add_argument('-b', type=int, default=defaults['buffer_size'], dest="buffer_size", help='buffer size')
    parser.add_argument('-s', type=int, default=defaults['block_size'], dest="block_size", help='block size')
    argv = sys.argv[1:]
    if len(argv) == 0:
        argv = ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    args = parser.parse_args(argv)
    start_time = time.time()
    download(args.url, args.output, args.thread_count, args.buffer_size, args.block_size)
    print 'times: %ds' % int(time.time()-start_time)
Python 相关文章推荐
Python比较文件夹比另一同名文件夹多出的文件并复制出来的方法
Mar 05 Python
Python中if __name__ == '__main__'作用解析
Jun 29 Python
利用python爬取软考试题之ip自动代理
Mar 28 Python
Python实现将SQLite中的数据直接输出为CVS的方法示例
Jul 13 Python
wx.CheckBox创建复选框控件并响应鼠标点击事件
Apr 25 Python
Django+JS 实现点击头像即可更改头像的方法示例
Dec 26 Python
解决Python找不到ssl模块问题 No module named _ssl的方法
Apr 29 Python
Python队列RabbitMQ 使用方法实例记录
Aug 05 Python
pytorch 固定部分参数训练的方法
Aug 17 Python
Python基于类路径字符串获取静态属性
Mar 12 Python
浅谈django channels 路由误导
May 28 Python
python DataFrame中stack()方法、unstack()方法和pivot()方法浅析
Apr 06 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
php 设计模式之 工厂模式
2008/12/19 PHP
php采集时被封ip的解决方法
2010/08/29 PHP
php小技巧 把数组的键和值交换形成了新的数组,查找值取得键
2011/06/02 PHP
一个简单的网页密码登陆php代码
2012/07/17 PHP
PHP魔术方法以及关于独立实例与相连实例的全面讲解
2016/10/18 PHP
详解PHP中foreach的用法和实例
2016/10/25 PHP
解决php扩展安装不生效问题
2019/10/25 PHP
Javascript写了一个清除“logo1_.exe”的杀毒工具(可扫描目录)
2007/02/09 Javascript
jquery ajax 登录验证实现代码
2009/09/23 Javascript
非常棒的10款jQuery 幻灯片插件
2011/06/14 Javascript
jQuery Deferred和Promise创建响应式应用程序详细介绍
2013/03/05 Javascript
jquery实现跳到底部,回到顶部效果的简单实例(类似锚)
2016/07/10 Javascript
详解Axios统一错误处理与后置
2018/09/26 Javascript
使用Webpack提升Vue.js应用程序的4种方法(翻译)
2019/10/09 Javascript
react项目从新建到部署的实现示例
2021/02/19 Javascript
对Python3之进程池与回调函数的实例详解
2019/01/22 Python
pyqt弹出新对话框,以及关闭对话框获取数据的实例
2019/06/18 Python
python做反被爬保护的方法
2019/07/01 Python
django rest framework vue 实现用户登录详解
2019/07/29 Python
python实现12306登录并保存cookie的方法示例
2019/12/17 Python
Matplotlib使用字符串代替变量绘制散点图的方法
2020/02/17 Python
Expedia挪威官网:酒店、机票和租车
2018/03/03 全球购物
英国派对礼服和连衣裙购物网站:TFNC London
2018/07/07 全球购物
SQL面试题
2013/12/09 面试题
教师师德教育的自我评价
2013/10/31 职场文书
教导处工作制度
2014/01/18 职场文书
教育技术职业规划范文
2014/03/04 职场文书
区长工作作风个人整改措施
2014/10/01 职场文书
庆祝国庆节标语
2014/10/09 职场文书
个人查摆问题及整改措施
2014/10/16 职场文书
2014年个人工作总结范文
2014/11/07 职场文书
辞职信格式模板
2015/02/27 职场文书
迎国庆主题班会
2015/08/17 职场文书
小学英语教师2015年度个人工作总结
2015/10/14 职场文书
毕业生自我鉴定范文
2019/05/13 职场文书
python神经网络学习 使用Keras进行简单分类
2022/05/04 Python