python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Module-wide mutex. The worker threads and the progress monitor both take
# it before touching the shared block list or the output file.
lock = threading.Lock()

# Fallback values for the tunable download parameters; they are used both
# as keyword defaults of download() and as command-line option defaults.
defaults = dict(
    thread_count=10,           # number of concurrent worker threads
    buffer_size=10 * 1024,     # bytes requested per read() call
    block_size=1000 * 1024,    # bytes per download block
)

def progress(percent, width=50):
    """Draw a one-line text progress bar on stdout.

    The bar is ``width`` characters wide and filled with ``=`` in proportion
    to ``percent``; the numeric percentage follows it.  The line ends with a
    carriage return so the next call overwrites it in place, and a newline
    is emitted once ``percent`` reaches 100 to finish the line.

    :param percent: completion percentage (normally 0-100).
    :param width: width of the bar in characters.
    """
    filled = int(width * percent // 100)
    bar = ('=' * filled).ljust(width)
    sys.stdout.write("%s %d%%\r" % (bar, percent))
    if percent >= 100:
        sys.stdout.write("\n")
    # Flush on every call: the line ends in '\r' rather than '\n', so a
    # line-buffered stream would otherwise show nothing until completion.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle ``data`` into the file at ``filepath``, replacing its contents."""
    with open(filepath, 'wb') as sink:
        sink.write(cPickle.dumps(data))

def read_data(filepath):
    """Return the object that was pickled into the file at ``filepath``."""
    with open(filepath, 'rb') as source:
        restored = cPickle.load(source)
    return restored

# Metadata of a remote file: the url it was requested from, the local file
# name to use, its size in bytes and the raw Last-Modified header value.
FileInfo = namedtuple('FileInfo', 'url name size lastmodified')

def _filename_from_disposition(value):
    """Return the bare ``filename`` parameter of a Content-Disposition value, or ''."""
    _, found, rest = value.partition('filename=')
    if not found:
        return ''
    rest = rest.strip()
    if rest[:1] in ('"', "'"):
        # Quoted value: take everything up to the matching closing quote so
        # that parameters following the filename are not included.
        closing = rest.find(rest[0], 1)
        name = rest[1:closing] if closing != -1 else rest[1:]
    else:
        name = rest.split(';', 1)[0].strip()
    # The value comes from the server: keep only the last path component so
    # it can never point outside the directory the caller writes into.
    return os.path.basename(name.replace('\\', '/'))

def get_file_info(url):
    """Issue a HEAD request for ``url`` and describe the remote file.

    :param url: address of the file to inspect.
    :returns: a ``FileInfo``.  ``size`` is 0 when the server sends no
        Content-Length and ``lastmodified`` is '' when it sends no
        Last-Modified.  ``name`` comes from the Content-Disposition
        ``filename`` parameter when one is present and non-empty, otherwise
        from the last component of the url path.
    :raises ValueError: if Content-Length is not an integer.

    Errors raised by ``urllib2.urlopen`` propagate unchanged.
    """
    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"
    res = urllib2.urlopen(HeadRequest(url))
    try:
        # Query the message object directly: its lookups are
        # case-insensitive, unlike a plain dict built from it.
        headers = res.headers
        size = int(headers.get('content-length') or 0)
        lastmodified = headers.get('last-modified') or ''
        disposition = headers.get('content-disposition') or ''
    finally:
        # Always release the connection, even if a header is malformed.
        res.close()
    name = _filename_from_disposition(disposition)
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def _split_blocks(size, block_size):
    """Split ``size`` bytes into ``[start, offset, end]`` blocks with an inclusive ``end``."""
    # Only whole blocks are created; the remainder is folded into the last
    # block, and a file smaller than one block becomes a single block.
    count = max(size // block_size, 1)
    blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
              for i in range(count)]
    blocks[-1][2] = size - 1
    return blocks

def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    """Download ``url`` to ``output`` with several threads, resuming if possible.

    Data is written to ``<output>.ing`` while progress is stored in
    ``<output>.inf``.  If both files exist and the stored url, name, size
    and Last-Modified value match the server's current answer, the download
    continues from the stored offsets; otherwise it starts over.  On
    success the work file is renamed to ``output`` and the info file is
    removed.

    :param url: address of the file to download.
    :param output: destination path, or None to use the name reported by
        ``get_file_info``.
    :param thread_count: maximum number of worker threads.
    :param buffer_size: number of bytes requested per read.
    :param block_size: size in bytes of the ranges the file is split into.
    :raises ValueError: if the server reports no positive Content-Length,
        since the file cannot be split into ranges without it.
    :raises IOError: if the workers finished but some block is incomplete.
        The partial file and its progress information are kept for resuming.
    """
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        raise ValueError('cannot download %s: the server did not report a '
                         'positive Content-Length' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output
    # Every block is a list [start, offset, end]: ``offset`` is the next
    # byte to fetch and ``end`` is the last byte of the block (inclusive,
    # as in an HTTP Range header).  Each worker thread downloads one block
    # at a time and advances its offset.
    blocks = []
    if os.path.exists(infopath) and os.path.exists(workpath):
        # load blocks; they are only usable if the remote file is unchanged
        saved_info, blocks = read_data(infopath)
        if (saved_info.url != url or
                saved_info.name != file_info.name or
                saved_info.size != file_info.size or
                saved_info.lastmodified != file_info.lastmodified):
            blocks = []
        for block in blocks:
            # Never request bytes beyond the end of the file.
            block[2] = min(block[2], file_info.size - 1)
    if not blocks:
        blocks = _split_blocks(file_info.size, block_size)
        # create new blank workpath
        open(workpath, 'wb').close()
    print('Downloading %s' % url)
    # Start the monitor as a daemon thread: it only ends by itself once
    # progress reaches 100%, so a failed download must not be able to keep
    # the interpreter alive through it.
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()
    complete = False
    try:
        # start downloading
        with open(workpath, 'rb+') as fobj:
            # ``end`` is inclusive, so a block is pending while offset <= end.
            pending = [(url, block, fobj, buffer_size)
                       for block in blocks if block[1] <= block[2]]
            if pending:
                pool = ThreadPool(min(thread_count, len(pending)))
                try:
                    pool.map(_worker, pending)
                finally:
                    pool.close()
                    pool.join()
        complete = all(block[1] > block[2] for block in blocks)
    finally:
        if not complete:
            # Persist the latest offsets so that the next run can resume.
            with lock:
                write_data(infopath, (file_info, blocks))
    if not complete:
        raise IOError('download of %s is incomplete; partial data kept in %s'
                      % (url, workpath))
    # Let the monitor print the final 100% line before cleaning up.
    monitor.join()
    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(task):
    """Download the remaining bytes of one block into the shared work file.

    :param task: a tuple ``(url, block, fobj, buffer_size)``.  ``block`` is
        a list ``[start, offset, end]`` whose ``offset`` is advanced as data
        arrives; ``fobj`` is the work file shared by all workers, and
        ``buffer_size`` is the number of bytes requested per read.
    :raises IOError: if the server ignores the Range header for a block
        that does not start at the beginning of the file.
    """
    url, block, fobj, buffer_size = task
    # ``end`` is sent as the last byte of the Range header, i.e. inclusive.
    remaining = block[2] - block[1] + 1
    if remaining <= 0:
        return
    req = urllib2.Request(url)
    req.add_header('Range', 'bytes=%s-%s' % (block[1], block[2]))
    res = urllib2.urlopen(req)
    try:
        # A server that ignores Range answers 200 with the whole file;
        # writing that at a non-zero offset would corrupt the output.
        if res.getcode() != 206 and block[1] != 0:
            raise IOError('server ignored the Range request for %s' % url)
        while remaining > 0:
            # Never read past the block end, even if more data is sent.
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # seek + write must be atomic with respect to the other
            # workers, which share the same file object.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
            remaining -= len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and save resume information every two seconds.

    Runs until the reported progress reaches 100%.

    :param infopath: file that receives the pickled ``(file_info, blocks)``.
    :param file_info: object whose ``size`` attribute is the total number
        of bytes expected.
    :param blocks: list of ``[start, offset, end]`` lists that the worker
        threads update while downloading.
    """
    while 1:
        # Hold the lock so the offsets are read, and the snapshot written,
        # without a worker changing them in between.
        with lock:
            done = sum(block[1] - block[0] for block in blocks)
            if file_info.size > 0:
                percent = min(100, done * 100 // file_info.size)
            else:
                # Nothing to download: report completion instead of
                # dividing by zero.
                percent = 100
            progress(percent)
            if percent >= 100:
                break
            write_data(infopath, (file_info, blocks))
        time.sleep(2)

if __name__ == '__main__':
    # Command-line entry point: parse the options, run the download and
    # report how long it took.
    import argparse
    cli = argparse.ArgumentParser(description='Download file by multi-threads.')
    cli.add_argument('url', type=str, help='url of the download file')
    cli.add_argument('-o', type=str, default=None, dest="output", help='output file')
    # The integer options share one shape; their dest doubles as the key
    # into ``defaults``.
    for flag, dest, help_text in (
            ('-t', "thread_count", 'thread counts to downloading'),
            ('-b', "buffer_size", 'buffer size'),
            ('-s', "block_size", 'block size')):
        cli.add_argument(flag, type=int, default=defaults[dest], dest=dest, help=help_text)
    # Without any arguments fall back to the built-in sample url.
    cli_args = sys.argv[1:] or ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    args = cli.parse_args(cli_args)
    started = time.time()
    download(args.url, args.output,
             args.thread_count, args.buffer_size, args.block_size)
    elapsed = int(time.time() - started)
    print('times: %ds' % elapsed)
Python 相关文章推荐
使用PYTHON创建XML文档
Mar 01 Python
Python pass 语句使用示例
Mar 11 Python
Python使用回溯法子集树模板解决爬楼梯问题示例
Sep 08 Python
修复 Django migration 时遇到的问题解决
Jun 14 Python
Django rest framework工具包简单用法示例
Jul 20 Python
利用python在excel里面直接使用sql函数的方法
Feb 08 Python
Django多数据库的实现过程详解
Aug 01 Python
python 实现屏幕录制示例
Dec 23 Python
pycharm激活方法到2099年(激活流程)
Sep 22 Python
Python中使用aiohttp模拟服务器出现错误问题及解决方法
Oct 31 Python
Python基础之常用库常用方法整理
Apr 30 Python
Python采集壁纸并实现炫轮播
Apr 30 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
PHP判断远程图片是否存在的几种方法
2014/05/04 PHP
php快递单号查询接口使用示例
2014/05/05 PHP
PHP实现搜索地理位置及计算两点地理位置间距离的实例
2016/01/08 PHP
php时间戳转换代码详解
2019/08/04 PHP
浅析javascript闭包 实例分析
2010/12/25 Javascript
iframe子页面与父页面在同域或不同域下的js通信
2014/05/07 Javascript
原生的html元素选择器类似jquery选择器
2014/10/15 Javascript
Javascript学习指南
2014/12/01 Javascript
jquery实现可自动判断位置的弹出层效果代码
2015/10/12 Javascript
Web前端开发工具——bower依赖包管理工具
2016/03/29 Javascript
js将json格式的对象拼接成复杂的url参数方法
2016/05/25 Javascript
Bootstrap布局组件教程之Bootstrap下拉菜单
2016/06/12 Javascript
jQuery联动日历的实例解析
2016/12/02 Javascript
如何在AngularJs中调用第三方插件库
2017/05/21 Javascript
js评分组件使用详解
2017/06/06 Javascript
解决VUE自定义拖拽指令时 onmouseup 与 click事件冲突问题
2020/07/24 Javascript
轻松掌握python设计模式之策略模式
2016/11/18 Python
Python学习笔记之解析json的方法分析
2017/04/21 Python
python3之微信文章爬虫实例讲解
2017/07/12 Python
python数字图像处理实现直方图与均衡化
2018/05/04 Python
python取均匀不重复的随机数方式
2019/11/27 Python
Win10里python3创建虚拟环境的步骤
2020/01/31 Python
CSS3动画效果回调处理详解
2014/12/10 HTML / CSS
若通过ObjectOutputStream向一个文件中多次以追加方式写入object,为什么用ObjectInputStream读取这些object时会产生StreamCorruptedException?
2016/10/17 面试题
机电专业毕业生求职信
2013/10/27 职场文书
会务接待方案
2014/02/27 职场文书
外贸采购员岗位职责
2014/03/08 职场文书
迎新晚会主持词
2014/03/24 职场文书
珍惜资源保护环境的建议书
2014/05/14 职场文书
员工安全责任书范本
2014/07/24 职场文书
中秋节国旗下演讲稿
2014/09/13 职场文书
标准单位租车协议书
2014/09/23 职场文书
2015年安全月活动总结
2015/03/26 职场文书
创业计划书之干洗店
2019/09/10 职场文书
Java基于Dijkstra算法实现校园导游程序
2022/03/17 Java/Android
vue中div禁止点击事件的实现
2022/04/02 Vue.js