python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Global lock shared by the download workers and the progress monitor.
# Workers hold it while seeking/writing the shared output file object and
# advancing a block's offset; the monitor holds it while reading the offsets
# and writing the resume-info file.
lock = threading.Lock()

# Default tuning parameters (also used as the command-line defaults):
#   thread_count - maximum number of worker threads in the pool
#   buffer_size  - bytes requested per read() from an HTTP response
#   block_size   - bytes per block the file is split into
defaults = dict(thread_count=10,
    buffer_size=10*1024,
    block_size=1000*1024)

def progress(percent, width=50):
    """Draw a single-line text progress bar on stdout.

    The bar is left-justified in a field of *width* characters, followed by
    the numeric percentage and a carriage return so the next call redraws
    the same line.  Once *percent* reaches 100 a newline is written to
    finish the line.

    percent -- completion percentage to display.
    width   -- number of character cells used for the bar.
    """
    # Clamp the fill so an out-of-range percentage cannot overflow the bar.
    filled = max(0, min(width, width * int(percent) // 100))
    bar = ('%%-%ds' % width) % ('=' * filled)
    sys.stdout.write('%s %d%%\r' % (bar, percent))
    if percent >= 100:
        sys.stdout.write('\n')
    # Flush on every call: the line ends in '\r', not '\n', so a
    # line-buffered stdout would otherwise hold the bar back until the end.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle *data* and store it in *filepath*, replacing any existing file."""
    serialized = cPickle.dumps(data)
    with open(filepath, 'wb') as stream:
        stream.write(serialized)

def read_data(filepath):
    """Return the object unpickled from *filepath*.

    NOTE(review): unpickling executes whatever the file describes, so only
    point this at files written by ``write_data`` on a trusted machine.
    """
    with open(filepath, 'rb') as stream:
        payload = cPickle.load(stream)
    return payload

# Metadata describing the remote file, as built by get_file_info():
#   url          - the URL that was queried
#   name         - file name from Content-Disposition, else the URL path basename
#   size         - Content-Length as an int (0 when the header is absent)
#   lastmodified - raw Last-Modified header value ('' when absent)
FileInfo = namedtuple('FileInfo', 'url name size lastmodified')

def _name_from_disposition(disposition):
    """Return the bare file name carried by a Content-Disposition value, or ''."""
    marker = 'filename='
    start = disposition.lower().find(marker)
    if start == -1:
        return ''
    tail = disposition[start + len(marker):].strip()
    if tail[:1] in ('"', "'"):
        # Quoted value: take everything up to the matching closing quote so
        # that a ';' inside the quotes is kept and later parameters are not.
        closing = tail.find(tail[0], 1)
        name = tail[1:closing] if closing != -1 else tail[1:]
    else:
        # Unquoted token: it ends at the next parameter separator.
        name = tail.split(';', 1)[0].strip()
    # The header is server-controlled; keep only the last path component so
    # a value such as '../../x' cannot direct the download outside the
    # working directory.
    return os.path.basename(name.replace('\\', '/'))


def get_file_info(url):
    """Issue a HEAD request for *url* and describe the remote file.

    Returns a ``FileInfo(url, name, size, lastmodified)`` where

    * ``size`` is the integer Content-Length (0 when the header is missing),
    * ``lastmodified`` is the raw Last-Modified header ('' when missing),
    * ``name`` comes from the ``filename=`` parameter of Content-Disposition
      when one is present and non-empty, otherwise from the basename of the
      URL path.

    Errors raised by ``urllib2.urlopen`` propagate to the caller, as does the
    ``ValueError`` from a non-numeric Content-Length.
    """
    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        res.read()
        headers = res.headers
        size = int(headers.get('content-length', 0))
        lastmodified = headers.get('last-modified', '')
        name = _name_from_disposition(headers.get('content-disposition', ''))
    finally:
        # Release the connection even if a header turns out to be malformed.
        res.close()
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    """Download *url* to *output* with several threads, resuming if possible.

    The file is split into blocks ``[start, offset, end]`` where ``start``
    and ``end`` are the first and last byte of the block (both inclusive)
    and ``offset`` is the next byte still to be fetched; a block is complete
    once ``offset > end``.  Data is written to ``<output>.ing`` and the
    block list is checkpointed to ``<output>.inf``.  When both files exist
    and the saved url, name, size and last-modified value match the server's
    current answer, the download resumes from the saved offsets; otherwise
    it starts over.  On success the work file is renamed to *output* and
    the info file is removed.

    url          -- address of the file to fetch.
    output       -- destination path; ``None`` means "use the remote name".
    thread_count -- upper bound on concurrent worker threads.
    buffer_size  -- bytes per read from each HTTP response.
    block_size   -- bytes per block.

    Raises ValueError when *block_size* is not positive, when the server
    reports no usable size, or when no output name can be determined.
    Raises IOError when the workers finish but some block is still
    incomplete; the work and info files are kept so a later run can resume.
    """
    if block_size < 1:
        raise ValueError('block_size must be a positive number of bytes')
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        # Without a size the file cannot be split into ranges and the
        # progress percentage cannot be computed.
        raise ValueError('no usable Content-Length reported for %s' % url)
    # init path
    if output is None:
        output = file_info.name
    if not output:
        raise ValueError('cannot derive an output file name from %s; '
                         'pass one explicitly' % url)
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output
    last_byte = file_info.size - 1

    blocks = []
    if os.path.exists(infopath) and os.path.exists(workpath):
        # load blocks saved by an earlier, interrupted run
        saved_info, blocks = read_data(infopath)
        if (saved_info.url != url or
                saved_info.name != file_info.name or
                saved_info.size != file_info.size or
                saved_info.lastmodified != file_info.lastmodified):
            # The remote file changed: the partial data is worthless.
            blocks = []
        for block in blocks:
            # Earlier checkpoints could store end == size for a single-block
            # download; normalise to the inclusive last byte.
            block[2] = min(block[2], last_byte)
    if not blocks:
        # Split into block_size pieces; the last block absorbs the remainder
        # (and covers the whole file when it is smaller than one block).
        block_count = max(1, file_info.size // block_size)
        blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
                  for i in range(block_count)]
        blocks[-1][2] = last_byte
        # create new blank workpath
        with open(workpath, 'wb'):
            pass
    print('Downloading %s' % url)

    with open(workpath, 'rb+') as fobj:
        # Daemon thread: if a worker fails, the monitor must not keep the
        # process alive forever waiting for a 100% that never comes.
        monitor = threading.Thread(target=_monitor,
                                   args=(infopath, file_info, blocks))
        monitor.daemon = True
        monitor.start()
        # Ends are inclusive, so a block with offset == end still lacks
        # one byte and must be scheduled.
        tasks = [(url, block, fobj, buffer_size)
                 for block in blocks if block[1] <= block[2]]
        if tasks:
            pool = ThreadPool(min(thread_count, len(tasks)))
            try:
                pool.map(_worker, tasks)
            finally:
                # Wait for every worker before the file object is closed,
                # then checkpoint the final offsets so a failed run resumes
                # from exactly where it stopped.
                pool.close()
                pool.join()
                with lock:
                    write_data(infopath, (file_info, blocks))

    # Verify before touching the result: never rename a partial file or
    # discard the resume information for it.
    unfinished = [block for block in blocks if block[1] <= block[2]]
    if unfinished:
        raise IOError('download incomplete: %d block(s) still missing data; '
                      'run again to resume' % len(unfinished))
    # All blocks are complete, so the monitor reports 100% on its next poll
    # and exits; waiting here keeps it from running past the cleanup below.
    monitor.join()
    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(task):
    """Fetch the missing part of one block and write it into the shared file.

    *task* is the tuple ``(url, block, fobj, buffer_size)``:

    url         -- address of the file being downloaded.
    block       -- mutable ``[start, offset, end]`` list; ``offset`` is
                   advanced in place as data is written.
    fobj        -- output file object shared by all workers.
    buffer_size -- bytes requested per read from the response.

    Raises IOError when the server answers a request for a non-zero offset
    with a plain 200 response, because the body would then start at byte 0
    rather than at the requested position.
    """
    url, block, fobj, buffer_size = task
    if block[1] > block[2]:
        # Nothing left to fetch; an empty range would be rejected.
        return
    req = urllib2.Request(url)
    req.add_header('Range', 'bytes=%s-%s' % (block[1], block[2]))
    res = urllib2.urlopen(req)
    try:
        if res.getcode() == 200 and block[1] > 0:
            raise IOError('server ignored the Range request for %s' % url)
        while True:
            # Never write past the block end, even if the server sends more
            # than was asked for.
            remaining = block[2] - block[1] + 1
            if remaining <= 0:
                break
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # seek + write + offset update must be atomic with respect to
            # the other workers (shared file position) and the monitor.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Show progress and checkpoint the block list until 100% is reached.

    Each round, while holding the global ``lock``, the bytes fetched so far
    (``offset - start`` summed over *blocks*) are turned into a percentage of
    ``file_info.size`` and drawn with ``progress``.  Below 100% the pair
    ``(file_info, blocks)`` is saved to *infopath* and the loop sleeps two
    seconds; at 100% or more it ends without saving.
    """
    finished = False
    while not finished:
        with lock:
            received = 0
            for blk in blocks:
                received += blk[1] - blk[0]
            percent = received * 100 / file_info.size
            progress(percent)
            finished = percent >= 100
            if not finished:
                # Saved while the lock is held, so no worker can move an
                # offset in the middle of the checkpoint.
                write_data(infopath, (file_info, blocks))
        if not finished:
            time.sleep(2)

if __name__ == '__main__':
    import argparse

    # Command-line entry point: parse the options, run the download and
    # report the elapsed wall-clock time in whole seconds.
    parser = argparse.ArgumentParser(description='Download file by multi-threads.')
    parser.add_argument('url', type=str, help='url of the download file')
    parser.add_argument('-o', type=str, default=None, dest="output", help='output file')
    # The three integer options share a shape and default to `defaults`.
    for flag, dest, text in (
            ('-t', 'thread_count', 'thread counts to downloading'),
            ('-b', 'buffer_size', 'buffer size'),
            ('-s', 'block_size', 'block size')):
        parser.add_argument(flag, type=int, default=defaults[dest], dest=dest, help=text)
    # With no arguments at all, fall back to a sample URL.
    cli_args = sys.argv[1:] or ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    args = parser.parse_args(cli_args)
    began = time.time()
    download(args.url, args.output, args.thread_count, args.buffer_size, args.block_size)
    print('times: %ds' % int(time.time() - began))
Python 相关文章推荐
探究Python的Tornado框架对子域名和泛域名的支持
May 02 Python
Python爬虫之xlml解析库(全面了解)
Aug 08 Python
用Python实现读写锁的示例代码
Nov 05 Python
PySide和PyQt加载ui文件的两种方法
Feb 27 Python
django如何通过类视图使用装饰器
Jul 24 Python
简单了解Django应用app及分布式路由
Jul 24 Python
python3.7环境下安装Anaconda的教程图解
Sep 10 Python
django drf框架自带的路由及最简化的视图
Sep 10 Python
python 装饰器功能与用法案例详解
Mar 06 Python
使用python求斐波那契数列中第n个数的值示例代码
Jul 26 Python
Django ModelForm组件原理及用法详解
Oct 12 Python
python中的random模块和相关函数详解
Apr 22 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
Javascript String.replace的妙用
2009/09/08 Javascript
JavaScript 三种不同位置代码的写法
2009/10/25 Javascript
jQuery 白痴级入门教程
2009/11/11 Javascript
javascript 强制刷新页面的实现代码
2009/12/13 Javascript
使用Json比用string返回数据更友好,也更面向对象一些
2011/09/13 Javascript
JSON 数字排序多字段排序介绍
2013/09/18 Javascript
jQuery对指定元素中指定字符串进行替换的方法
2015/03/17 Javascript
jQuery子元素过滤选择器用法示例
2016/09/09 Javascript
D3.js实现雷达图的方法详解
2016/09/22 Javascript
使用AngularJS 跨站请求如何解决jsonp请求问题
2017/01/16 Javascript
轻松学习Javascript闭包
2017/03/01 Javascript
Ionic2开发环境搭建教程
2020/08/20 Javascript
layer.open 按钮的点击事件关闭方法
2018/08/17 Javascript
详解如何在微信小程序开发中正确的使用vant ui组件
2018/09/13 Javascript
JavaScript中常用的简洁高级技巧总结
2019/03/10 Javascript
javascript全局自定义鼠标右键菜单
2020/12/08 Javascript
[43:24]完美世界DOTA2联赛PWL S3 INK ICE vs DLG 第二场 12.12
2020/12/17 DOTA
Python处理中文标点符号大集合
2018/05/14 Python
Pandas 合并多个Dataframe(merge,concat)的方法
2018/06/08 Python
python输入多行字符串的方法总结
2019/07/02 Python
python统计函数库scipy.stats的用法解析
2020/02/25 Python
解决Django no such table: django_session的问题
2020/04/07 Python
Ubuntu18.04安装 PyCharm并使用 Anaconda 管理的Python环境
2020/04/08 Python
keras自定义回调函数查看训练的loss和accuracy方式
2020/05/23 Python
python3实现名片管理系统(控制台版)
2020/11/29 Python
Black Halo官方网站:购买连衣裙、礼服和连体裤
2018/06/13 全球购物
用JAVA实现一种排序,JAVA类实现序列化的方法(二种)
2014/04/23 面试题
给排水工程师岗位职责
2013/11/21 职场文书
工厂保洁员岗位职责
2013/12/04 职场文书
户外亲子活动策划方案
2014/02/07 职场文书
合作协议书范本
2014/04/17 职场文书
服务承诺口号
2014/05/22 职场文书
2016毕业实习单位评语大全
2015/12/01 职场文书
《自己去吧》教学反思
2016/02/16 职场文书
2016年学校综治宣传月活动总结
2016/03/16 职场文书
入党申请书格式
2019/06/20 职场文书