python实现爬虫下载漫画示例


Posted in Python onFebruary 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()
        
        webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)
        picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

        chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')
        i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1
Python 相关文章推荐
python自动安装pip
Apr 24 Python
由Python运算π的值深入Python中科学计算的实现
Apr 17 Python
Python 文件管理实例详解
Nov 10 Python
python交互式图形编程实例(一)
Nov 17 Python
Python读csv文件去掉一列后再写入新的文件实例
Dec 28 Python
python leetcode 字符串相乘实例详解
Sep 03 Python
Django中URL的参数传递的实现
Aug 04 Python
Python中注释(多行注释和单行注释)的用法实例
Aug 28 Python
python实现坦克大战
Apr 24 Python
Python如何进行时间处理
Aug 06 Python
详解Pycharm与anaconda安装配置指南
Aug 25 Python
一文详述 Python 中的 property 语法
Sep 01 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
PHP实现无限级分类(不使用递归)
2015/10/22 PHP
PHP实现上传多图即时显示与即时删除的方法
2017/05/09 PHP
JQuery中each()的使用方法说明
2010/08/19 Javascript
javascript记住用户名和登录密码(两种方式)
2015/08/04 Javascript
原生js实现class的添加和删除简单代码
2016/07/12 Javascript
js原生跨域_用script标签的简单实现
2016/09/24 Javascript
jquery配合.NET实现点击指定绑定数据并且能够一键下载
2016/10/28 Javascript
Angularjs的Controller间通信机制实例分析
2016/11/07 Javascript
Angularjs之filter过滤器(推荐)
2016/11/27 Javascript
详解AngularJS验证、过滤器、指令
2017/01/04 Javascript
JS+HTML5实现上传图片预览效果完整实例【测试可用】
2017/04/20 Javascript
详解vue 模版组件的三种用法
2017/07/21 Javascript
Express下采用bcryptjs进行密码加密的方法
2018/02/07 Javascript
分析javascript原型及原型链
2018/03/18 Javascript
vue 路由嵌套高亮问题的解决方法
2018/05/17 Javascript
实例讲解JavaScript预编译流程
2019/01/24 Javascript
eslint 的三大通用规则详解
2019/05/16 Javascript
在Vue mounted方法中使用data变量详解
2019/11/05 Javascript
vue el-table实现自定义表头
2019/12/11 Javascript
JS实现商城秒杀倒计时功能(动态设置秒杀时间)
2019/12/12 Javascript
Vue+tracking.js 实现前端人脸检测功能
2020/04/16 Javascript
微信小程序实现多选框功能的实例代码
2020/06/24 Javascript
antd-DatePicker组件获取时间值,及相关设置方式
2020/10/27 Javascript
[04:27]DOTA2官方论坛水友赛集锦
2013/09/16 DOTA
Python实现的科学计算器功能示例
2017/08/04 Python
python使用TensorFlow进行图像处理的方法
2018/02/28 Python
Python占用的内存优化教程
2019/07/28 Python
python 调用pyautogui 实时获取鼠标的位置、移动鼠标的方法
2019/08/27 Python
Django1.11配合uni-app发起微信支付的实现
2019/10/12 Python
AmazeUI中各种的导航式菜单与解决方法
2020/08/19 HTML / CSS
全球知名提供各类营养保健品的零售商:Vitamin Shoppe
2016/10/09 全球购物
Under Armour安德玛荷兰官网:美国高端运动科技品牌
2019/07/10 全球购物
地球鞋加拿大官网:Earth Shoes Canada
2020/11/17 全球购物
学生上课看漫画的检讨书
2014/09/26 职场文书
2014年会计工作总结
2014/11/27 职场文书
会计工作自我鉴定范文
2019/06/21 职场文书