python实现爬虫下载漫画示例


Posted in Python onFebruary 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()
        
        webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)
        picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

        chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')
        i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1
Python 相关文章推荐
python下载文件时显示下载进度的方法
Apr 02 Python
Python函数中*args和**kwargs来传递变长参数的用法
Jan 26 Python
深入解析Python设计模式编程中建造者模式的使用
Mar 02 Python
Django重装mysql后启动报错:No module named ‘MySQLdb’的解决方法
Apr 22 Python
Python二叉树的镜像转换实现方法示例
Mar 06 Python
使用Python3内置文档高效学习以及官方中文文档
May 19 Python
用Anaconda安装本地python包的方法及路径问题(图文)
Jul 16 Python
python中的线程threading.Thread()使用详解
Dec 17 Python
python实现堆排序的实例讲解
Feb 21 Python
Python3.7在anaconda里面使用IDLE编译器的步骤详解
Apr 29 Python
Lombok插件安装(IDEA)及配置jar包使用详解
Nov 04 Python
Python request post上传文件常见要点
Nov 20 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
PHP使用json_encode函数时不转义中文的解决方法
2014/11/12 PHP
PHP  Yii清理缓存的实现方法
2016/11/10 PHP
浅谈Yii乐观锁的使用及原理
2017/07/25 PHP
PHP MVC框架中类的自动加载机制实例分析
2019/09/18 PHP
jQuery 使用手册(三)
2009/09/23 Javascript
Jquery练习之表单验证实现代码
2010/12/14 Javascript
jQuery为iframe的body添加click事件的实现代码
2011/04/07 Javascript
利用JS解决ie6不支持max-width,max-height问题的方法
2014/01/02 Javascript
Jquery Ajax方法传值到action的方法
2014/05/11 Javascript
JavaScript中实现依赖注入的思路分享
2015/01/15 Javascript
javascript中sort()的用法实例分析
2015/01/30 Javascript
jQuery的bind()方法使用详解
2015/07/15 Javascript
javascript+canvas实现刮刮卡抽奖效果
2015/07/29 Javascript
利用js来实现缩略语列表、文献来源链接和快捷键列表
2016/12/16 Javascript
Bootstrap 设置datetimepicker在屏幕上面弹出设置方法
2017/03/21 Javascript
js表单序列化判断空值的实例
2017/09/22 Javascript
总结javascript三元运算符知识点
2018/09/28 Javascript
Vue中消息横向滚动时setInterval清不掉的问题及解决方法
2019/08/23 Javascript
微信小程序实现选项卡滑动切换
2020/10/22 Javascript
Python中的下划线详解
2015/06/24 Python
python从入门到精通(DAY 3)
2015/12/20 Python
Python实现的摇骰子猜大小功能小游戏示例
2017/12/18 Python
python模块smtplib实现纯文本邮件发送功能
2018/05/22 Python
Python自动抢红包教程详解
2019/06/11 Python
Tensorflow的常用矩阵生成方式
2020/01/04 Python
TensorFlow实现指数衰减学习率的方法
2020/02/05 Python
python 对象真假值的实例(哪些视为False)
2020/12/11 Python
CSS3关于z-index不生效问题的解决
2020/02/19 HTML / CSS
Html5实现文件异步上传功能
2017/05/19 HTML / CSS
Java里面StringBuilder和StringBuffer有什么区别
2016/06/06 面试题
通信工程专业个人找工作求职信范文
2013/09/21 职场文书
应届毕业生求职信范文
2013/12/18 职场文书
通用自荐信范文
2014/03/14 职场文书
交通事故被告代理词
2015/05/23 职场文书
优秀党员主要事迹范文
2015/11/05 职场文书
Python实现DBSCAN聚类算法并样例测试
2021/06/22 Python