python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Module-level configuration shared by the functions below.
# NOTE: a `global` statement at module scope has no effect in Python; it
# only serves here as a list of the names the functions treat as globals.
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
# URL of the comic index page; opened with urlopen() in the __main__ block.
weburl=''
# Output folder; the chapter title is appended to it by plain string
# concatenation, so no path separator is inserted automatically.
floder=''
# Index of the first chapter to download (start value of the main loop).
chapterbegin=0
# Number of download threads currently counted as running.
currentthreadnum=0
# Upper bound compared against currentthreadnum before a new thread starts.
threadcount=6

# Command line: weburl floder [chapterbegin] [threadcount].
# The first two arguments are mandatory; otherwise print usage and exit
# (with status 0).
if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
# Optional third argument: first chapter index.
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
# Optional fourth argument: thread limit.
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
    """Return integer *i* written in base *jinzhi* as a lowercase string.

    Digit values 0-9 map to '0'-'9'; values from 10 upwards map to
    consecutive characters starting at 'a' (so base 36 uses 'a'-'z').

    The quotient is computed with exact integer division.  The previous
    ``int(i / jinzhi)`` went through a float and produced wrong digits
    for values above 2**53.

    Args:
        i: The number to convert.
        jinzhi: The base.

    Returns:
        The digits of *i*, most significant first; ``'0'`` for zero.
    """
    digits = []
    while True:
        answer = i % jinzhi
        if answer > 9:
            digits.append(chr(ord('a') + (answer - 10)))
        else:
            digits.append(str(answer))
        # Truncate toward zero (what int(i / jinzhi) did) using integer
        # arithmetic only, so a negative input still terminates.
        i = i // jinzhi if i >= 0 else -((-i) // jinzhi)
        if i == 0:
            break
    # Digits were produced least significant first.
    return "".join(reversed(digits))
def urlparse(p,a,c,k):
    """Expand single-character tokens in the packed string *p*.

    Every character of *p* in '0'-'9' or 'a'-'f' is treated as a base-36
    token index and replaced by the matching entry of *k*; all other
    characters are copied unchanged.  A token whose entry in *k* is
    empty or missing, or whose index is not below *c*, stands for
    itself.  Previously a missing entry raised KeyError or IndexError.

    Only single characters are substituted, so at most the first 16
    entries of *k* can ever be used.

    Args:
        p: The packed text.
        a: Unused; kept so the signature mirrors the packed call's
            argument list.
        c: Number of tokens to take from *k*.
        k: Sequence of replacement words, indexed by token value.

    Returns:
        *p* with every recognised token replaced.
    """
    # For indices below 16 the base-36 token is exactly this character.
    token_chars = '0123456789abcdef'
    d = {}
    for index in range(min(c, len(token_chars))):
        word = k[index] if index < len(k) else ''
        d[token_chars[index]] = word if word else token_chars[index]
    pieces = []
    for ch in p:
        if 'a' <= ch <= 'f' or '0' <= ch <= '9':
            # Unknown tokens pass through unchanged.
            pieces.append(d.get(ch, ch))
        else:
            pieces.append(ch)
    return ''.join(pieces)
def meispower(s):
    """Extract the image path from a packed ``eval(...)`` script string.

    Takes the text from the first ``}(`` onwards, drops its last 19
    characters, splits the rest on commas, and hands the first three
    fields plus the '|'-separated fourth field to ``urlparse``.  The
    first ``imgpath=...`` assignment in the unpacked text is returned
    without its first 10 and last 2 characters.

    NOTE(review): the fixed offsets (19, 10, 2) presumably match the
    packer output of the target site -- confirm against a live page.

    Raises:
        IndexError: if the ``}(`` tail, the fourth field, or the
            ``imgpath=`` assignment is not found.
        ValueError: if the second or third field is not an integer.
    """
    tail_pattern = re.compile(r"(?=\}\().*", re.IGNORECASE)
    packed_call = tail_pattern.findall(s)[0]
    packed_call = packed_call[0:(len(packed_call) - 19)]
    fields = packed_call.split(',')
    # Skip the opening quote of the keyword list before splitting it.
    keywords = fields[3][1:].split('|')
    unpacked = urlparse(fields[0], int(fields[1]), int(fields[2]), keywords)
    assignment = re.findall('imgpath=[^;]*', unpacked)[0]
    return assignment[10:-2]
def pictofile(weburl,filename,loop=100):
    """Download *weburl* into *filename*, retrying on failure.

    Nothing is downloaded if *filename* already exists.  A response
    shorter than 2048 bytes is treated as a failed download and retried.
    Errors are printed and never raised.

    The retry is a loop rather than recursion, and both the response and
    the output file are closed by context managers even when reading or
    writing fails.

    Args:
        weburl: URL of the picture.
        filename: Destination path, written in binary mode.
        loop: Retry budget; up to ``loop + 1`` attempts are made.  A
            negative value makes no attempt at all.

    Returns:
        None.
    """
    for _ in range(loop + 1):
        if os.path.exists(filename):
            return
        try:
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
        except socket.timeout:
            print('timeout')
            continue
        except Exception as e:
            print('error', e)
            continue
        if len(data) < 2048:
            # Too small to be a real picture; try again.
            continue
        print('download from %s name is %s\n' % (weburl, filename))
        try:
            with open('%s' % filename, 'wb') as myfile:
                myfile.write(data)
        except Exception as e:
            print('error', e)
            continue
        return
    print('can\'t download the picture %s' % weburl)
def downloadpic(url,loadpicdir,num):
    """Thread body: download one picture *url* into *loadpicdir*.

    The target folder is created if missing, with creation serialised
    by ``mutex2``.  The file is named
    ``<num>-manhua<trailing [0-9a-z.] run of url>`` and fetched with
    ``pictofile``.  ``currentthreadnum`` is decremented under ``mutex``
    on every exit path, so a failed worker no longer keeps its slot.

    The working directory is no longer changed: ``pictofile`` is given a
    path that already contains *loadpicdir*, and ``os.chdir`` in a worker
    thread affected the whole process and broke relative paths.

    Args:
        url: Picture URL.
        loadpicdir: Folder the picture is stored in.
        num: Value used as the file name prefix.
    """
    global currentthreadnum,mutex,mutex2
    try:
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    os.makedirs(loadpicdir,exist_ok=True)
                except OSError:
                    print("can't create floder %s"%loadpicdir)
                    return
                print('create floder succeed')
        # Trailing run of [0-9a-z.] in the URL, e.g. "12.jpg".
        name=re.findall(r'[0-9a-z.]*\Z',url)
        filename='manhua'+name[0]
        pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    finally:
        # Always free the slot so the spawning loop cannot stall.
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Download pictures *begin* .. *num*-1 of one chapter, one thread each.

    Fetches ``manhuaweb + url``, takes the chapter name from the
    ``<title>`` text up to the first underscore, and decodes the picture
    base path from the first ``eval...`` script with ``meispower``.  For
    each index it waits until fewer than ``threadcount`` workers are
    counted as running, reserves a slot, and starts ``downloadpic``.

    Args:
        url: Chapter path, appended to ``manhuaweb``.
        loadpicdir: Output prefix; the chapter name is appended to it by
            plain string concatenation.
        num: Exclusive upper picture index; also passed on to
            ``downloadpic``.
        begin: First picture index.

    Raises:
        IndexError: if the page has no ``<title>`` or ``eval`` script.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata=response.read().decode('UTF-8')
    # Drop the 7-character "<title>" prefix.
    chaptername=re.findall(r'<title>[^_]*',webdata)[0][7:]
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meispower(webscrip[0])
    for i in range(begin,num):
        slot_taken=False
        try:
            while currentthreadnum>=threadcount:
                time.sleep(0.5)
            with mutex:
                currentthreadnum=currentthreadnum+1
            slot_taken=True
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            # The worker never started, so give the slot back.  The old
            # "i=i-1" had no effect inside a for loop; the picture is
            # skipped, as before, but now reported.
            if slot_taken:
                with mutex:
                    currentthreadnum=currentthreadnum-1
            print('socket error, skip picture %d'%i)
        except Exception as error:
            if slot_taken:
                with mutex:
                    currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
if __name__=='__main__':
    # Shared state read by downloadchapter/downloadpic through globals.
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()
    mutex2=threading.Lock()

    # Fetch the comic index page.
    webfile=urllib.request.urlopen(weburl)
    webdata=webfile.read().decode('UTF-8')

    # Chapter list section, then the page-count labels and chapter links
    # found inside it.
    meshdata=re.findall(r'<div class="detail_body_right_sec_con">.*</div>',webdata)[0]
    indexdata=re.findall(r'([0-9]*页)',meshdata)
    picurldata=re.findall(r'/comic/[0-9/]*.html',meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')
    # Walk both lists from the last entry backwards, skipping the first
    # `chapterbegin` steps.
    for step in range(chapterbegin,chapterlength):
        position=chapterlength-step-1
        manhuachapter=picurldata[position]
        pagecount=int(nummode.findall(indexdata[position])[0])
        downloadchapter(manhuachapter,floder,pagecount)
Python 相关文章推荐
在Django的session中使用User对象的方法
Jul 23 Python
详解python开发环境搭建
Dec 16 Python
Python 比较文本相似性的方法(difflib,Levenshtein)
Oct 15 Python
详解Python静态网页爬取获取高清壁纸
Apr 23 Python
Django xadmin开启搜索功能的实现
Nov 15 Python
python3 Scrapy爬虫框架ip代理配置的方法
Jan 17 Python
150行Python代码实现带界面的数独游戏
Apr 04 Python
3种适用于Python的疯狂秘密武器及原因解析
Apr 29 Python
python pip如何手动安装二进制包
Sep 30 Python
如何用Python徒手写线性回归
Jan 25 Python
python中出现invalid syntax报错的几种原因分析
Feb 12 Python
python 学习GCN图卷积神经网络
May 11 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
php预定义变量使用帮助(带实例)
2013/10/30 PHP
php实现斐波那契数列的简单写法
2014/07/19 PHP
PHP生成静态HTML页面最简单方法示例
2015/04/09 PHP
phpinfo()中Loaded Configuration File(none)的解决方法
2017/01/16 PHP
thinkPHP5.1框架路由::get、post请求简单用法示例
2019/05/06 PHP
js几个不错的函数 $$()
2006/10/09 Javascript
为Yahoo! UI Extensions Grid增加内置的可编辑器
2007/03/10 Javascript
跟着Jquery API学Jquery之一 选择器
2010/04/07 Javascript
始终在屏幕中间显示Div的代码(css+js)
2011/03/10 Javascript
ASP.NET jQuery 实例15 通过控件CustomValidator验证CheckBoxList
2012/02/03 Javascript
JavaScript词法作用域与调用对象深入理解
2012/11/29 Javascript
jQuery 如何先创建、再修改、后添加DOM元素
2014/05/20 Javascript
node.js中的querystring.unescape方法使用说明
2014/12/10 Javascript
js不间断滚动的简单实现
2016/06/03 Javascript
javascript中闭包概念与用法深入理解
2016/12/15 Javascript
vue.js实现含搜索的多种复选框(附源码)
2017/03/23 Javascript
vuejs使用$emit和$on进行组件之间的传值的示例
2017/10/04 Javascript
详解Node.js模板引擎Jade入门
2018/01/19 Javascript
详解使用VueJS开发项目中的兼容问题
2018/08/02 Javascript
TypeScript高级用法的知识点汇总
2019/12/17 Javascript
ES6实现图片切换特效代码
2020/01/14 Javascript
Python实现的视频播放器功能完整示例
2018/02/01 Python
分析Python读取文件时的路径问题
2018/02/11 Python
Python查找第n个子串的技巧分享
2018/06/27 Python
Django中如何防范CSRF跨站点请求伪造攻击的实现
2019/04/28 Python
python 批量修改 labelImg 生成的xml文件的方法
2019/09/09 Python
Python提取PDF内容的方法(文本、图像、线条等)
2019/09/25 Python
澳大利亚女士时装在线:Rockmans
2018/09/26 全球购物
多媒体专业自我鉴定
2014/02/28 职场文书
学前教育专业求职信
2014/09/02 职场文书
用人单位终止解除劳动合同证明书
2014/10/06 职场文书
群众路线党员个人剖析材料
2014/10/08 职场文书
事业单位个人查摆问题及整改措施
2014/10/28 职场文书
领导欢迎词致辞
2015/01/23 职场文书
2015秋季开学演讲稿范文
2015/07/16 职场文书
2015年主婚人婚礼致辞
2015/07/28 职场文书