python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Script-wide settings. The functions below read these through their own
# ``global`` declarations; ``manhuaweb``, ``mutex`` and ``mutex2`` are created
# later, in the ``__main__`` section.
weburl = ''             # page fetched by the main section to list the chapters
floder = ''             # prefix the chapter name is appended to when saving
chapterbegin = 0        # index of the first chapter the main loop processes
currentthreadnum = 0    # number of download threads currently accounted for
threadcount = 6         # ceiling for currentthreadnum before the spawner waits

# Two positional arguments are mandatory; without them only the usage line
# is printed and the script stops (with exit status 0).
if len(sys.argv) < 3:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
    sys.exit(0)

weburl, floder = sys.argv[1], sys.argv[2]

# Optional third and fourth arguments override the defaults above.
if len(sys.argv) >= 4:
    chapterbegin = int(sys.argv[3])
if len(sys.argv) >= 5:
    threadcount = int(sys.argv[4])
 
def jin(i,jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are rendered as '0'-'9'; larger digit values are
    rendered as consecutive characters starting at 'a' (10 -> 'a',
    11 -> 'b', ...), so bases up to 36 yield the usual 0-9a-z alphabet.

    Args:
        i: Non-negative integer to convert.
        jinzhi: The radix (base) to convert to.

    Returns:
        The textual representation, most significant digit first.
        ``jin(0, base)`` is ``"0"``.

    Raises:
        ValueError: If ``i`` is negative.
    """
    if i < 0:
        raise ValueError("jin() only converts non-negative integers")
    digits = []
    while True:
        # divmod keeps the arithmetic in exact integers; float division
        # silently loses precision once the value exceeds 2**53.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + (remainder - 10)))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least significant first.
    digits.reverse()
    return "".join(digits)
def urlparse(p,a,c,k):
    """Expand the single-character tokens in ``p`` using the word list ``k``.

    A lookup table is built for the indices ``c-1`` down to ``0``: the key is
    the index written in base 36 (via ``jin``) and the value is ``k[index]``,
    or the key itself when that entry of ``k`` is empty.  Every character of
    ``p`` in the ranges 'a'-'f' or '0'-'9' is then replaced by its table
    entry; all other characters are copied unchanged.

    Args:
        p: Text containing the tokens to expand.
        a: Not used by this function.
        c: Number of entries of ``k`` to load into the table.
        k: Sequence of replacement words, indexed by token value.

    Returns:
        The expanded text.

    Raises:
        KeyError: If ``p`` holds a replaceable character with no table entry.
        IndexError: If ``c`` exceeds the length of ``k``.
    """
    table = {}
    remaining = c
    while remaining:
        remaining -= 1
        token = jin(remaining, 36)
        word = k[remaining]
        table[token] = word if word else token

    pieces = []
    for ch in p:
        code = ord(ch)
        is_hex_letter = ord('a') <= code <= ord('f')
        is_digit = ord('0') <= code <= ord('9')
        # Only these sixteen characters are treated as tokens.
        pieces.append(table[ch] if (is_hex_letter or is_digit) else ch)
    return "".join(pieces)
def meispower(s):
    """Extract the image path from a packed ``eval(...)`` script string.

    Steps, in order:
      1. Take the first substring of ``s`` that starts at ``}(`` and runs to
         the end of that line.
      2. Drop its last 19 characters.
         NOTE(review): presumably the trailing ``'.split('|'),0,{}))`` of the
         packed call -- confirm against a live page.
      3. Split the rest on commas; field 3, minus its first character, is a
         ``|``-separated word list.
      4. Expand field 0 with ``urlparse`` using fields 1 and 2 as integers.
      5. From the expanded text take the first ``imgpath=...`` run (up to the
         next ``;``) and return it without its first 10 and last 2
         characters.

    Args:
        s: Script text containing the packed call.

    Returns:
        The extracted path string.

    Raises:
        IndexError: If ``}(`` or ``imgpath=`` is not found, or fewer than
            four comma-separated fields are present.
        ValueError: If field 1 or field 2 is not an integer literal.
    """
    tail_pattern = re.compile(r"(?=\}\().*", re.IGNORECASE)
    packed_call = tail_pattern.findall(s)[0]
    # Keep the explicit len() arithmetic: it is not the same as [:-19] when
    # the string is shorter than 19 characters.
    packed_call = packed_call[0:(len(packed_call) - 19)]

    fields = packed_call.split(',')
    words = fields[3][1:].split('|')
    decoded = urlparse(fields[0], int(fields[1]), int(fields[2]), words)

    assignment = re.findall('imgpath=[^;]*', decoded)[0]
    return assignment[10:(len(assignment) - 2)]
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists.  A response
    shorter than 2048 bytes is treated as a failed download and retried, as
    is any timeout or other error.  Up to ``loop + 1`` attempts are made;
    when all of them fail a message is printed and the function returns
    without raising.

    Retries run in a plain loop rather than by recursion, so a long run of
    failures neither deepens the call stack nor nests exception handlers,
    and the HTTP response is closed even when reading it raises.

    Args:
        weburl: URL of the picture.
        filename: Path of the file to write (binary mode).
        loop: Number of retries allowed after the first attempt.  A negative
            value means no attempt is made at all.

    Returns:
        None.
    """
    attempts_left = loop
    while attempts_left >= 0:
        attempts_left -= 1
        # Checked on every attempt: the file may already be present.
        if os.path.exists(filename):
            return
        try:
            response = urllib.request.urlopen(weburl)
            try:
                data = response.read()
            finally:
                response.close()
            if len(data) < 2048:
                # Too small to be accepted as a picture; try again.
                continue
            print('download from %s name is %s\n'%(weburl,filename))
            with open('%s'%filename,'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            print('error',e)
    print('can\'t download the picture %s'%weburl)
def downloadpic(url,loadpicdir,num):
    """Thread body: save one picture into ``loadpicdir`` and free the slot.

    The target folder is created on demand while holding ``mutex2`` so that
    concurrent workers do not race on its creation.  The file is named
    ``<num>-manhua<tail>``, where ``<tail>`` is the trailing run of
    ``[0-9a-z.]`` characters of ``url``.  ``currentthreadnum`` is always
    decremented under ``mutex`` before returning, including when the folder
    cannot be created, so the spawning loop is never left waiting on a slot
    that will not be released.

    The working directory is deliberately left untouched: it is shared by
    every thread in the process, and changing it would make a relative
    ``loadpicdir`` resolve differently for later workers.

    Args:
        url: URL of the picture to download.
        loadpicdir: Folder the picture is written into.
        num: Value used as the numeric prefix of the file name.

    Returns:
        None.
    """
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
        try:
            with mutex2:
                if not os.path.isdir(loadpicdir):
                    print("can't open the floder %s will be create"%loadpicdir)
                    os.makedirs(loadpicdir)
                    print('create floder succeed')
        except OSError:
            print("can't create floder %s"%loadpicdir)
            return
        name=mymode.findall(url)
        filename='manhua'+name[0]
        pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    finally:
        # Release the worker slot on every exit path.
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Fetch one chapter page and start a download thread per picture.

    The chapter page at ``manhuaweb + url`` is fetched and decoded as UTF-8.
    The chapter name is the text after ``<title>`` up to the first ``_``, and
    the image base path is decoded from the first ``eval...`` script on the
    page with ``meispower``.  For every index in ``range(begin, num)`` a
    thread running ``downloadpic`` is started for ``<base><index>.jpg``.  At
    most ``threadcount`` slots are handed out at a time; the loop polls
    ``currentthreadnum`` every 0.5 s while all of them are taken.

    Args:
        url: Chapter path, appended to the global ``manhuaweb``.
        loadpicdir: Prefix that the chapter name is appended to (no
            separator is inserted) to form the target folder.
        num: Exclusive upper bound of the picture indices; also passed to
            ``downloadpic`` as its file-name prefix.
        begin: First picture index.

    Returns:
        None.

    Raises:
        IndexError: If the page has no ``<title>`` or no ``eval`` script.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    response=urllib.request.urlopen(manhuaweb+url)
    try:
        webdata=response.read()
    finally:
        # Close the connection even when the read fails.
        response.close()
    webdata=webdata.decode('UTF-8')
    chaptername=re.findall(r'<title>[^_]*',webdata)[0]
    chaptername=chaptername[7:]    # strip the leading '<title>'
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meispower(webscrip[0])
    targetdir=loadpicdir+chaptername
    for i in range(begin,num):
        # Wait for a free slot, then claim it before starting the worker.
        while currentthreadnum>=threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum=currentthreadnum+1
        try:
            threading.Thread(target=downloadpic,args=('%s%d.jpg'%(chapterurl,i),targetdir,num)).start()
        except socket.error:
            # The worker never ran, so give its slot back.  The picture is
            # skipped: rebinding the loop variable of a ``for`` loop cannot
            # repeat an iteration.
            with mutex:
                currentthreadnum=currentthreadnum-1
        except Exception as error:
            # Same rollback; otherwise the claimed slot would be lost for
            # every later chapter.
            with mutex:
                currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
if __name__=='__main__':
    # Shared state read by the download functions through ``global``.
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()     # guards currentthreadnum
    mutex2=threading.Lock()    # used by downloadpic around folder handling

    # Fetch the index page; close the connection even if reading fails.
    webfile=urllib.request.urlopen(weburl)
    try:
        webdata=webfile.read()
    finally:
        webfile.close()
    webdata=webdata.decode('UTF-8')

    # The chapter list is taken from the first matching <div> on the page.
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    # Page-count labels ("<digits>页") and chapter links.
    # NOTE(review): the two lists are indexed in parallel below -- confirm
    # the page always yields one label per link.
    indexmode=re.compile(r'([0-9]*页)')
    indexdata=indexmode.findall(meshdata)
    picurlmode=re.compile(r'/comic/[0-9/]*.html')
    picurldata=picurlmode.findall(meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')
    # Entries are consumed from the end of the list towards its start, so
    # chapterbegin counts from the last link found on the page.
    for i in range(chapterbegin,chapterlength):
        position=chapterlength-i-1
        pagecount=int(nummode.findall(indexdata[position])[0])
        downloadchapter(picurldata[position],floder,pagecount)
Python 相关文章推荐
使用Python的PIL模块来进行图片对比
Feb 18 Python
Python简单生成8位随机密码的方法
May 24 Python
Django forms组件的使用教程
Oct 08 Python
python opencv 图像拼接的实现方法
Jun 27 Python
python hough变换检测直线的实现方法
Jul 12 Python
基于python3实现倒叙字符串
Feb 18 Python
Python修改列表值问题解决方案
Mar 06 Python
python实现FTP文件传输的方法(服务器端和客户端)
Mar 20 Python
解决numpy矩阵相减出现的负值自动转正值的问题
Jun 03 Python
Python headers请求头如何实现快速添加
Nov 03 Python
Python实现迪杰斯特拉算法并生成最短路径的示例代码
Dec 01 Python
python+selenium自动化实战携带cookies模拟登陆微博
Jan 19 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
PHP mysql与mysqli事务使用说明 分享
2013/08/17 PHP
PHP实现图片自动清理的方法
2015/07/08 PHP
PHP静态成员变量和非静态成员变量详解
2017/02/14 PHP
PHP封装的分页类与简单用法示例
2019/02/25 PHP
学习YUI.Ext基础第一天
2007/03/10 Javascript
JS 对象介绍
2010/01/20 Javascript
JQuery 1.6发布 性能提升,同时包含大量破坏性变更
2011/05/10 Javascript
解决Extjs上传图片无法预览的解决方法
2012/03/22 Javascript
jQuery取得select选择的文本与值的示例
2013/12/09 Javascript
jQuery圆形统计图开发实例
2015/01/04 Javascript
jQuery中prependTo()方法用法实例
2015/01/08 Javascript
使用NodeJs 开发微信公众号(三)微信事件交互实例
2016/03/02 NodeJs
JSP基于Bootstrap分页显示实例解析
2016/06/12 Javascript
js精准的倒计时函数分享
2016/06/29 Javascript
JS中对Cookie的操作详解
2016/08/05 Javascript
Jquery EasyUI Datagrid右键菜单实现方法
2016/12/30 Javascript
详解webpack 如何集成第三方js库
2017/06/29 Javascript
浅谈ElementUI中switch回调函数change的参数问题
2018/08/24 Javascript
利用node.js开发cli的完整步骤
2020/12/29 Javascript
[03:03]DOTA2校园争霸赛 济南城市决赛欢乐发奖活动
2013/10/21 DOTA
打印出python 当前全局变量和入口参数的所有属性
2009/07/01 Python
python中列表元素连接方法join用法实例
2015/04/07 Python
详解Python中__str__和__repr__方法的区别
2015/04/17 Python
python aiohttp的使用详解
2019/06/20 Python
python GUI库图形界面开发之PyQt5切换按钮控件QPushButton详细使用方法与实例
2020/02/28 Python
python自动下载图片的方法示例
2020/03/25 Python
jupyter notebook opencv 显示一张图像的实现
2020/04/24 Python
python中有帮助函数吗
2020/06/19 Python
详解python实现可视化的MD5、sha256哈希加密小工具
2020/09/14 Python
如何用Matlab和Python读取Netcdf文件
2021/02/19 Python
美国最大的烧烤架和户外生活用品专业零售商:Barbeques Galore
2021/01/09 全球购物
国防教育标语
2014/10/08 职场文书
工作违纪检讨书范文
2015/01/26 职场文书
2015年小学数学教师工作总结
2015/05/20 职场文书
复兴之路观后感3000字
2015/06/02 职场文书
Jpa Specification如何实现and和or同时使用查询
2021/11/23 Java/Android