python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Script-wide settings read by the download functions below.
#   weburl           -- index page of the comic (argv[1], required)
#   floder           -- output folder prefix (argv[2], required)
#   chapterbegin     -- index of the first chapter to fetch (argv[3], default 0)
#   threadcount      -- maximum number of concurrent download threads (argv[4], default 6)
#   currentthreadnum -- number of download threads currently running
weburl, floder = '', ''
chapterbegin, currentthreadnum, threadcount = 0, 0, 6

# Both positional arguments are mandatory; without them show the usage and stop.
if len(sys.argv) < 3:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
    sys.exit(0)
weburl = sys.argv[1]
floder = sys.argv[2]
if len(sys.argv) > 3:
    chapterbegin = int(sys.argv[3])
if len(sys.argv) > 4:
    threadcount = int(sys.argv[4])
 
def jin(i,jinzhi):
    """Return the integer ``i`` written in base ``jinzhi`` as a string.

    Digit values 0-9 are rendered as '0'-'9'; values from 10 upward are
    rendered as 'a', 'b', ... so base 36 yields the alphabet 0-9a-z.

    Args:
        i: The integer to convert.
        jinzhi: The base (radix), expected to be a positive integer.

    Returns:
        The digits of ``i`` in base ``jinzhi``, most significant digit first.
        ``jin(0, b)`` is ``'0'``.
    """
    digits = []
    while True:
        answer = i % jinzhi
        # Exact integer division: the former int(i / jinzhi) went through a
        # float and corrupted values above 2**53. Negative inputs keep
        # truncating toward zero, as they did before.
        i = i // jinzhi if i >= 0 else -((-i) // jinzhi)
        if answer > 9:
            digits.append(chr(ord('a') + (answer - 10)))
        else:
            digits.append(str(answer))
        if i == 0:
            break
    # Digits were produced least significant first.
    digits.reverse()
    return ''.join(digits)
def urlparse(p,a,c,k):
    """Expand packed text by substituting single-character tokens.

    A lookup table is built for the indices ``c-1`` down to ``0``: the key is
    the base-36 form of the index (via ``jin``) and the value is ``k[index]``,
    or the key itself when ``k[index]`` is empty. Each character of ``p`` that
    is a digit ``0-9`` or a letter ``a-f`` is replaced by its table entry;
    every other character is copied through unchanged.

    Args:
        p: The packed text.
        a: Not used by this function.
        c: Number of entries of ``k`` to load into the table.
        k: Sequence of replacement words, indexed by token value.

    Returns:
        The expanded text.

    Raises:
        KeyError: ``p`` contains a ``0-9``/``a-f`` character that has no
            entry in the table.
        IndexError: ``c`` is larger than ``len(k)``.
    """
    table = {}
    remaining = c
    while remaining:
        remaining -= 1
        word = k[remaining]
        token = jin(remaining, 36)
        # An empty word means the token stands for itself.
        table[token] = word if word else token

    # Only these characters are treated as tokens; anything else is literal.
    token_chars = frozenset('0123456789abcdef')
    pieces = [table[ch] if ch in token_chars else ch for ch in p]
    return "".join(pieces)
def meispower(s):
    """Extract the image path prefix from a packed ``eval(...)`` script line.

    Everything from the first ``}(`` to the end of that line is taken, its
    last 19 characters are dropped (presumably the trailing
    ``'.split('|'),0,{}))`` of a packed script -- TODO confirm against the
    site), and the remainder is read as four comma-separated fields:
    payload, two integers, and a ``|``-separated word list. The payload is
    expanded with ``urlparse`` and the value assigned to ``imgpath`` is
    returned.

    Args:
        s: Script text containing the packed call.

    Returns:
        The ``imgpath`` value with its first 10 and last 2 characters
        (the ``imgpath=`` prefix and the surrounding quoting) removed.

    Raises:
        IndexError: ``s`` has no ``}(`` section, fewer than four fields, or
            the expanded text has no ``imgpath=`` assignment.
        ValueError: The second or third field is not an integer.
    """
    p=re.compile(r"(?=\}\().*",re.IGNORECASE)
    tail=p.findall(s)[0]
    tail=tail[0:(len(tail)-19)]
    # Split from the right: the payload (first field) may itself contain
    # commas, while the three fields after it never do. A plain split(',')
    # shifted every field whenever the payload had a comma in it.
    par=tail.rsplit(',',3)
    # Drop the opening quote of the word list before splitting it.
    answer=par[3][1:].split('|')
    chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
    allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
    return allurl[10:(len(allurl)-2)]
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists. A response
    shorter than 2048 bytes counts as a failed attempt, as does a timeout or
    any other exception. When all attempts are used up a message is printed
    and the function gives up.

    Args:
        weburl: URL of the picture.
        filename: Path of the file to create.
        loop: Number of retries allowed after the first attempt. A negative
            value means no attempt is made at all.

    Returns:
        None.
    """
    # Iterative retry instead of recursion: same number of attempts
    # (loop + 1) without growing the call stack.
    while loop>=0:
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
            # The context manager closes the response even if read() raises.
            with urllib.request.urlopen(weburl) as response:
                data=response.read()
            if len(data)<2048:
                continue
            print('download from %s name is %s\n'%(weburl,filename))
            try:
                with open(filename,'wb') as myfile:
                    myfile.write(data)
            except Exception:
                # Do not leave a truncated file behind: the existence check
                # above would mistake it for a finished download.
                if os.path.exists(filename):
                    os.remove(filename)
                raise
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            print('error',e)
    print('can\'t download the picture %s'%weburl)
def downloadpic(url,loadpicdir,num):
    """Download one picture into ``loadpicdir`` and free a worker slot.

    Used as a thread target by ``downloadchapter``. The folder is created
    when missing, the picture is saved as
    ``<loadpicdir>//<num>-manhua<trailing part of url>``, and
    ``currentthreadnum`` is decremented when the work ends, whether it
    succeeded or not.

    Args:
        url: URL of the picture.
        loadpicdir: Folder that receives the file.
        num: Value placed in front of the file name.

    Returns:
        None.
    """
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
        # mutex2 serialises folder creation between workers. The previous
        # code used os.chdir() as an existence test, which changed the
        # process-wide working directory (breaking relative folders) and
        # left mutex2 locked forever when mkdir failed.
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    os.makedirs(loadpicdir,exist_ok=True)
                except OSError:
                    print("can't create floder %s"%loadpicdir)
                    # Give up on this picture only; the slot is still
                    # released by the finally clause below.
                    return
                print('create floder succeed')
        name=mymode.findall(url)
        filename='manhua'+name[0]
        pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    finally:
        # Always hand the slot back, otherwise downloadchapter waits forever
        # for currentthreadnum to drop below threadcount.
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Download the pictures ``begin`` .. ``num-1`` of one chapter.

    The chapter page ``manhuaweb + url`` is fetched, its title and packed
    script are read to obtain the chapter name and the picture URL prefix,
    and one ``downloadpic`` thread is started per picture. At most
    ``threadcount`` threads run at once.

    Args:
        url: Path of the chapter page, appended to ``manhuaweb``.
        loadpicdir: Output folder prefix; the chapter name is appended to it
            directly, without a path separator.
        num: Number of pictures in the chapter (exclusive upper bound). It
            is also passed on to ``downloadpic`` as its ``num`` argument.
        begin: Index of the first picture to fetch.

    Returns:
        None.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    # Close the response as soon as the page has been read.
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata=response.read().decode('UTF-8')
    chaptername=re.findall(r'<title>[^_]*',webdata)[0]
    chaptername=chaptername[7:]  # strip the leading '<title>'
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meispower(webscrip[0])
    for i in range(begin,num):
        # Tracks whether this iteration has reserved a worker slot, so that
        # error paths give back exactly what was taken.
        slot_taken=False
        try:
            # Wait until a worker slot is free.
            while currentthreadnum>=threadcount:
                time.sleep(0.5)
            with mutex:
                currentthreadnum=currentthreadnum+1
            slot_taken=True
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            # The picture is skipped. The former 'i=i-1' here never caused a
            # retry, because the for loop reassigns i on the next iteration.
            if slot_taken:
                with mutex:
                    currentthreadnum=currentthreadnum-1
        except Exception as error:
            # The thread never started, so nobody else will release the slot.
            if slot_taken:
                with mutex:
                    currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
if __name__=='__main__':
    # Globals read by downloadchapter / downloadpic.
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()    # guards currentthreadnum
    mutex2=threading.Lock()   # guards output folder creation

    # Fetch the comic's index page; the context manager closes the
    # response, which was previously left open.
    with urllib.request.urlopen(weburl) as webfile:
        webdata=webfile.read().decode('UTF-8')

    # Restrict the search to the chapter list section of the page.
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    # Page-count labels, one per chapter.
    indexmode=re.compile(r'([0-9]*页)')
    indexdata=indexmode.findall(meshdata)
    # Chapter page paths, in the same order as the labels.
    picurlmode=re.compile(r'/comic/[0-9/]*.html')
    picurldata=picurlmode.findall(meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')
    # Chapter i is taken from the end of the list (presumably the page
    # lists the newest chapter first -- TODO confirm).
    for i in range(chapterbegin,chapterlength):
        position=chapterlength-i-1
        manhuachapter=picurldata[position]
        pagecount=int(nummode.findall(indexdata[position])[0])
        downloadchapter(manhuachapter,floder,pagecount)
Python 相关文章推荐
在Windows8上的搭建Python和Django环境
Jul 03 Python
python中函数传参详解
Jul 03 Python
python数据结构之链表的实例讲解
Jul 25 Python
用十张图详解TensorFlow数据读取机制(附代码)
Feb 06 Python
Python(Django)项目与Apache的管理交互的方法
May 16 Python
解决matplotlib库show()方法不显示图片的问题
May 24 Python
Python3日期与时间戳转换的几种方法详解
Jun 04 Python
Django实现文件上传下载
Oct 06 Python
浅谈Python3识别判断图片主要颜色并和颜色库进行对比的方法
Oct 25 Python
python小程序基于Jupyter实现天气查询的方法
Mar 27 Python
Python实现查找数据库最接近的数据
Jun 08 Python
python怎么提高计算速度
Jun 11 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
编写php应用程序实现摘要式身份验证的方法详解
2013/06/08 PHP
thinkphp中多表查询中防止数据重复的sql语句(必看)
2016/09/22 PHP
简明json介绍
2008/09/28 Javascript
解析Jquery中如何把一段html代码动态写入到DIV中(实例说明)
2013/07/09 Javascript
使用JavaScript实现网页版Pongo设计思路及源代码分享
2014/06/16 Javascript
JS实现从表格中动态删除指定行的方法
2015/03/31 Javascript
jQuery购物网页经典制作案例
2016/08/19 Javascript
JavaScript的兼容性与调试技巧
2016/11/22 Javascript
bootstrap日期插件daterangepicker使用详解
2017/10/19 Javascript
JS表单传值和URL编码转换
2018/03/03 Javascript
nodejs之koa2请求示例(GET,POST)
2018/08/07 NodeJs
jQuery判断自定义属性data-val用法示例
2019/01/07 jQuery
es6函数之箭头函数用法实例详解
2020/04/25 Javascript
深入讲解Python函数中参数的使用及默认参数的陷阱
2016/03/13 Python
django model去掉unique_together报错的解决方案
2016/10/18 Python
python executemany的使用及注意事项
2017/03/13 Python
python3连接MySQL数据库实例详解
2018/05/24 Python
解决Ubuntu pip 安装 mysql-python包出错的问题
2018/06/11 Python
简单了解python单例模式的几种写法
2019/07/01 Python
python 动态迁移solr数据过程解析
2019/09/04 Python
Python django搭建layui提交表单,表格,图标的实例
2019/11/18 Python
Python基础之函数基本用法与进阶详解
2020/01/02 Python
使用IPython或Spyder将省略号表示的内容完整输出
2020/04/20 Python
基于css3的属性transition制作菜单导航效果
2015/09/01 HTML / CSS
canvas像素画板的实现代码
2018/11/21 HTML / CSS
Yahoo-PHP面试题1
2016/07/20 面试题
送给程序员的20个Java集合面试问题
2014/08/06 面试题
中学生差生评语
2014/01/30 职场文书
服务理念标语
2014/06/18 职场文书
党员教师群众路线对照检查材料思想汇报
2014/09/29 职场文书
酒店辞职书范文
2015/02/26 职场文书
高中生思想道德自我评价
2015/03/09 职场文书
车间安全生产管理制度
2015/08/06 职场文书
2016计算机专业毕业生自荐信
2016/01/28 职场文书
详细总结Python常见的安全问题
2021/05/21 Python
Python中的matplotlib绘制百分比堆叠柱状图,并为每一个类别设置不同的填充图案
2022/04/20 Python