python实现爬虫下载漫画示例


Posted in Python onFebruary 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()
        
        webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)
        picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

        chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')
        i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1
Python 相关文章推荐
python的常见命令注入威胁
Feb 18 Python
python使用socket向客户端发送数据的方法
Apr 29 Python
Python爬取网页中的图片(搜狗图片)详解
Mar 23 Python
python引入导入自定义模块和外部文件的实例
Jul 24 Python
python分布式环境下的限流器的示例
Oct 26 Python
python验证码识别教程之利用滴水算法分割图片
Jun 05 Python
Python发送邮件测试报告操作实例详解
Dec 08 Python
Python----数据预处理代码实例
Mar 20 Python
python3+PyQt5 数据库编程--增删改实例
Jun 17 Python
50行Python代码实现视频中物体颜色识别和跟踪(必须以红色为例)
Nov 20 Python
使用python检查yaml配置文件是否符合要求
Apr 09 Python
python百行代码实现汉服圈图片爬取
Nov 23 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
php实现删除空目录的方法
2015/03/16 PHP
PHP随手笔记整理之PHP脚本和JAVA连接mysql数据库
2015/11/25 PHP
php 使用curl模拟ip和来源进行访问的实现方法
2017/05/02 PHP
PHP preg_match实现正则表达式匹配功能【输出是否匹配及匹配值】
2017/07/19 PHP
模拟多级复选框效果的jquery代码
2013/08/13 Javascript
有关Promises异步问题详解
2015/11/13 Javascript
AngularJS实现标签页的两种方式
2016/09/05 Javascript
JavaScript实现瀑布流以及加载效果
2017/02/11 Javascript
Javascript实现页面滚动时导航智能定位
2017/05/06 Javascript
详解HTTPS 的原理和 NodeJS 的实现
2017/07/04 NodeJs
jquery动态赋值id与动态取id方法示例
2017/08/21 jQuery
weex里Vuex state使用storage持久化详解
2017/09/09 Javascript
JavaScript编程设计模式之构造器模式实例分析
2017/10/25 Javascript
详解webpack3编译兼容IE8的正确姿势
2017/12/21 Javascript
vue-cli 3.x 修改dist路径的方法
2018/09/19 Javascript
node.js实现为PDF添加水印的示例代码
2018/12/05 Javascript
vue实现Excel文件的上传与下载功能的两种方式
2019/06/28 Javascript
[02:47]2018年度DOTA2最佳辅助位选手4号位-完美盛典
2018/12/17 DOTA
python 控制语句
2011/11/03 Python
Python多层嵌套list的递归处理方法(推荐)
2016/06/08 Python
Python调用C语言的方法【基于ctypes模块】
2018/01/22 Python
Python图像处理之颜色的定义与使用分析
2019/01/03 Python
keras实现调用自己训练的模型,并去掉全连接层
2020/06/09 Python
Python matplotlib图例放在外侧保存时显示不完整问题解决
2020/07/28 Python
python 解决selenium 中的 .clear()方法失效问题
2020/09/01 Python
HTML5 MiranaVideo播放器 (代码开源)
2010/06/11 HTML / CSS
Booking.com西班牙:全球酒店预订
2018/03/30 全球购物
市场部规章制度
2014/01/24 职场文书
《商鞅南门立木》教学反思
2014/02/16 职场文书
代理协议书范本
2014/04/22 职场文书
师范学院毕业生求职信
2014/06/24 职场文书
人事行政专员岗位职责
2014/07/23 职场文书
党的群众路线教育实践活动剖析材料
2014/09/30 职场文书
负责培养人意见
2015/06/05 职场文书
读《庄子》有感:美而不自知
2019/11/06 职场文书
golang 实用库gotable的具体使用
2021/07/01 Golang