python实现爬虫下载漫画示例


Posted in Python onFebruary 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()
        
        webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)
        picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

        chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')
        i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1
Python 相关文章推荐
python中cPickle用法例子分享
Jan 03 Python
浅析Python基础-流程控制
Mar 18 Python
python 读取excel文件生成sql文件实例详解
May 12 Python
Python实现PS滤镜的旋转模糊功能示例
Jan 20 Python
Sanic框架应用部署方法详解
Jul 18 Python
tensorflow学习教程之文本分类详析
Aug 07 Python
python实现感知机线性分类模型示例代码
Jun 02 Python
python代码实现逻辑回归logistic原理
Aug 07 Python
python两个list[]相加的实现方法
Sep 23 Python
pycharm激活码免费分享适用最新pycharm2020.2.3永久激活
Nov 25 Python
Python如何使用logging为Flask增加logid
Mar 30 Python
Jupyter Notebook内使用argparse报错的解决方案
Jun 03 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
PHP中json_encode、json_decode与serialize、unserialize的性能测试分析
2010/06/09 PHP
php自动给文章加关键词链接的函数代码
2012/11/29 PHP
PHP中通过HTTP_USER_AGENT判断是否为手机移动终端的函数代码
2013/02/14 PHP
十幅图告诉你什么是PHP引用
2015/02/22 PHP
yii用户注册表单验证实例
2015/12/26 PHP
date.parse在IE和FF中的区别
2010/07/29 Javascript
JQuery实现简单验证码提示解决方案
2012/12/20 Javascript
通过正则格式化url查询字符串实现代码
2012/12/28 Javascript
JavaScript获取客户端计算机硬件及系统等信息的方法
2014/01/02 Javascript
JS根据变量保存方法名并执行方法示例
2014/04/04 Javascript
JavaScript实现的图像模糊算法代码分享
2014/04/22 Javascript
基于jquery实现的可编辑下拉框实现代码
2014/08/02 Javascript
JavaScript中的getDay()方法使用详解
2015/06/09 Javascript
jQuery实现灰蓝风格标准二级下拉菜单效果代码
2015/08/31 Javascript
基于Bootstrap实现图片轮播效果
2016/05/22 Javascript
js手动播放图片实现图片轮播效果
2016/09/17 Javascript
分享javascript、jquery实用代码段
2016/10/20 Javascript
Vue.js实现简单ToDoList 前期准备(一)
2016/12/01 Javascript
jQuery删除当前节点元素
2016/12/07 Javascript
Webpack 之 babel-loader文件预处理器详解
2018/03/23 Javascript
Vue项目中跨域问题解决方案
2018/06/05 Javascript
详解JavaScript中操作符和表达式
2018/09/12 Javascript
基于vue-cli搭建多模块且各模块独立打包的项目
2019/06/12 Javascript
vue@cli3项目模板怎么使用public目录下的静态文件
2020/07/07 Javascript
Python subprocess库的使用详解
2018/10/26 Python
Django forms表单 select下拉框的传值实例
2019/07/19 Python
Python 转换文本编码实现解析
2019/08/27 Python
学习Python列表的基础知识汇总
2020/03/10 Python
CSS3截取字符串实例代码【推荐】
2018/06/07 HTML / CSS
让ie浏览器成为支持html5的浏览器的解决方法(使用html5shiv)
2014/04/08 HTML / CSS
size?法国官网:英国伦敦的球鞋精品店
2020/03/15 全球购物
2014年敬老院工作总结
2014/12/08 职场文书
学生上课迟到检讨书
2015/01/01 职场文书
高二语文教学反思
2016/02/16 职场文书
课文《燕子》教学反思
2016/02/17 职场文书
用Python远程登陆服务器的步骤
2021/04/16 Python