python实现爬虫下载漫画示例


Posted in Python onFebruary 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()
        
        webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)
        picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

        chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')
        i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1
Python 相关文章推荐
Python中使用socket发送HTTP请求数据接收不完整问题解决方法
Feb 04 Python
Python实现矩阵相乘的三种方法小结
Jul 26 Python
Django uwsgi Nginx 的生产环境部署详解
Feb 02 Python
利用Python的sympy包求解一元三次方程示例
Nov 22 Python
Numpy之将矩阵拉成向量的实例
Nov 30 Python
python 实现list或string按指定分段
Dec 25 Python
Python找出列表中出现次数最多的元素三种方式
Feb 24 Python
python生成并处理uuid的实现方式
Mar 03 Python
pycharm工具连接mysql数据库失败问题
Apr 01 Python
python中的yield from语法快速学习
Nov 06 Python
tensorboard 可视化之localhost:6006不显示的解决方案
May 22 Python
使用Django实现商城验证码模块的方法
Jun 01 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
PHP 字符截取 解决中文的截取问题,不用mb系列
2009/09/29 PHP
PHP调用Linux的命令行执行文件压缩命令
2013/01/27 PHP
Symfony2安装的方法(2种方法)
2016/02/04 PHP
laravel 模型查询按照whereIn排序的示例
2019/10/16 PHP
js文字滚动停顿效果代码
2008/06/28 Javascript
JavaScript 字符串处理函数使用小结
2010/12/02 Javascript
javascript数组去掉重复
2011/05/12 Javascript
jQuery实现用户注册的表单验证示例
2013/08/28 Javascript
javascript进行数组追加方法小结
2014/06/16 Javascript
使用documentElement正确取得当前可见区域的大小
2014/07/25 Javascript
node.js中的events.EventEmitter.listenerCount方法使用说明
2014/12/08 Javascript
js运动应用实例解析
2015/12/28 Javascript
JS实现滑动门效果的方法详解
2016/12/19 Javascript
JavaScript中的遍历详解(多种遍历)
2017/04/07 Javascript
利用JS实现scroll自定义滚动效果详解
2017/10/17 Javascript
JS 使用 window对象的print方法实现分页打印功能
2018/05/16 Javascript
this在vue和小程序中的使用详解
2019/01/28 Javascript
关于微信小程序获取小程序码并接受buffer流保存为图片的方法
2019/06/07 Javascript
如何在VUE中使用vue-awesome-swiper
2021/01/04 Vue.js
python的tkinter布局之简单的聊天窗口实现方法
2014/09/03 Python
Python的Django框架中if标签的相关使用
2015/07/15 Python
python实现二分查找算法
2017/09/21 Python
python机器学习实战之最近邻kNN分类器
2017/12/20 Python
使用实现pandas读取csv文件指定的前几行
2018/04/20 Python
详谈python在windows中的文件路径问题
2018/04/28 Python
Python告诉你木马程序的键盘记录原理
2019/02/02 Python
Django框架ORM数据库操作实例详解
2019/11/07 Python
python路径的写法及目录的获取方式
2019/12/26 Python
Pycharm 2020.1 版配置优化的详细教程
2020/08/07 Python
纯css3(无图片/js)制作的几个社交媒体网站的图标
2013/03/21 HTML / CSS
HomeAway澳大利亚:预订你的度假屋,公寓、度假村、别墅等
2019/02/20 全球购物
Pandora西班牙官方商店:PandoraShop.es
2020/10/05 全球购物
英语系毕业生自荐信
2013/10/31 职场文书
预备党员2014全国两会学习心得体会
2014/03/10 职场文书
放飞梦想演讲稿200字
2014/08/26 职场文书
redis中lua脚本使用教程
2021/11/01 Redis