python实现爬虫下载漫画示例


Posted in Python onFebruary 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6") 
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])
 
def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()
        
        webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)
        picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

        chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')
        i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1
Python 相关文章推荐
跟老齐学Python之模块的加载
Oct 24 Python
Python用模块pytz来转换时区
Aug 19 Python
Python 数据处理库 pandas 入门教程基本操作
Apr 19 Python
redis之django-redis的简单缓存使用
Jun 07 Python
Python神奇的内置函数locals的实例讲解
Feb 22 Python
解决Python计算矩阵乘向量,矩阵乘实数的一些小错误
Aug 26 Python
Python 字符串、列表、元组的截取与切片操作示例
Sep 17 Python
python paramiko远程服务器终端操作过程解析
Dec 14 Python
在tensorflow中实现屏蔽输出的log信息
Feb 04 Python
python将dict中的unicode打印成中文实例
May 11 Python
python实现监听键盘
Apr 26 Python
Python pandas求方差和标准差的方法实例
Aug 04 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
php获取远程文件内容的函数
2015/11/02 PHP
PHP编程实现微信企业向用户付款的方法示例
2017/07/26 PHP
实例:用 JavaScript 来操作字符串(一些字符串函数)
2007/02/15 Javascript
发两个小东西,ASP/PHP 学习工具。 用JavaScript写的
2007/04/12 Javascript
经典海量jQuery插件 大家可以收藏一下
2010/02/07 Javascript
window.location.hash 使用说明
2010/11/08 Javascript
40个有创意的jQuery图片和内容滑动及弹出插件收藏集之二
2011/12/31 Javascript
推荐4个原生javascript常用的函数
2015/01/12 Javascript
JavaScript删除数组元素的方法
2015/03/20 Javascript
jquery判断复选框是否被选中的方法
2015/10/16 Javascript
超链接怎么正确调用javascript函数
2016/05/23 Javascript
深入理解逻辑表达式的用法 与或非的用法
2016/06/06 Javascript
jQuery生成假加载动画效果
2016/12/01 Javascript
JavaScript 输出显示内容(document.write、alert、innerHTML、console.log)
2016/12/14 Javascript
jQuery动态移除和添加背景图片的方法详解
2017/03/07 Javascript
bmob js-sdk 在vue中的使用教程
2018/01/21 Javascript
Node.js利用console输出日志文件的方法示例
2018/04/27 Javascript
JavaScript实现简单音乐播放器
2020/04/17 Javascript
WebGL three.js学习笔记之阴影与实现物体的动画效果
2019/04/25 Javascript
layui 表格操作列按钮动态显示的实现方法
2019/09/06 Javascript
Ant Design的Table组件去除
2020/10/24 Javascript
python中pygame模块用法实例
2014/10/09 Python
浅谈numpy数组的几种排序方式
2017/12/15 Python
python爬虫自动创建文件夹的功能
2018/08/01 Python
python爬虫scrapy基于CrawlSpider类的全站数据爬取示例解析
2021/02/20 Python
伦敦一卡通:The London Pass
2018/11/30 全球购物
英文求职信结束语大全
2013/10/26 职场文书
办公自动化毕业生求职信
2014/03/09 职场文书
专家推荐信模板
2014/05/09 职场文书
旅游节目策划方案
2014/05/26 职场文书
幸福中国演讲稿
2014/09/12 职场文书
单位工作证明
2014/10/07 职场文书
社区敬老月活动总结
2015/05/07 职场文书
初中班长竞选稿
2015/11/20 职场文书
《悬崖边的树》读后感2篇
2019/12/02 职场文书
MySQL数据库索引的最左匹配原则
2021/11/20 MySQL