python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Settings shared by the functions below.  manhuaweb, mutex and mutex2 are
# assigned later, in the __main__ section of the script.
currentthreadnum=0

# Command line: downloadmanhua weburl floder [chapterbegin] [threadcount]
# The first two arguments are mandatory; without them the usage line is
# printed and the script stops.
if len(sys.argv)<3:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
    sys.exit(0)
weburl=sys.argv[1]
floder=sys.argv[2]
# Optional arguments fall back to chapter 0 and 6 download threads.
chapterbegin=int(sys.argv[3]) if len(sys.argv)>=4 else 0
threadcount=int(sys.argv[4]) if len(sys.argv)>=5 else 6
 
def jin(i,jinzhi):
        """Return the non-negative integer ``i`` written in base ``jinzhi``.

        Digit values 0-9 are rendered as '0'-'9'; values from 10 upwards are
        rendered as 'a', 'b', ... in ASCII order (so bases up to 36 yield the
        usual lowercase alphanumeric digits).

        Parameters:
            i: integer to convert, must be >= 0.
            jinzhi: the base, must be >= 2.

        Returns:
            The digits as a string, most significant first ("0" for zero).

        Raises:
            ValueError: if ``i`` is negative or ``jinzhi`` is smaller than 2.
        """
        if i<0 or jinzhi<2:
                raise ValueError("jin() needs i >= 0 and jinzhi >= 2, got i=%r jinzhi=%r"%(i,jinzhi))
        digits=[]
        while True:
                # divmod keeps the arithmetic in exact integers; dividing with
                # "/" would go through a float and lose the low digits of
                # values above 2**53.
                i,answer=divmod(i,jinzhi)
                if answer>9:
                        digits.append(chr(ord('a')+(answer-10)))
                else:
                        digits.append(str(answer))
                if i==0:
                        break
        # Digits were produced least significant first.
        return "".join(reversed(digits))
def urlparse(p,a,c,k):
        """Expand the single-character tokens found in the packed string ``p``.

        A lookup table is built for the indexes ``c-1`` down to ``0``: the key
        is the index written in base 36 (via ``jin``) and the value is
        ``k[index]``, or the key itself when ``k[index]`` is empty.  Every
        character of ``p`` in the ranges '0'-'9' and 'a'-'f' is then replaced
        by its table entry; every other character is copied unchanged.

        Parameters:
            p: the packed text.
            a: accepted for signature compatibility; never read.
            c: number of entries of ``k`` to load into the table.
            k: list of replacement words, indexed by token number.

        Returns:
            The text with the tokens substituted.

        Raises:
            KeyError: if ``p`` contains a '0'-'9' / 'a'-'f' character that has
                no entry in the table.
            IndexError: if ``c`` is larger than ``len(k)``.
        """
        table={}
        remaining=c
        while remaining:
                remaining=remaining-1
                token=jin(remaining,36)
                # An empty word means the token stands for itself.
                table[token]=k[remaining] if k[remaining] else token
        # Only one-character tokens are looked up; longer tokens are handled
        # character by character.
        return "".join(
                table[ch] if ("a"<=ch<="f" or "0"<=ch<="9") else ch
                for ch in p)
def meispower(s):
        """Extract the image path from a packed ``eval(...)`` script line.

        Steps, all tied to the exact layout of the page script:
          1. take the text from the first "}(" to the end of that line and
             drop its last 19 characters;
          2. split it on ',' - field 0 is the packed text, fields 1 and 2 are
             integers, field 3 (minus its first character) is the
             '|'-separated word list;
          3. expand it with ``urlparse`` and locate ``imgpath=...`` (up to the
             next ';');
          4. return that match without its first 10 and last 2 characters.

        Raises:
            IndexError: if "}(" is not found, fewer than four fields are
                present, or the expanded text has no ``imgpath=`` assignment.
            ValueError: if field 1 or 2 is not an integer.
        """
        packedcall=re.findall(r"(?=\}\().*",s,re.IGNORECASE)[0]
        # Explicit end index on purpose: for strings shorter than 19
        # characters this is not the same as [:-19].
        packedcall=packedcall[0:(len(packedcall)-19)]
        fields=packedcall.split(',')
        words=fields[3][1:].split('|')
        expanded=urlparse(fields[0],int(fields[1]),int(fields[2]),words)
        assignment=re.findall('imgpath=[^;]*',expanded)[0]
        return assignment[10:(len(assignment)-2)]
def pictofile(weburl,filename,loop=100):
        """Download ``weburl`` into ``filename``, retrying on failure.

        Nothing is done when ``filename`` already exists.  A response shorter
        than 2048 bytes is treated as a failed download and retried, as is any
        timeout or other error.  Attempts are made while ``loop`` is >= 0
        (``loop`` is decremented once per attempt), so the default allows 101
        attempts; once it drops below zero a "can't download" message is
        printed and the function gives up.

        Parameters:
            weburl: URL of the picture.
            filename: path of the file to create.
            loop: remaining retry budget.

        Returns:
            None in every case; progress and errors are reported with print().
        """
        # Iterative retry: the budget is not limited by the recursion limit
        # and every attempt releases its response before the next one starts.
        while True:
                if loop<0:
                        print('can\'t download the picture %s'%weburl)
                        return
                loop=loop-1
                if os.path.exists(filename):
                        return
                try:
                        # "with" closes the response even when read() raises.
                        with urllib.request.urlopen(weburl) as url:
                                data=url.read()
                        if len(data)<2048:
                                # Too small to be a real picture: try again.
                                continue
                        print('download from %s name is %s\n'%(weburl,filename))
                        try:
                                with open('%s'%filename,'wb') as myfile:
                                        myfile.write(data)
                        except Exception:
                                # Do not leave a truncated file behind: the
                                # exists() check above would otherwise accept
                                # it as a finished download on the next pass.
                                if os.path.exists(filename):
                                        os.remove(filename)
                                raise
                        return
                except socket.timeout:
                        print('timeout')
                except Exception as e:
                        print('error',e)
def downloadpic(url,loadpicdir,num):
    """Worker: download one picture ``url`` into the folder ``loadpicdir``.

    The folder is created first when it does not exist (guarded by
    ``mutex2`` so concurrent workers do not race on the creation).  The file
    is stored as ``<loadpicdir>//<num>-manhua<tail>``, where ``<tail>`` is the
    trailing run of lowercase letters, digits and dots of ``url``.

    Whatever happens, the global ``currentthreadnum`` counter is decremented
    under ``mutex`` before the function returns, so the slot taken for this
    worker is always given back.

    Parameters:
        url: address of the picture.
        loadpicdir: target folder.
        num: number used as the file name prefix.
    """
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
        # The working directory is deliberately left alone: it is shared by
        # every thread, and changing it would break relative target paths.
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    os.makedirs(loadpicdir,exist_ok=True)
                except OSError:
                    print("can't create floder %s"%loadpicdir)
                    return
                print('create floder succeed')
        name=mymode.findall(url)
        filename='manhua'+name[0]
        pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    finally:
        # Release the slot on every exit path, including failures.
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
        """Download the pictures ``begin`` .. ``num-1`` of one chapter.

        The chapter page ``manhuaweb+url`` is fetched and decoded as UTF-8.
        The chapter name is the text after ``<title>`` up to the first '_';
        the picture path is decoded from the first ``eval...`` script line
        with ``meispower``.  One ``downloadpic`` thread is started per
        picture, and at most ``threadcount`` of them hold a slot at any time
        (tracked in the global ``currentthreadnum``).

        Parameters:
            url: chapter path, appended to ``manhuaweb``.
            loadpicdir: folder prefix; the chapter name is appended to it
                without any separator.
            num: number of pictures in the chapter (also passed on to
                ``downloadpic``).
            begin: index of the first picture to download.

        Raises:
            IndexError: if the page has no ``<title>`` or ``eval`` line.
        """
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        # "with" closes the HTTP response once the body has been read.
        with urllib.request.urlopen(manhuaweb+url) as response:
                webdata=response.read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        # Drop the leading "<title>" (7 characters).
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0])
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                # Wait for a free slot, then take it.
                while(currentthreadnum>=threadcount):
                        time.sleep(0.5)
                with mutex:
                        currentthreadnum=currentthreadnum+1
                try:
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        # The worker never started, so give its slot back.
                        # The loop then moves on to the next picture.
                        with mutex:
                                currentthreadnum=currentthreadnum-1
                        print('can\'t start the download of picture %d, skipped'%i)
                except Exception as error:
                        # Give the slot back here too; otherwise later
                        # chapters could wait forever for a free slot.
                        with mutex:
                                currentthreadnum=currentthreadnum-1
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        # Globals read by the download functions.
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()

        # Fetch the comic index page and keep only the chapter list section.
        webdata=urllib.request.urlopen(weburl).read().decode('UTF-8')
        meshdata=re.findall(r'<div class="detail_body_right_sec_con">.*</div>',webdata)[0]
        # Page-count labels ("<digits>页") and chapter links, in page order.
        indexdata=re.findall(r'([0-9]*页)',meshdata)
        picurldata=re.findall(r'/comic/[0-9/]*.html',meshdata)

        chapterlength=len(picurldata)
        # Walk the list from its last entry backwards, skipping the first
        # chapterbegin entries of that reversed order.
        for i in range(chapterbegin,chapterlength):
                position=chapterlength-i-1
                manhuachapter=picurldata[position]
                pagecount=int(re.findall(r'[\d]+',indexdata[position])[0])
                downloadchapter(manhuachapter,floder,pagecount)
Python 相关文章推荐
Python读写Excel文件的实例
Nov 01 Python
在Windows8上的搭建Python和Django环境
Jul 03 Python
python实现的生成随机迷宫算法核心代码分享(含游戏完整代码)
Jul 11 Python
Python实现网站文件的全备份和差异备份
Nov 30 Python
Python3一行代码实现图片文字识别的示例
Jan 15 Python
Centos部署django服务nginx+uwsgi的方法
Jan 02 Python
PyQt5实现简易计算器
May 30 Python
python利用多种方式来统计词频(单词个数)
May 27 Python
如何基于Python代码实现高精度免费OCR工具
Jun 18 Python
Scrapy爬虫文件批量运行的实现
Sep 30 Python
Python 爬取淘宝商品信息栏目的实现
Feb 06 Python
利用python做表格数据处理
Apr 13 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
IIS+PHP+MySQL+Zend配置 (视频教程)
2006/12/13 PHP
PHP类与对象中的private访问控制的疑问
2012/11/01 PHP
Laravel网站打开速度优化的方法汇总
2017/07/16 PHP
laravel接管Dingo-api和默认的错误处理方式
2019/10/25 PHP
最近项目写了一些js,水平有待提高
2009/01/31 Javascript
利用谷歌地图API获取点与点的距离的js代码
2012/10/11 Javascript
Javascript new Date().valueOf()的作用与时间戳由来详解
2013/04/24 Javascript
JavaScript原型链示例分享
2014/01/26 Javascript
jquery选择器原理介绍($()使用方法)
2014/03/25 Javascript
iframe调用父页面函数示例详解
2014/07/17 Javascript
JSON字符串转JSON对象
2015/07/31 Javascript
jquery实现可点击伸缩与展开的菜单效果代码
2015/08/31 Javascript
jquery实现加载进度条提示效果
2015/11/23 Javascript
浅谈JavaScript中小数和大整数的精度丢失
2016/05/31 Javascript
详解js中的apply与call的用法
2016/07/30 Javascript
JS锚点的设置与使用方法
2016/09/05 Javascript
vue-cli脚手架引入弹出层layer插件的几种方法
2019/06/24 Javascript
解决layui 表单元素radio不显示渲染的问题
2019/09/04 Javascript
jquery实现拖拽小方块效果
2020/12/10 jQuery
jquery实现穿梭框功能
2021/01/19 jQuery
[04:09]2014DOTA2国际邀请赛Ti西雅图 历届冠军相继出局 BBC综述今日比赛
2014/07/20 DOTA
python映射列表实例分析
2015/01/26 Python
Python2.X/Python3.X中urllib库区别讲解
2017/12/19 Python
django文档学习之applications使用详解
2018/01/29 Python
Python3.6笔记之将程序运行结果输出到文件的方法
2018/04/22 Python
python 对txt中每行内容进行批量替换的方法
2018/07/11 Python
Numpy中的mask的使用
2018/07/21 Python
出门问问全球官方商城:Tichome音箱和TicWatch智能手表
2017/12/02 全球购物
Lookfantastic俄罗斯:欧洲在线化妆品零售商
2019/08/06 全球购物
个性发展自我评价
2014/02/11 职场文书
校园活动宣传方案
2014/03/28 职场文书
药学职务聘任书
2014/03/29 职场文书
师恩难忘教学反思
2014/04/27 职场文书
爱祖国爱家乡演讲稿
2014/09/02 职场文书
教师思想作风整顿个人剖析材料
2014/10/10 职场文书
python基于OpenCV模板匹配识别图片中的数字
2021/03/31 Python