Posted in Python on February 16, 2014
#!/usr/bin/python3.2
"""Download the pictures of every chapter of a comic on manhua.ali213.net.

Usage:
    downloadmanhua weburl floder chapterbegin=0 threadnnum=6

``weburl`` is the comic's index page, ``floder`` the directory that receives
one sub-folder per chapter, ``chapterbegin`` the number of chapters to skip
and ``threadnnum`` the maximum number of concurrent download threads.

Importing this module has no side effects; command-line parsing and all
network access happen in ``main()``.
"""
import os
import re
import socket
import sys
import threading
import time
import urllib
import urllib.request

# Host that serves the chapter reader pages (chapter links are relative to it).
manhuaweb = r'http://manhua.ali213.net'

# Command-line settings; main() overwrites them from sys.argv.
weburl = ''
floder = ''
chapterbegin = 0
threadcount = 6

# Number of download threads currently alive; every write is guarded by mutex.
currentthreadnum = 0
mutex = threading.Lock()
# Serialises creation of chapter folders between download threads.
mutex2 = threading.Lock()

# One packed token: a maximal run of ASCII word characters.
_TOKEN_RE = re.compile(r'\b\w+\b', re.ASCII)
# Longest URL suffix made only of digits, lower-case letters and dots.
_FILENAME_RE = re.compile(r'[0-9a-z.]*\Z')


def jin(i, jinzhi):
    """Return the non-negative integer *i* written in base *jinzhi*.

    Digits above 9 are rendered as 'a', 'b', ... so the result is only
    meaningful for bases 2 to 36.

    Raises:
        ValueError: if *i* is negative or *jinzhi* is smaller than 2.
    """
    if i < 0 or jinzhi < 2:
        raise ValueError('jin() needs i >= 0 and jinzhi >= 2')
    digits = []
    while True:
        # divmod keeps the arithmetic in exact integers; float division
        # would silently corrupt values above 2**53.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + remainder - 10))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    return ''.join(reversed(digits))


def _packer_token(index, base):
    """Return the token the JavaScript packer uses for word number *index*."""
    token = ''
    while True:
        index, digit = divmod(index, base)
        # Digits 36..61 are written as 'A'..'Z' (character code digit + 29).
        token = (chr(digit + 29) if digit > 35 else jin(digit, 36)) + token
        if index == 0:
            break
    return token


def urlparse(p, a, c, k):
    """Decode a script packed as ``eval(function(p,a,c,k,e,d){...})``.

    Args:
        p: the packed payload, in which every word is an encoded index.
        a: radix used to encode the indexes (values below 2 fall back to 36,
            the radix this function used historically).
        c: number of entries in the word table.
        k: the word table; an empty entry means "the token stands for itself".

    Returns:
        The payload with every known token replaced by its word. Tokens that
        are not in the table are left unchanged.
    """
    base = a if a >= 2 else 36
    table = {}
    for index in range(c):
        token = _packer_token(index, base)
        word = k[index] if index < len(k) else ''
        table[token] = word or token
    # Replace whole tokens, not single characters, so multi-character tokens
    # and tokens beyond 'f' are decoded correctly.
    return _TOKEN_RE.sub(
        lambda match: table.get(match.group(0), match.group(0)), p)


def meispower(s):
    """Extract the chapter's picture path from a packed ``eval(...)`` script.

    Args:
        s: the script text, ending with ``'.split('|'),0,{}))``.

    Returns:
        The value found after ``imgpath=`` in the decoded script, with the
        first 10 and last 2 characters of the ``imgpath=...`` match removed.

    Raises:
        ValueError: if the text does not have the expected packed layout.
    """
    call = re.search(r'\}\(.*', s)
    if call is None:
        raise ValueError('no packed script found')
    # Drop the 19-character tail "'.split('|'),0,{}))".
    arguments = call.group(0)[:-19]
    # Split from the right: radix, count and word list never contain a comma,
    # whereas the payload itself may.
    parts = arguments.rsplit(',', 3)
    if len(parts) != 4:
        raise ValueError('unexpected packed script layout')
    payload, radix, count, wordlist = parts
    words = wordlist[1:].split('|')  # [1:] removes the opening quote
    chapterpath = urlparse(payload, int(radix), int(count), words)
    found = re.search('imgpath=[^;]*', chapterpath)
    if found is None:
        raise ValueError('imgpath not found in decoded script')
    return found.group(0)[10:-2]


def pictofile(weburl, filename, loop=100):
    """Download *weburl* into *filename*, retrying on failure.

    Nothing is downloaded when *filename* already exists. A response shorter
    than 2048 bytes is treated as a failed attempt. At most ``loop + 1``
    attempts are made; when all of them fail a message is printed.
    """
    for _attempt in range(loop + 1):
        if os.path.exists(filename):
            return
        try:
            # The with-block closes the connection even when read() fails.
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
        except socket.timeout:
            print('timeout')
            continue
        except Exception as e:
            print('error', e)
            continue
        if len(data) < 2048:
            continue
        print('download from %s name is %s\n' % (weburl, filename))
        try:
            with open(filename, 'wb') as myfile:
                myfile.write(data)
        except Exception as e:
            print('error', e)
            continue
        return
    print('can\'t download the picture %s' % weburl)


def downloadpic(url, loadpicdir, num):
    """Thread body: download the picture at *url* into *loadpicdir*.

    The file is named ``<num>-manhua<last part of url>``. The folder is
    created when missing. The global thread counter is always decremented
    when the function ends, so a failure never blocks the scheduler.
    """
    global currentthreadnum
    try:
        try:
            with mutex2:
                if not os.path.isdir(loadpicdir):
                    print("can't open the floder %s will be create"
                          % loadpicdir)
                    os.makedirs(loadpicdir, exist_ok=True)
                    print('create floder succeed')
        except OSError:
            print("can't create floder %s" % loadpicdir)
            return
        # No os.chdir here: the working directory is shared by all threads.
        name = _FILENAME_RE.search(url).group(0)
        target = os.path.join(loadpicdir, str(num) + '-' + 'manhua' + name)
        pictofile(url, target)
    finally:
        with mutex:
            currentthreadnum -= 1


def downloadchapter(url, loadpicdir, num, begin=0):
    """Download pictures ``begin`` .. ``num - 1`` of one chapter.

    Args:
        url: chapter page path, relative to ``manhuaweb``.
        loadpicdir: parent folder; the chapter gets a sub-folder named after
            the page title (text between ``<title>`` and the first ``_``).
        num: number of pictures in the chapter.
        begin: index of the first picture to download.

    Raises:
        ValueError: if the page has no title or no packed script.
    """
    global currentthreadnum
    print(manhuaweb + url)
    with urllib.request.urlopen(manhuaweb + url) as response:
        webdata = response.read().decode('UTF-8')
    titles = re.findall(r'<title>[^_]*', webdata)
    scripts = re.findall(r'eval.*[^<>]', webdata)
    if not titles or not scripts:
        raise ValueError('unexpected chapter page layout: %s' % url)
    chaptername = titles[0][7:]  # strip the leading "<title>"
    chapterurl = 'http://mhimg.ali213.net' + meispower(scripts[0])
    # join() inserts the separator when the folder argument lacks a trailing
    # slash, so chapters always land inside the folder.
    chapterdir = os.path.join(loadpicdir, chaptername)
    for page in range(begin, num):
        # Wait until one of the threadcount slots is free.
        while currentthreadnum >= threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum += 1
        worker = threading.Thread(
            target=downloadpic,
            args=(r'%s%d.jpg' % (chapterurl, page), chapterdir, num))
        try:
            worker.start()
        except Exception as error:
            # The thread never ran, so give its slot back here.
            with mutex:
                currentthreadnum -= 1
            print(error, 'break')
            print('download chapter %d of picture make a error' % page)
            break


def main():
    """Parse the command line and download every requested chapter."""
    global weburl, floder, chapterbegin, threadcount
    if len(sys.argv) >= 3:
        weburl = sys.argv[1]
        floder = sys.argv[2]
    else:
        print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
        sys.exit(0)
    if len(sys.argv) >= 4:
        chapterbegin = int(sys.argv[3])
    if len(sys.argv) >= 5:
        # Fewer than one slot would make the scheduler wait forever.
        threadcount = max(1, int(sys.argv[4]))

    socket.setdefaulttimeout(60.0)
    with urllib.request.urlopen(weburl) as webfile:
        webdata = webfile.read().decode('UTF-8')
    meshmode = re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata = meshmode.findall(webdata)[0]
    indexdata = re.findall(r'([0-9]*页)', meshdata)
    picurldata = re.findall(r'/comic/[0-9/]*\.html', meshdata)
    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')
    # Entry i is read from the end of the link list.
    for i in range(chapterbegin, chapterlength):
        position = chapterlength - i - 1
        pagecount = int(nummode.findall(indexdata[position])[0])
        downloadchapter(picurldata[position], floder, pagecount)


if __name__ == '__main__':
    main()
python实现爬虫下载漫画示例
声明:登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。
Reply on: @reply_date@
@reply_contents@