Posted in Python on March 10, 2015
学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 reportlab 制作成 PDF。
crawler.py
#!/usr/bin/env python #coding=utf-8 """ Author: Anemone Filename: getmain.py Last modified: 2015-02-19 16:47 E-mail: anemone@82flex.com """ import urllib2 from bs4 import BeautifulSoup import re import sys reload(sys) sys.setdefaultencoding('utf-8') def getEachArticle(url): # response = urllib2.urlopen('http://www.52duzhe.com/2015_01/duzh20150104.html') response = urllib2.urlopen(url) html = response.read() soup = BeautifulSoup(html)#.decode("utf-8").encode("gbk")) #for i in soup.find_all('div'): # print i,1 title=soup.find("h1").string writer=soup.find(id="pub_date").string.strip() _from=soup.find(id="media_name").string.strip() text=soup.get_text()#.encode("utf-8") main=re.split("BAIDU_CLB.*;",text) result={"title":title,"writer":writer,"from":_from,"context":main[1]} return result #new=open("new.txt","w") #new.write(result["title"]+"\n\n") #new.write(result["writer"]+" "+result["from"]) #new.write(result["context"]) #new.close() def getCatalog(issue): url="http://www.52duzhe.com/"+issue[:4]+"_"+issue[-2:]+"/" firstUrl=url+"duzh"+issue+"01.html" firstUrl=url+"index.html" duzhe=dict() response = urllib2.urlopen(firstUrl) html = response.read() soup=BeautifulSoup(html) firstUrl=url+soup.table.a.get("href") response = urllib2.urlopen(firstUrl) html = response.read() soup = BeautifulSoup(html) all=soup.find_all("h2") for i in all: print i.string duzhe[i.string]=list() for link in i.parent.find_all("a"): href=url+link.get("href") print href while 1: try: article=getEachArticle(href) break except: continue duzhe[i.string].append(article) return duzhe def readDuZhe(duzhe): for eachColumn in duzhe: for eachArticle in duzhe[eachColumn]: print eachArticle["title"] if __name__ == '__main__': # issue=raw_input("issue(201501):") readDuZhe(getCatalog("201424"))
getpdf.py
#!/usr/bin/env python #coding=utf-8 """ Author: Anemone Filename: writetopdf.py Last modified: 2015-02-20 19:19 E-mail: anemone@82flex.com """ #coding=utf-8 import reportlab.rl_config from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont from reportlab.lib import fonts import copy from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables from reportlab.lib.styles import getSampleStyleSheet import crawler def writePDF(issue,duzhe): reportlab.rl_config.warnOnMissingFontGlyphs = 0 pdfmetrics.registerFont(TTFont('song',"simsun.ttc")) pdfmetrics.registerFont(TTFont('hei',"msyh.ttc")) fonts.addMapping('song', 0, 0, 'song') fonts.addMapping('song', 0, 1, 'song') fonts.addMapping('song', 1, 0, 'hei') fonts.addMapping('song', 1, 1, 'hei') stylesheet=getSampleStyleSheet() normalStyle = copy.deepcopy(stylesheet['Normal']) normalStyle.fontName ='song' normalStyle.fontSize = 11 normalStyle.leading = 11 normalStyle.firstLineIndent = 20 titleStyle = copy.deepcopy(stylesheet['Normal']) titleStyle.fontName ='song' titleStyle.fontSize = 15 titleStyle.leading = 20 firstTitleStyle = copy.deepcopy(stylesheet['Normal']) firstTitleStyle.fontName ='song' firstTitleStyle.fontSize = 20 firstTitleStyle.leading = 20 firstTitleStyle.firstLineIndent = 50 smallStyle = copy.deepcopy(stylesheet['Normal']) smallStyle.fontName ='song' smallStyle.fontSize = 8 smallStyle.leading = 8 story = [] story.append(Paragraph("<b>读者{0}期</b>".format(issue), firstTitleStyle)) for eachColumn in duzhe: story.append(Paragraph('__'*28, titleStyle)) story.append(Paragraph('<b>{0}</b>'.format(eachColumn), titleStyle)) for eachArticle in duzhe[eachColumn]: story.append(Paragraph(eachArticle["title"],normalStyle)) story.append(flowables.PageBreak()) for eachColumn in duzhe: for eachArticle in duzhe[eachColumn]: story.append(Paragraph("<b>{0}</b>".format(eachArticle["title"]),titleStyle)) story.append(Paragraph(" {0} 
{1}".format(eachArticle["writer"],eachArticle["from"]),smallStyle)) para=eachArticle["context"].split("") for eachPara in para: story.append(Paragraph(eachPara,normalStyle)) story.append(flowables.PageBreak()) #story.append(Paragraph("context",normalStyle)) doc = SimpleDocTemplate("duzhe"+issue+".pdf") print "Writing PDF..." doc.build(story) def main(issue): duzhe=crawler.getCatalog(issue) writePDF(issue,duzhe) if __name__ == '__main__': issue=raw_input("Enter issue(201501):") main(issue)
以上就是本文的全部内容了,希望大家能够喜欢。
Python爬取读者并制作成PDF
- Author -
hebedich
声明:登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。
Reply on: @reply_date@
@reply_contents@