Posted in Python on April 03, 2014
用 Python 2.7 编写的脚本:读取 HTML 中的指定元素,并生成 Excel 文件
#coding=gbk
"""Collect <td> cell text from GBK-encoded .html files and export it to .xls.

Every ``.html`` file directly inside ``DATAPATH`` is read line by line, the
text of each ``<td>`` element is gathered, and a fixed set of cell positions
is copied into one row of an xlwt workbook that is saved next to the inputs.

Written for Python 2.7: cell text is handled as GBK-encoded byte strings.
Each ``print(...)`` call takes a single argument, so it produces the same
output as the Python 2 print statement.
"""
import codecs
import logging
import os
import string
import sys
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook


class LogMsg:
    """Small wrapper that attaches a file handler to the root logger."""

    # Numeric levels accepted by __init__; any other value selects NOTSET.
    _LEVELS = (logging.DEBUG, logging.INFO, logging.WARNING,
               logging.ERROR, logging.CRITICAL)

    def __init__(self, logfile, Level=0):
        """Open ``logfile`` and register it as a handler on the root logger.

        logfile -- path of the log file handed to ``logging.FileHandler``.
        Level   -- 10/20/30/40/50 select DEBUG/INFO/WARNING/ERROR/CRITICAL;
                   any other value sets the logger to NOTSET.

        Prints "log init error!" and exits with status 1 on failure.
        """
        try:
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if Level in self._LEVELS:
                self.logger.setLevel(Level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # Exception (not a bare except) so Ctrl-C is not reported as a
            # logging failure.
            print("log init error!")
            sys.exit(1)

    def output(self, logInfo):
        """Log ``logInfo`` at the logger's own effective level.

        When the effective level is not one of the five standard levels the
        message is emitted through ``info``.
        Prints "log output error!" and exits with status 1 on failure.
        """
        level = self.logger.getEffectiveLevel()
        try:
            if level in self._LEVELS:
                self.logger.log(level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("log output error!")
            sys.exit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        Prints "log closed error!" and exits with status 1 on failure.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the log file open.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            sys.exit(1)


Logtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
logFileTime = time.strftime("%Y%m%d", time.localtime())
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, 20)
DATAPATH = '/data/pyExample/'
XLSname = 'dangjian_' + Logtime + '.xls'

# Header row of the sheet, in column order (the first title keeps its
# trailing space exactly as in the original template).
_HEADERS = (
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
)


def _extract_cells(path):
    """Return the GBK-encoded text of every <td> found in the file at path."""
    cells = []
    # The context manager closes the file even when reading raises.
    with codecs.open(path, 'r', 'gbk') as htmlFile:
        lines = htmlFile.readlines()
    if not lines:
        log.output('not line')
    for line in lines:
        # strip() also removes the newline, so a blank line is simply empty
        # afterwards; comparing the stripped text with '\n' never matches.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): as published this removes every space character,
        # including those inside tags; the literal may originally have been
        # an HTML entity such as &nbsp; -- confirm against the real input.
        line = line.replace(' ', '')
        soup = BeautifulSoup(line)
        for tdd in soup.findAll('td'):
            cells.append(tdd.text.encode("gbk"))
    return cells


def _build_row(cells):
    """Map the fixed cell positions onto the eleven sheet columns.

    Raises IndexError when a required cell (highest index used: 36) is
    missing, and ValueError when cell 16 contains no digits.
    """
    folderName = cells[6]
    contentName = cells[4]
    # Keep only the digits of the duration cell, then multiply by 60.
    digits = ''.join(ch for ch in cells[16] if ch.isdigit())
    str_duration = "%i" % (int(digits) * 60)
    keyWord = cells[6]
    description = cells[36]
    videoName_1 = cells[10]
    return ['', folderName, '', contentName, str_duration, keyWord,
            description, '管理员', '华数编辑', videoName_1, '']


def main():
    """Convert every .html file in DATAPATH into one row of a new .xls file."""
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADERS):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:' + f)
        content = _extract_cells(os.path.join(DATAPATH, f))

        # enumerate() reports the true position of each cell; list.index()
        # returns the first match, which is wrong for repeated cell text.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        try:
            row = _build_row(content)
        except (IndexError, ValueError) as err:
            # One short or malformed file must not abort the whole batch
            # before the workbook is saved.
            log.output('skip %s: %s' % (f, err))
            continue

        # Echo the extracted fields: folder, content name, duration,
        # keyword, description, first video name.
        for col in (1, 3, 4, 5, 6, 9):
            print(row[col])
        # The extra trailing comma reproduces the original log line format.
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)
    print('=========================================')
    log.close()


if __name__ == '__main__':
    main()
Python 读取 HTML 中指定元素生成 Excel 文件示例
声明:登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。
Reply on: @reply_date@
@reply_contents@