Python读取HTML中指定元素生成Excel文件示例


Posted in Python on April 03, 2014

使用Python 2.7编写的示例:读取HTML中的指定元素,并生成Excel文件

#coding=gbk
import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup 
from xlrd import open_workbook
class LogMsg:
    """Small file-logging helper built on the root ``logging`` logger.

    Each instance attaches one ``FileHandler`` to the root logger and sets
    the root logger's level.  Any failure while initialising, writing or
    closing prints a short message and terminates the process with exit
    status 1.
    """

    # Numeric values of logging.DEBUG/INFO/WARNING/ERROR/CRITICAL.
    _STANDARD_LEVELS = (10, 20, 30, 40, 50)

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* and set the logger level.

        logfile -- path of the log file handed to ``logging.FileHandler``.
        Level   -- one of 10/20/30/40/50; any other value selects NOTSET.

        Raises SystemExit(1) if the logger cannot be set up (for example
        when the log file cannot be opened).
        """
        try:
            # Imported inside the try block so that an import failure is
            # reported the same way as any other initialisation error.
            import logging

            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)

            levels = {
                10: logging.DEBUG,
                20: logging.INFO,
                30: logging.WARNING,
                40: logging.ERROR,
                50: logging.CRITICAL,
            }
            self.logger.setLevel(levels.get(Level, logging.NOTSET))
        except Exception:
            # "except Exception" (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate untouched.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Write *logInfo* at the logger's current effective level.

        Logging at the effective level means the record is never filtered
        out by the logger itself.  If the effective level is not one of
        the five standard levels the record is written at INFO.

        Raises SystemExit(1) if writing the record fails.
        """
        Level = self.logger.getEffectiveLevel()
        try:
            if Level in self._STANDARD_LEVELS:
                self.logger.log(Level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close the log file.

        Raises SystemExit(1) if detaching or closing fails.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the file open; close it so
            # the descriptor is released and buffered output is flushed.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)
# Directory that is scanned for .html input files; the generated .xls file
# is written here as well.
DATAPATH = '/data/pyExample/'

# Read the clock once so both stamps describe the same instant (two separate
# reads could disagree across a second or midnight boundary).
_now = time.localtime()
Logtime = time.strftime("%Y%m%d%H%M%S", _now)   # used in the .xls file name
logFileTime = time.strftime("%Y%m%d", _now)     # used in the log file name

# One log file per day under <DATAPATH>/logs.
Logfile = os.path.join(DATAPATH, 'logs', 'htmlparser_%s.log' % logFileTime)
# 20 selects the INFO level in LogMsg.
log = LogMsg(Logfile, 20)

XLSname = 'dangjian_' + Logtime + '.xls'

if __name__ == '__main__':
    # Turn every .html file found in DATAPATH into one data row of a new
    # .xls workbook.  The text of every <td> element of a file is collected
    # into a flat list and selected positions of that list become columns.

    wbk = xlwt.Workbook(encoding = 'gbk')
    sheet = wbk.add_sheet('基本内容导入模板')

    # Header row (row 0).  The literals, including the trailing space in the
    # first one, are kept exactly as in the original template.
    headers = ['内容类型 ', '栏目名称', '栏目编号', '内容名称', '时长', '关键字',
               '看点', '作者', '来源', '子内容1', '子内容2']
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    # Sorted so that the row order does not depend on the unspecified order
    # returned by os.listdir().
    for f in sorted(os.listdir(DATAPATH)):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:'+f)

        # The with-statement closes the file even if reading raises.
        with codecs.open(os.path.join(DATAPATH, f), 'r', 'gbk') as htmlFile:
            lines = htmlFile.readlines()
        if not lines:
            log.output ('not line')

        content = []
        for line in lines:
            # strip() also removes the newline, so a blank line is detected
            # by an empty result (comparing the result to '\n' never matches).
            if not line.strip():
                log.output('该处是空行')
                continue
            # NOTE(review): this removes every space on the line before it
            # is parsed, including spaces inside tags -- confirm that this
            # is the intended character to strip.
            line = line.replace(' ','')
            soup = BeautifulSoup(line)
            for tdd in soup.findAll('td'):
                content.append(tdd.text.encode("gbk"))

        # Dump every extracted cell with its position, to stdout and the log.
        for idx, cell in enumerate(content):
            print('%d , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        # The highest position read below is 36; skip files that do not have
        # enough cells instead of aborting the run and losing the workbook.
        if len(content) <= 36:
            log.output('skip %s: expected at least 37 <td> cells, found %d'
                       % (f, len(content)))
            continue
        duration = ''.join(ch for ch in content[16] if ch.isdigit())
        if not duration:
            log.output('skip %s: no digits in duration cell' % f)
            continue

        folderName = content[6]
        contentName = content[4]
        # Presumably the cell holds minutes and the sheet wants seconds --
        # TODO confirm the unit.
        str_duration = str(int(duration) * 60)
        keyWord = content[6]
        desciption = content[36]
        videoName_1 = content[10]
        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(desciption)
        print(videoName_1)

        # Cell values in header order; empty strings are columns left blank.
        row = ['', folderName, '', contentName, str_duration, keyWord,
               desciption, '管理员', '华数编辑', videoName_1, '']
        # Log line: prefix, the comma-joined row, and one trailing comma.
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(os.path.join(DATAPATH, XLSname))
    print('=========================================')
    log.close()
Python 相关文章推荐
python实现的各种排序算法代码
Mar 04 Python
python修改注册表终止360进程实例
Oct 13 Python
Python中实现常量(Const)功能
Jan 28 Python
Python中字典映射类型的学习教程
Aug 20 Python
如何处理Python3.4 使用pymssql 乱码问题
Jan 08 Python
Python正则表达式教程之二:捕获篇
Mar 02 Python
利用Tkinter和matplotlib两种方式画饼状图的实例
Nov 06 Python
读取本地json文件,解析json(实例讲解)
Dec 06 Python
Python2/3中urllib库的一些常见用法
Dec 19 Python
使用Python AIML搭建聊天机器人的方法示例
Jul 09 Python
Python3爬虫mitmproxy的安装步骤
Jul 29 Python
详解Python 最短匹配模式
Jul 29 Python
python实现zencart产品数据导入到magento(python导入数据)
Apr 03 #Python
python模拟登陆阿里妈妈生成商品推广链接
Apr 03 #Python
python多线程抓取天涯帖子内容示例
Apr 03 #Python
python局域网ip扫描示例分享
Apr 03 #Python
python实现数通设备tftp备份配置文件示例
Apr 02 #Python
python实现巡检系统(solaris)示例
Apr 02 #Python
python实现apache网站日志分析示例
Apr 02 #Python
You might like
攻克CakePHP系列一 连接MySQL数据库
2008/10/22 PHP
php不用内置函数对数组排序的两个算法代码
2010/02/08 PHP
ThinkPHP模板判断输出Defined标签用法详解
2014/06/30 PHP
php过滤html标记属性类用法实例
2014/09/23 PHP
Yii使用DeleteAll连表删除出现报错问题的解决方法
2016/07/14 PHP
PHP入门教程之日期与时间操作技巧总结(格式化,验证,获取,转换,计算等)
2016/09/11 PHP
php中get_magic_quotes_gpc()函数说明
2017/02/06 PHP
javascript 三种方法实现获得和设置以及移除元素属性
2013/03/20 Javascript
js如何设置在iframe框架中指定div不显示
2013/12/04 Javascript
教你如何自定义百度分享插件以及bshare分享插件的分享按钮
2014/06/20 Javascript
javascript实现继承的简单实例
2015/07/26 Javascript
Javascript实现通过选择周数显示开始日和结束日的实现代码
2016/05/30 Javascript
总结JavaScript的正则与其他语言的不同之处
2016/08/25 Javascript
JS/jQuery判断DOM节点是否存在的简单方法
2016/11/24 Javascript
详解jQuery的表单验证插件--Validation
2016/12/21 Javascript
angular分页指令操作
2017/01/09 Javascript
基于VUE.JS的移动端框架Mint UI的使用
2017/10/11 Javascript
React Native 真机断点调试+跨域资源加载出错问题的解决方法
2018/01/18 Javascript
js中document.write和document.writeln的区别
2018/03/11 Javascript
vue.js使用v-pre与v-html输出HTML操作示例
2018/07/07 Javascript
微信小程序支付PHP代码
2018/08/23 Javascript
JS实现二维数组元素的排列组合运算简单示例
2019/01/28 Javascript
vue--vuex详解
2019/04/15 Javascript
Vue路由切换页面不更新问题解决方案
2020/07/10 Javascript
如何在JavaScript中正确处理变量
2020/12/25 Javascript
使用node-media-server搭建一个简易的流媒体服务器
2021/01/20 Javascript
python生成指定长度的随机数密码
2014/01/23 Python
python 生成器生成杨辉三角的方法(必看)
2017/04/10 Python
对numpy中布尔型数组的处理方法详解
2018/04/17 Python
python中设置超时跳过,超时退出的方式
2019/12/13 Python
python学习笔记之多进程
2020/08/06 Python
英国最红的高街时尚品牌:Topshop
2016/08/05 全球购物
Vans英国官方网站:美国南加州的原创极限运动潮牌
2017/01/20 全球购物
对公司合理化的建议书
2014/03/12 职场文书
幼儿园2014年度工作总结
2014/11/10 职场文书
大学军训心得体会800字
2016/01/11 职场文书