python读取html中指定元素生成excel文件示例


Posted in Python on April 03, 2014

Python2.7编写的读取html中指定元素,并生成excel文件

#coding=gbk
import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup 
from xlrd import open_workbook
class LogMsg:
    """Write messages to a log file through the root logger.

    The constructor attaches a ``logging.FileHandler`` for *logfile* to the
    root logger and sets the root logger's level.  ``output`` emits a message
    with the severity that matches the logger's effective level, and ``close``
    detaches the handler and releases the underlying file.

    Any failure inside a method prints a short message and terminates the
    process with exit status 1 (``SystemExit``).
    """

    # Numeric logging level -> name of the Logger method that emits at that
    # level.  Any other level is treated as NOTSET on construction and is
    # emitted through ``info`` by output().
    _LEVEL_METHODS = {
        10: 'debug',
        20: 'info',
        30: 'warning',
        40: 'error',
        50: 'critical',
    }

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* and set the root logger level.

        logfile -- path of the file the handler appends to.
        Level   -- one of 10/20/30/40/50 (DEBUG..CRITICAL); any other value
                   selects ``logging.NOTSET``.
        """
        try:
            import logging
            levels = {
                10: logging.DEBUG,
                20: logging.INFO,
                30: logging.WARNING,
                40: logging.ERROR,
                50: logging.CRITICAL,
            }
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            self.logger.setLevel(levels.get(Level, logging.NOTSET))
        except Exception:
            # "except Exception" (not a bare except) so Ctrl-C and an
            # explicit SystemExit are not reported as a logging failure.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Emit *logInfo* at the severity equal to the logger's effective level."""
        Level = self.logger.getEffectiveLevel()
        method_name = self._LEVEL_METHODS.get(Level, 'info')
        try:
            getattr(self.logger, method_name)(logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the root logger and close its file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() only detaches; close() is what flushes and
            # releases the open file descriptor.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)
# Read the clock once so the log-file date and the spreadsheet timestamp are
# derived from the same instant (two separate localtime() calls could land on
# different days when the script starts around midnight).
_startTime = time.localtime()
Logtime = time.strftime("%Y%m%d%H%M%S", _startTime)
logFileTime = time.strftime("%Y%m%d", _startTime)
# One log file per calendar day.
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, 20)  # 20 is the INFO level

# Directory that is scanned for *.html input files.
DATAPATH = '/data/pyExample/'
# Name of the generated workbook; carries the start timestamp.
XLSname = 'dangjian_' + Logtime + '.xls'

# Header row of the generated sheet, in column order.
_XLS_HEADERS = [
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
]

# Highest <td> position the script reads from one HTML file.
_LAST_NEEDED_CELL = 36


def _read_td_texts(path):
    """Return the gbk-encoded text of every <td> in the HTML file at *path*.

    The file is decoded as gbk and parsed one line at a time.
    """
    cells = []
    # "with" guarantees the file is closed even if reading/decoding fails.
    with codecs.open(path, 'r', 'gbk') as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output('not line')
    for line in lines:
        # strip() removes the newline too, so a blank line is an empty string.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): this removes every space, including the one between a
        # tag name and its attributes; possibly the original intent was to
        # drop "&nbsp;" entities -- confirm against real input.
        line = line.replace(' ', '')
        soup = BeautifulSoup(line)
        for tdd in soup.findAll('td'):
            cells.append(tdd.text.encode("gbk"))
    return cells


def _minutes_to_seconds(text):
    """Return the digits of *text* read as minutes, converted to a seconds string.

    Returns None when *text* contains no digit.
    """
    digits = ''.join(ch for ch in text if ch.isdigit())
    if not digits:
        return None
    return "%i" % (int(digits) * 60)


if __name__ == '__main__':

    wbk = xlwt.Workbook(encoding = 'gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_XLS_HEADERS):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:'+f)
        content = _read_td_texts(os.path.join(DATAPATH, f))

        # enumerate() gives the true position even when two cells hold the
        # same text (list.index() would always report the first match).
        for idx, cell in enumerate(content):
            print('%d , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        # A file with too few cells or no usable duration is skipped, so one
        # bad input cannot abort the run before the workbook is saved.
        if len(content) <= _LAST_NEEDED_CELL:
            log.output('skipped, fewer than %d td cells: ' % (_LAST_NEEDED_CELL + 1) + f)
            continue
        str_duration = _minutes_to_seconds(content[16])
        if str_duration is None:
            log.output('skipped, no digits in duration cell: ' + f)
            continue

        folderName = content[6]
        contentName = content[4]
        keyWord = content[6]
        desciption = content[36]
        videoName_1 = content[10]
        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(desciption)
        print(videoName_1)
        log.output('输出xls数据:'+','+folderName+',,'+contentName+','+str_duration+','+keyWord+','+desciption+',管理员,华数编辑,'+videoName_1+',,')
        print(k)

        row_values = [
            '',
            folderName,
            '',
            contentName,
            str_duration,
            keyWord,
            desciption,
            '管理员',
            '华数编辑',
            videoName_1,
            '',
        ]
        for col, value in enumerate(row_values):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)
    print('=========================================')
Python 相关文章推荐
Python3基础之基本运算符概述
Aug 13 Python
Python基于checksum计算文件是否相同的方法
Jul 09 Python
Python基于更相减损术实现求解最大公约数的方法
Apr 04 Python
python3中os.path模块下常用的用法总结【推荐】
Sep 16 Python
python基于pdfminer库提取pdf文字代码实例
Aug 15 Python
python yield关键词案例测试
Oct 15 Python
python飞机大战 pygame游戏创建快速入门详解
Dec 17 Python
pytorch 常用线性函数详解
Jan 15 Python
Python基于Dlib的人脸识别系统的实现
Feb 26 Python
Django 删除upload_to文件的步骤
Mar 30 Python
浅谈pytorch 模型 .pt, .pth, .pkl的区别及模型保存方式
May 25 Python
python 实现"神经衰弱"翻牌游戏
Nov 09 Python
python实现zencart产品数据导入到magento(python导入数据)
Apr 03 #Python
python模拟登陆阿里妈妈生成商品推广链接
Apr 03 #Python
python多线程抓取天涯帖子内容示例
Apr 03 #Python
python局域网ip扫描示例分享
Apr 03 #Python
python实现数通设备tftp备份配置文件示例
Apr 02 #Python
python实现巡检系统(solaris)示例
Apr 02 #Python
python实现apahce网站日志分析示例
Apr 02 #Python
You might like
ThinkPHP写第一个模块应用
2012/02/20 PHP
smarty学习笔记之常见代码段用法总结
2016/03/19 PHP
微信支付扫码支付php版
2016/07/22 PHP
laravel 5.4 + vue + vux + element的环境搭配过程介绍
2018/04/26 PHP
js 提交和设置表单的值
2008/12/19 Javascript
JavaScript 原型继承之构造函数继承
2011/08/26 Javascript
jQuery ajax(复习)—Baidu ajax request分离版
2013/01/24 Javascript
jquery禁止回车触发表单提交
2014/12/12 Javascript
jQuery中element选择器用法实例
2014/12/29 Javascript
JS+CSS实现带关闭按钮DIV弹出窗口的方法
2015/02/27 Javascript
用jQuery获取table中行id和td值的实现代码
2016/05/19 Javascript
js 判断一组日期是否是连续的简单实例
2016/07/11 Javascript
JavaScript解析JSON格式数据的方法示例
2017/01/24 Javascript
ECMAScript6--解构
2017/03/30 Javascript
基于Node的React图片上传组件实现实例代码
2017/05/10 Javascript
JavaScript使用Ajax上传文件的示例代码
2017/08/10 Javascript
vue实现搜索功能
2019/05/28 Javascript
element-ui tooltip修改背景颜色和箭头颜色的实现
2019/12/16 Javascript
JS数组属性去重并校验重复数据
2020/01/10 Javascript
Python守护线程用法实例
2017/06/23 Python
Python 将pdf转成图片的方法
2018/04/23 Python
python 处理微信对账单数据的实例代码
2019/07/19 Python
python opencv鼠标事件实现画框圈定目标获取坐标信息
2020/04/18 Python
UI自动化定位常用实现方法代码示例
2020/10/27 Python
使用PyCharm官方中文语言包汉化PyCharm
2020/11/18 Python
原生 JS+CSS+HTML 实现时序图的方法
2019/07/31 HTML / CSS
西班牙宠物用品和食品网上商店:Tiendanimal
2019/06/06 全球购物
介绍一下MD5加密算法
2016/11/12 面试题
医学检验专业大学生求职信
2013/11/18 职场文书
马云的职业生涯规划之路
2014/01/01 职场文书
化学教师教学反思
2014/01/17 职场文书
高三毕业典礼演讲稿
2014/05/13 职场文书
团员自我评价范文
2015/03/10 职场文书
小学语文教学随笔
2015/08/14 职场文书
在redisCluster中模糊获取key方式
2021/07/09 Redis
Android Flutter实现3D动画效果示例详解
2022/04/07 Java/Android