编程 Python

下载糗事百科的内容_python版

Posted in Python on December 07, 2008

#coding:utf-8
import contextlib
import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom
class logger(object):
    """Minimal console logger that prints each message part on its own line."""

    def log(self, *msg):
        """Print every positional argument on a separate line.

        Calling it without arguments prints nothing.
        """
        if msg:
            print(*msg, sep='\n')


# Shared logger instance; the other definitions in this file call Log.log().
Log = logger()
# Smoke-test message printed as soon as the module is loaded.
Log.log('测试下')
class downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the URL to fetch.

        Args:
            url: Address later passed to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return its body.

        Returns:
            The response body as returned by ``read()``, or ``None`` when
            the request failed for any reason.
        """
        Log.log('开始下载', self.url)
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as exc:
            # Best-effort download: report the failure and let the caller
            # skip this page.  Unlike a bare ``except:``, this does not
            # swallow KeyboardInterrupt or SystemExit.
            Log.log('下载出错', exc)
            return None
        Log.log('下载完毕')
        return content

class parser(object):
    """Extract the article text and the linked article IDs from a page."""

    # Slice bounds applied to each link's href to obtain the article ID.
    # They appear to assume hrefs of the form '/articles/<id>.htm'
    # (10-character prefix, 4-character suffix) -- TODO confirm against the
    # actual page markup.
    _ID_START = 10
    _ID_END = -4

    def __init__(self, content):
        """Parse ``content`` into a DOM tree.

        Args:
            content: Page markup handed to ``xml.dom.minidom.parseString``.
                Any error raised by that call propagates to the caller.
        """
        # Keep the parsed document (root node) for parse().
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the article text and the linked article IDs.

        Returns:
            A dict with two keys.  ``'content'`` holds the ``data`` of the
            first child node of the last ``<div class="content">`` that has
            such data (``''`` when there is none).  ``'url'`` lists the ID
            sliced out of the ``href`` of every ``<a>`` element that is the
            direct parent of a ``<span>``.
        """
        Log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        # Article text lives in <div class="content">.  getAttribute()
        # returns '' for a missing attribute, so no hasAttribute() check
        # is needed.
        for div in self.html.getElementsByTagName('div'):
            if div.getAttribute('class') != 'content':
                continue
            first_child = div.childNodes[0] if div.childNodes else None
            # An empty div, or one whose first child is an element instead
            # of text, has no ``data``; skip it rather than raising
            # IndexError / AttributeError.
            text = getattr(first_child, 'data', None)
            if text is not None:
                contents['content'] = text

        # Links to the previous / next article are <a> elements wrapping
        # a <span>.
        for span in self.html.getElementsByTagName('span'):
            link = span.parentNode
            # Not every parent node exposes ``tagName`` (for example when
            # the span is the document's root element), so look it up
            # defensively.
            if getattr(link, 'tagName', None) != 'a':
                continue
            href = link.getAttribute('href')
            contents['url'].append(href[self._ID_START:self._ID_END])

        Log.log('提取数据完毕')
        return contents
def downloadPage(qid, db):
    """Download one article page and record the findings in ``db``.

    Args:
        qid: Article ID; converted with ``str()`` and placed in the URL.
        db: Storage object providing ``updateContent``, ``addQID`` and
            ``updateStatus``.
    """
    page_url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    raw_page = downloader(page_url).download()
    if not raw_page:
        # Nothing was downloaded, so there is nothing to store.
        return

    extracted = parser(raw_page).parse()
    article_text = extracted['content']
    linked_ids = extracted['url']

    if article_text:
        db.updateContent(qid, article_text)
    for linked_id in linked_ids:
        db.addQID(linked_id)
    # Only a page that yielded exactly two linked IDs is flagged with
    # status 2.
    if len(linked_ids) == 2:
        db.updateStatus(qid, 2)
# Download pool: its size is the number of downloads allowed at one time.
class downloaderPool(object):
    """Fixed number of download slots fed from a list of pending IDs."""

    def __init__(self, maxLength=15):
        """Create an empty pool.

        Args:
            maxLength: Number of download slots.
        """
        self.downloaders = [None] * maxLength
        self.downloadList = []
        self.db = None

    def setDownloadList(self, downloadList):
        """Merge ``downloadList`` into the pending IDs, dropping duplicates.

        The merge goes through a ``set``, so the resulting order is
        arbitrary.
        """
        self.downloadList = list(set(self.downloadList + downloadList))

    def setdb(self, db):
        """Set the storage object handed to every download."""
        self.db = db

    def daemon(self):
        """Run one scheduling pass over the slots, then sleep one second.

        Slots whose thread has finished are freed, then every free slot is
        offered the next pending ID.
        """
        Log.log('设置守护进程')
        # Free the slots whose thread is no longer running.
        for index, worker in enumerate(self.downloaders):
            # Thread.is_alive() replaces isAlive(), which was removed in
            # Python 3.9.
            if worker and not worker.is_alive():
                Log.log('将下载器置空', index)
                self.downloaders[index] = None
        # Fill the free slots.
        for index, worker in enumerate(self.downloaders):
            if worker:
                continue
            qid = self.getQID()
            # A falsy ID (None when the list is empty) starts no download.
            if qid:
                t = threading.Thread(target=downloadPage, args=(qid, self.db))
                self.downloaders[index] = t
                t.start()
                # NOTE(review): joining right after start() makes the
                # downloads run one at a time.  Kept as-is so that
                # beginDownload() still returns only after the started
                # downloads have finished; confirm before removing.
                t.join()
                Log.log('设置下载器', index)
        # Pause one second at the end of the pass.
        time.sleep(1)

    def getQID(self):
        """Remove and return the first pending ID, or ``None`` if none is left."""
        if not self.downloadList:
            return None
        return self.downloadList.pop(0)

    def beginDownload(self):
        """Run ``daemon()`` in a daemon thread and wait for it to finish."""
        runner = threading.Thread(target=self.daemon)
        # Attribute form; Thread.setDaemon() is deprecated since Python 3.10.
        runner.daemon = True
        runner.start()
        runner.join()

    def getDownloader(self):
        """Return the index of the first free slot, or ``None`` if all are taken."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None

# SQL statements for the qiushibaike table; '?' marks a bound parameter.
# Insert a row holding an article ID and its status.
ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
# Store the text of the article with the given ID.
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
# Change the status of the article with the given ID.
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
# Select the IDs of all articles having the given status.
Q_LIST = 'select id from qiushibaike where success=?'
# Count the rows stored for the given ID.
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'
class dbConnect(object):
    """SQLite-backed store for downloaded articles.

    Table layout described by the original author::

        create table qiushibaike(
            id       Integer,
            content  Varchar,
            success  Integer
        )

    * ``id`` is the article ID.
    * ``content`` is the article text.
    * ``success`` tells whether the download is finished, i.e. the text
      was downloaded and the previous/next article IDs were obtained:
      1 means not finished, 2 means finished.

    Every method opens its own connection and closes it before returning.
    """

    def __init__(self, dbpath='db.sqlite'):
        """Remember the database file path.

        Args:
            dbpath: Path passed to ``sqlite3.connect``.
        """
        self.dbpath = dbpath

    @contextlib.contextmanager
    def _cursor(self):
        """Yield ``(connection, cursor)`` and close both on exit.

        Closing happens even when the body raises, so a failing statement
        no longer leaks the connection.
        """
        with contextlib.closing(sqlite3.connect(self.dbpath)) as cn:
            with contextlib.closing(cn.cursor()) as c:
                yield cn, c

    def addQID(self, qid):
        """Insert ``qid`` with status 1 (not finished).

        A database error during the insert is logged and swallowed, so one
        rejected ID does not abort the caller.
        """
        Log.log('插入糗事百科', qid)
        with self._cursor() as (cn, c):
            try:
                c.execute(ADD_Q_ID, (qid, 1))
                cn.commit()
            except sqlite3.Error as exc:
                Log.log('添加ID出错', qid, exc)
                # Do not report success after a failed insert.
                return
        Log.log('插入成功')

    def updateContent(self, qid, content):
        """Store ``content`` as the text of article ``qid``."""
        Log.log('更新糗事百科', qid, content)
        with self._cursor() as (cn, c):
            c.execute(UPDATE_Q_CONTENT, (content, qid))
            cn.commit()
        Log.log('更新成功')

    def updateStatus(self, qid, flag):
        """Set the status of article ``qid`` to ``flag``."""
        Log.log('更新状态', qid, flag)
        with self._cursor() as (cn, c):
            c.execute(UPDATE_Q_STATUS, (flag, qid))
            cn.commit()
        Log.log('更新状态成功')

    def getList(self, unDonloaded=1):
        """Return the IDs of all articles whose status equals ``unDonloaded``.

        Args:
            unDonloaded: Status value to filter on; the default 1 selects
                the articles that are not finished.

        Returns:
            A list holding the first column of every matching row.
        """
        Log.log('获得列表')
        with self._cursor() as (_cn, c):
            c.execute(Q_LIST, (unDonloaded,))
            ids = [row[0] for row in c.fetchall()]
        Log.log('获得列表成功')
        return ids
class singleDownloader(object):
    """Download the queued articles one after another in the calling thread."""

    def __init__(self):
        self.downloadList = []
        # Initialised here so the attribute exists before setdb() is called.
        self.db = None

    def setdb(self, db):
        """Set the storage object handed to every download."""
        self.db = db

    def setDownloadList(self, downloadList):
        """Merge ``downloadList`` into the pending IDs, dropping duplicates.

        The merge goes through a ``set``, so the resulting order is
        arbitrary.
        """
        self.downloadList = list(set(self.downloadList + downloadList))

    def beginDownload(self):
        """Download every pending ID, removing each one from the queue.

        Consuming the queue matters because setDownloadList() merges into
        the existing list: IDs left behind would be downloaded again on
        every later call.
        """
        while self.downloadList:
            qid = self.downloadList.pop(0)
            downloadPage(qid, self.db)
def main():
    """Keep downloading until the database lists no unfinished article."""
    db = dbConnect('db.sqlite')
    # The sequential downloader is used here.  A downloaderPool could be
    # wired in the same way: create it and call setdb(db).
    sp = singleDownloader()
    sp.setdb(db)
    dp = sp
    while True:
        pending = db.getList()
        # Stop as soon as nothing is left to download.
        if not pending:
            break
        # Queue the unfinished IDs and process them.
        dp.setDownloadList(pending)
        dp.beginDownload()
        time.sleep(1)


if __name__ == '__main__':
    main()

代码是没问题的，可以正常运行，但是希望做到以下2方面：
1、多线程下载
2、代码分离度更高，更面向对象

下载糗事百科的内容_python版

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

python发送arp欺骗攻击代码分析

Jan 16 Python

python实现通过pil模块对图片格式进行转换的方法

Mar 24 Python

Python实现的简单文件传输服务器和客户端

Apr 08 Python

python中管道用法入门实例

Jun 04 Python

Python初学时购物车程序练习实例(推荐)

Aug 08 Python

python自定义函数实现一个数的三次方计算方法

Jan 20 Python

利用Python半自动化生成Nessus报告的方法

Mar 19 Python

Django框架设置cookies与获取cookies操作详解

May 27 Python

pytorch 实现模型不同层设置不同的学习率方式

Jan 06 Python

Python如何把Spark数据写入ElasticSearch

Apr 18 Python

浅谈keras 模型用于预测时的注意事项

Jun 27 Python

python开发的自动化运维工具ansible详解

Aug 07 Python

python 参数列表中的self 显式不等于冗余

Dec 01 #Python

Python GAE、Django导出Excel的方法

Nov 24 #Python

Python类的基础入门知识

Nov 24 #Python

Python 连连看连接算法

Nov 22 #Python

python sqlobject(mysql)中文乱码解决方法

Nov 14 #Python

Python转码问题的解决方法

Oct 07 #Python

Python函数学习笔记

Oct 07 #Python

You might like

PHP下载大文件失败并限制下载速度的实例代码

2019/05/10 PHP

Thinkphp5 如何隐藏入口文件index.php(URL重写)

2019/10/16 PHP

GWT中复制到剪贴板 js+flash实现复制兼容性比较好

2010/03/07 Javascript

浅谈Javascript面向对象编程

2011/11/15 Javascript

关于jQuery UI 使用心得及技巧

2012/10/10 Javascript

window.location.href = window.location.href 跳转无反应 a超链接onclick事件写法

2013/08/21 Javascript

JS实现霓虹灯文字效果的方法

2015/08/06 Javascript

Jquery代码实现图片轮播效果（一）

2015/08/12 Javascript

js导出excel文件的简洁方法(推荐)

2016/11/02 Javascript

实现一个简单的vue无限加载指令方法

2017/01/10 Javascript

vue组件实例解析

2017/01/10 Javascript

JavaScript中如何判断一个值的类型

2017/09/15 Javascript

nodejs一个简单的文件服务器的创建方法

2019/09/13 NodeJs

node省市区三级数据性能测评实例分析

2019/11/06 Javascript

mpvue微信小程序开发之实现一个弹幕评论

2019/11/24 Javascript

JavaScript console的使用方法实例分析

2020/04/28 Javascript

JavaScript接口实现方法实例分析

2020/05/16 Javascript

vue 清空input标签中file的值操作

2020/07/21 Javascript

Python中eval带来的潜在风险代码分析

2017/12/11 Python

TensorFlow Session使用的两种方法小结

2018/07/30 Python

Python使用Selenium爬取淘宝异步加载的数据方法

2018/12/17 Python

Python Matplotlib 基于networkx画关系网络图

2019/07/10 Python

对Python获取屏幕截图的4种方法详解

2019/08/27 Python

Python将列表中的元素转化为数字并排序的示例

2019/12/25 Python

Python实现代码块儿折叠

2020/04/15 Python

怎样比较两个类型为String的字符串

2016/08/17 面试题

软件配置管理有什么好处

2015/04/15 面试题

销售人员中英文自荐信

2013/09/22 职场文书

2014年公司迎新年活动方案

2014/02/24 职场文书

幼儿园招生广告

2014/03/19 职场文书

横幅标语大全

2014/06/17 职场文书

社区服务活动报告

2015/02/05 职场文书

2015年综治宣传月活动总结

2015/03/25 职场文书

政审证明材料

2015/06/19 职场文书

Python使用OpenCV实现虚拟缩放效果

2022/02/28 Python

微信小程序APP的事件绑定以及传递参数时的冒泡和捕获

2022/04/19 Javascript