Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地


Posted in Python onFebruary 23, 2018

本文实例为大家分享了Python抓取聚划算商品页面获取商品信息并保存的具体代码,供大家参考,具体内容如下

#!/user/bin/python 
# -*- coding: gbk -*- 
#Spider.py 
 
import urllib2 
import httplib 
import StringIO 
import gzip 
import re 
import chardet 
import sys 
import os 
import datetime 
from xml.dom.minidom import Document 
from BeautifulSoup import BeautifulSoup 
 
## 这段代码是用于解决控制台打印汉字报错的问题 
reload(sys) 
sys.setdefaultencoding("utf8") 
##################################################### 
 
## debug模式开关,开启后可以看到Http请求的头部信息以及debug日志 
DEBUG = 1 
NO_DEBUG = 0 
httplib.HTTPConnection.debuglevel = DEBUG 
## 是否显示爬取网页源代码开关 
showSrcCode = False 
## 压缩方式 
ZIP_TYPE = "gzip" 
 
fileName = "auctions" 
location = "d://spiderData/" 
 
## header 
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} 
##################################################### 
 
 
#############class SpiderConfig ##################### 
class SpiderConfig: 
 """ 
  configuration for spider name and url 
 """ 
 def __init__(self, name, url): 
  self.name = name 
  self.url = url 
##################################################### 
 
##############class SpiderAuctionDomain############## 
class SpiderAuctionDomain: 
 """ 
  Store information with auctions spidered by python 
 """ 
 title = "" 
 url = "" 
 img = "" 
 price = "" 
 
 def __init__(self): 
  pass 
 
##################################################### 
 
########class SpiderDefaultErrorHandler############## 
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 
 def http_error_default(self, req, fp, code, msg, hdrs): 
  """ 
   default error process handler for spider 
  """ 
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp) 
  result.status = code 
  result.url = req.get_full_url() 
 
  print "<", result.url, "Exception code :", result.status, ">" 
 
  return result 
##################################################### 
 
#############class SpiderHandler##################### 
class SpiderHandler: 
 """ 
  spider handler 
 """ 
 
 def spider(self, spiderConfig): 
  try: 
   request = urllib2.Request(spiderConfig.url) 
 
   ## configure request hreader 
   for key,val in headerConfig.items(): 
    request.add_header(key, val) 
 
   ## build opener 
   opener = urllib2.build_opener(SpiderDefaultErrorHandler()) 
 
   ## open request 
   openRequest = opener.open(request) 
 
   ## read data 
   spiderData = openRequest.read() 
 
   ## close 
   opener.close() 
 
   if 0 == len(spiderData): 
    return 
 
   if ZIP_TYPE== openRequest.headers.get("Content-Encoding"): 
    spiderData = SpiderHandler.gzipData(self, spiderData) 
 
   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode: 
    print spiderData 
 
   # parse html 
   SpiderHandler.parse(self, spiderData) 
 
  except Exception,x: 
   print "spider process Exception:", x 
 
 
 
 def parse(self, spiderData): 
  """ 
   parse html content 
  """ 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   charsetAnalyze = chardet.detect(spiderData) 
   print "analyze spider data encode :",charsetAnalyze["encoding"] 
 
  print "执行解析", fileName 
 
  soup = BeautifulSoup(spiderData) 
  encode = soup.originalEncoding 
 
  encoding = lambda x : x.encode(encode) 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   print "识别到编码:", encode 
   title = soup.head.title.string 
   print encoding(title) 
 
  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"}) 
  auctions = ["%s" % s for s in spiderContents] 
 
  if auctions is None: 
   return 
 
  auctionList = [] 
 
  for auc in auctions: 
   auctionDomain = SpiderAuctionDomain() 
   # parse auction link 
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc) 
   if links is not None : 
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0])) 
 
   #parse auction title 
   titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc) 
   if titles is not None: 
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0])) 
 
   #parse auction price 
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc) 
   if price is not None: 
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0]) 
 
   #parse image url 
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc) 
   if imgs is not None: 
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0]) 
 
   auctionList.append(auctionDomain) 
 
  print "成功解析商品信息:" 
  for a in auctionList: 
   print "--->",a.title 
 
  # sort auction list 
  auctionList = SpiderHandler.sortAuctionList(self, auctionList) 
 
  # save in file 
  SpiderHandler.save(self, auctionList) 
 
  print "解析完成" 
 
  pass 
 
 def sortAuctionList(self, auctionList): 
  """ 
   冒泡排序,按照价格排序 
  """ 
  length = len(auctionList) 
  if length < 2: 
   return auctionList 
  else: 
   for i in range(length-1): 
    for j in range(length - i -1): 
     if float(auctionList[j].price) > float(auctionList[j+1].price): 
      auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j] 
  return auctionList 
  pass 
 
 def save(self, auctionList): 
  if auctionList is not None: 
   doc = Document() 
 
   auctions = doc.createElement("auctions") 
   doc.appendChild(auctions) 
 
   for auc in auctionList: 
    auction = doc.createElement("auction") 
    auctions.appendChild(auction) 
 
    SpiderHandler.generateXML(self, doc, auction, "title", auc.title) 
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price) 
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img) 
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link) 
 
   if False == os.path.exists(location): 
    os.mkdir(location) 
 
   file = open(location+fileName+".xml", 'w') 
   file.write(doc.toprettyxml()) 
   file.close() 
 
   if httplib.HTTPConnection.debuglevel == DEBUG: 
    print doc.toprettyxml() 
 
 def generateXML(self, doc, f, name, txt): 
  c = doc.createElement(name) 
  f.appendChild(c) 
  c.appendChild(doc.createTextNode(txt)) 
 
 def gzipData(self, spiderData): 
  """ 
   get data from gzip 
  """ 
  if 0 == len(spiderData): 
   return spiderData 
  spiderDataStream = StringIO.StringIO(spiderData) 
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read() 
  return spiderData 
##################################################### 
 
if __name__ == "__main__": 
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒") 
 
 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou", 
      "hangzhou":"http://ju.taobao.com/hangzhou", 
      "shanghai":"http://ju.taobao.com/shanghai", 
      "beijing":"http://ju.taobao.com/beijing", 
      "chengdu":"http://ju.taobao.com/chengdu"} 
 
 configList = [] 
 for k,v in needSpiderUrl.items(): 
  spiderConfig = SpiderConfig(k, v) 
  configList.append(spiderConfig) 
 
 spiderHandler = SpiderHandler() 
 
 print "爬虫执行开始时间:",nowtime() 
 for spiderConfig in configList: 
  fileName = spiderConfig.name 
  spiderHandler.spider(spiderConfig) 
 
 print "爬虫执行完毕时间:",nowtime()

更多内容请参考专题《python爬取功能汇总》进行学习。

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
python实现系统状态监测和故障转移实例方法
Nov 18 Python
python编写网页爬虫脚本并实现APScheduler调度
Jul 28 Python
Python3通过Luhn算法快速验证信用卡卡号的方法
May 14 Python
玩转python爬虫之cookie使用方法
Feb 17 Python
Python3.5 创建文件的简单实例
Apr 26 Python
python获取文件真实链接的方法,针对于302返回码
May 14 Python
python和shell获取文本内容的方法
Jun 05 Python
python 划分数据集为训练集和测试集的方法
Dec 11 Python
python爬虫爬取幽默笑话网站
Oct 24 Python
Python爬虫与反爬虫大战
Jul 30 Python
哪种Python框架适合你?简单介绍几种主流Python框架
Aug 04 Python
Python 使用 Frame tkraise() 方法在 Tkinter 应用程序中的Frame之间切换
Apr 24 Python
Python各类图像库的图片读写方式总结(推荐)
Feb 23 #Python
python自动发邮件库yagmail的示例代码
Feb 23 #Python
Python KMeans聚类问题分析
Feb 23 #Python
浅谈python爬虫使用Selenium模拟浏览器行为
Feb 23 #Python
python kmeans聚类简单介绍和实现代码
Feb 23 #Python
python MysqlDb模块安装及其使用详解
Feb 23 #Python
Python实现k-means算法
Feb 23 #Python
You might like
php curl 登录163邮箱并抓取邮箱好友列表的代码(经测试)
2011/04/07 PHP
php实现压缩多个CSS与JS文件的方法
2014/11/11 PHP
smarty模板判断数组为空的方法
2015/06/10 PHP
jQuery 点击图片跳转上一张或下一张功能的实现代码
2010/03/12 Javascript
NodeJS框架Express的模板视图机制分析
2011/07/19 NodeJs
JQuery分别取得每行最后一列和最后一行的示例代码
2013/08/18 Javascript
jquery获取及设置outerhtml的方法
2015/03/09 Javascript
AngularJS基础 ng-readonly 指令简单示例
2016/08/02 Javascript
AngularJS入门教程引导程序
2016/08/18 Javascript
javascript中的后退和刷新实现方法
2016/11/10 Javascript
深入理解javascript中concat方法
2016/12/12 Javascript
MUI 上拉刷新/下拉加载功能实例代码
2017/04/13 Javascript
微信小程序获取微信运动步数的实例代码
2017/07/20 Javascript
收集前端面试题之url、href、src
2018/03/22 Javascript
js实现购物车功能
2018/06/12 Javascript
axios向后台传递数组作为参数的方法
2018/08/11 Javascript
vue+mousemove实现鼠标拖动功能(拖动过快失效问题解决方法)
2018/08/24 Javascript
Python变量和数据类型详解
2017/02/15 Python
关于Python 3中print函数的换行详解
2017/08/08 Python
Python最火、R极具潜力 2017机器学习调查报告
2017/12/11 Python
python实现测试工具(一)——命令行发送get请求
2020/10/19 Python
Python 按比例获取样本数据或执行任务的实现代码
2020/12/03 Python
Django与AJAX实现网页动态数据显示的示例代码
2021/02/24 Python
德国网上药房:Apotal
2017/04/04 全球购物
Clarks鞋法国官方网站:英国其乐鞋品牌
2018/02/11 全球购物
美国网上书店:Barnes & Noble
2018/08/15 全球购物
师范毕业生自荐信
2013/10/17 职场文书
迟到检讨书5000字
2014/01/31 职场文书
小学六年级学生评语
2014/04/22 职场文书
《放飞蜻蜓》教学反思
2014/04/27 职场文书
青春奉献演讲稿
2014/05/08 职场文书
个人贷款授权委托书样本
2014/10/07 职场文书
2014年检验员工作总结
2014/11/19 职场文书
2015年119消防宣传日活动总结
2015/03/24 职场文书
婚宴祝酒词大全
2015/08/10 职场文书
nginx常用命令放入shell脚本详解
2021/03/31 Servers