Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地


Posted in Python onFebruary 23, 2018

本文实例为大家分享了Python抓取聚划算商品页面获取商品信息并保存的具体代码,供大家参考,具体内容如下

#!/user/bin/python 
# -*- coding: gbk -*- 
#Spider.py 
 
import urllib2 
import httplib 
import StringIO 
import gzip 
import re 
import chardet 
import sys 
import os 
import datetime 
from xml.dom.minidom import Document 
from BeautifulSoup import BeautifulSoup 
 
## 这段代码是用于解决控制台打印汉字报错的问题 
reload(sys) 
sys.setdefaultencoding("utf8") 
##################################################### 
 
## debug模式开关,开启后可以看到Http请求的头部信息以及debug日志 
DEBUG = 1 
NO_DEBUG = 0 
httplib.HTTPConnection.debuglevel = DEBUG 
## 是否显示爬取网页源代码开关 
showSrcCode = False 
## 压缩方式 
ZIP_TYPE = "gzip" 
 
fileName = "auctions" 
location = "d://spiderData/" 
 
## header 
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} 
##################################################### 
 
 
#############class SpiderConfig ##################### 
class SpiderConfig: 
 """ 
  configuration for spider name and url 
 """ 
 def __init__(self, name, url): 
  self.name = name 
  self.url = url 
##################################################### 
 
##############class SpiderAuctionDomain############## 
class SpiderAuctionDomain: 
 """ 
  Store information with auctions spidered by python 
 """ 
 title = "" 
 url = "" 
 img = "" 
 price = "" 
 
 def __init__(self): 
  pass 
 
##################################################### 
 
########class SpiderDefaultErrorHandler############## 
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 
 def http_error_default(self, req, fp, code, msg, hdrs): 
  """ 
   default error process handler for spider 
  """ 
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp) 
  result.status = code 
  result.url = req.get_full_url() 
 
  print "<", result.url, "Exception code :", result.status, ">" 
 
  return result 
##################################################### 
 
#############class SpiderHandler##################### 
class SpiderHandler: 
 """ 
  spider handler 
 """ 
 
 def spider(self, spiderConfig): 
  try: 
   request = urllib2.Request(spiderConfig.url) 
 
   ## configure request hreader 
   for key,val in headerConfig.items(): 
    request.add_header(key, val) 
 
   ## build opener 
   opener = urllib2.build_opener(SpiderDefaultErrorHandler()) 
 
   ## open request 
   openRequest = opener.open(request) 
 
   ## read data 
   spiderData = openRequest.read() 
 
   ## close 
   opener.close() 
 
   if 0 == len(spiderData): 
    return 
 
   if ZIP_TYPE== openRequest.headers.get("Content-Encoding"): 
    spiderData = SpiderHandler.gzipData(self, spiderData) 
 
   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode: 
    print spiderData 
 
   # parse html 
   SpiderHandler.parse(self, spiderData) 
 
  except Exception,x: 
   print "spider process Exception:", x 
 
 
 
 def parse(self, spiderData): 
  """ 
   parse html content 
  """ 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   charsetAnalyze = chardet.detect(spiderData) 
   print "analyze spider data encode :",charsetAnalyze["encoding"] 
 
  print "执行解析", fileName 
 
  soup = BeautifulSoup(spiderData) 
  encode = soup.originalEncoding 
 
  encoding = lambda x : x.encode(encode) 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   print "识别到编码:", encode 
   title = soup.head.title.string 
   print encoding(title) 
 
  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"}) 
  auctions = ["%s" % s for s in spiderContents] 
 
  if auctions is None: 
   return 
 
  auctionList = [] 
 
  for auc in auctions: 
   auctionDomain = SpiderAuctionDomain() 
   # parse auction link 
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc) 
   if links is not None : 
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0])) 
 
   #parse auction title 
   titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc) 
   if titles is not None: 
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0])) 
 
   #parse auction price 
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc) 
   if price is not None: 
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0]) 
 
   #parse image url 
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc) 
   if imgs is not None: 
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0]) 
 
   auctionList.append(auctionDomain) 
 
  print "成功解析商品信息:" 
  for a in auctionList: 
   print "--->",a.title 
 
  # sort auction list 
  auctionList = SpiderHandler.sortAuctionList(self, auctionList) 
 
  # save in file 
  SpiderHandler.save(self, auctionList) 
 
  print "解析完成" 
 
  pass 
 
 def sortAuctionList(self, auctionList): 
  """ 
   冒泡排序,按照价格排序 
  """ 
  length = len(auctionList) 
  if length < 2: 
   return auctionList 
  else: 
   for i in range(length-1): 
    for j in range(length - i -1): 
     if float(auctionList[j].price) > float(auctionList[j+1].price): 
      auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j] 
  return auctionList 
  pass 
 
 def save(self, auctionList): 
  if auctionList is not None: 
   doc = Document() 
 
   auctions = doc.createElement("auctions") 
   doc.appendChild(auctions) 
 
   for auc in auctionList: 
    auction = doc.createElement("auction") 
    auctions.appendChild(auction) 
 
    SpiderHandler.generateXML(self, doc, auction, "title", auc.title) 
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price) 
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img) 
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link) 
 
   if False == os.path.exists(location): 
    os.mkdir(location) 
 
   file = open(location+fileName+".xml", 'w') 
   file.write(doc.toprettyxml()) 
   file.close() 
 
   if httplib.HTTPConnection.debuglevel == DEBUG: 
    print doc.toprettyxml() 
 
 def generateXML(self, doc, f, name, txt): 
  c = doc.createElement(name) 
  f.appendChild(c) 
  c.appendChild(doc.createTextNode(txt)) 
 
 def gzipData(self, spiderData): 
  """ 
   get data from gzip 
  """ 
  if 0 == len(spiderData): 
   return spiderData 
  spiderDataStream = StringIO.StringIO(spiderData) 
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read() 
  return spiderData 
##################################################### 
 
if __name__ == "__main__": 
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒") 
 
 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou", 
      "hangzhou":"http://ju.taobao.com/hangzhou", 
      "shanghai":"http://ju.taobao.com/shanghai", 
      "beijing":"http://ju.taobao.com/beijing", 
      "chengdu":"http://ju.taobao.com/chengdu"} 
 
 configList = [] 
 for k,v in needSpiderUrl.items(): 
  spiderConfig = SpiderConfig(k, v) 
  configList.append(spiderConfig) 
 
 spiderHandler = SpiderHandler() 
 
 print "爬虫执行开始时间:",nowtime() 
 for spiderConfig in configList: 
  fileName = spiderConfig.name 
  spiderHandler.spider(spiderConfig) 
 
 print "爬虫执行完毕时间:",nowtime()

更多内容请参考专题《python爬取功能汇总》进行学习。

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Pycharm学习教程(7)虚拟机VM的配置教程
May 04 Python
python的paramiko模块实现远程控制和传输示例
Oct 13 Python
NetworkX之Prim算法(实例讲解)
Dec 22 Python
tensorflow 恢复指定层与不同层指定不同学习率的方法
Jul 26 Python
Python将8位的图片转为24位的图片实现方法
Oct 24 Python
利用arcgis的python读取要素的X,Y方法
Dec 22 Python
Python爬虫爬取煎蛋网图片代码实例
Dec 16 Python
Python库安装速度过慢解决方案
Jul 14 Python
基于python实现删除指定文件类型
Jul 21 Python
详解Python的爬虫框架 Scrapy
Aug 03 Python
python Matplotlib数据可视化(1):简单入门
Sep 30 Python
Python爬虫中urllib3与urllib的区别是什么
Jul 21 Python
Python各类图像库的图片读写方式总结(推荐)
Feb 23 #Python
python自动发邮件库yagmail的示例代码
Feb 23 #Python
Python KMeans聚类问题分析
Feb 23 #Python
浅谈python爬虫使用Selenium模拟浏览器行为
Feb 23 #Python
python kmeans聚类简单介绍和实现代码
Feb 23 #Python
python MysqlDb模块安装及其使用详解
Feb 23 #Python
Python实现k-means算法
Feb 23 #Python
You might like
实现了一个PHP5的getter/setter基类的代码
2007/02/25 PHP
CodeIgniter启用缓存和清除缓存的方法
2014/06/12 PHP
PHP类的特性实例分析
2016/09/28 PHP
SlideView 图片滑动(扩展/收缩)展示效果
2010/08/01 Javascript
鼠标悬浮显示二级菜单效果的jquery实现
2014/10/29 Javascript
jQuery插件datepicker 日期连续选择
2015/06/12 Javascript
javascript遍历json对象的key和任意js对象属性实例
2017/03/09 Javascript
Bootstrap 网格系统布局详解
2017/03/19 Javascript
微信小程序 登录的简单实现
2017/04/19 Javascript
Vue页面骨架屏注入方法
2018/05/13 Javascript
layui自己添加图片按钮并点击跳转页面的例子
2019/09/14 Javascript
使用PreloadJS加载图片资源的基础方法详解
2020/02/03 Javascript
JS实现网站楼层导航效果代码实例
2020/06/16 Javascript
如何管理Vue中的缓存页面
2021/02/06 Vue.js
[52:22]EG vs VG Supermajor小组赛B组 BO3 第一场 6.2
2018/06/03 DOTA
Python中使用语句导入模块或包的机制研究
2015/03/30 Python
PyTorch里面的torch.nn.Parameter()详解
2020/01/03 Python
python scatter函数用法实例详解
2020/02/11 Python
如何将anaconda安装配置的mmdetection环境离线拷贝到另一台电脑
2020/10/15 Python
纯css3实现图片翻牌特效
2015/03/10 HTML / CSS
Hotter Shoes英国官网:英伦风格,舒适的鞋子
2017/12/28 全球购物
经典洗发水广告词
2014/03/13 职场文书
理想演讲稿范文
2014/05/21 职场文书
品酒会策划方案
2014/05/26 职场文书
公司自我介绍演讲稿
2014/08/21 职场文书
创先争优活动承诺书
2014/08/30 职场文书
医德考评自我评价
2014/09/14 职场文书
党支部遵守党的政治纪律情况对照检查材料
2014/09/26 职场文书
2014小学年度工作总结
2014/12/20 职场文书
党员干部廉政承诺书
2015/04/28 职场文书
2015年幼儿园保育工作总结
2015/05/12 职场文书
2015年语文教学工作总结
2015/05/25 职场文书
CSS 一行代码实现头像与国旗的融合
2021/10/24 HTML / CSS
使用python求解迷宫问题的三种实现方法
2022/03/17 Python
nginx lua 操作 mysql
2022/05/15 Servers
Linux中sftp常用命令整理
2022/06/28 Servers