Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地


Posted in Python onFebruary 23, 2018

本文实例为大家分享了Python抓取聚划算商品页面获取商品信息并保存的具体代码,供大家参考,具体内容如下

#!/user/bin/python 
# -*- coding: gbk -*- 
#Spider.py 
 
import urllib2 
import httplib 
import StringIO 
import gzip 
import re 
import chardet 
import sys 
import os 
import datetime 
from xml.dom.minidom import Document 
from BeautifulSoup import BeautifulSoup 
 
## 这段代码是用于解决控制台打印汉字报错的问题 
reload(sys) 
sys.setdefaultencoding("utf8") 
##################################################### 
 
## debug模式开关,开启后可以看到Http请求的头部信息以及debug日志 
DEBUG = 1 
NO_DEBUG = 0 
httplib.HTTPConnection.debuglevel = DEBUG 
## 是否显示爬取网页源代码开关 
showSrcCode = False 
## 压缩方式 
ZIP_TYPE = "gzip" 
 
fileName = "auctions" 
location = "d://spiderData/" 
 
## header 
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} 
##################################################### 
 
 
#############class SpiderConfig ##################### 
class SpiderConfig: 
 """ 
  configuration for spider name and url 
 """ 
 def __init__(self, name, url): 
  self.name = name 
  self.url = url 
##################################################### 
 
##############class SpiderAuctionDomain############## 
class SpiderAuctionDomain: 
 """ 
  Store information with auctions spidered by python 
 """ 
 title = "" 
 url = "" 
 img = "" 
 price = "" 
 
 def __init__(self): 
  pass 
 
##################################################### 
 
########class SpiderDefaultErrorHandler############## 
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 
 def http_error_default(self, req, fp, code, msg, hdrs): 
  """ 
   default error process handler for spider 
  """ 
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp) 
  result.status = code 
  result.url = req.get_full_url() 
 
  print "<", result.url, "Exception code :", result.status, ">" 
 
  return result 
##################################################### 
 
#############class SpiderHandler##################### 
class SpiderHandler: 
 """ 
  spider handler 
 """ 
 
 def spider(self, spiderConfig): 
  try: 
   request = urllib2.Request(spiderConfig.url) 
 
   ## configure request hreader 
   for key,val in headerConfig.items(): 
    request.add_header(key, val) 
 
   ## build opener 
   opener = urllib2.build_opener(SpiderDefaultErrorHandler()) 
 
   ## open request 
   openRequest = opener.open(request) 
 
   ## read data 
   spiderData = openRequest.read() 
 
   ## close 
   opener.close() 
 
   if 0 == len(spiderData): 
    return 
 
   if ZIP_TYPE== openRequest.headers.get("Content-Encoding"): 
    spiderData = SpiderHandler.gzipData(self, spiderData) 
 
   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode: 
    print spiderData 
 
   # parse html 
   SpiderHandler.parse(self, spiderData) 
 
  except Exception,x: 
   print "spider process Exception:", x 
 
 
 
 def parse(self, spiderData): 
  """ 
   parse html content 
  """ 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   charsetAnalyze = chardet.detect(spiderData) 
   print "analyze spider data encode :",charsetAnalyze["encoding"] 
 
  print "执行解析", fileName 
 
  soup = BeautifulSoup(spiderData) 
  encode = soup.originalEncoding 
 
  encoding = lambda x : x.encode(encode) 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   print "识别到编码:", encode 
   title = soup.head.title.string 
   print encoding(title) 
 
  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"}) 
  auctions = ["%s" % s for s in spiderContents] 
 
  if auctions is None: 
   return 
 
  auctionList = [] 
 
  for auc in auctions: 
   auctionDomain = SpiderAuctionDomain() 
   # parse auction link 
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc) 
   if links is not None : 
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0])) 
 
   #parse auction title 
   titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc) 
   if titles is not None: 
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0])) 
 
   #parse auction price 
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc) 
   if price is not None: 
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0]) 
 
   #parse image url 
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc) 
   if imgs is not None: 
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0]) 
 
   auctionList.append(auctionDomain) 
 
  print "成功解析商品信息:" 
  for a in auctionList: 
   print "--->",a.title 
 
  # sort auction list 
  auctionList = SpiderHandler.sortAuctionList(self, auctionList) 
 
  # save in file 
  SpiderHandler.save(self, auctionList) 
 
  print "解析完成" 
 
  pass 
 
 def sortAuctionList(self, auctionList): 
  """ 
   冒泡排序,按照价格排序 
  """ 
  length = len(auctionList) 
  if length < 2: 
   return auctionList 
  else: 
   for i in range(length-1): 
    for j in range(length - i -1): 
     if float(auctionList[j].price) > float(auctionList[j+1].price): 
      auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j] 
  return auctionList 
  pass 
 
 def save(self, auctionList): 
  if auctionList is not None: 
   doc = Document() 
 
   auctions = doc.createElement("auctions") 
   doc.appendChild(auctions) 
 
   for auc in auctionList: 
    auction = doc.createElement("auction") 
    auctions.appendChild(auction) 
 
    SpiderHandler.generateXML(self, doc, auction, "title", auc.title) 
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price) 
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img) 
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link) 
 
   if False == os.path.exists(location): 
    os.mkdir(location) 
 
   file = open(location+fileName+".xml", 'w') 
   file.write(doc.toprettyxml()) 
   file.close() 
 
   if httplib.HTTPConnection.debuglevel == DEBUG: 
    print doc.toprettyxml() 
 
 def generateXML(self, doc, f, name, txt): 
  c = doc.createElement(name) 
  f.appendChild(c) 
  c.appendChild(doc.createTextNode(txt)) 
 
 def gzipData(self, spiderData): 
  """ 
   get data from gzip 
  """ 
  if 0 == len(spiderData): 
   return spiderData 
  spiderDataStream = StringIO.StringIO(spiderData) 
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read() 
  return spiderData 
##################################################### 
 
if __name__ == "__main__": 
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒") 
 
 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou", 
      "hangzhou":"http://ju.taobao.com/hangzhou", 
      "shanghai":"http://ju.taobao.com/shanghai", 
      "beijing":"http://ju.taobao.com/beijing", 
      "chengdu":"http://ju.taobao.com/chengdu"} 
 
 configList = [] 
 for k,v in needSpiderUrl.items(): 
  spiderConfig = SpiderConfig(k, v) 
  configList.append(spiderConfig) 
 
 spiderHandler = SpiderHandler() 
 
 print "爬虫执行开始时间:",nowtime() 
 for spiderConfig in configList: 
  fileName = spiderConfig.name 
  spiderHandler.spider(spiderConfig) 
 
 print "爬虫执行完毕时间:",nowtime()

更多内容请参考专题《python爬取功能汇总》进行学习。

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Python Mysql数据库操作 Perl操作Mysql数据库
Jan 12 Python
python局部赋值的规则
Mar 07 Python
python实现class对象转换成json/字典的方法
Mar 11 Python
Python模拟用户登录验证
Sep 11 Python
python操作excel的方法
Aug 16 Python
Python 实现中值滤波、均值滤波的方法
Jan 09 Python
ubuntu 18.04搭建python环境(pycharm+anaconda)
Jun 14 Python
解决安装python3.7.4报错Can''t connect to HTTPS URL because the SSL module is not available
Jul 31 Python
python将数组n等分的实例
Dec 02 Python
Python random模块制作简易的四位数验证码
Feb 01 Python
以SQLite和PySqlite为例来学习Python DB API
Feb 05 Python
python json.dumps中文乱码问题解决
Apr 01 Python
Python各类图像库的图片读写方式总结(推荐)
Feb 23 #Python
python自动发邮件库yagmail的示例代码
Feb 23 #Python
Python KMeans聚类问题分析
Feb 23 #Python
浅谈python爬虫使用Selenium模拟浏览器行为
Feb 23 #Python
python kmeans聚类简单介绍和实现代码
Feb 23 #Python
python MysqlDb模块安装及其使用详解
Feb 23 #Python
Python实现k-means算法
Feb 23 #Python
You might like
一个自定义位数的php多用户计数器代码
2007/03/11 PHP
PHP动态规划解决0-1背包问题实例分析
2015/03/23 PHP
微信公众平台DEMO(PHP)
2016/05/04 PHP
PHP生成随机码的思路与方法实例探索
2019/04/11 PHP
神奇的7个jQuery 3D插件整理
2011/01/06 Javascript
JavaScript中的稀疏数组与密集数组[译]
2012/09/17 Javascript
使用js的replace()方法查找字符示例代码
2013/10/28 Javascript
鼠标经过tr时,改变tr当前背景颜色
2014/01/13 Javascript
js数组操作常用方法
2014/05/08 Javascript
js实现透明度渐变效果的方法
2015/04/10 Javascript
JavaScript中常见的字符串操作函数及用法汇总
2015/05/04 Javascript
基于jQuery Ajax实现上传文件
2016/03/24 Javascript
js如何准确获取当前页面url网址信息
2020/09/13 Javascript
jQuery给div,Span, a ,button, radio 赋值与取值
2016/06/24 Javascript
浅谈js和css内联外联注意事项
2016/06/30 Javascript
Angular中$cacheFactory的作用和用法实例详解
2016/08/19 Javascript
DropDownList实现可输入可选择(两种版本可选)
2016/12/07 Javascript
vue动态生成dom并且自动绑定事件
2017/04/19 Javascript
JS字符串按逗号和回车分隔的方法
2017/04/25 Javascript
JavaScript实现滑动导航栏效果
2017/08/30 Javascript
微信{"errcode":48001,"errmsg":"api unauthorized, hints: [ req_id: 1QoCla0699ns81 ]"}
2018/10/12 Javascript
layer 刷新某个页面的实现方法
2019/09/05 Javascript
Python文件夹与文件的操作实现代码
2014/07/13 Python
跟老齐学Python之不要红头文件(2)
2014/09/28 Python
python使用正则表达式匹配字符串开头并打印示例
2017/01/11 Python
Python实现读取TXT文件数据并存进内置数据库SQLite3的方法
2017/08/08 Python
Python常见MongoDB数据库操作实例总结
2018/07/24 Python
在SQLite-Python中实现返回、查询中文字段的方法
2019/07/17 Python
Python内存泄漏和内存溢出的解决方案
2020/09/26 Python
详解HTML5中的元素与元素
2015/08/17 HTML / CSS
小狗电器官方商城:中国高端吸尘器品牌
2017/03/29 全球购物
股权转让意向书
2014/04/01 职场文书
工程款催款函
2015/06/24 职场文书
涨工资申请书应该怎么写?
2019/07/08 职场文书
2019年暑期法院实习报告
2019/12/18 职场文书
Nginx location 和 proxy_pass路径配置问题小结
2021/09/04 Servers