Python如何抓取天猫商品详细信息及交易记录


Posted in Python onFebruary 23, 2018

本文实例为大家分享了Python抓取天猫商品详细信息及交易记录的具体代码,供大家参考,具体内容如下

一、搭建Python环境

本帖使用的是Python 2.7
涉及到的模块:spynner, scrapy, bs4, pymmssql

二、要获取的天猫数据

三、数据抓取流程

四、源代码

#coding:utf-8
import spynner
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import random
import pymssql


#------------------------接数据库-----------------------------#
server="localhost"
user="sa"
password = "123456"
conn=pymssql.connect(server,user,password,"TmallData")
if conn:
  print "DataBase connecting successfully!"
else:
  print "DataBase connecting error!"
cursor=conn.cursor()
#----------------------定义网页操作函数--------------------------#
def py_click_element(browser,pos):
  #点击网页中的元素
  #pos example:'a[href="#description" rel="external nofollow" rel="external nofollow" ]'
  browser.click(pos)
  browser.wait(random.randint(3,10))
  return browser

def py_click_xpath(browser,xpath):
  xpath=xpath+'/@href'
  inner_href=Selector(text=browser.html).xpath(xpath).extract()
  pos='a[href="'+str(inner_href[0])+'" rel="external nofollow" ]'
  browser=py_click_element(browser, pos)
  return browser

def py_webpage_load(browser,url):
  browser.load(url,load_timeout=60)
  browser.wait(10)
  return browser

def py_check_element(browser,xpath):
  #按照xpath查找元素,如果存在则返回True,否则返回False
  if Selector(text=browser.html).xpath(xpath).extract()!=[]:
    return True
  else:
    return False

def py_extract_xpath(browser,xpath):
  if py_check_element(browser, xpath):
    return Selector(text=browser.html).xpath(xpath).extract()[0]
  else:
    return "none"

def py_extract_xpaths(browser,xpaths):
  #批量提取网页内容
  length=len(xpaths)
  results=[0]*length
  for i in range(length):
    results[i]=py_extract_xpath(browser, xpaths[i])
  return results

#-----------------------------数据库操作函数---------------------------#


#-----------------------------数据提取函数----------------------------#
def py_getDealReord(doc):
  soup=BeautifulSoup(doc,'lxml')
  tr=soup.find_all('tr')
  total_dealRecord=[([0]*5)for i in range(len(tr))] 
  i=-1
  for this_tr in tr:
    i=i+1
    td_user=this_tr.find_all('td',attrs={'class':"cell-align-l buyer"})
    for this_td in td_user:
      total_dealRecord[i][0]=this_td.getText().strip(' ')
      #print username
    td_style=this_tr.find_all('td',attrs={'class':"cell-align-l style"})
    for this_td in td_style:
      total_dealRecord[i][1]=this_td.getText(',').strip(' ')
      #print style
    td_quantity=this_tr.find_all('td',attrs={'class':"quantity"})
    for this_td in td_quantity:
      total_dealRecord[i][2]=this_td.getText().strip(' ')
      #print quantity
    td_dealtime=this_tr.find_all('td',attrs={'class':"dealtime"})
    for this_td in td_dealtime:
      total_dealRecord[i][3]=this_td.find('p',attrs={'class':"date"}).getText()
      total_dealRecord[i][4]=this_td.find('p',attrs={'class':"time"}).getText()
  return total_dealRecord
#--------------------获取要抓取的所有商品链接-----------------------#
cursor.execute("""
select * from ProductURLs where BrandName='NB'
""")


file=open("H:\\Eclipse\\TmallCrawling\\HTMLParse\\errLog.txt")
InProductInfo=cursor.fetchall()
browser=spynner.Browser()
for temp_InProductInfo in InProductInfo:

  url='https:'+temp_InProductInfo[2]

  BrandName=temp_InProductInfo[0]
  ProductType=temp_InProductInfo[1]
  print BrandName,'\t',ProductType,'\t',url
  #url= 'https://detail.tmall.com/item.htm?id=524425656711&rn=77636d6db8dea5e30060976fdaf9768d&abbucket=19' 

  try:
    browser=py_webpage_load(browser, url)
  except:
    print "Loading webpage failed."
    file.write(url)
    file.write('\n')
    continue

  xpaths=['//*[@id="J_PromoPrice"]/dd/div/span/text()',\
    '//*[@id="J_StrPriceModBox"]/dd/span/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()',\
    '//*[@id="J_PostageToggleCont"]/p/span/text()',\
    '//*[@id="J_EmStock"]/text()',\
    '//*[@id="J_CollectCount"]/text()',\
    '//*[@id="J_ItemRates"]/div/span[2]/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()']
  out_ProductInfo=py_extract_xpaths(browser,xpaths)
  browser=py_click_element(browser,'a[href="#description" rel="external nofollow" rel="external nofollow" ]')
  ProductProperty=py_extract_xpath(browser, '//*[@id="J_AttrUL"]')
  soup=BeautifulSoup(ProductProperty,'lxml')
  li=soup.find_all('li')
  prop=''
  for this_li in li:
    prop=prop+this_li.getText()+'\\'
  prop=prop[0:len(prop)-1]
  out_ProductProperty=prop
  print out_ProductProperty
  cursor.execute("""
  Insert into py_ProductInfo values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  """,(BrandName,ProductType,url,\
     out_ProductInfo[2],out_ProductInfo[1],\
     out_ProductInfo[0],out_ProductInfo[7],\
     out_ProductInfo[1],out_ProductInfo[3],\
     out_ProductInfo[4],out_ProductInfo[5],\
     out_ProductProperty))
  conn.commit()
  Deal_PageCount=0
  browser=py_click_element(browser, 'a[href="#J_DealRecord" rel="external nofollow" ]')
  #browser.browse(True)
  DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
  out_DealRecord=py_getDealReord(DealRecord)
  for temp_DealRecord in out_DealRecord:
    if str(temp_DealRecord[4])=='0':
      continue
    cursor.execute("""
    Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
    """,(url,temp_DealRecord[0],temp_DealRecord[1],\
       temp_DealRecord[2],temp_DealRecord[3],\
       temp_DealRecord[4]))
    conn.commit()
  Deal_PageCount=Deal_PageCount+1
  print "Page ",Deal_PageCount
  for i in range(6):
    if (i==0) or (i==2):
      continue
    xpath='//*[@id="J_showBuyerList"]/div/div/a['+str(i)+']'
    if py_check_element(browser,xpath):
      browser=py_click_xpath(browser, xpath)
      DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
      out_DealRecord=py_getDealReord(DealRecord)
      for temp_DealRecord in out_DealRecord:
        if str(temp_DealRecord[4])=='0':
          continue
        cursor.execute("""
        Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
        """,(url,temp_DealRecord[0],temp_DealRecord[1],\
           temp_DealRecord[2],temp_DealRecord[3],\
           temp_DealRecord[4]))
        conn.commit()
      Deal_PageCount=Deal_PageCount+1
      print "Page ",Deal_PageCount
  while py_check_element(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]'):
    browser=py_click_xpath(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]')
    DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
    out_DealRecord=py_getDealReord(DealRecord)
    for temp_DealRecord in out_DealRecord:
      if str(temp_DealRecord[4])=='0':
        continue
      cursor.execute("""
      Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
      """,(url,temp_DealRecord[0],temp_DealRecord[1],\
         temp_DealRecord[2],temp_DealRecord[3],\
         temp_DealRecord[4]))
      conn.commit()
    Deal_PageCount=Deal_PageCount+1
    print "Page ",Deal_PageCount

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Python算法之图的遍历
Nov 16 Python
详解Python连接MySQL数据库的多种方式
Apr 16 Python
Python3.5 Json与pickle实现数据序列化与反序列化操作示例
Apr 29 Python
简单了解django索引的相关知识
Jul 17 Python
python 魔法函数实例及解析
Sep 25 Python
Python jieba库用法及实例解析
Nov 04 Python
python操作gitlab API过程解析
Dec 27 Python
Python常用编译器原理及特点解析
Mar 23 Python
深入理解Python 多线程
Jun 16 Python
Python爬虫使用bs4方法实现数据解析
Aug 25 Python
python中实现栈的三种方法
Dec 19 Python
Python 里最强的地图绘制神器
Mar 01 Python
python列表生成式与列表生成器的使用
Feb 23 #Python
1分钟快速生成用于网页内容提取的xslt
Feb 23 #Python
python使用xslt提取网页数据的方法
Feb 23 #Python
Python爬虫使用Selenium+PhantomJS抓取Ajax和动态HTML内容
Feb 23 #Python
python爬虫获取多页天涯帖子
Feb 23 #Python
Python即时网络爬虫项目启动说明详解
Feb 23 #Python
Python爬豆瓣电影实例
Feb 23 #Python
You might like
PHP实现表单提交数据的验证处理功能【防SQL注入和XSS攻击等】
2017/07/21 PHP
详解PHP神奇又有用的Trait
2019/03/25 PHP
JSON 客户端和服务器端的格式转换
2009/08/27 Javascript
理解Javascript_11_constructor实现原理
2010/10/18 Javascript
cookie中的path与domain属性详解
2013/12/18 Javascript
DOM 事件流详解
2015/01/20 Javascript
js图片卷帘门导航菜单特效代码分享
2015/09/10 Javascript
js获取及修改网页背景色和字体色的方法
2015/12/29 Javascript
JS onkeypress兼容性写法详解
2016/04/27 Javascript
jQuery焦点图轮播插件KinSlideshow用法分析
2016/06/08 Javascript
JavaScript实现垂直向上无缝滚动特效代码
2016/11/23 Javascript
分享一个精简的vue.js 图片lazyload插件实例
2017/03/13 Javascript
vue2.0结合Element实现select动态控制input禁用实例
2017/05/12 Javascript
vue实现点击图片放大效果
2017/08/15 Javascript
Vue单页应用引用单独的样式文件的两种方式
2018/03/30 Javascript
iconfont的三种使用方式详解
2018/08/05 Javascript
微信小程序 wepy框架与iview-weapp的用法详解
2019/04/10 Javascript
浅谈es6中的元编程
2020/12/01 Javascript
Vue 实例中使用$refs的注意事项
2021/01/29 Vue.js
Python读取一个目录下所有目录和文件的方法
2016/07/15 Python
python如何在终端里面显示一张图片
2016/08/17 Python
Python进阶之递归函数的用法及其示例
2018/01/31 Python
解决csv.writer写入文件有多余的空行问题
2018/07/06 Python
python使用多线程编写tcp客户端程序
2019/09/02 Python
python爬虫 Pyppeteer使用方法解析
2019/09/28 Python
python matplotlib饼状图参数及用法解析
2019/11/04 Python
利用Python制作动态排名图的实现代码
2020/04/09 Python
Python在centos7.6上安装python3.9的详细教程(默认python版本为2.7.5)
2020/10/15 Python
python之openpyxl模块的安装和基本用法(excel管理)
2021/02/03 Python
纯CSS3编写的的精美动画进度条(无flash/无图像/无脚本/附源码)
2013/01/07 HTML / CSS
微软俄罗斯官方网站:Microsoft俄罗斯
2016/09/18 全球购物
节能减排倡议书
2014/04/15 职场文书
原料仓仓管员岗位职责
2014/07/08 职场文书
电力培训心得体会
2014/09/02 职场文书
周一给客户的问候语
2015/11/10 职场文书
macos系统如何实现微信双开? mac登录两个微信以上微信的技巧
2022/07/23 数码科技