How to Scrape Tmall Product Details and Transaction Records with Python


Posted in Python on February 23, 2018

This article shares working code for scraping Tmall product details and transaction records with Python, for your reference. The details are as follows.

1. Setting up the Python environment

This post uses Python 2.7.
Modules involved: spynner, scrapy, bs4, pymssql
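
spynner (a QtWebKit-based headless browser) and pymssql both have native dependencies and are the usual installation headaches. A quick sanity check before running the scraper is simply to import everything it needs; this is a minimal sketch, nothing in it is Tmall-specific:

#coding:utf-8
#Environment check: every import below must succeed before the scraper can run.
#Install any missing package (e.g. with pip) first.
import spynner
import pymssql
from bs4 import BeautifulSoup
from scrapy.selector import Selector

print "Environment OK"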

2. The Tmall data to collect
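
Judging from the XPaths and INSERT statements in the source code below, two kinds of data are collected. For each product: promotional price, list price, title, postage, stock, favourite ("collect") count, review count, monthly sales and the product attribute list. For each transaction record: buyer, style, quantity, deal date and deal time.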

3. Data scraping workflow
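
At a high level the script works as follows: read the product URLs to scrape from the ProductURLs table; load each product page in a spynner browser; extract the price, stock and sales fields by XPath; click the description tab to collect the product attributes; then click the deal-record tab and page through the transaction table, parsing each page with BeautifulSoup and writing every row to SQL Server through pymssql.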

4. Source code

#coding:utf-8
import spynner
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import random
import pymssql


#---------------------- Connect to the database ----------------------#
server="localhost"
user="sa"
password = "123456"
conn=pymssql.connect(server,user,password,"TmallData")
if conn:
  print "DataBase connecting successfully!"
else:
  print "DataBase connecting error!"
cursor=conn.cursor()
#---------------------- Web-page helper functions ----------------------#
def py_click_element(browser,pos):
  #Click an element on the page, then wait a random 3-10s for it to update
  #pos example: 'a[href="#description"]'
  browser.click(pos)
  browser.wait(random.randint(3,10))
  return browser

def py_click_xpath(browser,xpath):
  #Resolve the anchor's href via XPath, then click it as a CSS selector
  xpath=xpath+'/@href'
  inner_href=Selector(text=browser.html).xpath(xpath).extract()
  pos='a[href="'+str(inner_href[0])+'"]'
  browser=py_click_element(browser, pos)
  return browser

def py_webpage_load(browser,url):
  #Load a URL and give the page time to render
  browser.load(url,load_timeout=60)
  browser.wait(10)
  return browser

def py_check_element(browser,xpath):
  #Return True if the xpath matches an element on the page, else False
  return Selector(text=browser.html).xpath(xpath).extract()!=[]

def py_extract_xpath(browser,xpath):
  #Return the first match for the xpath, or "none" if there is no match
  if py_check_element(browser, xpath):
    return Selector(text=browser.html).xpath(xpath).extract()[0]
  else:
    return "none"

def py_extract_xpaths(browser,xpaths):
  #Extract several xpaths from the page in one pass
  length=len(xpaths)
  results=[0]*length
  for i in range(length):
    results[i]=py_extract_xpath(browser, xpaths[i])
  return results

#-------------------------- Database helper functions --------------------------#


#-------------------------- Data extraction functions --------------------------#
def py_getDealRecord(doc):
  #Parse one page of the deal-record table into [buyer, style, quantity, date, time] rows
  soup=BeautifulSoup(doc,'lxml')
  tr=soup.find_all('tr')
  total_dealRecord=[([0]*5) for i in range(len(tr))]
  i=-1
  for this_tr in tr:
    i=i+1
    td_user=this_tr.find_all('td',attrs={'class':"cell-align-l buyer"})
    for this_td in td_user:
      total_dealRecord[i][0]=this_td.getText().strip(' ')
      #print username
    td_style=this_tr.find_all('td',attrs={'class':"cell-align-l style"})
    for this_td in td_style:
      total_dealRecord[i][1]=this_td.getText(',').strip(' ')
      #print style
    td_quantity=this_tr.find_all('td',attrs={'class':"quantity"})
    for this_td in td_quantity:
      total_dealRecord[i][2]=this_td.getText().strip(' ')
      #print quantity
    td_dealtime=this_tr.find_all('td',attrs={'class':"dealtime"})
    for this_td in td_dealtime:
      total_dealRecord[i][3]=this_td.find('p',attrs={'class':"date"}).getText()
      total_dealRecord[i][4]=this_td.find('p',attrs={'class':"time"}).getText()
  return total_dealRecord
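#For reference, py_getDealRecord expects rows shaped roughly like the
#following (an assumption based on Tmall's deal-record table at the time;
#the class names may have changed since):
#  <tr>
#    <td class="cell-align-l buyer">t***0</td>
#    <td class="cell-align-l style">颜色分类:黑色;尺码:42</td>
#    <td class="quantity">1</td>
#    <td class="dealtime"><p class="date">2018-02-01</p><p class="time">15:30</p></td>
#  </tr>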
#------------------ Fetch all product links to scrape ------------------#
cursor.execute("""
select * from ProductURLs where BrandName='NB'
""")


file=open("H:\\Eclipse\\TmallCrawling\\HTMLParse\\errLog.txt")
InProductInfo=cursor.fetchall()
browser=spynner.Browser()
for temp_InProductInfo in InProductInfo:

  url='https:'+temp_InProductInfo[2]

  BrandName=temp_InProductInfo[0]
  ProductType=temp_InProductInfo[1]
  print BrandName,'\t',ProductType,'\t',url
  #url= 'https://detail.tmall.com/item.htm?id=524425656711&rn=77636d6db8dea5e30060976fdaf9768d&abbucket=19' 

  try:
    browser=py_webpage_load(browser, url)
  except Exception:
    print "Loading webpage failed."
    file.write(url)
    file.write('\n')
    continue

  xpaths=['//*[@id="J_PromoPrice"]/dd/div/span/text()',\
    '//*[@id="J_StrPriceModBox"]/dd/span/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()',\
    '//*[@id="J_PostageToggleCont"]/p/span/text()',\
    '//*[@id="J_EmStock"]/text()',\
    '//*[@id="J_CollectCount"]/text()',\
    '//*[@id="J_ItemRates"]/div/span[2]/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()']
  out_ProductInfo=py_extract_xpaths(browser,xpaths)
  browser=py_click_element(browser,'a[href="#description" rel="external nofollow" rel="external nofollow" ]')
  ProductProperty=py_extract_xpath(browser, '//*[@id="J_AttrUL"]')
  soup=BeautifulSoup(ProductProperty,'lxml')
  li=soup.find_all('li')
  prop=''
  for this_li in li:
    prop=prop+this_li.getText()+'\\'
  prop=prop[:-1] #drop the trailing backslash
  out_ProductProperty=prop
  print out_ProductProperty
  cursor.execute("""
  Insert into py_ProductInfo values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  """,(BrandName,ProductType,url,\
     out_ProductInfo[2],out_ProductInfo[1],\
     out_ProductInfo[0],out_ProductInfo[7],\
     out_ProductInfo[1],out_ProductInfo[3],\
     out_ProductInfo[4],out_ProductInfo[5],\
     out_ProductProperty))
  conn.commit()
  Deal_PageCount=0
  browser=py_click_element(browser, 'a[href="#J_DealRecord" rel="external nofollow" ]')
  #browser.browse(True)
  DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
  out_DealRecord=py_getDealRecord(DealRecord)
  for temp_DealRecord in out_DealRecord:
    if str(temp_DealRecord[4])=='0':
      continue
    cursor.execute("""
    Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
    """,(url,temp_DealRecord[0],temp_DealRecord[1],\
       temp_DealRecord[2],temp_DealRecord[3],\
       temp_DealRecord[4]))
    conn.commit()
  Deal_PageCount=Deal_PageCount+1
  print "Page ",Deal_PageCount
  #Try the first few numbered pager links; XPath indices are 1-based, so a[0] never matches
  for i in range(6):
    if (i==0) or (i==2):
      continue
    xpath='//*[@id="J_showBuyerList"]/div/div/a['+str(i)+']'
    if py_check_element(browser,xpath):
      browser=py_click_xpath(browser, xpath)
      DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
      out_DealRecord=py_getDealRecord(DealRecord)
      for temp_DealRecord in out_DealRecord:
        if str(temp_DealRecord[4])=='0':
          continue
        cursor.execute("""
        Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
        """,(url,temp_DealRecord[0],temp_DealRecord[1],\
           temp_DealRecord[2],temp_DealRecord[3],\
           temp_DealRecord[4]))
        conn.commit()
      Deal_PageCount=Deal_PageCount+1
      print "Page ",Deal_PageCount
  #Keep clicking the last pager link (a[6], the "next page" button) while it exists
  while py_check_element(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]'):
    browser=py_click_xpath(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]')
    DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
    out_DealRecord=py_getDealRecord(DealRecord)
    for temp_DealRecord in out_DealRecord:
      if str(temp_DealRecord[4])=='0':
        continue
      cursor.execute("""
      Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
      """,(url,temp_DealRecord[0],temp_DealRecord[1],\
         temp_DealRecord[2],temp_DealRecord[3],\
         temp_DealRecord[4]))
      conn.commit()
    Deal_PageCount=Deal_PageCount+1
    print "Page ",Deal_PageCount

That's all for this article. I hope it helps with your studies, and I hope you will continue to support 三水点靠木.
