Python如何抓取天猫商品详细信息及交易记录


Posted in Python onFebruary 23, 2018

本文实例为大家分享了Python抓取天猫商品详细信息及交易记录的具体代码,供大家参考,具体内容如下

一、搭建Python环境

本帖使用的是Python 2.7
涉及到的模块:spynner, scrapy, bs4, pymmssql

二、要获取的天猫数据

三、数据抓取流程

四、源代码

#coding:utf-8
import spynner
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import random
import pymssql


#------------------------接数据库-----------------------------#
server="localhost"
user="sa"
password = "123456"
conn=pymssql.connect(server,user,password,"TmallData")
if conn:
  print "DataBase connecting successfully!"
else:
  print "DataBase connecting error!"
cursor=conn.cursor()
#----------------------定义网页操作函数--------------------------#
def py_click_element(browser,pos):
  #点击网页中的元素
  #pos example:'a[href="#description" rel="external nofollow" rel="external nofollow" ]'
  browser.click(pos)
  browser.wait(random.randint(3,10))
  return browser

def py_click_xpath(browser,xpath):
  xpath=xpath+'/@href'
  inner_href=Selector(text=browser.html).xpath(xpath).extract()
  pos='a[href="'+str(inner_href[0])+'" rel="external nofollow" ]'
  browser=py_click_element(browser, pos)
  return browser

def py_webpage_load(browser,url):
  browser.load(url,load_timeout=60)
  browser.wait(10)
  return browser

def py_check_element(browser,xpath):
  #按照xpath查找元素,如果存在则返回True,否则返回False
  if Selector(text=browser.html).xpath(xpath).extract()!=[]:
    return True
  else:
    return False

def py_extract_xpath(browser,xpath):
  if py_check_element(browser, xpath):
    return Selector(text=browser.html).xpath(xpath).extract()[0]
  else:
    return "none"

def py_extract_xpaths(browser,xpaths):
  #批量提取网页内容
  length=len(xpaths)
  results=[0]*length
  for i in range(length):
    results[i]=py_extract_xpath(browser, xpaths[i])
  return results

#-----------------------------数据库操作函数---------------------------#


#-----------------------------数据提取函数----------------------------#
def py_getDealReord(doc):
  soup=BeautifulSoup(doc,'lxml')
  tr=soup.find_all('tr')
  total_dealRecord=[([0]*5)for i in range(len(tr))] 
  i=-1
  for this_tr in tr:
    i=i+1
    td_user=this_tr.find_all('td',attrs={'class':"cell-align-l buyer"})
    for this_td in td_user:
      total_dealRecord[i][0]=this_td.getText().strip(' ')
      #print username
    td_style=this_tr.find_all('td',attrs={'class':"cell-align-l style"})
    for this_td in td_style:
      total_dealRecord[i][1]=this_td.getText(',').strip(' ')
      #print style
    td_quantity=this_tr.find_all('td',attrs={'class':"quantity"})
    for this_td in td_quantity:
      total_dealRecord[i][2]=this_td.getText().strip(' ')
      #print quantity
    td_dealtime=this_tr.find_all('td',attrs={'class':"dealtime"})
    for this_td in td_dealtime:
      total_dealRecord[i][3]=this_td.find('p',attrs={'class':"date"}).getText()
      total_dealRecord[i][4]=this_td.find('p',attrs={'class':"time"}).getText()
  return total_dealRecord
#--------------------获取要抓取的所有商品链接-----------------------#
cursor.execute("""
select * from ProductURLs where BrandName='NB'
""")


file=open("H:\\Eclipse\\TmallCrawling\\HTMLParse\\errLog.txt")
InProductInfo=cursor.fetchall()
browser=spynner.Browser()
for temp_InProductInfo in InProductInfo:

  url='https:'+temp_InProductInfo[2]

  BrandName=temp_InProductInfo[0]
  ProductType=temp_InProductInfo[1]
  print BrandName,'\t',ProductType,'\t',url
  #url= 'https://detail.tmall.com/item.htm?id=524425656711&rn=77636d6db8dea5e30060976fdaf9768d&abbucket=19' 

  try:
    browser=py_webpage_load(browser, url)
  except:
    print "Loading webpage failed."
    file.write(url)
    file.write('\n')
    continue

  xpaths=['//*[@id="J_PromoPrice"]/dd/div/span/text()',\
    '//*[@id="J_StrPriceModBox"]/dd/span/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()',\
    '//*[@id="J_PostageToggleCont"]/p/span/text()',\
    '//*[@id="J_EmStock"]/text()',\
    '//*[@id="J_CollectCount"]/text()',\
    '//*[@id="J_ItemRates"]/div/span[2]/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()']
  out_ProductInfo=py_extract_xpaths(browser,xpaths)
  browser=py_click_element(browser,'a[href="#description" rel="external nofollow" rel="external nofollow" ]')
  ProductProperty=py_extract_xpath(browser, '//*[@id="J_AttrUL"]')
  soup=BeautifulSoup(ProductProperty,'lxml')
  li=soup.find_all('li')
  prop=''
  for this_li in li:
    prop=prop+this_li.getText()+'\\'
  prop=prop[0:len(prop)-1]
  out_ProductProperty=prop
  print out_ProductProperty
  cursor.execute("""
  Insert into py_ProductInfo values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  """,(BrandName,ProductType,url,\
     out_ProductInfo[2],out_ProductInfo[1],\
     out_ProductInfo[0],out_ProductInfo[7],\
     out_ProductInfo[1],out_ProductInfo[3],\
     out_ProductInfo[4],out_ProductInfo[5],\
     out_ProductProperty))
  conn.commit()
  Deal_PageCount=0
  browser=py_click_element(browser, 'a[href="#J_DealRecord" rel="external nofollow" ]')
  #browser.browse(True)
  DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
  out_DealRecord=py_getDealReord(DealRecord)
  for temp_DealRecord in out_DealRecord:
    if str(temp_DealRecord[4])=='0':
      continue
    cursor.execute("""
    Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
    """,(url,temp_DealRecord[0],temp_DealRecord[1],\
       temp_DealRecord[2],temp_DealRecord[3],\
       temp_DealRecord[4]))
    conn.commit()
  Deal_PageCount=Deal_PageCount+1
  print "Page ",Deal_PageCount
  for i in range(6):
    if (i==0) or (i==2):
      continue
    xpath='//*[@id="J_showBuyerList"]/div/div/a['+str(i)+']'
    if py_check_element(browser,xpath):
      browser=py_click_xpath(browser, xpath)
      DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
      out_DealRecord=py_getDealReord(DealRecord)
      for temp_DealRecord in out_DealRecord:
        if str(temp_DealRecord[4])=='0':
          continue
        cursor.execute("""
        Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
        """,(url,temp_DealRecord[0],temp_DealRecord[1],\
           temp_DealRecord[2],temp_DealRecord[3],\
           temp_DealRecord[4]))
        conn.commit()
      Deal_PageCount=Deal_PageCount+1
      print "Page ",Deal_PageCount
  while py_check_element(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]'):
    browser=py_click_xpath(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]')
    DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
    out_DealRecord=py_getDealReord(DealRecord)
    for temp_DealRecord in out_DealRecord:
      if str(temp_DealRecord[4])=='0':
        continue
      cursor.execute("""
      Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
      """,(url,temp_DealRecord[0],temp_DealRecord[1],\
         temp_DealRecord[2],temp_DealRecord[3],\
         temp_DealRecord[4]))
      conn.commit()
    Deal_PageCount=Deal_PageCount+1
    print "Page ",Deal_PageCount

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Python2.x中str与unicode相关问题的解决方法
Mar 30 Python
Python中为什么要用self探讨
Apr 14 Python
python连接mysql实例分享
Oct 09 Python
《与孩子一起学编程》python自测题
May 27 Python
Django添加feeds功能的示例
Aug 07 Python
pandas read_excel()和to_excel()函数解析
Sep 19 Python
Python求解排列中的逆序数个数实例
May 03 Python
通过实例简单了解python yield使用方法
Aug 06 Python
基于python tkinter的点名小程序功能的实例代码
Aug 22 Python
python 实用工具状态机transitions
Nov 21 Python
Python 利用argparse模块实现脚本命令行参数解析
Dec 28 Python
解决jupyter notebook图片显示模糊和保存清晰图片的操作
Apr 24 Python
python列表生成式与列表生成器的使用
Feb 23 #Python
1分钟快速生成用于网页内容提取的xslt
Feb 23 #Python
python使用xslt提取网页数据的方法
Feb 23 #Python
Python爬虫使用Selenium+PhantomJS抓取Ajax和动态HTML内容
Feb 23 #Python
python爬虫获取多页天涯帖子
Feb 23 #Python
Python即时网络爬虫项目启动说明详解
Feb 23 #Python
Python爬豆瓣电影实例
Feb 23 #Python
You might like
关于php程序报date()警告的处理(date_default_timezone_set)
2013/10/22 PHP
限制ckeditor上传图片文件大小的方法
2013/11/15 PHP
输入值/表单提交参数过滤有效防止sql注入的方法
2013/12/25 PHP
通过curl模拟post和get方式提交的表单类
2014/04/23 PHP
PHP Echo字符串的连接格式
2016/03/07 PHP
Laravel5.7 数据库操作迁移的实现方法
2019/04/12 PHP
js 单引号 传递方法
2009/06/22 Javascript
ExtJS Ext.MessageBox.alert()弹出对话框详解
2010/04/02 Javascript
javascript中的throttle和debounce浅析
2014/06/06 Javascript
JavaScript实现的一个计算数字步数的算法分享
2014/12/06 Javascript
js实现上传文件添加和删除文件选择框
2016/10/24 Javascript
详谈js中数组(array)和对象(object)的区别
2017/02/27 Javascript
vue获取DOM元素并设置属性的两种实现方法
2017/09/30 Javascript
基于Vue实现微前端的示例代码
2020/04/24 Javascript
基于jquery实现彩色投票进度条代码解析
2020/08/26 jQuery
vue项目中使用rem,在入口文件添加内容操作
2020/11/11 Javascript
pycharm 使用心得(二)设置字体大小
2014/06/05 Python
Python的Django中将文件上传至七牛云存储的代码分享
2016/06/03 Python
Python中的TCP socket写法示例
2018/05/11 Python
Python3自动生成MySQL数据字典的markdown文本的实现
2020/05/07 Python
解决python cv2.imread 读取中文路径的图片返回为None的问题
2020/06/02 Python
python如何操作mysql
2020/08/17 Python
美国在线鲜花速递:ProFlowers
2017/01/05 全球购物
英国手机零售商:Carphone Warehouse
2018/06/06 全球购物
创建无烟单位实施方案
2014/03/29 职场文书
竞争与合作演讲稿
2014/05/12 职场文书
护士医德医风自我评价
2014/09/15 职场文书
三严三实学习心得体会
2014/10/13 职场文书
2014年化验员工作总结
2014/11/18 职场文书
教师党员个人自我评价
2015/03/04 职场文书
盗窃罪辩护词范文
2015/05/21 职场文书
Golang的继承模拟实例
2021/06/30 Golang
铁拳制作人赞《铁拳7》老头环Mod:制作精良 但别弄了
2022/04/03 其他游戏
Java 使用类型为Object的变量指向任意类型的对象
2022/04/13 Java/Android
python实现商品进销存管理系统
2022/05/30 Python
Python实战实现爬取天气数据并完成可视化分析详解
2022/06/16 Python