python利用xpath爬取网上数据并存储到django模型中


Posted in Python on February 26, 2021

帮朋友制作一个网站,需要一些产品数据信息。因为朋友代理的是其他公司的产品,所以直接从该公司的网站上爬取产品数据。

1.设计数据库

from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
  """Build the storage path for a product's main image.

  The uploaded name is replaced by eight random hex characters; only the
  original extension (if any) is kept.

  Args:
    instance: Object being saved; its ``title`` is used as the folder name.
    filename: Name of the uploaded file.

  Returns:
    ``images/products/<title>/<random8><ext>`` built with ``os.path.join``.
  """
  # splitext yields '' when the name has no extension, so an extension-less
  # upload no longer gets its entire name appended as a bogus suffix.
  ext = os.path.splitext(filename)[1]
  filename = uuid.uuid4().hex[:8] + ext
  # return the whole path to the file
  return os.path.join('images', "products", instance.title, filename)


def product_relatedimage_directory_path(instance, filename):
  """Build the storage path for an image related to a product.

  The uploaded name is replaced by eight random hex characters; only the
  original extension (if any) is kept.

  Args:
    instance: Object being saved; ``instance.product.title`` is used as the
      folder name.
    filename: Name of the uploaded file.

  Returns:
    ``images/product_relatedimage/<title>/<random8><ext>`` built with
    ``os.path.join``.
  """
  # splitext yields '' when the name has no extension, so an extension-less
  # upload no longer gets its entire name appended as a bogus suffix.
  ext = os.path.splitext(filename)[1]
  filename = uuid.uuid4().hex[:8] + ext
  # return the whole path to the file
  return os.path.join('images', "product_relatedimage", instance.product.title, filename)


class ProductsCategory(models.Model):
  """Product category; may point at a parent category of the same model."""
  name = models.CharField('产品分类名', max_length=80, unique=True)
  description = models.TextField('产品分类描述', blank=True, null=True)
  slug = models.SlugField('slug', max_length=80, blank=True, null=True)
  parent_category = models.ForeignKey('self', verbose_name="父级分类", blank=True, null=True, on_delete=models.CASCADE)

  def save(self, *args, **kwargs):
    """Derive the slug from ``name`` on first save or when it is empty."""
    if not self.id or not self.slug:
      # A transliterated slug can be several times longer than the name,
      # so cap it at the column width instead of overflowing the field.
      slug_limit = self._meta.get_field('slug').max_length
      self.slug = slugify(self.name, max_length=slug_limit)
    super().save(*args, **kwargs)

  def __str__(self):
    return self.name

  class Meta:
    ordering = ['name']
    verbose_name = "产品分类"
    verbose_name_plural = verbose_name


class ProductsTag(models.Model):
  """Product tag identified by a unique name."""
  name = models.CharField('产品标签名', max_length=30, unique=True)
  slug = models.SlugField('slug', max_length=40)

  def __str__(self):
    return self.name

  def save(self, *args, **kwargs):
    """Derive the slug from ``name`` on first save or when it is empty."""
    if not self.id or not self.slug:
      # A transliterated slug can be several times longer than the name,
      # so cap it at the column width instead of overflowing the field.
      slug_limit = self._meta.get_field('slug').max_length
      self.slug = slugify(self.name, max_length=slug_limit)
    super().save(*args, **kwargs)

  class Meta:
    ordering = ['name']
    verbose_name = "产品标签"
    verbose_name_plural = verbose_name


class Product(models.Model):
  """A product with a title, technical-parameter text, image, category and tags."""
  title = models.CharField('标题', max_length=255, unique=True)
  slug = models.SlugField('slug', max_length=255, blank=True, null=True)
  jscs = models.TextField('技术参数', blank=True, null=True)
  image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
  views = models.PositiveIntegerField('浏览量', default=0)
  category = models.ForeignKey('ProductsCategory', verbose_name='分类', on_delete=models.CASCADE, blank=True, null=True)
  tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)

  def save(self, *args, **kwargs):
    """Derive the slug from ``title`` on first save or when it is empty."""
    if not self.id or not self.slug:
      # A transliterated slug can be several times longer than the title,
      # so cap it at the column width instead of overflowing the field.
      slug_limit = self._meta.get_field('slug').max_length
      self.slug = slugify(self.title, max_length=slug_limit)
    super().save(*args, **kwargs)

  def update_views(self):
    """Increase the view counter by one and persist only that column."""
    self.views += 1
    self.save(update_fields=['views'])

  def get_pre(self):
    """Return the product with the closest smaller id, or None."""
    return Product.objects.filter(id__lt=self.id).order_by('-id').first()

  def get_next(self):
    """Return the product with the closest larger id, or None."""
    return Product.objects.filter(id__gt=self.id).order_by('id').first()

  def __str__(self):
    return self.title

  class Meta:
    verbose_name = "产品"
    verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
  """One advantage / feature text line attached to a product."""
  content = models.TextField('产品优势', blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    # content is nullable; __str__ must return a str, never None.
    return self.content or ''

  class Meta:
    verbose_name = "产品优势"
    verbose_name_plural = verbose_name


class ProductBody(models.Model):
  """One line of descriptive body text attached to a product."""
  body = models.CharField('产品内容', max_length=256, blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    # product is nullable; fall back to the body text (or an empty string)
    # instead of raising AttributeError on a row without a product.
    if self.product_id is None:
      return self.body or ''
    return self.product.title

  class Meta:
    verbose_name = "产品内容"
    verbose_name_plural = verbose_name

2.脚本编写

2.1编写获取网页源代码函数

def get_one_page(url, timeout=10):
  """Download a page and return its source text.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The response body decoded as UTF-8, or ``None`` when the request fails
    or the status code is not 200.
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
  try:
    # Without a timeout a stalled server would hang the crawler forever.
    res = requests.get(url=url, headers=headers, timeout=timeout)
  except requests.RequestException:
    return None
  if res.status_code != 200:
    return None
  res.encoding = 'utf-8'
  return res.text

2.2根据base页面获取所有产品分类页面链接

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop with a clear message rather
  # than handing a missing document to the parser.
  if not content:
    raise SystemExit('获取页面失败:' + url)
  tree = etree.HTML(content)
  # Relative links of the product categories in the navigation block.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Turn every relative link into an absolute URL. A separate loop name
  # keeps the module-level `url` (the start page) intact.
  for catgory_href in catgory_urls:
    print('http://www.kexinjianji.com' + catgory_href)

2.3根据产品分类页面链接获取对应所有产品链接

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop with a clear message rather
  # than handing a missing document to the parser.
  if not content:
    raise SystemExit('获取页面失败:' + url)
  tree = etree.HTML(content)
  # Category name shown on the category page.
  catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
  # xpath returns an empty list when nothing matches; avoid an IndexError.
  if catgory:
    print("产品分类:" + catgory[0])
  # Relative links of the products listed under this category.
  urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
  # Turn every relative link into an absolute URL.
  for product_href in urls:
    print('http://www.kexinjianji.com' + product_href)
  print("=====================================================")

两者结合起来就可以打印出所有产品链接

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop with a clear message rather
  # than handing a missing document to the parser.
  if not content:
    raise SystemExit('获取页面失败:' + url)
  tree = etree.HTML(content)
  # Relative links of the product categories in the navigation block.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Visit every category page. Distinct names are used per nesting level so
  # the start-page `url`, `content` and `tree` are not overwritten mid-loop.
  for catgory_href in catgory_urls:
    catgory_url = 'http://www.kexinjianji.com' + catgory_href
    catgory_content = get_one_page(catgory_url)
    if not catgory_content:
      # Skip a category that could not be fetched and keep crawling.
      print('出错url:' + catgory_url)
      continue
    catgory_tree = etree.HTML(catgory_content)
    # Category name shown on the category page.
    catgory = catgory_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    if catgory:
      print("产品分类:" + catgory[0])
    # Relative links of the products listed under this category.
    urls = catgory_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # Turn every relative link into an absolute URL.
    for product_href in urls:
      print('http://www.kexinjianji.com' + product_href)
    print("=====================================================")

2.4使用xpath解析产品链接返回的页面内容

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop with a clear message rather
  # than handing a missing document to the parser.
  if not content:
    raise SystemExit('获取页面失败:' + url)
  tree = etree.HTML(content)
  # Product name
  title = tree.xpath('//*[@id="wrap"]//h1/text()')
  images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
  if not title or not images:
    # Page layout differs from what the expressions expect.
    raise SystemExit('出错url:' + url)
  # Product image. No trailing slash on the host, matching how every other
  # link in this script is built, so the path does not start with '//'.
  images_url = 'http://www.kexinjianji.com' + images[0]
  # Performance features
  xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
  # Technical parameters: some pages have no <table> at all, so fall back
  # to an empty string instead of raising IndexError.
  tables = tree.xpath('//table')
  jscs_str = etree.tostring(tables[0], encoding='utf-8').decode('utf-8') if tables else ''
  # Product description paragraphs
  cpnr = tree.xpath('//div[@class="describe"]/p')
  print('产品名称:' + title[0])
  print('产品图片:' + images_url)
  for td in xntd:
    print('性能特点:' + td.strip())
  print('技术参数:' + jscs_str)
  for cp in cpnr:
    # string(.) collects all text beneath the current element
    cp = cp.xpath('string(.)')
    print('产品内容:' + cp)
  print('============================================')

将三者结合在一起就可以获取所有产品信息

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop with a clear message rather
  # than handing a missing document to the parser.
  if not content:
    raise SystemExit('获取页面失败:' + url)
  tree = etree.HTML(content)
  # Relative links of the product categories in the navigation block.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Visit every category page. Distinct names are used per nesting level so
  # outer values are not overwritten by the inner loops.
  for catgory_href in catgory_urls:
    catgory_url = 'http://www.kexinjianji.com' + catgory_href
    catgory_content = get_one_page(catgory_url)
    if not catgory_content:
      # This fetch sits outside the per-product try block, so guard it
      # explicitly and move on to the next category.
      print('出错url:' + catgory_url)
      continue
    catgory_tree = etree.HTML(catgory_content)
    # Category name shown on the category page.
    catgory = catgory_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    # Relative links of the products listed under this category.
    urls = catgory_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # Visit every product page.
    for product_href in urls:
      product_url = 'http://www.kexinjianji.com' + product_href
      product_content = get_one_page(product_url)
      if not product_content:
        print('出错url:' + product_url)
        continue
      try:
        product_tree = etree.HTML(product_content)
        # Product name
        title = product_tree.xpath('//*[@id="wrap"]//h1/text()')
        images = product_tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        # Product image
        images_url = 'http://www.kexinjianji.com' + images[0]
        # Performance features
        xntd = product_tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        # Technical parameters
        jscs = product_tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        # Product description paragraphs
        cpnr = product_tree.xpath('//div[@class="describe"]/p')
        print("产品分类:" + catgory[0])
        print('产品链接:' + product_url)
        print('产品名称:' + title[0])
        print('产品图片:' + images_url)
        for td in xntd:
          print('性能特点:' + td.strip())
        for cp in cpnr:
          # string(.) collects all text beneath the current element
          cp = cp.xpath('string(.)')
          print('产品内容:' + cp)
        print('============================================')
      except Exception as e:
        # Best-effort crawl: report the failing page and keep going.
        print(e)
        print('出错url:' + product_url)

3.存储到django模型

import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'


def get_one_page(url, timeout=10):
  """Download a page and return its source text.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The response body decoded as UTF-8, or ``None`` when the request fails
    or the status code is not 200.
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
  try:
    res = requests.get(url=url, headers=headers, timeout=timeout)
  except requests.RequestException as e:
    # Say which URL failed and why, so failures can be traced afterwards.
    print('请求失败: {} ({})'.format(url, e))
    return None
  if res.status_code != 200:
    return None
  res.encoding = 'utf-8'
  return res.text


if __name__ == '__main__':
  base_url = 'http://www.kexinjianji.com'
  content = get_one_page(url)
  # get_one_page returns None on failure; stop with a clear message rather
  # than handing a missing document to the parser.
  if not content:
    raise SystemExit('获取页面失败:' + url)
  tree = etree.HTML(content)
  # Relative links of the product categories in the navigation block.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Visit every category page. Distinct names are used per nesting level so
  # outer values are not overwritten by the inner loops.
  for catgory_href in catgory_urls:
    catgory_url = base_url + catgory_href
    catgory_content = get_one_page(catgory_url)
    if not catgory_content:
      # This fetch sits outside the per-product try block, so guard it
      # explicitly and move on to the next category.
      print('出错url:' + catgory_url)
      continue
    catgory_tree = etree.HTML(catgory_content)
    # Category name shown on the category page.
    p_catgory = catgory_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    # Relative links of the products listed under this category.
    urls = catgory_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # Visit every product page.
    for product_href in urls:
      product_url = base_url + product_href
      product_content = get_one_page(product_url)
      if not product_content:
        print('出错url:' + product_url)
        continue
      try:
        product_tree = etree.HTML(product_content)
        # Product name
        title = product_tree.xpath('//*[@id="wrap"]//h1/text()')
        images = product_tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        # Product image
        images_url = base_url + images[0]
        # Performance features
        xntd = product_tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        # Technical parameters
        jscs = product_tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        # Product description paragraphs
        cpnr = product_tree.xpath('//div[@class="describe"]/p')
        # Fetch the category, creating it when it does not exist yet
        # (single call instead of exists() followed by get()).
        catgory = p_catgory[0]
        products_catgory, _ = ProductsCategory.objects.get_or_create(name=catgory)
        print(products_catgory)

        # Download the product image. The timeout keeps a stalled server
        # from hanging the crawl; raise_for_status keeps an error page from
        # being stored as if it were the image.
        image_content = requests.get(url=images_url, timeout=10)
        image_content.raise_for_status()
        ext = images_url.split('.')[-1]  # image type taken from the URL
        filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
        # Wrap the bytes so Django can store them through the ImageField.
        upload_image_file = ContentFile(image_content.content, name=filename)
        product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
        product.save()
        for td in xntd:
          td = td.strip()
          # The text() expressions also match whitespace between tags;
          # do not store those as empty advantage rows.
          if not td:
            continue
          ProductAdvantage.objects.create(content=td, product=product)
        for cp in cpnr:
          # string(.) collects all text beneath the current element
          cp = cp.xpath('string(.)')
          ProductBody.objects.create(body=cp, product=product)
      except Exception as e:
        # Best-effort crawl: report the failing page and keep going.
        print(e)
        print('出错url:' + product_url)

最后自己手动处理出错的url(这些页面的技术参数是一张图片而不是表格,所以没有获取到技术参数)

4.总结

1.xpath 获取标签内容时,p标签中嵌套span标签,源码如下

<div class="describe" style="position: relative;"> 
   <p><span>板  宽:</span>1500mm</p> 
   <p><span>板  厚:</span>4.5 mm</p> 
   <p><span>出料口:</span>6口</p> 
   <p><span>重  量:</span>6000 kg</p>
</div>

使用xpath获取p标签内容
我想得到的效果如下
板 宽:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下xpath 只能分开获取,不是想要的效果

//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()

百度之后找到的解决办法:使用xpath('string(.)')
1.先获取所有p标签

cpnr = tree.xpath('//div[@class="describe"]/p')

2.使用string(.)获取每个p标签内的所有文本

cp = cp.xpath('string(.)')

循环遍历所有p标签即可

到此这篇关于python利用xpath爬取网上数据并存储到django模型中的文章就介绍到这了。更多相关xpath爬取网上数据存储到django模型的内容,请搜索三水点靠木以前的文章或继续浏览下面的相关文章,希望大家以后多多支持三水点靠木!

Python 相关文章推荐
Python HTMLParser模块解析html获取url实例
Apr 08 Python
python实现数据导出到excel的示例--普通格式
May 03 Python
django admin 后台实现三级联动的示例代码
Jun 22 Python
对numpy.append()里的axis的用法详解
Jun 28 Python
基于tensorflow加载部分层的方法
Jul 26 Python
Python字典中的键映射多个值的方法(列表或者集合)
Oct 17 Python
python3射线法判断点是否在多边形内
Jun 28 Python
python打包成so文件过程解析
Sep 28 Python
Python3.6安装卸载、执行命令、执行py文件的方法详解
Feb 20 Python
python实现简单俄罗斯方块
Mar 13 Python
Python中random模块常用方法的使用教程
Oct 04 Python
 分享一个Python 遇到数据库超好用的模块
Apr 06 Python
用python 绘制茎叶图和复合饼图
Feb 26 #Python
python lambda的使用详解
Feb 26 #Python
python爬虫scrapy框架之增量式爬虫的示例代码
Feb 26 #Python
详解Python openpyxl库的基本应用
Feb 26 #Python
解决python的空格和tab混淆而报错的问题
Feb 26 #Python
Python Spyder 调出缩进对齐线的操作
Feb 26 #Python
使用Python制作一盏 3D 花灯喜迎元宵佳节
Feb 26 #Python
You might like
thinkphp实现数组分页示例
2014/04/13 PHP
destoon实现VIP排名一直在前面排序的方法
2014/08/21 PHP
HTML中嵌入PHP的简单方法
2016/02/16 PHP
Laravel实现表单提交
2017/05/07 PHP
PHP长网址与短网址的实现方法
2017/10/13 PHP
FLASH 广告之外的链接
2008/12/16 Javascript
Javascript处理DOM元素事件实现代码
2012/05/23 Javascript
获取内联和链接中的样式(js代码)
2013/04/11 Javascript
js获取对象、数组的实际长度,元素实际个数的实现代码
2016/06/08 Javascript
在js代码拼接dom对象到页面上去的模板总结(必看)
2017/02/14 Javascript
详解Vue方法与事件
2017/03/09 Javascript
jQuery实现菜单栏导航效果
2017/08/15 jQuery
vue2.0在没有dev-server.js下的本地数据配置方法
2018/02/23 Javascript
vue input输入框模糊查询的示例代码
2018/05/22 Javascript
bootstrap table表格插件之服务器端分页实例代码
2018/09/12 Javascript
jquery无缝图片轮播组件封装
2020/11/25 jQuery
vue 调用 RESTful风格接口操作
2020/08/11 Javascript
vuex刷新后数据丢失的解决方法
2020/10/18 Javascript
[01:03:41]完美世界DOTA2联赛PWL S3 DLG vs Phoenix 第一场 12.17
2020/12/19 DOTA
深入了解Python iter() 方法的用法
2019/07/11 Python
Python如何应用cx_Oracle获取oracle中的clob字段问题
2019/08/27 Python
在Python中预先初始化列表内容和长度的实现
2019/11/28 Python
基于Python和C++实现删除链表的节点
2020/07/06 Python
HTML5标签小集
2011/08/02 HTML / CSS
HTML5之SVG 2D入门6—视窗坐标系与用户坐标系及变换概述
2013/01/30 HTML / CSS
Canvas中设置width与height的问题浅析
2018/11/01 HTML / CSS
iPhoneX安全区域(Safe Area)底部小黑条在微信小程序和H5的屏幕适配
2020/04/08 HTML / CSS
俄罗斯汽车零件和配件在线商店:CarvilleShop
2019/11/29 全球购物
毕业生物理教师求职信
2013/10/17 职场文书
摄影助理岗位职责
2014/02/07 职场文书
会计师职业生涯规划范文
2014/02/18 职场文书
生日主持词
2014/03/20 职场文书
12.4法制宣传日标语
2014/10/08 职场文书
Java用自带的Image IO给图片添加水印
2021/06/15 Java/Android
MySQL悲观锁与乐观锁的实现方案
2021/11/02 MySQL
OpenCV实现常见的四种图像几何变换
2022/04/01 Python