python实现爬取图书封面


Posted in Python onJuly 05, 2018

本文实例为大家分享了python实现爬取图书封面的具体代码,供大家参考,具体内容如下

kongfuzi.py

利用更换代理ip,延迟提交数据,设置请求头破解网站的反爬虫机制

import requests
import random
import time
 
 
class DownLoad():
  def __init__(self):
    self.ip_list = ['191.33.179.242:8080', '122.72.108.53:80', '93.190.142.214:80', '189.8.88.125:65301',
            '36.66.55.181:8080', '170.84.102.5:8080', '177.200.72.214:20183', '115.229.115.190:9000']
 
    self.user_agent_list = [
      'User-Agent:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
      'User-Agent:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
      'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
    ]
 
  def get(self, url, proxy=None, timeout=20, num=5):
    print("正在请求%s" % url)
    UA = random.choice(self.user_agent_list)
    headers = {'User-Agent': UA}
 
    if proxy == None:
      try:
        return requests.get(url, headers=headers, timeout=timeout)
      except:
        if num > 0:
          time.sleep(10)
          return self.get(url, num=num - 1)
        else:
          time.sleep(10)
          IP = ''.join(random.choice(self.ip_list).strip())
          proxy = {'http': IP}
          return self.get(url, proxy=proxy, timeout=timeout)
    else:
      try:
        IP = ''.join(random.choice(self.ip_list).strip())
        proxy = {'http': IP}
        return requests.get(url, headers=headers, proxy=proxy, timeout=timeout)
      except:
        if num > 0:
          time.sleep(10)
          IP = ''.join(random.choice(self.ip_list).strip())
          proxy = {'http': IP}
          print("正在更换代理")
          print("当前代理%s" % proxy)
          return self.get(url, proxy=proxy, num=num - 1)

main.py

将爬取的图片保存到本地,然后展示到界面

import kongfuzi
import os
import requests
import bs4
from tkinter import *
from PIL import Image, ImageTk
 
 
# 下载图片,生成图片地址列表和图书信息列表
def download():
  baseUrl = "http://search.kongfz.com"
  keyword = e1.get()
  url = baseUrl + "/product_result/?select=0&key=" + keyword
  print("下载链接:" + url)
  show(url)
 
 
# bs4处理
def changesoup(html):
  htm = html.content
  html_doc = str(htm, 'utf-8')
  soup = bs4.BeautifulSoup(html_doc, "html.parser")
  return soup
 
 
# 图书信息集合
def bookinfo(soup):
  # 图书价格列表
  price = []
  soupprice = soup.select(".first-info .f_right .bold")
  for i in soupprice:
    price.append(i.string)
 
  # 书店名列表
  storename = []
  soupstorename = soup.select(".text a span")
  for each in soupstorename:
    if each.string == None:
      soupstorename.remove(each)
  for i in soupstorename:
    storename.append(i.string)
 
  # 商家地区列表
  place = []
  soupplace = soup.select(".user-place")
  for i in soupplace:
    place.append(i.string)
 
  # 书名列表
  bookname = []
  bookname1 = soup.select(
    ".search-wrap .search-main .search-main-result .result-content .result-list .item .item-info .title .link")
  # print(len(bookname1))
  # print(bookname1)
  for each in bookname1:
    print(each)
    # a = bs4.BeautifulSoup(each, "html.parser")
    a = each.get_text()
    print(a)
    # type(a)
    # a = bs4.BeautifulSoup(a, "html.parser")
    # b = a.get_text()
    bookname.append(a)
  # print(bookname)
  # print(len(bookname))
 
  return bookname, price, place, storename
 
 
# 保存图片
def imgsave(soup):
  dirName = "image"
  os.makedirs(dirName, exist_ok=True)
  filePathList = []
  imgUrl = soup.select(".search-main-result .result-content .result-list .item .item-img .img-box img")
 
  # print(imgUrl)
  if not imgUrl:
    print("没有找到当前节点下图片")
  else:
    i = 0
    for imageUrls in imgUrl:
      # 找到图片地址 获取它
      downloadUrl = imageUrls.get('src')
      # if downloadUrl == "/searchfront/img/error.jpg":
      #   downloadUrl = "http://book.kongfz.com/img/pc/error.jpg"
      print("打印要下载的图片地址:", downloadUrl)
      #   http://book.kongfz.com/img/pc/error.jpg
      # 分割字符
      split = downloadUrl.split("/")
      # 只保留最后一个元素
      fileName = str(i) + "-" + os.path.basename(split[len(split) - 1])
      print("文件名:" + fileName)
      # 建立一个新路径
      filePath = os.path.join(dirName, fileName)
      filePathList.append(filePath)
      if not os.path.exists(filePath):
        imageUrlPath = requests.get(downloadUrl)
        # 检查当前网络是否请求成功
        imageUrlPath.raise_for_status()
        # 'wb'二进制模式打开img适用
        imageFile = open(filePath, 'wb')
        for image in imageUrlPath.iter_content(10000):
          # 把每次遍历的文件图像都存储进文件夹中
          imageFile.write(image)
        # 关闭文件
        imageFile.close()
      i = i + 1
  return filePathList
 
# 图片展示
def show(url):
  xz = kongfuzi.DownLoad()
  html = xz.get(url)
 
  # 添加代理ip到ip_list
  add_ip = e2.get()
  xz.ip_list.append(add_ip)
 
  soup = changesoup(html)
  bookname, price, place, storename = bookinfo(soup)
  # print(bookname)
  # print(price)
  # print(place)
  # print(storename)
  filePathList = imgsave(soup)
  root1 = Toplevel()
  root1.geometry("1720x800")
  root1.title("孔网图片爬取")
 
  # 处理图片,转换成可以显示
  photo = []
  temp = []
  for each in filePathList:
    temp = Image.open(each)
    photo.append(ImageTk.PhotoImage(temp))
 
  canvas = Canvas(root1, width=1700, height=800, scrollregion=(0, 0, 0, 4000)) # 创建canvas
  canvas.place(x=10, y=10) # 放置canvas的位置
 
  frame = Frame(canvas) # 把frame放在canvas里
  frame.place(width=1680, height=800)
 
  for i in range(50):
    # 图片行列
    rownum = int(i / 5)
    columnnum = i % 5
 
    # photo = ImageTk.PhotoImage(Image.open(filePathList[i]))
    imgLabel1 = Label(frame, image=photo[i], width=280, height=280)
    imgLabel1.grid(row=rownum * 5, column=columnnum, padx=10, pady=5)
 
    infoLabel1 = Label(frame, text="书名:" + bookname[i], bg="#FFF8DC", justify=LEFT)
    infoLabel1.grid(row=rownum * 5 + 1, column=columnnum, padx=45, pady=2, sticky=W)
    infoLabel2 = Label(frame, text="价格:" + price[i] + "元", bg="#FFF8DC", justify=LEFT)
    infoLabel2.grid(row=rownum * 5 + 2, column=columnnum, padx=45, pady=2, sticky=W)
    infoLabel3 = Label(frame, text="发货地区:" + place[i], bg="#FFF8DC", justify=LEFT)
    infoLabel3.grid(row=rownum * 5 + 3, column=columnnum, padx=45, pady=2, sticky=W)
    infoLabel4 = Label(frame, text="书店:" + storename[i], bg="#FFF8DC", justify=LEFT)
    infoLabel4.grid(row=rownum * 5 + 4, column=columnnum, padx=45, pady=2, sticky=W)
 
  vbar = Scrollbar(canvas, orient=VERTICAL) # 竖直滚动条
  vbar.place(x=1680, width=20, height=800)
  vbar.configure(command=canvas.yview)
  canvas.config(yscrollcommand=vbar.set) # 设置
  canvas.create_window((800, 2000), window=frame)
 
  mainloop()
 
 
if __name__ == '__main__':
  # 界面
  root = Tk()
  root.title("孔网图片爬取")
  e1 = Entry(root)
  e2 = Entry(root)
  e1.grid(row=0, column=0, padx=20, pady=20)
  e2.grid(row=0, column=2, padx=20, pady=20)
  label1 = Label(root, text="关键字", width=10).grid(row=0, column=1, padx=10, pady=5)
  label2 = Label(root, text="添加代理ip", width=10).grid(row=0, column=3, padx=10, pady=5)
  btn1 = Button(root, text="搜索", width=10, command=download).grid(row=1, column=1, padx=10, pady=5)
  # print(e1.get())
  mainloop()

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
python使用os模块的os.walk遍历文件夹示例
Jan 27 Python
Python入门篇之面向对象
Oct 20 Python
在Python中使用cookielib和urllib2配合PyQuery抓取网页信息
Apr 25 Python
在SAE上部署Python的Django框架的一些问题汇总
May 30 Python
PyCharm+Qt Designer+PyUIC安装配置教程详解
Jun 13 Python
Python替换月份为英文缩写的实现方法
Jul 15 Python
树莓派4B+opencv4+python 打开摄像头的实现方法
Oct 18 Python
Python CSS选择器爬取京东网商品信息过程解析
Jun 01 Python
python smtplib发送多个email联系人的实现
Oct 09 Python
教你如何用python操作摄像头以及对视频流的处理
Oct 12 Python
Django后端按照日期查询的方法教程
Feb 28 Python
PYTHON基于Pyecharts绘制常见的直角坐标系图表
Apr 28 Python
Python定义二叉树及4种遍历方法实例详解
Jul 05 #Python
Python使用pyodbc访问数据库操作方法详解
Jul 05 #Python
如何优雅地处理Django中的favicon.ico图标详解
Jul 05 #Python
解决pandas中读取中文名称的csv文件报错的问题
Jul 04 #Python
解决Python pandas df 写入excel 出现的问题
Jul 04 #Python
python处理数据,存进hive表的方法
Jul 04 #Python
利用Pandas读取文件路径或文件名称包含中文的csv文件方法
Jul 04 #Python
You might like
php_xmlhttp 乱码问题解决方法
2009/08/07 PHP
基于PHP array数组的教程详解
2013/06/05 PHP
源码分析 Laravel 重复执行同一个队列任务的原因
2017/12/25 PHP
“不能执行已释放的Script代码”错误的原因及解决办法
2007/09/09 Javascript
JavaScript定义类或函数的几种方式小结
2011/01/09 Javascript
jquery预览图片实现鼠标放上去显示实际大小
2014/01/16 Javascript
javascript 处理null及null值示例
2014/06/09 Javascript
javascript实现的字符串与十六进制表示字符串相互转换方法
2015/07/17 Javascript
浅析JavaScript 调试方法和技巧
2015/10/22 Javascript
jQuery实现鼠标双击Table单元格变成文本框及输入内容后更新到数据库的方法
2015/11/25 Javascript
node.js require() 源码解读
2015/12/13 Javascript
jQuery实现的指纹扫描效果实例(附演示与demo源码下载)
2016/01/26 Javascript
JavaScript实现弹出模态窗体并接受传值的方法
2016/02/12 Javascript
jquery遍历json对象集合详解
2016/05/18 Javascript
JavaScript手机振动API
2016/06/11 Javascript
JS在Chrome浏览器中showModalDialog函数返回值为undefined的解决方法
2016/08/03 Javascript
Angular 中 select指令用法详解
2016/09/29 Javascript
jquery实现全选、全不选以及单选功能
2017/03/23 jQuery
nodejs 子进程正确的打开方式
2017/07/03 NodeJs
JS实现带动画的回到顶部效果
2017/12/28 Javascript
js实现3D照片墙效果
2019/10/28 Javascript
原生js实现五子棋游戏
2020/05/28 Javascript
js实现详情页放大镜效果
2020/10/28 Javascript
[02:00]最后,我终于出了辉耀
2018/03/27 DOTA
[47:42]完美世界DOTA2联赛PWL S2 GXR vs Ink 第一场 11.19
2020/11/20 DOTA
Python网络编程之TCP套接字简单用法示例
2018/04/09 Python
python使用PIL实现多张图片垂直合并
2019/01/15 Python
对python tkinter窗口弹出置顶的方法详解
2019/06/14 Python
python用线性回归预测股票价格的实现代码
2019/09/04 Python
python解析命令行参数的三种方法详解
2019/11/29 Python
pycharm中选中一个单词替换所有重复单词的实现方法
2020/11/17 Python
python实现图像高斯金字塔的示例代码
2020/12/11 Python
YSL圣罗兰美妆英国官网:Yves Saint Laurent Beauty UK
2019/08/03 全球购物
Pandora西班牙官方商店:PandoraShop.es
2020/10/05 全球购物
医学毕业生自我鉴定
2013/10/30 职场文书
Win11快速关闭所有广告推荐
2022/04/19 数码科技