python实现爬取图书封面


Posted in Python on July 05, 2018

本文实例为大家分享了python实现爬取图书封面的具体代码,供大家参考,具体内容如下

kongfuzi.py

通过更换代理 IP、失败后延迟重试请求、设置 User-Agent 请求头来应对网站的反爬虫机制

import requests
import random
import time
 
 
class DownLoad():
  """HTTP GET helper that retries failed requests and falls back to proxies.

  ``ip_list`` and ``user_agent_list`` are plain lists, so callers may extend
  them (for example by appending another ``host:port`` proxy entry).
  """

  def __init__(self):
    # Candidate HTTP proxies in "host:port" form.
    self.ip_list = ['191.33.179.242:8080', '122.72.108.53:80', '93.190.142.214:80', '189.8.88.125:65301',
            '36.66.55.181:8080', '170.84.102.5:8080', '177.200.72.214:20183', '115.229.115.190:9000']

    # Header *values* only: get() supplies the 'User-Agent' header name as the
    # dict key, so the name must not be repeated inside the value.
    self.user_agent_list = [
      'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
      'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
    ]

  def _random_proxy(self):
    """Return a requests-style ``proxies`` mapping built from a random ip_list entry."""
    return {'http': random.choice(self.ip_list).strip()}

  def get(self, url, proxy=None, timeout=20, num=5):
    """Fetch ``url`` with a random User-Agent, retrying on request errors.

    Without a proxy the request is attempted directly; after ``num`` failed
    retries the method starts over through a randomly chosen proxy. With a
    proxy, each failure switches to another random proxy until ``num``
    retries are used up.

    Args:
      url: Address to request.
      proxy: Optional ``proxies`` mapping as accepted by ``requests.get``
        (e.g. ``{'http': 'host:port'}``). ``None`` means a direct request.
      timeout: Timeout in seconds passed to ``requests.get``.
      num: Number of retries left in the current mode.

    Returns:
      The ``requests.Response`` on success, or ``None`` when every proxy
      retry has failed.
    """
    print("正在请求%s" % url)
    headers = {'User-Agent': random.choice(self.user_agent_list)}

    if proxy is None:
      try:
        return requests.get(url, headers=headers, timeout=timeout)
      except requests.RequestException:
        # Back off before the next attempt so the site is not hammered.
        time.sleep(10)
        if num > 0:
          return self.get(url, timeout=timeout, num=num - 1)
        # Direct attempts are exhausted: start over through a proxy.
        return self.get(url, proxy=self._random_proxy(), timeout=timeout)

    try:
      # The keyword is ``proxies``; ``proxy=`` is rejected by requests.
      return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
    except requests.RequestException:
      if num <= 0:
        # Out of retries; callers must be prepared for a missing response.
        return None
      time.sleep(10)
      proxy = self._random_proxy()
      print("正在更换代理")
      print("当前代理%s" % proxy)
      return self.get(url, proxy=proxy, timeout=timeout, num=num - 1)

main.py

将爬取的图片保存到本地,然后展示到界面

import kongfuzi
import os
import requests
import bs4
from tkinter import *
from PIL import Image, ImageTk
 
 
# 下载图片,生成图片地址列表和图书信息列表
def download():
  """Build the search URL for the entered keyword and display its results.

  Reads the keyword from the module-level ``e1`` entry widget and passes the
  resulting URL to ``show``.
  """
  # Imported here so the function does not rely on a new file-level import.
  from urllib.parse import quote

  baseUrl = "http://search.kongfz.com"
  # Percent-encode the keyword: characters such as '&', '#', '+' or spaces
  # would otherwise be interpreted as part of the query-string syntax.
  keyword = quote(e1.get(), safe='')
  url = baseUrl + "/product_result/?select=0&key=" + keyword
  print("下载链接:" + url)
  show(url)
 
 
# bs4处理
def changesoup(html):
  """Turn a response object into a BeautifulSoup tree.

  The raw bytes in ``html.content`` are decoded as UTF-8 and parsed with the
  "html.parser" backend.
  """
  decoded = str(html.content, 'utf-8')
  return bs4.BeautifulSoup(decoded, "html.parser")
 
 
# 图书信息集合
def bookinfo(soup):
  """Collect the per-book text fields from a parsed search-result page.

  Args:
    soup: Parsed page; only its ``select`` method is used, and the selected
      nodes are read through ``.string`` / ``.get_text()``.

  Returns:
    A tuple ``(bookname, price, place, storename)`` of lists, each in the
    order the matching nodes were returned by ``select``.
  """
  # Prices.
  price = [tag.string for tag in soup.select(".first-info .f_right .bold")]

  # Shop names. Spans whose ``.string`` is None are dropped by building a new
  # list; removing from the list while iterating over it skips the element
  # following each removal, so consecutive empty spans were left behind.
  storename = [tag.string for tag in soup.select(".text a span")
               if tag.string is not None]

  # Seller regions.
  place = [tag.string for tag in soup.select(".user-place")]

  # Book titles.
  bookname = [tag.get_text() for tag in soup.select(
    ".search-wrap .search-main .search-main-result .result-content .result-list .item .item-info .title .link")]

  return bookname, price, place, storename
 
 
# 保存图片
def imgsave(soup):
  """Download the cover images referenced in ``soup`` into the ``image`` directory.

  Each file is named ``<index>-<last URL segment>``. Images whose target file
  already exists are not downloaded again.

  Args:
    soup: Parsed page; only its ``select`` method is used, and each selected
      node is read through ``.get('src')``.

  Returns:
    List of local file paths, one per image node found, in page order. Empty
    when no image node matches.

  Raises:
    Whatever ``requests.get`` / ``raise_for_status`` raise on a failed
    download.
  """
  dirName = "image"
  os.makedirs(dirName, exist_ok=True)
  filePathList = []
  imgTags = soup.select(".search-main-result .result-content .result-list .item .item-img .img-box img")

  if not imgTags:
    print("没有找到当前节点下图片")
    return filePathList

  for index, imgTag in enumerate(imgTags):
    downloadUrl = imgTag.get('src')
    print("打印要下载的图片地址:", downloadUrl)
    # Keep only the last URL segment; the index prefix keeps names unique.
    fileName = str(index) + "-" + os.path.basename(downloadUrl.split("/")[-1])
    print("文件名:" + fileName)
    filePath = os.path.join(dirName, fileName)
    filePathList.append(filePath)
    if os.path.exists(filePath):
      continue

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(downloadUrl, timeout=20)
    response.raise_for_status()
    # Write to a temporary name and rename once complete, so an interrupted
    # download never leaves a truncated file that the exists() check above
    # would later mistake for a finished one.
    partPath = filePath + ".part"
    with open(partPath, 'wb') as imageFile:
      for chunk in response.iter_content(10000):
        imageFile.write(chunk)
    os.replace(partPath, filePath)
  return filePathList
 
# 图片展示
def show(url):
  """Fetch the search page at ``url`` and show covers with book details.

  Downloads the page through ``kongfuzi.DownLoad``, extracts the book fields
  and cover images, and lays them out five per row in a new scrollable
  ``Toplevel`` window. At most 50 books are displayed.

  Args:
    url: Search-result page address to request.
  """
  xz = kongfuzi.DownLoad()

  # Register the user-supplied proxy *before* requesting so it can actually
  # be chosen; a blank entry is ignored rather than added as an empty proxy.
  add_ip = e2.get().strip()
  if add_ip:
    xz.ip_list.append(add_ip)

  html = xz.get(url)
  if html is None:
    # DownLoad.get returns None once all retries have failed.
    print("请求失败,未获取到页面")
    return

  soup = changesoup(html)
  bookname, price, place, storename = bookinfo(soup)
  filePathList = imgsave(soup)
  root1 = Toplevel()
  root1.geometry("1720x800")
  root1.title("孔网图片爬取")

  # Convert the saved files into images Tk can display.
  photo = [ImageTk.PhotoImage(Image.open(path)) for path in filePathList]

  canvas = Canvas(root1, width=1700, height=800, scrollregion=(0, 0, 0, 4000))
  canvas.place(x=10, y=10)

  # The frame holding the grid of books lives inside the canvas.
  frame = Frame(canvas)
  frame.place(width=1680, height=800)

  # Show only as many books as every list can supply (capped at 50) so a
  # short result page no longer raises IndexError.
  count = min(50, len(photo), len(bookname), len(price), len(place), len(storename))
  for i in range(count):
    # Five books per row; each book occupies five grid rows (image + 4 labels).
    rownum, columnnum = divmod(i, 5)
    baseRow = rownum * 5

    imgLabel1 = Label(frame, image=photo[i], width=280, height=280)
    # Tk does not keep the PhotoImage alive by itself; hold a reference on
    # the widget so the picture survives after this function returns.
    imgLabel1.image = photo[i]
    imgLabel1.grid(row=baseRow, column=columnnum, padx=10, pady=5)

    infoLabel1 = Label(frame, text="书名:" + bookname[i], bg="#FFF8DC", justify=LEFT)
    infoLabel1.grid(row=baseRow + 1, column=columnnum, padx=45, pady=2, sticky=W)
    infoLabel2 = Label(frame, text="价格:" + price[i] + "元", bg="#FFF8DC", justify=LEFT)
    infoLabel2.grid(row=baseRow + 2, column=columnnum, padx=45, pady=2, sticky=W)
    infoLabel3 = Label(frame, text="发货地区:" + place[i], bg="#FFF8DC", justify=LEFT)
    infoLabel3.grid(row=baseRow + 3, column=columnnum, padx=45, pady=2, sticky=W)
    infoLabel4 = Label(frame, text="书店:" + storename[i], bg="#FFF8DC", justify=LEFT)
    infoLabel4.grid(row=baseRow + 4, column=columnnum, padx=45, pady=2, sticky=W)

  # Vertical scrollbar wired to the canvas.
  vbar = Scrollbar(canvas, orient=VERTICAL)
  vbar.place(x=1680, width=20, height=800)
  vbar.configure(command=canvas.yview)
  canvas.config(yscrollcommand=vbar.set)
  canvas.create_window((800, 2000), window=frame)
  # No nested mainloop() here: this runs inside a button callback, and the
  # event loop started by the main window already services this Toplevel.
 
 
if __name__ == '__main__':
  # Build the search window: a keyword entry, a proxy entry and a search button.
  root = Tk()
  root.title("孔网图片爬取")
  # e1 (keyword) and e2 (extra proxy "host:port") are read by download()/show().
  e1 = Entry(root)
  e2 = Entry(root)
  e1.grid(row=0, column=0, padx=20, pady=20)
  e2.grid(row=0, column=2, padx=20, pady=20)
  # .grid() returns None, so its result is not bound to a name.
  Label(root, text="关键字", width=10).grid(row=0, column=1, padx=10, pady=5)
  Label(root, text="添加代理ip", width=10).grid(row=0, column=3, padx=10, pady=5)
  Button(root, text="搜索", width=10, command=download).grid(row=1, column=1, padx=10, pady=5)
  root.mainloop()

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
学习python的几条建议分享
Feb 10 Python
python使用matplotlib绘制折线图教程
Feb 08 Python
python数据结构之链表的实例讲解
Jul 25 Python
python爬虫实战之最简单的网页爬虫教程
Aug 13 Python
Python常见字典内建函数用法示例
May 14 Python
python3.6利用pyinstall打包py为exe的操作实例
Oct 31 Python
Python操作qml对象过程详解
Sep 26 Python
在Mac中PyCharm配置python Anaconda环境过程图解
Mar 11 Python
Python实现寻找回文数字过程解析
Jun 09 Python
5 分钟读懂Python 中的 Hook 钩子函数
Dec 09 Python
python 实现两个变量值进行交换的n种操作
Jun 02 Python
Django基础CBV装饰器和中间件
Mar 22 Python
Python定义二叉树及4种遍历方法实例详解
Jul 05 #Python
Python使用pyodbc访问数据库操作方法详解
Jul 05 #Python
如何优雅地处理Django中的favicon.ico图标详解
Jul 05 #Python
解决pandas中读取中文名称的csv文件报错的问题
Jul 04 #Python
解决Python pandas df 写入excel 出现的问题
Jul 04 #Python
python处理数据,存进hive表的方法
Jul 04 #Python
利用Pandas读取文件路径或文件名称包含中文的csv文件方法
Jul 04 #Python
You might like
ThinkPHP令牌验证实例
2014/06/18 PHP
Yii中CGridView禁止列排序的设置方法
2016/07/12 PHP
Javascript & DHTML 实例编程(教程)基础知识
2007/06/02 Javascript
基于jQuery试卷自动排版系统
2010/07/18 Javascript
简单实用jquery版三级联动select示例
2013/07/04 Javascript
Jquery uploadify图片上传插件无法上传的解决方法
2013/12/16 Javascript
使用js Math.random()函数生成n到m间的随机数字
2014/10/09 Javascript
JS实现常见的TAB、弹出层效果(TAB标签,斑马线,遮罩层等)
2015/10/08 Javascript
jquery插件uploadify实现带进度条的文件批量上传
2015/12/13 Javascript
jquery限定文本框只能输入数字(整数和小数)
2016/01/08 Javascript
关于Angular2 + node接口调试的解决方案
2017/05/28 Javascript
Vue header组件开发详解
2018/01/26 Javascript
微信公众号H5之微信分享常见错误和问题(小结)
2019/11/14 Javascript
vue实现页面切换滑动效果
2020/06/29 Javascript
Vue中computed及watch区别实例解析
2020/08/01 Javascript
微信小程序实现简单的select下拉框
2020/11/23 Javascript
[01:20:38]完美世界DOTA2联赛 GXR vs IO 第一场 11.07
2020/11/09 DOTA
Python获取Windows或Linux主机名称通用函数分享
2014/11/22 Python
python机器学习之神经网络(二)
2017/12/20 Python
python 定时修改数据库的示例代码
2018/04/08 Python
使用pandas批量处理矢量化字符串的实例讲解
2018/07/10 Python
浅谈pycharm的xmx和xms设置方法
2018/12/03 Python
Jacobi迭代算法的Python实现详解
2019/06/29 Python
python线程安全及多进程多线程实现方法详解
2019/09/27 Python
AUC计算方法与Python实现代码
2020/02/28 Python
linux 下selenium chrome使用详解
2020/04/02 Python
使用bandit对目标python代码进行安全函数扫描的案例分析
2021/01/27 Python
BASIC HOUSE官方旗舰店:韩国著名的服装品牌
2018/09/27 全球购物
彪马英国官网:PUMA英国
2019/02/11 全球购物
普通大学毕业生自荐信
2013/11/04 职场文书
求职信模板标准格式范文
2014/02/23 职场文书
个人投资计划书
2014/05/01 职场文书
学校党员对照检查材料
2014/08/28 职场文书
倡议书格式及范文
2015/04/29 职场文书
2016同学毕业寄语大全
2015/12/04 职场文书
Python基础之Socket通信原理
2021/04/22 Python