python爬取本站电子书信息并入库的实现代码


Posted in Python on January 20, 2020

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql

class DBUtils(object):
  """Stateless helper around a pymysql connection/cursor pair.

  Every method receives the connection and/or cursor explicitly, so one
  instance can be shared freely.
  """

  def connDB(self):
    """Open a MySQL connection and a cursor on it.

    Returns:
      A ``(conn, cur)`` tuple; release both with ``connClose``.
    """
    # NOTE(review): host and credentials are hard-coded here; consider
    # loading them from configuration or the environment.
    conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                           passwd='b6f3g2', db='yangsj', charset='utf8')
    cur = conn.cursor()
    return (conn, cur)

  def exeUpdate(self, conn, cur, sql, params=None):
    """Run an INSERT/UPDATE statement and commit it.

    Args:
      conn: Open connection that owns ``cur``.
      cur: Cursor used to execute the statement.
      sql: Statement text; use ``%s`` placeholders when ``params`` is given.
      params: Optional sequence/dict of values bound by the driver. Prefer
        this over formatting values into ``sql`` by hand.

    Returns:
      Whatever ``cur.execute`` returns (the affected row count for pymysql).
    """
    sta = cur.execute(sql, params)
    conn.commit()
    return sta

  def exeDelete(self, conn, cur, IDs):
    """Delete rows of ``students`` by id (not used by the demo).

    Args:
      conn: Open connection that owns ``cur``.
      cur: Cursor used to execute the statements.
      IDs: Whitespace-separated ids, e.g. ``"1 2 3"``. An empty string
        deletes nothing.

    Returns:
      Sum of the values returned by ``cur.execute`` for each id.

    Raises:
      ValueError: If a token in ``IDs`` is not an integer.
    """
    sta = 0
    # split() without an argument skips empty tokens, so "", trailing blanks
    # and doubled spaces no longer reach int('') and crash.
    for eachID in IDs.split():
      # Bound parameter instead of string formatting.
      sta += cur.execute("delete from students where Id=%s", (int(eachID),))
    conn.commit()
    return sta

  def exeQuery(self, cur, sql, params=None):
    """Run a SELECT statement.

    Args:
      cur: Cursor used to execute the query.
      sql: Query text; use ``%s`` placeholders when ``params`` is given.
      params: Optional values bound by the driver.

    Returns:
      ``(effect_row, cur)`` where ``effect_row`` is the value returned by
      ``cur.execute`` and ``cur`` is the same cursor, ready for fetching.
    """
    effect_row = cur.execute(sql, params)
    return (effect_row, cur)

  def connClose(self, conn, cur):
    """Close the cursor, then the connection."""
    cur.close()
    conn.close()

if __name__ == '__main__':
  # Connectivity smoke test: open a connection and release it again so the
  # script does not exit with a dangling connection.
  dbUtil = DBUtils()
  conn, cur = dbUtil.connDB()
  dbUtil.connClose(conn, cur)

书籍操作文件 bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# Configure the root logger at import time with level INFO, the level used by
# the logging.info() calls in BookOperator below.
logging.basicConfig(
  level=logging.INFO
)
class BookOperator(object):
  """Stores scraped books and their download links in MySQL.

  Rows go to the ``book`` and ``book_down_url`` tables. Each private helper
  opens its own connection through ``DBUtils`` and always closes it.
  """

  def __addBook(self, book):
    """Insert ``book`` into the ``book`` table.

    Returns:
      The id reported by the cursor for the inserted row (``cur.lastrowid``).
    """
    logging.info("add book:%s", book.bookName)
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    try:
      # Values are bound by the driver: quotes in scraped text can neither
      # break the statement nor inject SQL.
      cur.execute(
        "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
        (book.bookName, book.downLoadUrl, book.mainInfo))
      conn.commit()
      return cur.lastrowid
    finally:
      dbUtil.connClose(conn, cur)

  def __selectLastBookId(self):
    """Return the largest ``book.id``, or ``None`` if the table is empty.

    Only a fallback: another writer may insert between our INSERT and this
    query, so the id reported by the INSERT cursor is preferred.
    """
    logging.info("selectLastBookId ")
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    try:
      effect_row, cur = dbUtil.exeQuery(
        cur, "select id from book order by id desc limit 1")
      row = cur.fetchone()
      return row[0] if row else None
    finally:
      dbUtil.connClose(conn, cur)

  def __addBookDownLoadInfos(self, downLoadInfos, bookId):
    """Insert one ``book_down_url`` row per entry of ``downLoadInfos``.

    Args:
      downLoadInfos: Iterable of objects exposing ``downName`` and ``downUrl``.
      bookId: Id of the owning ``book`` row.
    """
    logging.info("add bookId:%s", bookId)
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    try:
      for downLoadinfo in downLoadInfos:
        cur.execute(
          "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s)",
          (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
      # One commit for the whole batch of links.
      conn.commit()
    finally:
      dbUtil.connClose(conn, cur)

  def addBookInfo(self, book):
    """Persist ``book`` together with all of its download links.

    Args:
      book: Object exposing ``bookName``, ``downLoadUrl``, ``mainInfo`` and
        ``downLoadInfos``.
    """
    logging.info("add bookInfo:%s", book.bookName)
    bookId = self.__addBook(book)
    if not bookId:
      # Cursor did not report an id; fall back to the previous lookup.
      bookId = self.__selectLastBookId()
    self.__addBookDownLoadInfos(book.downLoadInfos, bookId)
if __name__ == '__main__':
  # Manual check: store one dummy book carrying a single download link.
  demoBook = Book("aaa", "yang", "cccc")
  demoLink = DownLoadInfo("aaa.html", "书籍")
  demoBook.addDownLoadUrl(demoLink)
  BookOperator().addBookInfo(demoBook)

书籍信息文件 bookInfo.py

import sys
# NOTE(review): the standard `sys` module documents no `encoding` attribute, so
# this assignment presumably has no effect on text handling - confirm and
# consider removing it.
sys.encoding = "utf8"
class Book(object):
  """Book information.

  Holds the description text, a URL for the book (the scraper passes the
  detail-page URL), the title, and the list of download entries.
  """

  def __init__(self, mainInfo, downLoadUrl, bookName):
    """Store the given fields and start with an empty download list."""
    self.bookName = bookName
    self.downLoadUrl = downLoadUrl
    self.mainInfo = mainInfo
    self.downLoadInfos = list()

  def addDownLoadUrl(self, downloadInfo):
    """Append one download entry to ``downLoadInfos``."""
    self.downLoadInfos += [downloadInfo]

  def print_book_info(self):
    """Print the book title to stdout."""
    line = "bookName :%s" % self.bookName
    print(line)
class DownLoadInfo(object):
  """Download information: one link (``downUrl``) and its label (``downName``)."""

  def __init__(self, downUrl, downName):
    """Store the link target and its display name."""
    self.downName = downName
    self.downUrl = downUrl

  def print_down_info(self):
    """Print the link and its name to stdout."""
    pair = (self.downUrl, self.downName)
    print("downLoad %s - %s" % pair)

页面解析文件 FiveOneJobFetch.py(文件名虽含 51job,实际解析的是电子书列表页和详情页)

import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# NOTE(review): the standard `sys` module documents no `encoding` attribute, so
# this assignment presumably has no effect on text handling - confirm and
# consider removing it.
sys.encoding = "utf8"
class PageFetch(object):
  """Scrapes one category's paginated book listing and its detail pages.

  Class attributes:
    host: Site root prepended to every request.
    category: Path of the book section below ``host``.
    timeout: Seconds to wait for each HTTP request.
  """
  # NOTE(review): ``host`` carries no URL scheme; ``requests`` normally
  # rejects scheme-less URLs, so this probably needs an ``https:`` or
  # ``http:`` prefix - confirm the intended site address.
  host = "//3water.com/"  # domain
  category = "books/"  # section path below the domain
  timeout = 10

  def __init__(self, pageUrl):
    """Remember the listing page name and build its full URL.

    Args:
      pageUrl: Listing page name such as ``"list152_1.html"``.
    """
    self.pageUrl = pageUrl
    self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

  def __getPageContent(self):
    """Return the decoded body of ``self.url`` ("" on a non-200 response)."""
    return PageFetch.getPageContent(self.url)

  @staticmethod
  def getPageContent(url):
    """Download ``url`` and return its text decoded as gb2312.

    Returns:
      The page text, or ``""`` when the status code is not 200.
    """
    # Without a timeout a stalled server would block the scraper forever.
    req = requests.get(url, timeout=PageFetch.timeout)
    if req.status_code != 200:
      return ""
    req.encoding = "gb2312"
    return req.text

  def __getMaxPageNumAndUrl(self):
    """Follow the pager to the last listing page.

    Pager links look like ``list45_2.html`` where 2 is the page number. The
    pager's last link is followed until it is ``"#"``; the ``<strong>`` text
    of that page is then taken as the page count.

    Returns:
      ``(maxPageNum, maxLink)``. ``maxPageNum`` is the ``<strong>`` text of
      the last pager read (``0`` if none was found); ``maxLink`` is the href
      of the pager's second link on the last page (``""`` if never reached).

    Raises:
      IndexError: If a pager lacks the expected ``<strong>``/``<a>`` elements.
    """
    fetchUrl = self.pageUrl
    maxPageNum = 0
    maxLink = ""
    visited = set()
    while maxLink == "":
      if fetchUrl in visited:
        # The "last page" link leads back to a page already seen; stop
        # instead of requesting it forever.
        logging.warning("pager loop detected at %s", fetchUrl)
        break
      visited.add(fetchUrl)
      url = PageFetch.host + PageFetch.category + fetchUrl
      soup = BeautifulSoup(PageFetch.getPageContent(url), "html.parser")
      pagers = soup.select(".plist")
      if not pagers:
        # Failed request or page without pager: nothing can ever set
        # maxLink, so bail out rather than spin.
        logging.warning("no pager found at %s", url)
        break
      for ul in pagers:
        logging.debug("数据 %s", ul)
        maxPageNum = ul.select("strong")[0].text
        alink = ul.select("a")
        if alink[-1]['href'] == "#":
          maxLink = alink[1]['href']
        else:
          fetchUrl = alink[-1]['href']
    return maxPageNum, maxLink

  def __formatPage(self, pageNum):
    """Build the listing page name for zero-based index ``pageNum``.

    ``"list45_1.html"`` with ``pageNum=1`` gives ``"list45_2.html"``.

    Raises:
      ValueError: If ``self.pageUrl`` contains no ``"_"`` or no ``"."``.
    """
    lineBeginSite = self.pageUrl.index("_") + 1
    docBeginSite = self.pageUrl.index(".")
    return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

  def getBookPageList(self):
    """Return the full URL of every listing page of this category."""
    maxPageNum, _ = self.__getMaxPageNumAndUrl()
    prefix = self.host + self.category
    return [prefix + self.__formatPage(i) for i in range(int(maxPageNum))]

  @staticmethod
  def getDownloadPage(url):
    """Return the detail-page URLs linked from listing page ``url``."""
    soup = BeautifulSoup(PageFetch.getPageContent(url), "html.parser")
    return [PageFetch.host + a['href']
            for a in soup.select(".cur-cat-list .btn-dl")]

  @staticmethod
  def getBookInfo(url):
    """Parse one detail page into a ``Book`` with its download links.

    Single quotes are stripped from the description and the title.

    Raises:
      IndexError: If the page lacks ``#soft-intro`` or the ``dl dt h1`` title.
    """
    logging.info("获取书籍信息url:%s" % url)
    soup = BeautifulSoup(PageFetch.getPageContent(url), "html.parser")
    mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
    title = soup.select("dl dt h1")[0].text.replace("'", "")
    book = Book(mainInfo, url, title)
    for ul in soup.select(".ul_Address"):
      for li in ul.select("li"):
        anchor = li.select("a")[0]
        book.addDownLoadUrl(DownLoadInfo(anchor['href'], anchor.text))
    return book
if __name__ == '__main__':
  # Manual run: walk one category and print every book with its links.
  fetcher = PageFetch("list152_1.html")
  detailPages = []
  for listingUrl in fetcher.getBookPageList():
    detailPages.extend(PageFetch.getDownloadPage(listingUrl))
  print("================汇总如下===============================")
  for detailUrl in detailPages:
    book = PageFetch.getBookInfo(detailUrl)
    print(book.bookName + (":%s" % book.downLoadUrl))
    for info in book.downLoadInfos:
      print("%s - %s" % (info.downUrl, info.downName))

执行文件 51Job.py:将以上文件复制到同一文件夹下,执行此文件即可

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
  """Scrape every book of one category and store it in the database.

  Args:
    url: Listing page name of the category, e.g. ``"list152_35.html"``.
  """
  listingPages = PageFetch(url).getBookPageList()
  bookOperator = BookOperator()
  # Collect all detail-page URLs first, then fetch and persist each book.
  detailPages = []
  for listingUrl in listingPages:
    detailPages += PageFetch.getDownloadPage(listingUrl)
  for detailUrl in detailPages:
    bookOperator.addBookInfo(PageFetch.getBookInfo(detailUrl))
  print("数据抓取成功:" + url)

if __name__ == '__main__':
  # Category listing pages to scrape, processed one after another.
  urls = [
    "list152_35.html",
    "list300_2.html",
    "list476_6.html",
    "list977_2.html",
    "list572_5.html",
    "list509_2.html",
    "list481_1.html",
    "list576_1.html",
    "list482_1.html",
    "list483_1.html",
    "list484_1.html",
  ]
  for url in urls:
    main(url)

数据库表:书籍信息表和下载地址表

-- Book table: one row per scraped book. The scraper fills bookName, bookUrl
-- and bookInfo; `id` is generated by AUTO_INCREMENT.
CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- NOTE(review): the starting counter below presumably comes from an export of
-- an existing table; drop it to start numbering from 1.
AUTO_INCREMENT=2936;
-- Download-link table: one row per download link of a book. The scraper
-- stores the owning book's id in `bookId`; no foreign key or index is
-- declared on that column.
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- NOTE(review): the starting counter below presumably comes from an export of
-- an existing table; drop it to start numbering from 1.
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

Python 相关文章推荐
python常见的格式化输出小结
Dec 15 Python
python中的计时器timeit的使用方法
Oct 20 Python
Tornado高并发处理方法实例代码
Jan 15 Python
Python中反射和描述器总结
Sep 23 Python
pygame游戏之旅 添加游戏界面按键图形
Nov 20 Python
PyQt5+Caffe+Opencv搭建人脸识别登录界面
Aug 28 Python
TensorFlow tf.nn.max_pool实现池化操作方式
Jan 04 Python
通过实例解析python描述符原理作用
Jan 22 Python
Python处理PDF与CDF实例
Feb 26 Python
Python基础之字典常见操作经典实例详解
Feb 26 Python
python 决策树算法的实现
Oct 09 Python
Python中免验证跳转到内容页的实例代码
Oct 23 Python
浅谈Tensorflow 动态双向RNN的输出问题
Jan 20 #Python
关于tf.nn.dynamic_rnn返回值详解
Jan 20 #Python
双向RNN:bidirectional_dynamic_rnn()函数的使用详解
Jan 20 #Python
关于tf.reverse_sequence()简述
Jan 20 #Python
tensorflow使用range_input_producer多线程读取数据实例
Jan 20 #Python
浅谈tensorflow中Dataset图片的批量读取及维度的操作详解
Jan 20 #Python
使用tensorflow DataSet实现高效加载变长文本输入
Jan 20 #Python
You might like
PHP计数器的实现代码
2013/06/08 PHP
解析php中获取url与物理路径的总结
2013/06/21 PHP
用PHP来计算某个目录大小的方法
2014/04/01 PHP
PHP中iconv函数知识汇总
2015/07/02 PHP
腾讯的ip接口 方便获取当前用户的ip地理位置
2010/11/25 Javascript
jquery实现在页面加载完毕后获取图片高度或宽度
2014/06/16 Javascript
jquery中使用循环下拉菜单示例代码
2014/09/24 Javascript
javascript实现ecshop搜索框键盘上下键切换控制
2015/03/18 Javascript
Vue方法与事件处理器详解
2016/12/01 Javascript
EasyUI学习之Combobox下拉列表(1)
2016/12/29 Javascript
jquery插件treegrid树状表格的使用方法详解(.Net平台)
2017/01/03 Javascript
基于React实现表单数据的添加和删除详解
2017/03/14 Javascript
js禁止表单重复提交
2017/08/29 Javascript
解决vue多个路由共用一个页面的问题
2018/03/12 Javascript
vue 中swiper的使用教程
2018/05/22 Javascript
JavaScript实现小球沿正弦曲线运动
2020/09/07 Javascript
35个Python编程小技巧
2014/04/01 Python
Python的Bottle框架中获取制定cookie的教程
2015/04/24 Python
Python中Continue语句的用法的举例详解
2015/05/14 Python
神经网络python源码分享
2017/12/15 Python
http请求 request失败自动重新尝试代码示例
2018/01/25 Python
Python中optparser库用法实例详解
2018/01/26 Python
Python迭代器和生成器定义与用法示例
2018/02/10 Python
Python3 修改默认环境的方法
2019/02/16 Python
解决在keras中使用model.save()函数保存模型失败的问题
2020/05/21 Python
Python从MySQL数据库中面抽取试题,生成试卷
2021/01/14 Python
Probikekit日本:自行车套件,跑步和铁人三项装备
2017/04/03 全球购物
一套SQL笔试题
2016/08/14 面试题
架构师岗位职责
2013/11/18 职场文书
服装厂厂长岗位职责
2013/12/27 职场文书
省三好学生申请材料
2014/01/22 职场文书
迎国庆横幅标语
2014/10/08 职场文书
学生病假条怎么写
2015/08/17 职场文书
Apache Calcite 实现方言转换的代码
2021/04/24 Servers
学习nginx基础知识
2021/09/04 Servers
MySQL和Oracle批量插入SQL的通用写法示例
2021/11/17 MySQL