python爬取本站电子书信息并入库的实现代码


Posted in Python onJanuary 20, 2020

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql

class DBUtils(object):
  def connDB(self):               #连接数据库
    conn=pymysql.connect(host='192.168.251.114',port=3306, user='root',passwd='b6f3g2',db='yangsj',charset='utf8');
    cur=conn.cursor();
    return (conn,cur);

  def exeUpdate(self,conn,cur,sql):        #更新或插入操作
    sta=cur.execute(sql);
    conn.commit();
    return (sta);

  def exeDelete(self,conn,cur,IDs):        #删除操作 demo 没用到
    sta=0;
    for eachID in IDs.split(' '):
      sta+=cur.execute("delete from students where Id=%d"%(int(eachID)));
    conn.commit();
    return (sta);

  def exeQuery(self,cur,sql):           #查找操作
    effect_row = cur.execute(sql);
    return (effect_row,cur);

  def connClose(self,conn,cur):          #关闭连接,释放资源
    cur.close();
    conn.close();

if __name__ == '__main__':
  dbUtil = DBUtils();
  conn,cur = dbUtil.connDB();

书籍操作文件 bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
logging.basicConfig(
  level=logging.INFO
)
class BookOperator(object):
  def __addBook(self,book):
    logging.info("add book:%s" % book.bookName);
    dbUtil = DBUtils();
    conn,cur = dbUtil.connDB();
    insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"%(book.bookName,book.downLoadUrl,book.mainInfo));
    dbUtil.exeUpdate(conn,cur,insertBookSql);
    dbUtil.connClose(conn,cur);
  def __selectLastBookId(self):
    logging.info("selectLastBookId ");
    dbUtil = DBUtils();
    conn,cur = dbUtil.connDB();
    selectLastBookSql = "select id from book order by id desc limit 1";
    effect_row,cur = dbUtil.exeQuery(cur,selectLastBookSql);
    bookId = cur.fetchone()[0];
    dbUtil.connClose(conn,cur);
    return bookId;
  def __addBookDownLoadInfos(self,downLoadInfos,bookId):
    logging.info("add bookId:%s" % bookId);
    dbUtil = DBUtils();
    conn,cur = dbUtil.connDB();
    for downLoadinfo in downLoadInfos:
      insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"%(bookId,downLoadinfo.downName,downLoadinfo.downUrl));
      dbUtil.exeUpdate(conn,cur,insertBookDownLoadInfo);
    dbUtil.connClose(conn,cur);
  def addBookInfo(self,book):
    logging.info("add bookInfo:%s" % book.bookName);
    self.__addBook(book);
    bookId = self.__selectLastBookId();
    self.__addBookDownLoadInfos(book.downLoadInfos,bookId);
if __name__ == '__main__':
  bookope = BookOperator();
  book = Book("aaa","yang","cccc");
  book.addDownLoadUrl(DownLoadInfo("aaa.html","书籍"));
  bookope.addBookInfo(book);

书籍信息文件 bookInfo.py

import sys
sys.encoding = "utf8"
class Book(object):
  #书籍信息#
  def __init__(self,mainInfo,downLoadUrl,bookName):
    self.mainInfo = mainInfo;
    self.downLoadUrl = downLoadUrl;
    self.bookName = bookName;
    self.downLoadInfos = [];
  def addDownLoadUrl(self,downloadInfo):
    self.downLoadInfos.append(downloadInfo);
  def print_book_info(self):
    print ("bookName :%s" % (self.bookName));
class DownLoadInfo(object):
  #下载信息#
  def __init__(self,downUrl,downName):
    self.downUrl = downUrl;
    self.downName = downName;
  def print_down_info(self):
    print ("downLoad %s - %s" % (self.downUrl,self.downName));

51job界面解析文件 FiveOneJobFetch.py

import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
sys.encoding = "utf8"
class PageFetch(object):
  host = "//3water.com/";  #域名+分类
  category = "books/"; #具体请求页
  def __init__(self,pageUrl):
    self.pageUrl = pageUrl; #完整URL
    self.url = PageFetch.host+PageFetch.category + pageUrl;
  def __getPageContent(self):
    req = requests.get(self.url);
    if req.status_code == 200:
      req.encoding = "gb2312";
      strText = req.text;
      return strText;
    else:
      return "";
  def getPageContent(url):
    req = requests.get(url);
    if req.status_code == 200:
      req.encoding = "gb2312";
      strText = req.text;
      return strText;
    else:
      return "";
  def __getMaxPageNumAndUrl(self):
    fetchUrl = self.pageUrl;
    #获取分页地址 分页url 形如 list45_2.html 2为页号#
    maxPageNum = 0;
    maxLink = "";
    while maxLink == "":
      url = PageFetch.host+PageFetch.category +fetchUrl;
      reqContent = PageFetch.getPageContent(url)
      soup = BeautifulSoup (reqContent,"html.parser");
      for ul in soup.select(".plist"):
        print ("数据");
        print (ul);
        maxPageNum = ul.select("strong")[0].text;
        alink = ul.select("a");
        if alink[-1]['href'] == "#":
          maxLink = alink[1]['href'];
        else:
          fetchUrl = alink[-1]['href'];
    return maxPageNum,maxLink;
  def __formatPage(self,pageNum):
    #格式化url 形如 list45_2.html#
    lineBeginSite = self.pageUrl.index("_")+1;
    docBeginSite = self.pageUrl.index(".");
    return self.pageUrl[:lineBeginSite]+str(pageNum+1)+self.pageUrl[docBeginSite:];
  def getBookPageList(self):
    #获取书籍每页的URL#
    shortPageList = [];
    maxPageNum,urlPattern = self.__getMaxPageNumAndUrl();
    for i in range(int(maxPageNum)):
      shortPageList.append(self.host +self.category+ self.__formatPage(i));
    return shortPageList;
  def getDownloadPage(url):
    downPage= [];
    reqContent = PageFetch.getPageContent(url);
    soup = BeautifulSoup (reqContent,"html.parser");
    for a in soup.select(".cur-cat-list .btn-dl"):
      downPage.append(PageFetch.host+a['href']);
    return downPage;
  def getBookInfo(url):
    logging.info("获取书籍信息url:%s" % url);
    reqContent = PageFetch.getPageContent(url);
    soup = BeautifulSoup (reqContent,"html.parser");
    mainInfo = (soup.select("#soft-intro"))[0].text.replace("截图:","").replace("'","");
    title = (soup.select("dl dt h1"))[0].text.replace("'","");
    book = Book(mainInfo,url,title);
    for ul in soup.select(".ul_Address"):
      for li in ul.select("li"):
        downLoadInfo = DownLoadInfo(li.select("a")[0]['href'],li.select("a")[0].text);
        book.addDownLoadUrl(downLoadInfo);
    return book;
if __name__ == '__main__':
  p = PageFetch("list152_1.html");
  shortPageList = p.getBookPageList();
  downPage= [];
  for page in shortPageList:
    downLoadPage = PageFetch.getDownloadPage(page);
    downPage = downPage+downLoadPage;
  print ("================汇总如下===============================");
  for bookDownLoadPage in downPage:
    book = PageFetch.getBookInfo(bookDownLoadPage);
    print (book.bookName+":%s" % book.downLoadUrl);
    for d in book.downLoadInfos:
      print ("%s - %s" % (d.downUrl,d.downName));
  # p = PageFetch("list977_1.html");
  # p = p.getMaxPageNumAndUrl();
  # print (p);

执行文件,以上文件copy在相同的文件夹下 执行此文件即可 51Job.py

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
  p = PageFetch(url);
  shortPageList = p.getBookPageList();
  bookOperator = BookOperator();
  downPage= [];
  for page in shortPageList:
    downLoadPage = PageFetch.getDownloadPage(page);
    downPage = downPage+downLoadPage;
  for bookDownLoadPage in downPage:
    book = PageFetch.getBookInfo(bookDownLoadPage);
    bookOperator.addBookInfo(book);
  print ("数据抓取成功:"+url);

if __name__ == '__main__':
  urls = ["list152_35.html","list300_2.html","list476_6.html","list977_2.html","list572_5.html","list509_2.html","list481_1.html","list576_1.html","list482_1.html","list483_1.html","list484_1.html"];
  for url in urls:
    main(url);

数据库表:书籍信息表和下载地址表

CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

Python 相关文章推荐
python对象及面向对象技术详解
Jul 19 Python
Python实现的直接插入排序算法示例
Apr 29 Python
Python多继承顺序实例分析
May 26 Python
Python如何爬取实时变化的WebSocket数据的方法
Mar 09 Python
python简单区块链模拟详解
Jul 03 Python
python3安装crypto出错及解决方法
Jul 30 Python
Python实现对adb命令封装
Mar 06 Python
Numpy 多维数据数组的实现
Jun 18 Python
python怎么删除缓存文件
Jul 19 Python
Linux系统下升级pip的完整步骤
Jan 31 Python
Python字符串的15个基本操作(小结)
Feb 03 Python
利用Python+OpenCV三步去除水印
May 28 Python
浅谈Tensorflow 动态双向RNN的输出问题
Jan 20 #Python
关于tf.nn.dynamic_rnn返回值详解
Jan 20 #Python
双向RNN:bidirectional_dynamic_rnn()函数的使用详解
Jan 20 #Python
关于tf.reverse_sequence()简述
Jan 20 #Python
tensorflow使用range_input_producer多线程读取数据实例
Jan 20 #Python
浅谈tensorflow中Dataset图片的批量读取及维度的操作详解
Jan 20 #Python
使用tensorflow DataSet实现高效加载变长文本输入
Jan 20 #Python
You might like
php安全开发 添加随机字符串验证,防止伪造跨站请求
2013/02/14 PHP
关于zend studio 出现乱码问题的总结
2013/06/23 PHP
PHP数组操作实例分析【添加,删除,计算,反转,排序,查找等】
2016/12/24 PHP
Joomla框架实现字符串截取的方法示例
2017/07/18 PHP
用JS剩余字数计算的代码
2008/07/03 Javascript
javascript控制层显示或隐藏的方法
2015/07/22 Javascript
jQuery Timelinr实现垂直水平时间轴插件(附源码下载)
2016/02/16 Javascript
一起学写js Calender日历控件
2016/04/14 Javascript
jQuery基于ID调用指定iframe页面内的方法
2016/07/06 Javascript
AngularJS 过滤与排序详解及实例代码
2016/09/14 Javascript
JS实现京东首页之页面顶部、Logo和搜索框功能
2017/01/12 Javascript
创建一般js对象的几种方式
2017/01/19 Javascript
Angular2实现自定义双向绑定属性
2017/03/22 Javascript
浅谈Vue数据绑定的原理
2018/01/08 Javascript
js中apply和Math.max()函数的问题及区别介绍
2018/03/27 Javascript
Vue条件循环判断+计算属性+绑定样式v-bind的实例
2018/09/18 Javascript
基于node.js实现爬虫的讲解
2019/02/18 Javascript
Javascript的this详解
2019/03/23 Javascript
使用JS监听键盘按下事件(keydown event)
2019/11/07 Javascript
微信小程序返回上一级页面的实现代码
2020/06/19 Javascript
解决Nuxt使用axios跨域问题
2020/07/06 Javascript
Vue实现摇一摇功能(兼容ios13.3以上)
2021/01/26 Vue.js
Python 类与元类的深度挖掘 II【经验】
2016/05/06 Python
django 常用orm操作详解
2017/09/13 Python
python在TXT文件中按照某一字符串取出该字符串所在的行方法
2018/12/10 Python
Python批量生成幻影坦克图片实例代码
2019/06/04 Python
详解Python绘图Turtle库
2019/10/12 Python
django框架两个使用模板实例
2019/12/11 Python
python实现单目标、多目标、多尺度、自定义特征的KCF跟踪算法(实例代码)
2020/01/08 Python
python Zmail模块简介与使用示例
2020/12/19 Python
泰国健康和美容服务预订网站:GoWabi
2019/06/03 全球购物
Order by的几种用法
2013/06/16 面试题
大学生思想汇报范文
2013/12/31 职场文书
单位实习介绍信
2015/05/05 职场文书
学生检讨书怎么写
2015/05/07 职场文书
电影雨中的树观后感
2015/06/15 职场文书