python爬取本站电子书信息并入库的实现代码


Posted in Python on January 20, 2020

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql

class DBUtils(object):
  """Stateless helper around a pymysql connection/cursor pair.

  Every method receives the connection and/or cursor it operates on, so a
  single instance can be shared freely.
  """

  def connDB(self):
    """Open a MySQL connection and return it with a fresh cursor.

    Returns:
      A ``(conn, cur)`` tuple.

    NOTE(review): host, user and password are hard-coded here; consider
    reading them from configuration or the environment instead.
    """
    conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                           passwd='b6f3g2', db='yangsj', charset='utf8')
    cur = conn.cursor()
    return (conn, cur)

  def exeUpdate(self, conn, cur, sql, params=None):
    """Run one INSERT/UPDATE statement and commit it.

    Args:
      conn: open connection used for commit/rollback.
      cur: cursor belonging to ``conn``.
      sql: statement text; may contain ``%s`` placeholders when ``params``
        is given.
      params: optional sequence/dict of values bound to the placeholders by
        the driver. Leave as ``None`` for pre-formatted SQL.

    Returns:
      The value returned by ``cur.execute``.
    """
    try:
      sta = cur.execute(sql, params)
      conn.commit()
    except Exception:
      # Do not leave a half-applied transaction open on the connection.
      conn.rollback()
      raise
    return sta

  def exeDelete(self, conn, cur, IDs):
    """Delete rows from ``students`` by id (unused by the demo).

    Args:
      conn: open connection used for the commit.
      cur: cursor belonging to ``conn``.
      IDs: whitespace-separated string of integer ids, e.g. ``"1 2 3"``.

    Returns:
      Sum of the values returned by ``cur.execute`` for each id.

    Raises:
      ValueError: if any token in ``IDs`` is not an integer. All tokens are
        parsed before the first statement runs, so nothing is deleted then.
    """
    # split() without an argument tolerates repeated/leading/trailing spaces.
    idList = [int(eachID) for eachID in IDs.split()]
    sta = 0
    for eachID in idList:
      sta += cur.execute("delete from students where Id=%s", (eachID,))
    conn.commit()
    return sta

  def exeQuery(self, cur, sql, params=None):
    """Run a SELECT statement.

    Args:
      cur: cursor to execute on.
      sql: statement text; may contain ``%s`` placeholders when ``params``
        is given.
      params: optional values bound to the placeholders by the driver.

    Returns:
      ``(effect_row, cur)``: the value returned by ``cur.execute`` and the
      same cursor, ready for ``fetchone``/``fetchall``.
    """
    effect_row = cur.execute(sql, params)
    return (effect_row, cur)

  def connClose(self, conn, cur):
    """Close the cursor and then the connection.

    The connection is closed even if closing the cursor raises.
    """
    try:
      cur.close()
    finally:
      conn.close()

if __name__ == '__main__':
  # Connectivity smoke test: open a connection, then release it again so the
  # script does not exit with the connection still open.
  dbUtil = DBUtils()
  conn, cur = dbUtil.connDB()
  dbUtil.connClose(conn, cur)

书籍操作文件 bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# Emit INFO-level (and higher) records from the root logger.
logging.basicConfig(level=logging.INFO)
class BookOperator(object):
  """Stores scraped books and their download links in MySQL.

  Each private helper opens its own connection through ``DBUtils`` and always
  releases it, even when a statement fails. All values are passed to the
  driver as bound parameters, because titles and descriptions come straight
  from scraped HTML and may contain quotes.
  """

  def __addBook(self, book):
    """Insert ``book`` into the ``book`` table.

    Returns:
      The auto-increment id the driver reports for the inserted row.
    """
    logging.info("add book:%s", book.bookName)
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    try:
      cur.execute(
        "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
        (book.bookName, book.downLoadUrl, book.mainInfo))
      conn.commit()
      return cur.lastrowid
    finally:
      dbUtil.connClose(conn, cur)

  def __selectLastBookId(self):
    """Return the highest ``book.id``, or ``None`` when the table is empty.

    Kept for compatibility; ``addBookInfo`` no longer relies on it because
    "latest id" can belong to a row inserted by someone else in between.
    """
    logging.info("selectLastBookId ")
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    try:
      cur.execute("select id from book order by id desc limit 1")
      row = cur.fetchone()
      return row[0] if row else None
    finally:
      dbUtil.connClose(conn, cur)

  def __addBookDownLoadInfos(self, downLoadInfos, bookId):
    """Insert one ``book_down_url`` row per entry in ``downLoadInfos``.

    All rows are committed together, so a failure part-way through leaves no
    partial set of links for the book.
    """
    logging.info("add bookId:%s", bookId)
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    try:
      for downLoadinfo in downLoadInfos:
        cur.execute(
          "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s)",
          (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
      conn.commit()
    finally:
      dbUtil.connClose(conn, cur)

  def addBookInfo(self, book):
    """Persist ``book`` and then all of its download links.

    Args:
      book: object exposing ``bookName``, ``downLoadUrl``, ``mainInfo`` and
        ``downLoadInfos`` (items with ``downName`` and ``downUrl``).
    """
    logging.info("add bookInfo:%s", book.bookName)
    # Use the id reported for this very insert rather than re-querying.
    bookId = self.__addBook(book)
    self.__addBookDownLoadInfos(book.downLoadInfos, bookId)
if __name__ == '__main__':
  # Manual smoke test: store one dummy book carrying a single download link.
  book = Book("aaa", "yang", "cccc")
  book.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
  bookope = BookOperator()
  bookope.addBookInfo(book)

书籍信息文件 bookInfo.py

import sys
# NOTE(review): this only sets an ad-hoc attribute on the sys module; nothing
# in this file reads it. Presumably a leftover encoding workaround -- confirm
# before removing.
sys.encoding = "utf8"
class Book(object):
  """Plain record for one book plus the download entries attached to it."""

  def __init__(self, mainInfo, downLoadUrl, bookName):
    """Store the description text, the book's URL and its title."""
    self.bookName = bookName
    self.downLoadUrl = downLoadUrl
    self.mainInfo = mainInfo
    # Starts empty; entries are added through addDownLoadUrl().
    self.downLoadInfos = list()

  def addDownLoadUrl(self, downloadInfo):
    """Append one download entry to this book."""
    entries = self.downLoadInfos
    entries.append(downloadInfo)

  def print_book_info(self):
    """Write the book title to stdout."""
    line = "bookName :%s" % (self.bookName)
    print(line)
class DownLoadInfo(object):
  """One download entry: a URL together with its display name."""

  def __init__(self, downUrl, downName):
    """Remember the link target and the label shown for it."""
    self.downUrl, self.downName = downUrl, downName

  def print_down_info(self):
    """Write ``downLoad <url> - <name>`` to stdout."""
    line = "downLoad %s - %s" % (self.downUrl, self.downName)
    print(line)

页面解析文件 FiveOneJobFetch.py(文件名虽含 51job,实际解析的是本站书籍列表页与详情页)

import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# NOTE(review): this only sets an ad-hoc attribute on the sys module; nothing
# in this file reads it. Presumably a leftover encoding workaround -- confirm
# before removing.
sys.encoding = "utf8"
class PageFetch(object):
  """Scraper for the paginated book listings under ``host + category``.

  An instance is bound to one listing page name such as ``list152_1.html``
  (the number after ``_`` is the page number). The helpers that take a full
  URL are static and can be called on the class or on an instance.
  """
  host = "//3water.com/"   # site root (protocol-relative)
  category = "books/"      # section of the site that holds the listings
  requestTimeout = 10      # seconds; stops a stalled server hanging the crawl

  def __init__(self, pageUrl):
    """Remember the listing page name and build its full URL."""
    self.pageUrl = pageUrl   # page name only, e.g. list152_1.html
    self.url = PageFetch.host + PageFetch.category + pageUrl

  @staticmethod
  def _absoluteUrl(url):
    """Return ``url`` with an explicit scheme.

    ``requests`` rejects scheme-less URLs, and ``host`` is protocol-relative
    (``//...``), so such URLs are fetched over https.
    """
    if url.startswith("//"):
      return "https:" + url
    return url

  def __getPageContent(self):
    """Fetch this instance's own page; see ``getPageContent``."""
    return PageFetch.getPageContent(self.url)

  @staticmethod
  def getPageContent(url):
    """Download ``url`` and return its text decoded as gb2312.

    Returns:
      The page text, or ``""`` when the response status is not 200.
    """
    req = requests.get(PageFetch._absoluteUrl(url),
                       timeout=PageFetch.requestTimeout)
    if req.status_code != 200:
      return ""
    req.encoding = "gb2312"
    return req.text

  def __getMaxPageNumAndUrl(self):
    """Follow the pager to the last page and read its page number.

    Pager URLs look like ``list45_2.html`` where 2 is the page number. On
    each fetched page the last pager link is followed until that link is
    ``#``, which marks the final page.

    Returns:
      ``(maxPageNum, maxLink)``. ``maxPageNum`` is the text of the pager's
      ``<strong>`` element on the final page (``0`` when no pager was found);
      ``maxLink`` is a pager link from that page (``""`` when none was found).
    """
    fetchUrl = self.pageUrl
    maxPageNum = 0
    maxLink = ""
    visited = set()
    # Stop once the last page is reached, or as soon as a page would be
    # fetched twice: a missing pager or failed download makes no progress.
    while maxLink == "" and fetchUrl not in visited:
      visited.add(fetchUrl)
      url = PageFetch.host + PageFetch.category + fetchUrl
      reqContent = PageFetch.getPageContent(url)
      soup = BeautifulSoup(reqContent, "html.parser")
      for ul in soup.select(".plist"):
        print ("数据")
        print (ul)
        strong = ul.select("strong")
        alink = ul.select("a")
        if not strong or not alink:
          continue
        maxPageNum = strong[0].text
        if alink[-1]['href'] == "#":
          # Second link when present, otherwise the only one.
          maxLink = alink[min(1, len(alink) - 1)]['href']
        else:
          fetchUrl = alink[-1]['href']
    if maxLink == "":
      logging.warning("no usable pager found for %s", self.pageUrl)
    return maxPageNum, maxLink

  def __formatPage(self, pageNum):
    """Return this listing's page name for zero-based ``pageNum``.

    For ``list45_2.html`` and ``pageNum=0`` the result is ``list45_1.html``.
    """
    lineBeginSite = self.pageUrl.index("_") + 1
    docBeginSite = self.pageUrl.index(".")
    return (self.pageUrl[:lineBeginSite] + str(pageNum + 1)
            + self.pageUrl[docBeginSite:])

  def getBookPageList(self):
    """Return the URL of every listing page, from page 1 to the last page."""
    maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
    return [self.host + self.category + self.__formatPage(i)
            for i in range(int(maxPageNum))]

  @staticmethod
  def getDownloadPage(url):
    """Return the detail-page URLs linked from the listing page at ``url``."""
    reqContent = PageFetch.getPageContent(url)
    soup = BeautifulSoup(reqContent, "html.parser")
    return [PageFetch.host + a['href']
            for a in soup.select(".cur-cat-list .btn-dl")]

  @staticmethod
  def getBookInfo(url):
    """Parse the detail page at ``url`` into a ``Book``.

    Single quotes are stripped from the intro text and the title.

    Raises:
      IndexError: if the page has no ``#soft-intro`` or ``dl dt h1`` element
        (for example when the download failed and the content is empty).
    """
    logging.info("获取书籍信息url:%s", url)
    reqContent = PageFetch.getPageContent(url)
    soup = BeautifulSoup(reqContent, "html.parser")
    mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
    title = soup.select("dl dt h1")[0].text.replace("'", "")
    book = Book(mainInfo, url, title)
    for ul in soup.select(".ul_Address"):
      for li in ul.select("li"):
        links = li.select("a")
        if not links:
          # List item without a link: nothing to record.
          continue
        book.addDownLoadUrl(DownLoadInfo(links[0]['href'], links[0].text))
    return book
if __name__ == '__main__':
  # Demo run: walk every listing page of one category and print what it finds.
  fetcher = PageFetch("list152_1.html")
  shortPageList = fetcher.getBookPageList()
  downPage = []
  for listingUrl in shortPageList:
    downPage.extend(PageFetch.getDownloadPage(listingUrl))
  print ("================汇总如下===============================")
  for detailUrl in downPage:
    book = PageFetch.getBookInfo(detailUrl)
    print (book.bookName+":%s" % book.downLoadUrl)
    for entry in book.downLoadInfos:
      print ("%s - %s" % (entry.downUrl,entry.downName))

执行文件 51Job.py:将以上文件复制到同一文件夹下,执行此文件即可

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
  """Crawl the category that starts at listing page *url* and store its books.

  Every detail-page link is collected from all listing pages first; each
  detail page is then parsed and handed to ``BookOperator.addBookInfo``.
  """
  fetcher = PageFetch(url)
  listingPages = fetcher.getBookPageList()
  store = BookOperator()
  detailPages = []
  for listing in listingPages:
    detailPages.extend(PageFetch.getDownloadPage(listing))
  for detailPage in detailPages:
    store.addBookInfo(PageFetch.getBookInfo(detailPage))
  print ("数据抓取成功:"+url)

if __name__ == '__main__':
  # Listing pages to crawl, one entry per category.
  urls = [
    "list152_35.html",
    "list300_2.html",
    "list476_6.html",
    "list977_2.html",
    "list572_5.html",
    "list509_2.html",
    "list481_1.html",
    "list576_1.html",
    "list482_1.html",
    "list483_1.html",
    "list484_1.html",
  ]
  for url in urls:
    main(url)

数据库表:书籍信息表和下载地址表

-- One row per scraped book. bookOpe.py fills bookName, bookUrl and bookInfo
-- from Book.bookName, Book.downLoadUrl and Book.mainInfo respectively.
CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- NOTE(review): the starting counter below presumably comes from a dump of an
-- already populated table; confirm whether a fresh install needs it.
AUTO_INCREMENT=2936;
-- One row per download link of a book. bookOpe.py fills bookId with the id of
-- the row it inserted into `book`; no foreign key is declared here.
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- NOTE(review): the starting counter below presumably comes from a dump of an
-- already populated table; confirm whether a fresh install needs it.
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

Python 相关文章推荐
python中Genarator函数用法分析
Apr 08 Python
pygame学习笔记(4):声音控制
Apr 15 Python
Python实现大文件排序的方法
Jul 10 Python
pandas实现选取特定索引的行
Apr 20 Python
python 使用正则表达式按照多个空格分割字符的实例
Dec 20 Python
ML神器:sklearn的快速使用及入门
Jul 11 Python
在VS2017中用C#调用python脚本的实现
Jul 31 Python
讲解Python3中NumPy数组寻找特定元素下标的两种方法
Aug 04 Python
为什么说Python可以实现所有的算法
Oct 04 Python
关于Python 常用获取元素 Driver 总结
Nov 24 Python
python如何进行矩阵运算
Jun 05 Python
Python入门学习之类的相关知识总结
May 25 Python
浅谈Tensorflow 动态双向RNN的输出问题
Jan 20 #Python
关于tf.nn.dynamic_rnn返回值详解
Jan 20 #Python
双向RNN:bidirectional_dynamic_rnn()函数的使用详解
Jan 20 #Python
关于tf.reverse_sequence()简述
Jan 20 #Python
tensorflow使用range_input_producer多线程读取数据实例
Jan 20 #Python
浅谈tensorflow中Dataset图片的批量读取及维度的操作详解
Jan 20 #Python
使用tensorflow DataSet实现高效加载变长文本输入
Jan 20 #Python
You might like
解析php中static,const与define的使用区别
2013/06/18 PHP
Yii2实现多域名跨域同步登录退出
2017/02/04 PHP
弹出广告特效(一个IP只弹出一次)的代码
2007/07/27 Javascript
跨浏览器的事件对象介绍
2012/06/27 Javascript
一个背景云变换js特效 鼠标移动背景云变化
2012/12/28 Javascript
showModalDialog在谷歌浏览器下会返回Null的解决方法
2013/11/27 Javascript
JS获取iframe中longdesc属性的方法
2015/04/01 Javascript
js中遍历Map对象的方法
2016/07/27 Javascript
js绘制购物车抛物线动画
2020/11/18 Javascript
Three.js入门之hello world以及如何绘制线
2017/09/25 Javascript
vue使用axios实现文件上传进度的实时更新详解
2017/12/20 Javascript
深入理解JavaScript 中的执行上下文和执行栈
2018/10/23 Javascript
通过layer实现可输入的模态框的例子
2019/09/27 Javascript
vue 微信扫码登录(自定义样式)
2020/01/06 Javascript
Vue组件化开发之通用型弹出框的实现
2020/02/28 Javascript
小程序富文本提取图片可放大缩小
2020/05/26 Javascript
基于PHP pthreads实现多线程代码实例
2020/06/24 Javascript
在vue中实现echarts随窗体变化
2020/07/27 Javascript
angular8.5集成TinyMce5的使用和详细配置(推荐)
2020/11/16 Javascript
[03:30]完美盛典趣味短片 CSGO2019年度名场面
2019/12/07 DOTA
使用python实现拉钩网上的FizzBuzzWhizz问题示例
2014/05/05 Python
wxPython框架类和面板类的使用实例
2014/09/28 Python
Python实现telnet服务器的方法
2015/07/10 Python
Python中py文件转换成exe可执行文件的方法
2019/06/14 Python
pyqt5移动鼠标显示坐标的方法
2019/06/21 Python
pytorch实现seq2seq时对loss进行mask的方式
2020/02/18 Python
matplotlib subplot绘制多个子图的方法示例
2020/07/28 Python
Melissa鞋马来西亚官方网站:MDreams马来西亚
2018/04/05 全球购物
乌克兰在线电子产品商店:MTA
2019/11/14 全球购物
优秀的教师个人的中文求职信
2013/09/21 职场文书
会计工作心得体会
2014/01/13 职场文书
2014年社区植树节活动方案
2014/02/28 职场文书
中学生家长评语大全
2014/04/16 职场文书
生日宴会策划方案
2014/06/03 职场文书
python 爬取华为应用市场评论
2021/05/29 Python
关于MySQL临时表为什么可以重名的问题
2022/03/22 MySQL