python爬取本站电子书信息并入库的实现代码


Posted in Python onJanuary 20, 2020

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql

class DBUtils(object):
  def connDB(self):               #连接数据库
    conn=pymysql.connect(host='192.168.251.114',port=3306, user='root',passwd='b6f3g2',db='yangsj',charset='utf8');
    cur=conn.cursor();
    return (conn,cur);

  def exeUpdate(self,conn,cur,sql):        #更新或插入操作
    sta=cur.execute(sql);
    conn.commit();
    return (sta);

  def exeDelete(self,conn,cur,IDs):        #删除操作 demo 没用到
    sta=0;
    for eachID in IDs.split(' '):
      sta+=cur.execute("delete from students where Id=%d"%(int(eachID)));
    conn.commit();
    return (sta);

  def exeQuery(self,cur,sql):           #查找操作
    effect_row = cur.execute(sql);
    return (effect_row,cur);

  def connClose(self,conn,cur):          #关闭连接,释放资源
    cur.close();
    conn.close();

if __name__ == '__main__':
  dbUtil = DBUtils();
  conn,cur = dbUtil.connDB();

书籍操作文件 bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
logging.basicConfig(
  level=logging.INFO
)
class BookOperator(object):
  def __addBook(self,book):
    logging.info("add book:%s" % book.bookName);
    dbUtil = DBUtils();
    conn,cur = dbUtil.connDB();
    insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"%(book.bookName,book.downLoadUrl,book.mainInfo));
    dbUtil.exeUpdate(conn,cur,insertBookSql);
    dbUtil.connClose(conn,cur);
  def __selectLastBookId(self):
    logging.info("selectLastBookId ");
    dbUtil = DBUtils();
    conn,cur = dbUtil.connDB();
    selectLastBookSql = "select id from book order by id desc limit 1";
    effect_row,cur = dbUtil.exeQuery(cur,selectLastBookSql);
    bookId = cur.fetchone()[0];
    dbUtil.connClose(conn,cur);
    return bookId;
  def __addBookDownLoadInfos(self,downLoadInfos,bookId):
    logging.info("add bookId:%s" % bookId);
    dbUtil = DBUtils();
    conn,cur = dbUtil.connDB();
    for downLoadinfo in downLoadInfos:
      insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"%(bookId,downLoadinfo.downName,downLoadinfo.downUrl));
      dbUtil.exeUpdate(conn,cur,insertBookDownLoadInfo);
    dbUtil.connClose(conn,cur);
  def addBookInfo(self,book):
    logging.info("add bookInfo:%s" % book.bookName);
    self.__addBook(book);
    bookId = self.__selectLastBookId();
    self.__addBookDownLoadInfos(book.downLoadInfos,bookId);
if __name__ == '__main__':
  bookope = BookOperator();
  book = Book("aaa","yang","cccc");
  book.addDownLoadUrl(DownLoadInfo("aaa.html","书籍"));
  bookope.addBookInfo(book);

书籍信息文件 bookInfo.py

import sys
sys.encoding = "utf8"
class Book(object):
  #书籍信息#
  def __init__(self,mainInfo,downLoadUrl,bookName):
    self.mainInfo = mainInfo;
    self.downLoadUrl = downLoadUrl;
    self.bookName = bookName;
    self.downLoadInfos = [];
  def addDownLoadUrl(self,downloadInfo):
    self.downLoadInfos.append(downloadInfo);
  def print_book_info(self):
    print ("bookName :%s" % (self.bookName));
class DownLoadInfo(object):
  #下载信息#
  def __init__(self,downUrl,downName):
    self.downUrl = downUrl;
    self.downName = downName;
  def print_down_info(self):
    print ("downLoad %s - %s" % (self.downUrl,self.downName));

51job界面解析文件 FiveOneJobFetch.py

import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
sys.encoding = "utf8"
class PageFetch(object):
  host = "//3water.com/";  #域名+分类
  category = "books/"; #具体请求页
  def __init__(self,pageUrl):
    self.pageUrl = pageUrl; #完整URL
    self.url = PageFetch.host+PageFetch.category + pageUrl;
  def __getPageContent(self):
    req = requests.get(self.url);
    if req.status_code == 200:
      req.encoding = "gb2312";
      strText = req.text;
      return strText;
    else:
      return "";
  def getPageContent(url):
    req = requests.get(url);
    if req.status_code == 200:
      req.encoding = "gb2312";
      strText = req.text;
      return strText;
    else:
      return "";
  def __getMaxPageNumAndUrl(self):
    fetchUrl = self.pageUrl;
    #获取分页地址 分页url 形如 list45_2.html 2为页号#
    maxPageNum = 0;
    maxLink = "";
    while maxLink == "":
      url = PageFetch.host+PageFetch.category +fetchUrl;
      reqContent = PageFetch.getPageContent(url)
      soup = BeautifulSoup (reqContent,"html.parser");
      for ul in soup.select(".plist"):
        print ("数据");
        print (ul);
        maxPageNum = ul.select("strong")[0].text;
        alink = ul.select("a");
        if alink[-1]['href'] == "#":
          maxLink = alink[1]['href'];
        else:
          fetchUrl = alink[-1]['href'];
    return maxPageNum,maxLink;
  def __formatPage(self,pageNum):
    #格式化url 形如 list45_2.html#
    lineBeginSite = self.pageUrl.index("_")+1;
    docBeginSite = self.pageUrl.index(".");
    return self.pageUrl[:lineBeginSite]+str(pageNum+1)+self.pageUrl[docBeginSite:];
  def getBookPageList(self):
    #获取书籍每页的URL#
    shortPageList = [];
    maxPageNum,urlPattern = self.__getMaxPageNumAndUrl();
    for i in range(int(maxPageNum)):
      shortPageList.append(self.host +self.category+ self.__formatPage(i));
    return shortPageList;
  def getDownloadPage(url):
    downPage= [];
    reqContent = PageFetch.getPageContent(url);
    soup = BeautifulSoup (reqContent,"html.parser");
    for a in soup.select(".cur-cat-list .btn-dl"):
      downPage.append(PageFetch.host+a['href']);
    return downPage;
  def getBookInfo(url):
    logging.info("获取书籍信息url:%s" % url);
    reqContent = PageFetch.getPageContent(url);
    soup = BeautifulSoup (reqContent,"html.parser");
    mainInfo = (soup.select("#soft-intro"))[0].text.replace("截图:","").replace("'","");
    title = (soup.select("dl dt h1"))[0].text.replace("'","");
    book = Book(mainInfo,url,title);
    for ul in soup.select(".ul_Address"):
      for li in ul.select("li"):
        downLoadInfo = DownLoadInfo(li.select("a")[0]['href'],li.select("a")[0].text);
        book.addDownLoadUrl(downLoadInfo);
    return book;
if __name__ == '__main__':
  p = PageFetch("list152_1.html");
  shortPageList = p.getBookPageList();
  downPage= [];
  for page in shortPageList:
    downLoadPage = PageFetch.getDownloadPage(page);
    downPage = downPage+downLoadPage;
  print ("================汇总如下===============================");
  for bookDownLoadPage in downPage:
    book = PageFetch.getBookInfo(bookDownLoadPage);
    print (book.bookName+":%s" % book.downLoadUrl);
    for d in book.downLoadInfos:
      print ("%s - %s" % (d.downUrl,d.downName));
  # p = PageFetch("list977_1.html");
  # p = p.getMaxPageNumAndUrl();
  # print (p);

执行文件,以上文件copy在相同的文件夹下 执行此文件即可 51Job.py

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
  p = PageFetch(url);
  shortPageList = p.getBookPageList();
  bookOperator = BookOperator();
  downPage= [];
  for page in shortPageList:
    downLoadPage = PageFetch.getDownloadPage(page);
    downPage = downPage+downLoadPage;
  for bookDownLoadPage in downPage:
    book = PageFetch.getBookInfo(bookDownLoadPage);
    bookOperator.addBookInfo(book);
  print ("数据抓取成功:"+url);

if __name__ == '__main__':
  urls = ["list152_35.html","list300_2.html","list476_6.html","list977_2.html","list572_5.html","list509_2.html","list481_1.html","list576_1.html","list482_1.html","list483_1.html","list484_1.html"];
  for url in urls:
    main(url);

数据库表:书籍信息表和下载地址表

CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

Python 相关文章推荐
python中sets模块的用法实例
Sep 30 Python
Python使用bs4获取58同城城市分类的方法
Jul 08 Python
Python使用迭代器捕获Generator返回值的方法
Apr 05 Python
详谈Python高阶函数与函数装饰器(推荐)
Sep 30 Python
Python 忽略warning的输出方法
Oct 18 Python
Django为窗体加上防机器人的验证码功能过程解析
Aug 14 Python
对Django中内置的User模型实例详解
Aug 16 Python
Python Scrapy框架第一个入门程序示例
Feb 05 Python
Python 面向对象部分知识点小结
Mar 09 Python
python爬虫开发之使用python爬虫库requests,urllib与今日头条搜索功能爬取搜索内容实例
Mar 10 Python
python 实现单例模式的5种方法
Sep 23 Python
20行代码教你用python给证件照换底色的方法示例
Feb 05 Python
浅谈Tensorflow 动态双向RNN的输出问题
Jan 20 #Python
关于tf.nn.dynamic_rnn返回值详解
Jan 20 #Python
双向RNN:bidirectional_dynamic_rnn()函数的使用详解
Jan 20 #Python
关于tf.reverse_sequence()简述
Jan 20 #Python
tensorflow使用range_input_producer多线程读取数据实例
Jan 20 #Python
浅谈tensorflow中Dataset图片的批量读取及维度的操作详解
Jan 20 #Python
使用tensorflow DataSet实现高效加载变长文本输入
Jan 20 #Python
You might like
DOTA2 无惧惊涛骇浪 昆卡大型水友攻略
2020/04/20 DOTA
php中的实现trim函数代码
2007/03/19 PHP
php实现按文件名搜索文件的远程文件查找器
2014/05/10 PHP
Laravel框架实现的批量删除功能示例
2019/01/16 PHP
经常用到的JavasScript事件的翻译
2007/04/09 Javascript
检测jQuery.js是否已加载的判断代码
2011/05/20 Javascript
jQuery EasyUI API 中文文档 - EasyLoader 加载器
2011/09/29 Javascript
Jquery中给animation加更多的运作效果实例
2013/09/05 Javascript
javascript验证只能输入数字和一个小数点示例
2013/10/21 Javascript
JavaScript中的索引数组、关联数组和静态数组、动态数组讲解
2014/11/08 Javascript
浅谈Sizzle的“编译原理”
2015/04/14 Javascript
浅谈关于JavaScript API设计的一些建议和准则
2015/06/24 Javascript
JavaScript实现99乘法表及隔行变色实例代码
2016/02/24 Javascript
JavaScript性能优化总结之加载与执行
2016/08/11 Javascript
基于Bootstrap的Metronic框架实现页面链接收藏夹功能
2016/08/29 Javascript
想学习javascript JS和jQuery哪个重要 先学哪个
2016/12/11 Javascript
详解webpack2+React 实例demo
2017/09/11 Javascript
vue中动态绑定表单元素的属性方法
2018/02/23 Javascript
vue通过滚动行为实现从列表到详情,返回列表原位置的方法
2018/08/31 Javascript
js继承的这6种方式!(上)
2019/04/23 Javascript
vue-cli3.X快速创建项目的方法步骤
2019/11/14 Javascript
Python实例分享:快速查找出被挂马的文件
2014/06/08 Python
基于Python实现用户管理系统
2019/02/26 Python
关于Python中定制类的比较运算实例
2019/12/19 Python
微软台湾官方网站:Microsoft台湾
2018/08/15 全球购物
Melissa鞋英国官方网站:Nonnon
2019/05/01 全球购物
JBL美国官方商店:扬声器、耳机等
2019/12/01 全球购物
Myprotein荷兰官网:欧洲第一运动营养品牌
2020/07/11 全球购物
学期研究性学习个人的自我评价
2014/01/09 职场文书
决心书范文
2014/03/11 职场文书
图书馆志愿者活动总结
2014/06/27 职场文书
给领导的感谢信范文
2015/01/23 职场文书
快消品行业营销模式与盈利模式分享
2019/09/27 职场文书
正则表达式拆分url实例代码
2022/02/24 Java/Android
【海涛dota解说】一房久违的影魔魂守二连发
2022/04/01 DOTA
Vue2项目中对百度地图的封装使用详解
2022/06/16 Vue.js