编程 Python

Python实现的文轩网爬虫完整示例

Posted in Python on May 16, 2019

本文实例讲述了Python实现的文轩网爬虫。分享给大家供大家参考，具体如下：

# encoding=utf8
# NOTE(review): Python only honours an encoding declaration on the first or
# second line of a file; move this line to the top when extracting the script.
import pymysql
import time
import sys
import requests
import os
# Used to report caught errors
import traceback
import types
# Used to HTML-escape the scraped markup
import cgi
import warnings
# Python 2 only: reload() restores sys.setdefaultencoding so that implicit
# str <-> unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
from pyquery import PyQuery as pq
from lxml import etree
# Silence library warnings
warnings.filterwarnings("ignore")
# Download an image
def dowloadPic(imageUrl,filePath):
    """Fetch ``imageUrl`` and save the response body to ``filePath``.

    Args:
        imageUrl: URL of the image to request (60 second timeout).
        filePath: Destination path; opened in binary write mode.

    Returns:
        404 when the server answers with HTTP 404 (nothing is written in
        that case); otherwise None after the body has been written.
    """
    r = requests.get(imageUrl, timeout=60)
    if r.status_code == 404:
        return 404
    with open(filePath, "wb") as image_file:
        image_file.write(r.content)
# Scrape one book detail page and insert the result into the database
def _book_exists(cursor, isbn):
    """Return True when bi_book already holds a row with this ISBN."""
    # Parameterised so that scraped text is never spliced into the statement.
    cursor.execute('select count(*) from bi_book where isbn = %s', (isbn,))
    return cursor.fetchone()['count(*)'] != 0


def _parse_book_attributes(c):
    """Read the attribute list of a detail page.

    Returns a ``(fields, isbns)`` pair: ``fields`` maps bi_book column names
    to the scraped values (pre-filled with defaults), ``isbns`` lists every
    non-empty ISBN value found, in page order.
    """
    fields = {
        'author': '',
        'press_name': '',
        'edition': '',
        'impressions': '',
        'packaging': '',
        'size': '',
        'page_num': '',
        'word_num': '',
        'press_time': '1970-01-01',
        'print_time': '1970-01-01',
    }
    # (text that identifies the row, text removed to obtain the value)
    isbn_labels = (
        ('I S B N', 'I S B N：'),
        ('isbn：', 'isbn：'),
    )
    plain_labels = (
        ('作者：', 'author'),
        ('出版社：', 'press_name'),
        ('版次：', 'edition'),
        ('印次：', 'impressions'),
        ('装帧：', 'packaging'),
        ('开本：', 'size'),
        ('页数：', 'page_num'),
        ('字数：', 'word_num'),
    )
    date_labels = (
        ('出版时间：', 'press_time'),
        ('印刷时间：', 'print_time'),
    )
    isbns = []
    items = c('#page').find('.cont').find('li')
    # Only the first 12 list items are inspected.
    for i in range(12):
        # Drop full-width spaces before matching the labels.
        text = items.eq(i).text().replace('　', '')
        for marker, prefix in isbn_labels:
            if marker in text:
                value = text.replace(prefix, '').strip()
                if value:
                    isbns.append(value)
        for label, key in plain_labels:
            if label in text:
                fields[key] = text.replace(label, '')
        for label, key in date_labels:
            if label in text:
                value = text.replace(label, '').strip()
                # The page shows '无' when no date is available.
                fields[key] = '1970-01-01' if value == '无' else value
    return fields, isbns


def _section_html(heading):
    """Return the UTF-8 encoded '.text-words-1' markup following ``heading``.

    Returns an empty string when the section has no such markup.
    """
    html = pq(heading.nextAll())('.text-words-1').html()
    if html is None:
        return ''
    return html.encode("utf8", "ignore")


def _store_images(isbn, big_path, small_path):
    """Download the cover images and return ``(img_big, img_small)``.

    The returned values are the relative paths recorded in the database.
    Images are only downloaded when the per-ISBN directory does not exist
    yet; an existing directory is taken to mean they are already there.
    """
    if big_path == '' or small_path == '':
        return 'none-picture/none-big.jpg', 'none-picture/none-small.jpg'
    img_small = 'download/'+isbn+'/small'+isbn+'.jpg'
    img_big = 'download/'+isbn+'/big'+isbn+'.jpg'
    # Directory of the running script, with forward slashes on every platform.
    root_path = sys.path[0].replace('\\', '/')
    isbn_path = root_path+'/download/'+isbn
    if os.path.isdir(isbn_path):
        return img_big, img_small
    # makedirs also creates the 'download' parent on the first run.
    os.makedirs(isbn_path)
    if dowloadPic(small_path, isbn_path+"/small"+isbn+".jpg") == 404:
        img_small = 'none-picture/none-small.jpg'
    if dowloadPic(big_path, isbn_path+'/big'+isbn+".jpg") == 404:
        img_big = 'none-picture/none-big.jpg'
    return img_big, img_small


def getData(final_url):
    """Scrape the book detail page at ``final_url`` and insert it into bi_book.

    The URL is first written to ./url.txt, then the page is parsed, the cover
    images are downloaded and one row is inserted.

    Args:
        final_url: Address of a book detail page.

    Returns:
        'back' when the page is skipped (it cannot be loaded, it is not a
        book page, it has no ISBN or cover image element, the ISBN is
        already stored, or the insert fails); None after a successful insert.

    Raises:
        Any exception raised by file, database or parsing calls outside the
        guarded sections propagates to the caller.
    """
    # Record the URL currently being processed.
    with open('./url.txt', 'w') as url_file:
        url_file.write(final_url)
    # Connect to the database
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')
    try:
        cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
        # Load and parse the detail page
        detail_url = final_url
        try:
            c = pq(detail_url)
            head = c('html').attr('xmlns')
        except Exception:
            return 'back'
        # Pages carrying one of these xmlns values are skipped.
        if head in ('http://www.w3.org/1999/xhtml', 'http://www.winxuan.com/cms/2016db_sh'):
            return 'back'

        fields, isbns = _parse_book_attributes(c)
        if not isbns:
            # Without an ISBN there is neither a key nor an image directory.
            return 'back'
        for candidate in isbns:
            if _book_exists(cursor, candidate):
                print(u'已存在')
                return 'back'
        isbn = isbns[-1]

        # Address of the site's "no picture" placeholder
        none_img = 'http://static.winxuancdn.com/goods/sml_blank.jpg'
        # Large and small cover image addresses
        big_path = c('.info-side').find('.img').find('a').find('img').attr('src')
        if big_path is None:
            return 'back'
        if big_path == none_img:
            big_path = ''
            small_path = ''
        else:
            small_path = big_path.replace('_16', '_11')

        # Category: text of the last link in the first '.base-nav' element
        ahtml = c('#page').find('.base-nav').eq(0).html()
        category = pq(ahtml)('a:last').text()
        # Book title
        name = c('.info-main').find('.name').eq(0).find('h1').eq(0).text().strip()
        # Price without the currency sign
        price = c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()
        price = price.replace('¥', '')

        # Summary and table of contents; both stay empty when absent
        content = ''
        directory = ''
        titles = c('#page').find('.title')
        for k in range(5, 12):
            heading = titles.eq(k)
            title = heading.find('.tab').find('h4').text()
            if '内容简介' in title:
                content = _section_html(heading)
            if '目录' in title:
                directory = _section_html(heading)
        details = '内容简介<br>'+content+'<br><br>目录<br>'+directory
        details = cgi.escape(details)

        # Date the record is added (local time)
        add_time = time.strftime('%Y-%m-%d')
        img_big, img_small = _store_images(isbn, big_path, small_path)
        source_type = 3

        # Values in the same order as the column list below
        row = [0, source_type, category, details, detail_url, price, add_time,
               fields['packaging'], fields['print_time'], fields['impressions'],
               name, fields['author'], fields['press_name'], isbn,
               fields['edition'], fields['size'], fields['press_time'],
               fields['page_num'], fields['word_num'], img_big, img_small]
        sql = "insert into bi_book (book_id,source_type,category,details,detail_url,price,add_time,packaging,print_time,impressions,name,author,press_name,isbn,edition,size,press_time,page_num,word_num,img_big,img_small) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            if cursor.execute(sql, row) == 1:
                print(u'插入成功')
            conn.commit()
        except Exception:
            # Report the failure instead of hiding it, and undo the partial write.
            traceback.print_exc()
            conn.rollback()
            return 'back'
    finally:
        # Release the connection on every exit path.
        conn.close()
def winxuan(n, max_links=248):
    """Crawl the books linked from the n-th category of the winxuan home page.

    Args:
        n: Zero-based index of the category link inside '.mod-mainmenu'.
        max_links: How many links of the category page's '.main' area are
            considered, counted from the first one.

    Returns:
        'backs' when the category has no link or its page cannot be loaded;
        None once every collected link has been handed to getData.
    """
    # Parse the home page
    home_url = 'http://www.winxuan.com/'
    h = pq(home_url)
    # Link of the requested category in the navigation menu
    menu = h('.mod-mainmenu').find('dd').find('a').eq(n).attr('href')
    if not menu:
        return 'backs'
    # First page of the category
    try:
        mh = pq(menu)
    except Exception:
        return 'backs'
    # Collect the link targets; the set removes duplicates and anchors
    # without an href are left out.
    anchors = mh('.main').find('a')
    detail_urls = set()
    for u in range(max_links):
        href = anchors.eq(u).attr('href')
        if href:
            detail_urls.add(href)
    for final_url in detail_urls:
        try:
            getData(final_url)
        except Exception:
            # One broken page must not stop the crawl; report it and go on.
            traceback.print_exc()
            continue
    print('OK,finished')
# Crawl category links 0..57 in order. The index being processed is written
# to ./number.txt before each category is crawled.
n = 0
while n < 58:
    print(n)
    with open('./number.txt', 'w') as progress_file:
        progress_file.write(str(n))
    # winxuan returns 'backs' for a category it could not load; either way
    # the loop simply moves on to the next index.
    res = winxuan(n)
    n += 1

更多关于Python相关内容可查看本站专题：《Python Socket编程技巧总结》、《Python正则表达式用法总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》

希望本文所述对大家Python程序设计有所帮助。

Python实现的文轩网爬虫完整示例

- Author -

小鹏程序

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

python学习笔记：字典的使用示例详解

Jun 13 Python

老生常谈Python startswith()函数与endswith函数

Sep 08 Python

浅谈Python由__dict__和dir()引发的一些思考

Oct 30 Python

Python3实现的字典、列表和json对象互转功能示例

May 22 Python

python opencv实现运动检测

Jul 10 Python

python TKinter获取文本框内容的方法

Oct 11 Python

使用Python为中秋节绘制一块美味的月饼

Sep 11 Python

python 使用shutil复制图片的例子

Dec 13 Python

wxpython多线程防假死与线程间传递消息实例详解

Dec 13 Python

keras处理欠拟合和过拟合的实例讲解

May 25 Python

python3.4中清屏的处理方法

Jul 06 Python

python分分钟绘制精美地图海报

Feb 15 Python

计算机二级python学习教程（2） python语言基本语法元素

May 16 #Python

计算机二级python学习教程（1）教大家如何学习python

May 16 #Python

详解Python传入参数的几种方法

May 16 #Python

[机器视觉]使用python自动识别验证码详解

May 16 #Python

Python redis操作实例分析【连接、管道、发布和订阅等】

May 16 #Python

Python操作redis实例小结【String、Hash、List、Set等】

May 16 #Python

Python 实现数据结构中的栈队列

May 16 #Python