Python WeChat Crawler Complete Example [Single-Threaded and Multi-Threaded]


Posted in Python on July 06, 2019

This article walks through a WeChat article crawler implemented in Python, in both a single-threaded and a multi-threaded version. It is shared here for your reference; the details are as follows.

Single-threaded version:

import urllib.request
import urllib.parse
import urllib.error
import re, time
headers = ("User-Agent",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
list_url = []
### Fetch the HTML content of a URL (optionally through a proxy)
def use_proxy(url):
  try:
    # To route requests through a proxy, uncomment these lines and supply proxy_addr:
    # proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    # opener = urllib.request.build_opener(proxy)
    # urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data
  except urllib.error.URLError as e:
    if hasattr(e, "code"):
      print(e.code)
    elif hasattr(e, "reason"):
      print(e.reason)
  except Exception as e:
    print("exception: " + str(e))
    time.sleep(1)
## Collect the list of article URLs to crawl
def get_url(key, pagestart, pageend):
  try:
    keycode = urllib.parse.quote(key)
    for page in range(pagestart, pageend + 1):
      # type=2 selects article results; the loop variable goes into the page parameter
      url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (
        keycode, page)
      data1 = use_proxy(url)
      listurl_pattern = '<h3>.*?("http://.*?)</h3>'
      result = re.compile(listurl_pattern, re.S).findall(data1)
      for i in range(len(result)):
        # strip the escaped ampersands and surrounding quotes from the matched href
        res = result[i].replace("amp;", "").split(" ")[0].replace("\"", "")
        list_url.append(res)
    return list_url
  except urllib.error.URLError as e:
    if hasattr(e, "code"):
      print(e.code)
    elif hasattr(e, "reason"):
      print(e.reason)
  except Exception as e:
    print("exception:", e)
## Fetch each collected URL and write title and body into one HTML file
def get_url_content(list_url):
  fh1 = open("D:\\python-script\\1.html", 'wb')
  html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>WeChat Articles</title></head>\n<body>'''
  fh1.write(html1.encode("utf-8"))
  fh1.close()
  fh = open("D:\\python-script\\1.html", 'ab')
  for url in list_url:
    data_content = use_proxy(url)
    title_pattern = '<h2.*>.*?</h2>'
    result_title = re.compile(title_pattern, re.S).findall(data_content)
    ## article title (str)
    res_title = result_title[0].replace("<h2 class=\"rich_media_title\" id=\"activity-name\">", "").replace("</h2>", "").strip()
    content_pattern = 'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">'
    content = re.compile(content_pattern, re.S).findall(data_content)
    try:
      fh.write(res_title.encode("utf-8"))
      for i in content:
        fh.write(i.strip().encode("utf-8"))
    except UnicodeEncodeError:
      continue
  fh.write("</body></html>".encode("utf-8"))
  fh.close()
if __name__ == '__main__':
  pagestart = 1
  pageend = 2
  key = "人工智能"
  get_url(key, pagestart, pageend)
  get_url_content(list_url)
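
Note that in both versions the proxy wiring inside use_proxy is commented out, so all requests actually go out directly. If you do need a proxy, the following minimal sketch shows the standard urllib approach; build_proxy_opener and proxy_addr are placeholders introduced here for illustration, not names from the original code:

import urllib.request
def build_proxy_opener(proxy_addr):
  # proxy_addr is a hypothetical placeholder, e.g. "127.0.0.1:8888"
  proxy = urllib.request.ProxyHandler({'http': proxy_addr, 'https': proxy_addr})
  opener = urllib.request.build_opener(proxy)
  opener.addheaders = [("User-Agent", "Mozilla/5.0")]
  urllib.request.install_opener(opener)  # subsequent urlopen() calls go through the proxy
  return opener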

Multi-threaded version:

import urllib.request
import urllib.parse
import urllib.error
import re, time
import queue
import threading
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
urlque = queue.Queue()
list_url = []
### Fetch the HTML content of a URL (optionally through a proxy)
def use_proxy(url):
  try:
    # To route requests through a proxy, uncomment these lines and supply proxy_addr:
    # proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    # opener = urllib.request.build_opener(proxy)
    # urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data
  except urllib.error.URLError as e:
    if hasattr(e, "code"):
      print(e.code)
    elif hasattr(e, "reason"):
      print(e.reason)
  except Exception as e:
    print("exception: " + str(e))
    time.sleep(1)
### Producer thread: collect the article URLs and put them on the queue
class get_url(threading.Thread):
  def __init__(self, key, pagestart, pageend, urlque):
    threading.Thread.__init__(self)
    self.pagestart = pagestart
    self.pageend = pageend
    self.key = key
    self.urlque = urlque
  def run(self):
    try:
      keycode = urllib.parse.quote(self.key)
      for page in range(self.pagestart, self.pageend + 1):
        # type=2 selects article results; the loop variable goes into the page parameter
        url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (keycode, page)
        data = use_proxy(url)
        listurl_pattern = '<h3>.*?("http://.*?)</h3>'
        result = re.compile(listurl_pattern, re.S).findall(data)
        if len(result) == 0:
          print("No usable URLs found")
          return
        for i in range(len(result)):
          res = result[i].replace("amp;", "").split(" ")[0].replace("\"", "")
          self.urlque.put(res)   ## enqueue; task_done() belongs to the consumer, not here
    except urllib.error.URLError as e:
      if hasattr(e, "code"):
        print(e.code)
      elif hasattr(e, "reason"):
        print(e.reason)
    except Exception as e:
      print("exception:", e)
## Consumer thread: take URLs off the queue and write article content to the HTML file
class get_url_content(threading.Thread):
  def __init__(self, urlque):
    threading.Thread.__init__(self)
    self.urlque = urlque
  def run(self):
    fh1 = open("D:\\python-script\\1.html", 'wb')
    html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>WeChat Articles</title></head>\n<body>'''
    fh1.write(html1.encode("utf-8"))
    fh1.close()
    fh = open("D:\\python-script\\1.html", 'ab')
    while True:
      try:
        # stop once the producer has finished and the queue stays empty
        url = self.urlque.get(timeout=15)
      except queue.Empty:
        break
      try:
        data_content = use_proxy(url)
        title_pattern = '<h2.*>.*?</h2>'
        result_title = re.compile(title_pattern, re.S).findall(data_content)
        ## article title
        res_title = result_title[0].replace("<h2 class=\"rich_media_title\" id=\"activity-name\">", "").replace("</h2>", "").strip()
        content_pattern = 'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">'
        content = re.compile(content_pattern, re.S).findall(data_content)
        fh.write(res_title.encode("utf-8"))
        for i in content:
          fh.write(i.strip().encode("utf-8"))
      except UnicodeEncodeError:
        continue
      finally:
        self.urlque.task_done()   ## pair each get() with a task_done()
    fh.write("</body></html>".encode("utf-8"))
    fh.close()   ## close only after the queue is drained, not inside the loop
## Monitor thread: report progress and announce completion once the queue drains
class contrl(threading.Thread):
  def __init__(self, urlqueue):
    threading.Thread.__init__(self)
    self.urlqueue = urlqueue
  def run(self):
    # the monitoring loop belongs in run(), not __init__, or it would block the main thread
    while True:
      print("The crawler is running")
      time.sleep(3)
      if self.urlqueue.empty():
        print("The crawl has finished")
        break
if __name__ == '__main__':
  pagestart = 1
  pageend = 2
  key = "人工智能"
  url_thread = get_url(key, pagestart, pageend, urlque)   # avoid shadowing the class name
  url_thread.start()
  content_thread = get_url_content(urlque)
  content_thread.start()
  control_thread = contrl(urlque)
  control_thread.start()
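
The contrl thread above polls the queue every three seconds to decide when the crawl is done. A more robust pattern, shown below as an alternative sketch rather than part of the original article, is to have the producer put a sentinel value on the queue when it finishes, so the consumer knows exactly when to stop without polling:

import queue
import threading
SENTINEL = None   # hypothetical marker meaning "no more URLs"
def producer(q, urls):
  for u in urls:
    q.put(u)
  q.put(SENTINEL)   # signal that the stream is finished
def consumer(q):
  while True:
    item = q.get()
    if item is SENTINEL:
      break   # clean shutdown, no polling needed
    print("processing", item)
q = queue.Queue()
t1 = threading.Thread(target=producer, args=(q, ["url1", "url2"]))
t2 = threading.Thread(target=consumer, args=(q,))
t1.start(); t2.start()
t1.join(); t2.join()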


We hope this article is helpful to readers in their Python programming.
