Testing the Python3WebSpider Crawler Proxy Pool Source Code: A Walkthrough


Posted in Python on December 20, 2019

This article walks through testing the crawler proxy pool source code from Python3WebSpider. The example code is covered in detail, so it should be a useful reference for study or work; interested readers can follow along.

Using Metaclass Attributes

Code

This part is mainly about how the metaclass is used.

The metaclass collects certain attributes of the crawler class it generates: here, the crawl functions, i.e. the methods whose names start with the same prefix, are gathered into a list attribute so they can be called one after another. The goal is that crawling a different site only requires adding another crawl function; no other part of the class needs to change.

Partial code:

class ProxyMetaclass(type):
  def __new__(cls, name, bases, attrs):
    count = 0
    attrs['__CrawlFunc__'] = []
    for k, v in attrs.items():
      # Register every method whose name starts with 'crawl_'
      if k.startswith('crawl_'):
        attrs['__CrawlFunc__'].append(k)
        count += 1
    attrs['__CrawlFuncCount__'] = count
    return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
  def get_proxies(self, callback):
    proxies = []
    # Call the crawl method whose name was passed in as a string
    for proxy in eval("self.{}()".format(callback)):
      print('Successfully got proxy', proxy)
      proxies.append(proxy)
    return proxies

  def crawl_daili66(self, page_count=4):
    """
    Fetch proxies from daili66
    :param page_count: number of pages
    :return: proxy
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
      print('Crawling', url)
      html = get_page(url)
      if html:
        doc = pq(html)
        # Skip the header row of the proxy table
        trs = doc('.containerbox table tr:gt(0)').items()
        for tr in trs:
          ip = tr.find('td:nth-child(1)').text()
          port = tr.find('td:nth-child(2)').text()
          yield ':'.join([ip, port])
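In get_proxies above, the crawl method is looked up by formatting its name into a string and handing it to eval. A slightly safer drop-in replacement, sketched here with identical behavior, resolves the name with getattr instead:

  def get_proxies(self, callback):
    proxies = []
    # getattr resolves the method by name, avoiding eval on a built string
    for proxy in getattr(self, callback)():
      print('Successfully got proxy', proxy)
      proxies.append(proxy)
    return proxies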

Test Method

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time  : 12/19/19 4:10 PM
# @Author : yon
# @Email  : @qq.com
# @File  : test


import json
import re
from pyquery import PyQuery as pq

# Note: the crawl methods below also rely on a get_page(url, options=None)
# helper from the proxy pool project, which is not imported here. The test
# run still exits cleanly because the proxy-fetching lines in Getter.run()
# are commented out, so get_page is never called. A minimal stand-in is
# sketched after this script.


class ProxyMetaclass(type):
  def __new__(cls, name, bases, attrs):
    count = 0
    attrs['__CrawlFunc__'] = []
    for k, v in attrs.items():
      print("printing k")
      print(k)
      print("printing v")
      print(v)
      # Register every method whose name starts with 'crawl_'
      if k.startswith('crawl_'):
        attrs['__CrawlFunc__'].append(k)
        count += 1
    attrs['__CrawlFuncCount__'] = count
    return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
  def get_proxies(self, callback):
    proxies = []
    # Call the crawl method whose name was passed in as a string
    for proxy in eval("self.{}()".format(callback)):
      print('Successfully got proxy', proxy)
      proxies.append(proxy)
    return proxies

  def crawl_daili66(self, page_count=4):
    """
    Fetch proxies from daili66
    :param page_count: number of pages
    :return: proxy
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
      print('Crawling', url)
      html = get_page(url)
      if html:
        doc = pq(html)
        # Skip the header row of the proxy table
        trs = doc('.containerbox table tr:gt(0)').items()
        for tr in trs:
          ip = tr.find('td:nth-child(1)').text()
          port = tr.find('td:nth-child(2)').text()
          yield ':'.join([ip, port])

  def crawl_ip3366(self):
    for page in range(1, 4):
      start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
      html = get_page(start_url)
      # Guard against a failed fetch; findall(None) would raise TypeError
      if html:
        # \s* matches whitespace, including the newlines between tags
        ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
          result = address + ':' + port
          yield result.replace(' ', '')

  def crawl_kuaidaili(self):
    for i in range(1, 4):
      start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
      html = get_page(start_url)
      if html:
        ip_address = re.compile('<td data-title="IP">(.*?)</td>')
        re_ip_address = ip_address.findall(html)
        port = re.compile('<td data-title="PORT">(.*?)</td>')
        re_port = port.findall(html)
        for address, port in zip(re_ip_address, re_port):
          address_port = address + ':' + port
          yield address_port.replace(' ', '')

  def crawl_xicidaili(self):
    for i in range(1, 3):
      start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
      headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
        'Host': 'www.xicidaili.com',
        'Referer': 'http://www.xicidaili.com/nn/3',
        'Upgrade-Insecure-Requests': '1',
      }
      html = get_page(start_url, options=headers)
      if html:
        find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
        trs = find_trs.findall(html)
        for tr in trs:
          find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
          re_ip_address = find_ip.findall(tr)
          find_port = re.compile(r'<td>(\d+)</td>')
          re_port = find_port.findall(tr)
          for address, port in zip(re_ip_address, re_port):
            address_port = address + ':' + port
            yield address_port.replace(' ', '')

  # Note: this second definition of crawl_ip3366 replaces the one above in
  # the class namespace, so only this version ends up registered.
  def crawl_ip3366(self):
    for i in range(1, 4):
      start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
      html = get_page(start_url)
      if html:
        find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
          find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
          re_ip_address = find_ip.findall(trs[s])
          find_port = re.compile(r'<td>(\d+)</td>')
          re_port = find_port.findall(trs[s])
          for address, port in zip(re_ip_address, re_port):
            address_port = address + ':' + port
            yield address_port.replace(' ', '')

  def crawl_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
      find_tr = re.compile('<tr>(.*?)</tr>', re.S)
      trs = find_tr.findall(html)
      for s in range(1, len(trs)):
        find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
        re_ip_address = find_ip.findall(trs[s])
        find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
        re_port = find_port.findall(trs[s])
        for address, port in zip(re_ip_address, re_port):
          address_port = address + ':' + port
          yield address_port.replace(' ', '')

  def crawl_data5u(self):
    start_url = 'http://www.data5u.com/free/gngn/index.shtml'
    headers = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
      'Cache-Control': 'max-age=0',
      'Connection': 'keep-alive',
      'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
      'Host': 'www.data5u.com',
      'Referer': 'http://www.data5u.com/free/index.shtml',
      'Upgrade-Insecure-Requests': '1',
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    }
    html = get_page(start_url, options=headers)
    if html:
      ip_address = re.compile(r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
      re_ip_address = ip_address.findall(html)
      for address, port in re_ip_address:
        result = address + ':' + port
        yield result.replace(' ', '')


class Getter():
  def __init__(self):
    self.crawler = Crawler()

  def run(self):
    print('Getter starts running')
    for callback_label in range(self.crawler.__CrawlFuncCount__):
      print(callback_label)
      callback = self.crawler.__CrawlFunc__[callback_label]
      print(callback)
      # # Fetch proxies (disabled for this test; self.redis is not set up here)
      # proxies = self.crawler.get_proxies(callback)
      # sys.stdout.flush()
      # for proxy in proxies:
      #   self.redis.add(proxy)


if __name__ == '__main__':
  get = Getter()
  get.run()
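The crawl methods depend on a get_page helper that the script never defines. A minimal stand-in, assuming it simply returns the page HTML on success and None on failure (the behavior here is an assumption, not the project's exact implementation), could look like this:

import requests


def get_page(url, options=None):
  # Assumed behavior: fetch the URL with optional extra headers,
  # return the HTML text, or None on any failure
  headers = {'User-Agent': 'Mozilla/5.0'}
  if options:
    headers.update(options)
  try:
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
      return response.text
  except requests.RequestException:
    return None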

Test Results

/home/baixiaoxu/PycharmProjects/pytthon-tt/venv/bin/python /home/baixiaoxu/PycharmProjects/pytthon-tt/proxypool/test.py
printing k
__module__
printing v
__main__
printing k
__qualname__
printing v
Crawler
printing k
get_proxies
printing v
<function Crawler.get_proxies at 0x7f905ca5a598>
printing k
crawl_daili66
printing v
<function Crawler.crawl_daili66 at 0x7f905ca5a620>
printing k
crawl_ip3366
printing v
<function Crawler.crawl_ip3366 at 0x7f905ca5a840>
printing k
crawl_kuaidaili
printing v
<function Crawler.crawl_kuaidaili at 0x7f905ca5a730>
printing k
crawl_xicidaili
printing v
<function Crawler.crawl_xicidaili at 0x7f905ca5a7b8>
printing k
crawl_iphai
printing v
<function Crawler.crawl_iphai at 0x7f905ca5a6a8>
printing k
crawl_data5u
printing v
<function Crawler.crawl_data5u at 0x7f905ca5a8c8>
printing k
__CrawlFunc__
printing v
['crawl_daili66', 'crawl_ip3366', 'crawl_kuaidaili', 'crawl_xicidaili', 'crawl_iphai', 'crawl_data5u']
Getter starts running
0
crawl_daili66
1
crawl_ip3366
2
crawl_kuaidaili
3
crawl_xicidaili
4
crawl_iphai
5
crawl_data5u

Process finished with exit code 0
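As promised at the top, supporting another site only takes one more crawl method. A minimal sketch (the class and the yielded address are hypothetical placeholders, reusing the ProxyMetaclass defined above) shows that registration is automatic:

class DemoCrawler(metaclass=ProxyMetaclass):
  def crawl_example(self):
    # Hypothetical source: yields a fixed placeholder address
    yield '127.0.0.1:8080'


print(DemoCrawler.__CrawlFunc__)       # ['crawl_example']
print(DemoCrawler.__CrawlFuncCount__)  # 1

Because the method name starts with 'crawl_', the metaclass adds it to __CrawlFunc__ when the class is created; nothing else in the class has to change.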

That's all for this article. I hope it helps with your learning, and thanks for supporting 三水点靠木.
