爬虫代理池Python3WebSpider源代码测试过程解析


Posted in Python onDecember 20, 2019

这篇文章主要介绍了爬虫代理池Python3WebSpider源代码测试过程解析,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下

元类属性的使用

代码

主要关于元类的使用

通过获取由元类生成的爬虫抓取类的部分属性.这里为抓取函数,以相同的字符开头的抓取函数,生成属性列表,这样可以持续调用.目的是可以仅仅添加不同的抓取函数抓取不同的网站,而类的其他部分不用做调整.

部分代码:

class ProxyMetaclass(type):
  def __new__(cls, name, bases, attrs):
    count = 0
    attrs['__CrawlFunc__'] = []
    for k, v in attrs.items():
      if 'crawl_' in k:
        attrs['__CrawlFunc__'].append(k)
        count += 1
    attrs['__CrawlFuncCount__'] = count
    return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
  def get_proxies(self, callback):
    proxies = []
    for proxy in eval("self.{}()".format(callback)):
      print('成功获取到代理', proxy)
      proxies.append(proxy)
    return proxies
    
  def crawl_daili66(self, page_count=4):
    """
    获取代理66
    :param page_count: 页码
    :return: 代理
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
      print('Crawling', url)
      html = get_page(url)
      if html:
        doc = pq(html)
        trs = doc('.containerbox table tr:gt(0)').items()
        for tr in trs:
          ip = tr.find('td:nth-child(1)').text()
          port = tr.find('td:nth-child(2)').text()
          yield ':'.join([ip, port])

测试方法

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time  : 12/19/19 4:10 PM
# @Author : yon
# @Email  : @qq.com
# @File  : test


import json
import re
from pyquery import PyQuery as pq


class ProxyMetaclass(type):
  def __new__(cls, name, bases, attrs):
    count = 0
    attrs['__CrawlFunc__'] = []
    for k, v in attrs.items():
      print("打印k")
      print(k)
      print("打印v")
      print(v)
      if 'crawl_' in k:
        attrs['__CrawlFunc__'].append(k)
        count += 1
    attrs['__CrawlFuncCount__'] = count
    return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
  def get_proxies(self, callback):
    proxies = []
    for proxy in eval("self.{}()".format(callback)):
      print('成功获取到代理', proxy)
      proxies.append(proxy)
    return proxies

  def crawl_daili66(self, page_count=4):
    """
    获取代理66
    :param page_count: 页码
    :return: 代理
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
      print('Crawling', url)
      html = get_page(url)
      if html:
        doc = pq(html)
        trs = doc('.containerbox table tr:gt(0)').items()
        for tr in trs:
          ip = tr.find('td:nth-child(1)').text()
          port = tr.find('td:nth-child(2)').text()
          yield ':'.join([ip, port])

  def crawl_ip3366(self):
    for page in range(1, 4):
      start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
      html = get_page(start_url)
      ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
      # \s * 匹配空格,起到换行作用
      re_ip_address = ip_address.findall(html)
      for address, port in re_ip_address:
        result = address + ':' + port
        yield result.replace(' ', '')

  def crawl_kuaidaili(self):
    for i in range(1, 4):
      start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
      html = get_page(start_url)
      if html:
        ip_address = re.compile('<td data-title="IP">(.*?)</td>')
        re_ip_address = ip_address.findall(html)
        port = re.compile('<td data-title="PORT">(.*?)</td>')
        re_port = port.findall(html)
        for address, port in zip(re_ip_address, re_port):
          address_port = address + ':' + port
          yield address_port.replace(' ', '')

  def crawl_xicidaili(self):
    for i in range(1, 3):
      start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
      headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
        'Host': 'www.xicidaili.com',
        'Referer': 'http://www.xicidaili.com/nn/3',
        'Upgrade-Insecure-Requests': '1',
      }
      html = get_page(start_url, options=headers)
      if html:
        find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
        trs = find_trs.findall(html)
        for tr in trs:
          find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
          re_ip_address = find_ip.findall(tr)
          find_port = re.compile('<td>(\d+)</td>')
          re_port = find_port.findall(tr)
          for address, port in zip(re_ip_address, re_port):
            address_port = address + ':' + port
            yield address_port.replace(' ', '')

  def crawl_ip3366(self):
    for i in range(1, 4):
      start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
      html = get_page(start_url)
      if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
          find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
          re_ip_address = find_ip.findall(trs[s])
          find_port = re.compile('<td>(\d+)</td>')
          re_port = find_port.findall(trs[s])
          for address, port in zip(re_ip_address, re_port):
            address_port = address + ':' + port
            yield address_port.replace(' ', '')

  def crawl_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
      find_tr = re.compile('<tr>(.*?)</tr>', re.S)
      trs = find_tr.findall(html)
      for s in range(1, len(trs)):
        find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
        re_ip_address = find_ip.findall(trs[s])
        find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
        re_port = find_port.findall(trs[s])
        for address, port in zip(re_ip_address, re_port):
          address_port = address + ':' + port
          yield address_port.replace(' ', '')

  def crawl_data5u(self):
    start_url = 'http://www.data5u.com/free/gngn/index.shtml'
    headers = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
      'Cache-Control': 'max-age=0',
      'Connection': 'keep-alive',
      'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
      'Host': 'www.data5u.com',
      'Referer': 'http://www.data5u.com/free/index.shtml',
      'Upgrade-Insecure-Requests': '1',
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    }
    html = get_page(start_url, options=headers)
    if html:
      ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>', re.S)
      re_ip_address = ip_address.findall(html)
      for address, port in re_ip_address:
        result = address + ':' + port
        yield result.replace(' ', '')


class Getter():
  def __init__(self):
    self.crawler = Crawler()

  def run(self):
    print('获取器开始执行')
    for callback_label in range(self.crawler.__CrawlFuncCount__):
      print(callback_label)
      callback = self.crawler.__CrawlFunc__[callback_label]
      print(callback)
      # # 获取代理
      # proxies = self.crawler.get_proxies(callback)
      # sys.stdout.flush()
      # for proxy in proxies:
      #   self.redis.add(proxy)
if __name__ == '__main__':
  get = Getter()
  get.run()

测试结果

/home/baixiaoxu/PycharmProjects/pytthon-tt/venv/bin/python /home/baixiaoxu/PycharmProjects/pytthon-tt/proxypool/test.py
打印k
__module__
打印v
__main__
打印k
__qualname__
打印v
Crawler
打印k
get_proxies
打印v
<function Crawler.get_proxies at 0x7f905ca5a598>
打印k
crawl_daili66
打印v
<function Crawler.crawl_daili66 at 0x7f905ca5a620>
打印k
crawl_ip3366
打印v
<function Crawler.crawl_ip3366 at 0x7f905ca5a840>
打印k
crawl_kuaidaili
打印v
<function Crawler.crawl_kuaidaili at 0x7f905ca5a730>
打印k
crawl_xicidaili
打印v
<function Crawler.crawl_xicidaili at 0x7f905ca5a7b8>
打印k
crawl_iphai
打印v
<function Crawler.crawl_iphai at 0x7f905ca5a6a8>
打印k
crawl_data5u
打印v
<function Crawler.crawl_data5u at 0x7f905ca5a8c8>
打印k
__CrawlFunc__
打印v
['crawl_daili66', 'crawl_ip3366', 'crawl_kuaidaili', 'crawl_xicidaili', 'crawl_iphai', 'crawl_data5u']
获取器开始执行
0
crawl_daili66
1
crawl_ip3366
2
crawl_kuaidaili
3
crawl_xicidaili
4
crawl_iphai
5
crawl_data5u

进程完成,退出码 0

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
python解析json实例方法
Nov 19 Python
python和shell变量互相传递的几种方法
Nov 20 Python
python中的对象拷贝示例 python引用传递
Jan 23 Python
Python中的index()方法使用教程
May 18 Python
Django框架中处理URLconf中特定的URL的方法
Jul 20 Python
python2.7读取文件夹下所有文件名称及内容的方法
Feb 24 Python
Python增强赋值和共享引用注意事项小结
May 28 Python
python模拟点击网页按钮实现方法
Feb 25 Python
Python pip install如何修改默认下载路径
Apr 29 Python
sublime3之内网安装python插件Anaconda的流程
Nov 10 Python
用python获取txt文件中关键字的数量
Dec 24 Python
python cv2图像质量压缩的算法示例
Jun 04 Python
python3的UnicodeDecodeError解决方法
Dec 20 #Python
基于python调用psutil模块过程解析
Dec 20 #Python
python如何使用jt400.jar包代码实例
Dec 20 #Python
基于python使用tibco ems代码实例
Dec 20 #Python
使用python实现数组、链表、队列、栈的方法
Dec 20 #Python
python隐藏类中属性的3种实现方法
Dec 19 #Python
Python合并2个字典成1个新字典的方法(9种)
Dec 19 #Python
You might like
DEDE采集大师官方留后门的删除办法
2011/01/08 PHP
mysqli扩展无法在PHP7下升级问题的解决
2019/09/10 PHP
Javascript 遍历页面text控件详解
2014/01/06 Javascript
js中用window.open()打开多个窗口的name问题
2014/03/13 Javascript
js获取元素外链样式的方法
2015/01/27 Javascript
javascript实现行拖动的方法
2015/05/27 Javascript
javascript实现列表滚动的方法
2015/07/30 Javascript
Bootstrap入门书籍之(五)导航条、分页导航
2016/02/17 Javascript
JS实现的打字机效果完整实例
2016/06/20 Javascript
几种二级联动案例(jQuery\Array\Ajax php)
2016/08/13 Javascript
vue双向数据绑定原理探究(附demo)
2017/01/17 Javascript
用原生JS实现简单的多选框功能
2017/06/12 Javascript
Angular实现双向折叠列表组件的示例代码
2017/11/21 Javascript
vue.js内置组件之keep-alive组件使用
2018/07/10 Javascript
JavaScript模拟实现自由落体效果
2018/08/28 Javascript
js实现黑白div块画空心的图形
2018/12/13 Javascript
原生JS 实现的input输入时表格过滤操作示例
2019/08/03 Javascript
Vue代码整洁之去重方法整理
2019/08/06 Javascript
js脚本中执行java后台代码方法解析
2019/10/11 Javascript
在vue中实现禁止回退上一步,路由不存历史记录
2020/07/22 Javascript
jQuery实现本地存储
2020/12/22 jQuery
[56:35]DOTA2上海特级锦标赛C组小组赛#1 OG VS Archon第二局
2016/02/27 DOTA
Python常用内置函数总结
2015/02/08 Python
Python实现短网址ShortUrl的Hash运算实例讲解
2015/08/10 Python
Python实现句子翻译功能
2017/11/14 Python
使用django实现一个代码发布系统
2019/07/18 Python
Python编写带选项的命令行程序方法
2019/08/13 Python
python中提高pip install速度
2020/02/14 Python
django自带的权限管理Permission用法说明
2020/05/13 Python
一个入门级python爬虫教程详解
2021/01/27 Python
前端使用canvas生成盲水印的加密解密的实现
2020/12/16 HTML / CSS
小学毕业典礼演讲稿
2014/09/09 职场文书
医德医魂心得体会
2014/09/11 职场文书
北京天坛导游词
2015/02/12 职场文书
javascript拖曳互换div的位置实现示例
2021/06/28 Javascript
MySQL中TIMESTAMP类型返回日期时间数据中带有T的解决
2022/12/24 MySQL