Python开发实例分享bt种子爬虫程序和种子解析


Posted in Python onMay 21, 2014

看到网上也有开源的代码,这不,我拿来进行了二次重写,呵呵,上代码:

    #encoding: utf-8  
    import socket  
    from hashlib import sha1  
    from random import randint  
    from struct import unpack, pack  
    from socket import inet_aton, inet_ntoa  
    from bisect import bisect_left  
    from threading import Timer  
    from time import sleep  
    import MySQLdb  
    from datetime import *  
    import time  
    from bencode import bencode, bdecode  
    BOOTSTRAP_NODES = [  
        ("router.bittorrent.com", 6881),  
        ("dht.transmissionbt.com", 6881),  
        ("router.utorrent.com", 6881)  
    ]   
    TID_LENGTH = 4  
    KRPC_TIMEOUT = 10  
    REBORN_TIME = 5 * 60  
    K = 8  
    def entropy(bytes):  
        s = ""  
        for i in range(bytes):  
            s += chr(randint(0, 255))  
        return s  
    def random_id():  
        hash = sha1()  
        hash.update( entropy(20) )  
        return hash.digest()  
    def decode_nodes(nodes):  
        n = []  
        length = len(nodes)  
        if (length % 26) != 0:   
            return n  
        for i in range(0, length, 26):  
            nid = nodes[i:i+20]  
            ip = inet_ntoa(nodes[i+20:i+24])  
            port = unpack("!H", nodes[i+24:i+26])[0]  
            n.append( (nid, ip, port) )  
        return n  
    def encode_nodes(nodes):  
        strings = []  
        for node in nodes:  
            s = "%s%s%s" % (node.nid, inet_aton(node.ip), pack("!H", node.port))  
            strings.append(s)  
        return "".join(strings)  
    def intify(hstr):  
        return long(hstr.encode('hex'), 16)      
    def timer(t, f):  
        Timer(t, f).start()  
    class BucketFull(Exception):  
        pass  
    class KRPC(object):  
        def __init__(self):  
            self.types = {  
                "r": self.response_received,  
                "q": self.query_received  
            }  
            self.actions = {  
                "ping": self.ping_received,  
                "find_node": self.find_node_received,  
                "get_peers": self.get_peers_received,  
                "announce_peer": self.announce_peer_received,  
            }  
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  
            self.socket.bind(("0.0.0.0", self.port))  
        def response_received(self, msg, address):  
            self.find_node_handler(msg)  
        def query_received(self, msg, address):  
            try:  
                self.actions[msg["q"]](msg, address)  
            except KeyError:  
                pass  
        def send_krpc(self, msg, address):  
            try:  
                self.socket.sendto(bencode(msg), address)  
            except:  
                pass  
    class Client(KRPC):  
        def __init__(self, table):  
            self.table = table  
            timer(KRPC_TIMEOUT, self.timeout)  
            timer(REBORN_TIME, self.reborn)  
            KRPC.__init__(self)  
        def find_node(self, address, nid=None):  
            nid = self.get_neighbor(nid) if nid else self.table.nid  
            tid = entropy(TID_LENGTH)  
            msg = {  
                "t": tid,  
                "y": "q",  
                "q": "find_node",  
                "a": {"id": nid, "target": random_id()}  
            }  
            self.send_krpc(msg, address)  
        def find_node_handler(self, msg):  
            try:  
                nodes = decode_nodes(msg["r"]["nodes"])  
                for node in nodes:  
                    (nid, ip, port) = node  
                    if len(nid) != 20: continue  
                    if nid == self.table.nid: continue  
                    self.find_node( (ip, port), nid )  
            except KeyError:  
                pass  
        def joinDHT(self):  
            for address in BOOTSTRAP_NODES:   
                self.find_node(address)  
        def timeout(self):  
            if len( self.table.buckets ) < 2:  
                self.joinDHT()  
            timer(KRPC_TIMEOUT, self.timeout)  
        def reborn(self):  
            self.table.nid = random_id()  
            self.table.buckets = [ KBucket(0, 2**160) ]  
            timer(REBORN_TIME, self.reborn)  
        def start(self):  
            self.joinDHT()  
            while True:  
                try:  
                    (data, address) = self.socket.recvfrom(65536)  
                    msg = bdecode(data)  
                    self.types[msg["y"]](msg, address)  
                except Exception:  
                    pass  
        def get_neighbor(self, target):  
            return target[:10]+random_id()[10:]  
    class Server(Client):  
        def __init__(self, master, table, port):  
            self.table = table  
            self.master = master  
            self.port = port  
            Client.__init__(self, table)  
        def ping_received(self, msg, address):  
            try:  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(nid)}  
                }  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def find_node_received(self, msg, address):  
            try:  
                target = msg["a"]["target"]  
                neighbors = self.table.get_neighbors(target)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(target),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def get_peers_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                neighbors = self.table.get_neighbors(infohash)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(infohash),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def announce_peer_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                nid = msg["a"]["id"]  
                msg = {   
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(infohash)}  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
    class KTable(object):  
        def __init__(self, nid):  
            self.nid = nid  
            self.buckets = [ KBucket(0, 2**160) ]  
        def append(self, node):  
            index = self.bucket_index(node.nid)  
            try:  
                bucket = self.buckets[index]  
                bucket.append(node)  
            except IndexError:  
                return  
            except BucketFull:  
                if not bucket.in_range(self.nid): return  
                self.split_bucket(index)  
                self.append(node)  
        def get_neighbors(self, target):  
            nodes = []  
            if len(self.buckets) == 0: return nodes  
            if len(target) != 20 : return nodes  
            index = self.bucket_index(target)  
            try:  
                nodes = self.buckets[index].nodes  
                min = index - 1  
                max = index + 1  
                while len(nodes) < K and ((min >= 0) or (max < len(self.buckets))):  
                    if min >= 0:  
                        nodes.extend(self.buckets[min].nodes)  
                    if max < len(self.buckets):  
                        nodes.extend(self.buckets[max].nodes)  
                    min -= 1  
                    max += 1  
                num = intify(target)  
                nodes.sort(lambda a, b, num=num: cmp(num^intify(a.nid), num^intify(b.nid)))  
                return nodes[:K]  
            except IndexError:  
                return nodes  
        def bucket_index(self, target):  
            return bisect_left(self.buckets, intify(target))  
        def split_bucket(self, index):  
            old = self.buckets[index]  
            point = old.max - (old.max - old.min)/2  
            new = KBucket(point, old.max)  
            old.max = point  
            self.buckets.insert(index + 1, new)  
            for node in old.nodes[:]:  
                if new.in_range(node.nid):  
                    new.append(node)  
                    old.remove(node)  
        def __iter__(self):  
            for bucket in self.buckets:  
                yield bucket  
    class KBucket(object):  
        __slots__ = ("min", "max", "nodes")  
        def __init__(self, min, max):  
            self.min = min  
            self.max = max  
            self.nodes = []  
        def append(self, node):  
            if node in self:  
                self.remove(node)  
                self.nodes.append(node)  
            else:  
                if len(self) < K:  
                    self.nodes.append(node)  
                else:  
                    raise BucketFull  
        def remove(self, node):  
            self.nodes.remove(node)  
        def in_range(self, target):  
            return self.min <= intify(target) < self.max  
        def __len__(self):  
            return len(self.nodes)  
        def __contains__(self, node):  
            return node in self.nodes  
        def __iter__(self):  
            for node in self.nodes:  
                yield node  
        def __lt__(self, target):  
            return self.max <= target  
    class KNode(object):  
        __slots__ = ("nid", "ip", "port")  
        def __init__(self, nid, ip, port):  
            self.nid = nid  
            self.ip = ip  
            self.port = port  
        def __eq__(self, other):  
            return self.nid == other.nid  
    #using example  
    class Master(object):  
        def __init__(self, f):  
            self.f = f  
            try:  
                self.conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
                self.cur=self.conn.cursor()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        def log(self, infohash):  
            try:  
                sql = "insert into bt_main_new(hash,name,length,date) values(%s,%s,%s,%s)"  
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  
                re=self.cur.execute(sql,(infohash,'','',date))  
                self.conn.commit()  
                self.cur.close()  
                self.conn.close()  
                #print re  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
            self.f.write(infohash.encode("hex")+"\n")  
            self.f.flush()  
    try:  
        d = date.today()  
        f = open("%s.log" % d, "a")  
        m = Master(f)  
        s = Server(Master(f), KTable(random_id()), 8006)  
        s.start()       
    except KeyboardInterrupt:  
        s.socket.close()  
        f.close() 

本爬虫程序,会自动爬取得网络上分享的bt种子,写入文件盒数据库,爬取的只是个种子的hash码,还需要到网络上下载种子进行分析

下载种子,相信大家都知道国外有几个免费分享种子的网站,大家可以根据hash码去下载,分析,下面呈上我写的一个分析种子的程序:

#! /usr/bin/python  
# -*- coding: utf-8 -*-  
import MySQLdb  
from datetime import *  
import time  
import re  
from time import sleep  
import bencode  
import urllib2  
import base64  
try:  
    conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
    cur=conn.cursor()  
    sql = "select * from bt_main where name = '' order by id desc"  
    count = cur.execute(sql)  
    rows = cur.fetchall()  
    for row in rows:  
        if row[2].strip() != '':  
            continue  
        id = row[0]  
        hash = row[1]  
        url = "http://haofuli.duapp.com/go/info.php?hash=%s" % hash  
        file = urllib2.urlopen(url).read()  
        if "error!" == file:  
            try:  
                sql = "update bt_main set isTrue = 0 where id = %s "  
                re = cur.execute(sql,(id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        else:  
            #decode  
            try:  
                fileEncode = bencode.bdecode(file)  
            except Exception,e:pass  
            if 'name.utf-8' in fileEncode['info']:  
                filename=fileEncode['info']['name.utf-8']  
            else:  
                filename = fileEncode['info']['name']  
            ##length  
            if "length" in fileEncode['info']:  
                length = fileEncode['info']['length']  
            else:  
                length = 0  
            try:  
                sql = "update bt_main set name = %s , length = %s , isTrue = 1 where id = %s"  
                re = cur.execute(sql,(base64.b64encode(filename),length,id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
except MySQLdb.Error,e:  
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])

上面的只是简单的分析,对于多文件的,还没有处理。我最近在解析种子的时候,总是出现莫名的填充文件的问题,可能是版本过低吧,最近仍旧在解决。

Python 相关文章推荐
Python中__call__用法实例
Aug 29 Python
Python 序列的方法总结
Oct 18 Python
利用Python破解斗地主残局详解
Jun 30 Python
Python使用requests及BeautifulSoup构建爬虫实例代码
Jan 24 Python
Django实现web端tailf日志文件功能及实例详解
Jul 28 Python
Pytorch 数据加载与数据预处理方式
Dec 31 Python
Python爬虫实现模拟点击动态页面
Mar 05 Python
Django model.py表单设置默认值允许为空的操作
May 19 Python
python进行二次方程式计算的实例讲解
Dec 06 Python
从Pytorch模型pth文件中读取参数成numpy矩阵的操作
Mar 04 Python
golang特有程序结构入门教程
Jun 02 Python
Python的这些库,你知道多少?
Jun 09 Python
从零学Python之引用和类属性的初步理解
May 15 #Python
python中xrange和range的区别
May 13 #Python
Python中os和shutil模块实用方法集锦
May 13 #Python
Python中的jquery PyQuery库使用小结
May 13 #Python
Python getopt模块处理命令行选项实例
May 13 #Python
Python random模块(获取随机数)常用方法和使用例子
May 13 #Python
Python自动化测试工具Splinter简介和使用实例
May 13 #Python
You might like
php中CI操作多个数据库的代码
2012/07/05 PHP
基于php socket(fsockopen)的应用实例分析
2013/06/02 PHP
PHP数字字符串左侧补0、字符串填充和自动补齐的几种方法
2014/05/10 PHP
PHP递归遍历多维数组实现无限分类的方法
2016/05/06 PHP
Laravel Eloquent分表方法并使用模型关联的实现
2019/11/25 PHP
php设计模式之原型模式分析【星际争霸游戏案例】
2020/03/23 PHP
JavaScript 拾漏补遗
2009/12/27 Javascript
javascript数字数组去重复项的实现代码
2010/12/30 Javascript
服务器端的JavaScript脚本 Node.js 使用入门
2012/03/07 Javascript
jquery实现点击文字可编辑并修改保存至数据库
2014/04/15 Javascript
javascript引用赋值(地址传值)用法实例
2015/01/13 Javascript
整理Javascript基础语法学习笔记
2015/11/29 Javascript
第九篇Bootstrap导航菜单创建步骤详解
2016/06/21 Javascript
使用plupload自定义参数实现多文件上传
2016/07/19 Javascript
jQuery过滤选择器经典应用
2016/08/18 Javascript
Bootstrap源码解读网格系统(3)
2016/12/22 Javascript
vue element upload实现图片本地预览
2019/08/20 Javascript
在Node.js中将SVG图像转换为PNG,JPEG,TIFF,WEBP和HEIF格式的方法
2019/08/22 Javascript
es6 super关键字的理解与应用实例分析
2020/02/15 Javascript
JavaScript TAB栏切换效果的示例
2020/11/05 Javascript
[46:28]EG vs Liquid 2019国际邀请赛淘汰赛 败者组 BO3 第二场 8.23
2019/09/05 DOTA
[58:59]完美世界DOTA2联赛PWL S3 access vs CPG 第一场 12.13
2020/12/16 DOTA
python 生成器协程运算实例
2017/09/04 Python
Python登录并获取CSDN博客所有文章列表代码实例
2017/12/28 Python
python实现爬取图书封面
2018/07/05 Python
python如何求解两数的最大公约数
2018/09/27 Python
详解html5 canvas 微信海报分享(个人爬坑)
2018/01/12 HTML / CSS
英国探险旅游专家:Explore
2018/12/20 全球购物
品恩科技软件测试面试题
2014/10/26 面试题
竞选班干部的演讲稿
2014/04/24 职场文书
纪念九一八事变演讲稿:忘记意味着背叛
2014/09/14 职场文书
2014年保育员个人工作总结
2014/12/02 职场文书
运动会表扬稿
2015/01/16 职场文书
构建和谐校园倡议书
2015/01/19 职场文书
element tree树形组件回显数据问题解决
2022/08/14 Javascript
keepalived + nginx 实现高可用方案
2022/12/24 Servers