Python开发实例分享bt种子爬虫程序和种子解析


Posted in Python onMay 21, 2014

看到网上也有开源的代码,这不,我拿来进行了二次重写,呵呵,上代码:

    #encoding: utf-8  
    import socket  
    from hashlib import sha1  
    from random import randint  
    from struct import unpack, pack  
    from socket import inet_aton, inet_ntoa  
    from bisect import bisect_left  
    from threading import Timer  
    from time import sleep  
    import MySQLdb  
    from datetime import *  
    import time  
    from bencode import bencode, bdecode  
    BOOTSTRAP_NODES = [  
        ("router.bittorrent.com", 6881),  
        ("dht.transmissionbt.com", 6881),  
        ("router.utorrent.com", 6881)  
    ]   
    TID_LENGTH = 4  
    KRPC_TIMEOUT = 10  
    REBORN_TIME = 5 * 60  
    K = 8  
    def entropy(bytes):  
        s = ""  
        for i in range(bytes):  
            s += chr(randint(0, 255))  
        return s  
    def random_id():  
        hash = sha1()  
        hash.update( entropy(20) )  
        return hash.digest()  
    def decode_nodes(nodes):  
        n = []  
        length = len(nodes)  
        if (length % 26) != 0:   
            return n  
        for i in range(0, length, 26):  
            nid = nodes[i:i+20]  
            ip = inet_ntoa(nodes[i+20:i+24])  
            port = unpack("!H", nodes[i+24:i+26])[0]  
            n.append( (nid, ip, port) )  
        return n  
    def encode_nodes(nodes):  
        strings = []  
        for node in nodes:  
            s = "%s%s%s" % (node.nid, inet_aton(node.ip), pack("!H", node.port))  
            strings.append(s)  
        return "".join(strings)  
    def intify(hstr):  
        return long(hstr.encode('hex'), 16)      
    def timer(t, f):  
        Timer(t, f).start()  
    class BucketFull(Exception):  
        pass  
    class KRPC(object):  
        def __init__(self):  
            self.types = {  
                "r": self.response_received,  
                "q": self.query_received  
            }  
            self.actions = {  
                "ping": self.ping_received,  
                "find_node": self.find_node_received,  
                "get_peers": self.get_peers_received,  
                "announce_peer": self.announce_peer_received,  
            }  
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  
            self.socket.bind(("0.0.0.0", self.port))  
        def response_received(self, msg, address):  
            self.find_node_handler(msg)  
        def query_received(self, msg, address):  
            try:  
                self.actions[msg["q"]](msg, address)  
            except KeyError:  
                pass  
        def send_krpc(self, msg, address):  
            try:  
                self.socket.sendto(bencode(msg), address)  
            except:  
                pass  
    class Client(KRPC):  
        def __init__(self, table):  
            self.table = table  
            timer(KRPC_TIMEOUT, self.timeout)  
            timer(REBORN_TIME, self.reborn)  
            KRPC.__init__(self)  
        def find_node(self, address, nid=None):  
            nid = self.get_neighbor(nid) if nid else self.table.nid  
            tid = entropy(TID_LENGTH)  
            msg = {  
                "t": tid,  
                "y": "q",  
                "q": "find_node",  
                "a": {"id": nid, "target": random_id()}  
            }  
            self.send_krpc(msg, address)  
        def find_node_handler(self, msg):  
            try:  
                nodes = decode_nodes(msg["r"]["nodes"])  
                for node in nodes:  
                    (nid, ip, port) = node  
                    if len(nid) != 20: continue  
                    if nid == self.table.nid: continue  
                    self.find_node( (ip, port), nid )  
            except KeyError:  
                pass  
        def joinDHT(self):  
            for address in BOOTSTRAP_NODES:   
                self.find_node(address)  
        def timeout(self):  
            if len( self.table.buckets ) < 2:  
                self.joinDHT()  
            timer(KRPC_TIMEOUT, self.timeout)  
        def reborn(self):  
            self.table.nid = random_id()  
            self.table.buckets = [ KBucket(0, 2**160) ]  
            timer(REBORN_TIME, self.reborn)  
        def start(self):  
            self.joinDHT()  
            while True:  
                try:  
                    (data, address) = self.socket.recvfrom(65536)  
                    msg = bdecode(data)  
                    self.types[msg["y"]](msg, address)  
                except Exception:  
                    pass  
        def get_neighbor(self, target):  
            return target[:10]+random_id()[10:]  
    class Server(Client):  
        def __init__(self, master, table, port):  
            self.table = table  
            self.master = master  
            self.port = port  
            Client.__init__(self, table)  
        def ping_received(self, msg, address):  
            try:  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(nid)}  
                }  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def find_node_received(self, msg, address):  
            try:  
                target = msg["a"]["target"]  
                neighbors = self.table.get_neighbors(target)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(target),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def get_peers_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                neighbors = self.table.get_neighbors(infohash)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(infohash),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def announce_peer_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                nid = msg["a"]["id"]  
                msg = {   
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(infohash)}  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
    class KTable(object):  
        def __init__(self, nid):  
            self.nid = nid  
            self.buckets = [ KBucket(0, 2**160) ]  
        def append(self, node):  
            index = self.bucket_index(node.nid)  
            try:  
                bucket = self.buckets[index]  
                bucket.append(node)  
            except IndexError:  
                return  
            except BucketFull:  
                if not bucket.in_range(self.nid): return  
                self.split_bucket(index)  
                self.append(node)  
        def get_neighbors(self, target):  
            nodes = []  
            if len(self.buckets) == 0: return nodes  
            if len(target) != 20 : return nodes  
            index = self.bucket_index(target)  
            try:  
                nodes = self.buckets[index].nodes  
                min = index - 1  
                max = index + 1  
                while len(nodes) < K and ((min >= 0) or (max < len(self.buckets))):  
                    if min >= 0:  
                        nodes.extend(self.buckets[min].nodes)  
                    if max < len(self.buckets):  
                        nodes.extend(self.buckets[max].nodes)  
                    min -= 1  
                    max += 1  
                num = intify(target)  
                nodes.sort(lambda a, b, num=num: cmp(num^intify(a.nid), num^intify(b.nid)))  
                return nodes[:K]  
            except IndexError:  
                return nodes  
        def bucket_index(self, target):  
            return bisect_left(self.buckets, intify(target))  
        def split_bucket(self, index):  
            old = self.buckets[index]  
            point = old.max - (old.max - old.min)/2  
            new = KBucket(point, old.max)  
            old.max = point  
            self.buckets.insert(index + 1, new)  
            for node in old.nodes[:]:  
                if new.in_range(node.nid):  
                    new.append(node)  
                    old.remove(node)  
        def __iter__(self):  
            for bucket in self.buckets:  
                yield bucket  
    class KBucket(object):  
        __slots__ = ("min", "max", "nodes")  
        def __init__(self, min, max):  
            self.min = min  
            self.max = max  
            self.nodes = []  
        def append(self, node):  
            if node in self:  
                self.remove(node)  
                self.nodes.append(node)  
            else:  
                if len(self) < K:  
                    self.nodes.append(node)  
                else:  
                    raise BucketFull  
        def remove(self, node):  
            self.nodes.remove(node)  
        def in_range(self, target):  
            return self.min <= intify(target) < self.max  
        def __len__(self):  
            return len(self.nodes)  
        def __contains__(self, node):  
            return node in self.nodes  
        def __iter__(self):  
            for node in self.nodes:  
                yield node  
        def __lt__(self, target):  
            return self.max <= target  
    class KNode(object):  
        __slots__ = ("nid", "ip", "port")  
        def __init__(self, nid, ip, port):  
            self.nid = nid  
            self.ip = ip  
            self.port = port  
        def __eq__(self, other):  
            return self.nid == other.nid  
    #using example  
    class Master(object):  
        def __init__(self, f):  
            self.f = f  
            try:  
                self.conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
                self.cur=self.conn.cursor()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        def log(self, infohash):  
            try:  
                sql = "insert into bt_main_new(hash,name,length,date) values(%s,%s,%s,%s)"  
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  
                re=self.cur.execute(sql,(infohash,'','',date))  
                self.conn.commit()  
                self.cur.close()  
                self.conn.close()  
                #print re  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
            self.f.write(infohash.encode("hex")+"\n")  
            self.f.flush()  
    try:  
        d = date.today()  
        f = open("%s.log" % d, "a")  
        m = Master(f)  
        s = Server(Master(f), KTable(random_id()), 8006)  
        s.start()       
    except KeyboardInterrupt:  
        s.socket.close()  
        f.close() 

本爬虫程序,会自动爬取得网络上分享的bt种子,写入文件盒数据库,爬取的只是个种子的hash码,还需要到网络上下载种子进行分析

下载种子,相信大家都知道国外有几个免费分享种子的网站,大家可以根据hash码去下载,分析,下面呈上我写的一个分析种子的程序:

#! /usr/bin/python  
# -*- coding: utf-8 -*-  
import MySQLdb  
from datetime import *  
import time  
import re  
from time import sleep  
import bencode  
import urllib2  
import base64  
try:  
    conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
    cur=conn.cursor()  
    sql = "select * from bt_main where name = '' order by id desc"  
    count = cur.execute(sql)  
    rows = cur.fetchall()  
    for row in rows:  
        if row[2].strip() != '':  
            continue  
        id = row[0]  
        hash = row[1]  
        url = "http://haofuli.duapp.com/go/info.php?hash=%s" % hash  
        file = urllib2.urlopen(url).read()  
        if "error!" == file:  
            try:  
                sql = "update bt_main set isTrue = 0 where id = %s "  
                re = cur.execute(sql,(id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        else:  
            #decode  
            try:  
                fileEncode = bencode.bdecode(file)  
            except Exception,e:pass  
            if 'name.utf-8' in fileEncode['info']:  
                filename=fileEncode['info']['name.utf-8']  
            else:  
                filename = fileEncode['info']['name']  
            ##length  
            if "length" in fileEncode['info']:  
                length = fileEncode['info']['length']  
            else:  
                length = 0  
            try:  
                sql = "update bt_main set name = %s , length = %s , isTrue = 1 where id = %s"  
                re = cur.execute(sql,(base64.b64encode(filename),length,id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
except MySQLdb.Error,e:  
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])

上面的只是简单的分析,对于多文件的,还没有处理。我最近在解析种子的时候,总是出现莫名的填充文件的问题,可能是版本过低吧,最近仍旧在解决。

Python 相关文章推荐
Python设计模式编程中Adapter适配器模式的使用实例
Mar 02 Python
Python 爬虫模拟登陆知乎
Sep 23 Python
numpy使用技巧之数组过滤实例代码
Feb 03 Python
pandas series序列转化为星期几的实例
Apr 11 Python
用python给自己做一款小说阅读器过程详解
Jul 11 Python
pygame实现非图片按钮效果
Oct 29 Python
Python实现鼠标自动在屏幕上随机移动功能
Mar 14 Python
经验丰富程序员才知道的8种高级Python技巧
Jul 27 Python
python 逐步回归算法
Apr 06 Python
Python数据类型最全知识总结
May 31 Python
Qt自定义Plot实现曲线绘制的详细过程
Nov 02 Python
PyTorch device与cuda.device用法
Apr 03 Python
从零学Python之引用和类属性的初步理解
May 15 #Python
python中xrange和range的区别
May 13 #Python
Python中os和shutil模块实用方法集锦
May 13 #Python
Python中的jquery PyQuery库使用小结
May 13 #Python
Python getopt模块处理命令行选项实例
May 13 #Python
Python random模块(获取随机数)常用方法和使用例子
May 13 #Python
Python自动化测试工具Splinter简介和使用实例
May 13 #Python
You might like
php环境配置 php5 mysql5 apache2 phpmyadmin安装与配置
2006/11/17 PHP
防止本地用户用fsockopen DDOS攻击对策
2011/11/02 PHP
javascript字典探测用户名工具
2006/10/05 Javascript
北京奥运官方网站幻灯切换效果flash版打包下载
2008/01/30 Javascript
IE8 引入跨站数据获取功能说明
2008/07/22 Javascript
解析js中获得父窗口链接getParent方法以及各种打开窗口的方法
2013/06/19 Javascript
两种不同的方法实现js对checkbox进行全选和反选
2014/05/13 Javascript
jQuery 获取兄弟元素的几种不错方法
2014/05/23 Javascript
js中实现多态采用和继承类似的方法
2014/08/22 Javascript
JQuery中clone方法复制节点
2015/05/18 Javascript
BootStrap的table表头固定tbody滚动的实例代码
2016/08/24 Javascript
原生JS实现左右箭头选择日期实例代码
2017/03/14 Javascript
Javascript 之封装(Package)
2018/09/14 Javascript
通过图带你深入了解vue的响应式原理
2019/06/21 Javascript
在Python中使用元类的教程
2015/04/28 Python
Python爬虫设置代理IP的方法(爬虫技巧)
2018/03/04 Python
破解安装Pycharm的方法
2018/10/19 Python
Django REST framework 分页的实现代码
2019/06/19 Python
python中的subprocess.Popen()使用详解
2019/12/25 Python
Python实现链表反转的方法分析【迭代法与递归法】
2020/02/22 Python
Python如何使用bokeh包和geojson数据绘制地图
2020/03/21 Python
使用darknet框架的imagenet数据分类预训练操作
2020/07/07 Python
matplotlib subplot绘制多个子图的方法示例
2020/07/28 Python
python+selenium自动化实战携带cookies模拟登陆微博
2021/01/19 Python
智能室内花园:Click & Grow
2021/01/29 全球购物
刊首寄语大全
2014/04/11 职场文书
《真想变成大大的荷叶》教学反思
2014/04/14 职场文书
2014年财务工作自我评价
2014/09/23 职场文书
2014年内部审计工作总结
2014/12/09 职场文书
职位证明模板
2015/06/23 职场文书
JavaScript嵌入百度地图API的最详细方法
2021/04/16 Javascript
新手必备之MySQL msi版本下载安装图文详细教程
2021/05/21 MySQL
带你学习MySQL执行计划
2021/05/31 MySQL
python通过函数名调用函数的几种方法总结
2021/06/07 Python
python迷宫问题深度优先遍历实例
2021/06/20 Python
Python pandas读取CSV文件的注意事项(适合新手)
2021/06/20 Python