Python开发实例分享bt种子爬虫程序和种子解析


Posted in Python onMay 21, 2014

看到网上也有开源的代码,这不,我拿来进行了二次重写,呵呵,上代码:

    #encoding: utf-8  
    import socket  
    from hashlib import sha1  
    from random import randint  
    from struct import unpack, pack  
    from socket import inet_aton, inet_ntoa  
    from bisect import bisect_left  
    from threading import Timer  
    from time import sleep  
    import MySQLdb  
    from datetime import *  
    import time  
    from bencode import bencode, bdecode  
    BOOTSTRAP_NODES = [  
        ("router.bittorrent.com", 6881),  
        ("dht.transmissionbt.com", 6881),  
        ("router.utorrent.com", 6881)  
    ]   
    TID_LENGTH = 4  
    KRPC_TIMEOUT = 10  
    REBORN_TIME = 5 * 60  
    K = 8  
    def entropy(bytes):  
        s = ""  
        for i in range(bytes):  
            s += chr(randint(0, 255))  
        return s  
    def random_id():  
        hash = sha1()  
        hash.update( entropy(20) )  
        return hash.digest()  
    def decode_nodes(nodes):  
        n = []  
        length = len(nodes)  
        if (length % 26) != 0:   
            return n  
        for i in range(0, length, 26):  
            nid = nodes[i:i+20]  
            ip = inet_ntoa(nodes[i+20:i+24])  
            port = unpack("!H", nodes[i+24:i+26])[0]  
            n.append( (nid, ip, port) )  
        return n  
    def encode_nodes(nodes):  
        strings = []  
        for node in nodes:  
            s = "%s%s%s" % (node.nid, inet_aton(node.ip), pack("!H", node.port))  
            strings.append(s)  
        return "".join(strings)  
    def intify(hstr):  
        return long(hstr.encode('hex'), 16)      
    def timer(t, f):  
        Timer(t, f).start()  
    class BucketFull(Exception):  
        pass  
    class KRPC(object):  
        def __init__(self):  
            self.types = {  
                "r": self.response_received,  
                "q": self.query_received  
            }  
            self.actions = {  
                "ping": self.ping_received,  
                "find_node": self.find_node_received,  
                "get_peers": self.get_peers_received,  
                "announce_peer": self.announce_peer_received,  
            }  
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  
            self.socket.bind(("0.0.0.0", self.port))  
        def response_received(self, msg, address):  
            self.find_node_handler(msg)  
        def query_received(self, msg, address):  
            try:  
                self.actions[msg["q"]](msg, address)  
            except KeyError:  
                pass  
        def send_krpc(self, msg, address):  
            try:  
                self.socket.sendto(bencode(msg), address)  
            except:  
                pass  
    class Client(KRPC):  
        def __init__(self, table):  
            self.table = table  
            timer(KRPC_TIMEOUT, self.timeout)  
            timer(REBORN_TIME, self.reborn)  
            KRPC.__init__(self)  
        def find_node(self, address, nid=None):  
            nid = self.get_neighbor(nid) if nid else self.table.nid  
            tid = entropy(TID_LENGTH)  
            msg = {  
                "t": tid,  
                "y": "q",  
                "q": "find_node",  
                "a": {"id": nid, "target": random_id()}  
            }  
            self.send_krpc(msg, address)  
        def find_node_handler(self, msg):  
            try:  
                nodes = decode_nodes(msg["r"]["nodes"])  
                for node in nodes:  
                    (nid, ip, port) = node  
                    if len(nid) != 20: continue  
                    if nid == self.table.nid: continue  
                    self.find_node( (ip, port), nid )  
            except KeyError:  
                pass  
        def joinDHT(self):  
            for address in BOOTSTRAP_NODES:   
                self.find_node(address)  
        def timeout(self):  
            if len( self.table.buckets ) < 2:  
                self.joinDHT()  
            timer(KRPC_TIMEOUT, self.timeout)  
        def reborn(self):  
            self.table.nid = random_id()  
            self.table.buckets = [ KBucket(0, 2**160) ]  
            timer(REBORN_TIME, self.reborn)  
        def start(self):  
            self.joinDHT()  
            while True:  
                try:  
                    (data, address) = self.socket.recvfrom(65536)  
                    msg = bdecode(data)  
                    self.types[msg["y"]](msg, address)  
                except Exception:  
                    pass  
        def get_neighbor(self, target):  
            return target[:10]+random_id()[10:]  
    class Server(Client):  
        def __init__(self, master, table, port):  
            self.table = table  
            self.master = master  
            self.port = port  
            Client.__init__(self, table)  
        def ping_received(self, msg, address):  
            try:  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(nid)}  
                }  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def find_node_received(self, msg, address):  
            try:  
                target = msg["a"]["target"]  
                neighbors = self.table.get_neighbors(target)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(target),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def get_peers_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                neighbors = self.table.get_neighbors(infohash)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(infohash),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def announce_peer_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                nid = msg["a"]["id"]  
                msg = {   
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(infohash)}  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
    class KTable(object):  
        def __init__(self, nid):  
            self.nid = nid  
            self.buckets = [ KBucket(0, 2**160) ]  
        def append(self, node):  
            index = self.bucket_index(node.nid)  
            try:  
                bucket = self.buckets[index]  
                bucket.append(node)  
            except IndexError:  
                return  
            except BucketFull:  
                if not bucket.in_range(self.nid): return  
                self.split_bucket(index)  
                self.append(node)  
        def get_neighbors(self, target):  
            nodes = []  
            if len(self.buckets) == 0: return nodes  
            if len(target) != 20 : return nodes  
            index = self.bucket_index(target)  
            try:  
                nodes = self.buckets[index].nodes  
                min = index - 1  
                max = index + 1  
                while len(nodes) < K and ((min >= 0) or (max < len(self.buckets))):  
                    if min >= 0:  
                        nodes.extend(self.buckets[min].nodes)  
                    if max < len(self.buckets):  
                        nodes.extend(self.buckets[max].nodes)  
                    min -= 1  
                    max += 1  
                num = intify(target)  
                nodes.sort(lambda a, b, num=num: cmp(num^intify(a.nid), num^intify(b.nid)))  
                return nodes[:K]  
            except IndexError:  
                return nodes  
        def bucket_index(self, target):  
            return bisect_left(self.buckets, intify(target))  
        def split_bucket(self, index):  
            old = self.buckets[index]  
            point = old.max - (old.max - old.min)/2  
            new = KBucket(point, old.max)  
            old.max = point  
            self.buckets.insert(index + 1, new)  
            for node in old.nodes[:]:  
                if new.in_range(node.nid):  
                    new.append(node)  
                    old.remove(node)  
        def __iter__(self):  
            for bucket in self.buckets:  
                yield bucket  
    class KBucket(object):  
        __slots__ = ("min", "max", "nodes")  
        def __init__(self, min, max):  
            self.min = min  
            self.max = max  
            self.nodes = []  
        def append(self, node):  
            if node in self:  
                self.remove(node)  
                self.nodes.append(node)  
            else:  
                if len(self) < K:  
                    self.nodes.append(node)  
                else:  
                    raise BucketFull  
        def remove(self, node):  
            self.nodes.remove(node)  
        def in_range(self, target):  
            return self.min <= intify(target) < self.max  
        def __len__(self):  
            return len(self.nodes)  
        def __contains__(self, node):  
            return node in self.nodes  
        def __iter__(self):  
            for node in self.nodes:  
                yield node  
        def __lt__(self, target):  
            return self.max <= target  
    class KNode(object):  
        __slots__ = ("nid", "ip", "port")  
        def __init__(self, nid, ip, port):  
            self.nid = nid  
            self.ip = ip  
            self.port = port  
        def __eq__(self, other):  
            return self.nid == other.nid  
    #using example  
    class Master(object):  
        def __init__(self, f):  
            self.f = f  
            try:  
                self.conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
                self.cur=self.conn.cursor()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        def log(self, infohash):  
            try:  
                sql = "insert into bt_main_new(hash,name,length,date) values(%s,%s,%s,%s)"  
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  
                re=self.cur.execute(sql,(infohash,'','',date))  
                self.conn.commit()  
                self.cur.close()  
                self.conn.close()  
                #print re  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
            self.f.write(infohash.encode("hex")+"\n")  
            self.f.flush()  
    try:  
        d = date.today()  
        f = open("%s.log" % d, "a")  
        m = Master(f)  
        s = Server(Master(f), KTable(random_id()), 8006)  
        s.start()       
    except KeyboardInterrupt:  
        s.socket.close()  
        f.close() 

本爬虫程序,会自动爬取得网络上分享的bt种子,写入文件盒数据库,爬取的只是个种子的hash码,还需要到网络上下载种子进行分析

下载种子,相信大家都知道国外有几个免费分享种子的网站,大家可以根据hash码去下载,分析,下面呈上我写的一个分析种子的程序:

#! /usr/bin/python  
# -*- coding: utf-8 -*-  
import MySQLdb  
from datetime import *  
import time  
import re  
from time import sleep  
import bencode  
import urllib2  
import base64  
try:  
    conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
    cur=conn.cursor()  
    sql = "select * from bt_main where name = '' order by id desc"  
    count = cur.execute(sql)  
    rows = cur.fetchall()  
    for row in rows:  
        if row[2].strip() != '':  
            continue  
        id = row[0]  
        hash = row[1]  
        url = "http://haofuli.duapp.com/go/info.php?hash=%s" % hash  
        file = urllib2.urlopen(url).read()  
        if "error!" == file:  
            try:  
                sql = "update bt_main set isTrue = 0 where id = %s "  
                re = cur.execute(sql,(id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        else:  
            #decode  
            try:  
                fileEncode = bencode.bdecode(file)  
            except Exception,e:pass  
            if 'name.utf-8' in fileEncode['info']:  
                filename=fileEncode['info']['name.utf-8']  
            else:  
                filename = fileEncode['info']['name']  
            ##length  
            if "length" in fileEncode['info']:  
                length = fileEncode['info']['length']  
            else:  
                length = 0  
            try:  
                sql = "update bt_main set name = %s , length = %s , isTrue = 1 where id = %s"  
                re = cur.execute(sql,(base64.b64encode(filename),length,id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
except MySQLdb.Error,e:  
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])

上面的只是简单的分析,对于多文件的,还没有处理。我最近在解析种子的时候,总是出现莫名的填充文件的问题,可能是版本过低吧,最近仍旧在解决。

Python 相关文章推荐
Python实现对比不同字体中的同一字符的显示效果
Apr 23 Python
Python实现豆瓣图片下载的方法
May 25 Python
python版本的仿windows计划任务工具
Apr 30 Python
Python使用OpenCV进行标定
May 08 Python
python3.5基于TCP实现文件传输
Mar 20 Python
linux环境中没有网络怎么下载python
Jul 07 Python
Python单元测试与测试用例简析
Nov 09 Python
python实现两个一维列表合并成一个二维列表
Dec 02 Python
浅谈在django中使用filter()(即对QuerySet操作)时踩的坑
Mar 31 Python
Python-openpyxl表格读取写入的案例详解
Nov 02 Python
python如何写个俄罗斯方块
Nov 06 Python
Python自动化工具之实现Excel转Markdown表格
Apr 08 Python
从零学Python之引用和类属性的初步理解
May 15 #Python
python中xrange和range的区别
May 13 #Python
Python中os和shutil模块实用方法集锦
May 13 #Python
Python中的jquery PyQuery库使用小结
May 13 #Python
Python getopt模块处理命令行选项实例
May 13 #Python
Python random模块(获取随机数)常用方法和使用例子
May 13 #Python
Python自动化测试工具Splinter简介和使用实例
May 13 #Python
You might like
收音机玩机评测 406 篇视频合集
2020/03/11 无线电
队列在编程中的实际应用(php)
2010/09/04 PHP
PHP+JS三级菜单联动菜单实现方法
2016/02/24 PHP
多个Laravel项目如何共用migrations详解
2018/09/25 PHP
Javascript 陷阱 window全局对象
2008/11/26 Javascript
Prototype ObjectRange对象学习
2009/07/19 Javascript
js 实现在离开页面时提醒未保存的信息(减少用户重复操作)
2013/01/16 Javascript
js onload处理html页面加载之后的事件
2013/10/30 Javascript
PHP开发者必须掌握的6个关键字
2014/04/14 Javascript
JS实现的自定义右键菜单实例二则
2015/09/01 Javascript
七个不允许错过的jQuery小技巧
2015/12/21 Javascript
javascript创建cookie、读取cookie
2016/03/31 Javascript
prototype.js常用函数详解
2016/06/18 Javascript
BootStrap按钮标签及基本样式
2016/11/23 Javascript
javascript 开发之网页兼容各种浏览器
2017/09/28 Javascript
JavaScript动态检测密码强度原理及实现方法详解
2019/06/11 Javascript
[01:51]2018年度CS GO最具人气外援-完美盛典
2018/12/16 DOTA
Python中的文件和目录操作实现代码
2011/03/13 Python
Python version 2.7 required, which was not found in the registry
2014/08/26 Python
Python编程实战之Oracle数据库操作示例
2017/06/21 Python
python实现泊松图像融合
2018/07/26 Python
pytorch训练imagenet分类的方法
2018/07/27 Python
selenium python 实现基本自动化测试的示例代码
2019/02/25 Python
python使用if语句实现一个猜拳游戏详解
2019/08/27 Python
jupyter notebook 使用过程中python莫名崩溃的原因及解决方式
2020/04/10 Python
Python API 操作Hadoop hdfs详解
2020/06/06 Python
如何用Python提取10000份log中的产品信息
2021/01/14 Python
基于Python的接口自动化unittest测试框架和ddt数据驱动详解
2021/01/27 Python
Python基于爬虫实现全网搜索并下载音乐
2021/02/14 Python
使用css3做0.5px的细线的示例代码
2018/01/18 HTML / CSS
客服专员岗位职责
2014/02/28 职场文书
信息与工商管理职业规划范文:为梦想而搏击
2014/09/11 职场文书
售后客服个人自我评价
2014/09/14 职场文书
2014年团支部年度工作总结
2014/12/24 职场文书
出纳试用期工作总结2015
2015/05/28 职场文书
一道JS算法面试题——冒泡、选择排序
2021/04/21 Javascript