Posted in Python on July 15, 2009
代码如下:
import sys
import urllib
import datetime
import time

# urlopen lives in different modules on Python 2 and Python 3; resolve it once
# here so the rest of the script does not care which interpreter runs it.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# Daily lists are published as <BASE_URL><YYYY-MM-DD>.txt,
# e.g. http://www.kingnic.com/list/2009-06-16.txt
BASE_URL = "http://www.kingnic.com/list/"


def getDate():
    """Return today's local date as a ``YYYY-MM-DD`` string."""
    return datetime.date.today().isoformat()


def getUrl(dateStr=None):
    """Build the URL of the domain list for a given day.

    dateStr -- date in ``YYYY-MM-DD`` form; when omitted (or empty) today's
               date from getDate() is used.

    Returns the URL string, or None (after printing "Error Date!") if no
    date could be determined.
    """
    if dateStr:
        return BASE_URL + dateStr + ".txt"
    thisDate = getDate()
    if not thisDate:
        print("Error Date!")
        return None
    return BASE_URL + thisDate + ".txt"


def getSource(url):
    """Download ``url`` and return its body as text."""
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even if the read fails part-way.
        response.close()
    if not isinstance(data, str):
        # Python 3 returns bytes. NOTE(review): the list is assumed to be
        # ASCII/UTF-8 text; undecodable bytes are replaced rather than raising.
        data = data.decode("utf-8", "replace")
    return data


def save(source, filename="domains.txt"):
    """Write the text ``source`` to ``filename`` (overwriting); return True."""
    with open(filename, "w") as fp:
        fp.write(source)
    return True


def loadList(fileName="domains.txt"):
    """Return the lines of ``fileName`` as a list, line endings included."""
    # The original always opened "domains.txt" and ignored this argument.
    with open(fileName, "r") as fp:
        return fp.readlines()


def getPrefix(domain):
    """Return the part of ``domain`` before its first dot."""
    return domain.split('.')[0]


def getPostfix(domain):
    """Return the second dot-separated label of ``domain``.

    Returns an empty string when ``domain`` contains no dot, instead of
    raising IndexError.
    """
    labels = domain.split('.')
    if len(labels) > 1:
        return labels[1]
    return ""


def hasMidLine(domain):
    """Return True if ``domain`` contains a hyphen ('-')."""
    return '-' in domain


def parser(domains):
    """Filter a list of domain lines and print a short summary.

    A line is kept when the name before its first dot is between 0 and 4
    characters long and contains no hyphen. Surrounding whitespace (such as
    the trailing newline left by readlines()) is ignored for the checks, and
    blank lines are skipped since they are not domains.

    domains -- sequence of strings, one domain per item.

    Returns the kept items, unmodified, in their original order.
    """
    max_len = 4
    min_len = 0
    result = []
    len_num = 0        # rejected: name length outside [min_len, max_len]
    mid_line_num = 0   # rejected: name contains '-'
    for domain in domains:
        name = domain.strip()
        if not name:
            continue
        prefix = getPrefix(name)
        if not (min_len <= len(prefix) <= max_len):
            len_num += 1
            continue
        if hasMidLine(prefix):
            mid_line_num += 1
            continue
        result.append(domain)
    print(" log : \n")
    print("all: \t%s" % len(domains))
    # NOTE: the bounds are printed as (max, min), giving "[4,0]", to keep the
    # log format identical to the output this script has always produced.
    print("len not in [%s,%s] \t: %s" % (max_len, min_len, len_num))
    print("contain '-' :\t%s" % mid_line_num)
    print("remain:\t%s" % len(result))
    return result


if __name__ == "__main__":
    # Download today's list, keep a local copy, filter it, save the survivors.
    url = getUrl()
    source = getSource(url)
    save(source)
    domains = loadList()
    result = parser(domains)
    save("".join(result), "result.txt")
    print("\n\n\nfinished!!")
输出文件:
domains.txt : kingnic.com 据当天释放的 域名;
result.txt : 符合过滤条件的域名;
log输出:
all: 55500 len not in [4,0] : 55019 contain '-' : 32 remain: 449 finished!!
对后缀、长度和有无“-”进行过滤;过滤条件还比较少,其它条件以后如有需要再加。
python 域名分析工具实现代码
声明:登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。
Reply on: @reply_date@
@reply_contents@