编程 PHP

基于Snoopy的PHP近似完美获取网站编码的代码

Posted in PHP on October 23, 2011

先要到网上下载Snoopy.class.php
调用方法：

<?php 
// Load the Snoopy HTTP client first, then the crawler class that is built on it.
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php'; // this file holds the WebCrawl class listed below

// Print the charset detected for the target page.
$crawler = new WebCrawl('http://www.baidu.com');
echo $crawler->getCharset();
?>

<?php 
/**
 * Fetches a page through Snoopy and reports its charset, title,
 * meta keywords/description and the IPv4 address of its host.
 *
 * The page is fetched lazily on the first call to getCharset() or
 * getWebinfo() and the response is reused afterwards. After a successful
 * fetch the stored page content is converted to UTF-8 (when the source
 * charset is known to mbstring) and lowercased, so every value returned by
 * getWebinfo() is lowercase.
 */
class WebCrawl
{
    /** @var string URL given to the constructor. */
    private $url;

    /** @var Snoopy|null HTTP client holding the response; null until the first fetch. */
    private $request;

    /** @var string|false|null Charset detected on the raw response; null until a successful fetch. */
    private $charset;

    /**
     * Charset names accepted from a <meta ... charset=...> declaration.
     *
     * NOTE(review): the list repeats 'ibm01047' and 'iso-2022-jp', and the
     * 'ibm01040'..'ibm01049' run may have been meant as 'ibm01140'..'ibm01149'
     * - confirm before editing. Values are kept unchanged because the
     * property is public.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720', 'ibm737',
        'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865',
        'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis',
        'ks_c_5601-1987', 'ibm1026', 'ibm01047', 'ibm01047',
        'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044',
        'ibm01045', 'ibm01046', 'ibm01047', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285',
        'ibm290', 'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15', 'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-jp', 'iso-2022-kr', 'x-cp50227',
        'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url Address of the page to inspect; nothing is fetched yet.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetches the page once and prepares its content for parsing.
     *
     * On the first call the page is requested through Snoopy. For a 200
     * response the charset is detected on the raw bytes and remembered, the
     * content is converted to UTF-8 and only then lowercased. Lowercasing
     * before the conversion would alter trail bytes of multi-byte encodings
     * such as GBK or Big5 that fall in the ASCII letter range.
     *
     * @param string $url Address to fetch.
     * @return bool True when the (possibly cached) response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        $raw = (string) $this->request->results;
        $headers = empty($this->request->headers) ? array() : (array) $this->request->headers;

        $this->charset = $this->detectCharset($raw, $headers);
        $this->request->results = strtolower($this->toUtf8($raw, $this->charset));

        return true;
    }

    /**
     * Returns the page title, meta keywords, meta description and host IP.
     *
     * @return array Keys 'title', 'keywords', 'desc' and 'ip'; each value is
     *               an empty string when it could not be determined or the
     *               page could not be fetched.
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => ''
        );
        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        // Allow attributes on <title> and '>' inside the text; strip_tags() cleans nested markup.
        if (preg_match('/<title[^>]*>(.*?)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        $metaTags = $this->extractMetaTags($html);
        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        $info['ip'] = $this->resolveIp($this->url);

        return $info;
    }

    /**
     * Collects <meta> name/content pairs written in either attribute order.
     *
     * Both orders are always parsed so that a page mixing
     * name="..." content="..." with content="..." name="..." keeps all of its
     * tags; when the same name appears in both orders the name-first form wins.
     *
     * @param string $html Page markup.
     * @return array Map of meta name => content value.
     */
    private function extractMetaTags($html)
    {
        $tags = array();

        $contentFirst = '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?' . '([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        if (preg_match_all($contentFirst, $html, $found)) {
            foreach ($found[2] as $index => $name) {
                $tags[$name] = $found[1][$index];
            }
        }

        $nameFirst = '/<[\s]*meta[\s]*name="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        if (preg_match_all($nameFirst, $html, $found)) {
            foreach ($found[1] as $index => $name) {
                $tags[$name] = $found[2][$index];
            }
        }

        return $tags;
    }

    /**
     * Resolves the host part of a URL to an IPv4 address.
     *
     * The host is taken with parse_url() so that https URLs, ports and paths
     * do not end up in the lookup. gethostbyname() returns its argument
     * unchanged on failure, so the result is validated as an IPv4 address.
     *
     * @param string $url URL with or without a scheme.
     * @return string Dotted IPv4 address, or '' when it cannot be resolved.
     */
    private function resolveIp($url)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // Scheme-less input such as "www.example.com/path".
            $host = parse_url('http://' . $url, PHP_URL_HOST);
        }
        if (!is_string($host) || $host === '') {
            return '';
        }

        $ip = gethostbyname($host);

        return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ? $ip : '';
    }

    /**
     * Guesses from byte patterns whether text is UTF-8 or GB2312.
     *
     * Scans for the first non-ASCII byte: a lead byte with the top three bits
     * set followed by two high-bit bytes counts as UTF-8; a lead byte with the
     * top two bits set followed by one high-bit byte counts as GB2312.
     * NOTE(review): this is a heuristic on the first multi-byte sequence only,
     * not a validation of the whole string.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept for backward compatibility.
     * @return string 'utf-8', 'gb2312', or '' when no pattern was recognised.
     */
    public function t($string, $o = null)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if (ord($string[$i]) < 128) {
                continue;
            }
            if ((ord($string[$i]) & 224) == 224) {
                // Lead byte looks like a 3-byte UTF-8 sequence; check both continuation bytes.
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    if (($this->byteAt($string, ++$i) & 128) == 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }
            // $i may have advanced above; the byte at the current position is re-tested.
            if (($this->byteAt($string, $i) & 192) == 192) {
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Returns the byte value at an offset, or 0 when the offset is past the end.
     *
     * @param string $string Source bytes.
     * @param int    $index  Zero-based offset.
     * @return int Byte value 0-255.
     */
    private function byteAt($string, $index)
    {
        return isset($string[$index]) ? ord($string[$index]) : 0;
    }

    /**
     * Replaces five-digit decimal character references (&#ddddd;) with the
     * characters they denote.
     *
     * The references are decoded directly instead of routing the whole text
     * through json_decode(), which fails on any text containing quotes,
     * backslashes or line breaks.
     *
     * @param string $str  Text containing numeric character references.
     * @param string $code Target encoding; anything other than 'utf-8' is
     *                     produced with iconv().
     * @return string|false Decoded text; false when iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $decoded = preg_replace_callback(
            '/&#(\d{5});/',
            function ($match) {
                // References that are not valid code points are left untouched.
                return html_entity_decode($match[0], ENT_QUOTES, 'UTF-8');
            },
            $str
        );
        if ($decoded === null) {
            // PCRE failure: fall back to the unmodified input.
            $decoded = $str;
        }

        if ($code != 'utf-8') {
            $decoded = iconv('utf-8', $code, $decoded);
        }

        return $decoded;
    }

    /**
     * Returns the charset detected for the page.
     *
     * The value is determined once, on the raw response, when the page is
     * fetched; later calls return the same value even though the stored
     * content has been converted to UTF-8 in the meantime.
     *
     * @return string|false Lowercase charset name, or false when the page
     *                      could not be fetched or no charset was detected.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset === null ? false : $this->charset;
    }

    /**
     * Detects the charset of a raw response.
     *
     * Order of precedence: a <meta ... charset=...> declaration listed in
     * $charset_arr (a declared gb2312 is only trusted when t() agrees), then
     * a charset= parameter in the response headers, then mb_detect_encoding().
     *
     * @param string   $html    Raw page content.
     * @param string[] $headers Response header lines.
     * @return string|false Lowercase charset name, or false when undetected.
     */
    private function detectCharset($html, array $headers)
    {
        // 1. Declaration inside the markup.
        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $html, $metaMatch)) {
            $declared = strtolower($metaMatch[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                if ($declared !== 'gb2312' || $this->t($html, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        // 2. Declaration in the response headers.
        if (!empty($headers)) {
            $headerText = strtolower(implode("|||", $headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $headerText, $headerMatch)) {
                return $headerMatch[1];
            }
        }

        // 3. Content sniffing.
        $encode_arr = array("UTF-8", "GB2312", "GBK", "BIG5", "ASCII", "EUC-JP", "Shift_JIS", "CP936", "ISO-8859-1", "JIS", "eucjp-win", "sjis-win");
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }

    /**
     * Converts page content to UTF-8 according to the detected charset.
     *
     * Content is returned unchanged when the charset is unknown, already
     * UTF-8, or not supported by mbstring. For 'windows-1252' only numeric
     * character references are decoded, via uni_decode().
     *
     * @param string       $content Raw page content.
     * @param string|false $charset Detected charset, lowercase, or false.
     * @return string Converted content.
     */
    private function toUtf8($content, $charset)
    {
        if (!is_string($charset) || $charset === '' || $charset === 'utf-8') {
            return $content;
        }
        if ($charset === 'windows-1252') {
            $decoded = $this->uni_decode($content);
            return is_string($decoded) ? $decoded : $content;
        }
        if (!$this->isSupportedEncoding($charset)) {
            return $content;
        }

        $converted = mb_convert_encoding($content, 'UTF-8', $charset);

        return is_string($converted) ? $converted : $content;
    }

    /**
     * Tells whether mbstring knows an encoding under the given name or alias.
     *
     * @param string $name Encoding name, any letter case.
     * @return bool
     */
    private function isSupportedEncoding($name)
    {
        foreach (mb_list_encodings() as $encoding) {
            if (strcasecmp($encoding, $name) === 0) {
                return true;
            }
            $aliases = mb_encoding_aliases($encoding);
            if (is_array($aliases)) {
                foreach ($aliases as $alias) {
                    if (strcasecmp($alias, $name) === 0) {
                        return true;
                    }
                }
            }
        }

        return false;
    }
}
?>

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

PHP 相关文章推荐

如何突破PHP程序员的技术瓶颈分析

Jul 17 PHP

简单的php写入数据库类代码分享

Jul 26 PHP

PHP判断图片格式的七种方法小结

Jun 03 PHP

php缓存技术详细总结

Aug 07 PHP

PHP 如何获取二维数组中某个key的集合

Jun 03 PHP

PHP消息队列用法实例分析

Feb 12 PHP

php微信开发之批量生成带参数的二维码

Jun 26 PHP

PHP在同一域名下两个不同的项目做独立登录机制详解

Sep 22 PHP

PHP的PDO预处理语句与存储过程

Jan 27 PHP

关于laravel-admin ueditor 集成并解决刷新的问题

Oct 21 PHP

Laravel框架Blade模板简介及模板继承用法分析

Dec 03 PHP

利用ajax+php实现商品价格计算

Mar 31 PHP

php中经典方法实现判断多维数组是否为空

Oct 23 #PHP

PHP禁止页面缓存的代码

Oct 23 #PHP

Pain 全世界最小最简单的PHP模板引擎 (普通版)

Oct 23 #PHP

供参考的 php 学习提高路线分享

Oct 23 #PHP

PHP中的strtr函数使用介绍(str_replace)

Oct 20 #PHP

PHP中读写文件实现代码

Oct 20 #PHP

Array of country list in PHP with Zend Framework

Oct 17 #PHP

You might like

php常用的安全过滤函数集锦

2014/10/09 PHP

php获取json数据所有的节点路径

2015/05/17 PHP

PHP+AJAX实现投票功能的方法

2015/09/28 PHP

phpmailer绑定邮箱的实现方法

2016/12/01 PHP

详解EventDispatcher事件分发组件

2016/12/25 PHP

js使用eval解析json实例与注意事项分享

2014/01/18 Javascript

用jquery.sortElements实现table排序

2014/05/04 Javascript

node.js中的path.sep方法使用说明

2014/12/08 Javascript

浅谈JQuery+ajax+jsonp 跨域访问

2016/06/25 Javascript

js 判断附件后缀的简单实现方法

2016/10/11 Javascript

jQuery插件FusionCharts绘制的2D双柱状图效果示例【附demo源码】

2017/05/13 jQuery

vue2.0使用swiper组件实现轮播效果

2017/11/27 Javascript

JavaScript寄生组合式继承实例详解

2018/01/06 Javascript

vue实现全屏滚动效果（非fullpage.js）

2020/03/07 Javascript

jQuery使用ajax传递json对象到服务端及contentType的用法示例

2020/03/12 jQuery

[03:48]2014DOTA2 TI专访71DK夺冠不靠小组赛高排名

2014/07/11 DOTA

使用Python开发windows GUI程序入门实例

2014/10/23 Python

python爬取NUS-WIDE数据库图片

2016/10/05 Python

pytorch 在网络中添加可训练参数,修改预训练权重文件的方法

2019/08/17 Python

Python 基于FIR实现Hilbert滤波器求信号包络详解

2020/02/26 Python

python如何将两张图片生成为全景图片

2020/03/05 Python

CSS3模块的目前的状况分析

2010/02/24 HTML / CSS

iHerb香港：维生素、补充剂和天然保健品

2017/08/01 全球购物

美国受信赖的教育产品供应商：Nest Learning

2018/06/14 全球购物

阿迪达斯希腊官方网上商店：adidas希腊

2019/04/06 全球购物

马耳他航空公司官方网站：Air Malta

2019/05/15 全球购物

什么是典型的软件三层结构？软件设计为什么要分层？软件分层有什么好处？

2012/03/14 面试题

秋天的图画教学反思

2014/05/01 职场文书

建筑专业毕业生自荐信

2014/05/25 职场文书

聘用意向书

2014/07/29 职场文书

2014年大学生社会实践自我鉴定

2014/09/26 职场文书

破坏寝室公物检讨书

2014/11/17 职场文书

2015年统计员个人工作总结

2015/07/23 职场文书

《活见鬼》教学反思

2016/02/24 职场文书

解决Python中的modf()函数取小数部分不准确问题

2021/05/28 Python

mysql拆分字符串作为查询条件的示例代码

2022/07/07 MySQL