从请求的页面提取关键词
它可以从一个给定的URL检索网页，并提取出一些关键词。例如，从代码珠玑的首页可以提取出类似下面图片中的关键词。
它可以从一个给定的URL检索网页提取出一些关键词
例如从代码珠玑的首页可以提取出类似下面图片中的关键词
例如从代码珠玑的首页可以提取出类似下面图片中的关键词
<?php
// Demo front end: reads a page URL from the request, runs the keyword
// extractor on it and prints the resulting keywords as a numbered list.

// Accept only a plain string; a request such as url[]=x would deliver an array.
$requestedUrl = (isset($_REQUEST['url']) && is_string($_REQUEST['url'])) ? trim($_REQUEST['url']) : '';

if ($requestedUrl !== '') {
    // The extractor fetches the URL with file_get_contents(), which also opens
    // local paths and other stream wrappers. Restrict the input to web URLs.
    // NOTE(review): this does not stop requests to internal hosts; add a host
    // allow-list if this script is ever exposed publicly.
    $scheme = strtolower((string) parse_url($requestedUrl, PHP_URL_SCHEME));

    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Only http:// and https:// URLs are supported.<br>';
    } else {
        include 'class.keywords.php';

        $keywords = new keywordsugest();
        $keywords->_lang = 'es';
        $keywords->_encoding = 'iso-8859-1';
        $keywords->_catego = 'telecom';
        $keywords->_keyCount = 100; // weight fed into the keep-limit calculation
        $keywords->file($requestedUrl);
        // Alternatives to readAll(): readMetaKeyWords() or readHtmlKeyWords() alone.
        $keywords->readAll();

        echo 'Keywords found :<br><br>';
        $i = 1;
        foreach ($keywords->get() as $word) {
            // Words originate from a remote page, so escape them before output.
            echo $i++ . '. ' . htmlspecialchars((string) $word, ENT_QUOTES, 'ISO-8859-1') . '<br>';
        }
    }
}

// Example URL: http://www.codepearl.com
echo "<form method='post'><input type='text' name='url'><input type='submit'></form>";
?>
3. [代码][PHP]代码 跳至 [2] [3] [全屏预览]
<?php
/**
 * Extracts candidate keywords from an HTML document.
 *
 * Keywords come from two places: the <meta name="keywords"> tag and the most
 * frequent words of the visible page text. Stop words are dropped and words
 * found in a category glossary are boosted; both lists are loaded from PHP
 * files named after the configured language and category.
 */
class keywordsugest{
    /** @var string|false HTML to analyse, or false while nothing is loaded. */
    public $_html = false;
    /** @var int Weight used when computing how many body keywords to keep. */
    public $_keyCount = 5;
    /** @var array Keywords collected so far. */
    public $_keyWords = array();
    /** @var string Character encoding passed to mb_strtolower() and html_entity_decode(). */
    public $_encoding = 'UTF-8';
    /** @var string Language code used in the stop-word and glossary file names. */
    public $_lang = 'es';
    /** @var string Category used in the glossary file name. */
    public $_catego = 'telecom';
    /** @var string Location last passed to file(). */
    public $_url = '';

    /**
     * Reads the comma-separated list of the <meta name="keywords"> tag.
     *
     * The list is lower-cased, every entry is trimmed, empty entries are
     * dropped and duplicates removed. Replaces any previously collected
     * keywords when the tag is found; does nothing when no HTML is loaded.
     */
    public function readMetaKeyWords() {
        if (! $this->_html) return;
        preg_match('/<[\s]*meta[\s]*name[\s]*=[\s]*\"[\s]*keywords[\s]*\"[\s]*content[\s]*=[\s]*\"?([^>"]*)\"?[\s]*[\/]?[\s]*>/is', $this->_html, $match);
        if (count($match)) {
            $list = preg_replace('/\s/i', ' ', mb_strtolower($match[1], $this->_encoding));
            $found = array();
            foreach (explode(',', $list) as $entry) {
                // "a, b" would otherwise yield " b", which never equals "b".
                $entry = trim($entry);
                if ($entry !== '') {
                    $found[] = $entry;
                }
            }
            $this->_keyWords = array_values(array_unique($found));
        }
    }

    /**
     * Replaces every HTML tag with a space and collapses runs of spaces.
     *
     * @param mixed $string markup to clean
     * @return string text without tags, trimmed
     */
    private function rip_tags($string) {
        // ----- remove HTML TAGs -----
        $string = preg_replace('/<[^>]*>/', ' ', $string);
        // ----- remove multiple spaces -----
        $string = trim(preg_replace('/ {2,}/', ' ', $string));
        return $string;
    }

    /**
     * Collects keywords from the text of the loaded HTML.
     *
     * Already collected keywords (e.g. from the meta tag) are appended to the
     * text so they take part in the frequency count, then the keyword list is
     * rebuilt from scratch. Note that $_html is overwritten with the cleaned,
     * lower-cased text.
     */
    public function readHtmlKeyWords() {
        if (! $this->_html) return;

        if (!empty($this->_keyWords)) {
            $this->_html = $this->_html . ' ' . implode(' ', $this->_keyWords);
            $this->_keyWords = array();
        }

        // Turn non-breaking-space entities into plain spaces before decoding,
        // so they still act as word separators. The published listing showed a
        // space-for-space replacement, which does nothing.
        $this->_html = str_replace('&nbsp;', ' ', $this->_html);

        # remove unneeded parts
        $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');
        foreach ($toRemove as $remove) {
            // \b keeps "head" from also matching the opening of <header>.
            $this->_html = preg_replace("/\<\s*{$remove}\b.*?\>.*?\<\s*\/\s*{$remove}\s*\>/is", ' ', $this->_html);
        }
        # remove comments
        $this->_html = preg_replace("/\<\s*!--.*?-->/is", ' ', $this->_html);
        # delete html tags
        $this->_html = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);
        $this->_html = htmlspecialchars_decode($this->_html);
        # decode encoded html entities
        $this->_html = html_entity_decode($this->_html, ENT_COMPAT, $this->_encoding);

        # break into words
        $words = preg_split("/[\s]+|[\t]+|[\.]+|[\,]+|[\:]+|[\;]+|[\!]+|[\?]+|[\|]+/s", $this->_html, -1, PREG_SPLIT_NO_EMPTY);
        if (empty($words)) return; // also covers preg_split() failing with false

        $frequency = array_count_values($words);
        unset($frequency['']);
        if (empty($frequency)) return;

        # delete stop words and interpunctions
        // Defaults keep the filters working when a word-list file is missing or
        // does not define the expected variable.
        $stopWords = array();
        $glodic = array();
        // The language and category become part of a file name; strip anything
        // that could turn them into a path.
        $lang = preg_replace('/[^A-Za-z0-9_-]/', '', (string) $this->_lang);
        $catego = preg_replace('/[^A-Za-z0-9_-]/', '', (string) $this->_catego);
        include('stopwords_' . $lang . '.php');
        include('glodic_' . $catego . '_' . $lang . '.php');
        if (!is_array($stopWords)) $stopWords = array();
        if (!is_array($glodic)) $glodic = array();

        $punct = '~!@#$%^&*()_+|}{[];:\'\",<.>/?`-=\\';
        foreach (array_keys($frequency) as $key) {
            // array_count_values() turns numeric words into integer keys;
            // compare them as strings so an int never loosely equals a word.
            $word = (string) $key;
            if (in_array($word, $stopWords, true) || strspn($word, $punct) === strlen($word)) {
                unset($frequency[$key]);
            }
        }
        // Everything may have been filtered out; max() and the divisions below
        // need at least one remaining word.
        if (empty($frequency)) return;

        $max = max($frequency);
        $count = count($frequency);
        $tot = round(($max * 100) / $count);
        $tot2 = round(($this->_keyCount * 100) / $count);
        if ($tot > $count) { $tot = $tot / 2; }
        // The original halved $tot a second time here, leaving $tot2 uncapped.
        if ($tot2 > $count) { $tot2 = $tot2 / 2; }
        // Used both as the glossary boost and as the number of keywords to keep.
        $showmax = round(($tot + $tot2) / 2);

        foreach (array_keys($frequency) as $key) {
            if (in_array((string) $key, $glodic, true)) {
                $frequency[$key] = $frequency[$key] + $showmax;
            }
        }

        # sort by frequency
        arsort($frequency, SORT_NUMERIC);

        # add them to keyword array
        $i = 0;
        foreach ($frequency as $key => $occurrences) {
            $word = (string) $key;
            if ($word === '' || is_numeric($word) || in_array($word, $this->_keyWords, true)) {
                continue;
            }
            $this->_keyWords[] = $word;
            $i++;
            // As in the original, a $showmax of 0 never matches, so no limit applies.
            if ($i == $showmax) break;
        }
    }

    /**
     * Changes the encoding from the default UTF-8.
     *
     * @param mixed $enc encoding name; a falsy value leaves the setting unchanged
     */
    private function encoding($enc = false) {
        if ($enc) $this->_encoding = $enc;
    }

    /**
     * Loads the HTML from a file or URL.
     *
     * @param mixed $fileUrl location handed to file_get_contents(); a falsy
     *                       value leaves the object untouched
     */
    public function file($fileUrl = false) {
        if ($fileUrl) {
            // Errors stay suppressed on purpose: a failed read stores false,
            // which every reader treats as "nothing loaded".
            $this->_html = @file_get_contents($fileUrl);
            $this->_url = $fileUrl;
        }
    }

    /**
     * Sets the HTML to analyse directly from a string.
     *
     * @param mixed $page HTML source; a falsy value leaves the object untouched
     */
    public function html($page = false) {
        if ($page) $this->_html = $page;
    }

    /**
     * Reads the meta keywords first and then the body keywords.
     */
    public function readAll() {
        if ($this->_html !== false) {
            $this->readMetaKeyWords();
            $this->readHtmlKeyWords();
        }
        $this->_keyWords = array_unique($this->_keyWords);
    }

    /**
     * Returns the collected keywords.
     *
     * @return array
     */
    public function get() {
        return $this->_keyWords;
    }
}
?>
- 上一篇:FirePHP使用详解
- 下一篇:php 定时脚本管理器
精彩图集
精彩文章






