基于信息熵原理的无词典分词这个概念很早就有了,我用 PHP 实现了一个,一气呵成,代码自然比较粗糙,反正我也没有做二次检查,呵呵。不过耗费内存是真的,真的很消耗内存!
写这个的好处就是我明白了很多东西…
代码如下
<?php
/**
 * Dictionary-free word segmentation based on information entropy.
 *
 * The text is split into tokens (one token per multi-byte UTF-8 character,
 * one token per run of ASCII/other single bytes). Every run of up to
 * $maxwordlen consecutive tokens is treated as a candidate word, and each
 * candidate is scored by the smaller of its left-neighbour and
 * right-neighbour entropy. A higher score means the candidate appears in
 * more varied contexts.
 *
 * Memory note: the candidate table holds up to $maxwordlen entries per
 * token position, so memory use grows quickly with the text length.
 */
class partword
{
    /**
     * Maximum number of tokens a candidate word may span.
     */
    public $maxwordlen = 5;

    /**
     * The text to segment (treated as a UTF-8 byte string).
     */
    public $text;

    /**
     * Number of tokens produced by the most recent initWords() call.
     */
    private $len = 0;

    /**
     * Token list produced by initWords().
     */
    private $textarr = array();

    /**
     * Candidate-word table built by calInfo() and scored by calEntropy().
     */
    private $wordinfo = array();

    /**
     * Split $this->text into tokens.
     *
     * - A lead byte >= 240 starts a 4-byte character, >= 224 a 3-byte
     *   character and >= 192 a 2-byte character; each such character
     *   becomes its own token.
     * - Bytes <= 32 (space and control characters) are separators and are
     *   dropped.
     * - Any other run of consecutive bytes is collected into one token.
     *
     * The token count is recomputed on every call, so the object can be
     * reused for several texts.
     *
     * @return array list of tokens, in text order
     */
    public function initWords()
    {
        $text = (string) $this->text;
        $byteCount = strlen($text);
        $tokens = array();
        $pending = ''; // run of single-byte characters not yet emitted
        $pos = 0;

        while ($pos < $byteCount) {
            $lead = ord($text[$pos]);

            if ($lead >= 192) {
                // Multi-byte character: width is determined by the lead byte.
                if ($lead >= 240) {
                    $width = 4;
                } elseif ($lead >= 224) {
                    $width = 3;
                } else {
                    $width = 2;
                }
                // The pending single-byte run ends here and precedes this character.
                if ($pending !== '') {
                    $tokens[] = $pending;
                    $pending = '';
                }
                $tokens[] = substr($text, $pos, $width);
                $pos += $width;
            } elseif ($lead <= 32) {
                // Separator: close the pending run, emit nothing for the separator.
                if ($pending !== '') {
                    $tokens[] = $pending;
                    $pending = '';
                }
                $pos += 1;
            } else {
                $pending .= $text[$pos];
                $pos += 1;
            }
        }

        // A run that reaches the end of the text must not be lost.
        if ($pending !== '') {
            $tokens[] = $pending;
        }

        $this->len = count($tokens);

        return $this->textarr = $tokens;
    }

    /**
     * Set the text and run the whole pipeline: tokenise, collect the
     * candidate words, then score them.
     *
     * @param string $text text to segment
     */
    public function sendText($text)
    {
        $this->text = $text;
        $this->initWords();
        $this->calInfo();
        $this->calEntropy();
    }

    /**
     * Collect every candidate word of 1..$maxwordlen consecutive tokens.
     *
     * Each table entry has the keys:
     *   'w' => the candidate word itself
     *   'l' => list of token indexes where an occurrence starts
     *   'r' => list of token indexes where an occurrence ends
     *   'c' => number of occurrences
     *
     * @return array candidate table keyed by word
     */
    public function calInfo()
    {
        $tokens = $this->textarr;
        $tokenCount = count($tokens);
        $table = array();

        for ($start = 0; $start < $tokenCount; $start++) {
            $word = '';
            for ($span = 0; $span < $this->maxwordlen; $span++) {
                $end = $start + $span;
                if ($end >= $tokenCount) {
                    // Every longer span would run past the end as well.
                    break;
                }
                $word .= $tokens[$end];

                if (!isset($table[$word])) {
                    $table[$word] = array(
                        'w' => $word,
                        'l' => array(),
                        'r' => array(),
                        'c' => 0,
                    );
                }
                $table[$word]['l'][] = $start;
                $table[$word]['r'][] = $end;
                $table[$word]['c'] += 1;
            }
        }

        return $this->wordinfo = $table;
    }

    /**
     * Score every candidate with min(left entropy, right entropy).
     *
     * Once an entry is scored, its position lists ('l', 'r') are removed
     * and the token list is released to save memory. Entries that were
     * already scored are skipped, so a repeated call is harmless.
     *
     * @return array candidate table; each entry holds 'w', 'c' and 'entropy'
     */
    public function calEntropy()
    {
        $tokens = $this->textarr;
        $lastIndex = count($tokens) - 1;

        foreach ($this->wordinfo as $key => $info) {
            if (!isset($info['l'], $info['r'])) {
                continue; // already scored by an earlier call
            }

            // Count the tokens directly left of each occurrence
            // (an occurrence at the very start has no left neighbour).
            $leftCounts = array();
            foreach ($info['l'] as $startIndex) {
                if ($startIndex > 0) {
                    $neighbour = $tokens[$startIndex - 1];
                    $leftCounts[$neighbour] = isset($leftCounts[$neighbour]) ? $leftCounts[$neighbour] + 1 : 1;
                }
            }

            // Count the tokens directly right of each occurrence
            // (an occurrence at the very end has no right neighbour).
            $rightCounts = array();
            foreach ($info['r'] as $endIndex) {
                if ($endIndex < $lastIndex) {
                    $neighbour = $tokens[$endIndex + 1];
                    $rightCounts[$neighbour] = isset($rightCounts[$neighbour]) ? $rightCounts[$neighbour] + 1 : 1;
                }
            }

            $leftEntropy = $this->neighbourEntropy($leftCounts, $info['c']);
            $rightEntropy = $this->neighbourEntropy($rightCounts, $info['c']);

            // The weaker side decides the score.
            $this->wordinfo[$key]['entropy'] = ($leftEntropy < $rightEntropy) ? $leftEntropy : $rightEntropy;
            unset($this->wordinfo[$key]['l'], $this->wordinfo[$key]['r']);
        }

        // Release the tokens but keep the property defined for later calls.
        $this->textarr = array();

        return $this->wordinfo;
    }

    /**
     * Sort the candidates by entropy, highest first.
     *
     * usort() re-indexes the table, so the word keys are replaced by
     * 0..n-1; the word stays available under each entry's 'w' key.
     *
     * @return array sorted list of candidate entries
     */
    public function sortResult()
    {
        usort($this->wordinfo, array($this, 'sortByEntropy'));

        return $this->wordinfo;
    }

    /**
     * Entropy of a neighbour distribution, using the natural logarithm.
     *
     * Each probability is count / $total, where $total is the word
     * frequency. Occurrences at a text boundary have no neighbour, so the
     * probabilities may sum to less than 1.
     *
     * @param array $counts neighbour token => number of times seen
     * @param int   $total  number of occurrences of the candidate word
     * @return float
     */
    private function neighbourEntropy(array $counts, $total)
    {
        // Start from a float so every score has the same type.
        $entropy = 0.0;
        foreach ($counts as $count) {
            $p = $count / $total;
            $entropy -= $p * log($p);
        }

        return $entropy;
    }

    /**
     * usort() comparator: descending by 'entropy'.
     */
    private function sortByEntropy($a, $b)
    {
        if ($a['entropy'] == $b['entropy']) {
            return 0;
        }

        return ($a['entropy'] > $b['entropy']) ? -1 : 1;
    }
}

// Demo: segment a sample sentence and report the memory it used.
header("Content-type:text/html;charset=utf-8");
$smen = memory_get_usage();
$partword = new partword();
$text = "吃葡萄不吐葡萄皮不吃葡萄倒吐葡萄皮";
$partword->sendText($text);
print_r($partword->sortResult());
$emen = memory_get_usage();
echo (($emen - $smen) / 1024) . "kb";
?>
你可能还喜欢下面这些文章
赞赏微信赞赏
支付宝赞赏