php 分词搜索 splitword

/*分词效果*//* 修改_START */include_once('includes/lib_splitword_full.php');$kkk = trim($_REQUEST['keywords']);$kkk_upper = strtoupper($kkk);//判断是否符合商家搜索$corre = $db->getOne("select name from " . $ecs->table('correlation') . " where url ='$kkk' or url ='$kkk_upper' limit 0,1");$Recordkw = str_replace(array("\'"), array(''), trim($_REQUEST['keywords']));$cfg_soft_lang = 'utf-8';$sp = new SplitWord($cfg_soft_lang, $cfg_soft_lang);$sp->SetSource($Recordkw, $cfg_soft_lang, $cfg_soft_lang);$sp->SetResultType(2);$sp->StartAnalysis(TRUE);$word = $sp->GetFinallyResult(' ');

<?phpif (!defined('IN_ECS'))
{die('Hacking attempt');
}class SplitWord
{//hash算法选项var $mask_value = 0x3ffd;//输入和输出的字符编码（只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型）  var $sourceCharSet = 'utf-8';var $targetCharSet = 'utf-8';//生成的分词结果数据类型 1 为全部， 2为 词典词汇及单个中日韩简繁字符及英文， 3 为词典词汇及英文var $resultType = 1;//句子长度小于这个数值时不拆分，notSplitLen = n(个汉字) * 2 + 1var $notSplitLen = 5;//把英文单词全部转小写var $toLower = FALSE;//使用最大切分模式对二元词进行消岐var $differMax = FALSE;//尝试合并单字var $unitWord = TRUE;//初始化类时直接加载词典var $loadInit = TRUE;//使用热门词优先模式进行消岐var $differFreq = FALSE;//被转换为unicode的源字符串var $sourceString = '';//附加词典var $addonDic = array();var $addonDicFile = 'includes/codetable/words_addons.dic';//主词典 var $dicStr = '';var $mainDic = array();var $mainDicHand = FALSE;var $mainDicInfos = array();var $mainDicFile = 'includes/codetable/base_dic_full.dic';//是否直接载入词典（选是载入速度较慢，但解析较快；选否载入较快，但解析较慢，需要时才会载入特定的词条）var $mainDicFileZip = 'includes/codetable/base_dic_full.zip';var $isLoadAll = FALSE;var $isUnpacked = FALSE;//主词典词语最大长度 x / 2var $dicWordMax = 14;//粗分后的数组（通常是截取句子等用途）var $simpleResult = array();//最终结果(用空格分开的词汇列表)var $finallyResult = '';//是否已经载入词典var $isLoadDic = FALSE;//系统识别或合并的新词var $newWords = array();var $foundWordStr = '';//词库载入时间var $loadTime = 0;/*** 构造函数* @param $source_charset* @param $target_charset* @param $load_alldic * @param $source** @return void*/function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source=''){$this->SetSource( $source, $source_charset, $target_charset );$this->isLoadAll = $load_all;if(file_exists(ROOT_PATH .$this->mainDicFile)) $this->isUnpacked = TRUE;if($this->loadInit) $this->LoadDict();}function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source=''){$this->__construct($source_charset, $target_charset, $load_all, $source);}/*** 析构函数*/function __destruct(){if( $this->mainDicHand !== FALSE ){@fclose( $this->mainDicHand );}}/*** 根据字符串计算key索引* @param $key* @return short int*/function _get_index( $key ){$l = strlen($key);$h = 0x238f13af;while ($l--){$h += ($h << 5);$h ^= ord($key[$l]);$h &= 0x7fffffff;}return ($h % $this->mask_value);}/*** 从文件获得词* @param $key* @param $type (类型 word 或 key_groups)* @return short int*/function GetWordInfos( $key, $type='word' ){if( !$this->mainDicHand ){$this->mainDicHand = fopen($this->mainDicFile, 'r');}$p = 0;$keynum = $this->_get_index( $key );if( isset($this->mainDicInfos[ $keynum ]) ){$data = $this->mainDicInfos[ $keynum ];}else{//rewind( $this->mainDicHand );$move_pos = $keynum * 8;fseek($this->mainDicHand, $move_pos, SEEK_SET);$dat = fread($this->mainDicHand, 8);$arr = unpack('I1s/n1l/n1c', $dat);if( $arr['l'] == 0 ){return FALSE;}fseek($this->mainDicHand, $arr['s'], SEEK_SET);$data = @unserialize(fread($this->mainDicHand, $arr['l']));$this->mainDicInfos[ $keynum ] = $data;}if( !is_array($data) || !isset($data[$key]) ) {return FALSE;}return ($type=='word' ? $data[$key] : $data);}/*** 设置源字符串* @param $source* @param $source_charset* @param $target_charset** @return bool*/function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' ){$this->sourceCharSet = strtolower($source_charset);$this->targetCharSet = strtolower($target_charset);$this->simpleResult = array();$this->finallyResult = array();$this->finallyIndex = array();if( $source != '' ){$rs = TRUE;if( preg_match("/^utf/", $source_charset) ) {$this->sourceString = @iconv('utf-8', UCS2, $source);}else if( preg_match("/^gb/", $source_charset) ) {$this->sourceString = @iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source));}else if( preg_match("/^big/", $source_charset) ) {$this->sourceString = @iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source));}else {$rs = FALSE;}}else{$rs = FALSE;}return $rs;}/*** 设置结果类型(只在获取finallyResult才有效)* @param $rstype 1 为全部， 2去除特殊符号** @return void*/function SetResultType( $rstype ){$this->resultType = $rstype;}/*** 载入词典** @return void*/function LoadDict( $maindic='' ){$this->addonDicFile   = ROOT_PATH . $this->addonDicFile;$this->mainDicFile    = ROOT_PATH . $this->mainDicFile;$this->mainDicFileZip = ROOT_PATH . $this->mainDicFileZip;$startt = microtime(TRUE);//正常读取文件$dicAddon = $this->addonDicFile;if($maindic=='' || !file_exists($maindic) ){$dicWords = $this->mainDicFile ;}else{$dicWords = $maindic;$this->mainDicFile = $maindic;}//加载主词典（只打开）if($this->isUnpacked){$this->mainDicHand = fopen($dicWords, 'r');}else{$this->InportDict($this->mainDicFileZip);}//载入副词典$hw = '';$ds = file($dicAddon);foreach($ds as $d){$d = trim($d);if($d=='') continue;$estr = substr($d, 1, 1);if( $estr==':' ) {$hw = substr($d, 0, 1);}else{$spstr = _SP_;$spstr = iconv(UCS2, 'utf-8', $spstr);$ws = explode(',', $d);$wall = iconv('utf-8', UCS2, join($spstr, $ws));$ws = explode(_SP_, $wall);foreach($ws as $estr){$this->addonDic[$hw][$estr] = strlen($estr);}}}$this->loadTime = microtime(TRUE) - $startt;$this->isLoadDic = TRUE;}/*** 检测某个词是否存在*/function IsWord( $word ){$winfos = $this->GetWordInfos( $word );return ($winfos !== FALSE);}/*** 获得某个词的词性及词频信息* @parem $word unicode编码的词* @return void*/function GetWordProperty($word){if( strlen($word)<4 ){return '/s';}$infos = $this->GetWordInfos($word);return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s";}/*** 指定某词的词性信息（通常是新词）* @parem $word unicode编码的词* @parem $infos array('c' => 词频, 'm' => 词性);* @return void;*/function SetWordInfos($word, $infos){if( strlen($word)<4 ){return ;}if( isset($this->mainDicInfos[$word]) ){$this->newWords[$word]++;$this->mainDicInfos[$word]['c']++;}else{$this->newWords[$word] = 1;$this->mainDicInfos[$word] = $infos;}}/*** 开始执行分析* @parem bool optimize 是否对结果进行优化* @return bool*/function StartAnalysis($optimize=TRUE){if( !$this->isLoadDic ){$this->LoadDict();}$this->simpleResult = $this->finallyResult = array();$this->sourceString .= chr(0).chr(32);$slen = strlen($this->sourceString);$sbcArr = array();$j = 0;//全角与半角字符对照表for($i=0xFF00; $i < 0xFF5F; $i++){$scb = 0x20 + $j;$j++;$sbcArr[$i] = $scb;}//对字符串进行粗分$onstr = '';$lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符$s = 0;$ansiWordMatch = "[0-9a-z@#%\+\.-]";$notNumberMatch = "[a-z@#%\+]";for($i=0; $i < $slen; $i++){$c = $this->sourceString[$i].$this->sourceString[++$i];$cn = hexdec(bin2hex($c));$cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;//ANSI字符if($cn < 0x80){if( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) ){if( $lastc != 2 && $onstr != '') {$this->simpleResult[$s]['w'] = $onstr;$this->simpleResult[$s]['t'] = $lastc;$this->_deep_analysis($onstr, $lastc, $s, $optimize);$s++;$onstr = '';}$lastc = 2;$onstr .= chr(0).chr($cn);}else{if( $onstr != '' ){$this->simpleResult[$s]['w'] = $onstr;if( $lastc==2 ){if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;}$this->simpleResult[$s]['t'] = $lastc;if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);$s++;}$onstr = '';$lastc = 3;if($cn < 31){continue;}else{$this->simpleResult[$s]['w'] = chr(0).chr($cn);$this->simpleResult[$s]['t'] = 3;$s++;}}}//普通字符else{//正常文字if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)|| ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) ){if( $lastc != 1 && $onstr != ''){$this->simpleResult[$s]['w'] = $onstr;if( $lastc==2 ){if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;}$this->simpleResult[$s]['t'] = $lastc;if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);$s++;$onstr = '';}$lastc = 1;$onstr .= $c;}//特殊符号else{if( $onstr != '' ){$this->simpleResult[$s]['w'] = $onstr;if( $lastc==2 ){if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;}$this->simpleResult[$s]['t'] = $lastc;if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);$s++;}//检测书名if( $cn == 0x300A ){$tmpw = '';$n = 1;$isok = FALSE;$ew = chr(0x30).chr(0x0B);while(TRUE){$w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];if( $w == $ew ){$this->simpleResult[$s]['w'] = $c;$this->simpleResult[$s]['t'] = 5;$s++;$this->simpleResult[$s]['w'] = $tmpw;$this->newWords[$tmpw] = 1;if( !isset($this->newWords[$tmpw]) ){$this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';$this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));}$this->simpleResult[$s]['t'] = 13;$s++;//最大切分模式对书名继续分词if( $this->differMax ){$this->simpleResult[$s]['w'] = $tmpw;$this->simpleResult[$s]['t'] = 21;$this->_deep_analysis($tmpw, $lastc, $s, $optimize);$s++;}$this->simpleResult[$s]['w'] = $ew;$this->simpleResult[$s]['t'] =  5;$s++;$i = $i + $n + 1;$isok = TRUE;$onstr = '';$lastc = 5;break;}else{$n = $n+2;$tmpw .= $w;if( strlen($tmpw) > 60 ){break;}}}//whileif( !$isok ){$this->simpleResult[$s]['w'] = $c;$this->simpleResult[$s]['t'] = 5;$s++;$onstr = '';$lastc = 5;}continue;}$onstr = '';$lastc = 5;if( $cn==0x3000 ){continue;}else{$this->simpleResult[$s]['w'] = $c;$this->simpleResult[$s]['t'] = 5;$s++;}}//2byte symbol}//end 2byte char}//end for//处理分词后的结果$this->_sort_finally_result();}/*** 深入分词* @parem $str* @parem $ctype (2 英文类， 3 中/韩/日文类)* @parem $spos   当前粗分结果游标* @return bool*/function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE ){//中文句子if( $ctype==1 ){$slen = strlen($str);//小于系统配置分词要求长度的句子if( $slen < $this->notSplitLen ){$tmpstr = '';$lastType = 0;if( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t'];if($slen < 5){//echo iconv(UCS2, 'utf-8', $str).'<br/>';if( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) ){$str2 = '';if( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) ){$str2 = substr($str, 2, 2);$str  = substr($str, 0, 2);}$ww = $this->simpleResult[$spos - 1]['w'].$str;$this->simpleResult[$spos - 1]['w'] = $ww;$this->simpleResult[$spos - 1]['t'] = 4;if( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) ){$this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';$this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));}$this->simpleResult[$spos]['w'] = '';if( $str2 != '' ){$this->finallyResult[$spos-1][] = $ww;$this->finallyResult[$spos-1][] = $str2;}}else {$this->finallyResult[$spos][] = $str;}}else{$this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );}}//正常长度的句子，循环进行分词处理else{$this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );}}//英文句子，转为小写else{if( $this->toLower ) {$this->finallyResult[$spos][] = strtolower($str);}else {$this->finallyResult[$spos][] = $str;}}}/*** 中文的深入分词* @parem $str* @return void*/function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE ){$quote1 = chr(0x20).chr(0x1C);$tmparr = array();$hasw = 0;//如果前一个词为 “ ， 并且字符串小于3个字符当成一个词处理。if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 ){$tmparr[] = $str;if( !isset($this->newWords[$str]) ){$this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';$this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));}if( !$this->differMax ){$this->finallyResult[$spos][] = $str;return ;}}//进行切分for($i=$slen-1; $i > 0; $i -= 2){//单个词$nc = $str[$i-1].$str[$i];//是否已经到最后两个字if( $i <= 2 ){$tmparr[] = $nc;$i = 0;break;}$isok = FALSE;$i = $i + 1;for($k=$this->dicWordMax; $k>1; $k=$k-2){if($i < $k) continue;$w = substr($str, $i-$k, $k);if( strlen($w) <= 2 ){$i = $i - 1;break;}if( $this->IsWord( $w ) ){$tmparr[] = $w;$i = $i - $k + 1;$isok = TRUE;break;}}//echo '<hr />';//没适合词if(!$isok) $tmparr[] = $nc;}$wcount = count($tmparr);if( $wcount==0 ) return ;$this->finallyResult[$spos] = array_reverse($tmparr);//优化结果(岐义处理、新词、数词、人名识别等)if( $optimize ){$this->_optimize_result( $this->finallyResult[$spos], $spos );}}/*** 对最终分词结果进行优化（把simpleresult结果合并，并尝试新词识别、数词合并等）* @parem $optimize 是否优化合并的结果* @return bool*///t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符function _optimize_result( &$smarr, $spos ){$newarr = array();$prePos = $spos - 1;$arlen = count($smarr);$i = $j = 0;//检测数量词if( $prePos > -1 && !isset($this->finallyResult[$prePos]) ){$lastw = $this->simpleResult[$prePos]['w'];$lastt = $this->simpleResult[$prePos]['t'];if( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) ){$this->simpleResult[$prePos]['w'] = $lastw.$smarr[0];$this->simpleResult[$prePos]['t'] = 4;if( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) ){$this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, ';$this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu'));}$smarr[0] = '';$i++;}}for(; $i < $arlen; $i++){if( !isset( $smarr[$i+1] ) ){$newarr[$j] = $smarr[$i];break;}$cw = $smarr[$i];$nw = $smarr[$i+1];$ischeck = FALSE;//检测数量词if( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) ){//最大切分时保留合并前的词if($this->differMax){$newarr[$j] = chr(0).chr(0x28);$j++;$newarr[$j] = $cw;$j++;$newarr[$j] = $nw;$j++;$newarr[$j] = chr(0).chr(0x29);$j++;}$newarr[$j] = $cw.$nw;if( !isset($this->newWords[$newarr[$j]]) ){$this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, ';$this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));}$j++; $i++; $ischeck = TRUE;}//检测前导词(通常是姓)else if( isset( $this->addonDic['n'][ $smarr[$i] ] ) ){$is_rs = FALSE;//词语是副词或介词或频率很高的词不作为人名if( strlen($nw)==4 ){$winfos = $this->GetWordInfos($nw);if(isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) ){$is_rs = TRUE;}}if( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs ){$newarr[$j] = $cw.$nw;//echo iconv(UCS2, 'utf-8', $newarr[$j])."<br />";//尝试检测第三个词if( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) ){$newarr[$j] .= $smarr[$i+2];$i++;}if( !isset($this->newWords[$newarr[$j]]) ){$this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';}//为了防止错误，保留合并前的姓名if(strlen($nw)==4){$j++;$newarr[$j] = chr(0).chr(0x28);$j++;$newarr[$j] = $cw;$j++;$newarr[$j] = $nw;$j++;$newarr[$j] = chr(0).chr(0x29);}$j++; $i++; $ischeck = TRUE;}}//检测后缀词(地名等)else if( isset($this->addonDic['a'][$nw]) ){$is_rs = FALSE;//词语是副词或介词不作为前缀if( strlen($cw)>2 ){$winfos = $this->GetWordInfos($cw);if(isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) ){$is_rs = TRUE;}}if( !isset($this->addonDic['s'][$cw]) && !$is_rs ){$newarr[$j] = $cw.$nw;if( !isset($this->newWords[$newarr[$j]]) ){$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';$this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));}$i++; $j++; $ischeck = TRUE;}}//新词识别（暂无规则）else if($this->unitWord){if(strlen($cw)==2 && strlen($nw)==2 && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw]) && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])){$newarr[$j] = $cw.$nw;//尝试检测第三个词if( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) ){$newarr[$j] .= $smarr[$i+2];$i++;}if( !isset($this->newWords[$newarr[$j]]) ){$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';$this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));}$i++; $j++; $ischeck = TRUE;}}//不符合规则if( !$ischeck ){$newarr[$j] = $cw;//二元消岐处理——最大切分模式if( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7){$slen = strlen($nw);$hasDiff = FALSE;for($y=2; $y <= $slen-2; $y=$y+2){$nhead = substr($nw, $y-2, 2);$nfont = $cw.substr($nw, 0, $y-2);if( $this->IsWord( $nfont.$nhead ) ){if( strlen($cw) > 2 ) $j++;$hasDiff = TRUE;$newarr[$j] = $nfont.$nhead;}}}$j++;}}//end for$smarr =  $newarr;}/*** 转换最终分词结果到 finallyResult 数组* @return void*/function _sort_finally_result(){$newarr = array();$i = 0;foreach($this->simpleResult as $k=>$v){if( empty($v['w']) ) continue;if( isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0 ){foreach($this->finallyResult[$k] as $w){if(!empty($w)){$newarr[$i]['w'] = $w;$newarr[$i]['t'] = 20;$i++;}}}else if($v['t'] != 21){$newarr[$i]['w'] = $v['w'];$newarr[$i]['t'] = $v['t'];$i++;}}$this->finallyResult = $newarr;$newarr = '';}/*** 把uncode字符串转换为输出字符串* @parem str* return string*/function _out_string_encoding( &$str ){$rsc = $this->_source_result_charset();if( $rsc==1 ) {$rsstr = iconv(UCS2, 'utf-8', $str);}else if( $rsc==2 ) {$rsstr = iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str) );}else{$rsstr = iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str) );}return $rsstr;}/*** 获取最终结果字符串（用空格分开后的分词结果）* @return string*/function GetFinallyResult($spword=' ', $word_meanings=FALSE){$rsstr = '';foreach($this->finallyResult as $v){if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) ){continue;}$m = '';if( $word_meanings ){$m = $this->GetWordProperty($v['w']);}$w = $this->_out_string_encoding($v['w']);if( $w != ' ' ){if($word_meanings) {$rsstr .= $spword.$w.$m;}else {$rsstr .= $spword.$w;}}}return $rsstr;}/*** 获取粗分结果，不包含粗分属性* @return array()*/function GetSimpleResult(){$rearr = array();foreach($this->simpleResult as $k=>$v){if( empty($v['w']) ) continue;$w = $this->_out_string_encoding($v['w']);if( $w != ' ' ) $rearr[] = $w;}return $rearr;}/*** 获取粗分结果，包含粗分属性（1中文词句、2 ANSI词汇（包括全角），3 ANSI标点符号（包括全角），4数字（包括全角），5 中文标点或无法识别字符）* @return array()*/function GetSimpleResultAll(){$rearr = array();foreach($this->simpleResult as $k=>$v){$w = $this->_out_string_encoding($v['w']);if( $w != ' ' ){$rearr[$k]['w'] = $w;$rearr[$k]['t'] = $v['t'];}}return $rearr;}/*** 获取索引hash数组* @return array('word'=>count,...)*/function GetFinallyIndex(){$rearr = array();foreach($this->finallyResult as $v){if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) ){continue;}$w = $this->_out_string_encoding($v['w']);if( $w == ' ' ){continue;}if( isset($rearr[$w]) ){$rearr[$w]++;}else{$rearr[$w] = 1;}}return $rearr;}/*** 获得保存目标编码* @return int*/function _source_result_charset(){if( preg_match("/^utf/", $this->targetCharSet) ) {$rs = 1;}else if( preg_match("/^gb/", $this->targetCharSet) ) {$rs = 2;}else if( preg_match("/^big/", $this->targetCharSet) ) {$rs = 3;}else {$rs = 4;}return $rs;}/*** 编译词典* @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt>* 注意, 需要PHP开放足够的内存才能完成操作* @return void*/function MakeDict( $source_file, $target_file='' ){$target_file = ($target_file=='' ? $this->mainDicFile : $target_file);$allk = array();$fp = fopen($source_file, 'r');while( $line = fgets($fp, 512) ){if( $line[0]=='@' ) continue;list($w, $r, $a) = explode(',', $line);$a = trim( $a );$w = iconv('utf-8', UCS2, $w);$k = $this->_get_index( $w );if( isset($allk[ $k ]) )$allk[ $k ][ $w ] = array($r, $a);else$allk[ $k ][ $w ] = array($r, $a);}fclose( $fp );$fp = fopen($target_file, 'w');$heade_rarr = array();$alldat = '';$start_pos = $this->mask_value * 8;foreach( $allk as $k => $v ){$dat  = serialize( $v );$dlen = strlen($dat);$alldat .= $dat;$heade_rarr[ $k ][0] = $start_pos;$heade_rarr[ $k ][1] = $dlen;$heade_rarr[ $k ][2] = count( $v );$start_pos += $dlen;}unset( $allk );for($i=0; $i < $this->mask_value; $i++){if( !isset($heade_rarr[$i]) ){$heade_rarr[$i] = array(0, 0, 0);}fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2]));}fwrite( $fp, $alldat);fclose( $fp );}/*** 导出词典的词条* @parem $targetfile 保存位置* @return void*/function ExportDict( $targetfile ){if( !$this->mainDicHand ){$this->mainDicHand = fopen($this->mainDicFile, 'rw');}$fp = fopen($targetfile, 'w');for($i=0; $i <= $this->mask_value; $i++){$move_pos = $i * 8;fseek($this->mainDicHand, $move_pos, SEEK_SET);$dat = fread($this->mainDicHand, 8);$arr = unpack('I1s/n1l/n1c', $dat);if( $arr['l'] == 0 ){continue;}fseek($this->mainDicHand, $arr['s'], SEEK_SET);$data = @unserialize(fread($this->mainDicHand, $arr['l']));if( !is_array($data) ) continue;foreach($data as $k => $v){$w = iconv(UCS2, 'utf-8', $k);fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");}}fclose( $fp );return TRUE;}function InportDict( $targetfile ){if(!ini_set('memory_limit', '128M'))exit('设置内存错误，请稍后在试!');require_once(ROOT_PATH . 'includes/zip.class.php');$zip = new zip();//echo $targetfile;$unpackagefile = array_keys($zip->Extract($targetfile, ROOT_PATH . 'includes/codetable/'));//exit();$this->MakeDict(ROOT_PATH . 'includes/codetable/'.$unpackagefile[0]);unlink(ROOT_PATH . 'includes/codetable/'.$unpackagefile[0]);return true;}
}
?>

php 分词搜索 splitword相关推荐

关键词联想关联 php,ECSHOP商品关键词模糊分词搜索插件,商品列表关键字加红功能...
各位ECSHOP网店系统用户大家好,欢迎来到ECSHOP商品关键词模糊分词搜索商品列表关键字加红功能. 最近给客户做一个ECSHOP商城,发现ECSHOP的模糊搜索要求太高,需要加入and.空格.加号 ...
php 搜索名称或者编号,ECSHOP商品关键词模糊分词搜索插件,商品列表关键字加红功能-ecshop插件网...
最近给客户做一个ECSHOP商城,发现ECSHOP的模糊搜索要求太高,需要加入and.空格.加号等,客户搜索的时候不可能这样操作. 考虑到工期的问题,采用了织梦的分词算法,搜索效果虽然不是特别理想,但 ...
找不到具有指定id_JAVA如何整合es指定字段不分词搜索？
一.问题在做一个需求的时候,需要按照电话号码查询用户关系,所以我这边先讲相关信息同步到es,但是电话号码是加密的,所以显示的字符串是杂乱的,既有字母,又有斜杠等号等字符,在进行分词查询的时候匹配不到 ...
es创建索引设置字段不分词_java整合es指定字段不分词搜索
二.问题在做一个需求的时候,需要按照电话号码查询用户关系,所以我这边先讲相关信息同步到es,但是电话号码是加密的,所以显示的字符串是杂乱的,既有字母,又有斜杠等号等字符,在进行分词查询的时候匹配不到 ...
Clucene实现中文分词搜索（转载）
最近,一阵忙乎,终于在Clucene(版本0.9.16)中实现了中文分词搜索. 一些需要改动的地方如下: 一. 把项目设置为Use Unicode Charac ...
php 搜索引擎分词_PHP 实现中文分词搜索功能
中文分词介绍众所周知,英文是以词为单位的,词和词之间是靠空格隔开,而中文是以字为单位,句子中所有的字连起来才能描述一个意思.例如,英文句子I am a student,用中文则为:"我是一 ...
PHP 实现中文分词搜索功能
中文分词介绍众所周知,英语是基于单词的,单词和单词之间用空格隔开,而中文是基于单词的.句子中的所有单词都可以连接起来以描述含义.例如,英文句子"我是学生"将用中文表示" ...
elasticsearch springboot 实现分词搜索
实现分词搜索之前必须保证您的springboot 已经连接上了你的elasticsearch 这里展示下我的配置信息和pom文件 application.yml spring: data: elast ...
ElasticSearch(十一)Java用ElasticSearch 利用搜索词分词搜索
需求: 搜索词进行分词搜索例如: 搜索词为"意大利医生", 那么拆分匹配内容包含"意大利医生"或"意大利"或"医生" ...

php 分词搜索 splitword

php 分词搜索 splitword相关推荐

最新文章

热门文章