Daily Report 2012.11.06 刘宇翔

今天对李忠修改过的match函数进行测试，修正bug,并进行优化。

将中文分词方法加入到算法中，提高了算法的精确度。

但中文分词方法加入到算法后，出现了一些新问题，对新出现的问题进行了修正和优化。

测试过程中运用了900条字符串的样例。

更新后代码如下：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;

namespace match0
{
    /// <summary>
    /// Fuzzy matcher that scores how well a candidate line ("word") matches a
    /// search keyword. Both sides are split into tokens and every token pair is
    /// graded on a 0-4 fuzziness scale, weighted by the shorter token's length.
    /// </summary>
    class Program
    {
        /// <summary>Maximum number of top-ranked lines printed by <see cref="Main"/>.</summary>
        private const int TopResultCount = 20;

        /// <summary>
        /// Scores <paramref name="word"/> against <paramref name="keyword"/>.
        /// </summary>
        /// <param name="word">Candidate text (one line of the corpus).</param>
        /// <param name="keyword">Search text entered by the user.</param>
        /// <returns>
        /// -1 when either input is null or empty; otherwise the best score of any
        /// token of <paramref name="word"/> against any token of
        /// <paramref name="keyword"/> (0 when nothing matches).
        /// </returns>
        public static int match(string word, string keyword)
        {
            // Reject empty input before doing any segmentation work.
            if (string.IsNullOrEmpty(word) || string.IsNullOrEmpty(keyword))
            {
                return -1;
            }

            List<string> wordTokens = Tokenize(word);

            // The keyword tokens do not depend on the word token, so segment once.
            List<string> keywordTokens = Tokenize(keyword);

            int best = 0;
            foreach (string token in wordTokens)
            {
                int degree = BestScore(token, keywordTokens);
                if (degree > best)
                {
                    best = degree;
                }
            }

            return best;
        }

        /// <summary>
        /// Scores a single token <paramref name="w"/> against every token of
        /// <paramref name="keyword"/> and returns the highest score.
        /// </summary>
        /// <param name="w">A single token of the candidate text.</param>
        /// <param name="keyword">Full search text; it is tokenized here.</param>
        /// <returns>The best pairwise score, or 0 when either input is null/empty
        /// or the keyword yields no tokens.</returns>
        public static int wordmatch(string w, string keyword)
        {
            if (string.IsNullOrEmpty(w) || string.IsNullOrEmpty(keyword))
            {
                return 0;
            }

            return BestScore(w, Tokenize(keyword));
        }

        /// <summary>
        /// Scores one candidate token against one keyword token.
        /// </summary>
        /// <param name="w">Candidate token.</param>
        /// <param name="k">Keyword token.</param>
        /// <returns>
        /// level * length, where level is 4 (equal ignoring case), 3 (the shorter
        /// token is a substring of the longer), 2 (they share a run longer than
        /// half the shorter token), 1 (they share at least one character) or 0,
        /// and length is the shorter token's length. Levels 2 and 1 are promoted
        /// by one level when the edit distance is small relative to that length.
        /// </returns>
        public static int wkmatch(string w, string k)
        {
            if (string.IsNullOrEmpty(w) || string.IsNullOrEmpty(k))
            {
                return 0;
            }

            // Case-insensitive comparison; invariant casing keeps scores
            // independent of the machine's current culture.
            w = w.ToLowerInvariant();
            k = k.ToLowerInvariant();

            // Level 4: identical tokens.
            if (w == k)
            {
                return 4 * w.Length;
            }

            bool wIsShorter = w.Length < k.Length;
            string shorter = wIsShorter ? w : k;
            string longer = wIsShorter ? k : w;
            int minLength = shorter.Length;

            // Level 3: the shorter token occurs inside the longer one.
            if (m3(shorter, longer) == 1)
            {
                return 3 * minLength;
            }

            // Level 2, promoted to 3 when the edit distance is below a quarter of
            // the shorter length. Integer division: the promotion can only apply
            // to tokens of at least 8 characters.
            if (m2(shorter, longer) == 1)
            {
                int level = editDistance(w, k) < minLength / 4 ? 3 : 2;
                return level * minLength;
            }

            // Level 1, promoted to 2 when the edit distance is below a tenth of
            // the shorter length (integer division, see above).
            if (m1(shorter, longer) == 1)
            {
                int level = editDistance(w, k) < minLength / 10 ? 2 : 1;
                return level * minLength;
            }

            // Level 0: nothing in common.
            return 0;
        }

        /// <summary>
        /// Level-3 test: is <paramref name="x"/> a substring of <paramref name="y"/>?
        /// </summary>
        /// <returns>1 when a non-empty <paramref name="x"/> occurs in
        /// <paramref name="y"/> (ordinal comparison), otherwise 0.</returns>
        public static int m3(string x, string y)
        {
            if (string.IsNullOrEmpty(x) || y == null)
            {
                return 0;
            }

            return y.IndexOf(x, StringComparison.Ordinal) >= 0 ? 1 : 0;
        }

        /// <summary>
        /// Level-2 test: do <paramref name="x"/> and <paramref name="y"/> share a
        /// contiguous run longer than half of <paramref name="x"/>?
        /// </summary>
        /// <returns>1 when the longest common run exceeds x.Length / 2, otherwise 0.</returns>
        public static int m2(string x, string y)
        {
            if (string.IsNullOrEmpty(x) || string.IsNullOrEmpty(y))
            {
                return 0;
            }

            int m = x.Length;
            int n = y.Length;
            int threshold = m / 2;

            // A run starting after index m/2 has at most m - (m/2 + 1) characters,
            // which can never exceed the threshold, so later starts are skipped.
            for (int xStart = 0; xStart <= threshold; xStart++)
            {
                for (int yStart = 0; yStart < n; yStart++)
                {
                    // Measure each run from zero; carrying the count over from
                    // earlier runs would overstate the longest match.
                    int run = 0;
                    while (xStart + run < m
                        && yStart + run < n
                        && x[xStart + run] == y[yStart + run])
                    {
                        run++;
                    }

                    if (run > threshold)
                    {
                        return 1;
                    }
                }
            }

            return 0;
        }

        /// <summary>
        /// Level-1 test: do <paramref name="x"/> and <paramref name="y"/> share at
        /// least one character?
        /// </summary>
        /// <returns>1 when any character of <paramref name="x"/> occurs in
        /// <paramref name="y"/>, otherwise 0.</returns>
        public static int m1(string x, string y)
        {
            if (x == null || y == null)
            {
                return 0;
            }

            foreach (char c in x)
            {
                if (y.IndexOf(c) >= 0)
                {
                    return 1;
                }
            }

            return 0;
        }

        /// <summary>
        /// Computes the edit distance between <paramref name="x"/> and
        /// <paramref name="y"/> with unit cost for insertion, deletion and
        /// substitution, using a full dynamic-programming table.
        /// </summary>
        /// <returns>The minimum number of single-character edits turning
        /// <paramref name="x"/> into <paramref name="y"/>.</returns>
        public static int editDistance(string x, string y)
        {
            const int DeleteCost = 1;
            const int InsertCost = 1;
            const int SubstituteCost = 1;

            int rows = x.Length + 1;
            int cols = y.Length + 1;

            // distance[i, j] = edit distance between the first i characters of x
            // and the first j characters of y.
            int[,] distance = new int[rows, cols];

            for (int i = 1; i < rows; i++)
            {
                distance[i, 0] = distance[i - 1, 0] + DeleteCost;
            }

            for (int j = 1; j < cols; j++)
            {
                distance[0, j] = distance[0, j - 1] + InsertCost;
            }

            for (int i = 1; i < rows; i++)
            {
                for (int j = 1; j < cols; j++)
                {
                    int viaDelete = distance[i - 1, j] + DeleteCost;
                    int viaInsert = distance[i, j - 1] + InsertCost;
                    int viaSubstitute = distance[i - 1, j - 1]
                        + (x[i - 1] == y[j - 1] ? 0 : SubstituteCost);

                    distance[i, j] = Math.Min(viaSubstitute, Math.Min(viaDelete, viaInsert));
                }
            }

            return distance[x.Length, y.Length];
        }

        /// <summary>
        /// Converts half-width ASCII characters in <paramref name="s"/> to their
        /// full-width forms. Printable ASCII '!'..'~' maps to U+FF01..U+FF5E and
        /// the space maps to the ideographic space U+3000; every other character
        /// is left unchanged.
        /// </summary>
        static string half_to_whole(string s)
        {
            // Offset between a printable ASCII character and its full-width form.
            const int FullWidthOffset = 0xFEE0;

            char[] chars = s.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                char c = chars[i];
                if (c == ' ')
                {
                    chars[i] = '\u3000';
                }
                else if (c >= '!' && c <= '~')
                {
                    chars[i] = (char)(c + FullWidthOffset);
                }
            }

            return new string(chars);
        }

        /// <summary>One corpus line together with its score and original position.</summary>
        public class eachline
        {
            /// <summary>Text of the line.</summary>
            public string line;

            /// <summary>Score returned by <see cref="match"/> for this line.</summary>
            public int matchpoint;

            /// <summary>Zero-based index among the non-empty lines read.</summary>
            public int num;
        }

        /// <summary>
        /// Test driver: reads a keyword from standard input, scores every
        /// non-empty line of "test.txt" against it, and prints the keyword's
        /// segmentation followed by the best-scoring lines (at most
        /// <see cref="TopResultCount"/>) with their scores and segmentations.
        /// </summary>
        static void Main(string[] args)
        {
            // ReadLine returns null at end of input; treat that as an empty keyword.
            string keyword = Console.ReadLine() ?? "";

            List<eachline> lineList = new List<eachline>();
            using (StreamReader reader = new StreamReader("test.txt", System.Text.Encoding.Default))
            {
                string sLine;
                while ((sLine = reader.ReadLine()) != null)
                {
                    if (sLine.Length == 0)
                    {
                        continue;
                    }

                    eachline entry = new eachline();
                    entry.line = sLine;
                    entry.matchpoint = match(sLine, keyword);
                    entry.num = lineList.Count;
                    lineList.Add(entry);
                }
            }

            // Highest score first. OrderByDescending is stable, so lines with equal
            // scores keep their file order.
            List<eachline> ranked = lineList.OrderByDescending(e => e.matchpoint).ToList();

            foreach (string token in ChineseWordSegmentation.word_segmentation(keyword))
            {
                Console.WriteLine(token);
            }

            // The corpus may contain fewer lines than the display limit.
            int shown = Math.Min(TopResultCount, ranked.Count);
            for (int i = 0; i < shown; i++)
            {
                Console.WriteLine(ranked[i].line);
                Console.WriteLine(ranked[i].matchpoint);

                foreach (string token in ChineseWordSegmentation.word_segmentation(ranked[i].line))
                {
                    Console.Write(token + ' ');
                }

                Console.WriteLine(' ');
            }
        }

        /// <summary>
        /// Splits <paramref name="text"/> into tokens with the project's Chinese
        /// word segmenter, falling back to splitting on spaces when the segmenter
        /// returns no tokens.
        /// </summary>
        private static List<string> Tokenize(string text)
        {
            List<string> tokens = ChineseWordSegmentation.word_segmentation(text);
            if (tokens.Count == 0)
            {
                tokens.AddRange(text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            return tokens;
        }

        /// <summary>
        /// Returns the highest <see cref="wkmatch"/> score of <paramref name="w"/>
        /// against any of <paramref name="keywordTokens"/>, or 0 when the list is empty.
        /// </summary>
        private static int BestScore(string w, List<string> keywordTokens)
        {
            int best = 0;
            foreach (string k in keywordTokens)
            {
                int degree = wkmatch(w, k);
                if (degree > best)
                {
                    best = degree;
                }
            }

            return best;
        }
    }
}

但目前算法的精确度在语义层面仍有提升空间。

之后需要优化多关键词语义分析计算匹配程度，并测试修改过的代码，修正错误。

转载于:https://www.cnblogs.com/DOOM-scse/archive/2012/11/06/2757983.html

Daily Report 2012.11.06 刘宇翔相关推荐

Daily Report 2012.11.9 刘宇翔
今天的任务并不多. 和李忠把展示页面的功能填满. 把匹配函数又进行了一些修改和修正. 之后的工作剩下的就很少了. 之后可能会帮助做其他没做好的模块. 转载于:https://www.cnblogs.c ...
Daily Report 2012.11.2 刘宇翔
在修改代码的同时我阅读并参考了一下其他搜索系统的算法. 由于目前我们的任务进度略快,给我们留下的修改时间还有一些,我准备对match算法进行进一步的修改. 参考了网上一篇关于百度搜索的文章. 发现了一 ...
Daily Report 2012/11/09 陈伯雄(step 9)
今天的工作是完成把之前建立的倒排索引和数据库搜索匹配模块嵌入到主体工程中,等待运行和测试. 但是,现在的数据库搜索方法精度还不够,天真地把每个关键词一视同仁地处理了,这样的后果可能会造成用户搜索体验不佳 ...
Daily Scrum 2012/11/08
@TeamSH-IT 今天经完成了之前在数据定义存在一些问题.sui老师经过小组之间的交流,完成了对数据定义的最终版.周末将进行初步的整合和测试. 详细的完成情况: 组员今天任务明天任务 Hu R ...
Daily Scrum: 2012/11/27
成员角色今天工作明天计划王安然 PM, Dev 讨论决定了AI的策略问题,编写了一部分ProfileManager类(304) 继续进行ProfileManager类的编写(304) 黄杨 P ...
[转]结队编程——软件测试报告 10061178 刘宇翔 10061148 彭笑东
软件测试报告结队成员: 10061178 刘宇翔 10061148 彭笑东测试软件:微软必应词典客户端版本:1.6.2.0 (BETA) 环境:win7 旗舰版,x32,Intel(R) Cor ...
超级计算机TOP500、green500、graph500最新排名（2012.11）
重磅好消息:我国超级计算机重回世界之巅!! top500:http://www.top500.org/list/2013/06/ top500:http://www.top500.org/lists/ ...
2012中国互联网公司、全球互联网公司最新市值排名（2012.12.06）
在最新的中国互联网公司排名中,阿里巴巴超越百度成为中国第二大互联网公司,奇虎360成为中国第五大互联网公司,其市值为28.50亿美元,相当于搜狐(14.50亿美元)两倍,人人网(11.80亿美元)的三 ...
还款每个月90.85元，到 2012年10月，2012 11月 2256元，共 5799.25元
还款每个月90.85元, 到 2012年10月,2012 11月 2256元,共 5799.25元

Daily Report 2012.11.06 刘宇翔

Daily Report 2012.11.06 刘宇翔相关推荐

最新文章

热门文章