抓取网址中的信息(需要解码的)

/// <summary>
/// Builds the crawl parameters (keyword, page size, position) from a task.
/// </summary>
/// <param name="taskItem">Task whose keyword, requested item count and position are read.</param>
protected override void initCrawlerModel(IwomTask taskItem)
{
    crawlerModel = new CrawlerModel();
    // Search keyword, assembled by the shared helper.
    crawlerModel.Keyword = CommonFunction.AssembledKeyword(taskItem.KeyWord, IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, true);
    // Page size, capped at 20.
    crawlerModel.PageSize = taskItem.GetItems > 20 ? 20 : taskItem.GetItems;
    crawlerModel.Postion = taskItem.Task_Postion;
}

/// <summary>
/// Builds the search URL for the current keyword.
/// </summary>
/// <param name="pageIndex">Requested page index; currently not used by the URL.</param>
/// <returns>The search URL with <c>start=0</c> and <c>pagecount=20</c> fixed.</returns>
protected override string createUrl(int pageIndex)
{
    // TODO (from the original author): needs revising. The URL ignores pageIndex and
    // crawlerModel.PageSize, so every call requests the same first 20 results.
    // Previous endpoint:
    // "http://cbnsearch.yicai.com/cbnsearch.html?start=0&pagecount=20&documentType=1&datetype=1&contenttype=1&searchKeyWords=" + crawlerModel.Keyword
    string strUrl = "http://cbnsearch.yicai.com/searchresult.php?start=0&pagecount=20&documentType=1&datetype=1&searchKeyWords=" + crawlerModel.Keyword + "&contenttype=1";
    return strUrl;
}

/// <summary>
/// Pause applied after each fetched page (500 ms).
/// </summary>
protected override void PageSleep()
{
    Thread.Sleep(500);
}

/// <summary>
/// Encoding used to read the page (UTF-8).
/// </summary>
protected override Encoding getPageEncoding()
{
    return Encoding.UTF8;
}

/// <summary>
/// Intended to decide from the page content whether the crawler has been blocked.
/// </summary>
/// <param name="HTMLContent">Page content; not inspected.</param>
/// <returns>Always <c>true</c>.</returns>
protected override bool checkContentIsForbat(string HTMLContent)
{
    // NOTE(review): the meaning of "true" is defined by the base class, which is not
    // visible here - confirm that a constant true is the intended "not blocked" answer.
    return true;
}

/// <summary>
/// Intended to decide from the page content whether this is the last page.
/// </summary>
/// <param name="HTMLContent">Page content; not inspected.</param>
/// <returns>Always <c>false</c>.</returns>
protected override bool checkContentIsLastPage(string HTMLContent)
{
    return false;
}

/// <summary>
/// Extracts the article records from the search response.
/// </summary>
/// <param name="HTMLContent">Raw response text; it is passed through <c>NormalU2C</c> before matching.</param>
/// <param name="task_ID">Task id copied onto every result.</param>
/// <returns>
/// One <c>CrawlerResult</c> per record that has a url/title pair; empty when the input is
/// null or empty. Records without a url/title pair, or with a link too short to rebuild,
/// are logged as warnings and skipped instead of aborting the whole page.
/// </returns>
protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
{
    // Number of leading characters dropped from the decoded link before the year and the
    // remainder are appended to "http://www.yicai.com/news/".
    // NOTE(review): NormalU2C treats every backslash as the start of a \uXXXX escape, so a
    // JSON-escaped "\/" followed by four hex digits (such as a year) is decoded into a single
    // character. The rebuild below (drop 26 characters, re-insert the year) appears to
    // compensate for that - confirm before changing either side.
    const int articlePathStart = 26;

    List<CrawlerResult> arrayList = new List<CrawlerResult>();
    if (string.IsNullOrEmpty(HTMLContent))
    {
        return arrayList;
    }

    HTMLContent = NormalU2C(HTMLContent);

    Regex regex = new Regex(@"{\042id\042:[\s\S]+?}"); // one record
    Regex regexHref = new Regex(@"\042url\042:\042(?<href>.*?)\042,\042title\042:\042(?<name>.*?)\042,"); // link and title
    Regex regexContent = new Regex(@"\042content\042:\042(?<content>[\s\S]+?)\042,"); // summary
    Regex regexTime = new Regex(@"\042creationDate\042:\042(?<time>[\s\S]+?)\042,"); // creation time
    Regex regexAuthor = new Regex(@"\042author\042:\042(?<author>[\s\S]+?)\042,"); // author

    foreach (Match recordMatch in regex.Matches(HTMLContent))
    {
        string record = recordMatch.Value;
        if (record == "")
        {
            continue;
        }

        // The url/title pair is mandatory: both the title and the final URL come from it.
        Match hrefMatch = regexHref.Match(record);
        if (!hrefMatch.Success)
        {
            CommonFunction.logWirte(this.SearchName + "抓取匹配标题链接出错：源是" + record, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            continue;
        }

        CrawlerResult item = new CrawlerResult();
        item.Task_ID = task_ID;

        // Title
        item.Title = CommonFunction.DeleteHTMLElement(hrefMatch.Groups["name"].Value);

        // Summary
        Match contentMatch = regexContent.Match(record);
        if (contentMatch.Success)
        {
            item.Summary = CommonFunction.DeleteHTMLElement(contentMatch.Groups["content"].Value);
        }

        // Author
        Match authorMatch = regexAuthor.Match(record);
        if (authorMatch.Success)
        {
            item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Groups["author"].Value);
        }

        // Media name
        item.SiteName = "一财网";

        // Creation time: parsed only when the matched text contains a '-'; when the record
        // has no creationDate field at all, today's date is used.
        Match timeMatch = regexTime.Match(record);
        if (timeMatch.Success)
        {
            if (timeMatch.Value.IndexOf('-') > 0)
            {
                DateTime parsedTime;
                if (DateTime.TryParse(timeMatch.Groups["time"].Value, out parsedTime))
                {
                    item.CreateTime = parsedTime;
                }
                else
                {
                    CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错：源是" + record, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
                }
            }
        }
        else
        {
            item.CreateTime = DateTime.Parse(DateTime.Now.ToShortDateString());
        }

        // URL: strip the remaining backslashes, then rebuild the article address.
        string href = hrefMatch.Groups["href"].Value.Replace(@"\", "");
        if (href.Length < articlePathStart)
        {
            CommonFunction.logWirte(this.SearchName + "抓取匹配链接长度不足：源是" + record, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            continue;
        }

        // Format the year explicitly: the default DateTime string starts with the year only
        // in some cultures, so taking its first four characters is not reliable.
        string year = string.Format(System.Globalization.CultureInfo.InvariantCulture, "{0:yyyy}", item.CreateTime);
        item.Url = "http://www.yicai.com/news/" + year + href.Substring(articlePathStart);

        item.FilterType = FilterType.FilterNo;
        arrayList.Add(item);
    }

    return arrayList;
}

/// <summary>
/// Configures the capability flags for a test run and returns the test URL.
/// </summary>
/// <returns>A fixed search URL with a URL-encoded sample keyword.</returns>
protected override string initTestUrl()
{
    this.HaseLastPostTime = true;
    this.HasePageSize = 20;
    HaseAuthor = true;
    HaseReplyCount = true;
    HaseVisitCount = true;
    return "http://cbnsearch.yicai.com/searchresult.php?start=0&pagecount=20&documentType=1&datetype=1&searchKeyWords=%E4%BA%9A%E9%A9%AC%E9%80%8A&contenttype=1";
}

#region Decoding

/// <summary>
/// Decodes backslash escapes in the response text.
/// </summary>
/// <remarks>
/// Every backslash is handled the same way: the backslash and the character after it are
/// skipped, and the next four characters are read as hex digits. If they form a valid code
/// the decoded character is emitted; otherwise a '/' followed by those characters is
/// emitted unchanged. When fewer than four characters remain, a '/' followed by whatever
/// was read is emitted.
/// </remarks>
/// <param name="input">Text to decode; must not be null.</param>
/// <returns>The decoded text.</returns>
private string NormalU2C(string input)
{
    char[] chArray = input.ToCharArray();
    // StringBuilder avoids re-copying the whole result on every appended character.
    StringBuilder decoded = new StringBuilder(chArray.Length);

    for (int i = 0; i < chArray.Length; i++)
    {
        char ch = chArray[i];
        if (ch != '\\')
        {
            decoded.Append(ch);
            continue;
        }

        // Skip the backslash and the character that follows it.
        i += 2;

        char[] codeChars = new char[4];
        int index = 0;
        while ((index < 4) && (i < chArray.Length))
        {
            codeChars[index] = chArray[i];
            index++;
            i++;
        }

        if (index == 4)
        {
            try
            {
                decoded.Append(this.UnicodeCode2Str(codeChars));
            }
            catch (FormatException)
            {
                // Not four hex digits: keep the characters, prefixed with '/'.
                decoded.Append('/');
                decoded.Append(codeChars, 0, index);
            }

            // Step back one so the loop increment lands on the first unread character.
            i--;
        }
        else
        {
            // Input ended inside the escape.
            decoded.Append('/');
            decoded.Append(codeChars, 0, index);
        }
    }

    return decoded.ToString();
}

/// <summary>
/// Converts four hex digits into the string for that UTF-16 code unit.
/// </summary>
/// <param name="u4">At least four characters; the first four are read as hex digits (case-insensitive).</param>
/// <returns>The text obtained by decoding the two resulting bytes as big-endian UTF-16.</returns>
/// <exception cref="ArgumentException"><paramref name="u4"/> has fewer than four characters.</exception>
/// <exception cref="FormatException">One of the first four characters is not a hex digit.</exception>
private string UnicodeCode2Str(char[] u4)
{
    if (u4.Length < 4)
    {
        throw new ArgumentException("It's not a unicode code array");
    }

    const string hexDigits = "0123456789ABCDEF";
    int code = 0;
    for (int k = 0; k < 4; k++)
    {
        int digit = hexDigits.IndexOf(char.ToUpperInvariant(u4[k]));
        if (digit == -1)
        {
            throw new FormatException("It's not a unicode code array");
        }

        code = (code << 4) | digit;
    }

    // NOTE(review): each escape is decoded on its own, so surrogate pairs are presumably
    // not reassembled into one character - confirm if non-BMP text matters.
    byte[] bytes = new byte[] { (byte)(code >> 8), (byte)(code & 0xff) };
    return Encoding.BigEndianUnicode.GetString(bytes);
}

#endregion

抓取网址中的信息(需要解码的)相关推荐

Google抓取网址软404，在测试实际版本的过程中，系统检测到该网址存在索引编制问题
Google抓取网址软404,在测试实际版本的过程中,系统检测到该网址存在索引编制问题原文地址:Google抓取网址软404,在测试实际版本的过程中,系统检测到该网址存在索引编制问题
php怎么获取网页内的视频教程,PHP怎样用正则抓取页面中的网址
前言链接也就是超级链接,是从一个元素(文字.图片.视频等)链接到另一个元素(文字.图片.视频等).网页中的链接一般有三种,一种是绝对URL超链接,也就是一个页面的完整路径:另一种是相对URL超链接, ...
华为抓取错误日志在哪里_抓取网址进行分析爬虫工具Screaming Frog SEO Spider for Mac...
Screaming Frog SEO Spider for Mac是一款专门用于抓取网址进行分析的网络爬虫开发工具,你可以通过这款软件来快速抓取网站中可能出现的损坏链接和服务器错误,或是识别网站中临时 ...
根据专利号到专利查询的网站上抓取想要的信息(上)
前述:前几天看到有人论要请别人写一个从网页上抓取某个专利号的收费信息的一个程序,说实话我自己知道那里面的原理是什么,但一直没有自己动手实现一下.根据自己的实际的工作需要一般是有一张Excel表,第一 ...
用python抓取智联招聘信息并存入excel
用python抓取智联招聘信息并存入excel tags:python 智联招聘导出excel 引言:前一阵子是人们俗称的金三银四,跳槽的小朋友很多,我觉得每个人都应该给自己做一下规划,根据自己的进步 ...
c语言编程网页数据提取,怎么用c语言抓取网页中的数据
当前位置:我的异常网» C语言 » 怎么用c语言抓取网页中的数据怎么用c语言抓取网页中的数据 www.myexceptions.net 网友分享于:2013-07-17 浏览:390次如何用c ...
Python抓取网页中的动态序列化数据
Python抓取网页中的动态序列化数据动态序列化数据经常应用于前后端分离的页面.或者通过VUE.JS等HTML页面环境,常规的爬虫抓取方法并不能满足数据采集的要求,因此需要其他的方式进行数据的采集. ...
php正则获取li,用正则表达式抓取网页中的ul 和 li标签中最终的值！
获取你要抓取的页面 const string URL = "http://www.hn3ddf.gov.cn/price/GetList.html?pageno=1"; ...
利用pandas库中的read_html方法快速抓取网页中常见的表格型数据
利用pandas库中的read_html方法快速抓取网页中常见的表格型数据本文转载自:https://www.makcyun.top/web_scraping_withpython2.html 需要 ...

抓取网址中的信息(需要解码的)

抓取网址中的信息(需要解码的)相关推荐

最新文章

热门文章