discuz论坛通用URL地址:

论坛的 search.php 地址 + ?mod=my&q=关键字(关键字需要先按 gb2312 编码再做 URL 编码)

例如: 'http://bbs.anzhi.com/search.php?mod=my&q=htc+%CA%D6%BB%FA';

请求上面的地址后,服务器会返回一次跳转;真实的 sId 值并不在页面内容里,而是存放在响应头(header)的 Location 字段中,所以请求时要关闭自动跳转(AllowAutoRedirect = false),再从响应头里读取 Location。

例如:location=http://search.bbs.hiapk.com/f/search?q=%E6%89%8B%E6%9C%BA&sId=8520930&ts=1355197250&mySign=5785b7cb&searchLevel=3&menu=1&rfh=1&qs=txt.tsort.a&orderField=posted&orderType=desc

           // Build the Discuz search entry URL for the task keyword, then hand it to
           // GetHteml, whose return value becomes the page content to parse.
           const string searchUrlTemplate = "http://bbs.hiapk.com/search.php?mod=my&q={0}";
           string url = string.Format(searchUrlTemplate, crawlerModel.Keyword);
           HTMLContent = GetHteml(url);
/// <summary>
/// Downloads the time-sorted search result page for a Discuz search entry URL.
/// The work is done in three GET requests that share one cookie container:
/// 1. request <paramref name="url"/> with auto-redirect disabled and read the
///    <c>Location</c> response header (the real search URL);
/// 2. download that page and locate the "按时间排序" (sort by time) link;
/// 3. download the page that link points to and return its HTML.
/// </summary>
/// <param name="url">Search entry URL, e.g. <c>http://bbs.hiapk.com/search.php?mod=my&amp;q=keyword</c>.</param>
/// <returns>
/// The HTML of the time-sorted result page, or an empty string when any request
/// fails, no <c>Location</c> header is returned, or the sort link is not found.
/// </returns>
private string GetHteml(string url)
{
    CookieContainer cookies = new CookieContainer();
    string cookieHeader = string.Empty;

    try
    {
        // Step 1: the entry URL only answers with a redirect; its target carries the sId.
        string searchUrl = ReadRedirectLocation(url, cookies, ref cookieHeader);
        if (string.IsNullOrEmpty(searchUrl))
        {
            return "";
        }

        // Step 2: fetch the default result page and find the "sort by time" anchor.
        string resultHtml = DownloadHtml(searchUrl, cookies, ref cookieHeader);
        Match sortLink = Regex.Match(resultHtml, @"<a.+?>按时间排序</a>");
        if (!sortLink.Success)
        {
            return "";
        }

        // The href is taken from raw HTML, so "&amp;" has to be turned back into "&"
        // before the value can be used as a URL.
        // NOTE(review): the host below is hard-coded to search.bbs.anzhi.com although the
        // sample entry URL targets bbs.hiapk.com - confirm it matches the forum being crawled.
        string sortedUrl = "http://search.bbs.anzhi.com" + GetURL(sortLink.Value.Replace("&amp;", "&"));

        // Step 3: fetch the time-sorted result page.
        return DownloadHtml(sortedUrl, cookies, ref cookieHeader);
    }
    catch (Exception)
    {
        // Best effort: any network, URI or decoding failure yields an empty page.
        return "";
    }
}

/// <summary>
/// Creates a GET request that does not follow redirects and uses the shared cookies.
/// </summary>
private static HttpWebRequest CreateGetRequest(string url, CookieContainer cookies, string cookieHeader)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "GET";
    request.KeepAlive = true;
    request.Headers.Add("Cookie:" + cookieHeader);
    request.CookieContainer = cookies;
    request.AllowAutoRedirect = false;
    return request;
}

/// <summary>
/// Sends a GET request and returns the value of the <c>Location</c> response header
/// (null when the header is absent). The response body is not read.
/// </summary>
private static string ReadRedirectLocation(string url, CookieContainer cookies, ref string cookieHeader)
{
    HttpWebRequest request = CreateGetRequest(url, cookies, cookieHeader);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieHeader = cookies.GetCookieHeader(request.RequestUri);
        return response.Headers["location"];
    }
}

/// <summary>
/// Sends a GET request and returns the response body decoded as UTF-8.
/// The response and reader are disposed even when reading throws.
/// </summary>
private static string DownloadHtml(string url, CookieContainer cookies, ref string cookieHeader)
{
    HttpWebRequest request = CreateGetRequest(url, cookies, cookieHeader);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        cookieHeader = cookies.GetCookieHeader(request.RequestUri);
        return reader.ReadToEnd();
    }
}
       /// <summary>
       /// Extracts the quoted value of the first attribute whose name ends in "f"
       /// (in practice <c>href</c>) from an HTML fragment.
       /// </summary>
       /// <param name="urlHtml">HTML fragment, typically a single anchor tag.</param>
       /// <returns>The text between the quotes, or an empty string when nothing matches.</returns>
       private string GetURL(string urlHtml)
       {
           Match hrefTail = new Regex(@"f="".*?""").Match(urlHtml);
           if (!hrefTail.Success)
           {
               return "";
           }

           // The match looks like f="value": skip the 3 leading characters (f=")
           // and drop the closing quote.
           string attribute = hrefTail.Value;
           return attribute.Substring(3, attribute.Length - 4);
       }

完整的小例子(以"投诉易"论坛的帖子搜索为例):

using System;
using System.Collections.Generic;
using IWOMWebCrawlerDbLayer.DAL;
using IWOMWebCrawlerDbLayer.Model;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
using IWOMWebCrawlerDbLayer.Common;
using HtmlAgilityPack;
using System.Net;
using System.IO;

namespace IWOMWebCrawlerApp.Crawler
{
    /// <summary>
    /// Search crawler for forum posts on www.tousue.com (a Discuz forum).
    /// The forum's search entry URL only redirects to a separate search host, so the
    /// result page is fetched by <see cref="GetHteml"/>, which follows that redirect
    /// by hand and then switches to the time-sorted result list.
    /// </summary>
    public class tousue_teizi : AbstractSearchEngine
    {
        /// <summary>Search entry URL; {0} is the keyword.</summary>
        private const string SearchUrlFormat = "http://www.tousue.com/search.php?mod=my&q={0}";

        /// <summary>Value appended as the <c>page</c> query parameter of the sorted result URL.</summary>
        private int pageId = 0;

        public tousue_teizi()
        {
            this.SearchID = 1600;
            this.SearchName = "投诉易--帖子";
        }

        /// <summary>
        /// Builds the crawl parameters (keyword, page size, position) from a task.
        /// </summary>
        protected override void initCrawlerModel(IwomTask taskItem)
        {
            crawlerModel = new CrawlerModel();
            // Keyword
            crawlerModel.Keyword = CommonFunction.AssembledKeyword(taskItem.KeyWord, IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, false);
            // Page size.
            // NOTE(review): the condition reads PageSize of the model created two lines above,
            // not taskItem.GetItems - presumably the cap of 10 was meant to apply to the task
            // value. Left unchanged; confirm against CrawlerModel's default PageSize.
            crawlerModel.PageSize = crawlerModel.PageSize > 10 ? 10 : taskItem.GetItems;
            crawlerModel.Postion = taskItem.Task_Postion;
        }

        /// <summary>
        /// Builds the URL to crawl for a page index. Only page index 0 yields a URL;
        /// every other index returns an empty string.
        /// </summary>
        protected override string createUrl(int pageIndex)
        {
            if (pageIndex != 0)
            {
                return "";
            }

            return string.Format(SearchUrlFormat, crawlerModel.Keyword);
        }

        /// <summary>
        /// Pause between two page fetches (500 ms).
        /// </summary>
        protected override void PageSleep()
        {
            Thread.Sleep(500);
        }

        /// <summary>
        /// Encoding of the crawled pages (UTF-8).
        /// </summary>
        protected override Encoding getPageEncoding()
        {
            return Encoding.GetEncoding("utf-8");
        }

        /// <summary>
        /// Content-based ban check. This implementation does not inspect the content
        /// and always returns true.
        /// NOTE(review): confirm in AbstractSearchEngine whether true means "banned" or "ok".
        /// </summary>
        protected override bool checkContentIsForbat(string HTMLContent)
        {
            return true;
        }

        /// <summary>
        /// Content-based last-page check. This implementation always returns false.
        /// </summary>
        protected override bool checkContentIsLastPage(string HTMLContent)
        {
            return false;
        }

        /// <summary>
        /// Extracts the result list from the search page. The incoming
        /// <paramref name="HTMLContent"/> is discarded: the page is fetched again through
        /// <see cref="GetHteml"/> so that the redirect to the search host is followed
        /// and the time-sorted list is used.
        /// </summary>
        protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
        {
            string url = string.Format(SearchUrlFormat, crawlerModel.Keyword);
            HTMLContent = GetHteml(url, pageId);

            string mainId = "result-items";
            XPathModel model = new XPathModel();
            model.listXPath = "./ul/li";
            model.titleXPath = "./h3/a";
            model.urlXPath = "./h3/a";
            model.timeXPath = "./p[3]";

            return CommonFunction.GetListByXPath(HTMLContent, task_ID, mainId, model, TitleMethod, CommonFunction.UrlMethod, TimeMethod);
        }

        /// <summary>
        /// Extracts the quoted value of the first attribute whose name ends in "f"
        /// (in practice <c>href</c>) from an HTML fragment; empty string when none is found.
        /// </summary>
        private string GetURL(string urlHtml)
        {
            Match hrefTail = Regex.Match(urlHtml, @"f="".*?""");
            if (!hrefTail.Success)
            {
                return "";
            }

            // The match looks like f="value": skip f=" and drop the closing quote.
            return hrefTail.Value.Substring(3, hrefTail.Value.Length - 4);
        }

        /// <summary>
        /// Downloads one page of the time-sorted search results in three GET requests
        /// that share a cookie container:
        /// 1. request <paramref name="url"/> without following redirects and read the
        ///    <c>Location</c> response header;
        /// 2. download that page and locate the "按时间排序" (sort by time) link;
        /// 3. download the linked page with <c>&amp;page=</c><paramref name="pageId"/> appended.
        /// </summary>
        /// <param name="url">Search entry URL built from <see cref="SearchUrlFormat"/>.</param>
        /// <param name="pageId">Value of the <c>page</c> query parameter.</param>
        /// <returns>
        /// The result page HTML, or an empty string when a request fails, no
        /// <c>Location</c> header is returned, or the sort link is not found.
        /// </returns>
        private string GetHteml(string url, int pageId)
        {
            CookieContainer cookies = new CookieContainer();
            string cookieHeader = string.Empty;

            try
            {
                // Step 1: the entry URL only answers with a redirect to the search host.
                string searchUrl = ReadRedirectLocation(url, cookies, ref cookieHeader);
                if (string.IsNullOrEmpty(searchUrl))
                {
                    return "";
                }

                // Step 2: fetch the default result page and find the "sort by time" anchor.
                string resultHtml = DownloadHtml(searchUrl, cookies, ref cookieHeader);
                Match sortLink = Regex.Match(resultHtml, @"<a.+?>按时间排序</a>");
                if (!sortLink.Success)
                {
                    return "";
                }

                // The href comes from raw HTML, so "&amp;" must be turned back into "&"
                // before the value is used as a URL.
                string sortedUrl = "http://search.discuz.qq.com"
                    + GetURL(sortLink.Value.Replace("&amp;", "&"))
                    + "&page=" + pageId.ToString();

                // Step 3: fetch the requested page of the time-sorted results.
                return DownloadHtml(sortedUrl, cookies, ref cookieHeader);
            }
            catch (Exception)
            {
                // Best effort: any network, URI or decoding failure yields an empty page.
                return "";
            }
        }

        /// <summary>
        /// Creates a GET request that does not follow redirects and uses the shared cookies.
        /// </summary>
        private static HttpWebRequest CreateGetRequest(string url, CookieContainer cookies, string cookieHeader)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.KeepAlive = true;
            request.Headers.Add("Cookie:" + cookieHeader);
            request.CookieContainer = cookies;
            request.AllowAutoRedirect = false;
            return request;
        }

        /// <summary>
        /// Sends a GET request and returns the <c>Location</c> response header
        /// (null when absent). The response body is not read.
        /// </summary>
        private static string ReadRedirectLocation(string url, CookieContainer cookies, ref string cookieHeader)
        {
            HttpWebRequest request = CreateGetRequest(url, cookies, cookieHeader);
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                cookieHeader = cookies.GetCookieHeader(request.RequestUri);
                return response.Headers["location"];
            }
        }

        /// <summary>
        /// Sends a GET request and returns the response body decoded as UTF-8.
        /// The response and reader are disposed even when reading throws.
        /// </summary>
        private static string DownloadHtml(string url, CookieContainer cookies, ref string cookieHeader)
        {
            HttpWebRequest request = CreateGetRequest(url, cookies, cookieHeader);
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                cookieHeader = cookies.GetCookieHeader(request.RequestUri);
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Title post-processing: returns the text that follows "&lt;digit&gt;回答" when that
        /// marker is present, otherwise the content unchanged.
        /// </summary>
        private string TitleMethod(string content)
        {
            Match afterAnswerCount = Regex.Match(content, @"(?<=[\d]回答).+");
            return afterAnswerCount.Success ? afterAnswerCount.Value : content;
        }

        /// <summary>
        /// Time post-processing: first tries <c>CommonFunction.GetTimeByChinese</c>; when that
        /// yields <see cref="DateTime.MinValue"/>, falls back to the first yyyy-M-d date found
        /// in the text. Returns <see cref="DateTime.MinValue"/> when nothing can be parsed.
        /// </summary>
        private DateTime TimeMethod(string content)
        {
            DateTime time = CommonFunction.GetTimeByChinese(content);
            if (time != DateTime.MinValue)
            {
                return time;
            }

            Match dateMatch = Regex.Match(content, @"[\d]{4}-[\d]{1,2}-[\d]{1,2}");
            string dateText = dateMatch.Success ? dateMatch.Value : "";

            // TryParse leaves DateTime.MinValue in 'time' when the text is not a date.
            DateTime.TryParse(dateText, out time);
            return time;
        }

        /// <summary>
        /// Configures the test expectations and returns the search URL for the
        /// test keyword "海尔". Also sets <see cref="pageId"/> to 2.
        /// </summary>
        protected override string initTestUrl()
        {
            this.HaseCreateTime = true;
            this.HasePageSize = 10;
            this.HaseSiteName = true;
            this.HaseSummary = true;
            pageId = 2;
            HaseAuthor = true;

            crawlerModel.Keyword = CommonFunction.AssembledKeyword("海尔", IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, false);
            return string.Format(SearchUrlFormat, crawlerModel.Keyword);
        }
    }
}

discuz论坛的抓取相关推荐

  1. 收集的材料 关于数据库和抓取器方面的

    同事 的博客 里面有很多 不错的东西 值得 我这个水平不高的人学习 顺便给他涨个人气 抓取器: C#.NET GB2312编码转化为中文 c#验证码下载 正则表达式替换---lamda表达式(UniC ...

  2. python爬取论坛付费内容_Python爬虫抓取论坛关键字过程解析

    前言: 之前学习了用python爬虫的基本知识,现在计划用爬虫去做一些实际的数据统计功能.由于前段时间演员的诞生带火了几个年轻的实力派演员,想用爬虫程序搜索某论坛中对于某些演员的讨论热度,并按照日期统 ...

  3. disallow: /api.php,dz论坛如何禁止搜索引擎抓取任何内容?

    这其实不只是针对于discuz这个程序建的网站,针对所有的网站都有效.网站的根目录有一个robots.txt文件.这个就相当于一个协议.它告诉搜索引擎,你可以抓取我网站里的哪些内容.所以,想要禁止搜索 ...

  4. 爬虫 - 抓取52论坛帖子列表

    1. 前言 这两周稍微得了点空闲,又对爬虫有相当兴趣,PythonPycharm都是现成的,说干就干. 从需求出发,起初是想做个爬图的程序,下点动漫美图什么的,非常实用.网站和图片URL都抓好了,结果 ...

  5. Discuz 论坛模块全部帖子和评论爬取

    Discuz 论坛模块全部帖子和评论爬取 Discuz 是一款由PHP编写的开源论坛 Discuz 官方论坛: https://www.discuz.net/forum.php 要爬取的页面地址: D ...

  6. python爬取论坛付费内容_Python进阶量化交易专栏场外篇20-爬虫抓取股票论坛帖子...

    欢迎大家订阅<教你用 Python 进阶量化交易>专栏!为了能够提供给大家更轻松的学习过程,笔者在专栏内容之外已陆续推出一些手记来辅助同学们学习本专栏内容,目前推出的扩展篇链接如下: 为了 ...

  7. python 爬取财经新闻股票_Python进阶量化交易专栏场外篇20-爬虫抓取股票论坛帖子...

    欢迎大家订阅<教你用 Python 进阶量化交易>专栏!为了能够提供给大家更轻松的学习过程,笔者在专栏内容之外已陆续推出一些手记来辅助同学们学习本专栏内容,目前推出的扩展篇链接如下: 为了 ...

  8. 爬虫实战,抓取论坛帖子内容

    本文主要内容:以最短的时间写一个最简单的爬虫,可以抓取论坛的帖子标题和帖子内容. 本文受众:没写过爬虫的萌新. 入门 0.准备工作 需要准备的东西: Python.scrapy.一个IDE或者随便什么 ...

  9. [python 爬虫]Python爬虫抓取虎扑论坛帖子图片

    自从可以实现抓取文字了,自然要尝试更多的类型,比如图片.我是一个有逛虎扑论坛习惯的人,经常会发现有些帖子的图片挺好看的想保存下来,但是如果人为保存的话,一个帖子至少都有二三十张,这将是一个庞大的工作量 ...

最新文章

  1. CSS布局之品字布局
  2. 关于MEET大会直播抽奖后续红包发放的说明
  3. [leveldb] 3.put/delete操作
  4. 零食嘴----美食领域的美丽说
  5. 有了数学基础,如何进阶AI?
  6. 【python】python程序的输入输出以及标识符详细解读
  7. PHP5.3下加速器ZendGuardLoader安装及故障处理
  8. Python文件输入输出
  9. 解决按键精灵助手无法连接Android手机的问题
  10. 2022芒果TV算法赛_用户下一个观看视频预测_baseline_CF召回YoutubeDNN
  11. php写的软件帮助手册源码使用帮助源码html模版源码,系统依附HDSYSCMS内容系统
  12. html图片十字形,CSS3 十字架
  13. ServU:无法访问servu服务器
  14. RocketMQ生产者组topic和消费组的关系
  15. rust大油田分解机_睡梦中,狂风起!大棚棉被刮翻了,卷帘机也连带吹坏了......
  16. 机器学习基础 - [第四章:正则化](3)线性回归的正则化
  17. 计算机专业考研视频汇总
  18. 工具栏QToolBar
  19. 百度apollo 汽车环境感知 自动驾驶 激光雷达slamtec a1m8-r5 三角测距 双目相机
  20. 举个栗子~Alteryx 技巧(4):教你设置 Alteryx Server 用户权限

热门文章

  1. 解决Word导出PDF显示有批注的问题
  2. 计算机应用月什么,计算机应用月考试卷
  3. js生成html转换成图片保存,js将html生成为图片,并保存在本地
  4. JavaScript - jQuery(一)
  5. MySQL02--高级(BTreeB+Tree、聚簇索引非聚簇索引、性能分析(Explain)、索引、sql优化)
  6. 从2012 飞到 2013 —— 梦想依旧
  7. matlab添加文件夹语音_基于MATLAB的语音处理
  8. php5.7 iis7.5,使用PHPManger给IIS7.5部署PHP7
  9. 计算机无法转换输入发,电脑无法切换输入法怎么办
  10. 一篇让小白彻底搞懂性能调优!