C# 新浪微博滚动抓取 WeiboGrab

先说明一下：本来想对网页加载的程序段进行规范，但再次编写时发现，还是不能很好地掌握网页加载的具体规则，导致获取页面的代码仍然很繁杂。其他部分改得差不多了。另外，当微博中的字符含有 { } 等时，会提示字符串格式错误，这一点也需要改进，但目前还没有改。程序还需要一个挂起线程的功能，用于保留现场，让程序可以从中断处继续爬取，而不是从头再爬。
各种类

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using HtmlAgilityPack;

namespace WeiBoGrab
{
    class WeiBoGrabClass
    {
    }

    /// <summary>
    /// Blocks the caller (while pumping the WinForms message loop with
    /// <c>Application.DoEvents()</c>) until a given Weibo page has finished
    /// rendering inside a <see cref="WebBrowser"/> control.
    /// Every method returns a fixed status message once its wait conditions hold.
    /// None of the waits has a timeout unless stated otherwise.
    /// </summary>
    public class GetPage
    {
        // Placeholder texts shown by the page while a feed section is still loading.
        private const string LoadingText = "正在加载，请稍候...";
        private const string LoadingInProgressText = "正在加载中，请稍候...";

        // Id of the element that hosts the feed on personal / media profiles.
        private const string FeedContainerId = "pl_content_hisFeed";

        // The trailing-node scan looks at fewer than this many nodes
        // (the original author assumed there are never more than 10 junk nodes).
        private const int MaxTrailingNodes = 10;

        // Upper bound on message-loop pumps while waiting for the feed to grow.
        // The original author noted the value is debatable: too small a value
        // may make some posts fail to load.
        private const int MaxGrowthPolls = 100;

        /// <summary>Pumps messages until the browser reports <c>ReadyState == Complete</c>.</summary>
        private static void WaitUntilReady(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }
        }

        /// <summary>Returns true when the element's text is one of the two loading placeholders.</summary>
        private static bool IsLoadingPlaceholder(HtmlElement element)
        {
            string text = element.InnerText;
            return text == LoadingInProgressText || text == LoadingText;
        }

        /// <summary>
        /// Re-queries the document for the feed container and returns its child at
        /// <paramref name="index"/>. The lookup is repeated on every call on purpose:
        /// the page rewrites this subtree while it loads.
        /// </summary>
        private static HtmlElement FeedContainerChild(WebBrowser browser, int index)
        {
            return browser.Document.GetElementById(FeedContainerId).Children[index];
        }

        /// <summary>
        /// Waits for the login page: first for the document to be complete, then for
        /// the <c>pl_login_form</c> element to exist and to have markup.
        /// </summary>
        /// <param name="browser">Browser control that was navigated to the login page.</param>
        /// <returns>A fixed status message.</returns>
        public string GetLoginPage(WebBrowser browser)
        {
            WaitUntilReady(browser);

            // The form element is looked up again on every pass. A missing element is
            // treated as "not loaded yet" instead of being dereferenced (the original
            // threw NullReferenceException in that case).
            HtmlElement loginForm = browser.Document.GetElementById("pl_login_form");
            while (loginForm == null || loginForm.InnerHtml == null)
            {
                Application.DoEvents();
                loginForm = browser.Document.GetElementById("pl_login_form");
            }
            return "加载登陆页面完成。";
        }

        /// <summary>
        /// Waits for the signed-in home page, identified by its document title, and
        /// then until the <c>pl_rightmod_myinfo</c> element (when present) has at
        /// least two children.
        /// </summary>
        /// <param name="browser">Browser control showing the page.</param>
        /// <returns>A fixed status message.</returns>
        public string GetMainPage(WebBrowser browser)
        {
            while (browser.DocumentTitle != "我的首页 新浪微博-随时随地分享身边的新鲜事儿")
            {
                Application.DoEvents();
            }

            // Make sure the part of the page we need has been filled in.
            while (browser.Document.GetElementById("pl_rightmod_myinfo") != null
                && browser.Document.GetElementById("pl_rightmod_myinfo").Children.Count < 2)
            {
                Application.DoEvents();
            }
            return "加载个人主页完成。";
        }

        /// <summary>
        /// Waits for the first page of the "people I follow" list: matching title,
        /// <c>pl_relation_myfollow</c> present, and at least three children in it.
        /// </summary>
        /// <param name="browser">Browser control showing the page.</param>
        /// <returns>A fixed status message.</returns>
        public string GetFollowsPage(WebBrowser browser)
        {
            while (browser.DocumentTitle != "我关注的人 新浪微博-随时随地分享身边的新鲜事儿")
            {
                Application.DoEvents();
            }
            while (browser.Document.GetElementById("pl_relation_myfollow") == null)
            {
                Application.DoEvents();
            }
            while (browser.Document.GetElementById("pl_relation_myfollow").Children.Count < 3)
            {
                Application.DoEvents();
            }
            return "关注对象页面第一页加载完成。";
        }

        /// <summary>
        /// Waits for the next page of the follow list after the caller clicked
        /// "next page" and cleared the old list node.
        /// </summary>
        /// <param name="browser">Browser control showing the page.</param>
        /// <returns>A fixed status message.</returns>
        public string GetFollowsNextPage(WebBrowser browser)
        {
            // The follow list lives at Children[2]. The original author observed
            // that the debugger shows 3 children there although waiting for "< 4"
            // is what works, and that "<!-- -->" comment nodes are sometimes counted
            // as elements - so the threshold was found empirically.
            while (browser.Document.GetElementById("pl_relation_myfollow").Children.Count < 3
                || browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children.Count < 4)
            {
                Application.DoEvents();
            }

            // Once the condition holds, the list content is the newly generated page.
            return "关注对象下一页加载完成。";
        }

        /// <summary>
        /// Waits for a followed account's profile feed to load completely, scrolling
        /// the loading marker into view repeatedly so that the page keeps appending
        /// posts.
        /// </summary>
        /// <param name="browser">Browser control that was navigated to the profile.</param>
        /// <returns>
        /// A fixed status message; a different one for pages that contain an
        /// <c>epfeedlist</c> element (magazine / news accounts per the original
        /// comments) than for the other kinds.
        /// </returns>
        public string GetFollowMainPage(WebBrowser browser)
        {
            WaitUntilReady(browser);

            // Magazine / news accounts: only wait for the feed list element.
            if (browser.Document.GetElementById("epfeedlist") != null)
            {
                while (browser.Document.GetElementById("feed_list") == null)
                {
                    Application.DoEvents();
                }
                return "关注对象主页第一页加载完成。";
            }

            // Personal / media accounts.
            // NOTE(review): both waits below dereference the looked-up element without
            // a null check, exactly as the original did; left unchanged because the
            // right fallback depends on page behavior that cannot be verified here.
            if (browser.Document.GetElementById(FeedContainerId) == null)
            {
                while (browser.Document.GetElementById("profileFeed").InnerHtml == null)
                {
                    Application.DoEvents();
                }
            }
            while (browser.Document.GetElementById(FeedContainerId).InnerHtml == null)
            {
                Application.DoEvents();
            }

            // Snapshot of the container's children, taken before the waits below and
            // used afterwards to locate the feed list (same order as the original).
            HtmlElementCollection containerChildren =
                browser.Document.GetElementById(FeedContainerId).Children;
            int feedPosition = 0;

            // Some pages show a loading placeholder as the first child.
            while (IsLoadingPlaceholder(FeedContainerChild(browser, feedPosition)))
            {
                Application.DoEvents();
            }

            // The container may still be only partially populated.
            while (browser.Document.GetElementById(FeedContainerId).Children.Count < 2)
            {
                Application.DoEvents();
            }

            // Locate the child whose node-type is "feed_list".
            foreach (HtmlElement child in containerChildren)
            {
                if (child.GetAttribute("node-type") == "feed_list")
                {
                    break;
                }
                feedPosition++;
            }

            // On pages after the first, the feed list itself starts with a placeholder.
            while (IsLoadingPlaceholder(FeedContainerChild(browser, feedPosition).Children[0]))
            {
                Application.DoEvents();
            }

            // Index of the last child of the feed list; the loading marker is expected
            // at or near the end.
            int lastIndex = FeedContainerChild(browser, feedPosition).Children.Count - 1;

            // Scan backwards from the end for the loading marker.
            HtmlElement load = FeedContainerChild(browser, feedPosition).Children[lastIndex];
            int i;
            for (i = 1; (i < MaxTrailingNodes) && (lastIndex - i >= 0); i++)
            {
                if (IsLoadingPlaceholder(load))
                {
                    break;
                }
                load = FeedContainerChild(browser, feedPosition).Children[lastIndex - i];
            }

            bool loading = true;
            while (loading)
            {
                loading = false;
                load.ScrollIntoView(false);

                // NOTE(review): the re-fetch uses "lastIndex - i", which after an early
                // break above is one position before the element that matched. Kept
                // as in the original; confirm against the live page before changing.
                while (IsLoadingPlaceholder(load))
                {
                    load.ScrollIntoView(false);
                    Application.DoEvents();
                    load = FeedContainerChild(browser, feedPosition).Children[lastIndex - i];
                }

                // Wait for more posts to be appended, bounded so that a feed that has
                // stopped growing cannot stall the loop forever.
                int polls = 0;
                while ((FeedContainerChild(browser, feedPosition).Children.Count < lastIndex + 2)
                    && (polls < MaxGrowthPolls))
                {
                    polls++;
                    Application.DoEvents();
                }

                // Recompute the end of the list and look for a fresh loading marker.
                lastIndex = FeedContainerChild(browser, feedPosition).Children.Count - 1;
                load = FeedContainerChild(browser, feedPosition).Children[lastIndex];
                for (int j = 1; (j < MaxTrailingNodes) && (lastIndex - j >= 0); j++)
                {
                    if (IsLoadingPlaceholder(load))
                    {
                        break;
                    }
                    load = FeedContainerChild(browser, feedPosition).Children[lastIndex - j];
                }

                if (load != null && IsLoadingPlaceholder(load))
                {
                    loading = true;
                    load.ScrollIntoView(false);
                }
            }
            return "加载关注对象主页第一页面完成。";
        }

        /// <summary>
        /// Waits for a subsequent page of a followed account's feed by pumping once,
        /// waiting for the document to complete and then reusing
        /// <see cref="GetFollowMainPage"/>.
        /// </summary>
        /// <param name="browser">Browser control showing the page.</param>
        /// <returns>A fixed status message.</returns>
        public string GetFollowMainNextPage(WebBrowser browser)
        {
            Application.DoEvents();
            WaitUntilReady(browser);
            GetFollowMainPage(browser);

            // Extra pump when the page has no "epfeedlist" element.
            if (browser.Document.GetElementById("epfeedlist") == null)
            {
                Application.DoEvents();
            }
            return "加载关注对象后续页面完成。";
        }
    }

    /// <summary>Fills in and submits the login form of the Weibo login page.</summary>
    public class LoginSubmit
    {
        private readonly string username;
        private readonly string password;

        /// <summary>Stores the credentials used by <see cref="LoginClick"/>.</summary>
        /// <param name="username">Account name typed into the first input.</param>
        /// <param name="password">Password typed into the second input.</param>
        public LoginSubmit(string username, string password)
        {
            this.username = username;
            this.password = password;
        }

        /// <summary>
        /// Writes the credentials into the first two INPUT elements of
        /// <c>pl_login_form</c> and clicks the span whose node-type is
        /// <c>submitStates</c>.
        /// </summary>
        /// <param name="browser">Browser control showing the loaded login page.</param>
        public void LoginClick(WebBrowser browser)
        {
            HtmlElement loginForm = browser.Document.GetElementById("pl_login_form");
            HtmlElementCollection inputs = loginForm.GetElementsByTagName("INPUT");

            // Click each box first so it gets focus (the original author did this to
            // clear the box), then set its value.
            HtmlElement userBox = inputs[0];
            userBox.InvokeMember("click");
            userBox.SetAttribute("value", username);

            HtmlElement passwordBox = inputs[1];
            passwordBox.InvokeMember("click");
            passwordBox.SetAttribute("value", password);

            // Find the login button and click it.
            foreach (HtmlElement span in loginForm.GetElementsByTagName("span"))
            {
                if (span.GetAttribute("node-type") == "submitStates")
                {
                    span.InvokeMember("click");
                    break;
                }
            }
        }
    }

    /// <summary>Operations on the signed-in user's list of followed accounts.</summary>
    public class Follow
    {
        /// <summary>
        /// Clicks the follow counter on the home page (the first <c>strong</c>
        /// inside <c>pl_rightmod_myinfo</c>) and waits for the follow list to load.
        /// Does nothing when that element is not the follow counter or shows "0".
        /// </summary>
        /// <param name="browser">Browser control showing the loaded home page.</param>
        public void GetFollows(WebBrowser browser)
        {
            HtmlElement myInfo = browser.Document.GetElementById("pl_rightmod_myinfo");
            HtmlElement followCounter = myInfo.GetElementsByTagName("strong")[0];

            if (followCounter.GetAttribute("node-type") != "follow")
            {
                return;
            }

            // Nothing to open when the user follows nobody.
            if (followCounter.InnerText == "0")
            {
                return;
            }

            followCounter.InvokeMember("click");
            new GetPage().GetFollowsPage(browser);
        }

        /// <summary>
        /// Walks every page of the follow list and writes one line per followed
        /// account in the form <c>No.{n}|{name}|{url}</c>.
        /// </summary>
        /// <param name="browser">Browser control showing the first follow-list page.</param>
        /// <param name="sw">
        /// Destination writer. It is closed by this method, also when an exception
        /// interrupts the walk, so the caller can reopen the file afterwards.
        /// </param>
        public void GetFollowsUrl(WebBrowser browser, StreamWriter sw)
        {
            try
            {
                bool hasNext = true;
                int urlCount = 0;
                while (hasNext)
                {
                    // Assume this is the last page until a "next page" link is found.
                    hasNext = false;

                    HtmlElement followList = browser.Document.GetElementById("pl_relation_myfollow");
                    foreach (HtmlElement div in followList.GetElementsByTagName("div"))
                    {
                        if (div.GetAttribute("action-type") != "ignore_list")
                        {
                            continue;
                        }

                        // The div's inner markup is parsed with HtmlAgilityPack; its
                        // root node carries the href and its first child the alt text.
                        HtmlNode link = HtmlNode.CreateNode(div.InnerHtml);
                        string url = link.Attributes["href"].Value;
                        string followName = link.FirstChild.Attributes["alt"].Value;
                        sw.WriteLine("No.{0}|{1}|{2}", ++urlCount, followName, url);
                    }

                    foreach (HtmlElement span in followList.GetElementsByTagName("span"))
                    {
                        if (span.InnerText != "下一页")
                        {
                            continue;
                        }

                        hasNext = true;
                        span.InvokeMember("click");

                        // Clear the old list node so that GetFollowsNextPage can tell
                        // when the new content has arrived.
                        browser.Document.GetElementById("pl_relation_myfollow")
                            .Children[2].Children[2].OuterHtml = null;

                        new GetPage().GetFollowsNextPage(browser);
                        break;
                    }
                }
            }
            finally
            {
                sw.Close();
            }
        }
    }

    /// <summary>Dumps all posts of one followed account into a text file.</summary>
    public class WeiBo
    {
        // Output folder; one "<name>.txt" file is written per account.
        private const string OutputDirectory = "D:\\weibo\\";

        private readonly string FollowName;
        private readonly string FollowUrl;

        /// <summary>Stores the account to scrape.</summary>
        /// <param name="FollowName">Display name; used as the output file name.</param>
        /// <param name="FollowUrl">Absolute URL of the account's profile page.</param>
        public WeiBo(string FollowName, string FollowUrl)
        {
            this.FollowName = FollowName;
            this.FollowUrl = FollowUrl;
        }

        /// <summary>
        /// Navigates to the account, detects which of the three known page layouts it
        /// uses, and writes every post as <c>第{n}条|{text}</c> to
        /// <c>D:\weibo\{FollowName}.txt</c>, following "next page" links until none
        /// is left. Pages with an unknown layout produce an empty file.
        /// </summary>
        /// <param name="browser">Browser control used for navigation.</param>
        public void GetWeiBo(WebBrowser browser)
        {
            // File.CreateText fails when the folder is missing; create it up front.
            Directory.CreateDirectory(OutputDirectory);

            // "using" guarantees the file is closed on every exit path, including the
            // early return for unknown layouts and exceptions thrown while scraping.
            using (StreamWriter sw = File.CreateText(OutputDirectory + FollowName + ".txt"))
            {
                int weiBoCount = 0;

                browser.Navigate(new Uri(FollowUrl));
                GetPage pageLoader = new GetPage();
                pageLoader.GetFollowMainPage(browser);

                // Layout code: "N" = not recognized, "M" = media, "P" = personal,
                // "J" = magazine / news (labels taken from the original comments).
                string kind = "N";
                HtmlElement epFeedList = browser.Document.GetElementById("epfeedlist");
                HtmlElement hisFeed = browser.Document.GetElementById("pl_content_hisFeed");
                if (hisFeed != null)
                {
                    // Per the original notes: on media pages Children[1] contains a
                    // "dl" element, on personal pages it is an HTML comment node
                    // without children.
                    kind = hisFeed.Children[1].Children.Count != 0 ? "M" : "P";
                }
                if (epFeedList != null)
                {
                    kind = "J";
                }

                bool hasNext = true;
                while (hasNext)
                {
                    switch (kind)
                    {
                        case "P":
                            hasNext = WriteProfilePage(browser, sw, "div", pageLoader, ref weiBoCount);
                            break;
                        case "J":
                            hasNext = WriteJournalPage(browser, sw, pageLoader, ref weiBoCount);
                            break;
                        case "M":
                            hasNext = WriteProfilePage(browser, sw, "p", pageLoader, ref weiBoCount);
                            break;
                        default:
                            // Layout not handled yet.
                            return;
                    }
                }
            }
        }

        /// <summary>
        /// Writes the posts of the current personal / media page and, when a
        /// "next page" span exists, clicks it and waits for the next page.
        /// </summary>
        /// <param name="contentTag">Tag name of the post elements ("div" or "p").</param>
        /// <returns>True when a next page was opened.</returns>
        private static bool WriteProfilePage(WebBrowser browser, StreamWriter sw, string contentTag,
            GetPage pageLoader, ref int weiBoCount)
        {
            HtmlElementCollection candidates =
                browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName(contentTag);
            foreach (HtmlElement candidate in candidates)
            {
                if (candidate.GetAttribute("node-type") == "feed_list_content")
                {
                    // The post text is passed as an argument, never spliced into the
                    // format string, so "{" and "}" in a post cannot break formatting.
                    sw.WriteLine("第{0}条|{1}", ++weiBoCount, candidate.InnerText);
                }
            }

            HtmlElementCollection spans =
                browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("span");
            foreach (HtmlElement span in spans)
            {
                if (span.InnerText == "下一页")
                {
                    span.InvokeMember("click");
                    pageLoader.GetFollowMainNextPage(browser);
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Writes the posts of the current magazine / news page and, when the last
        /// <c>em</c> of the element following <c>feed_list</c> reads "next page",
        /// clicks it and waits for the next page.
        /// </summary>
        /// <returns>True when a next page was opened.</returns>
        private static bool WriteJournalPage(WebBrowser browser, StreamWriter sw,
            GetPage pageLoader, ref int weiBoCount)
        {
            HtmlElement feedList = browser.Document.GetElementById("feed_list");
            int itemCount = feedList.Children.Count;
            for (int i = 0; i < itemCount; i++)
            {
                string text = feedList.Children[i].GetElementsByTagName("p")[0].InnerText;
                sw.WriteLine("第{0}条|{1}", ++weiBoCount, text);
            }

            // The pager is the sibling right after the list; without a pager or
            // without any <em> in it there is no next page.
            HtmlElement pager = feedList.NextSibling;
            if (pager == null)
            {
                return false;
            }
            HtmlElementCollection ems = pager.GetElementsByTagName("em");
            int end = ems.Count;
            if (end == 0 || ems[end - 1].InnerText != "下一页")
            {
                return false;
            }

            ems[end - 1].InvokeMember("click");

            // Drop the old list so the loader can detect the freshly rendered one.
            browser.Document.GetElementById("feed_list").OuterHtml = null;
            pageLoader.GetFollowMainNextPage(browser);
            return true;
        }
    }
}

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace WeiBoGrab
{
    /// <summary>
    /// Main window: takes a user name and password, logs in through the embedded
    /// browser, records the followed accounts and then scrapes each of them.
    /// </summary>
    public partial class Form1 : Form
    {
        // File that receives one "No.{n}|{name}|{url}" line per followed account.
        private const string FollowListFile = "FollowUrl.txt";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the whole scrape: login, collection of the follow list into
        /// <c>FollowUrl.txt</c>, then one <see cref="WeiBo.GetWeiBo"/> run per entry.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // The page waits pump the message loop (Application.DoEvents), so a second
            // click could re-enter this handler mid-run. Disable the control that
            // raised the event for the duration of the run.
            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                string username = textBox1.Text;
                string password = textBox2.Text;
                string url = "http://weibo.com/";

                GetPage getpage = new GetPage();
                WebBrowser browser = webBrowser1;

                // "using" releases the file even when a step below throws; disposing
                // again after GetFollowsUrl has already closed the writer is harmless.
                using (StreamWriter sw = File.CreateText(FollowListFile))
                {
                    browser.Navigate(new Uri(url));

                    // Load the login page.
                    textBox3.Text += getpage.GetLoginPage(browser);

                    // Log in.
                    LoginSubmit loginsubmit = new LoginSubmit(username, password);
                    loginsubmit.LoginClick(browser);

                    // Load the personal home page.
                    textBox3.Text += getpage.GetMainPage(browser);

                    // Collect the followed accounts.
                    Follow follow = new Follow();
                    follow.GetFollows(browser);
                    follow.GetFollowsUrl(browser, sw);
                }

                using (StreamReader sr = File.OpenText(FollowListFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Expected shape: No.{n}|{name}|{url}. Skip anything shorter
                        // instead of failing on the index below.
                        string[] fields = line.Split('|');
                        if (fields.Length < 3)
                        {
                            continue;
                        }

                        WeiBo feed = new WeiBo(fields[1], fields[2]);
                        feed.GetWeiBo(browser);
                    }
                }
            }
            finally
            {
                if (trigger != null && !trigger.IsDisposed)
                {
                    trigger.Enabled = true;
                }
            }
        }
    }
}

转载于:https://www.cnblogs.com/idealing/archive/2013/05/25/3098409.html

C# 新浪微博滚动抓取 WeiboGrab相关推荐

分享：Python使用cookielib和urllib2模拟登陆新浪微博并抓取数据
Python使用cookielib和urllib2模拟登陆新浪微博并抓取数据 http://my.oschina.net/leopardsaga/blog/94774
java模拟新浪微博_Java模拟新浪微博登陆抓取数据
前言: 兄弟们来了来了,最近有人在问如何模拟新浪微博登陆抓取数据,我听后默默地抽了一口老烟,暗暗的对自己说,老汉是时候该你出场了,所以今天有时间就整理整理,浅谈一二. 首先: 要想登陆新浪微博需要预登 ...
谷歌插件webscraper使用问疑难杂症（插件页面跑到右边+爬取内容乱序+自定义选择多个列表+滚动抓取社交发帖+select鼠标无法选中元素+无法识别表格+插件支持范围+爬取数据与原始顺序不一致+）
博客目录谷歌插件webscraper使用问疑难杂症解决 1.插件打开后跑到了右边 2.爬取内容乱序 3.mac的支持这个插件吗 4.除了谷歌外,火狐.IE.360等浏览器支持吗 5.自定义选择多个列 ...
php抓取新浪微博数据抓取,php获取新浪微博数据API实例
php获取新浪微博数据API实例发布于 2014-11-29 12:36:06 | 118 次阅读 | 评论: 0 | 来源: 网友投递 PHP开源脚本语言PHP(外文名: Hypertext Pr ...
pythonurllib微博登录怎么删_Python使用cookielib和urllib2模拟登陆新浪微博并抓取数据...
我们都知道HTTP是无连接的状态协议,但是客户端和服务器端需要保持一些相互信息,比如cookie,有了cookie,服务器才能知道刚才是这个用户登录了网站,才会给予客户端访问一些页面的权限. 用浏览器 ...
python爬取微博内容_Python 爬虫如何机器登录新浪微博并抓取内容？
最近为了做事件分析写了一些微博的爬虫,两个大V总共爬了超70W的微博数据. 官方提供的api有爬取数量上限2000,想爬取的数据大了就不够用了... 果断撸起袖子自己动手!先简单说一下我的思路: 一. ...
微博登录记录pythonurllib_Python使用cookielib和urllib2模拟登陆新浪微博并抓取数据...
我们都知道HTTP是无连接的状态协议,但是客户端和服务器端需要保持一些相互信息,比如cookie,有了cookie,服务器才能知道刚才是这个用户登录了网站,才会给予客户端访问一些页面的权限. 用浏览器 ...
pythonurllib登录微博账号_Python使用cookielib和urllib2模拟登录新浪微博并抓取数据...
这个方法抓下来的网页,得不到相册图片,新浪要js动态生成图片,解决方法能够是:1. 本地用webkit跑抓下来的js:2.抓移动版微博的静态相册. 但都暂时未实现,欢迎回帖好方法,如下是转文. --- ...
php抓取新浪微博数据抓取,php利用curl抓取新浪微博内容示例
很多人都喜欢在网站上DIY自己的微博,所以我也写了一个. 这里直接抓取了新浪微博工具中的微博秀地址. 代码如下: set_time_limit(0); $url="http://widget ...

C# 新浪微博滚动抓取 WeiboGrab

C# 新浪微博滚动抓取 WeiboGrab相关推荐

最新文章

热门文章