先来看我们的web.xml文件,如下

 <!DOCTYPE web-app PUBLIC
   "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
   "http://java.sun.com/dtd/web-app_2_3.dtd" >

 <web-app>
   <display-name>MySinaSpider</display-name>
   <!-- Registers the context listener whose contextInitialized() starts the crawler. -->
   <listener>
     <listener-class>main.java.sina.spider.StartSpiderLisenter</listener-class>
   </listener>
 </web-app>

这样的配置当启动tomcat的时候,就会运行爬虫,然后再看我们的StartSpiderLisenter类,如下

 1 package main.java.sina.spider;
 2
 3 import javax.servlet.ServletContextEvent;
 4 import javax.servlet.ServletContextListener;
 5 import main.java.sina.bean.info.LoginInfo;
 6 import main.java.sina.utils.Constant;
 7
 8 public class StartSpiderLisenter implements ServletContextListener{
 9
10     public void contextDestroyed(ServletContextEvent arg0) {
11
12     }
13
14     public void contextInitialized(ServletContextEvent arg0) {
15         Constant.personalHomePage = "http://weibo.com/zhaoyao2012/home"; //填写你自己的新浪微博个人主页
16         LoginInfo.username = "***"; //填写你的新浪微博用户名
18         LoginInfo.password = "***"; //填写你的新浪微博密码
19         Constant.enableProxy = false; //是否使用代理
20         Spider.start();
21     }
22
23 }

很明显,StartSpiderLisenter类实现了ServletContextListener这个接口,因此必须实现它的两个方法:contextInitialized和contextDestroyed。它们分别在Web应用上下文初始化和销毁的时候被容器调用。我们看到在contextInitialized这个初始化上下文的方法中调用了Spider.start()方法。那么我们来看看Spider这个类,如下:

  1 package main.java.sina.spider;
  2
  3 import java.io.IOException;
  4 import java.util.regex.Matcher;
  5 import java.util.regex.Pattern;
  7 import org.quartz.JobBuilder;
  8 import org.quartz.JobDetail;
  9 import org.quartz.Scheduler;
 10 import org.quartz.SchedulerException;
 11 import org.quartz.SchedulerFactory;
 12 import org.quartz.SimpleScheduleBuilder;
 13 import org.quartz.SimpleTrigger;
 14 import org.quartz.TriggerBuilder;
 15 import org.quartz.impl.StdSchedulerFactory;
 17 import main.java.sina.bean.info.LoginInfo;
 18 import main.java.sina.httpclient.LoginSina;
 19 import main.java.sina.httpclient.SpiderSina;
 20 import main.java.sina.job.KeywordSearchJob;
 21 import main.java.sina.utils.Constant;
 22 import main.java.sina.utils.HttpHelper;
 23 import main.java.test.SpiderTest;
 24
 25 public class Spider {
 26
 27     public static void main(String[] args) {
 28
 29         Constant.personalHomePage = "****";
 30         LoginInfo.username = "****";
 31         LoginInfo.password = "****";
 32         Constant.enableProxy = false;
 33         Constant.hourbefore = 0;  //这个参数用于设置时差
 34         start();
 35
 36     }
 37     public static void start() {
 38
 39         final SchedulerFactory factory = new StdSchedulerFactory();
 40         try {
 41             Scheduler scheduler = factory.getScheduler();
 42             JobDetail jobDetail = JobBuilder.newJob(KeywordSearchJob.class)
 43                     .withIdentity("keywordSearch", "weibo").build();
 44             SimpleTrigger trigger = TriggerBuilder.newTrigger()
 45                     .withIdentity("keywordSearch", "weibo")
 46                     .withSchedule(SimpleScheduleBuilder.repeatHourlyForever())
 47                     .build();
 48             scheduler.scheduleJob(jobDetail, trigger);
 49             scheduler.start();
 50         } catch (SchedulerException e) {
 51             e.printStackTrace();
 52         }
 53     }
 54
 55     public static SpiderSina createSpider() {
 56         LoginSina ls = new LoginSina(LoginInfo.username, LoginInfo.password);
 57         ls.dologinSina();
 58         ls.redirect();
 59         SpiderSina spider = new SpiderSina(ls);
 60
 61         return spider;
 62     }
 63
 64     public static void sendMidsofDays(SpiderSina spider,String keyword, String fromdate,
 65             String todate) {
 66
 67         try {
 68             String midsString = "";
 69             for (int i = 1; i <= 50; i++) {
 70                 String htmlContent = spider
 71                         .search(keyword, i, fromdate, todate);
 72                 if (htmlContent.contains("noresult_support")) {
 73                     break;
 74                 }
 75                 System.out.println(i);
 76                 Pattern pattern = Pattern.compile("<div mid=\"([0-9]*)\"");
 77
 78                 String start = "\"pid\":\"pl_weibo_direct\"";
 79                 try {
 80                     htmlContent = htmlContent.substring(htmlContent
 81                             .indexOf(start));
 82                 } catch (Exception e) {
 83                     htmlContent = htmlContent.substring(1);
 84                 }
 85                 htmlContent = htmlContent.replace("\\\"", "\"");
 86                 htmlContent = htmlContent.replace("\\/", "/");
 87                 Matcher matcher = pattern.matcher(htmlContent);
 88                 while (matcher.find()) {
 89                     System.out.println(matcher.group(1));
 90                     midsString += matcher.group(1) + ",";
 91                 }
 92                 if (i == 37) {
 93                     try {
 94                         Thread.sleep(1000 * 60 * 30);
 95                     } catch (InterruptedException e) {
 96                         e.printStackTrace();
 97                     }
 98                 }
 99             }
100             System.out.println(midsString);
101             HttpHelper.getLiveData(midsString, Constant.CommentUrl);
102         } catch (IOException e) {
103             e.printStackTrace();
104         }
105
106     }
107 }

我们在Spider.start()方法中,看到了作业KeywordSearchJob.class,那么我们来看看这个KeywordSearchJob类的实现,如下:

 1 package main.java.sina.job;
 2
 3 import org.quartz.Job;
 4 import org.quartz.JobExecutionContext;
 5 import org.quartz.JobExecutionException;
 6 import main.java.sina.httpclient.SpiderSina;
 7 import main.java.sina.spider.Spider;
 8 import main.java.sina.utils.Constant;
 9 import main.java.sina.utils.Utils;
10
11 public class KeywordSearchJob implements Job {
12
13     public void execute(JobExecutionContext arg0) throws JobExecutionException {
14
15         Constant.enableProxy = false; //我的爬虫中没有使用代理,故值设为false.
16         String keyword = "%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6";//被编码后的关键字
17         String datehour = Utils.getDateOfSpecifiedPreHour(Constant.hourbefore);//这个工具类实现了时差格式的转换
18         SpiderSina spider = Spider.createSpider();
19         spider.forwardToWeiboPage();
20         Spider.sendMidsofDays(spider,keyword,datehour,datehour);
21     }
22
23 }

接下来,我们看几个工具类的实现:首先来看下Utils.java这个类,如下:它实现了日期的格式的一些转换

  1 package main.java.sina.utils;
  2
  3 import java.io.BufferedReader;
  4 import java.io.BufferedWriter;
  5 import java.io.File;
  6 import java.io.FileInputStream;
  7 import java.io.FileNotFoundException;
  8 import java.io.FileOutputStream;
  9 import java.io.FileWriter;
 10 import java.io.IOException;
 11 import java.io.InputStream;
 12 import java.io.InputStreamReader;
 13 import java.io.StringReader;
 14 import java.io.UnsupportedEncodingException;
 15 import java.text.ParseException;
 16 import java.text.SimpleDateFormat;
 17 import java.util.Calendar;
 18 import java.util.Date;
 19 import java.util.Properties;
 20
 21 import org.htmlparser.Parser;
 22 import org.htmlparser.lexer.Lexer;
 23 import org.htmlparser.lexer.Page;
 24 import org.htmlparser.util.DefaultParserFeedback;
 25 //  I/O操作类
 26 public class Utils {
 27
 28     public static Date getDateFromString(String dtext,Date fileCreateDate) {
 29         Date date=null;
 30         int y,mm,se;
 31         Calendar c = Calendar.getInstance();
 32         c.setTime(fileCreateDate);
 33         y = c.get(Calendar.YEAR); //年
 34         //d = c.get(Calendar.DAY_OF_MONTH); //日
 35         mm = c.get(Calendar.MINUTE); //分
 36         se = c.get(Calendar.SECOND);//秒
 37         if(dtext.contains("秒前")){
 38             int end=0;
 39             for(int i=0;i<dtext.length();i++){
 40                 if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){
 41                     end++;
 42                 }else{
 43                     break;
 44                 }
 45             }
 46             dtext=dtext.substring(0,end);
 47             int second=Integer.parseInt(dtext);
 48             c.set(Calendar.SECOND, se-second);
 49             date=c.getTime();
 50         }
 51         else if(dtext.contains("分钟前")){
 52             int end=0;
 53             for(int i=0;i<dtext.length();i++){
 54                 if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){
 55                     end++;
 56                 }else{
 57                     break;
 58                 }
 59             }
 60             dtext=dtext.substring(0,end);
 61             int minute=Integer.parseInt(dtext);
 62             c.set(Calendar.MINUTE, mm-minute);
 63             date=c.getTime();
 64         }else if(dtext.contains("今天")){
 65              dtext=dtext.replace("今天 ", "").trim();
 66              String ss[]=dtext.split(":");
 67              if(ss!=null && ss.length==2){
 68                  c.set(Calendar.HOUR_OF_DAY, Integer.parseInt(ss[0]));
 69                  c.set(Calendar.MINUTE, Integer.parseInt(ss[1]));
 70                  date=c.getTime();
 71              }
 72         }else if(dtext.contains("月")){
 73             dtext=y+"年".concat(dtext);
 74             SimpleDateFormat sf=new SimpleDateFormat("yyyy年MM月dd日 HH:mm");
 75             try {
 76                 date=sf.parse(dtext);
 77             } catch (ParseException e) {
 78                 e.printStackTrace();
 79             }
 80         }else if(dtext.contains("-")){
 81             SimpleDateFormat sf=new SimpleDateFormat("yyyy-MM-dd HH:mm");
 82             try {
 83                 date=sf.parse(dtext);
 84             } catch (ParseException e) {
 85                 e.printStackTrace();
 86             }
 87         }
 88         return date;
 89     }
 90     public static void writeFileFromStream(String filename,InputStream in){
 91         if(filename==null || filename.trim().length()==0)
 92             return;
 93         File file=new File(filename);
 94         if(!file.exists()){
 95             try {
 96                 file.createNewFile();
 97             } catch (IOException e) {
 98                 e.printStackTrace();
 99             }
100         }
101         FileOutputStream fou=null;
102         try {
103             fou = new FileOutputStream(file);
104             byte []buffer=new byte[1024*4];
105             int len=-1;
106             while((len=in.read(buffer))!=-1){
107                 fou.write(buffer,0,len);
108             }
109         } catch (FileNotFoundException e) {
110             e.printStackTrace();
111         } catch (IOException e) {
112             e.printStackTrace();
113         }finally{
114             if(in!=null)
115                 try {
116                     in.close();
117                 } catch (IOException e) {
118                     e.printStackTrace();
119                 }
120             if(fou!=null)
121                 try {
122                     fou.close();
123                 } catch (IOException e) {
124                     e.printStackTrace();
125                 }
126         }
127     }
128     public static void writeFileFromString(String filename,String str){
129         if(filename==null || filename.trim().length()==0)
130             filename="tmp.txt";
131         File file=new File(filename);
132         if(!file.exists()){
133             try {
134                 file.createNewFile();
135             } catch (IOException e) {
136                 e.printStackTrace();
137             }
138         }
139         BufferedWriter writer=null;
140         BufferedReader reader=null;
141         try {
142             writer=new BufferedWriter(new FileWriter(file));
143             reader=new BufferedReader(new StringReader(str));
144             String tmp=null;
145             StringBuffer buffer=new StringBuffer();
146             while((tmp=reader.readLine())!=null)
147                 buffer.append(tmp+"\n");
148             writer.write(buffer.toString());
149
150         } catch (IOException e) {
151             e.printStackTrace();
152         }finally{
153             try {
154                 reader.close();
155                 writer.close();
156             } catch (IOException e) {
157                 e.printStackTrace();
158             }
159         }
160
161     }
162
163
164
165     public static String getStringFromStream(InputStream in) {
166         BufferedReader reader=null;
167         reader = new BufferedReader(new InputStreamReader(in));
168         StringBuffer buffer=new StringBuffer();
169         String str=null;
170         try{
171             while((str=reader.readLine())!=null){
172                 buffer.append(str+"\n");
173             }
174             reader.close();
175         }catch(Exception ex){
176             ex.printStackTrace();
177         }
178         try {
179             return new String(buffer.toString().getBytes(),"utf-8");
180         } catch (UnsupportedEncodingException e) {
181             e.printStackTrace();
182             return "error:"+e.getMessage();
183         }
184     }
185   //得到数据库的配置信息
186     public static Properties getDBconfig(){
187         Properties properties=new Properties();
188         InputStream in = null;
189         try {
190             in = new FileInputStream(new File("config/dbconfig.ini"));
191             properties.load(in);
192         } catch (FileNotFoundException e) {
193             e.printStackTrace();
194         } catch (IOException e) {
195             e.printStackTrace();
196         }finally{
197             if(in!=null)
198                 try {
199                     in.close();
200                 } catch (IOException e) {
201                     e.printStackTrace();
202                 }
203         }
204         return properties;
205     }
206
207     public static Parser createParser(String inputHTML) {
208         Lexer mLexer = new Lexer(new Page(inputHTML));
209         Parser parser = new Parser(mLexer, new DefaultParserFeedback(
210                 DefaultParserFeedback.QUIET));
211         return parser;
212     }
213
214     public static String getDateOfSpecifiedPreHour(int hourNum){
215         SimpleDateFormat sdFormat = new SimpleDateFormat("yyyy-MM-dd-HH");
216         Date date = new Date();
217         System.out.println("date -" +date + " " + hourNum);
218         Calendar calendar = Calendar.getInstance();
219         calendar.setTime(date);
220         calendar.add(Calendar.HOUR_OF_DAY, -1 * hourNum);
221         System.out.println("date2 -" +sdFormat.format(calendar.getTime()));
222         return sdFormat.format(calendar.getTime());
223     }
224 }

再来看一下ThreadPool.java这个类,如下:这是一个线程工具类,定义了线程的一些动作

 1 package main.java.sina.utils;
 2
 3 import java.util.List;
 4 import java.util.concurrent.ExecutorService;
 5 import java.util.concurrent.Executors;
 6
 /**
  * Thread-pool helper: submits a caller-supplied list of threads to a
  * fixed-size executor.
  */
 public class ThreadPool {
     private ExecutorService service;
     private List<Thread> threadList;

     /**
      * @param limite     number of worker threads in the fixed pool
      * @param threadList tasks to submit when {@link #execute()} is called; may be {@code null}
      */
     public ThreadPool(int limite, List<Thread> threadList) {
         this.threadList = threadList;
         this.service = Executors.newFixedThreadPool(limite);
     }

     /** Submits every task in the list to the executor; does nothing for a null or empty list. */
     public void execute() {
         if (threadList != null && !threadList.isEmpty()) {
             for (Thread task : threadList) {
                 service.execute(task);
             }
         }
     }

     /** @return the executor's {@code isTerminated()} state */
     public boolean isTerminated() {
         return service.isTerminated();
     }

     /** Calls {@code shutdown()} on the executor. */
     public void shutDown() {
         service.shutdown();
     }
 }

然后再看一下Constant.java这个常量类,如下:常量类把系统中用到的一些常量集中写在这里,以后项目维护需要修改的时候,只需改动这一处,方便维护:

package main.java.sina.utils;/*** @ClassName: Constant * */
/**
 * Shared, mutable settings of the crawler, kept as public static fields.
 */
public class Constant {

    /** When {@code true}, HTTP clients are configured to go through a proxy. */
    public static boolean enableProxy = false;

    /** Endpoint for live data. */
    public static String liveCommentUrl = "http://localhost:8080/social-hub-connector/loadingLiveData";

    /** Endpoint that receives the collected post ids. */
    public static String CommentUrl = "http://localhost:8080/social-hub-connector/loadingData";

    /** URL of the account's personal home page (placeholder value). */
    public static String personalHomePage = "******";

    /** Weibo user name (placeholder value). */
    public static String weiboUsername = "*********";

    /** Weibo password (placeholder value). */
    public static String weiboPassword = "*********";

    /** Number of hours the search hour is shifted back. */
    public static int hourbefore = 0;
}

再来看一下Base64Encoder.java类,它对一些字段进行了编码的类,如下:

 1 package main.java.sina.utils;
 2
 /**
  * Minimal Base64 encoder: standard alphabet ({@code A-Z a-z 0-9 + /}) with
  * {@code =} padding and no line breaks.
  */
 public class Base64Encoder {
     /** The 64 output characters, indexed by 6-bit value. */
     private static final char[] ALPHABET =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();

     /** Mask selecting the low six bits of a value. */
     private static final int SIX_BITS = 0x3F;

     public Base64Encoder() {
     }

     /**
      * Encodes the given bytes as Base64 text.
      *
      * @param from bytes to encode; must not be {@code null}
      * @return the encoded text, padded with {@code =} to a multiple of four characters
      */
     public static  String encode(byte[] from) {
         final int total = from.length;
         StringBuffer out = new StringBuffer((int) (total * 1.34) + 3);

         // Full groups: three input bytes become four output characters.
         int pos = 0;
         for (; total - pos >= 3; pos += 3) {
             int group = ((from[pos] & 0xFF) << 16)
                     | ((from[pos + 1] & 0xFF) << 8)
                     | (from[pos + 2] & 0xFF);
             out.append(ALPHABET[(group >>> 18) & SIX_BITS]);
             out.append(ALPHABET[(group >>> 12) & SIX_BITS]);
             out.append(ALPHABET[(group >>> 6) & SIX_BITS]);
             out.append(ALPHABET[group & SIX_BITS]);
         }

         // Trailing one or two bytes: missing bits are zero, missing characters are '='.
         int remaining = total - pos;
         if (remaining == 1) {
             int group = (from[pos] & 0xFF) << 16;
             out.append(ALPHABET[(group >>> 18) & SIX_BITS]);
             out.append(ALPHABET[(group >>> 12) & SIX_BITS]);
             out.append("==");
         } else if (remaining == 2) {
             int group = ((from[pos] & 0xFF) << 16) | ((from[pos + 1] & 0xFF) << 8);
             out.append(ALPHABET[(group >>> 18) & SIX_BITS]);
             out.append(ALPHABET[(group >>> 12) & SIX_BITS]);
             out.append(ALPHABET[(group >>> 6) & SIX_BITS]);
             out.append("=");
         }
         return out.toString();
     }
 }

下面的EncodeSuAndSp类针对新浪的一些特殊加密规则封装了相应的方法,在拼接最终的URL的时候会用到,例如根据servertime和nonce两个参数对密码进行加密,生成一串加密后的字符串:

 1 package main.java.sina.utils;
 2 import java.io.File;
 3 import java.io.FileReader;
 4
 5 import javax.script.Invocable;
 6 import javax.script.ScriptEngine;
 7 import javax.script.ScriptEngineManager;
 8
 /**
  * Produces the encoded user name and the hashed password by calling
  * functions defined in {@code js/encrypt.js} through the javax.script API.
  */
 public class EncodeSuAndSp {
     static ScriptEngineManager mgr = new ScriptEngineManager();
     static ScriptEngine engine = mgr.getEngineByExtension("js");
     static Invocable inv = (Invocable) engine;

     /** Script file evaluated before every call, relative to the working directory. */
     private static final String SCRIPT_PATH = "js/encrypt.js";

     /**
      * Hashes the password with the script's {@code hex_sha1} function:
      * {@code hex_sha1(hex_sha1(hex_sha1(password)) + servertime + nonce)}.
      *
      * @param password   plain-text password
      * @param servertime server time value appended before the last hash
      * @param nonce      nonce value appended before the last hash
      * @return the final hash; if a step fails the error is printed and the
      *         result of the last successful step (initially {@code ""}) is returned
      */
     public static String getEncryptedP(String password,String servertime,String nonce){
         String hashed="";
         try {
             loadScript();
             hashed = String.valueOf(inv.invokeFunction("hex_sha1",password));
             hashed = String.valueOf(inv.invokeFunction("hex_sha1",hashed));
             hashed = String.valueOf(inv.invokeFunction("hex_sha1",hashed+servertime+nonce));
         } catch (Exception e) {
             e.printStackTrace();
         }
         return hashed;
     }


     /**
      * Encodes the user name with the script's {@code encode} function and
      * prints the result to stdout.
      *
      * @param username plain-text user name
      * @return the encoded user name, or {@code ""} when the script cannot be run
      */
     public static String getEncodedUsername(String username){
         String encoded="";
         try {
             loadScript();
             encoded = String.valueOf(inv.invokeFunction("encode",username));
             System.out.println(encoded);
         } catch (Exception e) {
             e.printStackTrace();
         }
         return encoded;
     }

     /** Evaluates the script file in the shared engine and always closes the file. */
     private static void loadScript() throws Exception {
         if (engine == null) {
             // getEngineByExtension returns null when no JavaScript engine is installed.
             throw new IllegalStateException("No script engine available for extension \"js\"");
         }
         FileReader script = new FileReader(new File(SCRIPT_PATH));
         try {
             engine.eval(script);
         } finally {
             try {
                 script.close();
             } catch (java.io.IOException ignored) {
                 // The script has already been evaluated; a failed close changes nothing.
             }
         }
     }
 }

package main.java.sina.utils;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
/**
 * URL encoding/decoding helpers, plus two decoders for text that contains
 * literal Unicode escapes (a backslash, the letter u and four hex digits).
 * Despite their names, the decoders perform no GB2312 conversion; they only
 * turn the escapes into characters and strip some leftover markup.
 */
public class EncodeUtils {

    /**
     * URL-encodes {@code str} with the named charset.
     *
     * @throws RuntimeException wrapping {@code UnsupportedEncodingException}
     *         when the charset name is unknown
     */
    public static final String encodeURL(String str,String enc) {
        try {
            return URLEncoder.encode(str, enc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * URL-decodes {@code str} with the named charset.
     *
     * @throws RuntimeException wrapping {@code UnsupportedEncodingException}
     *         when the charset name is unknown
     */
    public static final String decodeURL(String str,String enc) {
        try {
            return URLDecoder.decode(str, enc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Decodes Unicode escapes, then removes the literal two-character
     * sequences backslash-r, backslash-n and backslash-t, removes
     * {@code &nbsp;} and {@code &gt}, and replaces {@code [} and {@code ]}
     * with a double quote.
     *
     * <p>A complete escape whose four characters are not valid hex is
     * replaced by the NUL character; an escape cut off by the end of the
     * text is kept literally.
     *
     * @param str text to decode; {@code null} yields {@code ""}
     * @return the cleaned-up text, never {@code null}
     */
    public static String unicdoeToGB2312(String str) {
        if (str == null) {
            return "";
        }
        return decodeUnicodeEscapes(str, true)
                .replace("\\r", "")
                .replace("\\n", "")
                .replace("\\t", "")
                .replace("&nbsp;", "")
                .replace("&gt", "")
                .replace("[", "\"")
                .replace("]", "\"");
    }

    /**
     * Decodes Unicode escapes, then removes the literal two-character
     * sequences backslash-r, backslash-t and backslash-n, and removes
     * {@code &nbsp;} and {@code &gt}.
     *
     * <p>Malformed or truncated escapes are kept literally.
     *
     * @param str text to decode; {@code null} yields {@code ""}
     * @return the cleaned-up text, never {@code null}
     */
    public static String unicodeTogb2312(String str) {
        if (str == null) {
            return "";
        }
        return decodeUnicodeEscapes(str, false)
                .replace("\\r", "")
                .replace("\\t", "")
                .replace("&nbsp;", "")
                .replace("&gt", "")
                .replace("\\n", "");
    }

    /**
     * Replaces every complete Unicode escape in {@code text} with the
     * character it denotes, in a single left-to-right pass.
     *
     * @param nulOnBadHex when {@code true}, a complete escape with invalid
     *                    hex digits becomes the NUL character; when
     *                    {@code false} it is copied through unchanged
     */
    private static String decodeUnicodeEscapes(String text, boolean nulOnBadHex) {
        final int length = text.length();
        StringBuilder out = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            // An escape needs six characters: the two-character prefix plus four hex digits.
            if (i + 6 <= length && text.startsWith("\\u", i)) {
                try {
                    out.append((char) Integer.parseInt(text.substring(i + 2, i + 6), 16));
                    i += 6;
                    continue;
                } catch (NumberFormatException e) {
                    if (nulOnBadHex) {
                        out.append((char) 0);
                        i += 6;
                        continue;
                    }
                    // Otherwise fall through and copy the backslash literally.
                }
            }
            out.append(text.charAt(i));
            i++;
        }
        return out.toString();
    }
}

HttpUtils.java这个类很关键,它基于HttpClient封装了doGet()和doPost()等发送HTTP请求的静态方法,如下:

package main.java.sina.utils;import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;/*** http操作相关的类*/
public class HttpUtils {/** params :* url:  地址* headers请求头部信息* return : httpresponse响应*/public static HttpResponse doGet(String url,Map<String,String> headers){HttpClient client=createHttpClient();HttpGet getMethod=new HttpGet(url);HttpResponse response=null;HttpContext httpContext = new BasicHttpContext();try {if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){getMethod.addHeader(key, headers.get(key));}}    response=client.execute(getMethod);HttpUriRequest realRequest  = (HttpUriRequest)httpContext.getAttribute(ExecutionContext.HTTP_REQUEST);System.out.println(realRequest.getURI());} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {String msg=e.getMessage();if(msg.contains("Truncated chunk")){System.out.println(e.getMessage() +" 数据获取不完整,需要重新获取。");}else{System.out.println(e.getMessage() +" 连接被拒绝,需要降低爬取频率。");}} catch(Exception e){}System.out.println(response);return response;        }/** params :* url:  地址* headers:请求头部信息* params:post的请求数据* return : httpresponse响应*/public static HttpResponse doPost(String url,Map<String,String> headers,Map<String,String> params){HttpClient client=createHttpClient();HttpPost postMethod=new HttpPost(url);HttpResponse response=null;try {if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){postMethod.addHeader(key, headers.get(key));}}    List<NameValuePair> p=null;if(params!=null && params.keySet().size()>0){p=new ArrayList<NameValuePair>();for(String key:params.keySet()){p.add(new BasicNameValuePair(key,params.get(key)));}}if(p!=null)postMethod.setEntity(new UrlEncodedFormEntity(p,HTTP.UTF_8));response=client.execute(postMethod);} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} return response;            }//上传一个文件public static HttpResponse doPost(String url,Map<String,String> headers,String fileName){HttpClient client=createHttpClient();HttpPost postMethod=new HttpPost(url);String boundary = 
"";HttpResponse response=null;try {if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){postMethod.addHeader(key, headers.get(key));if(key.equals("Content-Type")){String tmp=headers.get(key);boundary=tmp.substring(tmp.indexOf("=")+1);}}}    File file=new File(fileName);InputStream in=new FileInputStream(file);StringBuffer buffer=new StringBuffer();buffer.append(boundary).append("\n").append("Content-Disposition: form-data; name=\"pic1\"; filename=\""+file.getName()).append("\"\n").append("Content-Type: image/pjpeg").append("\n").append("\n");System.out.println(buffer.toString());String tmpstr=Utils.getStringFromStream(in);tmpstr=Base64Encoder.encode(tmpstr.getBytes());buffer.append(tmpstr).append("\n");buffer.append(boundary+"--").append("\n");System.out.println(buffer.toString());in=new ByteArrayInputStream(buffer.toString().getBytes());InputStreamEntity ise=new InputStreamEntity(in,buffer.toString().getBytes().length);  postMethod.setEntity(ise);  response=client.execute(postMethod);} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} return response;            }/** params :* httpresponse* return : 响应的头部信息*/public static List<Header> getReponseHeaders(HttpResponse response){List<Header> headers=null;Header[] hds=response.getAllHeaders();if(hds!=null && hds.length>0){headers=new ArrayList<Header>();for(int i=0;i<hds.length;i++){headers.add(hds[i]);}}        return headers;}/** params :* headers:头部信息 * request:请求*/public static void setHeaders(Map<String,String> headers,HttpUriRequest request){if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){request.addHeader(key, headers.get(key));            }}}/** params :* httpresponse* return : 响应的cookies值*/public static List<Cookie> getResponseCookies(HttpResponse response){List<Cookie> cookies=null;Header[] hds=response.getAllHeaders();if(hds!=null && hds.length>0){for(int 
i=0;i<hds.length;i++){if(hds[i].getName().equalsIgnoreCase("Set-Cookie")){if(cookies==null){cookies=new ArrayList<Cookie>();}                     String cookiestring[]=hds[i].getValue().split(";");String ss[]=cookiestring[0].split("=",2);String cookiename=ss[0];String cookievalue=ss[1];Cookie cookie=new BasicClientCookie(cookiename,cookievalue);cookies.add(cookie);}}}        return cookies;}/** params :* cookies数组* return : cookies数组组成的字符串*/public static String setCookie2String(List<Cookie> cookies){StringBuilder builder=null; if(cookies!=null && cookies.size()>0){builder=new StringBuilder();for(int j=0;j<cookies.size();j++){Cookie c=cookies.get(j);builder.append(c.getName()+"="+c.getValue());if(j!=cookies.size()-1)builder.append("; ");}return builder.toString();}        return null;}/** 从响应中得到输入流*/public static InputStream getInputStreamFromResponse(HttpResponse response){if(response==null){return null;}HttpEntity entity=response.getEntity();InputStream in=null;try {in = entity.getContent();} catch (IllegalStateException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return  in;}/** 从响应中得到字符串*/public static String getStringFromResponse(HttpResponse response){if(response==null){return null;}InputStream in=getInputStreamFromResponse(response);String responseText="";if(in!=null){responseText=Utils.getStringFromStream(in);}return responseText;}/*** 创建支持多线程并发连接的HTTPCLIENT*/private final static HttpClient createHttpClient() {String proxyHost = "web-proxy-sha.chn.hp.com";int proxyPort = 8080;HttpHost proxy = new HttpHost(proxyHost,proxyPort);HttpParams params = new BasicHttpParams();if(Constant.enableProxy){params.setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);}HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);HttpProtocolParams.setContentCharset(params, "UTF-8");ThreadSafeClientConnManager clientmanager = new ThreadSafeClientConnManager();clientmanager.setMaxTotal(20);HttpClient client = new DefaultHttpClient(clientmanager, 
params);//定义了环形重定向,定向到相同的路径是否被允许.client.getParams().setParameter("http.protocol.allow-circular-redirects", true); //定义了重定向的最大数量client.getParams().setParameter("http.protocol.max-redirects", 50);//定义了重定向是否应该自动处理client.getParams().setParameter("http.protocol.handle-redirects", false);return client;}/***加入代理的功能* @return HttpClient 对象*/public static HttpClient getDefaultHttpClientByProxy() {HttpClient httpclient =createHttpClient();String filePath = "proxy.properties";HttpHost proxy = null;Map<String, String> map = ReadIni.getDbini(filePath);if (map.size() == 0) {throw new RuntimeException("无可用代理");} else {Set<String> set = map.keySet();String[] array = (String[]) set.toArray(new String[set.size()]);Random r = new Random();int rnum = r.nextInt(array.length);String ip = array[rnum];String port = map.get(ip);proxy = new HttpHost(ip, Integer.parseInt(port));}httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,proxy);httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);return httpclient;}
}

接下来看一个HttpHelper辅助类,如下:

/*** */
package main.java.sina.utils;import java.io.IOException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;/*** @ClassName: HttpHelper* */
/**
 * Small helper that sends form-encoded POST requests with Commons HttpClient 3.x
 * and returns the response body as a string.
 */
public class HttpHelper {

    /**
     * Posts {@code requestData} as the {@code mids} form parameter to {@code url}.
     *
     * @param requestData value sent as the {@code mids} parameter
     * @param url         target URL of the POST request
     * @return the response body as returned by {@code getResponseBodyAsString()}
     * @throws HttpException on a protocol-level failure
     * @throws IOException   on a transport-level failure
     */
    public static String getLiveData(String requestData, String url)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod(url);
        postMethod.setParameter("mids", requestData);
        return executeAndRead(postMethod);
    }

    /**
     * Posts the {@code userid} and {@code hobbys} form parameters to the fixed
     * "loadingHobby" connector endpoint.
     *
     * @param userid value sent as the {@code userid} parameter
     * @param hobbys value sent as the {@code hobbys} parameter
     * @return the response body as returned by {@code getResponseBodyAsString()}
     * @throws HttpException on a protocol-level failure
     * @throws IOException   on a transport-level failure
     */
    public static String getHobbyData(String userid, String hobbys)
            throws HttpException, IOException {
        PostMethod postMethod =
                new PostMethod("http://c0048925.itcs.hp.com:8080/connector/loadingHobby");
        postMethod.setParameter("userid", userid);
        postMethod.setParameter("hobbys", hobbys);
        return executeAndRead(postMethod);
    }

    /** Executes the request, echoes the body to stdout and always releases the connection. */
    private static String executeAndRead(PostMethod postMethod)
            throws HttpException, IOException {
        try {
            new HttpClient().executeMethod(postMethod);
            String response = postMethod.getResponseBodyAsString();
            System.out.println(response);
            return response;
        } finally {
            // Released in finally so a failed request does not leak the connection.
            postMethod.releaseConnection();
        }
    }
}

ReadIni.java类,在读文本文件中使用,如下:

package main.java.sina.utils;import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Reads a simple {@code key=value} text file into a map.
 */
public class ReadIni {

    /**
     * Loads every {@code key=value} line of {@code file} into a map.
     *
     * <p>Blank lines and lines without a {@code =} separator are skipped. If the file
     * cannot be opened or read, the stack trace is printed and the entries collected so
     * far (possibly none) are returned; the method never returns {@code null}.
     *
     * @param file path of the file to read (decoded with the platform default charset)
     * @return map from the text before the first {@code =} to the text after it
     */
    public static Map<String, String> getDbini(String file) {
        Map<String, String> map = new HashMap<String, String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String s;
            // Advance on every iteration; advancing only for non-blank lines would
            // spin forever on the first blank line.
            while ((s = br.readLine()) != null) {
                if (s.trim().length() == 0) {
                    continue;
                }
                String[] s1 = getIni(s);
                if (s1.length < 2) {
                    continue; // no "key=value" pair on this line
                }
                map.put(s1[0], s1[1]);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (java.io.IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close(); // also closes the wrapped reader and stream
                } catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return map;
    }

    /**
     * Splits a line on {@code =}.
     *
     * @param str the line to split
     * @return the parts produced by {@code str.split("=")}
     */
    public static String[] getIni(String str) {
        String[] temp = str.split("=");
        return temp;
    }
}

然后,我们跳转到登录sina,来看一下LoginSina这个类的实现:

package main.java.sina.httpclient;import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.InvalidKeyException;
import java.security.KeyFactory;
import java.security.NoSuchAlgorithmException;
import java.security.interfaces.RSAPublicKey;
import java.security.spec.InvalidKeySpecException;
import java.security.spec.RSAPublicKeySpec;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;import org.apache.commons.codec.binary.Hex;
import org.apache.commons.httpclient.params.HttpParams;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.cookie.Cookie;
import org.springframework.core.io.ClassPathResource;import main.java.sina.json.msg.PreLoginResponseMessage;
import main.java.sina.utils.Base64Encoder;
import main.java.sina.utils.EncodeUtils;
import main.java.sina.utils.HttpUtils;
import main.java.sina.utils.JsonUtils;
import main.java.sina.utils.Utils;public class LoginSina {private String username;private String password;private String rsakv;private String pubkey;//servertime和nonce都是在登录时需要使用的,用于post信息的加密private String servertime;//服务器的时间private String nonce;//一次性字符串private String userid;//用户微博IDprivate String pcid;//若需要输入验证码时用到private String userdomainname;//用于域名private String door;//验证码private Map<String,String> headers=null;private List<Cookie> cookies=null;public LoginSina(String username,String password){this.username=username;this.password=password;init();}public Map<String,String> getHeaders(){Map<String,String> hds=null;if(headers!=null && headers.keySet().size()>0){hds=new HashMap<String,String>();for(String key:headers.keySet()){hds.put(key,headers.get(key));}}return hds;}public List<Cookie> getCookies(){List<Cookie> cc=null;if(cookies!=null && cookies.size()>0){cc=new ArrayList<Cookie>();for(int i=0;i<cookies.size();i++){cc.add(cookies.get(i));}}return cc;}//登录微博public String dologinSina(){System.out.println("---do login, please hold on...---");String url="http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";//v1.3.17Map<String,String> headers=new HashMap<String,String>();Map<String,String> params=new HashMap<String,String>();/*HTTP协议中的headers:http://www.cnblogs.com/yuzhongwusan/archive/2011/10/20/2218954.html* */headers.put("Accept", "text/html, application/xhtml+xml, */*");headers.put("Referer", "http://login.sina.com.cn/member/my.php?entry=sso");headers.put("Accept-Language", "zh-cn");headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");headers.put("Host", "login.sina.com.cn");headers.put("Connection", "Keep-Alive");headers.put("Content-Type", "application/x-www-form-urlencoded");headers.put("Cache-Control", "no-cache");params.put("encoding", "UTF-8");params.put("entry", "weibo");params.put("from", "");params.put("prelt", "112");params.put("gateway", "1");params.put("nonce", 
nonce);params.put("pwencode", "rsa2");//wsseparams.put("returntype", "META");params.put("pagerefer", "");params.put("savestate", "7");    params.put("servertime", servertime);params.put("rsakv", rsakv);params.put("service", "miniblog");params.put("sp", getEncryptedP());params.put("ssosimplelogin", "1");params.put("su", getEncodedU());params.put("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack");params.put("useticket", "1");params.put("vsnf", "1");HttpResponse response=HttpUtils.doPost(url, headers, params);this.cookies=HttpUtils.getResponseCookies(response);this.headers=headers;String responseText=HttpUtils.getStringFromResponse(response);try {responseText=new String(responseText.getBytes(),"GBK");if(!responseText.contains("retcode=0")){downloadCheckImage();this.nonce=getnonce();Scanner s=new Scanner(System.in);if(responseText.contains("retcode=4049"))System.out.println("请输入验证码:");else if(responseText.contains("retcode=2070")){System.out.println("验证码不正确,请再次输入验证码:");}this.door=s.next();dologinSina();}} catch (UnsupportedEncodingException e) {e.printStackTrace();}System.out.println("Congratulations, you have login success!");return responseText;}//登录后重定向public String redirect(){String cookieValue=HttpUtils.setCookie2String(this.cookies);this.headers.clear();this.headers.put("Accept", "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");this.headers.put("Accept-Language", "zh-cn");this.headers.put("Connection", "Keep-Alive");this.headers.put("Host", "sina.com.cn");this.headers.put("Referer", "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)");this.headers.put("User", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");this.headers.put("Cookie", cookieValue);String ssosavestate=""; //SSO即Sina Sign-on,String ticket = "";for(Cookie 
c:this.cookies){if(c.getName().equals("ALF")){ssosavestate=c.getValue();}else if(c.getName().equals("tgc")){ticket=c.getValue();}}String url="http://weibo.com/ajaxlogin.php?" +"framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&" +"sudaref=weibo.com";HttpResponse response=HttpUtils.doGet(url, this.headers);response=HttpUtils.doGet(url, this.headers);    String responseText=HttpUtils.getStringFromResponse(response);return responseText;}//生成一次性的字符串 6位 用于加密private String getnonce() {String x = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";String str = "";for (int i = 0; i < 6; i++) {str += x.charAt((int)Math.ceil(Math.random() * 1000000) % x.length());}return str;}//初始化:得到服务区的时间servertime和一次性字符串nonceprivate void init(){String url=compositeUrl();Map<String,String> headers=new HashMap<String,String>();headers.put("Accept", "*/*");headers.put("Referer", "http://weibo.com/");headers.put("Accept-Language", "zh-cn");headers.put("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");headers.put("Host", "login.sina.com.cn");headers.put("Connection", "Keep-Alive");        HttpResponse response=HttpUtils.doGet(url, headers);String responseText=HttpUtils.getStringFromResponse(response);int begin=responseText.indexOf("{");int end=responseText.lastIndexOf("}");responseText=responseText.substring(begin,end+1);PreLoginResponseMessage plrmsg =JsonUtils.jsontoPreLoginResponseMessage(responseText);this.nonce=plrmsg.getNonce();this.servertime=plrmsg.getServertime()+"";this.pubkey=plrmsg.getPubkey();this.rsakv=plrmsg.getRsakv();this.pcid=plrmsg.getPcid();}//下载验证码private void downloadCheckImage() {if(pcid==null) return;this.headers.remove("Content-Type");try {if(this.cookies != null){this.cookies.clear();}} catch (Exception e) {e.printStackTrace();}String cookieValue=HttpUtils.setCookie2String(this.cookies);this.headers.put("Cookie", cookieValue);String 
url="http://login.sina.com.cn/cgi/pin.php?r="+(long)(Math.random()*100000000)+"&s=0&p="+this.pcid;HttpResponse response=HttpUtils.doGet(url, headers);InputStream in=HttpUtils.getInputStreamFromResponse(response);try {//System.out.println(new ClassPathResource("checkImage.jpeg").getFile().getPath());Utils.writeFileFromStream(new ClassPathResource("checkImage.jpeg").getFile().getPath(), in);} catch (IOException e) {e.printStackTrace();}}//组合预登陆时的URLprivate String compositeUrl(){StringBuilder builder=new StringBuilder();builder.append("http://login.sina.com.cn/sso/prelogin.php?").append("entry=weibo&callback=sinaSSOController.preloginCallBack&").append("su="+getEncodedU()).append("&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_="+System.currentTimeMillis());return builder.toString();}//对用户名进行编码private String getEncodedU() {if(username!=null && username.length()>0){return Base64Encoder.encode(EncodeUtils.encodeURL(username,"utf-8").getBytes());}return "";}//对密码进行编码private String getEncryptedP(){
//        return EncodeSuAndSp.getEncryptedP(password, servertime, nonce);String data=servertime+"\t"+nonce+"\n"+password;String spT=rsaCrypt(pubkey, "10001", data);return spT;}public static String rsaCrypt(String pubkey, String exponentHex, String pwd,String servertime,String nonce) {String data=servertime+"\t"+nonce+"\n"+pwd;return rsaCrypt(pubkey,exponentHex,data);}
public static String rsaCrypt(String pubkey, String exponentHex, String messageg) {KeyFactory factory=null;try {factory = KeyFactory.getInstance("RSA");} catch (NoSuchAlgorithmException e1) {return "";}BigInteger publicExponent = new BigInteger(pubkey, 16); /* public exponent */BigInteger modulus = new BigInteger(exponentHex, 16); /* modulus */RSAPublicKeySpec spec = new RSAPublicKeySpec(publicExponent, modulus);RSAPublicKey pub=null;try {pub = (RSAPublicKey) factory.generatePublic(spec);} catch (InvalidKeySpecException e1) {return "";}Cipher enc=null;byte[] encryptedContentKey =null;try {enc = Cipher.getInstance("RSA");enc.init(Cipher.ENCRYPT_MODE, pub);encryptedContentKey = enc.doFinal(messageg.getBytes());} catch (NoSuchAlgorithmException e1) {System.out.println(e1.getMessage());return "";} catch (NoSuchPaddingException e1) {System.out.println(e1.getMessage());return "";} catch (InvalidKeyException e1) {System.out.println(e1.getMessage());return "";} catch (IllegalBlockSizeException e1) {System.out.println(e1.getMessage());return "";} catch (BadPaddingException e1) {System.out.println(e1.getMessage());return "";} return new String(Hex.encodeHex(encryptedContentKey));}public void setUserid(String userid) {this.userid = userid;}public String getUserid() {return userid;}public void setUserdomainname(String userdomainname) {this.userdomainname = userdomainname;}public String getUserdomainname() {return userdomainname;}}

SpiderSina类如下:

  1 package main.java.sina.httpclient;
  2 import java.util.HashMap;
  3 import java.util.List;
  4 import java.util.Map;
  5
  6 import org.apache.http.HttpResponse;
  7 import org.apache.http.cookie.Cookie;
  8
  9 import main.java.sina.utils.Constant;
 10 import main.java.sina.utils.EncodeUtils;
 11 import main.java.sina.utils.HttpUtils;
 12 import main.java.sina.utils.Utils;
 13
 14 public class SpiderSina {
 15     private LoginSina ls;
 16     private Map<String,String> headers;
 17     private final int  ADDFOLLOWING =1;
 18     private final int  CANCELFOLLOWING =2;
 19     public SpiderSina(LoginSina ls){
 20         this.ls=ls;
 21         this.headers=new HashMap<String,String>();
 22         headers.put("Accept", "text/html, application/xhtml+xml, */*");
 23         headers.put("Accept-Language", "zh-cn");
 24         headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");
 25         headers.put("Connection", "Keep-Alive");
 26         headers.put("Cache-Control", "no-cache");
 27         String cookieValue=HttpUtils.setCookie2String(ls.getCookies());
 28         headers.put("Cookie", cookieValue);
 29     }
 36     public String getGroupCategory(){
 37         String url="http://q.weibo.com/";
 38         this.headers.put("Host", "q.weibo.com");
 39         HttpResponse response=HttpUtils.doGet(url, headers);
 40         String responseText=HttpUtils.getStringFromResponse(response);
 41         responseText=EncodeUtils.unicdoeToGB2312(responseText);
 42         return responseText;
 43     }
 44     public String search(String keyword, int pageNo){ 47         String url="http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&page="+pageNo;
 48         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=shy_annan@126.com; myuid=5439352084; wvr=6; un=sm2014121904@126.com; _s_tentry=developer.51cto.com; SWB=usrmdinst_14; SUS=SID-5438576807-1419173757-GZ-lrze7-d8e1e3f082b428c12412c8ba30f0a6de; SUE=es%3D4cdfdd5d5f0f75141c092b32f89525a2%26ev%3Dv1%26es2%3D469e50c869315e57efeec3012c3bb6a8%26rs0%3DoWdG36CQ33LUEtKTvGn907Zy1mwFETvSVJsxeHEiaMPcKDB7pFxg596a2pLhFLJfQmswf4AvXYAkzTfemrYgWrz%252BQPustEA2wLNYufYpAZqFsGWanhTBq6elzB2yoZp41xcpy1WwXn1CuvzIzzEYpuILjHahkmJDQDQy6KaxlbA%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1419173757%26et%3D1419260157%26d%3Dc909%26i%3Da6de%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D27%26st%3D0%26uid%3D5438576807%26name%3Dsm2014121904%2540126.com%26nick%3DSocialMedia%25E5%259B%259B%25E5%25A8%2583%26fmp%3D%26lcp%3D; SUB=_2A255kq8tDeTxGeNK6FoU9yjEyzuIHXVa6DVlrDV8PUNbvtBeLW3TkW-bMoi0G_bBfpbS3TMqcXg6zDWFLA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhGThsH46uNrx1VY0ApV0SR5JpX5KMt; ALF=1450709756; SSOLoginState=1419173757; WBStore=bc5ad8450c3f8a48|undefined; Apache=1027467835228.8901.1419173761694; ULV=1419173761704:6:6:1:1027467835228.8901.1419173761694:1418797827169; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; ULOGIN_IMG=14192385783486";
 49         headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
 50         //headers.put("Accept-Encoding", "gzip, deflate, sdch");
 51         headers.put("Accept-Language", "zh-CN");
 52         headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
 53         headers.put("Connection", "Keep-Alive");
 54         headers.put("Cache-Control", "max-age=0");
 55         headers.put("Referer", "http://login.sina.com.cn/sso/login.php?url=http%3A%2F%2Fs.weibo.com%2Fweibo%2F%2525E6%252583%2525A0%2525E6%252599%2525AE%26page%3D2&_rand=1419173756.6387&gateway=1&service=weibo&entry=miniblog&useticket=1&returntype=META");
 56         headers.put("Cookie", cookieValue);
 57         this.headers.put("Host", "s.weibo.com");
 58         HttpResponse response=HttpUtils.doGet(url, headers);
 59         String responseText=HttpUtils.getStringFromResponse(response);
 60         responseText=EncodeUtils.unicdoeToGB2312(responseText);
 61
 62
 63         return responseText;
 64     }
 65
 66     public String searchCommentsByUid(String uid){
 67
 68         String url="http://www.weibo.com/u/"+uid;
 69         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; myuid=2035860051; wvr=6; YF-Ugrow-G0=ad06784f6deda07eea88e095402e4243; SSOLoginState=1423150079; YF-V5-G0=32eb5467e9bfc8b60c2d771056535ac5; _s_tentry=www.weibo.com; Apache=6264929557219.147.1423150103832; ULV=1423150103842:18:2:2:6264929557219.147.1423150103832:1422769721265; ULOGIN_IMG=1423233797946; YF-Page-G0=82cdcdfb16327a659fbb60cc9368fb19; SUS=SID-2035860051-1423286223-GZ-jdkh4-c8ea11de0a42151313986e52f9aa6017; SUE=es%3D8701ff5aca59244ff1ff263cf985bee6%26ev%3Dv1%26es2%3D7995c9eb7455697c09fac4f7486e14eb%26rs0%3DTyXXIRjcEw%252BeS5PaVSM%252FhQjc2JGhKBOe3uFTgShiIUAbPFI2eKtrgxM2wIi9A1xndiTFFM72zY%252FDKYFXONrgkao5cRo%252FHkydV%252FnaQjNmXoeESu5gi6Iq0aX883NhGR0utBVNZb5XaIG3X6HMMfBJC%252B7pnVHogEo8eD6cx8nzN5c%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1423286223%26et%3D1423372623%26d%3Dc909%26i%3D6017%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A2550e-fDeTxGeRO6FcZ9i7Mzj2IHXVap0ZXrDV8PUNbvtBuLWnTkW-gBGVORTA7J_lSZzAqzW6E50JjBQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; SUHB=0M20OGRPiOKzyc; ALF=1454822222; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn";
 70         headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
 71         headers.put("Accept-Language", "zh-CN");
 72         headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
 73         headers.put("Connection", "Keep-Alive");
 74         headers.put("Cache-Control", "max-age=0");
 75         headers.put("Cookie", cookieValue);
 76         this.headers.put("Host", "www.weibo.com");
 77         HttpResponse response=HttpUtils.doGet(url, headers);
 78         String responseText=HttpUtils.getStringFromResponse(response);
 79         responseText=EncodeUtils.unicdoeToGB2312(responseText);
 82         return responseText;
 83     }
 85 //爬虫根据关键字,查询时间断,和查询页数  来得到htmlContent
 86 public String search(String keyword, int pageNo, String fromdate,String todate){
 87     StringBuffer stringBuffer = new StringBuffer(200);
 93     stringBuffer.append("http://s.weibo.com/weibo/"+ keyword +"&page=");
 94     stringBuffer.append(pageNo);
 95     stringBuffer.append("&typeall=1&suball=1&timescope=custom:");
 96     stringBuffer.append(fromdate);
 97     stringBuffer.append(":");
 98     stringBuffer.append(todate);
 99     stringBuffer.append("&Refer=g");
104     String url = stringBuffer.toString();
105     String cookieValue = headers.get("Cookie");
106     headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
107     //headers.put("Accept-Encoding", "gzip, deflate, sdch");
108     headers.put("Accept-Language", "zh-CN");
109     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
110     headers.put("Connection", "Keep-Alive");
111     headers.put("Cache-Control", "max-age=0");
112     headers.put("Referer", "http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&typeall=1&suball=1&timescope=custom:"+fromdate+":"+todate+"&Refer=g");
113     headers.put("Cookie", cookieValue);
114     this.headers.put("Host", "s.weibo.com");
115     HttpResponse response=HttpUtils.doGet(url, headers);
116     String responseText=HttpUtils.getStringFromResponse(response);
117     responseText=EncodeUtils.unicdoeToGB2312(responseText);
118
119     System.out.println("************htmlContent start***********");
120     System.out.println(responseText);
121     System.out.println("************htmlContent end***********");
125     return responseText;
127 }
129 public void forwardToWeiboPage(){
130     String url = Constant.personalHomePage;
131     headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
133     headers.put("Accept-Language", "zh-CN");
134     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
135     headers.put("Connection", "Keep-Alive");
137     this.headers.put("Host", "s.weibo.com");
138     HttpResponse response=HttpUtils.doGet(url, headers);
139     String responseText=HttpUtils.getStringFromResponse(response);
140     responseText=EncodeUtils.unicdoeToGB2312(responseText);
141     List<Cookie> cookies = HttpUtils.getResponseCookies(response);
142     String cookie = HttpUtils.setCookie2String(cookies);
144     headers.put("Cookie", cookie);
146 }
150     public String getGroupCategory(int id){
151         String url="http://q.weibo.com/class/category/?id="+id;
152         this.headers.put("Host", "q.weibo.com");
154         HttpResponse response=HttpUtils.doGet(url, headers);
155         String responseText=HttpUtils.getStringFromResponse(response);
156         responseText=EncodeUtils.unicdoeToGB2312(responseText);
157         return responseText;
158     }
169     //得到微群管理员ID信息,其实用户成员的第一页 HTML页面
170     public String getGroupAdministrator(String groupid) {
171         String url="http://q.weibo.com/"+groupid+"/members/all";
172         this.headers.remove("Referer");
173         this.headers.put("Host", "q.weibo.com");
174         this.headers.remove("Content-Type");
175         this.headers.remove("x-requested-with");
176         HttpResponse response=HttpUtils.doGet(url, headers);
177         String responseText=HttpUtils.getStringFromResponse(response);
178         return responseText;
179     }
180     //根据微群号和页号得到群成员ID信息 -----JSON格式数据
181     public String getGroupMembers(String groupid,int pagenumber){
182         this.headers.put("Referer", "http://q.weibo.com/"+groupid+"/members/all");
183         this.headers.put("Host", "q.weibo.com");
184         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
185         this.headers.put("x-requested-with", "XMLHttpRequest");
187         Map<String,String> params=new HashMap<String,String>();
188         params.put("_t", "0");
189         params.put("page", pagenumber+"");
190         params.put("gid", groupid);
191         params.put("query","");
192         params.put("tab", "all");
193         params.put("vip", "1");
194         String url="http://q.weibo.com/ajax/members/page";
195         HttpResponse response=HttpUtils.doPost(url, headers, params);
196         return HttpUtils.getStringFromResponse(response);
197     }
198     /*
199      *  得到微群中微博信息 经过多次尝试成功
200      *  每次获得50个微博记录,page是页号, count值50 可以在1-75之间,但是,每次开始的时候还是从50的倍数开始的
201      */
202     public String getGroupTopic(int page,int count,String gid){
203         this.headers.put("Referer", "http://q.weibo.com/"+gid);
204         this.headers.put("Host", "q.weibo.com");
205         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
206         this.headers.put("x-requested-with", "XMLHttpRequest");
207         Integer pre_page=1;
208         if(page==1){
209             pre_page=2;
210         }else{
211             pre_page=page-1;
212         }
213         Map<String,String> params=new HashMap<String,String>();
214         params.put("_k", System.currentTimeMillis()+"");
215         params.put("_t", "0");
216         params.put("count", count+"");
217         //params.put("end_id", end_id);
218         params.put("gid", gid);
219         params.put("is_search","");
220         params.put("key_word", "");
221         params.put("me", "0");
222         params.put("mids", "");
223         params.put("new", "0");
224         params.put("page", page+"");
225         params.put("pagebar", "0");
226         params.put("pre_page", pre_page+"");
227         params.put("since_id", "0");
228         params.put("uid", "0");
229
230         String url="http://q.weibo.com/ajax/mblog/groupfeed";
231         HttpResponse response=HttpUtils.doPost(url, headers, params);
232         return HttpUtils.getStringFromResponse(response);
233     }
234     /*
235      *  得到微群中微博信息数目
236      *  这个信息中其实还包含了微群的所有的基本信息~~~~~~~~~~****** json格式的数据信息
237      */
238     public String getGroupMessageNumber(String gid){
239         this.headers.put("Referer", "http://q.weibo.com/"+gid);
240         this.headers.put("Host", "q.weibo.com");
241         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
242         this.headers.put("x-requested-with", "XMLHttpRequest");
243         String url="http://q.weibo.com/ajax/rightnav/groupprofile?gid="+gid+"&_t=0&__rnd="+System.currentTimeMillis();
244         HttpResponse response=HttpUtils.doGet(url, headers);
245         return HttpUtils.getStringFromResponse(response);
246     }
247     //得到微群的主页信息  HTML页码   主要是为了得到第一条微博记录的MID值
248     public String getgroupMainPage(String groupid) {
249         String url="http://q.weibo.com/"+groupid+"?topnav=1";
250         this.headers.remove("Referer");
251         this.headers.put("Host", "q.weibo.com");
252         this.headers.remove("Content-Type");
253         this.headers.remove("x-requested-with");
254
255         HttpResponse response=HttpUtils.doGet(url, headers);
256         String responseText=HttpUtils.getStringFromResponse(response);
257         return responseText;
258     }
259     /*
260      * 根据分类得到微群信息
261      * categroyID :分类ID号
262      * pagenumber:页号
263      * sort:分类方式 1 按成员人数 2按 微群博数 3按创建时间分类
264      * count:每页的记录数目
265      */
266     public String getGroupByCategroy(int categroyID,int pagenumber,int sort,int count){
267         this.headers.put("Referer", "http://q.weibo.com/class/category/?id="+categroyID);
268         this.headers.put("Host", "q.weibo.com");
269         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
270         this.headers.put("x-requested-with", "XMLHttpRequest");
271         Map<String,String> params=new HashMap<String,String>();
272         params.put("_t", "0");
273         params.put("page", pagenumber+"");
274         params.put("id", categroyID+"");
275         params.put("sort",sort+"");
276         params.put("count", count+"");
277
278         String url="http://q.weibo.com/ajax/class/category";
279         HttpResponse response=HttpUtils.doPost(url, headers,params);
280         String responseText=HttpUtils.getStringFromResponse(response);
281         responseText=EncodeUtils.unicdoeToGB2312(responseText);
282         return responseText;
283     }
284     //得到表情列表信息
285     public String getFaceList(){
286         String url="http://weibo.com/aj/mblog/face?type=face&_t=0&__rnd="+System.currentTimeMillis();
287         this.headers.put("Referer", "http://weibo.com/");
288         this.headers.put("Host", "weibo.com");
289         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
290         this.headers.put("x-requested-with", "XMLHttpRequest");
291
292         HttpResponse response=HttpUtils.doGet(url, headers);
293         String responseText=HttpUtils.getStringFromResponse(response);
294         System.out.println(responseText);
295         Utils.writeFileFromString("tmpFile/faceList.txt", responseText);
296         return responseText;
297     }
307     //用户基本信息          主要是将要解析用户主页下方经过编码后的内容
308     public String getMemberInfo(String memberID){
309         String url="http://weibo.com/"+memberID+"/info";
310         this.headers.put("Host", "weibo.com");
311         this.headers.put("Referer", "http://weibo.com/u/"+memberID);
312         HttpResponse response=HttpUtils.doGet(url, headers);
313         String responseText=HttpUtils.getStringFromResponse(response);
314         return responseText;
315     }
316     //用户粉丝用户信息    html页面,每次20个
317     public String getMemberFans(String memberID,int page){
318         String url="http://weibo.com/"+memberID+"/fans?&uid=1689219395&tag=&page="+page;
319         this.headers.put("Host", "weibo.com");
320         this.headers.put("Referer", "http://weibo.com/"+memberID+"/fans");
321         HttpResponse response=HttpUtils.doGet(url, headers);
322         String responseText=HttpUtils.getStringFromResponse(response);
323         return responseText;
324     }
325     //用户关注的用户信息     html页面
326     public String getMemberFollowing(String memberID,int page){
327         String url="http://weibo.com/"+memberID+"/follow?page="+page;
328         this.headers.put("Host", "weibo.com");
329         this.headers.put("Referer", "http://weibo.com/"+memberID+"/follow");
330         HttpResponse response=HttpUtils.doGet(url, headers);
331         String responseText=HttpUtils.getStringFromResponse(response);
332         return responseText;
333     }
334
335     /*
336      *  @params
337      *   memberID:是用户ID
338      *   max_id:每次AJAX获得数据时上面一次的最后一个ID值
339      *   end_id:用户最新的一条微博的ID值
340      *   k:一个随机数
341      *   page:页号
342      *   pre_page:前一页
343      *   count:每次返回的数值  当max_id为null是 count=50 否则为15
344      *      pagebar:ajax时,第一次为0,第二次为1
345      *   注意:
346      *   1  用此请求,每次获得的数据格式都一样,用同样的解析方法来进行解析。
347      *   2 每次一页可以获得总共45条记录,需要三次请求。每次请求可获得15条记录。
348      *   3 max_id可以不用到,直接等于 end_id就可以了.
349      *   4 第一次请求时可以将end_id设置为NUll,即为第一次时翻页时的请求后边的滚动时必须有end_id参数,end_id为第一页的第一条ID即可。
350      */
351     //获得用户发布的微博信息   json格式的数据
352     public String getMemberReleaseTopic(String memberID,String end_id,Integer page,Integer pagebar){
353         String url="";
354         Integer pre_page=1;
355         Integer count=0;
356         String k=System.currentTimeMillis()+""+(int)(Math.random()*100000)%100;
357         if(end_id==null){
358             count=50;
359             if(page==1){
360                 pre_page=2;
361             }else{
362                 pre_page=page-1;
363             }
364             url="http://weibo.com/aj/mblog/mbloglist?" +
365             "page="+page+"&count="+count+"&pre_page="+pre_page+"&" +
366             "_k="+ k+"&uid="+memberID+
367             "&_t=0&__rnd="+System.currentTimeMillis();
368         }else{
369             count=15;
370             pre_page=page;
371             url="http://weibo.com/aj/mblog/mbloglist?" +
372             "page="+page+"&count="+count+"&max_id="+end_id+"&" +
373             "pre_page="+pre_page+"&end_id="+end_id+"&" +
374             "pagebar="+pagebar+"&_k="+k+"&" +
375             "uid="+memberID+"&_t=0&__rnd="+System.currentTimeMillis();
376         }
377         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=sm2014121903@126.com; myuid=5439352084; YF-Ugrow-G0=4703aa1c27ac0c4bab8fc0fc5968141e; SSOLoginState=1421374583; wvr=6; YF-V5-G0=8c4aa275e8793f05bfb8641c780e617b; _s_tentry=login.sina.com.cn; Apache=2461283528245.9854.1421374588453; ULV=1421374588550:13:5:3:2461283528245.9854.1421374588453:1421210767499; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; SUS=SID-2035860051-1421462085-GZ-7jcgb-1539d643bae5195fb7f792b2ae77befb; SUE=es%3Df15e11ed09b6a0108a28adfa58609b78%26ev%3Dv1%26es2%3Da0f706efac5c89495062648a4de3e337%26rs0%3DZBxlOUv0mhmxyHfOVmZ3tH7tNvAp08BjPeLUJPdu9WzG38Dsm40px%252Bd9w21ycDpZQwBK3q0prFfNs%252F8ZuZSasa1eps%252FOGNxJ3CIHN8JN%252Fik6gVpIPgVeeRdalNWTIbth6hLa34uOp%252BXii%252Bxeib%252BvINsr%252FdOvQx6kjp6fsC44QXc%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1421462085%26et%3D1421548485%26d%3Dc909%26i%3Dbefb%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D2%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A255vboVDeTxGeRO6FcZ9i7Mzj2IHXVazdpdrDV8PUNbvtBuLVj-kW91jmbQSGo7Rn30RVvGP5KOgBgNgQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; ALF=1452998078; ULOGIN_IMG=14214638933178; YF-Page-G0=0acee381afd48776ab7a56bd67c2e7ac";
378         headers.put("Cookie", cookieValue);
379         this.headers.put("Referer", "http://weibo.com/u/"+memberID);
380         this.headers.put("Host", "www.weibo.com");
381         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
382         this.headers.put("x-requested-with", "XMLHttpRequest");
383         url = "http://weibo.com/u/"+memberID;
384         HttpResponse response=HttpUtils.doGet(url, headers);
385         if(response==null){
386             return "";
387         }
388         return HttpUtils.getStringFromResponse(response);
389     }
390     /*
391      * ~~~~~~~~~~~~~~~~~~~~~获取用户的一些信息~~~end~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
392      */
393
394
395     //**********************************************************************************
396
397     /*
398      *  名人堂与达人信息
399      */
400     public String getVerified(String url){
401         this.headers.put("Host", "verified.weibo.com");
402         this.headers.put("Referer", "http://plaza.weibo.com/?topnav=1&wvr=4");
403         HttpResponse response=HttpUtils.doGet(url, headers);
404         String responseText=HttpUtils.getStringFromResponse(response);
405         return responseText;
406     }
407
408     public String getVerifiedMember(String path,Integer g_index){
409         String url="http://verified.weibo.com/aj/getgrouplist?g_index="+g_index+
410         "&path="+path+"&_t=0&__rnd="+System.currentTimeMillis();
411         this.headers.put("Host", "verified.weibo.com");
412         this.headers.put("Referer", path);
413         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
414         this.headers.put("x-requested-with", "XMLHttpRequest");
415         HttpResponse response=HttpUtils.doGet(url, headers);
416         String responseText=HttpUtils.getStringFromResponse(response);
417
418         return responseText;
419     }
420
421     public String setArea(Integer provinceID){
422         this.headers.put("Referer", "http://club.weibo.com/list");
423         this.headers.put("Host", "club.weibo.com");
424         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
425         this.headers.put("x-requested-with", "XMLHttpRequest");
426
427         Map<String,String> params=new HashMap<String,String>();
428
429         params.put("_t", "0");
430         params.put("city", "1000");
431         params.put("prov", provinceID+"");
432
433         String url="http://club.weibo.com/ajax_setArea.php";
434         HttpResponse response=HttpUtils.doPost(url, headers, params);
435
436         List<Cookie> cks=HttpUtils.getResponseCookies(response);
437         List<Cookie> cookies=ls.getCookies();
438         cookies.addAll(cks);
439         String cookieValue=HttpUtils.setCookie2String(cookies);
440         this.headers.put("Cookie", cookieValue);
441
442         return HttpUtils.getStringFromResponse(response);
443     }
444
445     public String getDaRen(Integer page){
446         String op="ltime";
447         String url="http://club.weibo.com/list?sex=3&op="+op+"&page="+page+"&";
448         Integer pre_page=(page<=1? 2:page-1);
449         this.headers.put("Host", "club.weibo.com");
450         this.headers.put("Referer", "http://club.weibo.com/list?sex=3&op=ltime&page="+pre_page+"&");
451         this.headers.remove("Content-Type");
452         this.headers.remove("x-requested-with");
453
454         HttpResponse response=HttpUtils.doGet(url, headers);
455         if(response!= null){
456             return HttpUtils.getStringFromResponse(response);
457         }
458         return "";
459
460     }
470     //发布一条文字微博
471     public String releaseTopic(String content){
472         this.headers.put("Referer", "http://weibo.com/");
473         this.headers.put("Host", "weibo.com");
474         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
475         this.headers.put("x-requested-with", "XMLHttpRequest");
476         Map<String,String> params=new HashMap<String,String>();
477         params.put("_t", "0");
478         params.put("location", "home");
479         params.put("module", "stissue");
480         params.put("pic_id", "");
481         params.put("text", content);
482         String url="http://weibo.com/aj/mblog/add?__rnd="+System.currentTimeMillis();
483         HttpResponse response=HttpUtils.doPost(url, headers, params);
484         return HttpUtils.getStringFromResponse(response);
485     }519     //得到自己关注的成员
520     public String getSelfFollowIngs(){
521         return "";
522     }
523     //得到自己的粉丝
524     public String getSelfFollowers(){
525         return "";
526     }
527     //得到自己加入的微群
528     public String getSelfJoinedGroups(){
529         return "";
530     }
531     //得到自己的标签
532     public String getSelfTags(){
533         return "";
534     }
535     //得到自己发布的微博
536     public String getSelfReleaseTopics(){
537         return "";
538     }
539     //得到自己主页的微博
540     public String getSelfPageTopics(){
541         return "";
542     }
543     //关注一个人
544     public String addFollowing(String memberid){
545         return addorcancleFollowing(memberid,this.ADDFOLLOWING);
546     }
547     //取消关注一个人
548     public String cancelFollowing(String memberid){
549         return addorcancleFollowing(memberid,this.CANCELFOLLOWING);
550     }
551     private String addorcancleFollowing(String memberid,int option){
552         String url="";
553         switch(option){
554             case ADDFOLLOWING:
555                 url="http://weibo.com/aj/f/followed?__rnd="+System.currentTimeMillis();
556                 break;
557             case CANCELFOLLOWING:
558                 url="http://weibo.com/aj/f/unfollow?__rnd="+System.currentTimeMillis();
559                 break;
560         }
561
562         Map<String,String> params=new HashMap<String,String>();
563
564         this.headers.put("Referer", "http://weibo.com/");
565         this.headers.put("Host", "weibo.com");
566         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
567         this.headers.put("Referer", "http://weibo.com/");
568         this.headers.put("x-requested-with", "XMLHttpRequest");
569
570         params.put("_t", "0");
571         params.put("f", "1");
572         params.put("location", "profile");
573         params.put("refer_flag", "");
574         params.put("refer_sort", "profile");
575         params.put("uid", memberid);
576
577         HttpResponse response=HttpUtils.doPost(url, headers, params);
578         return HttpUtils.getStringFromResponse(response);
579     }
584     /**
585      * 得到的标签信息  调用一次10个
586      * @return
587      */
588     public String getTags(){
589         String url="http://account.weibo.com/set/aj/tagsuggest?__rnd="+System.currentTimeMillis();
590         this.headers.put("Referer", "http://account.weibo.com/set/tag#");
591         this.headers.put("Host", "account.weibo.com");
592         HttpResponse response=HttpUtils.doGet(url, headers);
593         return HttpUtils.getStringFromResponse(response);
594     }
595
596     /**
597      * 得到微博热词信息
598      * @param k :热词的门类
599      */
600     public String getHotWords(String k){
601         String url="http://data.weibo.com/top/keyword?k="+k;
602         try{
603             Integer.parseInt(k);
604         }catch(Exception ex){
605             url="http://data.weibo.com/top/keyword?t="+k;
606         }
607         this.headers.put("Referer", "http://data.weibo.com/top/keyword");
608         this.headers.put("Host", "data.weibo.com");
609         HttpResponse response=HttpUtils.doGet(url, headers);
610         return HttpUtils.getStringFromResponse(response);
611     }
612
613     /**
614      * 得到微博热帖子
615      * @param cat  表示热帖门类
616      * @param page 表示页号
617      */
618     public String getHotWeibo(String cat,int page){
619         String url="http://data.weibo.com/hot/ajax/catfeed?page="+page+"&cat="+cat+"&_t=0&__rnd="+System.currentTimeMillis();
620         this.headers.put("Referer", "http://data.weibo.com/hot/minibloghot");
621         this.headers.put("Host", "data.weibo.com");
622         HttpResponse response=HttpUtils.doGet(url, headers);
623         return HttpUtils.getStringFromResponse(response);
624     }
625
626     /**
627      * 按照分类获取 微博吧名字  第一步
628      */
629     public String getWeiBar(String ctgid,int p){
630         String sort="post";
631         String url="http://weiba.weibo.com/aj_f/CategoryList?sort="+sort+"&p="+p+"&ctgid="+ctgid+"&_t=0&__rnd="+System.currentTimeMillis();
632         this.headers.put("Referer", "http://weiba.weibo.com/ct/"+ctgid);
633         this.headers.put("Host", "weiba.weibo.com");
634         this.headers.put("Accept", "*/*");
635         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
636         this.headers.put("X-Requested-With", "XMLHttpRequest");
637         HttpResponse response=HttpUtils.doGet(url, headers);
638         return HttpUtils.getStringFromResponse(response);
639     }
640     /**
641      * 根据微博吧 名称 ,得到该吧内的所有帖子标题 第二步
642      */
643     public String getWeiBarByWeibarName(String bid,int p){
644         String url="http://weiba.weibo.com/aj_t/postlist?bid="+bid+"&p="+p+"&_t=all&__rnd="+System.currentTimeMillis();
645         this.headers.put("Referer", "http://weiba.weibo.com/");
646         this.headers.put("Host", "weiba.weibo.com");
647         this.headers.put("Accept", "*/*");
648         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
649         this.headers.put("X-Requested-With", "XMLHttpRequest");
650         HttpResponse response=HttpUtils.doGet(url, headers);
651         return HttpUtils.getStringFromResponse(response);
652     }
653
654     /**
655      * 新浪微公益名单
656      * type ="donate"
657      * type="discuss"
658      */
659     public String getWeiGongYiMember(int page,int projectID,String type){
660         String url="http://gongyi.weibo.com/aj_personal_helpdata?page="+page+"&type="+type+"&project_id="+projectID+"&_t=0&__rnd="+System.currentTimeMillis();
661         this.headers.put("Referer", "http://gongyi.weibo.com/"+projectID);
662         this.headers.put("Host", "gongyi.weibo.com");
663         this.headers.put("Accept", "*/*");
664         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
665         this.headers.put("X-Requested-With", "XMLHttpRequest");
666         HttpResponse response=HttpUtils.doGet(url, headers);
667         return HttpUtils.getStringFromResponse(response);
668     }
669 }

转载于:https://www.cnblogs.com/RunForLove/p/4511920.html

用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)相关推荐

  1. java获取新浪天气预报代码

    package com.test.commons;/*** java获取新浪天气预报代码*/ import java.io.FileNotFoundException; import java.io. ...

  2. 新浪天气预报代码及城市代码

    名称:新浪天气预报代码 代码 :<IFRAME ID='ifm2' WIDTH='260' HEIGHT='70' ALIGN='CENTER' MARGINWIDTH='0' MARGINHE ...

  3. Java 模拟新浪登录 2016

    想学习一下网络爬虫,涉及到模拟登录,查阅了一番资料以后发现大部分都有点过时了,就使用前辈们给的经验,Firefox抓包调试,采用httpclient模拟了一下新浪登录.不对之处多多包涵.需要的可以用浏 ...

  4. Java模拟新浪和腾迅自动登录并发送微博(2013年3月更新可用)

    1.准备工作 只是登录无需申请新浪和腾迅的开发者账号,如果需要发送微博功能,需要申请一个新浪和腾迅的开发者账号,并添加一个测试应用. 过程请参考官方帮助文档,申请地址:新浪:http://open.w ...

  5. 飞翔的小鸟--Java小游戏实战(代码完整)

    一.写在前面: <飞扬的小鸟>是一款曾经比较火热的小游戏,本文可以带你从零开始,一步一步的开发出这款小游戏. 语言 Java 工具 IntelliJ IDEA,JDK 16 准备工作 创 ...

  6. java 学习 新浪微群

    java 新浪微群    http://q.weibo.com/803436/invitation=11mQrw0-1f2c0?source=weiqun_notice_app_18

  7. 2022年新浪股票接口更新需要加Referer才能获取数据, java获取新浪股票数据 http://hq.sinajs.cn

    新浪股票 2022年更新后 java获取数据 引用 <dependency><groupId>cn.hutool</groupId><artifactId> ...

  8. java 利用新浪天气API获取天气预报

    新浪为我们提供了天气预报获取接口API  http://php.weather.sina.com.cn/xml.php?city=武汉&password=DJOYnieT8234jlsK& ...

  9. html注册新浪邮箱代码,JS仿新浪邮箱点击联系人添加Email地址

    新浪邮箱添加功能 var ev={}; var flyDiv="bxAddrFly"; var inceptDiv="SendAddress"; var add ...

  10. java 的新浪oauth_新浪微博OAuth授权的Java实现

    一.OAuth协议简介 OAuth授权在各社交网站中广泛使用,该协议使用户不需要直接向第三方应用提供用户名及密码,并且使一个账户在多个网站中使用成为可能,OAuth协议的细节描述可参考其官方网站:ht ...

最新文章

  1. 如何把照片压缩到20k一下_如何将图像压缩10倍?阿里工程师有个大胆的想法!...
  2. 影响中国发展的七大垂直搜索引擎
  3. java arrays.equals_Java Arrays类的常见使用
  4. 看图说cnblogs-强大的SEO功能【有实例】
  5. Job中的Task是如何调度的
  6. CTSC2017 APIO2017 THUSC2017 游记
  7. Unity Application Block 发布
  8. 请教大家一个问题,有关于数据库的设计
  9. Android滑动返回上一级界面
  10. 我所理解的Reed solomon 算法
  11. vue鼠标经过效果实现
  12. 人像抠图——基于深度学习一键去除视频背景
  13. Cass符号填充设置
  14. 网络错误 —未连接到互联网
  15. 【Windows】win7虚拟机安装VMware Tools
  16. 常用算法简要总结(C语言)
  17. iOS WIFI 相关
  18. 工资3000,靠国际版抖音TikTok月入2W+:这个风口真的很赚钱!
  19. STM32C8T6编码器电机测速与arduino光电模块测速
  20. 总结高精度定位难点与解决办法

热门文章

  1. word电子签名工具_电子签名是您不会想到的必备工具的5个理由
  2. Java基础知识总结(一)创建和销毁对象
  3. linux系统gaussian09,高斯(Gaussian)软件linux下安装
  4. 红帽(redhat linux) 初级认证(RHCSA)考点详解
  5. HTML 制作简单的个人简历
  6. html读写txt文件,JS读写文本文件示例代码
  7. vlfeat工具包的MATLAB安装
  8. 之前做设计收集的部分网站
  9. postSQL安装和GIS数据导入
  10. awvs12 Server Exception_使用WebSocket搭建服务器server