用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)
先来看我们的web.xml文件,如下
1 <!DOCTYPE web-app PUBLIC 2 "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN" 3 "http://java.sun.com/dtd/web-app_2_3.dtd" > 4 5 <web-app> 6 <display-name>MySinaSpider</display-name> 7 <listener> 8 <listener-class>main.java.sina.spider.StartSpiderLisenter</listener-class> 9 </listener> 10 </web-app>
这样的配置当启动tomcat的时候,就会运行爬虫,然后再看我们的StartSpiderLisenter类,如下
1 package main.java.sina.spider; 2 3 import javax.servlet.ServletContextEvent; 4 import javax.servlet.ServletContextListener; 5 import main.java.sina.bean.info.LoginInfo; 6 import main.java.sina.utils.Constant; 7 8 public class StartSpiderLisenter implements ServletContextListener{ 9 10 public void contextDestroyed(ServletContextEvent arg0) { 11 12 } 13 14 public void contextInitialized(ServletContextEvent arg0) { 15 Constant.personalHomePage = "http://weibo.com/zhaoyao2012/home"; //填写你自己的新浪微博个人主页 16 LoginInfo.username = "***"; //填写你的新浪微博用户名 18 LoginInfo.password = "***"; //填写你的新浪微博密码 19 Constant.enableProxy = false; //是否使用代理 20 Spider.start(); 21 } 22 23 }
很明显,StartSpiderLisenter类实现了ServletContextListener这个接口,因此必须实现它的两个方法:contextInitialized和contextDestroyed,它们分别在Web应用初始化和销毁的时候被容器调用。我们看到,在contextInitialized这个初始化上下文的方法中调用了Spider.start()方法。那么我们来看看Spider这个类,如下:
1 package main.java.sina.spider; 2 3 import java.io.IOException; 4 import java.util.regex.Matcher; 5 import java.util.regex.Pattern; 7 import org.quartz.JobBuilder; 8 import org.quartz.JobDetail; 9 import org.quartz.Scheduler; 10 import org.quartz.SchedulerException; 11 import org.quartz.SchedulerFactory; 12 import org.quartz.SimpleScheduleBuilder; 13 import org.quartz.SimpleTrigger; 14 import org.quartz.TriggerBuilder; 15 import org.quartz.impl.StdSchedulerFactory; 17 import main.java.sina.bean.info.LoginInfo; 18 import main.java.sina.httpclient.LoginSina; 19 import main.java.sina.httpclient.SpiderSina; 20 import main.java.sina.job.KeywordSearchJob; 21 import main.java.sina.utils.Constant; 22 import main.java.sina.utils.HttpHelper; 23 import main.java.test.SpiderTest; 24 25 public class Spider { 26 27 public static void main(String[] args) { 28 29 Constant.personalHomePage = "****"; 30 LoginInfo.username = "****"; 31 LoginInfo.password = "****"; 32 Constant.enableProxy = false; 33 Constant.hourbefore = 0; //这个参数用于设置时差 34 start(); 35 36 } 37 public static void start() { 38 39 final SchedulerFactory factory = new StdSchedulerFactory(); 40 try { 41 Scheduler scheduler = factory.getScheduler(); 42 JobDetail jobDetail = JobBuilder.newJob(KeywordSearchJob.class) 43 .withIdentity("keywordSearch", "weibo").build(); 44 SimpleTrigger trigger = TriggerBuilder.newTrigger() 45 .withIdentity("keywordSearch", "weibo") 46 .withSchedule(SimpleScheduleBuilder.repeatHourlyForever()) 47 .build(); 48 scheduler.scheduleJob(jobDetail, trigger); 49 scheduler.start(); 50 } catch (SchedulerException e) { 51 e.printStackTrace(); 52 } 53 } 54 55 public static SpiderSina createSpider() { 56 LoginSina ls = new LoginSina(LoginInfo.username, LoginInfo.password); 57 ls.dologinSina(); 58 ls.redirect(); 59 SpiderSina spider = new SpiderSina(ls); 60 61 return spider; 62 } 63 64 public static void sendMidsofDays(SpiderSina spider,String keyword, String fromdate, 65 String todate) { 66 67 try { 68 
String midsString = ""; 69 for (int i = 1; i <= 50; i++) { 70 String htmlContent = spider 71 .search(keyword, i, fromdate, todate); 72 if (htmlContent.contains("noresult_support")) { 73 break; 74 } 75 System.out.println(i); 76 Pattern pattern = Pattern.compile("<div mid=\"([0-9]*)\""); 77 78 String start = "\"pid\":\"pl_weibo_direct\""; 79 try { 80 htmlContent = htmlContent.substring(htmlContent 81 .indexOf(start)); 82 } catch (Exception e) { 83 htmlContent = htmlContent.substring(1); 84 } 85 htmlContent = htmlContent.replace("\\\"", "\""); 86 htmlContent = htmlContent.replace("\\/", "/"); 87 Matcher matcher = pattern.matcher(htmlContent); 88 while (matcher.find()) { 89 System.out.println(matcher.group(1)); 90 midsString += matcher.group(1) + ","; 91 } 92 if (i == 37) { 93 try { 94 Thread.sleep(1000 * 60 * 30); 95 } catch (InterruptedException e) { 96 e.printStackTrace(); 97 } 98 } 99 } 100 System.out.println(midsString); 101 HttpHelper.getLiveData(midsString, Constant.CommentUrl); 102 } catch (IOException e) { 103 e.printStackTrace(); 104 } 105 106 } 107 }
我们在Spider.start()方法中,看到了作业KeywordSearchJob.class,那么我们来看看这个KeywordSearchJob类的实现,如下:
1 package main.java.sina.job; 2 3 import org.quartz.Job; 4 import org.quartz.JobExecutionContext; 5 import org.quartz.JobExecutionException; 6 import main.java.sina.httpclient.SpiderSina; 7 import main.java.sina.spider.Spider; 8 import main.java.sina.utils.Constant; 9 import main.java.sina.utils.Utils; 10 11 public class KeywordSearchJob implements Job { 12 13 public void execute(JobExecutionContext arg0) throws JobExecutionException { 14 15 Constant.enableProxy = false; //我的爬虫中没有使用代理,故值设为false. 16 String keyword = "%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6";//被编码后的关键字 17 String datehour = Utils.getDateOfSpecifiedPreHour(Constant.hourbefore);//这个工具类实现了时差格式的转换 18 SpiderSina spider = Spider.createSpider(); 19 spider.forwardToWeiboPage(); 20 Spider.sendMidsofDays(spider,keyword,datehour,datehour); 21 } 22 23 }
接下来,我们看几个工具类的实现:首先来看下Utils.java这个类,如下:它实现了日期的格式的一些转换
1 package main.java.sina.utils; 2 3 import java.io.BufferedReader; 4 import java.io.BufferedWriter; 5 import java.io.File; 6 import java.io.FileInputStream; 7 import java.io.FileNotFoundException; 8 import java.io.FileOutputStream; 9 import java.io.FileWriter; 10 import java.io.IOException; 11 import java.io.InputStream; 12 import java.io.InputStreamReader; 13 import java.io.StringReader; 14 import java.io.UnsupportedEncodingException; 15 import java.text.ParseException; 16 import java.text.SimpleDateFormat; 17 import java.util.Calendar; 18 import java.util.Date; 19 import java.util.Properties; 20 21 import org.htmlparser.Parser; 22 import org.htmlparser.lexer.Lexer; 23 import org.htmlparser.lexer.Page; 24 import org.htmlparser.util.DefaultParserFeedback; 25 // I/O操作类 26 public class Utils { 27 28 public static Date getDateFromString(String dtext,Date fileCreateDate) { 29 Date date=null; 30 int y,mm,se; 31 Calendar c = Calendar.getInstance(); 32 c.setTime(fileCreateDate); 33 y = c.get(Calendar.YEAR); //年 34 //d = c.get(Calendar.DAY_OF_MONTH); //日 35 mm = c.get(Calendar.MINUTE); //分 36 se = c.get(Calendar.SECOND);//秒 37 if(dtext.contains("秒前")){ 38 int end=0; 39 for(int i=0;i<dtext.length();i++){ 40 if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){ 41 end++; 42 }else{ 43 break; 44 } 45 } 46 dtext=dtext.substring(0,end); 47 int second=Integer.parseInt(dtext); 48 c.set(Calendar.SECOND, se-second); 49 date=c.getTime(); 50 } 51 else if(dtext.contains("分钟前")){ 52 int end=0; 53 for(int i=0;i<dtext.length();i++){ 54 if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){ 55 end++; 56 }else{ 57 break; 58 } 59 } 60 dtext=dtext.substring(0,end); 61 int minute=Integer.parseInt(dtext); 62 c.set(Calendar.MINUTE, mm-minute); 63 date=c.getTime(); 64 }else if(dtext.contains("今天")){ 65 dtext=dtext.replace("今天 ", "").trim(); 66 String ss[]=dtext.split(":"); 67 if(ss!=null && ss.length==2){ 68 c.set(Calendar.HOUR_OF_DAY, Integer.parseInt(ss[0])); 69 c.set(Calendar.MINUTE, 
Integer.parseInt(ss[1])); 70 date=c.getTime(); 71 } 72 }else if(dtext.contains("月")){ 73 dtext=y+"年".concat(dtext); 74 SimpleDateFormat sf=new SimpleDateFormat("yyyy年MM月dd日 HH:mm"); 75 try { 76 date=sf.parse(dtext); 77 } catch (ParseException e) { 78 e.printStackTrace(); 79 } 80 }else if(dtext.contains("-")){ 81 SimpleDateFormat sf=new SimpleDateFormat("yyyy-MM-dd HH:mm"); 82 try { 83 date=sf.parse(dtext); 84 } catch (ParseException e) { 85 e.printStackTrace(); 86 } 87 } 88 return date; 89 } 90 public static void writeFileFromStream(String filename,InputStream in){ 91 if(filename==null || filename.trim().length()==0) 92 return; 93 File file=new File(filename); 94 if(!file.exists()){ 95 try { 96 file.createNewFile(); 97 } catch (IOException e) { 98 e.printStackTrace(); 99 } 100 } 101 FileOutputStream fou=null; 102 try { 103 fou = new FileOutputStream(file); 104 byte []buffer=new byte[1024*4]; 105 int len=-1; 106 while((len=in.read(buffer))!=-1){ 107 fou.write(buffer,0,len); 108 } 109 } catch (FileNotFoundException e) { 110 e.printStackTrace(); 111 } catch (IOException e) { 112 e.printStackTrace(); 113 }finally{ 114 if(in!=null) 115 try { 116 in.close(); 117 } catch (IOException e) { 118 e.printStackTrace(); 119 } 120 if(fou!=null) 121 try { 122 fou.close(); 123 } catch (IOException e) { 124 e.printStackTrace(); 125 } 126 } 127 } 128 public static void writeFileFromString(String filename,String str){ 129 if(filename==null || filename.trim().length()==0) 130 filename="tmp.txt"; 131 File file=new File(filename); 132 if(!file.exists()){ 133 try { 134 file.createNewFile(); 135 } catch (IOException e) { 136 e.printStackTrace(); 137 } 138 } 139 BufferedWriter writer=null; 140 BufferedReader reader=null; 141 try { 142 writer=new BufferedWriter(new FileWriter(file)); 143 reader=new BufferedReader(new StringReader(str)); 144 String tmp=null; 145 StringBuffer buffer=new StringBuffer(); 146 while((tmp=reader.readLine())!=null) 147 buffer.append(tmp+"\n"); 148 
writer.write(buffer.toString()); 149 150 } catch (IOException e) { 151 e.printStackTrace(); 152 }finally{ 153 try { 154 reader.close(); 155 writer.close(); 156 } catch (IOException e) { 157 e.printStackTrace(); 158 } 159 } 160 161 } 162 163 164 165 public static String getStringFromStream(InputStream in) { 166 BufferedReader reader=null; 167 reader = new BufferedReader(new InputStreamReader(in)); 168 StringBuffer buffer=new StringBuffer(); 169 String str=null; 170 try{ 171 while((str=reader.readLine())!=null){ 172 buffer.append(str+"\n"); 173 } 174 reader.close(); 175 }catch(Exception ex){ 176 ex.printStackTrace(); 177 } 178 try { 179 return new String(buffer.toString().getBytes(),"utf-8"); 180 } catch (UnsupportedEncodingException e) { 181 e.printStackTrace(); 182 return "error:"+e.getMessage(); 183 } 184 } 185 //得到数据库的配置信息 186 public static Properties getDBconfig(){ 187 Properties properties=new Properties(); 188 InputStream in = null; 189 try { 190 in = new FileInputStream(new File("config/dbconfig.ini")); 191 properties.load(in); 192 } catch (FileNotFoundException e) { 193 e.printStackTrace(); 194 } catch (IOException e) { 195 e.printStackTrace(); 196 }finally{ 197 if(in!=null) 198 try { 199 in.close(); 200 } catch (IOException e) { 201 e.printStackTrace(); 202 } 203 } 204 return properties; 205 } 206 207 public static Parser createParser(String inputHTML) { 208 Lexer mLexer = new Lexer(new Page(inputHTML)); 209 Parser parser = new Parser(mLexer, new DefaultParserFeedback( 210 DefaultParserFeedback.QUIET)); 211 return parser; 212 } 213 214 public static String getDateOfSpecifiedPreHour(int hourNum){ 215 SimpleDateFormat sdFormat = new SimpleDateFormat("yyyy-MM-dd-HH"); 216 Date date = new Date(); 217 System.out.println("date -" +date + " " + hourNum); 218 Calendar calendar = Calendar.getInstance(); 219 calendar.setTime(date); 220 calendar.add(Calendar.HOUR_OF_DAY, -1 * hourNum); 221 System.out.println("date2 -" +sdFormat.format(calendar.getTime())); 222 return 
sdFormat.format(calendar.getTime()); 223 } 224 }
再来看一下ThreadPool.java这个类,如下:这是一个线程工具类,定义了线程的一些动作
1 package main.java.sina.utils; 2 3 import java.util.List; 4 import java.util.concurrent.ExecutorService; 5 import java.util.concurrent.Executors; 6 7 /** 9 * 线程池工具类 10 */ 11 public class ThreadPool { 12 private ExecutorService service; 13 private List<Thread> threadList; 14 15 public ThreadPool(int limite, List<Thread> threadList) { 16 this.service = Executors.newFixedThreadPool(limite); 17 this.threadList = threadList; 18 } 19 20 public void execute() { 21 if(threadList==null ||threadList.size()==0) return ; 22 for (int index = 0; index < threadList.size(); index++) { 23 Thread t=threadList.get(index); 24 service.execute(t); 25 } 26 } 27 public boolean isTerminated(){ 28 return service.isTerminated(); 29 } 30 31 public void shutDown() { 32 service.shutdown(); 33 } 34 }
然后再看一下Constant.java这个常量类,如下:常量类把系统中用到的一些常量写在这里,以后项目维护需要更改的时候,方便统一修改
package main.java.sina.utils;/*** @ClassName: Constant * */ public class Constant {public static boolean enableProxy = false;public static String liveCommentUrl = "http://localhost:8080/social-hub-connector/loadingLiveData";public static String CommentUrl = "http://localhost:8080/social-hub-connector/loadingData";public static String personalHomePage = "******";public static String weiboUsername = "*********";public static String weiboPassword = "*********";public static int hourbefore = 0; }
再来看一下Base64Encoder.java类,它对一些字段进行了编码的类,如下:
1 package main.java.sina.utils; 2 3 /** 4 * 5 */ 6 public class Base64Encoder { 7 private static final char last2byte = (char) Integer.parseInt("00000011", 2); 8 private static final char last4byte = (char) Integer.parseInt("00001111", 2); 9 private static final char last6byte = (char) Integer.parseInt("00111111", 2); 10 private static final char lead6byte = (char) Integer.parseInt("11111100", 2); 11 private static final char lead4byte = (char) Integer.parseInt("11110000", 2); 12 private static final char lead2byte = (char) Integer.parseInt("11000000", 2); 13 private static final char[] encodeTable = new char[]{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; 14 15 public Base64Encoder() { 16 } 17 public static String encode(byte[] from) { 18 StringBuffer to = new StringBuffer((int) (from.length * 1.34) + 3); 19 int num = 0; 20 char currentByte = 0; 21 for (int i = 0; i < from.length; i++) { 22 num = num % 8; 23 while (num < 8) { 24 switch (num) { 25 case 0: 26 currentByte = (char) (from[i] & lead6byte); 27 currentByte = (char) (currentByte >>> 2); 28 break; 29 case 2: 30 currentByte = (char) (from[i] & last6byte); 31 break; 32 case 4: 33 currentByte = (char) (from[i] & last4byte); 34 currentByte = (char) (currentByte << 2); 35 if ((i + 1) < from.length) { 36 currentByte |= (from[i + 1] & lead2byte) >>> 6; 37 } 38 break; 39 case 6: 40 currentByte = (char) (from[i] & last2byte); 41 currentByte = (char) (currentByte << 4); 42 if ((i + 1) < from.length) { 43 currentByte |= (from[i + 1] & lead4byte) >>> 4; 44 } 45 break; 46 } 47 to.append(encodeTable[currentByte]); 48 num += 6; 49 } 50 } 51 if (to.length() % 4 != 0) { 52 for (int i = 4 - to.length() % 4; i > 0; i--) { 53 to.append("="); 54 } 55 } 56 
return to.toString(); 57 } 58 }
接下来的EncodeSuAndSp.java和EncodeUtils.java这两个类,是针对新浪的一些特殊加密规则编写的方法,在拼接最终的URL的时候会用到,例如根据servertime和nonce两个参数对密码进行加密,生成一串字符串:
package main.java.sina.utils;

import java.io.File;
import java.io.FileReader;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;

/**
 * Computes the encoded user name and the encrypted password for the login
 * request by calling functions defined in the script "js/encrypt.js".
 */
public class EncodeSuAndSp {
    static ScriptEngineManager mgr = new ScriptEngineManager();
    static ScriptEngine engine = mgr.getEngineByExtension("js");
    static Invocable inv = (Invocable) engine;

    /**
     * Evaluates "js/encrypt.js" in the shared engine. The reader is always
     * closed, also when evaluation fails.
     */
    private static void loadEncryptScript() throws Exception {
        FileReader script = new FileReader(new File("js/encrypt.js"));
        try {
            engine.eval(script);
        } finally {
            script.close();
        }
    }

    /**
     * Computes {@code hex_sha1(hex_sha1(hex_sha1(password)) + servertime + nonce)}
     * with the script's {@code hex_sha1} function.
     *
     * @param password   plain password
     * @param servertime server time value appended before the last hash
     * @param nonce      one-time string appended before the last hash
     * @return the result of the last completed step; "" when the script could
     *         not be loaded (the error is printed)
     */
    public static String getEncryptedP(String password, String servertime, String nonce) {
        String value1 = "";
        try {
            loadEncryptScript();
            value1 = String.valueOf(inv.invokeFunction("hex_sha1", password));
            value1 = String.valueOf(inv.invokeFunction("hex_sha1", value1));
            value1 = String.valueOf(inv.invokeFunction("hex_sha1", value1 + servertime + nonce));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return value1;
    }

    /**
     * Encodes the user name with the script's {@code encode} function and
     * prints the result.
     *
     * @param username plain user name
     * @return the encoded user name; "" on failure (the error is printed)
     */
    public static String getEncodedUsername(String username) {
        String value1 = "";
        try {
            loadEncryptScript();
            value1 = String.valueOf(inv.invokeFunction("encode", username));
            System.out.println(value1);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return value1;
    }
}
package main.java.sina.utils; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; public class EncodeUtils {public static final String encodeURL(String str,String enc) {try {return URLEncoder.encode(str, enc);} catch (UnsupportedEncodingException e) {throw new RuntimeException(e);}}public static final String decodeURL(String str,String enc) {try {return URLDecoder.decode(str, enc);} catch (UnsupportedEncodingException e) {throw new RuntimeException(e);}}public static String unicdoeToGB2312(String str) {String res = null;if(str==null ){return "";}StringBuffer sb = new StringBuffer();try {while (str.length() > 0) {if (str.startsWith("\\u")) {int x = 0;try{x = Integer.parseInt(str.substring(2, 6), 16);}catch(Exception ex){x= 0;}sb.append((char) x);str = str.substring(6);} else {sb.append(str.charAt(0));str = str.substring(1);}}res = sb.toString();} catch (Exception e) {e.printStackTrace(System.err);}res=res.replaceAll("\\\\r", "").replaceAll("\\\\n", "").replaceAll("\\\\t", "").replaceAll(" ", "").replaceAll(">", "").replaceAll("\\[", "\"").replaceAll("\\]", "\"");return res;}public static String unicodeTogb2312(String str) {String res = null;StringBuffer sb = new StringBuffer();try {while (str.length() > 0) {if (str.startsWith("\\u")) {int x = Integer.parseInt(str.substring(2, 6), 16);sb.append((char) x);str = str.substring(6);} else {sb.append(str.charAt(0));str = str.substring(1);}}res = sb.toString();} catch (Exception e) {e.printStackTrace(System.err);}res=res.replaceAll("\\\\r", "").replaceAll("\\\\t", "").replaceAll(" ", "").replaceAll(">", "").replaceAll("\\\\n", "");return res;} }
HttpUtils.java这个类很关键,它封装了doGet()和doPost()方法,如下:
package main.java.sina.utils;import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.HttpVersion; import org.apache.http.NameValuePair; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.conn.params.ConnRoutePNames; import org.apache.http.conn.params.ConnRouteParams; import org.apache.http.cookie.Cookie; import org.apache.http.entity.InputStreamEntity; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.message.BasicNameValuePair; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpParams; import org.apache.http.params.HttpProtocolParams; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HTTP; import org.apache.http.protocol.HttpContext;/*** http操作相关的类*/ public class HttpUtils {/** params :* url: 地址* headers请求头部信息* return : httpresponse响应*/public static HttpResponse doGet(String url,Map<String,String> headers){HttpClient client=createHttpClient();HttpGet getMethod=new HttpGet(url);HttpResponse response=null;HttpContext httpContext = new BasicHttpContext();try {if(headers!=null && headers.keySet().size()>0){for(String 
key:headers.keySet()){getMethod.addHeader(key, headers.get(key));}} response=client.execute(getMethod);HttpUriRequest realRequest = (HttpUriRequest)httpContext.getAttribute(ExecutionContext.HTTP_REQUEST);System.out.println(realRequest.getURI());} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {String msg=e.getMessage();if(msg.contains("Truncated chunk")){System.out.println(e.getMessage() +" 数据获取不完整,需要重新获取。");}else{System.out.println(e.getMessage() +" 连接被拒绝,需要降低爬取频率。");}} catch(Exception e){}System.out.println(response);return response; }/** params :* url: 地址* headers:请求头部信息* params:post的请求数据* return : httpresponse响应*/public static HttpResponse doPost(String url,Map<String,String> headers,Map<String,String> params){HttpClient client=createHttpClient();HttpPost postMethod=new HttpPost(url);HttpResponse response=null;try {if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){postMethod.addHeader(key, headers.get(key));}} List<NameValuePair> p=null;if(params!=null && params.keySet().size()>0){p=new ArrayList<NameValuePair>();for(String key:params.keySet()){p.add(new BasicNameValuePair(key,params.get(key)));}}if(p!=null)postMethod.setEntity(new UrlEncodedFormEntity(p,HTTP.UTF_8));response=client.execute(postMethod);} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} return response; }//上传一个文件public static HttpResponse doPost(String url,Map<String,String> headers,String fileName){HttpClient client=createHttpClient();HttpPost postMethod=new HttpPost(url);String boundary = "";HttpResponse response=null;try {if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){postMethod.addHeader(key, headers.get(key));if(key.equals("Content-Type")){String tmp=headers.get(key);boundary=tmp.substring(tmp.indexOf("=")+1);}}} File file=new File(fileName);InputStream in=new FileInputStream(file);StringBuffer buffer=new 
StringBuffer();buffer.append(boundary).append("\n").append("Content-Disposition: form-data; name=\"pic1\"; filename=\""+file.getName()).append("\"\n").append("Content-Type: image/pjpeg").append("\n").append("\n");System.out.println(buffer.toString());String tmpstr=Utils.getStringFromStream(in);tmpstr=Base64Encoder.encode(tmpstr.getBytes());buffer.append(tmpstr).append("\n");buffer.append(boundary+"--").append("\n");System.out.println(buffer.toString());in=new ByteArrayInputStream(buffer.toString().getBytes());InputStreamEntity ise=new InputStreamEntity(in,buffer.toString().getBytes().length); postMethod.setEntity(ise); response=client.execute(postMethod);} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} return response; }/** params :* httpresponse* return : 响应的头部信息*/public static List<Header> getReponseHeaders(HttpResponse response){List<Header> headers=null;Header[] hds=response.getAllHeaders();if(hds!=null && hds.length>0){headers=new ArrayList<Header>();for(int i=0;i<hds.length;i++){headers.add(hds[i]);}} return headers;}/** params :* headers:头部信息 * request:请求*/public static void setHeaders(Map<String,String> headers,HttpUriRequest request){if(headers!=null && headers.keySet().size()>0){for(String key:headers.keySet()){request.addHeader(key, headers.get(key)); }}}/** params :* httpresponse* return : 响应的cookies值*/public static List<Cookie> getResponseCookies(HttpResponse response){List<Cookie> cookies=null;Header[] hds=response.getAllHeaders();if(hds!=null && hds.length>0){for(int i=0;i<hds.length;i++){if(hds[i].getName().equalsIgnoreCase("Set-Cookie")){if(cookies==null){cookies=new ArrayList<Cookie>();} String cookiestring[]=hds[i].getValue().split(";");String ss[]=cookiestring[0].split("=",2);String cookiename=ss[0];String cookievalue=ss[1];Cookie cookie=new BasicClientCookie(cookiename,cookievalue);cookies.add(cookie);}}} return cookies;}/** params :* cookies数组* return : cookies数组组成的字符串*/public static 
String setCookie2String(List<Cookie> cookies){StringBuilder builder=null; if(cookies!=null && cookies.size()>0){builder=new StringBuilder();for(int j=0;j<cookies.size();j++){Cookie c=cookies.get(j);builder.append(c.getName()+"="+c.getValue());if(j!=cookies.size()-1)builder.append("; ");}return builder.toString();} return null;}/** 从响应中得到输入流*/public static InputStream getInputStreamFromResponse(HttpResponse response){if(response==null){return null;}HttpEntity entity=response.getEntity();InputStream in=null;try {in = entity.getContent();} catch (IllegalStateException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return in;}/** 从响应中得到字符串*/public static String getStringFromResponse(HttpResponse response){if(response==null){return null;}InputStream in=getInputStreamFromResponse(response);String responseText="";if(in!=null){responseText=Utils.getStringFromStream(in);}return responseText;}/*** 创建支持多线程并发连接的HTTPCLIENT*/private final static HttpClient createHttpClient() {String proxyHost = "web-proxy-sha.chn.hp.com";int proxyPort = 8080;HttpHost proxy = new HttpHost(proxyHost,proxyPort);HttpParams params = new BasicHttpParams();if(Constant.enableProxy){params.setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);}HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);HttpProtocolParams.setContentCharset(params, "UTF-8");ThreadSafeClientConnManager clientmanager = new ThreadSafeClientConnManager();clientmanager.setMaxTotal(20);HttpClient client = new DefaultHttpClient(clientmanager, params);//定义了环形重定向,定向到相同的路径是否被允许.client.getParams().setParameter("http.protocol.allow-circular-redirects", true); //定义了重定向的最大数量client.getParams().setParameter("http.protocol.max-redirects", 50);//定义了重定向是否应该自动处理client.getParams().setParameter("http.protocol.handle-redirects", false);return client;}/***加入代理的功能* @return HttpClient 对象*/public static HttpClient getDefaultHttpClientByProxy() {HttpClient httpclient =createHttpClient();String filePath = 
"proxy.properties";HttpHost proxy = null;Map<String, String> map = ReadIni.getDbini(filePath);if (map.size() == 0) {throw new RuntimeException("无可用代理");} else {Set<String> set = map.keySet();String[] array = (String[]) set.toArray(new String[set.size()]);Random r = new Random();int rnum = r.nextInt(array.length);String ip = array[rnum];String port = map.get(ip);proxy = new HttpHost(ip, Integer.parseInt(port));}httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,proxy);httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);return httpclient;} }
接下来看一个HttpHelper的辅助类,如下:
package main.java.sina.utils;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Helper that posts collected data to the connector service.
 */
public class HttpHelper {

    /**
     * Posts {@code requestData} as the form parameter "mids" to {@code url}
     * and prints the response body.
     *
     * @param requestData value of the "mids" parameter
     * @param url         address to post to
     * @return the response body as text
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public static String getLiveData(String requestData, String url)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod(url);
        postMethod.setParameter("mids", requestData);
        return executeAndRead(postMethod);
    }

    /**
     * Posts the form parameters "userid" and "hobbys" to the fixed
     * loadingHobby address and prints the response body.
     *
     * @param userid value of the "userid" parameter
     * @param hobbys value of the "hobbys" parameter
     * @return the response body as text
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public static String getHobbyData(String userid, String hobbys)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod("http://c0048925.itcs.hp.com:8080/connector/loadingHobby");
        postMethod.setParameter("userid", userid);
        postMethod.setParameter("hobbys", hobbys);
        return executeAndRead(postMethod);
    }

    /**
     * Executes the request with a new client, prints and returns the body.
     * The connection is released in all cases, also when the request fails.
     */
    private static String executeAndRead(PostMethod postMethod)
            throws HttpException, IOException {
        try {
            HttpClient httpClient = new HttpClient();
            httpClient.executeMethod(postMethod);
            String response = postMethod.getResponseBodyAsString();
            System.out.println(response);
            return response;
        } finally {
            postMethod.releaseConnection();
        }
    }
}
ReadIni.java类,在读文本文件中使用,如下:
package main.java.sina.utils;import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map;public class ReadIni {public static Map<String, String> getDbini(String file) {Map<String, String> map = new HashMap<String, String>();InputStreamReader isr = null;try{isr = new InputStreamReader(new FileInputStream(file));} catch (FileNotFoundException e1) {e1.printStackTrace();}BufferedReader br = new BufferedReader(isr);String s = null;try {s = br.readLine();while (s != null) {if (s.trim().length() > 0) {String[] s1 = getIni(s);map.put(s1[0], s1[1]);s = br.readLine();}}br.close();isr.close();} catch (Exception e) {e.printStackTrace();}return map;}public static String[] getIni(String str) {String[] temp = str.split("=");return temp;}}
然后,我们跳转到登录sina,来看一下loginSina这个类的实现:
package main.java.sina.httpclient;import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.math.BigInteger; import java.security.InvalidKeyException; import java.security.KeyFactory; import java.security.NoSuchAlgorithmException; import java.security.interfaces.RSAPublicKey; import java.security.spec.InvalidKeySpecException; import java.security.spec.RSAPublicKeySpec; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner;import javax.crypto.BadPaddingException; import javax.crypto.Cipher; import javax.crypto.IllegalBlockSizeException; import javax.crypto.NoSuchPaddingException;import org.apache.commons.codec.binary.Hex; import org.apache.commons.httpclient.params.HttpParams; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.cookie.Cookie; import org.springframework.core.io.ClassPathResource;import main.java.sina.json.msg.PreLoginResponseMessage; import main.java.sina.utils.Base64Encoder; import main.java.sina.utils.EncodeUtils; import main.java.sina.utils.HttpUtils; import main.java.sina.utils.JsonUtils; import main.java.sina.utils.Utils;public class LoginSina {private String username;private String password;private String rsakv;private String pubkey;//servertime和nonce都是在登录时需要使用的,用于post信息的加密private String servertime;//服务器的时间private String nonce;//一次性字符串private String userid;//用户微博IDprivate String pcid;//若需要输入验证码时用到private String userdomainname;//用于域名private String door;//验证码private Map<String,String> headers=null;private List<Cookie> cookies=null;public LoginSina(String username,String password){this.username=username;this.password=password;init();}public Map<String,String> getHeaders(){Map<String,String> hds=null;if(headers!=null && headers.keySet().size()>0){hds=new HashMap<String,String>();for(String 
key:headers.keySet()){hds.put(key,headers.get(key));}}return hds;}public List<Cookie> getCookies(){List<Cookie> cc=null;if(cookies!=null && cookies.size()>0){cc=new ArrayList<Cookie>();for(int i=0;i<cookies.size();i++){cc.add(cookies.get(i));}}return cc;}//登录微博public String dologinSina(){System.out.println("---do login, please hold on...---");String url="http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";//v1.3.17Map<String,String> headers=new HashMap<String,String>();Map<String,String> params=new HashMap<String,String>();/*HTTP协议中的headers:http://www.cnblogs.com/yuzhongwusan/archive/2011/10/20/2218954.html* */headers.put("Accept", "text/html, application/xhtml+xml, */*");headers.put("Referer", "http://login.sina.com.cn/member/my.php?entry=sso");headers.put("Accept-Language", "zh-cn");headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");headers.put("Host", "login.sina.com.cn");headers.put("Connection", "Keep-Alive");headers.put("Content-Type", "application/x-www-form-urlencoded");headers.put("Cache-Control", "no-cache");params.put("encoding", "UTF-8");params.put("entry", "weibo");params.put("from", "");params.put("prelt", "112");params.put("gateway", "1");params.put("nonce", nonce);params.put("pwencode", "rsa2");//wsseparams.put("returntype", "META");params.put("pagerefer", "");params.put("savestate", "7"); params.put("servertime", servertime);params.put("rsakv", rsakv);params.put("service", "miniblog");params.put("sp", getEncryptedP());params.put("ssosimplelogin", "1");params.put("su", getEncodedU());params.put("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack");params.put("useticket", "1");params.put("vsnf", "1");HttpResponse response=HttpUtils.doPost(url, headers, params);this.cookies=HttpUtils.getResponseCookies(response);this.headers=headers;String responseText=HttpUtils.getStringFromResponse(response);try {responseText=new 
String(responseText.getBytes(),"GBK");if(!responseText.contains("retcode=0")){downloadCheckImage();this.nonce=getnonce();Scanner s=new Scanner(System.in);if(responseText.contains("retcode=4049"))System.out.println("请输入验证码:");else if(responseText.contains("retcode=2070")){System.out.println("验证码不正确,请再次输入验证码:");}this.door=s.next();dologinSina();}} catch (UnsupportedEncodingException e) {e.printStackTrace();}System.out.println("Congratulations, you have login success!");return responseText;}//登录后重定向public String redirect(){String cookieValue=HttpUtils.setCookie2String(this.cookies);this.headers.clear();this.headers.put("Accept", "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");this.headers.put("Accept-Language", "zh-cn");this.headers.put("Connection", "Keep-Alive");this.headers.put("Host", "sina.com.cn");this.headers.put("Referer", "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)");this.headers.put("User", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");this.headers.put("Cookie", cookieValue);String ssosavestate=""; //SSO即Sina Sign-on,String ticket = "";for(Cookie c:this.cookies){if(c.getName().equals("ALF")){ssosavestate=c.getValue();}else if(c.getName().equals("tgc")){ticket=c.getValue();}}String url="http://weibo.com/ajaxlogin.php?" 
+"framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&" +"sudaref=weibo.com";HttpResponse response=HttpUtils.doGet(url, this.headers);response=HttpUtils.doGet(url, this.headers); String responseText=HttpUtils.getStringFromResponse(response);return responseText;}//生成一次性的字符串 6位 用于加密private String getnonce() {String x = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";String str = "";for (int i = 0; i < 6; i++) {str += x.charAt((int)Math.ceil(Math.random() * 1000000) % x.length());}return str;}//初始化:得到服务区的时间servertime和一次性字符串nonceprivate void init(){String url=compositeUrl();Map<String,String> headers=new HashMap<String,String>();headers.put("Accept", "*/*");headers.put("Referer", "http://weibo.com/");headers.put("Accept-Language", "zh-cn");headers.put("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");headers.put("Host", "login.sina.com.cn");headers.put("Connection", "Keep-Alive"); HttpResponse response=HttpUtils.doGet(url, headers);String responseText=HttpUtils.getStringFromResponse(response);int begin=responseText.indexOf("{");int end=responseText.lastIndexOf("}");responseText=responseText.substring(begin,end+1);PreLoginResponseMessage plrmsg =JsonUtils.jsontoPreLoginResponseMessage(responseText);this.nonce=plrmsg.getNonce();this.servertime=plrmsg.getServertime()+"";this.pubkey=plrmsg.getPubkey();this.rsakv=plrmsg.getRsakv();this.pcid=plrmsg.getPcid();}//下载验证码private void downloadCheckImage() {if(pcid==null) return;this.headers.remove("Content-Type");try {if(this.cookies != null){this.cookies.clear();}} catch (Exception e) {e.printStackTrace();}String cookieValue=HttpUtils.setCookie2String(this.cookies);this.headers.put("Cookie", cookieValue);String url="http://login.sina.com.cn/cgi/pin.php?r="+(long)(Math.random()*100000000)+"&s=0&p="+this.pcid;HttpResponse response=HttpUtils.doGet(url, headers);InputStream in=HttpUtils.getInputStreamFromResponse(response);try {//System.out.println(new 
ClassPathResource("checkImage.jpeg").getFile().getPath());Utils.writeFileFromStream(new ClassPathResource("checkImage.jpeg").getFile().getPath(), in);} catch (IOException e) {e.printStackTrace();}}//组合预登陆时的URLprivate String compositeUrl(){StringBuilder builder=new StringBuilder();builder.append("http://login.sina.com.cn/sso/prelogin.php?").append("entry=weibo&callback=sinaSSOController.preloginCallBack&").append("su="+getEncodedU()).append("&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_="+System.currentTimeMillis());return builder.toString();}//对用户名进行编码private String getEncodedU() {if(username!=null && username.length()>0){return Base64Encoder.encode(EncodeUtils.encodeURL(username,"utf-8").getBytes());}return "";}//对密码进行编码private String getEncryptedP(){ // return EncodeSuAndSp.getEncryptedP(password, servertime, nonce);String data=servertime+"\t"+nonce+"\n"+password;String spT=rsaCrypt(pubkey, "10001", data);return spT;}public static String rsaCrypt(String pubkey, String exponentHex, String pwd,String servertime,String nonce) {String data=servertime+"\t"+nonce+"\n"+pwd;return rsaCrypt(pubkey,exponentHex,data);} public static String rsaCrypt(String pubkey, String exponentHex, String messageg) {KeyFactory factory=null;try {factory = KeyFactory.getInstance("RSA");} catch (NoSuchAlgorithmException e1) {return "";}BigInteger publicExponent = new BigInteger(pubkey, 16); /* public exponent */BigInteger modulus = new BigInteger(exponentHex, 16); /* modulus */RSAPublicKeySpec spec = new RSAPublicKeySpec(publicExponent, modulus);RSAPublicKey pub=null;try {pub = (RSAPublicKey) factory.generatePublic(spec);} catch (InvalidKeySpecException e1) {return "";}Cipher enc=null;byte[] encryptedContentKey =null;try {enc = Cipher.getInstance("RSA");enc.init(Cipher.ENCRYPT_MODE, pub);encryptedContentKey = enc.doFinal(messageg.getBytes());} catch (NoSuchAlgorithmException e1) {System.out.println(e1.getMessage());return "";} catch (NoSuchPaddingException e1) 
{System.out.println(e1.getMessage());return "";} catch (InvalidKeyException e1) {System.out.println(e1.getMessage());return "";} catch (IllegalBlockSizeException e1) {System.out.println(e1.getMessage());return "";} catch (BadPaddingException e1) {System.out.println(e1.getMessage());return "";} return new String(Hex.encodeHex(encryptedContentKey));}public void setUserid(String userid) {this.userid = userid;}public String getUserid() {return userid;}public void setUserdomainname(String userdomainname) {this.userdomainname = userdomainname;}public String getUserdomainname() {return userdomainname;}}
SpiderSina类如下:
1 package main.java.sina.httpclient; 2 import java.util.HashMap; 3 import java.util.List; 4 import java.util.Map; 5 6 import org.apache.http.HttpResponse; 7 import org.apache.http.cookie.Cookie; 8 9 import main.java.sina.utils.Constant; 10 import main.java.sina.utils.EncodeUtils; 11 import main.java.sina.utils.HttpUtils; 12 import main.java.sina.utils.Utils; 13 14 public class SpiderSina { 15 private LoginSina ls; 16 private Map<String,String> headers; 17 private final int ADDFOLLOWING =1; 18 private final int CANCELFOLLOWING =2; 19 public SpiderSina(LoginSina ls){ 20 this.ls=ls; 21 this.headers=new HashMap<String,String>(); 22 headers.put("Accept", "text/html, application/xhtml+xml, */*"); 23 headers.put("Accept-Language", "zh-cn"); 24 headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN"); 25 headers.put("Connection", "Keep-Alive"); 26 headers.put("Cache-Control", "no-cache"); 27 String cookieValue=HttpUtils.setCookie2String(ls.getCookies()); 28 headers.put("Cookie", cookieValue); 29 } 36 public String getGroupCategory(){ 37 String url="http://q.weibo.com/"; 38 this.headers.put("Host", "q.weibo.com"); 39 HttpResponse response=HttpUtils.doGet(url, headers); 40 String responseText=HttpUtils.getStringFromResponse(response); 41 responseText=EncodeUtils.unicdoeToGB2312(responseText); 42 return responseText; 43 } 44 public String search(String keyword, int pageNo){ 47 String url="http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&page="+pageNo; 48 String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=shy_annan@126.com; myuid=5439352084; wvr=6; un=sm2014121904@126.com; _s_tentry=developer.51cto.com; SWB=usrmdinst_14; SUS=SID-5438576807-1419173757-GZ-lrze7-d8e1e3f082b428c12412c8ba30f0a6de; 
SUE=es%3D4cdfdd5d5f0f75141c092b32f89525a2%26ev%3Dv1%26es2%3D469e50c869315e57efeec3012c3bb6a8%26rs0%3DoWdG36CQ33LUEtKTvGn907Zy1mwFETvSVJsxeHEiaMPcKDB7pFxg596a2pLhFLJfQmswf4AvXYAkzTfemrYgWrz%252BQPustEA2wLNYufYpAZqFsGWanhTBq6elzB2yoZp41xcpy1WwXn1CuvzIzzEYpuILjHahkmJDQDQy6KaxlbA%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1419173757%26et%3D1419260157%26d%3Dc909%26i%3Da6de%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D27%26st%3D0%26uid%3D5438576807%26name%3Dsm2014121904%2540126.com%26nick%3DSocialMedia%25E5%259B%259B%25E5%25A8%2583%26fmp%3D%26lcp%3D; SUB=_2A255kq8tDeTxGeNK6FoU9yjEyzuIHXVa6DVlrDV8PUNbvtBeLW3TkW-bMoi0G_bBfpbS3TMqcXg6zDWFLA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhGThsH46uNrx1VY0ApV0SR5JpX5KMt; ALF=1450709756; SSOLoginState=1419173757; WBStore=bc5ad8450c3f8a48|undefined; Apache=1027467835228.8901.1419173761694; ULV=1419173761704:6:6:1:1027467835228.8901.1419173761694:1418797827169; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; ULOGIN_IMG=14192385783486"; 49 headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 50 //headers.put("Accept-Encoding", "gzip, deflate, sdch"); 51 headers.put("Accept-Language", "zh-CN"); 52 headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 53 headers.put("Connection", "Keep-Alive"); 54 headers.put("Cache-Control", "max-age=0"); 55 headers.put("Referer", "http://login.sina.com.cn/sso/login.php?url=http%3A%2F%2Fs.weibo.com%2Fweibo%2F%2525E6%252583%2525A0%2525E6%252599%2525AE%26page%3D2&_rand=1419173756.6387&gateway=1&service=weibo&entry=miniblog&useticket=1&returntype=META"); 56 headers.put("Cookie", cookieValue); 57 this.headers.put("Host", "s.weibo.com"); 58 HttpResponse response=HttpUtils.doGet(url, headers); 59 String responseText=HttpUtils.getStringFromResponse(response); 60 responseText=EncodeUtils.unicdoeToGB2312(responseText); 61 62 63 return responseText; 64 } 65 66 public String 
searchCommentsByUid(String uid){ 67 68 String url="http://www.weibo.com/u/"+uid; 69 String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; myuid=2035860051; wvr=6; YF-Ugrow-G0=ad06784f6deda07eea88e095402e4243; SSOLoginState=1423150079; YF-V5-G0=32eb5467e9bfc8b60c2d771056535ac5; _s_tentry=www.weibo.com; Apache=6264929557219.147.1423150103832; ULV=1423150103842:18:2:2:6264929557219.147.1423150103832:1422769721265; ULOGIN_IMG=1423233797946; YF-Page-G0=82cdcdfb16327a659fbb60cc9368fb19; SUS=SID-2035860051-1423286223-GZ-jdkh4-c8ea11de0a42151313986e52f9aa6017; SUE=es%3D8701ff5aca59244ff1ff263cf985bee6%26ev%3Dv1%26es2%3D7995c9eb7455697c09fac4f7486e14eb%26rs0%3DTyXXIRjcEw%252BeS5PaVSM%252FhQjc2JGhKBOe3uFTgShiIUAbPFI2eKtrgxM2wIi9A1xndiTFFM72zY%252FDKYFXONrgkao5cRo%252FHkydV%252FnaQjNmXoeESu5gi6Iq0aX883NhGR0utBVNZb5XaIG3X6HMMfBJC%252B7pnVHogEo8eD6cx8nzN5c%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1423286223%26et%3D1423372623%26d%3Dc909%26i%3D6017%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A2550e-fDeTxGeRO6FcZ9i7Mzj2IHXVap0ZXrDV8PUNbvtBuLWnTkW-gBGVORTA7J_lSZzAqzW6E50JjBQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; SUHB=0M20OGRPiOKzyc; ALF=1454822222; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn"; 70 headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 71 headers.put("Accept-Language", "zh-CN"); 72 headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 73 headers.put("Connection", "Keep-Alive"); 74 headers.put("Cache-Control", "max-age=0"); 75 headers.put("Cookie", cookieValue); 76 this.headers.put("Host", "www.weibo.com"); 77 HttpResponse response=HttpUtils.doGet(url, headers); 78 String responseText=HttpUtils.getStringFromResponse(response); 79 
responseText=EncodeUtils.unicdoeToGB2312(responseText); 82 return responseText; 83 } 85 //爬虫根据关键字,查询时间断,和查询页数 来得到htmlContent 86 public String search(String keyword, int pageNo, String fromdate,String todate){ 87 StringBuffer stringBuffer = new StringBuffer(200); 93 stringBuffer.append("http://s.weibo.com/weibo/"+ keyword +"&page="); 94 stringBuffer.append(pageNo); 95 stringBuffer.append("&typeall=1&suball=1×cope=custom:"); 96 stringBuffer.append(fromdate); 97 stringBuffer.append(":"); 98 stringBuffer.append(todate); 99 stringBuffer.append("&Refer=g"); 104 String url = stringBuffer.toString(); 105 String cookieValue = headers.get("Cookie"); 106 headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 107 //headers.put("Accept-Encoding", "gzip, deflate, sdch"); 108 headers.put("Accept-Language", "zh-CN"); 109 headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 110 headers.put("Connection", "Keep-Alive"); 111 headers.put("Cache-Control", "max-age=0"); 112 headers.put("Referer", "http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&typeall=1&suball=1×cope=custom:"+fromdate+":"+todate+"&Refer=g"); 113 headers.put("Cookie", cookieValue); 114 this.headers.put("Host", "s.weibo.com"); 115 HttpResponse response=HttpUtils.doGet(url, headers); 116 String responseText=HttpUtils.getStringFromResponse(response); 117 responseText=EncodeUtils.unicdoeToGB2312(responseText); 118 119 System.out.println("************htmlContent start***********"); 120 System.out.println(responseText); 121 System.out.println("************htmlContent end***********"); 125 return responseText; 127 } 129 public void forwardToWeiboPage(){ 130 String url = Constant.personalHomePage; 131 headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 133 headers.put("Accept-Language", "zh-CN"); 134 
headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 135 headers.put("Connection", "Keep-Alive"); 137 this.headers.put("Host", "s.weibo.com"); 138 HttpResponse response=HttpUtils.doGet(url, headers); 139 String responseText=HttpUtils.getStringFromResponse(response); 140 responseText=EncodeUtils.unicdoeToGB2312(responseText); 141 List<Cookie> cookies = HttpUtils.getResponseCookies(response); 142 String cookie = HttpUtils.setCookie2String(cookies); 144 headers.put("Cookie", cookie); 146 } 150 public String getGroupCategory(int id){ 151 String url="http://q.weibo.com/class/category/?id="+id; 152 this.headers.put("Host", "q.weibo.com"); 154 HttpResponse response=HttpUtils.doGet(url, headers); 155 String responseText=HttpUtils.getStringFromResponse(response); 156 responseText=EncodeUtils.unicdoeToGB2312(responseText); 157 return responseText; 158 } 169 //得到微群管理员ID信息,其实用户成员的第一页 HTML页面 170 public String getGroupAdministrator(String groupid) { 171 String url="http://q.weibo.com/"+groupid+"/members/all"; 172 this.headers.remove("Referer"); 173 this.headers.put("Host", "q.weibo.com"); 174 this.headers.remove("Content-Type"); 175 this.headers.remove("x-requested-with"); 176 HttpResponse response=HttpUtils.doGet(url, headers); 177 String responseText=HttpUtils.getStringFromResponse(response); 178 return responseText; 179 } 180 //根据微群号和页号得到群成员ID信息 -----JSON格式数据 181 public String getGroupMembers(String groupid,int pagenumber){ 182 this.headers.put("Referer", "http://q.weibo.com/"+groupid+"/members/all"); 183 this.headers.put("Host", "q.weibo.com"); 184 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 185 this.headers.put("x-requested-with", "XMLHttpRequest"); 187 Map<String,String> params=new HashMap<String,String>(); 188 params.put("_t", "0"); 189 params.put("page", pagenumber+""); 190 params.put("gid", groupid); 191 params.put("query",""); 192 params.put("tab", "all"); 
193 params.put("vip", "1"); 194 String url="http://q.weibo.com/ajax/members/page"; 195 HttpResponse response=HttpUtils.doPost(url, headers, params); 196 return HttpUtils.getStringFromResponse(response); 197 } 198 /* 199 * 得到微群中微博信息 经过多次尝试成功 200 * 每次获得50个微博记录,page是页号, count值50 可以在1-75之间,但是,每次开始的时候还是从50的倍数开始的 201 */ 202 public String getGroupTopic(int page,int count,String gid){ 203 this.headers.put("Referer", "http://q.weibo.com/"+gid); 204 this.headers.put("Host", "q.weibo.com"); 205 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 206 this.headers.put("x-requested-with", "XMLHttpRequest"); 207 Integer pre_page=1; 208 if(page==1){ 209 pre_page=2; 210 }else{ 211 pre_page=page-1; 212 } 213 Map<String,String> params=new HashMap<String,String>(); 214 params.put("_k", System.currentTimeMillis()+""); 215 params.put("_t", "0"); 216 params.put("count", count+""); 217 //params.put("end_id", end_id); 218 params.put("gid", gid); 219 params.put("is_search",""); 220 params.put("key_word", ""); 221 params.put("me", "0"); 222 params.put("mids", ""); 223 params.put("new", "0"); 224 params.put("page", page+""); 225 params.put("pagebar", "0"); 226 params.put("pre_page", pre_page+""); 227 params.put("since_id", "0"); 228 params.put("uid", "0"); 229 230 String url="http://q.weibo.com/ajax/mblog/groupfeed"; 231 HttpResponse response=HttpUtils.doPost(url, headers, params); 232 return HttpUtils.getStringFromResponse(response); 233 } 234 /* 235 * 得到微群中微博信息数目 236 * 这个信息中其实还包含了微群的所有的基本信息~~~~~~~~~~****** json格式的数据信息 237 */ 238 public String getGroupMessageNumber(String gid){ 239 this.headers.put("Referer", "http://q.weibo.com/"+gid); 240 this.headers.put("Host", "q.weibo.com"); 241 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 242 this.headers.put("x-requested-with", "XMLHttpRequest"); 243 String url="http://q.weibo.com/ajax/rightnav/groupprofile?gid="+gid+"&_t=0&__rnd="+System.currentTimeMillis(); 244 HttpResponse response=HttpUtils.doGet(url, 
headers); 245 return HttpUtils.getStringFromResponse(response); 246 } 247 //得到微群的主页信息 HTML页码 主要是为了得到第一条微博记录的MID值 248 public String getgroupMainPage(String groupid) { 249 String url="http://q.weibo.com/"+groupid+"?topnav=1"; 250 this.headers.remove("Referer"); 251 this.headers.put("Host", "q.weibo.com"); 252 this.headers.remove("Content-Type"); 253 this.headers.remove("x-requested-with"); 254 255 HttpResponse response=HttpUtils.doGet(url, headers); 256 String responseText=HttpUtils.getStringFromResponse(response); 257 return responseText; 258 } 259 /* 260 * 根据分类得到微群信息 261 * categroyID :分类ID号 262 * pagenumber:页号 263 * sort:分类方式 1 按成员人数 2按 微群博数 3按创建时间分类 264 * count:每页的记录数目 265 */ 266 public String getGroupByCategroy(int categroyID,int pagenumber,int sort,int count){ 267 this.headers.put("Referer", "http://q.weibo.com/class/category/?id="+categroyID); 268 this.headers.put("Host", "q.weibo.com"); 269 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 270 this.headers.put("x-requested-with", "XMLHttpRequest"); 271 Map<String,String> params=new HashMap<String,String>(); 272 params.put("_t", "0"); 273 params.put("page", pagenumber+""); 274 params.put("id", categroyID+""); 275 params.put("sort",sort+""); 276 params.put("count", count+""); 277 278 String url="http://q.weibo.com/ajax/class/category"; 279 HttpResponse response=HttpUtils.doPost(url, headers,params); 280 String responseText=HttpUtils.getStringFromResponse(response); 281 responseText=EncodeUtils.unicdoeToGB2312(responseText); 282 return responseText; 283 } 284 //得到表情列表信息 285 public String getFaceList(){ 286 String url="http://weibo.com/aj/mblog/face?type=face&_t=0&__rnd="+System.currentTimeMillis(); 287 this.headers.put("Referer", "http://weibo.com/"); 288 this.headers.put("Host", "weibo.com"); 289 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 290 this.headers.put("x-requested-with", "XMLHttpRequest"); 291 292 HttpResponse response=HttpUtils.doGet(url, headers); 293 
String responseText=HttpUtils.getStringFromResponse(response); 294 System.out.println(responseText); 295 Utils.writeFileFromString("tmpFile/faceList.txt", responseText); 296 return responseText; 297 } 307 //用户基本信息 主要是将要解析用户主页下方经过编码后的内容 308 public String getMemberInfo(String memberID){ 309 String url="http://weibo.com/"+memberID+"/info"; 310 this.headers.put("Host", "weibo.com"); 311 this.headers.put("Referer", "http://weibo.com/u/"+memberID); 312 HttpResponse response=HttpUtils.doGet(url, headers); 313 String responseText=HttpUtils.getStringFromResponse(response); 314 return responseText; 315 } 316 //用户粉丝用户信息 html页面,每次20个 317 public String getMemberFans(String memberID,int page){ 318 String url="http://weibo.com/"+memberID+"/fans?&uid=1689219395&tag=&page="+page; 319 this.headers.put("Host", "weibo.com"); 320 this.headers.put("Referer", "http://weibo.com/"+memberID+"/fans"); 321 HttpResponse response=HttpUtils.doGet(url, headers); 322 String responseText=HttpUtils.getStringFromResponse(response); 323 return responseText; 324 } 325 //用户关注的用户信息 html页面 326 public String getMemberFollowing(String memberID,int page){ 327 String url="http://weibo.com/"+memberID+"/follow?page="+page; 328 this.headers.put("Host", "weibo.com"); 329 this.headers.put("Referer", "http://weibo.com/"+memberID+"/follow"); 330 HttpResponse response=HttpUtils.doGet(url, headers); 331 String responseText=HttpUtils.getStringFromResponse(response); 332 return responseText; 333 } 334 335 /* 336 * @params 337 * memberID:是用户ID 338 * max_id:每次AJAX获得数据时上面一次的最后一个ID值 339 * end_id:用户最新的一条微博的ID值 340 * k:一个随机数 341 * page:页号 342 * pre_page:前一页 343 * count:每次返回的数值 当max_id为null是 count=50 否则为15 344 * pagebar:ajax时,第一次为0,第二次为1 345 * 注意: 346 * 1 用此请求,每次获得的数据格式都一样,用同样的解析方法来进行解析。 347 * 2 每次一页可以获得总共45条记录,需要三次请求。每次请求可获得15条记录。 348 * 3 max_id可以不用到,直接等于 end_id就可以了. 
349 * 4 第一次请求时可以将end_id设置为NUll,即为第一次时翻页时的请求后边的滚动时必须有end_id参数,end_id为第一页的第一条ID即可。 350 */ 351 //获得用户发布的微博信息 json格式的数据 352 public String getMemberReleaseTopic(String memberID,String end_id,Integer page,Integer pagebar){ 353 String url=""; 354 Integer pre_page=1; 355 Integer count=0; 356 String k=System.currentTimeMillis()+""+(int)(Math.random()*100000)%100; 357 if(end_id==null){ 358 count=50; 359 if(page==1){ 360 pre_page=2; 361 }else{ 362 pre_page=page-1; 363 } 364 url="http://weibo.com/aj/mblog/mbloglist?" + 365 "page="+page+"&count="+count+"&pre_page="+pre_page+"&" + 366 "_k="+ k+"&uid="+memberID+ 367 "&_t=0&__rnd="+System.currentTimeMillis(); 368 }else{ 369 count=15; 370 pre_page=page; 371 url="http://weibo.com/aj/mblog/mbloglist?" + 372 "page="+page+"&count="+count+"&max_id="+end_id+"&" + 373 "pre_page="+pre_page+"&end_id="+end_id+"&" + 374 "pagebar="+pagebar+"&_k="+k+"&" + 375 "uid="+memberID+"&_t=0&__rnd="+System.currentTimeMillis(); 376 } 377 String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=sm2014121903@126.com; myuid=5439352084; YF-Ugrow-G0=4703aa1c27ac0c4bab8fc0fc5968141e; SSOLoginState=1421374583; wvr=6; YF-V5-G0=8c4aa275e8793f05bfb8641c780e617b; _s_tentry=login.sina.com.cn; Apache=2461283528245.9854.1421374588453; ULV=1421374588550:13:5:3:2461283528245.9854.1421374588453:1421210767499; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; SUS=SID-2035860051-1421462085-GZ-7jcgb-1539d643bae5195fb7f792b2ae77befb; SUE=es%3Df15e11ed09b6a0108a28adfa58609b78%26ev%3Dv1%26es2%3Da0f706efac5c89495062648a4de3e337%26rs0%3DZBxlOUv0mhmxyHfOVmZ3tH7tNvAp08BjPeLUJPdu9WzG38Dsm40px%252Bd9w21ycDpZQwBK3q0prFfNs%252F8ZuZSasa1eps%252FOGNxJ3CIHN8JN%252Fik6gVpIPgVeeRdalNWTIbth6hLa34uOp%252BXii%252Bxeib%252BvINsr%252FdOvQx6kjp6fsC44QXc%253D%26rv%3D0; 
SUP=cv%3D1%26bt%3D1421462085%26et%3D1421548485%26d%3Dc909%26i%3Dbefb%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D2%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A255vboVDeTxGeRO6FcZ9i7Mzj2IHXVazdpdrDV8PUNbvtBuLVj-kW91jmbQSGo7Rn30RVvGP5KOgBgNgQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; ALF=1452998078; ULOGIN_IMG=14214638933178; YF-Page-G0=0acee381afd48776ab7a56bd67c2e7ac"; 378 headers.put("Cookie", cookieValue); 379 this.headers.put("Referer", "http://weibo.com/u/"+memberID); 380 this.headers.put("Host", "www.weibo.com"); 381 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 382 this.headers.put("x-requested-with", "XMLHttpRequest"); 383 url = "http://weibo.com/u/"+memberID; 384 HttpResponse response=HttpUtils.doGet(url, headers); 385 if(response==null){ 386 return ""; 387 } 388 return HttpUtils.getStringFromResponse(response); 389 } 390 /* 391 * ~~~~~~~~~~~~~~~~~~~~~获取用户的一些信息~~~end~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 392 */ 393 394 395 //********************************************************************************** 396 397 /* 398 * 名人堂与达人信息 399 */ 400 public String getVerified(String url){ 401 this.headers.put("Host", "verified.weibo.com"); 402 this.headers.put("Referer", "http://plaza.weibo.com/?topnav=1&wvr=4"); 403 HttpResponse response=HttpUtils.doGet(url, headers); 404 String responseText=HttpUtils.getStringFromResponse(response); 405 return responseText; 406 } 407 408 public String getVerifiedMember(String path,Integer g_index){ 409 String url="http://verified.weibo.com/aj/getgrouplist?g_index="+g_index+ 410 "&path="+path+"&_t=0&__rnd="+System.currentTimeMillis(); 411 this.headers.put("Host", "verified.weibo.com"); 412 this.headers.put("Referer", path); 413 this.headers.put("Content-Type", "application/x-www-form-urlencoded"); 414 this.headers.put("x-requested-with", "XMLHttpRequest"); 415 
HttpResponse response=HttpUtils.doGet(url, headers);
        // Tail of a method whose beginning precedes this section; statements kept unchanged.
        String responseText=HttpUtils.getStringFromResponse(response);

        return responseText;
    }

    /**
     * POSTs the given province id to club.weibo.com/ajax_setArea.php, then rebuilds the
     * shared "Cookie" header from the login cookies plus the cookies of this response.
     *
     * @param provinceID province id, sent as form field "prov" (city is fixed to "1000")
     * @return the response body as returned by HttpUtils.getStringFromResponse
     */
    public String setArea(Integer provinceID){
        this.headers.put("Referer", "http://club.weibo.com/list");
        this.headers.put("Host", "club.weibo.com");
        this.headers.put("Content-Type", "application/x-www-form-urlencoded");
        this.headers.put("x-requested-with", "XMLHttpRequest");

        Map<String,String> params=new HashMap<String,String>();
        params.put("_t", "0");
        params.put("city", "1000");
        params.put("prov", provinceID+"");

        String url="http://club.weibo.com/ajax_setArea.php";
        HttpResponse response=HttpUtils.doPost(url, headers, params);

        // NOTE(review): addAll mutates the list returned by ls.getCookies(); if that is the
        // login object's live list, these cookies accumulate there on every call - confirm.
        List<Cookie> responseCookies=HttpUtils.getResponseCookies(response);
        List<Cookie> cookies=ls.getCookies();
        cookies.addAll(responseCookies);
        this.headers.put("Cookie", HttpUtils.setCookie2String(cookies));

        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * GETs one page of the club.weibo.com member list (sex=3, ordered by "ltime").
     *
     * @param page page number to request; must not be null
     * @return the response body, or "" when the request yields no response
     */
    public String getDaRen(Integer page){
        String op="ltime";
        String url="http://club.weibo.com/list?sex=3&op="+op+"&page="+page+"&";
        // The Referer points at the previous page; for page <= 1 the original code uses page 2.
        Integer pre_page=(page<=1?2:page-1);
        this.headers.put("Host", "club.weibo.com");
        this.headers.put("Referer", "http://club.weibo.com/list?sex=3&op=ltime&page="+pre_page+"&");
        // Drop the AJAX/form headers that POST-style methods leave in the shared map.
        this.headers.remove("Content-Type");
        this.headers.remove("x-requested-with");

        HttpResponse response=HttpUtils.doGet(url, headers);
        if(response!=null){
            return HttpUtils.getStringFromResponse(response);
        }
        return "";
    }

    /**
     * Publishes a text-only weibo post.
     *
     * @param content text of the post, sent as form field "text"
     * @return the response body
     */
    public String releaseTopic(String content){
        this.headers.put("Referer", "http://weibo.com/");
        this.headers.put("Host", "weibo.com");
        this.headers.put("Content-Type", "application/x-www-form-urlencoded");
        this.headers.put("x-requested-with", "XMLHttpRequest");

        Map<String,String> params=new HashMap<String,String>();
        params.put("_t", "0");
        params.put("location", "home");
        params.put("module", "stissue");
        params.put("pic_id", "");
        params.put("text", content);

        String url="http://weibo.com/aj/mblog/add?__rnd="+System.currentTimeMillis();
        HttpResponse response=HttpUtils.doPost(url, headers, params);
        return HttpUtils.getStringFromResponse(response);
    }

    /** Intended to return the accounts the logged-in user follows. Not implemented: always returns "". */
    public String getSelfFollowIngs(){
        return "";
    }

    /** Intended to return the logged-in user's followers. Not implemented: always returns "". */
    public String getSelfFollowers(){
        return "";
    }

    /** Intended to return the groups the logged-in user has joined. Not implemented: always returns "". */
    public String getSelfJoinedGroups(){
        return "";
    }

    /** Intended to return the logged-in user's tags. Not implemented: always returns "". */
    public String getSelfTags(){
        return "";
    }

    /** Intended to return the posts published by the logged-in user. Not implemented: always returns "". */
    public String getSelfReleaseTopics(){
        return "";
    }

    /** Intended to return the posts on the logged-in user's home page. Not implemented: always returns "". */
    public String getSelfPageTopics(){
        return "";
    }

    /**
     * Follows a user.
     *
     * @param memberid uid of the user to follow
     * @return the response body
     */
    public String addFollowing(String memberid){
        return addorcancleFollowing(memberid, ADDFOLLOWING);
    }

    /**
     * Stops following a user.
     *
     * @param memberid uid of the user to unfollow
     * @return the response body
     */
    public String cancelFollowing(String memberid){
        return addorcancleFollowing(memberid, CANCELFOLLOWING);
    }

    /**
     * POSTs a follow or unfollow request for the given uid.
     *
     * @param memberid uid sent as form field "uid"
     * @param option   ADDFOLLOWING or CANCELFOLLOWING
     * @return the response body
     * @throws IllegalArgumentException if option is neither of the two constants
     */
    private String addorcancleFollowing(String memberid,int option){
        String url;
        switch(option){
            case ADDFOLLOWING:
                url="http://weibo.com/aj/f/followed?__rnd="+System.currentTimeMillis();
                break;
            case CANCELFOLLOWING:
                url="http://weibo.com/aj/f/unfollow?__rnd="+System.currentTimeMillis();
                break;
            default:
                // Previously an unknown option fell through and POSTed to an empty URL.
                throw new IllegalArgumentException("Unknown follow option: "+option);
        }

        this.headers.put("Referer", "http://weibo.com/");
        this.headers.put("Host", "weibo.com");
        this.headers.put("Content-Type", "application/x-www-form-urlencoded");
        this.headers.put("x-requested-with", "XMLHttpRequest");

        Map<String,String> params=new HashMap<String,String>();
        params.put("_t", "0");
        params.put("f", "1");
        params.put("location", "profile");
        params.put("refer_flag", "");
        params.put("refer_sort", "profile");
        params.put("uid", memberid);

        HttpResponse response=HttpUtils.doPost(url, headers, params);
        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * Fetches tag suggestions from account.weibo.com (per the original author's note,
     * each call yields 10 tags).
     * Only Referer and Host are set here; other entries already in the shared headers
     * map are sent as they are.
     *
     * @return the response body
     */
    public String getTags(){
        String url="http://account.weibo.com/set/aj/tagsuggest?__rnd="+System.currentTimeMillis();
        this.headers.put("Referer", "http://account.weibo.com/set/tag#");
        this.headers.put("Host", "account.weibo.com");
        HttpResponse response=HttpUtils.doGet(url, headers);
        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * Fetches weibo hot-keyword data.
     *
     * @param k hot-word category; sent as query parameter "k" when it parses as an
     *          integer, otherwise as query parameter "t"
     * @return the response body
     */
    public String getHotWords(String k){
        String url;
        try{
            Integer.parseInt(k);
            url="http://data.weibo.com/top/keyword?k="+k;
        }catch(NumberFormatException ignored){
            // Not numeric (or null): the value goes in the "t" parameter instead.
            url="http://data.weibo.com/top/keyword?t="+k;
        }
        this.headers.put("Referer", "http://data.weibo.com/top/keyword");
        this.headers.put("Host", "data.weibo.com");
        HttpResponse response=HttpUtils.doGet(url, headers);
        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * Fetches hot weibo posts.
     *
     * @param cat  hot-post category
     * @param page page number
     * @return the response body
     */
    public String getHotWeibo(String cat,int page){
        String url="http://data.weibo.com/hot/ajax/catfeed?page="+page+"&cat="+cat+"&_t=0&__rnd="+System.currentTimeMillis();
        this.headers.put("Referer", "http://data.weibo.com/hot/minibloghot");
        this.headers.put("Host", "data.weibo.com");
        HttpResponse response=HttpUtils.doGet(url, headers);
        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * Step 1: fetches weiba (micro-bar) names by category, sorted by "post".
     *
     * @param ctgid category id
     * @param p     page number
     * @return the response body
     */
    public String getWeiBar(String ctgid,int p){
        String sort="post";
        String url="http://weiba.weibo.com/aj_f/CategoryList?sort="+sort+"&p="+p+"&ctgid="+ctgid+"&_t=0&__rnd="+System.currentTimeMillis();
        this.headers.put("Referer", "http://weiba.weibo.com/ct/"+ctgid);
        this.headers.put("Host", "weiba.weibo.com");
        this.headers.put("Accept", "*/*");
        this.headers.put("Content-Type", "application/x-www-form-urlencoded");
        // Same key spelling as the other methods, so getDaRen's remove() clears it
        // and the map never holds two differently-cased copies of this header.
        this.headers.put("x-requested-with", "XMLHttpRequest");
        HttpResponse response=HttpUtils.doGet(url, headers);
        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * Step 2: given a weiba, fetches the titles of the posts in it.
     *
     * @param bid weiba id
     * @param p   page number
     * @return the response body
     */
    public String getWeiBarByWeibarName(String bid,int p){
        String url="http://weiba.weibo.com/aj_t/postlist?bid="+bid+"&p="+p+"&_t=all&__rnd="+System.currentTimeMillis();
        this.headers.put("Referer", "http://weiba.weibo.com/");
        this.headers.put("Host", "weiba.weibo.com");
        this.headers.put("Accept", "*/*");
        this.headers.put("Content-Type", "application/x-www-form-urlencoded");
        // Lower-case key, consistent with the rest of the class (see getDaRen's remove()).
        this.headers.put("x-requested-with", "XMLHttpRequest");
        HttpResponse response=HttpUtils.doGet(url, headers);
        return HttpUtils.getStringFromResponse(response);
    }

    /**
     * Fetches the Sina micro-charity (gongyi) member list of a project.
     *
     * @param page      page number
     * @param projectID charity project id
     * @param type      "donate" or "discuss"
     * @return the response body
     */
    public String getWeiGongYiMember(int page,int projectID,String type){
        String url="http://gongyi.weibo.com/aj_personal_helpdata?page="+page+"&type="+type+"&project_id="+projectID+"&_t=0&__rnd="+System.currentTimeMillis();
        this.headers.put("Referer", "http://gongyi.weibo.com/"+projectID);
        this.headers.put("Host", "gongyi.weibo.com");
        this.headers.put("Accept", "*/*");
        this.headers.put("Content-Type", "application/x-www-form-urlencoded");
        // Lower-case key, consistent with the rest of the class (see getDaRen's remove()).
        this.headers.put("x-requested-with", "XMLHttpRequest");
        HttpResponse response=HttpUtils.doGet(url, headers);
        return HttpUtils.getStringFromResponse(response);
    }
}
转载于:https://www.cnblogs.com/RunForLove/p/4511920.html
用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)相关推荐
- java获取新浪天气预报代码
package com.test.commons;/*** java获取新浪天气预报代码*/ import java.io.FileNotFoundException; import java.io. ...
- 新浪天气预报代码及城市代码
名称:新浪天气预报代码 代码 :<IFRAME ID='ifm2' WIDTH='260' HEIGHT='70' ALIGN='CENTER' MARGINWIDTH='0' MARGINHE ...
- Java 模拟新浪登录 2016
想学习一下网络爬虫,涉及到模拟登录,查阅了一番资料以后发现大部分都有点过时了,就使用前辈们给的经验,Firefox抓包调试,采用httpclient模拟了一下新浪登录.不对之处多多包涵.需要的可以用浏 ...
- Java模拟新浪和腾讯自动登录并发送微博(2013年3月更新可用)
1.准备工作 只是登录无需申请新浪和腾讯的开发者账号,如果需要发送微博功能,需要申请一个新浪和腾讯的开发者账号,并添加一个测试应用. 过程请参考官方帮助文档,申请地址:新浪:http://open.w ...
- 飞翔的小鸟--Java小游戏实战(代码完整)
一.写在前面: <飞扬的小鸟>是一款曾经比较火热的小游戏,本文可以带你从零开始,一步一步地开发出这款小游戏. 语言 Java 工具 IntelliJ IDEA,JDK 16 准备工作 创 ...
- java 学习 新浪微群
java 新浪微群 http://q.weibo.com/803436/invitation=11mQrw0-1f2c0?source=weiqun_notice_app_18
- 2022年新浪股票接口更新需要加Referer才能获取数据, java获取新浪股票数据 http://hq.sinajs.cn
新浪股票 2022年更新后 java获取数据 引用 <dependency><groupId>cn.hutool</groupId><artifactId> ...
- java 利用新浪天气API获取天气预报
新浪为我们提供了天气预报获取接口API http://php.weather.sina.com.cn/xml.php?city=武汉&password=DJOYnieT8234jlsK&am ...
- html注册新浪邮箱代码,JS仿新浪邮箱点击联系人添加Email地址
新浪邮箱添加功能 var ev={}; var flyDiv="bxAddrFly"; var inceptDiv="SendAddress"; var add ...
- java 的新浪oauth_新浪微博OAuth授权的Java实现
一.OAuth协议简介 OAuth授权在各社交网站中广泛使用,该协议使用户不需要直接向第三方应用提供用户名及密码,并且使一个账户在多个网站中使用成为可能,OAuth协议的细节描述可参考其官方网站:ht ...
最新文章
- 如何把照片压缩到20k以下_如何将图像压缩10倍?阿里工程师有个大胆的想法!...
- 影响中国发展的七大垂直搜索引擎
- java arrays.equals_Java Arrays类的常见使用
- 看图说cnblogs-强大的SEO功能【有实例】
- Job中的Task是如何调度的
- CTSC2017 APIO2017 THUSC2017 游记
- Unity Application Block 发布
- 请教大家一个问题,有关于数据库的设计
- Android滑动返回上一级界面
- 我所理解的Reed solomon 算法
- vue鼠标经过效果实现
- 人像抠图——基于深度学习一键去除视频背景
- Cass符号填充设置
- 网络错误 —未连接到互联网
- 【Windows】win7虚拟机安装VMware Tools
- 常用算法简要总结(C语言)
- iOS WIFI 相关
- 工资3000,靠国际版抖音TikTok月入2W+:这个风口真的很赚钱!
- STM32C8T6编码器电机测速与arduino光电模块测速
- 总结高精度定位难点与解决办法
热门文章
- word电子签名工具_电子签名是您不会想到的必备工具的5个理由
- Java基础知识总结(一)创建和销毁对象
- linux系统gaussian09,高斯(Gaussian)软件linux下安装
- 红帽(redhat linux) 初级认证(RHCSA)考点详解
- HTML 制作简单的个人简历
- html读写txt文件,JS读写文本文件示例代码
- vlfeat工具包的MATLAB安装
- 之前做设计收集的部分网站
- postSQL安装和GIS数据导入
- awvs12 Server Exception_使用WebSocket搭建服务器server