java 抓图程序的实现

Java抓图程序的实现（改进版）收藏

转自：[url]http://blog.csdn.net/binyao02123202/archive/2010/07/17/5741519.aspx[/url]

主要难点:

1.并发线程的控制：采用了JDK 5的java.util.concurrent包（线程池ExecutorService与信号量Semaphore）

2.去重

3.序列化

运行方法:java -Xms128M -Xmx512M -jar JavaCrawler.jar http://foxhq.com/ C:/a.log 0 D:/pic D:/url.tmp D:/img.tmp

SimpleBloomFilter.java

/**
 * A small Bloom filter over strings: a probabilistic set that may report
 * false positives but never false negatives.
 *
 * <p>Each value is hashed with one {@link SimpleHash} per seed and the
 * resulting bit positions are set in a shared {@link BitSet}.
 *
 * <p>{@link #add(String)} and {@link #contains(String)} are synchronized
 * because {@link BitSet} is not safe for concurrent mutation.
 */
public class SimpleBloomFilter implements Serializable {

    private static final long serialVersionUID = 1L;

    // These stay instance fields (rather than static constants) so that the
    // serialized layout of already-persisted filters is left untouched.
    private final int DEFAULT_SIZE = 2 << 24;
    private final int[] seeds = new int[] { 7, 11, 13, 31, 37, 61, };
    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records a value in the filter.
     *
     * @param value the value to record; {@code null} is ignored, mirroring
     *              {@link #contains(String)} which reports {@code false} for it
     */
    public synchronized void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether a value has (probably) been added.
     *
     * @param value the value to look up
     * @return {@code false} if the value is {@code null} or was definitely
     *         never added; {@code true} if every hash position is set
     */
    public synchronized boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            // One clear bit proves absence; no need to evaluate the remaining hashes.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Seeded polynomial string hash whose result is masked into
     * {@code [0, cap - 1]}; {@code cap} must be a power of two for the mask
     * to cover the whole range.
     */
    public class SimpleHash implements Serializable {
        private int cap;
        private int seed;

        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes a string to a bit index.
         *
         * @param value the string to hash; must not be {@code null}
         * @return an index in {@code [0, cap - 1]}
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }
}

UtilSeriz.java

/**
 * Deserializes an object from a file on disk.
 *
 * @param strPath path of the file that holds the serialized object
 * @return the deserialized object, or {@code null} if the file does not exist
 * @throws Exception if the file cannot be opened or its content cannot be deserialized
 */
public static Object readObject(String strPath) throws Exception {
    File f = new File(strPath);
    if (!f.exists()) {
        return null;
    }
    InputStream is = new FileInputStream(f);
    try {
        // ObjectInputStream does the actual deserialization.
        ObjectInputStream ois = new ObjectInputStream(is);
        return ois.readObject();
    } finally {
        // Closing the file stream also covers the case where the
        // ObjectInputStream constructor itself fails on a corrupt header.
        is.close();
    }
}

}

SearchCrawler.java

view plaincopy to clipboardprint?
01.package com.hengking.crawl;
02.import java.awt.image.BufferedImage;
03.import java.io.BufferedInputStream;
04.import java.io.BufferedReader;
05.import java.io.BufferedWriter;
06.import java.io.File;
07.import java.io.FileOutputStream;
08.import java.io.FileWriter;
09.import java.io.IOException;
10.import java.io.InputStreamReader;
11.import java.net.URL;
12.import java.text.SimpleDateFormat;
13.import java.util.ArrayList;
14.import java.util.Calendar;
15.import java.util.Date;
16.import java.util.HashMap;
17.import java.util.LinkedHashSet;
18.import java.util.concurrent.Callable;
19.import java.util.concurrent.ExecutorService;
20.import java.util.concurrent.Executors;
21.import java.util.concurrent.Semaphore;
22.import java.util.regex.Matcher;
23.import java.util.regex.Pattern;
24.import javax.imageio.ImageIO;
25.import com.hengking.crawl.po.PoCalSearch;
26.import com.hengking.crawl.po.PoDownload;
27.
28.
29./***
30. * 说明:抓图工具
31. * @author 君望永远
32. *
33. */
34.public class SearchCrawler implements Runnable{
35.
36. /* disallowListCache缓存robot不允许搜索的URL。 Robot协议在Web站点的根目录下设置一个robots.txt文件,
37. *规定站点上的哪些页面是限制搜索的。搜索程序应该在搜索过程中跳过这些区域,下面是robots.txt的一个例子:
38. # robots.txt for http://somehost.com/
39. User-agent: *
40. Disallow: /cgi-bin/
41. Disallow: /registration # /Disallow robots on registration page
42. Disallow: /login
43. */
44. public static SimpleBloomFilter filterUrl;
45. public static SimpleBloomFilter filterImg;
46. private HashMap< String,ArrayList< String>> disallowListCache = new HashMap< String,ArrayList< String>>();
47. ArrayList< String> errorList= new ArrayList< String>();//错误信息
48. ArrayList< String> result=new ArrayList< String>(); //搜索到的结果
49. String startUrl;//开始搜索的起点
50. LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
51. boolean caseSensitive=false;//是否区分大小写
52. boolean limitHost=false;//是否在限制的主机内搜索
53. private static String outdir;
54.
55. private static String seroutdir;
56. private static String seroutdirimg;
57. private boolean blnFlag=false;
58.
59. private static PoCalSearch ps=null;
60. private static PoDownload pd=null;
61.
62. //300个图片分析线程
63. private static ExecutorService execImg;
64. final Semaphore sempImg = new Semaphore(300);
65.
66. //30个网页分析线程
67. private static ExecutorService execPage;
68. final Semaphore sempPage = new Semaphore(30);
69.
70. private ArrayList<ParsePage> arrPar=new ArrayList<ParsePage>();
71.
72. //记录抓图结果
73. private static BufferedWriter bw = null;
74.
/**
 * Creates a crawler.
 *
 * @param startUrl the URL the crawl starts from
 */
public SearchCrawler(String startUrl) {
    this.startUrl = startUrl;
}

/**
 * @return the result list held by this crawler (the live list, not a copy)
 */
public ArrayList<String> getResult() {
    return result;
}

/**
 * Starts the periodic snapshot thread and then runs the crawl on the
 * calling thread until it finishes.
 */
public void run() {
    // Raise the flag BEFORE the snapshot thread starts: that thread loops on
    // blnFlag and would exit at once if it saw the initial false value.
    blnFlag = true;
    Thread snapshotWriter = new Thread(new TimeWrite2File());
    snapshotWriter.start();

    crawl(startUrl, limitHost, caseSensitive);

    // crawl() clears blnFlag when it ends; interrupting wakes the snapshot
    // thread from its long sleep so it re-checks the flag and stops promptly.
    snapshotWriter.interrupt();
}
89.
90. //检测URL格式
91. private URL verifyUrl(String url) {
92. // 只处理HTTP URLs.
93. if (!url.toLowerCase().startsWith("http://"))
94. return null;
95. URL verifiedUrl = null;
96. try {
97. verifiedUrl = new URL(url);
98. } catch (Exception e) {
99. return null;
100. }
101. return verifiedUrl;
102. }
103. // 检测robot是否允许访问给出的URL.
104. private boolean isRobotAllowed(URL urlToCheck) {
105. String host = urlToCheck.getHost().toLowerCase();//获取给出RUL的主机
106. //System.out.println("主机="+host);
107. // 获取主机不允许搜索的URL缓存
108. ArrayList< String> disallowList =disallowListCache.get(host);
109. // 如果还没有缓存,下载并缓存。
110. if (disallowList == null) {
111. disallowList = new ArrayList< String>();
112. try {
113. URL robotsFileUrl =new URL("http://" + host + "/robots.txt");
114. BufferedReader reader =new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
115. // 读robot文件，创建不允许访问的路径列表。
116. String line;
117. while ((line = reader.readLine()) != null) {
118. if (line.indexOf("Disallow:") == 0) {//是否包含"Disallow:"
119. String disallowPath =line.substring("Disallow:".length());//获取不允许访问路径
120. // 检查是否有注释。
121. int commentIndex = disallowPath.indexOf("#");
122. if (commentIndex != - 1) {
123. disallowPath =disallowPath.substring(0, commentIndex);//去掉注释
124. }
125.
126. disallowPath = disallowPath.trim();
127. disallowList.add(disallowPath);
128. }
129. }
130. // 缓存此主机不允许访问的路径。
131. disallowListCache.put(host, disallowList);
132. } catch (Exception e) {
133. return true; //web站点根目录下没有robots.txt文件,返回真
134. }
135. }
136.
137. String file = urlToCheck.getFile();
138. //System.out.println("文件getFile()="+file);
139. for (int i = 0; i < disallowList.size(); i++) {
140. String disallow = disallowList.get(i);
141. if (file.startsWith(disallow)) {
142. return false;
143. }
144. }
145. return true;
146. }
147.
148.
149. private String downloadPage(URL pageUrl) {
150. try {
151.
152.
153.
154. // Open connection to URL for reading.
155. BufferedReader reader =
156. new BufferedReader(new InputStreamReader(pageUrl.openStream()));
157.
158.
159. // Read page into buffer.
160. String line;
161. StringBuffer pageBuffer = new StringBuffer();
162. while ((line = reader.readLine()) != null) {
163. pageBuffer.append(line);
164. }
165.
166. return pageBuffer.toString();
167. } catch (Exception e) {
168. e.printStackTrace();
169. }
170. return null;
171. }
172. // 从URL中去掉"www"
173. private String removeWwwFromUrl(String url) {
174. int index = url.indexOf("://www.");
175. if (index != -1) {
176. return url.substring(0, index + 3) +
177. url.substring(index + 7);
178. }
179. return (url);
180. }
181. // 解析页面并找出链接
182. private ArrayList< String> retrieveLinks(URL pageUrl, String pageContents,
183. boolean limitHost)
184. {
185. // 用正则表达式编译链接的匹配模式。
186. Pattern p =Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",Pattern.CASE_INSENSITIVE);
187. Matcher m = p.matcher(pageContents);
188.
189. ArrayList< String> linkList = new ArrayList< String>();
190. while (m.find()) {
191. String link = m.group(1).trim();
192.
193. if (link.length() < 1) {
194. continue;
195. }
196. // 跳过链到本页面内链接。
197. if (link.charAt(0) == '#') {
198. continue;
199. }
200.
201. if (link.indexOf("mailto:") != -1) {
202. continue;
203. }
204.
205. if (link.toLowerCase().indexOf("javascript") != -1) {
206. continue;
207. }
208. if (link.indexOf("://") == -1){
209. if (link.charAt(0) == '/') {//处理绝对地
210. link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort()+ link;
211. } else {
212. String file = pageUrl.getFile();
213. if (file.indexOf('/') == -1) {//处理相对地址
214. link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort() + "/" + link;
215. } else {
216. String path =file.substring(0, file.lastIndexOf('/') + 1);
217. link = "http://" + pageUrl.getHost() +":"+pageUrl.getPort()+ path + link;
218. }
219. }
220. }
221. int index = link.indexOf('#');
222. if (index != -1) {
223. link = link.substring(0, index);
224. }
225. link = removeWwwFromUrl(link);
226. URL verifiedLink = verifyUrl(link);
227. if (verifiedLink == null) {
228. continue;
229. }
230. /* 如果限定主机，排除那些不合条件的URL*/
231. if (limitHost &&
232. !pageUrl.getHost().toLowerCase().equals(
233. verifiedLink.getHost().toLowerCase()))
234. {
235. continue;
236. }
237. // 跳过那些已经处理的链接.
238. if(filterUrl.contains(link))
239. {
240. logEvent("匹配了:"+link);
241. continue;
242. }
243. else
244. {
245. filterUrl.add(link);
246. }
247.
248. linkList.add(link);
249. }
250. return (linkList);
251. }
252.
253.
254.
255.
256. // 解析页面并找出链接
257. private ArrayList< String> retrieveImgLinks(URL pageUrl, String pageContents,
258. boolean limitHost)
259. {
260. // 用正则表达式编译链接的匹配模式。
261. Pattern p =Pattern.compile("<img\\s+src\\s*=\\s*\"?(.*?)[\"|>]",Pattern.CASE_INSENSITIVE);
262. Matcher m = p.matcher(pageContents);
263.
264. ArrayList< String> linkList = new ArrayList< String>();
265. while (m.find()) {
266. String link = m.group(1).trim();
267.
268. if (link.length() < 1) {
269. continue;
270. }
271. // 跳过链到本页面内链接。
272. if (link.charAt(0) == '#') {
273. continue;
274. }
275.
276. if (link.indexOf("mailto:") != -1) {
277. continue;
278. }
279.
280. if (link.toLowerCase().indexOf("javascript") != -1) {
281. continue;
282. }
283. if (link.toLowerCase().endsWith("gif")) {
284. continue;
285. }
286. if (link.indexOf("://") == -1)
287. {
288. if (link.charAt(0) == '/')
289. {//处理绝对地
290. link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort()+ link;
291. }
292. else
293. {
294. String file = pageUrl.getFile();
295. if (file.indexOf('/') == -1) {//处理相对地址
296. link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort() + "/" + link;
297. } else {
298. String path =file.substring(0, file.lastIndexOf('/') + 1);
299. link = "http://" + pageUrl.getHost() +":"+pageUrl.getPort()+ path + link;
300. }
301. }
302. }
303. int index = link.indexOf('#');
304. if (index != -1) {
305. link = link.substring(0, index);
306. }
307. link = removeWwwFromUrl(link);
308. URL verifiedLink = verifyUrl(link);
309. if (verifiedLink == null) {
310. continue;
311. }
312. /* 如果限定主机，排除那些不合条件的URL*/
313. if (limitHost &&
314. !pageUrl.getHost().toLowerCase().equals(
315. verifiedLink.getHost().toLowerCase()))
316. {
317. continue;
318. }
319. // 跳过那些已经处理的链接.
320.// if (crawledList.contains(link)) {
321.// continue;
322.// }
323. if(filterImg.contains(link))
324. {
325. logEvent("图片匹配了:"+link);
326. continue;
327. }
328. else
329. {
330. filterImg.add(link);
331. }
332.
333.
334. if(link.lastIndexOf(".gif")==-1)
335. {
336. linkList.add(link);
337. }
338.
339.
340.
341. }
342. return (linkList);
343. }
344.
345. //执行实际的搜索操作
346. public ArrayList< String> crawl(String startUrl,boolean limithost,boolean caseSensitive )
347. {
348.
349. // 从开始URL中移出www
350. startUrl = removeWwwFromUrl(startUrl);
351.
352. toCrawlList.add(startUrl);
353.
354. int idxPageParse=0;
355. while (toCrawlList.size()>0)
356. {
357. try
358. {
359. idxPageParse++;
360. // Get URL at bottom of the list.
361. String url = toCrawlList.iterator().next();
362. ps.setIntUrl(ps.getIntUrl()+1);
363. // Remove URL from the to crawl list.
364. toCrawlList.remove(url);
365.
366. int intRetryPage=0;
367. while (sempPage.availablePermits()<=0)
368. {
369. System.out.println("暂时没有空闲的网页分析线程，等待3秒再执行...");
370. try {
371. intRetryPage++;
372. if(intRetryPage==10)
373. {
374. logEvent("分析网页"+url+"超时");
375. sempPage.release();
376. break;
377. }
378. Thread.sleep(3000);
379. } catch (InterruptedException e) {
380. e.printStackTrace();
381. }
382. }
383.
384.
385. ParsePage tempPageThread=new ParsePage(url);
386. execPage.submit(tempPageThread);
387. logEvent("开启网页分析线程"+idxPageParse);
388. if(idxPageParse==1)
389. {
390. Thread.currentThread().sleep(30000);
391. }
392.
393. }catch(Exception e)
394. {
395. e.printStackTrace();
396. }
397.
398. }
399. blnFlag=false;
400.
401. logEvent("抓图完成......");
402.
403. return result;
404. }
405. public static void logEvent(String strLog)
406. {
407. System.out.println( new SimpleDateFormat("yyyy年MM月dd日HH时mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+strLog);
408.
409. }
410.
411. // 主函数
412. public static void main(String[] args) {
413. if(args.length!=6)
414. {
415. System.out.println("Usage:java SearchCrawler startUrl maxUrl searchString");
416. return;
417. }
418. @SuppressWarnings("unused")
419. String strLogPath=args[1];
420. SearchCrawler crawler = new SearchCrawler(args[0]);
421.
422.
423. outdir=args[3]+"/pic"+new SimpleDateFormat("yyyyMMdd").format(new Date(Calendar.getInstance().getTimeInMillis()))+"/";
424. File f=new File(outdir);
425. if(!f.exists())
426. {
427. f.mkdir();
428. }
429.
430. execPage = Executors.newFixedThreadPool(30);
431. execImg = Executors.newFixedThreadPool(300);
432.
433. seroutdir=args[4];
434. seroutdirimg=args[5];
435.
436. ps=new PoCalSearch();
437. pd=new PoDownload();
438. try {
439. if(UtilSeriz.readObject(seroutdir)!=null)
440. {
441. System.out.println(new SimpleDateFormat("yyyy年MM月dd日HH时mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+"反序列化URL...");
442. filterUrl=(SimpleBloomFilter)UtilSeriz.readObject(seroutdir);
443. }
444. else
445. {
446. filterUrl=new SimpleBloomFilter();
447. }
448. if(UtilSeriz.readObject(seroutdir)!=null)
449. {
450. System.out.println(new SimpleDateFormat("yyyy年MM月dd日HH时mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+"反序列化图片...");
451.
452. filterImg=(SimpleBloomFilter)UtilSeriz.readObject(seroutdirimg);
453. }
454. else
455. {
456. filterImg=new SimpleBloomFilter();
457. }
458. } catch (Exception e) {
459. e.printStackTrace();
460. }
461.
462. String strPic=args[3]+"/pic"+new SimpleDateFormat("yyyyMMdd").format(new Date(Calendar.getInstance().getTimeInMillis()))+".log";
463. try {
464. bw=new BufferedWriter(new FileWriter(strPic,false));
465. } catch (IOException e) {
466. // TODO Auto-generated catch block
467. e.printStackTrace();
468. }
469.
470. Thread search=new Thread(crawler);
471. System.out.println( new SimpleDateFormat("yyyy年MM月dd日HH时mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+"开始爬图...");
472. System.out.println("下载了图:");
473. search.start();
474. try {
475. search.join();
476. logEvent("主函数结束");
477. bw.close();
478. } catch (Exception e) {
479. // TODO Auto-generated catch block
480. e.printStackTrace();
481. }
482.
483.
484.
485.
486. }
487.
488. /**
489. * 说明:下载图片的线程
490. * @author binbin0915
491. *
492. */
493. public class ImgDownThread implements Runnable,Callable<Long>{
494. //待下载的URL
495. private String stru;
496.
497. private boolean isStart=true;
498.
499. public ImgDownThread(String strurl) {
500. super();
501. this.stru = strurl;
502. }
503.
504. @Override
505. public void run()
506. {
507.
508.
509. try
510. {
511. sempImg.acquire();
512. try{
513.
514.
515. URL url=new URL(stru);
516. BufferedInputStream in = new BufferedInputStream(url.openStream());
517.
518. BufferedImage bi=ImageIO.read(url.openStream());
519.
520. //尺寸要求
521. if (bi==null|| bi.getWidth()<30 || bi.getHeight()<30 )
522. {
523.
524. in.close();
525. return;
526. }
527. String ss=new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(Calendar.getInstance().getTimeInMillis()))+"_"+Math.round(Math.random()*89999999999999L+1000)+stru.substring(stru.lastIndexOf("."));
528. String s=outdir+ss;
529. FileOutputStream file = new FileOutputStream(new File(s));
530. int t;
531. while ((t = in.read()) != -1)
532. {
533. file.write(t);
534. }
535. file.close();
536. if(new File(s).length()<=10*1024)
537. {
538.
539. in.close();
540. new File(s).delete();
541. return;
542. }
543.
544. synchronized(bw)
545. {
546. String str=ss+":"+stru;
547. bw.write(str);
548. bw.newLine();
549. bw.flush();
550. }
551. logEvent("下载了:"+stru);
552. ps.setIntImg(ps.getIntImg()+1);
553. in.close();
554.
555. }catch(Exception e){
556. logEvent("**********************下载图片:"+stru+"超时");
557. }
558. }
559.
560.
561. catch (Exception e)
562. {
563.
564. e.printStackTrace();
565. }
566. finally{
567. sempImg.release();
568. }
569.
570. }
571.
572.
573. public boolean isStart() {
574. return isStart;
575. }
576.
577. public void setStart(boolean isStart) {
578. this.isStart = isStart;
579. }
580.
581. @Override
582. public Long call() throws Exception {
583. try
584. {
585. sempImg.acquire();
586. try{
587.
588.
589. URL url=new URL(stru);
590. BufferedInputStream in = new BufferedInputStream(url.openStream());
591.
592. BufferedImage bi=ImageIO.read(url.openStream());
593.
594. //尺寸要求
595. if (bi==null|| bi.getWidth()<30 || bi.getHeight()<30 )
596. {
597.
598. in.close();
599. return 0l;
600. }
601. String ss=new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(Calendar.getInstance().getTimeInMillis()))+"_"+Math.round(Math.random()*89999999999999L+1000)+stru.substring(stru.lastIndexOf("."));
602. String s=outdir+ss;
603. FileOutputStream file = new FileOutputStream(new File(s));
604. int t;
605. while ((t = in.read()) != -1)
606. {
607. file.write(t);
608. }
609. file.close();
610. if(new File(s).length()<=10*1024)
611. {
612.
613. in.close();
614. new File(s).delete();
615. return 0l;
616. }
617.
618. logEvent("下载了:"+stru);
619. ps.setIntImg(ps.getIntImg()+1);
620. in.close();
621.
622. }catch(Exception e){
623. logEvent("**********************下载图片:"+stru+"超时");
624. }
625. }
626.
627.
628. catch (Exception e)
629. {
630.
631. e.printStackTrace();
632. }
633. finally{
634. sempImg.release();
635. return 1l;
636. }
637.
638. }
639.
640. }
641.
642. /***
643. * 序列化已访问的URL
644. * @author binbin0915
645. *
646. */
647. public class TimeWrite2File implements Runnable
648. {
649. @Override
650. public void run()
651. {
652. while(blnFlag)
653. {
654. try
655. {
656.
657. synchronized(ps)
658. {
659. logEvent("开始序列化URL");
660. UtilSeriz.writeObject(filterUrl,seroutdir);
661. logEvent("结束序列化URL");
662. logEvent("开始序列化图片");
663. UtilSeriz.writeObject(filterImg,seroutdirimg);
664. logEvent("结束序列化图片");
665. logEvent("分析了"+ps.getIntUrl()+"个链接");
666. logEvent("下载了"+ps.getIntImg()+"张图片");
667. }
668. Thread.sleep(600000);
669.
670. }
671. catch (Exception e)
672. {
673. e.printStackTrace();
674. }
675.
676. }
677.
678. }
679.
680. }
681.
682.
683. /***
684. * 分析对应URL网页的线程
685. * @author Administrator
686. *
687. */
688. class ParsePage extends Thread
689. {
690.
691. String url;
692. int iCount=0;
693.
694. public int getiCount() {
695. return iCount;
696. }
697. public void setiCount(int iCount) {
698. this.iCount = iCount;
699. }
700. public String getUrl()
701. {
702. return url;
703. }
704. public void setUrl(String url)
705. {
706. this.url = url;
707. }
708. public ParsePage(String url) {
709. this.url=url;
710. }
711. @Override
712. public void run()
713. {
714. try
715. {
716. sempPage.acquire();
717. // Convert string url to URL object.
718. URL verifiedUrl = verifyUrl(url);
719.
720. // Skip URL if robots are not allowed to access it.
721. if (!isRobotAllowed(verifiedUrl))
722. {
723. Thread.currentThread().stop();
724. }
725.
726.
727. // 增加已处理的URL到crawledList
728. String pageContents="";
729.
730. pageContents = downloadPage(verifiedUrl);
731.
732. logEvent("分析了:"+verifiedUrl);
733. logEvent("待分析URL数:"+toCrawlList.size()+"个");
734.
735.
736. if (pageContents != null && pageContents.length() > 0)
737. {
738. // 从页面中获取有效的链接
739. ArrayList< String> links =retrieveLinks(verifiedUrl, pageContents,limitHost);
740.
741. // 从页面中获取有效的链接
742. ArrayList< String> imglinks =retrieveImgLinks(verifiedUrl, pageContents,limitHost);
743.
744. //添加到图片下载队列
745. if(toCrawlList.size()<100000)
746. {
747. toCrawlList.addAll(links);
748. }
749. else
750. {
751. logEvent("待分析的网页URL超过100000！！！！跳过.......");
752. }
753.
754.
755.
756. for(int i=0;i<imglinks.size();i++)
757. {
758. if(imglinks.get(i).indexOf("http:")!=-1)
759. {
760.
761. iCount++;
762. filterImg.add(imglinks.get(i));
763. ps.setIntImg(ps.getIntImg()+1);
764.
765.
766. int intRetryImg=0;
767. while (sempImg.availablePermits() <= 0)
768. {
769. System.out.println("暂时没有空闲的抓图线程，等待3秒再执行...");
770. try {
771. intRetryImg++;
772. if(intRetryImg==10)
773. {
774. logEvent("抓图"+imglinks.get(i)+"超时");
775. sempImg.release();
776. }
777. Thread.sleep(3000);
778. } catch (InterruptedException e) {
779. e.printStackTrace();
780. }
781. }
782. Thread tempImgThread=new Thread(new ImgDownThread(imglinks.get(i)));
783. execImg.submit(tempImgThread);
784.
785. if((iCount!=1) && (iCount%10==1) )
786. {
787. try
788. {
789. logEvent("图多休息2秒......");
790. Thread.currentThread().sleep(2000);
791. }
792. catch (InterruptedException e)
793. {
794. e.printStackTrace();
795. }
796. }
797.
798.
799. }
800.
801. }
802. }
803. synchronized(arrPar)
804. {
805. arrPar.remove(this);
806. }
807. }
808. catch(Exception e)
809. {
810. e.printStackTrace();
811.
812. }
813. finally
814. {
815. sempPage.release();
816. }
817. }
818.
819. }
820.}

本文来自CSDN博客，转载请标明出处：http://blog.csdn.net/binyao02123202/archive/2010/07/17/5741519.aspx

java 抓图程序的实现相关推荐

自己写的Java抓图程序
公司里要写一个抓图的程序其实和搜索引擎差不多的原理下载分析网页遇到<a>标签继续模拟点击进去再分析该网页遇到<img>就下载该图难点: 1 URL去重采用bloomf ...
java程序ssh置顶_使用shell脚本启动远程(SSH)Java应用程序不会返回本地提示
我见过类似的问题,所有已解决的问题已经解决/不适用. 我在启动Java应用程序的远程计算机中有一个bash脚本.相关的行将是: #!/usr/bin/env bash ... java -cp /fu ...
java 正则 cpu 100_这六个原因真的可以使Java应用程序的CPU使用率飙升到100％吗？...
点击上方的"代码农户的冥想记录",然后选择"设为明星" 高质量文章,及时交付问题 1. 无限while循环会导致CPU使用率飙升吗? 2.经常使用Young ...
java web程序示例_想要建立一些有趣的东西吗？这是示例Web应用程序创意的列表。...
java web程序示例 Interested in learning JavaScript? Get my ebook at jshandbook.com 有兴趣学习JavaScript吗? 在js ...
java 用程序代码解释继承_关于初级java程序员笔试题
关于初级java程序员笔试题 Sun 认证Java程序员考试内容涉及Java所有相关知识.编程概念及applet开发技巧.下面是小编整理的关于初级java程序员笔试题,欢迎大家参考! 第一题:判断题 ...
[Google API](8)构建使用Google API的Java应用程序
Google 搜索引擎建立起了通过 Web 服务接口可用的索引.拼写建议和缓存页面,从而允许所有语言的程序员都能就个人使用存取信息.Google 搜索引擎还提供了 Java API,从而存取数据更为便 ...
用JEP 343打包工具，构建自包含、可安装的Java应用程序
OpenJDK社区发布了JEP 343:打包工具的早期访问版本.JEP 343:打包工具,又名jpackage,是打包自包含Java应用程序和Java运行时环境的新工具.这个基于JavaFX java ...
Java应用程序项目的打包与发行
这里主要是讲解一下怎样将 Java程序打包成独立运行的exe程序包,以下这种方法应该是最佳的解决方案了.NetDuke的EXE程序包了是使用这种方案制作的.在操作步骤上还是比较简单的,而且通用性强. ...
面向 Java 开发人员的 Ajax: 构建动态的 Java 应用程序
面向 Java 开发人员的 Ajax: 构建动态的 Java 应用程序 Ajax 为更好的 Web 应用程序铺平了道路在 Web 应用程序开发中,页面重载循环是最大的一个使用障碍,对于 Java™ ...
Java高级程序员（5年左右）面试的题目集
Java高级程序员(5年左右)面试的题目集 https://blog.csdn.net/fangqun663775/article/details/73614850?utm_source=blogxg ...

java 抓图程序的实现

java 抓图程序的实现相关推荐

最新文章

热门文章