php网页采集相对高效版

相对前面写的版本，极大地减小了IO开销，也减少了对主机名的解析次数

<?php
// Declare the response as UTF-8 HTML before any output is sent.
header("content-type: text/html; charset=utf-8");
/**
 * Minimal HTTP/1.x client over a raw TCP socket that crawls a paginated gallery.
 *
 * Starting from one page it downloads the page into ./download/<dir>/, looks for the
 * "下一页" (next page) link inside <ul class="image">, and follows it. When the next
 * page link points back at the current page it switches to the next group found in
 * <ul class="page">. Only plain "http" targets are supported: the socket is opened
 * without TLS.
 */
class HttpWrap
{
    /** Connect and read timeout in seconds. */
    public $timeout = 10;
    /** Human readable description of the last connection/parse outcome. */
    public $status = '';
    public $host;
    public $port = 80;
    private $ip;
    /** Host name that $ip was resolved for, so a host change triggers a new lookup. */
    private $ip_host;
    private $conn;
    private $path;
    private $query = '';
    private $url;
    private $scheme;
    public $http_method = 'GET';
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    public $gzip = "gzip";
    public $referer;
    /** Either a ready-made cookie string or an array of name => value pairs. */
    public $cookie;
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;
    private $header;
    public $post_content;
    private $redirect;
    private $is_gzip;
    private $is_chunked = false;
    private $has_length = false;
    /** Upper bound on consecutive redirects, to stop redirect loops. */
    private $max_redirects = 10;
    public $response_num;
    public $response_header;
    public $response_body_length = 0;
    public $response_body;
    public $roll_link;
    public $roll_group;
    public $filename;
    /** Source encoding of the pages; detected per page unless the caller presets it. */
    public $encoding;
    /** True once $encoding was filled in by detection rather than by the caller. */
    private $encoding_auto = false;

    /**
     * Crawl starting at $url until no further page can be found.
     *
     * This method never returns: like the original implementation it terminates the
     * script with die() once the crawl ends, or when the target cannot be reached.
     *
     * @param string $url Absolute http:// URL of the first page.
     */
    public function init($url)
    {
        $redirects = 0;

        // Iterative instead of recursive so that long galleries cannot exhaust the stack.
        while (true) {
            if (!$this->prepareTarget($url) || !$this->connect()) {
                die($this->status);
            }
            $this->sendRequest();
            $this->disconnect();

            // Follow a redirect only when it stays on the same host over plain http.
            if ($this->redirect
                && $redirects < $this->max_redirects
                && $this->isSameHost($this->redirect)
            ) {
                $target = parse_url($this->redirect);
                $this->referer = $this->host . (isset($target['path']) ? $target['path'] : '/');
                $url = $this->redirect;
                $redirects++;
                continue;
            }
            $redirects = 0;

            if (!$this->roll_link) {
                break;
            }

            $next_url = substr($this->url, 0, strripos($this->url, '/') + 1) . $this->roll_link;
            $current_page = strtolower(trim(basename($this->url, '.html')));
            $next_page = strtolower(trim(basename($next_url, '.html')));

            if ($current_page === $next_page) {
                // The "next page" link points at the current page: this group is finished.
                $next_group = $this->getNextGroup($this->response_body);
                if (empty($next_group)) {
                    break;
                }
                echo "<font color='color'>即将采集下一组</font><br />";
                sleep(1);
                $url = $next_group;
            } else {
                $url = $next_url;
            }
        }

        die('没有下一页');
    }

    /**
     * Split $url into its parts and reset all per-request state.
     *
     * @param mixed $url
     * @return bool False (with $status set) when the URL has no host.
     */
    private function prepareTarget($url)
    {
        $parts = is_string($url) ? parse_url($url) : false;
        if (!is_array($parts) || empty($parts['host'])) {
            $this->status = "invalid url";
            return false;
        }

        $this->url = $url;
        $this->host = $parts['host'];
        $this->scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        $this->path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        $this->query = isset($parts['query']) ? $parts['query'] : '';
        if (!empty($parts['port'])) {
            $this->port = $parts['port'];
        }

        // Resolve each host once; gethostbyname() returns its argument unchanged on failure.
        if (empty($this->ip) || $this->ip_host !== $this->host) {
            $this->ip = gethostbyname($this->host);
            $this->ip_host = $this->host;
        }

        // Nothing from the previous response may leak into this one.
        $this->redirect = null;
        $this->is_gzip = false;
        $this->is_chunked = false;
        $this->has_length = false;
        $this->roll_link = false;
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;
        $this->response_body = '';

        return true;
    }

    /**
     * Tell whether $url is a plain http URL on the host currently being crawled.
     *
     * @param string $url
     * @return bool
     */
    private function isSameHost($url)
    {
        $parts = parse_url($url);

        return is_array($parts)
            && isset($parts['scheme'], $parts['host'])
            && strtolower($parts['scheme']) === 'http'
            && strcasecmp($parts['host'], $this->host) === 0;
    }

    /**
     * Open the TCP connection to the resolved address.
     *
     * @return bool True on success; on failure $status describes the error.
     */
    private function connect()
    {
        $errno = 0;
        $errstr = '';
        $this->conn = fsockopen($this->ip, $this->port, $errno, $errstr, $this->timeout);

        if ($this->conn) {
            // Bound every read so a silent server cannot block the crawl forever.
            stream_set_timeout($this->conn, (int) $this->timeout);
            $this->status = '链接成功';
            return true;
        }

        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
        }

        return false;
    }

    /** Close the socket of the finished request, if one is open. */
    private function disconnect()
    {
        if (is_resource($this->conn)) {
            fclose($this->conn);
        }
        $this->conn = null;
    }

    /**
     * Send the request, parse the response headers and, for a 200 response,
     * read, decode and store the body and look for the next page link.
     */
    private function sendRequest()
    {
        $request = $this->buildRequest();
        $len = strlen($request);
        if (fwrite($this->conn, $request, $len) !== $len) {
            $this->status = "发送请求failed";
            return;
        }

        $this->readResponseHeaders();

        if ((int) $this->response_num !== 200) {
            return;
        }

        $body = $this->readBody();
        if ($this->is_gzip && function_exists('gzdecode')) {
            // "Content-Encoding: gzip" carries a gzip header, which gzinflate() cannot parse.
            $decoded = gzdecode($body);
            if ($decoded === false) {
                $this->status = "failed to decode gzip body";
            } else {
                $body = $decoded;
            }
        }
        $this->response_body = $body;
        $this->saveBody($body);

        echo str_repeat("  ", 2048);
        echo "对链接" . htmlspecialchars($this->url, ENT_QUOTES, 'UTF-8') . "发起请求<br />";
        $this->getRollLink($this->response_body);
    }

    /**
     * Assemble request line, headers and optional body for the current target.
     *
     * @return string The complete request as written to the socket.
     */
    private function buildRequest()
    {
        $target = $this->path;
        if ($this->query !== '') {
            $target .= '?' . $this->query;
        }
        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";

        // Rebuilt from scratch for every request; appending would duplicate headers.
        $header = '';
        if (!empty($this->host)) {
            $host = $this->host;
            if ((int) $this->port !== 80) {
                $host .= ':' . $this->port;
            }
            $header .= "Host: " . $host . "\r\n";
        }
        if (!empty($this->agent)) {
            $header .= "User-Agent: " . $this->agent . "\r\n";
        }
        if (!empty($this->accept)) {
            $header .= "Accept: " . $this->accept . "\r\n";
        }
        if (!empty($this->gzip)) {
            if (function_exists('gzdecode')) {
                $header .= "Accept-encoding: gzip\r\n";
            } else {
                $this->status = "不支持压缩";
            }
        }
        if (!empty($this->referer)) {
            $header .= "Referer: " . $this->referer . "\r\n";
        }
        if (!empty($this->accept_language)) {
            $header .= "Accept-Language: " . $this->accept_language . "\r\n";
        }
        if (!empty($this->cookie)) {
            if (is_array($this->cookie)) {
                $pairs = array();
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $header .= "Cookie: " . implode('; ', $pairs) . "\r\n";
            } else {
                $header .= "Cookie: " . $this->cookie . "\r\n";
            }
        }
        if (!empty($this->submit_type)) {
            $header .= "Content-Type: " . $this->submit_type . "\r\n";
        }
        if (!empty($this->post_content)) {
            $header .= "Content-length: " . strlen($this->post_content) . "\r\n";
        }
        if (!empty($this->connection)) {
            $header .= "Connection: " . $this->connection . "\r\n";
        }
        $header .= "\r\n";
        $this->header = $header;

        return $this->cmd_line . $this->header . $this->post_content;
    }

    /**
     * Read header lines up to the blank separator line and record status code,
     * redirect target, body length and transfer/content encoding.
     */
    private function readResponseHeaders()
    {
        while (($line = fgets($this->conn, 8192)) !== false) {
            if (preg_match("/^\r?\n$/", $line)) {
                break;
            }
            $this->response_header[] = $line;
            $field = trim($line);

            if (preg_match('#^HTTP/\S+\s+(\d+)#', $field, $m)) {
                $this->response_num = $m[1];
            } elseif (preg_match('#^(?:Location|URI):\s*(\S.*)$#i', $field, $m)) {
                $location = $m[1];
                if (preg_match('#://#', $location)) {
                    $this->redirect = $location;
                } else {
                    // Host-less redirect: complete it with the current host and port.
                    $prefix = "http://" . $this->host . ":" . $this->port;
                    $this->redirect = $prefix . (substr($location, 0, 1) === '/' ? '' : '/') . $location;
                }
            } elseif (preg_match('#^Content-Encoding:\s*(?:x-)?gzip#i', $field)) {
                $this->is_gzip = true;
            } elseif (preg_match('#^Content-Length:\s*(\d+)#i', $field, $m)) {
                $this->response_body_length = (int) $m[1];
                $this->has_length = true;
            } elseif (preg_match('#^Transfer-Encoding:.*\bchunked\b#i', $field)) {
                $this->is_chunked = true;
            }
        }
    }

    /**
     * Read the response body according to the framing announced in the headers.
     *
     * @return string Raw (possibly still compressed) body bytes.
     */
    private function readBody()
    {
        if ($this->is_chunked) {
            return $this->readChunkedBody();
        }
        if ($this->has_length) {
            return $this->readExact($this->response_body_length);
        }

        // No framing information: read until the server closes or the read times out.
        $body = '';
        while (!feof($this->conn)) {
            $piece = fread($this->conn, 8192);
            if ($piece === false || $piece === '') {
                break;
            }
            $body .= $piece;
        }

        return $body;
    }

    /**
     * Read up to $length bytes, stopping early only on EOF, error or timeout.
     *
     * @param int $length
     * @return string
     */
    private function readExact($length)
    {
        $data = '';
        $remaining = (int) $length;
        while ($remaining > 0 && !feof($this->conn)) {
            $piece = fread($this->conn, min($remaining, 8192));
            if ($piece === false || $piece === '') {
                break;
            }
            $data .= $piece;
            $remaining -= strlen($piece);
        }

        return $data;
    }

    /**
     * Decode a body sent with "Transfer-Encoding: chunked".
     *
     * @return string The concatenated chunk payloads.
     */
    private function readChunkedBody()
    {
        $body = '';
        while (($line = fgets($this->conn, 1024)) !== false) {
            // A chunk header is "<hex size>[;extensions]".
            $size_field = trim($line);
            $semicolon = strpos($size_field, ';');
            if ($semicolon !== false) {
                $size_field = trim(substr($size_field, 0, $semicolon));
            }
            if ($size_field === '' || !ctype_xdigit($size_field)) {
                break;
            }

            $size = (int) hexdec($size_field);
            if ($size === 0) {
                // Last chunk: skip optional trailer fields up to the closing blank line.
                while (($trailer = fgets($this->conn, 1024)) !== false) {
                    if (preg_match("/^\r?\n$/", $trailer)) {
                        break;
                    }
                }
                break;
            }

            $body .= $this->readExact($size);
            fgets($this->conn, 3); // CRLF that terminates the chunk data
        }

        return $body;
    }

    /**
     * Store $body under ./download/<dir>/<page name>, where <dir> is the first
     * all-digit path segment of the URL or today's date.
     *
     * @param string $body
     * @return bool False (with $status set) when the file could not be written.
     */
    private function saveBody($body)
    {
        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }

        if (!is_dir($dirname) && !mkdir($dirname, 0777, true) && !is_dir($dirname)) {
            $this->status = "failed to create directory";
            return false;
        }

        $name = basename($this->path);
        if ($name === '') {
            $name = 'index.html';
        }
        $filename = $dirname . '/' . $name;

        // Written in one go (no FILE_APPEND) so a re-run replaces the file instead of growing it.
        if (file_put_contents($filename, $body) === false) {
            $this->status = "failed to write file";
            return false;
        }
        $this->filename = $filename;

        return true;
    }

    /**
     * Return $body as UTF-8.
     *
     * A caller supplied $encoding is honoured; otherwise the encoding is detected
     * for every page from the whole body. UTF-8 is tried first because its strict
     * validation rarely accepts text in the other candidate encodings.
     *
     * @param string $body
     * @return string
     */
    private function toUtf8($body)
    {
        $body = (string) $body;

        if (empty($this->encoding) || $this->encoding_auto) {
            $detected = mb_detect_encoding($body, array('UTF-8', 'GBK', 'BIG5', 'LATIN1'), true);
            $this->encoding = ($detected === false) ? 'UTF-8' : $detected;
            $this->encoding_auto = true;
        }

        if (strcasecmp($this->encoding, 'UTF-8') === 0) {
            return $body;
        }

        return mb_convert_encoding($body, 'UTF-8', $this->encoding);
    }

    /**
     * Look for the "下一页" link inside <ul class="image"> and store its href in
     * $roll_link; $roll_link is false when the page has no such link.
     *
     * @param string $filename Page body (the name is kept from the original signature).
     */
    private function getRollLink($filename)
    {
        $this->roll_link = false;
        $content = $this->toUtf8($filename);

        if (preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)
            && preg_match('#<a\s+href="([^"]+?)">下一页</a>#ui', $match[0], $next)
        ) {
            $this->roll_link = trim($next[1]);
        }
    }

    /**
     * Work out the URL of the next group from the links inside <ul class="page">.
     *
     * With two links the one whose file name has the smaller numeric value is chosen;
     * with a single link that link is used. Only hrefs starting with ".." are followed.
     * NOTE(review): with a single link this may lead back to the group just visited —
     * confirm against the site's markup.
     *
     * @param string $filename Page body (the name is kept from the original signature).
     * @return string|null Absolute URL of the next group, or null when none was found.
     */
    private function getNextGroup($filename)
    {
        $content = $this->toUtf8($filename);

        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#usi', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) === 2) {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            $choice = (intval($first) < intval($second)) ? $first : $second;
            if ($choice === '') {
                return null;
            }
            foreach ($links as $item) {
                if (stripos($item, $choice) !== false && substr($item, 0, 2) === '..') {
                    return $this->groupUrl($item);
                }
            }
        } elseif (count($links) === 1 && substr($links[0], 0, 2) === '..') {
            return $this->groupUrl($links[0]);
        }

        return null;
    }

    /**
     * Turn a "../dir/page.html" href into an absolute URL below the first path
     * segment of the page currently being crawled.
     *
     * @param string $href Href starting with "..".
     * @return string
     */
    private function groupUrl($href)
    {
        $segments = explode('/', (string) $this->path);
        $section = isset($segments[1]) ? $segments[1] : '';

        return $this->scheme . '://' . $this->host . '/' . $section . substr($href, 2);
    }
}
// Push every echo to the browser immediately so crawl progress is visible while it runs.
ob_implicit_flush(true);
// The crawl runs until the last page is reached, so lift the execution time limit.
set_time_limit(0);

// First page of the gallery to crawl.
$url = "http://www.mmkao.com/Beautyleg/201412/7066.html";

$http = new HttpWrap();
$http->cookie = "safedog-flow-item=41E2DBFEF121A8A2835ADB4476E5D3EC";
$http->referer = "www.mmkao.com";
// Does not return: init() ends the script with die() once no next page is left.
$http->init($url);
?>

php网页采集相对高效版相关推荐

高效采集互联网信息，用绿色版网页采集器
随着信息时代的到来,人们对于信息的需求越来越大.而互联网上的信息量巨大,如何快速.准确地获取需要的信息成为了一项重要的技能.而网页采集器便是一种能够帮助我们实现这一目标的工具.在本文中,我们将介绍一款 ...
Fiddler 网页采集抓包利器__手机app抓包
用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示: 基于weiphp做了一个掌上头条插件,也是用的网页采集技术:和一个创业 ...
【Fiddler】网页采集必备抓包利器
最近这段时间,网页采集方面的工作做得比较多.用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示:基于weiphp做了一个掌上 ...
网页采集dz文章（文章采集网站源码）
随着互联网的发展,网页采集成为了获取信息的一种重要方法.而对于DZ(Discuz)论坛上的文章,如何高效地进行采集呢?本文将为你详细介绍. 概述: 在进行网页采集之前,我们需要明确目标,即要采集的DZ ...
Python爬虫之requests实现简单网页采集--网页采集教程
我们介绍了一种新的爬取网页的方法–requests,并介绍了它的使用方法,我们还介绍了urllib与requests的区别.这节课我们通过一个实例–requests实现简单网页采集来加深大家对requ ...
js变量显示在html中_PythonDjango+JS+Ajax实现网页采集并动态显示PLC变量
Python-snap7与S7-1500的通讯可以查阅前几篇.篇末演示功能实现的效果. 利用PythonDjango+JS+Ajax技术来实现:网页发送读写请求至服务器,服务器写入数据至PLC,并读取 ...
使用cpolar发布群晖NAS上的网页上篇（7.X版）
系列文章使用cpolar发布群晖NAS上的网页上篇(7.X版) 使用cpolar发布群晖NAS上的网页中篇(7.X版) 使用cpolar发布群晖NAS上的网页下篇(7.X版) 随着网络科技和硬 ...
excel计算式自动计算_想要高效工作？62套自动计算EXCEL表格，结果准确速度快！...
想要高效工作?62套自动计算EXCEL表格,结果准确速度快! 随着社会的整体发展节奏加快,人们的生活和工作节奏也随之加快!尤其是在职场上,高效工作是每个职场人员必须不断锻炼并掌握的专业技能.作为电气工 ...
php出入库单生成源码,php 网页采集入库程序代码
网页采集现在用到最多是工具了,像最受站长欢迎的就是火车头了,但有一些站长喜欢使用网页来自定义采集了,下面一起来看一个php 网页采集入库程序代码 php 网页采集程序总结,最近帮朋友做了个采集程序以 ...

php网页采集想对高效版

php网页采集想对高效版相关推荐

最新文章

热门文章

php网页采集 想对高效版

php网页采集 想对高效版相关推荐

最新文章

热门文章

php网页采集想对高效版

php网页采集想对高效版相关推荐