改进了检查,显示给出错误提示信息,匹配模式更通用

<?php
// Tell the client that the response body is HTML encoded as GB2312.
header("content-type: text/html; charset=gb2312");
/**
 * Minimal HTTP client over a raw socket that crawls a paginated site.
 *
 * Starting from one URL, init() downloads the page into ./download/<dir>/,
 * takes the last link inside <ul class="image"> as the "next page", and when
 * that link points back at the current page it jumps to the next group found
 * inside <ul class="page">. The crawl ends by calling die() with a message.
 *
 * Only plain HTTP is spoken: the socket is opened without TLS.
 * NOTE(review): chunked transfer encoding is not decoded; the body is read
 * using Content-Length when the server sends one, otherwise until the
 * connection closes or the read times out.
 */
class HttpWrap
{
    /** Upper bound on consecutive same-host redirects followed for one page. */
    const MAX_REDIRECTS = 10;

    // ---- connection settings ----
    public $timeout = 10;            // seconds; used for connect and for socket reads
    public $status = '';             // last human-readable status / error message
    public $host;
    public $port = 80;               // only overwritten when the URL carries a port
    private $ip;                     // resolved once and reused for later requests
    private $conn;                   // socket resource of the request in flight
    private $path;
    private $query = '';             // query string of the current URL, without '?'
    private $url;
    private $scheme;

    // ---- request settings ----
    public $http_method = 'GET';
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    public $gzip = "gzip";           // non-empty => advertise gzip support
    public $referer;
    public $cookie;                  // string, or array of name => value
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;               // request line of the last request
    private $header;                 // header block of the last request
    public $post_content;

    // ---- response state (reset before every request) ----
    private $redirect;               // absolute redirect target, or null
    private $is_gzip;
    public $response_num;            // status code as captured from the status line
    public $response_header;         // list of raw response header lines
    public $response_body_length = 0;
    public $response_body;
    public $roll_link;               // href of the "next page" link, or false
    public $roll_group;              // not referenced by any method of this class
    public $filename;                // not referenced by any method of this class
    public $encoding;                // not referenced by any method of this class
    public $match_status = 0;        // why the next-page lookup failed

    /**
     * Crawl starting at $url until no further page can be found.
     *
     * This method does not return normally: it ends the script with die(),
     * printing either the connection error or the reason no next page exists.
     * Pages are visited in a loop rather than by recursion so that a long
     * crawl does not grow the call stack.
     *
     * @param string $url Absolute http:// URL of the first page.
     */
    public function init($url)
    {
        $redirects = 0;

        while (true) {
            $this->url = $url;
            $url_pair = parse_url($url);
            if ($url_pair === false || empty($url_pair['host'])) {
                $this->status = "invalid url: " . $url;
                die($this->status);
            }
            $this->host = $url_pair['host'];
            $this->path = isset($url_pair['path']) ? $url_pair['path'] : '';
            $this->query = isset($url_pair['query']) ? $url_pair['query'] : '';
            $this->scheme = isset($url_pair['scheme']) ? $url_pair['scheme'] : 'http';
            if (empty($this->ip)) {
                $this->ip = gethostbyname($this->host);
            }
            if (!empty($url_pair['port'])) {
                $this->port = $url_pair['port'];
            }

            if (!$this->connect()) {
                die($this->status);
            }
            $this->sendRequest();
            // One socket per request: release it before moving to the next page.
            fclose($this->conn);
            $this->conn = null;

            // Follow a redirect only when it stays on the same host.
            if ($this->redirect
                && preg_match("#^http://" . preg_quote($this->host, '#') . "#i", $this->redirect)
            ) {
                if (++$redirects > self::MAX_REDIRECTS) {
                    $this->status = "too many redirects";
                    die($this->status);
                }
                $redirect_parts = parse_url($this->redirect);
                $redirect_path = isset($redirect_parts['path']) ? $redirect_parts['path'] : '';
                $this->referer = $this->host . "/" . $redirect_path;
                $url = $this->redirect;
                continue;
            }
            $redirects = 0;

            if (!$this->roll_link) {
                die("<font color='red'>没有下一页,原因是</font>:" . $this->match_status);
            }

            // The next-page href is relative to the directory of the current URL.
            $next_url = substr($this->url, 0, strripos($this->url, '/') + 1) . $this->roll_link;
            $current_page = strtolower(trim(basename($this->url, '.html')));
            $next_page = strtolower(trim(basename($next_url, '.html')));

            if ($current_page == $next_page) {
                // The "next" link points at this very page: move on to the next group.
                $next_group = $this->getNextGroup($this->response_body);
                if (empty($next_group)) {
                    die("<font color='red'>没有下一组,原因是</font>:" . $this->status);
                }
                echo "<font color='color'>即将采集下一组</font><br />";
                sleep(1);
                $url = $next_group;
            } else {
                $url = $next_url;
            }
        }
    }

    /**
     * Open the socket to the resolved IP and port.
     *
     * @return bool true on success; on failure $this->status holds the reason.
     */
    private function connect()
    {
        $this->conn = fsockopen($this->ip, $this->port, $errno, $errstr, $this->timeout);
        if ($this->conn) {
            // Bound every later read so a silent server cannot block the crawl forever.
            stream_set_timeout($this->conn, (int) $this->timeout);
            $this->status = '链接成功';
            return true;
        }

        // Each case needs its own break, otherwise every error ends up as the default text.
        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
                break;
        }
        return false;
    }

    /**
     * Send one request on the open socket and process the response.
     *
     * Parses the response headers; for a 200 response it also reads the body,
     * stores it on disk and looks up the next-page link. Calls die() when the
     * request cannot be written completely.
     */
    private function sendRequest()
    {
        $this->resetResponseState();

        if (empty($this->path)) {
            $this->path = "/";
        }
        $target = $this->path;
        if ($this->query !== '' && $this->query !== null) {
            $target .= '?' . $this->query;
        }
        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";
        // Built from scratch each time so headers of earlier requests are not resent.
        $this->header = $this->buildRequestHeader();

        $request = $this->cmd_line . $this->header . $this->post_content;
        $len = strlen($request);
        if ($len != fwrite($this->conn, $request, $len)) {
            $this->status = "发送请求failed";
            die($this->status);
        }

        $this->readResponseHeader();

        if ((int) $this->response_num === 200) {
            $this->handleBody();
        } else {
            $this->match_status = "HTTP " . $this->response_num;
        }
    }

    /** Clear everything that describes the previous response. */
    private function resetResponseState()
    {
        $this->redirect = null;
        $this->is_gzip = false;
        $this->response_num = null;
        $this->response_header = [];
        $this->response_body_length = 0;
        $this->response_body = '';
        $this->roll_link = false;
    }

    /**
     * Assemble the request header block, including the terminating blank line.
     *
     * @return string
     */
    private function buildRequestHeader()
    {
        $header = '';
        if (!empty($this->host)) {
            $header .= "Host: " . $this->host . "\r\n";
        }
        if (!empty($this->agent)) {
            $header .= "User-Agent: " . $this->agent . "\r\n";
        }
        if (!empty($this->accept)) {
            $header .= "Accept: " . $this->accept . "\r\n";
        }
        if (!empty($this->gzip)) {
            // Only advertise gzip when the body can actually be decoded afterwards.
            if (function_exists("gzdecode")) {
                $header .= "Accept-encoding: gzip\r\n";
            } else {
                $this->status = "不支持压缩";
            }
        }
        $header .= "Referer: " . (empty($this->referer) ? $this->url : $this->referer) . "\r\n";
        if (!empty($this->accept_language)) {
            $header .= "Accept-Language: " . $this->accept_language . "\r\n";
        }
        if (!empty($this->cookie)) {
            if (is_array($this->cookie)) {
                $pairs = [];
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $header .= "Cookie: " . implode(";", $pairs) . "\r\n";
            } else {
                // The line terminator is required; without it the next header is swallowed.
                $header .= "Cookie: " . $this->cookie . "\r\n";
            }
        }
        if (!empty($this->submit_type)) {
            $header .= "Content-Type: " . $this->submit_type . "\r\n";
        }
        if (!empty($this->post_content)) {
            $header .= "Content-length: " . strlen($this->post_content) . "\r\n";
        }
        if (!empty($this->connection)) {
            $header .= "Connection: " . $this->connection . "\r\n";
        }
        return $header . "\r\n";
    }

    /**
     * Read response header lines up to the blank line and record what they say:
     * status code, redirect target, gzip flag, body length and cookies.
     */
    private function readResponseHeader()
    {
        $cookies = [];

        while (($line = fgets($this->conn, 1024)) !== false) {
            // Status line: keep the status code (200 means success).
            if (preg_match("#^HTTP/[^\s]*\s(.*?)\s#", $line, $status)) {
                $this->response_num = $status[1];
            }

            // Redirect target, matched case-insensitively.
            if (preg_match('#^(?:Location|URI):\s*(.*)$#i', trim($line), $matches)) {
                $target = $matches[1];
                if (!preg_match("#\:\/\/#", $target)) {
                    // No scheme/host in the header: complete it with host and port.
                    $this->redirect = "http://" . $this->host . ":" . $this->port;
                    $this->redirect .= preg_match("|^/|", $target) ? $target : "/" . $target;
                } else {
                    $this->redirect = $target;
                }
            }

            if (preg_match("#^Content-Encoding:\s*gzip#i", $line)) {
                $this->is_gzip = true;
            }
            if (preg_match('#^Content-Length:\s*(\d+)#i', $line, $len)) {
                $this->response_body_length = $len[1];
            }
            // Keep only the name=value part; attributes after ';' are dropped.
            if (preg_match('#^Set-Cookie:\s*([^;\r\n]+)#i', $line, $cookie)) {
                $cookies[] = trim($cookie[1]);
            }

            // A blank line ends the header section.
            if (preg_match("/^\r?\n$/", $line)) {
                break;
            }
            $this->response_header[] = $line;
        }

        if (count($cookies) > 0) {
            $this->cookie = implode('; ', $cookies);
        }
    }

    /**
     * Read the body of a 200 response, decode it, save it and find the next page.
     *
     * The file goes to ./download/<first numeric path segment>/ when the URL
     * has one, otherwise to ./download/<today as Ymd>/, under the URL's basename.
     */
    private function handleBody()
    {
        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }

        $body = $this->readBody();

        if ($this->is_gzip && $body !== '') {
            // gzdecode() understands the gzip container; gzinflate() only raw deflate data.
            $decoded = function_exists("gzdecode") ? gzdecode($body) : false;
            if ($decoded === false) {
                $this->status = "failed to decode gzip body";
            } else {
                $body = $decoded;
            }
        }
        $this->response_body = $body;

        if ($body !== '') {
            if (!is_dir($dirname) && !mkdir($dirname, 0777, true)) {
                $this->status = "failed to create directory " . $dirname;
            } else {
                // Written once and overwritten on re-runs, so files never accumulate duplicates.
                file_put_contents($dirname . '/' . basename($this->url), $body);
            }
        }

        // Padding so that browsers buffering small outputs show progress right away.
        echo str_repeat("  ", 2048);
        echo "对链接" . $this->url . "发起请求<br />";
        $this->getRollLink($this->response_body);
    }

    /**
     * Read the complete response body from the socket.
     *
     * With a Content-Length the read stops at exactly that many bytes, so a
     * kept-alive connection does not block; without one it reads until the
     * connection closes or the read timeout expires.
     *
     * @return string Raw (possibly still compressed) body.
     */
    private function readBody()
    {
        $expected = (int) $this->response_body_length;
        $body = '';

        while (!feof($this->conn)) {
            $want = 8192;
            if ($expected > 0) {
                $remaining = $expected - strlen($body);
                if ($remaining <= 0) {
                    break;
                }
                $want = min($want, $remaining);
            }
            $chunk = fread($this->conn, $want);
            if ($chunk === false || $chunk === '') {
                break;
            }
            $body .= $chunk;
        }
        return $body;
    }

    /**
     * Find the "next page" link: the href of the last <a> inside <ul class="image">.
     *
     * Sets $this->roll_link to the href, or to false with the reason stored in
     * $this->match_status.
     *
     * @param string $content HTML of the current page.
     * @param string $flag    Unused.
     */
    private function getRollLink($content, $flag = '')
    {
        $this->roll_link = false;

        if (!preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->match_status = "匹配image分页组失败";
            return;
        }
        if (!preg_match_all('#<a\s+[^>]+?>.*?</a>#is', $match[0], $items)) {
            $this->match_status = "匹配<a>组失败";
            return;
        }

        $next_page = $items[0][count($items[0]) - 1];
        if (preg_match('#<a\s+href="([^"]+)"\s*>#i', $next_page, $link)) {
            $this->roll_link = $link[1];
        } else {
            $this->match_status = "匹配下一页失败";
        }
    }

    /**
     * Work out the URL of the next group from the links in <ul class="page">.
     *
     * With two links, the one whose numeric basename is smaller is chosen;
     * with a single link, that link is used. Only hrefs starting with ".."
     * are accepted.
     * NOTE(review): a lone link is followed as-is even though the original
     * author's comment calls this case "the last group, no next group" —
     * confirm against the target site's markup.
     *
     * @param string $content HTML of the current page.
     * @return string|null Absolute URL, or null with the reason in $this->status.
     */
    private function getNextGroup($content)
    {
        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#si', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) == 2) {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            $choice = intval($first) < intval($second) ? $first : $second;

            foreach ($links as $item) {
                // Strict comparison: a match at offset 0 is still a match.
                if ($choice !== ''
                    && strripos($item, $choice) !== false
                    && substr($item, 0, 2) == '..'
                ) {
                    return $this->buildGroupUrl($item);
                }
            }
        } elseif (count($links) == 1 && substr($links[0], 0, 2) == '..') {
            return $this->buildGroupUrl($links[0]);
        }

        $this->status = "failed to resolve next group link";
        return null;
    }

    /**
     * Turn a "../dir/page.html" href into an absolute URL below the first
     * path segment of the current URL.
     *
     * @param string $link Href starting with "..".
     * @return string
     */
    private function buildGroupUrl($link)
    {
        $sub_path = explode('/', $this->path);
        $segment = isset($sub_path[1]) ? $sub_path[1] : '';
        return $this->scheme . '://' . $this->host . '/' . $segment . substr($link, 2);
    }
}
// Flush output after every echo so crawl progress is shown while it runs.
ob_implicit_flush(true);
// The crawl visits page after page, so lift the script execution time limit.
set_time_limit(0);
// First page of the crawl; later pages are discovered from each page's links.
$url = "http://www.mmkao.com/Beautyleg/201412/7066_2.html";
$http = new HttpWrap();
// Does not return: init() ends the script with die() once no next page is found.
$http->init($url);
?>

php网页采集 修改版相关推荐

  1. php网页采集 修正版

    修正了在第一组,没有上一组的问题.实现了单次配置,自动化采集,不需人为监控,会自动分类的把HTML网页采集到对应的分类目录,对HTML网页进行过滤和提取就比较简单了 <?php header(& ...

  2. 隐藏探针显示php版本号,修改版雅黑PHP探针 支持PHP7+(v0.4.7.2)

    雅黑 PHP 探针用于 Linux 系统(不推荐使用于 Windows 系统),每秒更新,不用刷网页.可以实时查看服务器硬盘资源.内存占用.网卡流量.系统负载.服务器时间等信息,1 秒钟刷新一次. 可 ...

  3. phpcms9.6 ueditor_PHPCMS V9.6.6 修改版

    本帖最后由 zhaoxunzhiyin 于 2020-12-12 18:14 编辑 PHPCMS V9.6.6 修改版官方改了后台界面,修复bug 安装文件不检查index.html 安装完删除安装目 ...

  4. php7米酷cms,米酷CMS6.2修改版 支持PHP7 独家首发 - 百码云

    修改版介绍 米酷CMS在8月已经更新到了6.2版本,已经解决了首页加载速度缓慢的问题,我对比了6.0版本和6.2版本的代码,改动其实有点多,但不知道为什么不支持PHP7,单从PHP版本来说,PHP7比 ...

  5. 一个神奇的资源网站「有趣网站收藏家」共有186个站点资源-北忘山修改版

    一个神奇的资源网站「有趣网站收藏家」共有186个站点资源-北忘山修改版 网站介绍 这个网站有点东西,目前收集了186个资源,小北当时拔下来之后发现都是作者纯html手写的,意思就是每每添加网站都是手动 ...

  6. H5小游戏从修改到发布--无编程基础修改版

    H5小游戏从修改到发布–无编程基础修改版 合成哈工大 流程来源于程序员鱼皮合成大西瓜魔改版 原教程链接 合成大西瓜源码 素材获取:h5小游戏源码 注册登录,直接下载压缩包版本的源代码,网页还有其他游戏 ...

  7. fc天使之翼2020修改版下载_海岛奇兵无限钻石修改版下载-海岛奇兵无限钻石修改版下载2020...

    海岛奇兵无限钻石修改版绝对是小伙伴们都在寻找的资源,修改众多资源,玩家可以解锁不同兵种建立更强大的部队啦!通过塔防与掠夺不断强化我方战力,合理运用策略至关重要,喜欢的朋友千万别错过! 修改无限钻石.金 ...

  8. Python爬虫之reuqests实现简单网页采集--网页采集教程

    我们介绍了一种新的爬取网页的方法–reuqests,并介绍了它的使用方法,我们还介绍了urllib与reuqests的区别.这节课我们通过一个实例–reuqests实现简单网页采集来加深大家对reuq ...

  9. 【转】修改版WinXP集体歇业避免遭遇调查

    来自:新浪科技   立雄 20日,有消息披露番茄花园作者洪磊确认被拘的消息后,在业界引发了强烈的反响.22日凌晨,记者再次尝试登陆这些版本的官方下载网站时发现,他们已经集体歇业,或关闭提供软件下载的官 ...

最新文章

  1. 非确定性算法_带你从不同角度了解强化学习算法的分类
  2. antd 嵌套子表格_大型前端项目架构优化探索之路腾讯文档表格
  3. 使用pytz模块进行时区转换及时间计算
  4. iOS 限制输入字数完美解决方案
  5. 详解SpringMVC中Controller的方法中参数的工作原理[附带源码分析]
  6. JavaScript原生对象属性和方法详解——Date对象
  7. 【工程项目经验】函数编译可见性
  8. 抓linux肉鸡教程视频,抓肉鸡的教程和软件免费分享(2018一天抓1000只电脑肉鸡视频)...
  9. 驱动精灵w8ndows xp sp2,惠普打印机驱动官方正式版下载,适用于winxp,win2003,winvista,win7,win8,win10,win2008,win2012-驱动精灵...
  10. 如何在线批量将JPG图片转Word文件
  11. 【PC】如何让程序开机自启动/如何打开开机自启动文件夹
  12. 掌握Python核心编程的四大神兽,何尝拿不到高薪资
  13. 清微智能CTO欧阳鹏:架构创新是通往高性能计算芯片必由之路|量子位·视点分享回顾...
  14. Android之模仿微信登陆界面(一)
  15. uniapp全局数据(全局url、全局openId)
  16. 区块链工程师薪资竟然被AI吊打?最新出炉的《2018区块链招聘分析报告》,释放了哪些重要信号?...
  17. 模糊神经网络2--基于ANFIS的混沌时间序列预测
  18. 2022年全球市场360度手机摄像头总体规模、主要生产商、主要地区、产品和应用细分研究报告
  19. 11-04Physics-Aware Learning-based Longitudinal Vehicle Trajectory Prediction in Congested Traffic
  20. P1823 [COI2007] Patrik 音乐会的等待(单调栈)

热门文章

  1. 小奇画画(线段树+map)(水题)
  2. 饮冰三年-人工智能-Python-17Python基础之模块与包
  3. 联考事业单位计算机类面试,2018年5.26事业单位联考E类常见面试题(下)
  4. Flutter2.10开始支持Windows
  5. Lazy and Hungry
  6. Position Calc TdPositionCanClose Error
  7. Moby_Dick.txt
  8. 《巴黎协定》生效 越南和印尼有望至2020年各新增5GW光伏容量
  9. svg-captcha验证码识别,成功率100%
  10. CentOS 7使用samba共享文件夹