这里使用到了一个 PHP 插件，下面是它的源码 simple_html_dom.php：

<?php

// NOTE(review): the result of defined() is discarded, so this statement has no
// effect as written - confirm whether a guard (e.g. "or die") was intended.
defined('IN_ECS');

// Node types, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);

define('HDOM_TYPE_COMMENT', 2);

define('HDOM_TYPE_TEXT', 3);

define('HDOM_TYPE_ENDTAG', 4);

define('HDOM_TYPE_ROOT', 5);

define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles, recorded per attribute under HDOM_INFO_QUOTE.
define('HDOM_QUOTE_DOUBLE', 0);

define('HDOM_QUOTE_SINGLE', 1);

define('HDOM_QUOTE_NO', 3);

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);

define('HDOM_INFO_END', 1);

define('HDOM_INFO_QUOTE', 2);

define('HDOM_INFO_SPACE', 3);

define('HDOM_INFO_TEXT', 4);

define('HDOM_INFO_INNER', 5);

define('HDOM_INFO_OUTER', 6);

define('HDOM_INFO_ENDSPACE',7);

// helper functions

// -----------------------------------------------------------------------------

// get html dom from file
// All arguments are forwarded unchanged to file_get_contents(); the fetched
// markup is parsed with lower-casing of tag/attribute names enabled.
function file_get_html() {
    $params = func_get_args();
    $markup = call_user_func_array('file_get_contents', $params);

    $parser = new simple_html_dom;
    $parser->load($markup, true);

    return $parser;
}

// get html dom from string
// $str is the markup to parse; $lowercase is handed to simple_html_dom::load().
function str_get_html($str, $lowercase=true) {
    $parser = new simple_html_dom;
    $parser->load($str, $lowercase);

    return $parser;
}

// dump html dom tree
// Prints one line per node (tag name, optionally followed by its attributes),
// indenting each nesting level by one more space, then recurses into $node->nodes.
function dump_html_tree($node, $show_attr=true, $deep=0) {
    $line = str_repeat(' ', $deep) . $node->tag;

    if ($show_attr && count($node->attr) > 0) {
        $line .= '(';
        foreach (array_keys($node->attr) as $attr_name) {
            // the value is read through the node's property access, not from $attr directly
            $line .= '[' . $attr_name . ']=>"' . $node->$attr_name . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    foreach ($node->nodes as $child) {
        dump_html_tree($child, $show_attr, $deep + 1);
    }
}

// get dom from file (deprecated)
// Kept for old callers; does exactly what file_get_html() does.
function file_get_dom() {
    $params = func_get_args();

    return call_user_func_array('file_get_html', $params);
}

// get dom from string (deprecated)
// Kept for old callers; does exactly what str_get_html() does.
function str_get_dom($str, $lowercase=true) {
    return str_get_html($str, $lowercase);
}

// simple html dom node

// -----------------------------------------------------------------------------

class simple_html_dom_node {

// Node kind, one of the HDOM_TYPE_* constants; a fresh node starts out as text.
public $nodetype = HDOM_TYPE_TEXT;

// Tag name; a fresh node carries the pseudo name 'text'.
public $tag = 'text';

// Attribute map: name => value.
public $attr = array();

// Nodes the parser linked under this one with $is_child = true.
public $children = array();

// Every node linked under this one, including those not listed in $children.
public $nodes = array();

// Parent node, or null.
public $parent = null;

// Per-node bookkeeping, indexed by the HDOM_INFO_* constants.
public $_ = array();

// Owning simple_html_dom instance; set by the constructor, reset to null by clear().
private $dom = null;

// Attach the node to its document and append it to the document's flat node list.
public function __construct($dom) {
    $this->dom = $dom;
    $dom->nodes[] = $this;
}

// Break references when the node is destroyed.
public function __destruct() {
    $this->clear();
}

// A node used as a string renders as its outer text.
public function __toString() {
    return $this->outertext();
}

// clean up memory due to php5 circular references memory leak...
public function clear() {
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}

// dump node's tree
public function dump($show_attr=true) {
    dump_html_tree($this, $show_attr);
}

// returns the parent of node
public function parent() {
    return $this->parent;
}

// returns children of node: the whole list for $idx === -1, otherwise the
// child at $idx or null when there is none
public function children($idx=-1) {
    if ($idx === -1) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}

// returns the first child of node, or null
public function first_child() {
    return (count($this->children) > 0) ? $this->children[0] : null;
}

// returns the last child of node, or null
public function last_child() {
    $total = count($this->children);

    return ($total > 0) ? $this->children[$total - 1] : null;
}

// returns the next sibling of node, or null when the node has no parent or
// is the last child
public function next_sibling() {
    if ($this->parent === null) return null;

    $siblings = $this->parent->children;
    $count = count($siblings);

    // locate this node among its parent's children (identity comparison)
    $idx = 0;
    while ($idx < $count && $this !== $siblings[$idx])
        ++$idx;

    if (++$idx >= $count) return null;

    return $siblings[$idx];
}

// returns the previous sibling of node, or null when the node has no parent
// or is the first child
public function prev_sibling() {
    if ($this->parent === null) return null;

    $siblings = $this->parent->children;
    $count = count($siblings);

    // locate this node among its parent's children (identity comparison)
    $idx = 0;
    while ($idx < $count && $this !== $siblings[$idx])
        ++$idx;

    if (--$idx < 0) return null;

    return $siblings[$idx];
}

// get dom node's inner html
// Precedence: an explicitly assigned inner text, then the raw text of a
// text-like node (with noise restored), then the rendered outer text of
// every linked node.
public function innertext() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $pieces = array();
    foreach ($this->nodes as $child) {
        $pieces[] = $child->outertext();
    }

    return implode('', $pieces);
}

// get dom node's outer text (with tag)
// The root renders as its inner text. For other nodes the document callback
// (if any) runs first, then an explicitly assigned outer text or the raw text
// of a text-like node wins; otherwise the begin tag, the content and - when
// the node has a recorded end - the closing tag are rendered.
public function outertext() {
    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom->callback!==null)
        call_user_func_array($this->dom->callback, array($this));

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
        $ret .= $this->_[HDOM_INFO_INNER];
    else {
        foreach($this->nodes as $n)
            $ret .= $n->outertext();
    }

    // render end tag (an END marker of 0 means the element has no closing tag)
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';

    return $ret;
}

// get dom node's plain text
// An explicitly assigned inner text wins. Text nodes return their raw text
// with noise restored; comment/unknown nodes and script/style elements
// contribute nothing; anything else concatenates the text of its linked nodes.
public function text() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $kind = $this->nodetype;
    if ($kind == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($kind == HDOM_TYPE_COMMENT || $kind == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    foreach (array('script', 'style') as $skipped_tag) {
        if (strcasecmp($this->tag, $skipped_tag) === 0) {
            return '';
        }
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $child->text();
    }

    return $plain;
}

// get the node's inner text with CDATA section markers removed
public function xmltext() {
    $ret = $this->innertext();

    // drop the opening marker (case-insensitive) and the closing marker
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);

    return $ret;
}

// build node's text with tag
// Text-like nodes return their raw text. Elements are rebuilt as
// "<tag attr=value ...>" using the whitespace and quote style recorded for
// each attribute while parsing.
public function makeup() {
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '<'.$this->tag;
    $i = -1;

    foreach($this->attr as $key=>$val) {
        // $i tracks the attribute's position in the SPACE/QUOTE bookkeeping arrays
        ++$i;

        // skip removed attribute
        if ($val===null || $val===false)
            continue;

        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

        //no value attr: nowrap, checked selected...
        if ($val===true)
            $ret .= $key;
        else {
            switch($this->_[HDOM_INFO_QUOTE][$i]) {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                default: $quote = '';
            }
            $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
        }
    }

    $ret = $this->dom->restore_noise($ret);

    return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}

// find elements by css selector
// $selector may hold several comma-separated selectors, each a chain of
// descendant steps. With $idx === null all matches are returned in document
// order; otherwise the match at $idx (negative counts from the end) or null.
public function find($selector, $idx=null) {
    $selectors = $this->parse_selector($selector);
    if (($count=count($selectors))===0) return array();

    $found_keys = array();

    // find each selector
    for ($c=0; $c<$count; ++$c) {
        // depth of THIS selector chain (not of the first one)
        if (($levle=count($selectors[$c]))===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        $head = array($this->_[HDOM_INFO_BEGIN]=>1);

        // handle descendant selectors, no recursive!
        for ($l=0; $l<$levle; ++$l) {
            $ret = array();
            foreach($head as $k=>$v) {
                // key -1 is the root's begin marker
                $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                $n->seek($selectors[$c][$l], $ret);
            }
            $head = $ret;
        }

        foreach($head as $k=>$v) {
            if (!isset($found_keys[$k]))
                $found_keys[$k] = 1;
        }
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach($found_keys as $k=>$v)
        $found[] = $this->dom->nodes[$k];

    // return nth-element or array
    if (is_null($idx)) return $found;
    else if ($idx<0) $idx = count($found) + $idx;

    return (isset($found[$idx])) ? $found[$idx] : null;
}

// seek for given conditions
// $selector is one parsed step: array(tag, key, val, exp, no_key).
// Indexes of matching nodes below this node are recorded as keys of $ret.
protected function seek($selector, &$ret) {
    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Work out where this node's subtree ends in the flat node list. When the
    // node has no usable end marker, borrow the nearest ancestor's.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent!==null)
            $end += $parent->_[HDOM_INFO_END];
    }

    for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];
        $pass = true;

        // bare "*" only matches direct children
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}

        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            }
            else if (!isset($node->attr[$key])) $pass=false;
        }

        // compare value
        if ($pass && $key && $val && $val!=='*') {
            $check = $this->match($exp, $val, $node->attr[$key]);

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach(explode(' ',$node->attr[$key]) as $k) {
                    $check = $this->match($exp, $val, $k);
                    if ($check) break;
                }
            }

            if (!$check) $pass = false;
        }

        if ($pass) $ret[$i] = 1;
        unset($node);
    }
}

// Test an attribute value against a selector pattern.
// $exp is the operator: '=' / '!=' compare strictly, '^=' / '$=' test a
// literal prefix / suffix, '*=' treats $pattern as a regular expression
// (used verbatim when it starts with '/', otherwise wrapped as /.../i).
// Any other operator yields false.
protected function match($exp, $pattern, $value) {
    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }

    return false;
}

// Split a selector string into a list of selector chains.
// Returns array(chain, ...), one chain per comma-separated selector; every
// chain is a list of steps array(tag, key, val, exp, no_key).
protected function parse_selector($selector_string) {
    // pattern of CSS selectors, modified from mootools
    // The hyphen in the first character class is escaped: PCRE2 (PHP >= 7.3)
    // rejects "\w-:" as an invalid range.
    $pattern = "/([\w\-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;

        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if(!empty($m[2])) {$key='id'; $val=$m[2];}
        if(!empty($m[3])) {$key='class'; $val=$m[3];}
        if(!empty($m[4])) {$key=$m[4];}
        if(!empty($m[5])) {$exp=$m[5];}
        if(!empty($m[6])) {$val=$m[6];}

        // convert to lowercase ($key stays null when the step has no attribute part)
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            if ($key!==null) $key = strtolower($key);
        }

        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);

        // a comma ends the current chain
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }

    if (count($result)>0)
        $selectors[] = $result;

    return $selectors;
}

// Property read: a non-null attribute of that name wins; then the virtual
// properties outertext / innertext / plaintext / xmltext; otherwise a boolean
// telling whether the attribute key exists at all.
public function __get($name) {
    if (isset($this->attr[$name])) {
        return $this->attr[$name];
    }

    if ($name == 'outertext') return $this->outertext();
    if ($name == 'innertext') return $this->innertext();
    if ($name == 'plaintext') return $this->text();
    if ($name == 'xmltext') return $this->xmltext();

    return array_key_exists($name, $this->attr);
}

// Property write: outertext / innertext override the rendered text, any other
// name sets an attribute (new attributes get a single leading space and
// double quotes when rendered).
public function __set($name, $value) {
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}

// isset(): the three text properties always exist; an attribute exists as soon
// as its key is present (covers no-value attributes: nowrap, checked selected...)
public function __isset($name) {
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }

    return array_key_exists($name, $this->attr);
}

// unset(): removes the attribute when it currently holds a non-null value.
public function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}

// camel naming conventions: thin aliases for the methods above
public function getAllAttributes() {
    return $this->attr;
}

public function getAttribute($name) {
    return $this->__get($name);
}

public function setAttribute($name, $value) {
    $this->__set($name, $value);
}

public function hasAttribute($name) {
    return $this->__isset($name);
}

// removal is done by setting the attribute to null, which makeup() skips
public function removeAttribute($name) {
    $this->__set($name, null);
}

public function getElementById($id) {
    return $this->find("#$id", 0);
}

public function getElementsById($id, $idx=null) {
    return $this->find("#$id", $idx);
}

public function getElementByTagName($name) {
    return $this->find($name, 0);
}

public function getElementsByTagName($name, $idx=null) {
    return $this->find($name, $idx);
}

public function parentNode() {
    return $this->parent();
}

public function childNodes($idx=-1) {
    return $this->children($idx);
}

public function firstChild() {
    return $this->first_child();
}

public function lastChild() {
    return $this->last_child();
}

public function nextSibling() {
    return $this->next_sibling();
}

public function previousSibling() {
    return $this->prev_sibling();
}

}

// simple html dom parser

// -----------------------------------------------------------------------------

class simple_html_dom {

// Root node of the parsed tree; created by prepare().
public $root = null;

// Flat list of all nodes; every node appends itself in its constructor.
public $nodes = array();

// Optional callable invoked with a node each time its outer text is rendered.
public $callback = null;

// Whether tag and attribute names are lower-cased while parsing.
public $lowercase = false;

// Current offset into $doc.
protected $pos;

// Document text being parsed (noise already replaced by placeholders).
protected $doc;

// Character at $pos, or null once the end is reached.
protected $char;

// Length of $doc as reported by strlen().
protected $size;

// Running node counter; its value is stored as HDOM_INFO_BEGIN / HDOM_INFO_END.
protected $cursor;

// Node under which newly parsed nodes are linked.
protected $parent;

// Placeholder key => original text taken out by remove_noise().
protected $noise = array();

// Character sets handed to strspn()/strcspn() by the scanner helpers.
protected $token_blank = " \t\r\n";

protected $token_equal = ' =/>';

protected $token_slash = " />\r\n\t";

protected $token_attr = ' >';

// use isset instead of in_array, performance boost about 30%...

// Tags that never become the current parent after their begin tag is read.
protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);

// Tags whose end tag may close still-open descendants (see read_tag()).
protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);

// tag => set of open parent tags that a new begin tag of that name implicitly closes.
protected $optional_closing_tags = array(

'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),

'th'=>array('th'=>1),

'td'=>array('td'=>1),

'li'=>array('li'=>1),

'dt'=>array('dt'=>1, 'dd'=>1),

'dd'=>array('dd'=>1, 'dt'=>1),

'dl'=>array('dd'=>1, 'dt'=>1),

'p'=>array('p'=>1),

'nobr'=>array('nobr'=>1),

);

// Optionally load a document right away: a string starting with "http://" or
// naming an existing file is fetched through load_file(), anything else is
// parsed as markup. An empty argument leaves the object unloaded.
public function __construct($str=null) {
    if (!$str) {
        return;
    }

    $is_location = preg_match("/^http:\/\//i",$str) || is_file($str);
    if ($is_location) {
        $this->load_file($str);
    } else {
        $this->load($str);
    }
}

// Break references when the document is destroyed.
public function __destruct() {
    $this->clear();
}

// load html from string
// Resets the parser, swaps "noise" (comments, CDATA, style/script/code
// content, server-side and smarty blocks) for placeholders so it is not
// tokenized, then parses the whole document.
public function load($str, $lowercase=true) {
    // prepare
    $this->prepare($str, $lowercase);

    // strip out comments
    $this->remove_noise("'<!--(.*?)-->'is");

    // strip out cdata
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

    // strip out <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");

    // strip out <script> tags
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

    // strip out preformatted tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");

    // strip out server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    // strip smarty scripts
    $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

    // parsing
    while ($this->parse());

    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
}

// load html from file
// All arguments are forwarded unchanged to file_get_contents().
public function load_file() {
    $params = func_get_args();
    $markup = call_user_func_array('file_get_contents', $params);

    $this->load($markup, true);
}

// set callback function
public function set_callback($function_name) {
    $this->callback = $function_name;
}

// remove callback function
public function remove_callback() {
    $this->callback = null;
}

// save dom as string; also written to $filepath when one is given
public function save($filepath='') {
    $markup = $this->root->innertext();

    if ($filepath !== '') {
        file_put_contents($filepath, $markup);
    }

    return $markup;
}

// find dom node by css selector (searches from the root)
public function find($selector, $idx=null) {
    return $this->root->find($selector, $idx);
}

// clean up memory due to php5 circular references memory leak...
public function clear() {
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}

// dump the whole tree, starting at the root
public function dump($show_attr=true) {
    $this->root->dump($show_attr);
}

// prepare HTML data and init everything
protected function prepare($str, $lowercase=true) {
    $this->clear();

    // scanner state
    $this->doc = $str;
    $this->size = strlen($str);
    $this->pos = 0;
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
    $this->cursor = 1;
    $this->lowercase = $lowercase;

    // bookkeeping; the node list is reset before the root registers itself in it
    $this->noise = array();
    $this->nodes = array();

    // root node: pseudo tag 'root', begin marker -1
    $root = new simple_html_dom_node($this);
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;

    $this->root = $root;
    $this->parent = $root;
}

// parse html content
// One step of the main loop: text up to the next '<' becomes a text node;
// when there is none, the tag at the cursor is read instead. The return value
// tells load() whether to keep going.
protected function parse() {
    $text = $this->copy_until_char('<');
    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);

    return true;
}

// read tag info
// Consumes one tag starting at the current '<'. End tags close the matching
// open element (or are kept as text when nothing matches); "<!...>" becomes a
// comment/unknown node; malformed tags are kept as text; anything else becomes
// an element node with its attributes. Returns false only when the current
// character is not '<', which ends parsing.
protected function read_tag() {
    if ($this->char!=='<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }

    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/') {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

        // skip blanks between "</" and the tag name (this used to read the
        // undefined property $token_blank_t, so nothing was skipped)
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // not a valid tag name: keep as text (hyphen escaped, PCRE2 rejects "\w-:")
    if (!preg_match("/^[\w\-:]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower]) ) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char!==null && $space[0]==='') break;
        $name = $this->copy_until($this->token_equal);
        if($guard===$this->pos) {
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }

    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    return true;
}

// parse attributes
// Reads the value following "name=" and stores it, together with its quote
// style, on $node. $space[2] receives the blanks found between '=' and the value.
protected function parse_attr($node, $name, &$space) {
    $space[2] = $this->copy_skip($this->token_blank);

    $delimiter = $this->char;
    if ($delimiter === '"' || $delimiter === '\'') {
        $node->_[HDOM_INFO_QUOTE][] = ($delimiter === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;

        // step over the opening quote, copy up to the closing one, step over it
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($delimiter));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return;
    }

    // unquoted value: runs up to the next blank or '>'
    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
    $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
}

// link node's parent
// Hooks $node under the current parent; it is additionally listed among the
// parent's children when $is_child is set.
protected function link_nodes(&$node, $is_child) {
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if ($is_child) {
        $owner->children[] = $node;
    }
}

// as a text node
// Keeps an end tag that closes nothing as literal text ("</tag>") so the
// document round-trips unchanged, then moves past the '>'.
protected function as_text_node($tag) {
    $node = new simple_html_dom_node($this);
    ++$this->cursor;
    $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($node, false);
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    return true;
}

// Advance past any run of characters listed in $chars.
protected function skip($chars) {
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
}

// Advance past any run of characters listed in $chars and return that run.
protected function copy_skip($chars) {
    $from = $this->pos;
    $run = strspn($this->doc, $chars, $from);

    $this->pos += $run;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    return ($run === 0) ? '' : substr($this->doc, $from, $run);
}

// Advance up to the first character listed in $chars and return what was passed.
protected function copy_until($chars) {
    $from = $this->pos;
    $run = strcspn($this->doc, $chars, $from);

    $this->pos += $run;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $from, $run);
}

// Advance to the next occurrence of $char and return the text passed over.
// When $char does not occur again, the rest of the document is returned and
// the scanner is put at the end. Returns '' at end of input or when $char is
// the current character.
protected function copy_until_char($char) {
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;

    return substr($this->doc, $from, $hit - $from);
}

// Like copy_until_char(), but an occurrence of $char that directly follows a
// backslash does not end the scan.
protected function copy_until_char_escape($char) {
    if ($this->char === null) {
        return '';
    }

    $hit = strpos($this->doc, $char, $this->pos);
    while ($hit !== false) {
        if ($hit === $this->pos) {
            return '';
        }

        if ($this->doc[$hit-1] !== '\\') {
            $chunk = substr($this->doc, $this->pos, $hit - $this->pos);
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return $chunk;
        }

        // escaped occurrence: keep looking after it
        $hit = strpos($this->doc, $char, $hit + 1);
    }

    // no unescaped occurrence left: hand back the rest of the document
    $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
    $this->char = null;
    $this->pos = $this->size;

    return $rest;
}

// remove noise from html content
// Every match of $pattern is swapped for a "___noise___NNN" placeholder and
// remembered in $this->noise. With $remove_tag the whole match is taken out,
// otherwise only capture group 1. Matches are processed last-to-first so the
// recorded offsets stay valid while the document is being edited.
protected function remove_noise($pattern, $remove_tag=false) {
    $total = preg_match_all($pattern, $this->doc, $hits, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = ($remove_tag) ? 0 : 1;

    for ($n = $total - 1; $n > -1; --$n) {
        list($text, $offset) = $hits[$n][$group];

        $placeholder = '___noise___'.sprintf('% 3d', count($this->noise)+100);
        $this->noise[$placeholder] = $text;
        $this->doc = substr_replace($this->doc, $placeholder, $offset, strlen($text));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}

// restore noise to html content
// Replaces every "___noise___NNN" placeholder in $text with the original text
// recorded by remove_noise(). A marker that is truncated or has no recorded
// entry is left in place and skipped; previously such a marker made this loop
// spin forever because $text never changed.
public function restore_noise($text) {
    $offset = 0;

    while (($pos = strpos($text, '___noise___', $offset)) !== false) {
        // marker (11 chars) plus its 3-character number
        $key = substr($text, $pos, 14);

        if (strlen($key) === 14 && isset($this->noise[$key])) {
            $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+14);
            // rescan from the same position: restored text may itself hold placeholders
            $offset = $pos;
        }
        else {
            $offset = $pos + 11;
        }
    }

    return $text;
}

// A document used as a string renders as the root's inner text.
public function __toString() {
    return $this->root->innertext();
}

// Virtual properties: outertext and innertext both render the root's inner
// text, plaintext its plain text; any other name yields null.
public function __get($name) {
    if ($name == 'outertext' || $name == 'innertext') {
        return $this->root->innertext();
    }
    if ($name == 'plaintext') {
        return $this->root->text();
    }
}

// camel naming conventions
public function childNodes($idx=-1) {
    return $this->root->childNodes($idx);
}

public function firstChild() {
    return $this->root->first_child();
}

public function lastChild() {
    return $this->root->last_child();
}

public function getElementById($id) {
    return $this->find("#$id", 0);
}

public function getElementsById($id, $idx=null) {
    return $this->find("#$id", $idx);
}

public function getElementByTagName($name) {
    return $this->find($name, 0);
}

// NOTE(review): the default $idx of -1 makes find() return only the last
// match, unlike the node-level method of the same name (default null, all
// matches) - confirm which one callers rely on before changing it.
public function getElementsByTagName($name, $idx=-1) {
    return $this->find($name, $idx);
}

public function loadFile() {
    $params = func_get_args();
    $this->load(call_user_func_array('file_get_contents', $params), true);
}

}

?>

采集耗时较长，需要先取消脚本的执行时间限制（set_time_limit(0) 表示不限时）：

set_time_limit(0);

然后引入文件

require_once 'simple_html_dom.php';

require_once 'common.php';

$dom = new simple_html_dom();

// Listing page URL submitted by the user.
$url = (isset($_POST['jdurl']) && is_string($_POST['jdurl'])) ? trim($_POST['jdurl']) : '';

// The value comes straight from the request and ends up in file_get_contents():
// accept http(s) URLs only, so local paths and other stream wrappers cannot be read.
if (!preg_match('#^https?://#i', $url)) {
    exit('invalid jdurl');
}

// load_file() takes the URL itself. The previous code first ran the URL string
// through str_get_html() and passed the resulting object on, which only
// worked because that object converts back to the same text.
$dom->load_file($url);

$plist = $dom->find('div[id=plist] li'); // all product <li> elements

// Walk every product <li> of the listing. NOTE(review): find(..., 0) returns
// null when nothing matches, and the calls below do not check for that.
foreach ($plist as $key => $value) {

$goods_id_node=$value->find('div[class=gl-i-wrap j-sku-item]',0);

$goods_id= $goods_id_node->getAttribute('data-sku');// data-sku holds the product id

$name_node=$value->find('div[class=p-name] em',0);// node holding the product name

$name=trim($name_node->plaintext);// product name

$href_node=$value->find('div[class=p-name] a',0);// node holding the product link

$href=$href_node->getAttribute('href');// product link

$data['market_price']=getPrice($goods_id);// price

使用common中的获取价格函数

/**
 * Fetch the price of a product from the price service.
 *
 * @param string|int $id product id (SKU)
 * @return mixed the "p" field of the first entry in the decoded response, or
 *               null when the request fails or the response does not have
 *               that shape
 */
function getPrice($id){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, 'http://p.3.cn/prices/mgets?skuIds=J_'.rawurlencode((string)$id));
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // without timeouts a stalled connection would block the whole crawl
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $body = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure
    if (!is_string($body) || $body === '') {
        return null;
    }

    $data = json_decode($body, true);
    if (!is_array($data) || !isset($data[0]['p'])) {
        return null;
    }

    return $data[0]['p'];
}

获取库存

$kuncunstr=getkucun("https://c0.3.cn/stocks?type=getstocks&skuIds=".$goods_id."&area=1_72_2799_0");

使用了获取库存函数

/**
 * Look up the stock state of a product.
 *
 * @param string $url stock service URL for the product
 * @return int 0 when the response contains the out-of-stock marker or the
 *             request failed, 10 (placeholder quantity) otherwise
 */
function getkucun($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): certificate verification is switched off, as before
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // without timeouts a stalled connection would block the whole crawl
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $data = curl_exec($ch);
    curl_close($ch);

    // A failed request used to fall through and report the item as in stock;
    // stock that cannot be confirmed is now reported as 0.
    if (!is_string($data) || $data === '') {
        return 0;
    }

    if (strpos($data, 'charset="utf8') === false) {
        $data = mb_convert_encoding($data, "UTF-8", "gbk");
    }

    // out-of-stock marker present?
    if (strpos($data, '无货') !== false) {
        return 0; // out of stock
    }

    return 10; // in stock: default quantity of 10
}

大概就这么个意思。。。

补充个下载图片本地化的方法

/**
 * Download a remote image and store it under images/spider/<Ymd>/.
 *
 * @param string $url  remote image URL
 * @param string $head prefix prepended to the saved file name
 * @return string|int|array relative path of the saved file on success;
 *         0 when the download or the write fails;
 *         array('file_name'=>'','save_path'=>'','error'=>5) when the target
 *         directory cannot be created
 */
function download($url,$head='thumb_')
{
    // compute the date once so both paths agree even across midnight
    $day = date('Ymd');
    $save_dir = '../../images/spider/'.$day.'/';
    $returnstr = 'images/spider/'.$day.'/';

    // create the target directory
    if (!file_exists($save_dir) && !mkdir($save_dir, 0777, true))
        return array('file_name'=>'','save_path'=>'','error'=>5);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accepts any certificate
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    $file = curl_exec($ch);
    curl_close($ch);

    // nothing downloaded: do not leave an empty file behind
    if (!is_string($file) || $file === '') {
        return 0;
    }

    // name the file after the URL path only, so a query string does not end
    // up in the file name; basename() also drops any directory part
    $path = parse_url($url, PHP_URL_PATH);
    $basename = basename(is_string($path) ? $path : $url);
    if ($basename === '') {
        return 0;
    }
    $filename = $head.$basename;

    // overwrite instead of append: appending corrupted an image that was
    // downloaded a second time on the same day
    if (file_put_contents($save_dir.$filename, $file) === false) {
        return 0;
    }

    return $returnstr.$filename;
}

获取商品详情的方法

/**
 * Fetch a product's detail HTML and localize its images.
 *
 * Images referenced through data-lazyload are downloaded and their URLs are
 * replaced by the local path; images referenced from CSS background rules are
 * downloaded and appended to the HTML.
 *
 * @param string $url detail service URL (expected to return JSON with a "content" field)
 * @return string the rewritten detail HTML, or '' when the request fails or
 *                the response has no "content"
 */
function getdetail($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $data = curl_exec($ch);
    curl_close($ch);

    if (!is_string($data) || $data === '') {
        return '';
    }

    if (strpos($data, 'charset="utf8') === false) {
        $data = mb_convert_encoding($data, "UTF-8", "gbk");
    }

    $data = json_decode($data, true);
    if (!is_array($data) || !isset($data['content'])) {
        return '';
    }

    $returnstr = $data['content'];
    $content = get_content_img($data['content']);      // lazy-loaded <img> URLs
    $content1 = get_content_css_img($data['content']); // CSS background URLs

    $preg = "/^http(s)?:\\/\\/.+/";

    foreach ($content as $src) {
        // scheme-relative URLs ("//host/...") get an explicit scheme
        $local = download(preg_match($preg, $src) ? $src : 'http:'.$src, 'content');

        // download() reports failure with 0 or an array; leave the URL untouched then
        if (is_string($local) && $local !== '') {
            $returnstr = str_replace($src, '/'.$local, $returnstr);
        }
    }

    foreach ($content1 as $src) {
        // url(...) values may be wrapped in quotes
        $src = trim($src, " \t'\"");
        // the scheme test now looks at the CSS URL itself (it used to test $content[$i])
        $local = download(preg_match($preg, $src) ? $src : 'http:'.$src, 'content');

        if (is_string($local) && $local !== '') {
            // NOTE(review): the markup around the path was lost from the original
            // listing; an <img> tag is assumed here - confirm the intended markup.
            $returnstr .= '<img src="/'.$local.'">';
        }
    }

    $returnstr = str_replace('data-lazyload','src', $returnstr);
    // single quotes become backslash + double quote
    $returnstr = str_replace('\'','\"', $returnstr);

    return $returnstr;
}

// Collect the url(...) arguments of CSS "background" declarations found in $str.
// Returns the captured URL texts in match order (an empty array when none).
function get_content_css_img($str)
{
    $hits = array();
    preg_match_all("/.*background[^;\"]+url\(([^\)]+)\).*/im", $str, $hits);

    $urls = $hits[1];

    return $urls;
}

// Collect the data-lazyload URLs of all <img> tags in $str.
// Returns the URL texts in document order (an empty array when none).
function get_content_img($str)
{
    // The leading "<img[^>" of the pattern had been stripped from the listing,
    // which made it match data-lazyload on any element.
    preg_match_all("/<img[^>]*data-lazyload=[\"]([^\"]+)[\"][^<>]*>/im", $str, $matches);

    return $matches[1];
}

自己练习使用

php 获取京东交易账号,PHP爬虫爬取京东列表相关推荐

  1. Python动态爬虫爬取京东商品评论

    Python 动态爬虫爬取京东商品评论 1. 概述 京东商城是Python爬虫初学者试手的经典平台,反爬虫程度较低,但评论采取了动态加载的方式,爬取京东商品评论是学习动态爬虫的一个极佳方法. 动态爬虫 ...

  2. 爬虫爬取京东商品详细数据 (品牌、售价、各类评论量(精确数量)、热评词及数量等)json解析部分数据

    文章目录 前言 一.数据保存格式设置及数据库准备(CentOS云mysql数据库) 1.分析数据需求(单一商品为例) 2.数据库保存格式 3.用到的数据库操作及指令 二.网页分析 1.分析网页源码,确 ...

  3. python爬虫爬取京东、淘宝、苏宁上华为P20购买评论

    爬虫爬取京东.淘宝.苏宁上华为P20购买评论 1.使用软件 Anaconda3 2.代码截图 三个网站代码大同小异,因此只展示一个 3.结果(部分) 京东 淘宝 苏宁 4.分析 这三个网站上的评论数据 ...

  4. python制作爬虫爬取京东商品评论教程

    python制作爬虫爬取京东商品评论教程 作者:蓝鲸 类型:转载 本文是继前2篇Python爬虫系列文章的后续篇,给大家介绍的是如何使用Python爬取京东商品评论信息的方法,并根据数据绘制成各种统计 ...

  5. python爬虫爬取京东商品评价_网络爬虫-爬取京东商品评价数据

    前段时间做商品评价的语义分析,需要大量的电商数据,于是乎就自己动手爬取京东的数据.第一次接触爬虫是使用selenium爬取CNKI的摘要,基于惯性思维的我仍然想用selenium+Firefox的方法 ...

  6. 用Python实现爬虫爬取京东笔记本电脑图片

    最近需要从网上搜集一些数据,于是花了几个小时把爬虫学了一下.其实我在两年前的一个简单的站内搜索引擎的实现已经涉及到了简单的爬虫,不过那时候老想着偷懒,加上山大新闻网的设计又很好爬,所以当时写得很low ...

  7. 利用python爬虫爬取京东商城商品图片

    笔者曾经用python第三方库requests来爬取京东商城的商品页内容,经过解析之后发现只爬到了商品页一半的图片.(这篇文章我们以爬取智能手机图片为例) 当鼠标没有向下滑时,此时查看源代码的话,就会 ...

  8. 超详细解析python爬虫爬取京东图片

    超详细图片爬虫实战 实例讲解(京东商城手机图片爬取) 1.创建一个文件夹来存放你爬取的图片 2.第一部分代码分析 3.第二部分代码分析 完整的代码如下所示: 升级版代码: 爬取过程中首先你需要观察在手 ...

  9. 网络爬虫-爬取京东商品评价数据

    前段时间做商品评价的语义分析,需要大量的电商数据,于是乎就自己动手爬取京东的数据.第一次接触爬虫是使用selenium爬取CNKI的摘要,基于惯性思维的我仍然想用selenium+Firefox的方法 ...

最新文章

  1. 辞旧迎新,总结2010,展望2011
  2. Boost:时间副本的测试程序
  3. 【邀请函】2021钉钉宜搭·线上沙龙,邀您云上相见!
  4. 获取cookie里面的值
  5. asp.net menu Operation aborted
  6. Java——容器(泛型)
  7. cadence 16.6中Z-copy的用法
  8. servlet监听器Listener介绍和使用
  9. 大华 解码器上大屏代码事例
  10. Javashop-B2B2C多店铺系统,Javashop B2C开源电商系统下载
  11. ubuntu彻底卸载Nvidia显卡驱动
  12. 前端之HTML表格s
  13. 拍牌(沪牌)软件,开源全部代码。有精力可以自己研究然后自用拍牌
  14. 【2022最新Java面试宝典】—— Memcache面试题(23道含答案)
  15. 猫哥教你写爬虫 049--完结撒花
  16. 这5个摸鱼神器太火了!程序员:知道了快删!
  17. python如何实现电子邮件的发送
  18. (18)ROS学习-TF坐标变换之静态坐标变换
  19. Labview | VDM与VAS2018下载
  20. GEE学习笔记:在Google Earth Engine(GEE)中导出shp文件

热门文章

  1. c语言凸包算法,基于C语言的凸包算法实现
  2. matlab项目实例教程,matlab简明实例教程.doc
  3. linux 中 timeval结构体
  4. reactor设计模式 1
  5. Linux两种定时器
  6. qt能使用logback_X04Logback的配置
  7. java二级缓存技术_Java二级缓存
  8. python正则判断_Python 正则表达式
  9. Sharepoin学习笔记—架构系列--04 Sharepoint的四种执行模型 1
  10. Iaas,Paas,Saas三者的区别联系是什么?