python海贼王logo_Python 实现的下载op海贼王网的图片（网络爬虫）

没得事就爬一下我喜欢的海贼王上的图片

需要在d盘下建立一个imgcache文件夹

# -*- coding: utf-8 -*-

import urllib

import urllib2

import json

from bs4 import BeautifulSoup

import threadpool

import thread

class htmlpaser:

def __init__(self):

self.url='http://1.hzfans.sinaapp.com/process.php'

#POST数据到接口

def Post(self,postdata):

# headers = {

# 'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'

# }

# data = urllib.urlencode(postdata)

# req = urllib2.Request(self.url,data,headers)

# resp = urllib2.urlopen(req,None,20)

# html = resp.read()

# return html

data = urllib.urlencode(postdata)

req = urllib2.Request(url, data)

html= urllib2.urlopen(req).read()

print html

#获取html内容

def GetHtml(self,url):

headers = {

'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'

}

req = urllib2.Request(url,None,headers)

resp = urllib2.urlopen(req,None,5)

html = resp.read()

#return html.decode('utf8')

return html

def GetHtml2(self,url):

page = urllib.urlopen(url)

html = page.read()

page.close()

return html

def GetHtml3(self,url):

req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',

'Accept':'text/html;q=0.9,*/*;q=0.8',

'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',

'Accept-Encoding':'gzip',

'Connection':'close',

'Referer':None #注意如果依然不能抓取的话，这里可以设置抓取网站的host

}

req_timeout = 5

req = urllib2.Request(url,None,req_header)

resp = urllib2.urlopen(req,None,req_timeout)

html = resp.read()

return html

def GetList(self,html):

soup = BeautifulSoup(''.join(html))

baseitem=soup.find('ul',{'class':'list'})

slist=baseitem.select('li a')

return slist

def DownImg(self,imgurl):

path= r"d:/imgcache/"+self.gGetFileName(imgurl)

data = urllib.urlretrieve(imgurl,path)

return data

def gGetFileName(self,url):

if url==None: return None

if url=="" : return ""

arr=url.split("/")

return arr[len(arr)-1]

def mkdir(path):

import os

path=path.strip()

path=path.rstrip("\\")

# 判断路径是否存在

# 存在 True

# 不存在 False

isExists=os.path.exists(path)

# 判断结果

if not isExists:

# 如果不存在则创建目录

# 创建目录操作函数

os.makedirs(path)

return True

else:

# 如果目录存在则不创建，并提示目录已存在

return False

#返回两个值

def ParseContent(self,html):

soup = BeautifulSoup(''.join(html))

baseitem=soup.find('div',{'class':'showbox'})

title=soup.find('div',{'class':'msg'}).find('div',{'class':'m_left'}).get_text()

imglist=baseitem.find_all('img')

for img in imglist:

imgurl=img.get('src')

self.DownImg(imgurl)

content=baseitem.get_text().encode('utf8')

position=content.find('热点推荐')

return title,content[0:position]

def ParseItem(self,item):

url=item.get('href')

if url==None:

return

#print url+'\n'

html=obj.GetHtml2(url)

title,content=obj.ParseContent(html)

#print title+'\n'

return title

def print_result(request, result):

print str(request.requestID)+":"+result

obj=htmlpaser()

pool = threadpool.ThreadPool(10)

for i in range(1,40):

url="http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml"%(i)

html=obj.GetHtml2(url)

items=obj.GetList(html)

print 'add job %d\r' % (i)

requests = threadpool.makeRequests(obj.ParseItem, items, print_result)

[pool.putRequest(req) for req in requests]

pool.wait()

python海贼王logo_Python 实现的下载op海贼王网的图片（网络爬虫）相关推荐

python海贼王logo_Python实现的下载op海贼王网的图片
没得事就爬一下我喜欢的海贼王上的图片需要在d盘下建立一个imgcache文件夹 # -*- coding: utf-8 -*- import urllib import urllib2 import ...
Python实训day05pm【JS-DOM-获取元素节点对象、网络爬虫】
Python实训-15天-博客汇总表目录 1.CSS选择器 2.网络爬虫 2.1.练习1 2.2.练习2 1.CSS选择器 .rank-body .book-mid-info .author a:n ...
Python 实现的下载op海贼王网的图片（网络爬虫）
没得事就爬一下我喜欢的海贼王上的图片须要在d盘下建立一个imgcache目录 # -*- coding: utf-8 -*-import urllib import urllib2import js ...
python serial库文件下载_Pyserial python 串口驱动库pyserial - 下载 - 搜珍网
Pyserial/ Pyserial/pyserial-2.7.win32.exe Pyserial/pyserial-2.7.win32_py3k.exe Pyserial/pyserial-2.7 ...
python中data.find_all爬取网站为空列表_Python网络爬虫之Scrapy 框架-分布式【第二十九节】...
1. 介绍scrapy-redis框架 scrapy-redis 一个三方的基于redis的分布式爬虫框架,配合scrapy使用,让爬虫具有了分布式爬取的功能. github地址: https://g ...
python从零基础到项目实战当当_Python 3.x网络爬虫从零基础到项目实战
● 案例完整本书中的所有案例都是通过理论讲解环境搭建完整代码及分析运行结果这种完善的结构进行讲解的.此外,复杂的案例配有项目结构图,有难度的案例还分析了底层源码,并且对于所有案例的讲解,都 ...
Python实训day04pm【网络爬虫（文本、图片）】
Python实训-15天-博客汇总表目录 1.网络爬虫 1.1.爬取文本 1.2.爬取图片 2.其他知识点上午题目讲解昨天的基础题目讲解爬取非文本(图片) 1.网络爬虫 #bili 视频,爬下 ...
python爬虫抓取文本_Python实现可获取网易页面所有文本信息的网易网络爬虫功能示例...
本文实例讲述了Python实现可获取网易页面所有文本信息的网易网络爬虫功能.分享给大家供大家参考,具体如下: #coding=utf-8 #------------------------------ ...
Python 爬虫---（7） Python3网络爬虫快速入门实战解析
转载请注明作者和出处: http://blog.csdn.net/c406495762 Github代码获取:https://github.com/Jack-Cherish/python-spide ...

python海贼王logo_Python 实现的下载op海贼王网的图片（网络爬虫）

python海贼王logo_Python 实现的下载op海贼王网的图片（网络爬虫）相关推荐

最新文章

热门文章