多线程----使用线程池爬取二手房信息

最开始采用单线程,但总共要爬取100页、共计6000多个html页面,效率特别低;于是临时学习了一下线程池,非常好用。

直接上代码:

import csv
import ssl
import threading
import urllib.request

import threadpool
from lxml import etree

# NOTE(security): this turns off HTTPS certificate verification for every
# urllib request made by the process.
ssl._create_default_https_context = ssl._create_unverified_context

# Cookie parameter sent with every request.
temp = "sale_history_6346474=%257B%2522caseType%2522%3A%2522%257B%24caseType%257D%2522%2C%2522name%2522%3A%2522%25E5%259F%25B9%25E6%25A3%25AE%25E5%25A4%25A7%25E5%258E%25A6%2522%2C%2522useage%2522%3A%2522%25E4%25BD%258F%25E5%25AE%2585%2522%2C%2522price%2522%3A%2522%253Cem%253E330%253C/em%253E%25E4%25B8%2587%2522%2C%2522area%2522%3A%2522%253Cem%253E65.86%253C/em%253E%25E3%258E%25A1%2522%2C%2522room%2522%3A%25222%2522%2C%2522url%2522%3A%2522https%3A//sz.haofang.net/ershoufang/6346474_1.html%2522%257D;"

# First page of the listing index; task() derives the other page URLs from it.
url_start = "https://sz.haofang.net/ershoufang"

# Output file that every worker appends to.
CSV_PATH = 'haofang_ershoufang.csv'

# Column titles, in the same order as the rows built by HaoFang._parse_house.
CSV_HEADER = [
    "title", "价格(元)", "单价(元/平米)", "电梯", "房屋户型", "建筑面积(㎡)",
    "房屋朝向", "装修情况", "小区", "开发商", "所在楼层", "产权年限",
    "物业类型", "物业费", "建成年限", "地址", "交通出行", "核心卖点",
    "首付预算", "经纪人", "服务区域", "电话",
]

# Several pool threads append to CSV_PATH; the lock keeps each row intact.
_csv_lock = threading.Lock()


def _no_spaces(text):
    """Return *text* with every space character removed."""
    return text.replace(' ', '')


def _no_spaces_or_crlf(text):
    """Return *text* with spaces and CRLF line breaks removed."""
    return text.replace(' ', '').replace('\r\n', '')


# XPath prefixes shared by the detail-page queries below.
_CON = '//div[@class="con"]'
_PRICE = _CON + '/div[@class="detail"]/div[@class="c-orange"]'
_MBOX = _CON + '/div[@class="m-box"]'
_BASIC_1 = _CON + '/div[@class="jichu-info"]/div[@class="jichu-info-1"][1]/dl'
_BASIC_2 = _CON + '/div[@class="jichu-info"]/div[@class="jichu-info-1"][2]/dl'
_INFO_BOX = '//div[@class="info-box"]'
_CALL_BOX = _INFO_BOX + '/div[@class="call-phone-box"]'

_LISTING_LINKS_XPATH = '//div[@class="info fr"]/div[@class="title"]/a/@href'
_TITLE_XPATH = ('//body[@class="house-detail"]/div[@class="wraper"][1]'
                '/div[@class="info"]/h1/text()')

# One entry per CSV column after the title:
# (XPath queries tried in order, value used when none match, optional cleaner).
_FIELDS = (
    # Total price
    ((_PRICE + '/div[@class="c-orange-price"]/text()',), "", None),
    # Unit price (yuan per square metre)
    ((_PRICE + '/div[@class="c-orange-unit"]/text()',), "", None),
    # Lift
    ((_MBOX + '/dl[2]/dd[3]/label/text()',), "", _no_spaces_or_crlf),
    # Layout
    ((_BASIC_1 + '/dd[1]/text()',), "", None),
    # Floor area
    ((_BASIC_1 + '/dd[2]/em/text()',), "", None),
    # Orientation
    ((_BASIC_1 + '/dd[3]/text()',), "", _no_spaces),
    # Decoration
    ((_BASIC_1 + '/dd[4]/text()',), "", _no_spaces),
    # Residential community
    ((_BASIC_1 + '/dd[5]/text()',), "", _no_spaces),
    # Developer
    ((_BASIC_1 + '/dd[6]/text()',), "", _no_spaces),
    # Floor
    ((_BASIC_2 + '/dd[1]/text()',), "", None),
    # Property-right term
    ((_MBOX + '/dl[1]/dd[3]/label/span/text()',
      _MBOX + '/dl[1]/dd[3]/label/text()'), "", None),
    # Property type
    ((_BASIC_2 + '/dd[3]/text()',), "", _no_spaces),
    # Property management fee
    ((_BASIC_2 + '/dd[4]/span[2]/text()',), "", None),
    # Year built
    ((_BASIC_2 + '/dd[5]/text()',), "", None),
    # Address
    ((_BASIC_2 + '/dd[6]/text()',), "", _no_spaces),
    # Transport
    ((_CON + '/dl/dd[@class="wAll"]/span[@class="jiaotong-chuxing"]/text()',),
     "", None),
    # Key selling points
    ((_CON + '/dl/dd[@class="wAll"]/span[@class="hexin-maidain"]/text()',),
     "", None),
    # Down-payment estimate
    (('//div[@class="build"]/div[@class="build-1"]/a[1]/text()',),
     "", _no_spaces_or_crlf),
    # Agent name
    ((_CALL_BOX + '/div[@class="call-phone-info"]/text()',
      _INFO_BOX + '/div[@class="info-star"]/p/text()',
      '//div[@class="owner"]/div[@class="owner-box"]'
      '/div[@class="owner-name"]/text()'), "", None),
    # Agent service area
    ((_CALL_BOX + '/div[@class="call-phone-info"]/span/text()',
      _INFO_BOX + '/div[@class="info-quyu"]/text()'), '个人房源', None),
    # Agent phone
    ((_CALL_BOX + '/div[@class="call-broker-phone"]/@data-phone',
      _INFO_BOX + '/div[@class="info-mobile"]/span/text()'), '无', None),
)


class HaoFang():
    """Scrape one listing-index page and append its houses to CSV_PATH.

    Args:
        url: URL of the listing-index page to crawl.
    """

    def __init__(self, url):
        self.url = url
        self.page_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Cookie": temp,
        }

    def _fetch(self, url):
        """Request *url* once and return ``(status, final_url, text)``.

        The body is decoded as UTF-8 with undecodable bytes dropped. Errors
        raised by urllib propagate to the caller.
        """
        req = urllib.request.Request(url, headers=self.page_headers)
        # A single request supplies body, status and URL, and the context
        # manager closes the connection.
        with urllib.request.urlopen(req) as resp:
            body = resp.read().decode('utf-8', 'ignore')
            return resp.getcode(), resp.geturl(), body

    def get_data(self):
        """Fetch ``self.url``.

        Returns:
            ``(status, final_url, text)`` on success, or ``(0, 0, 0)`` after
            printing the error when the request fails.
        """
        try:
            return self._fetch(self.url)
        except Exception as e:
            print(str(e))
            return 0, 0, 0

    @staticmethod
    def _first_match(html, xpaths, default=""):
        """Return the first result of the first query in *xpaths* that matches.

        *html* is any object with an ``xpath(query)`` method returning a list.
        *default* is returned when every query comes back empty.
        """
        for query in xpaths:
            found = html.xpath(query)
            if found:
                return found[0]
        return default

    @classmethod
    def _parse_house(cls, html):
        """Build one CSV row from a parsed detail page.

        Returns:
            A list of ``len(CSV_HEADER)`` strings, or ``None`` when the page
            does not contain exactly one title.
        """
        titles = html.xpath(_TITLE_XPATH)
        print(titles)
        if len(titles) != 1:
            return None
        row = [titles[0]]
        for xpaths, default, clean in _FIELDS:
            value = cls._first_match(html, xpaths, default)
            row.append(clean(value) if clean else value)
        return row

    @staticmethod
    def _append_row(row, path=CSV_PATH):
        """Append *row* to the CSV file at *path*.

        ``csv.writer`` quotes fields containing commas, quotes or line breaks.
        The module lock keeps rows from different threads from interleaving.
        """
        with _csv_lock:
            with open(path, 'a', encoding='utf-8', newline='') as fh:
                csv.writer(fh, lineterminator="\n").writerow(row)

    def get_house(self):
        """Crawl every house linked from ``self.url`` and record it.

        A detail page that cannot be fetched or parsed is reported and
        skipped, so the remaining houses on the page are still processed.

        Returns:
            The final URL reported by ``get_data`` (``0`` if the index page
            could not be fetched).
        """
        code, uri, data = self.get_data()
        if code == 200:
            listing = etree.HTML(data)
            house_url = listing.xpath(_LISTING_LINKS_XPATH)
            print(house_url)
            for x in house_url:
                print(x)
                try:
                    _, _, detail = self._fetch(x)
                    row = self._parse_house(etree.HTML(detail))
                except Exception as e:
                    # Best effort: one bad detail page must not stop the rest.
                    print(str(e))
                    continue
                if row is not None:
                    self._append_row(row)
        return uri


def task(m):
    """Crawl listing-index page number *m*; used as the thread-pool worker."""
    print("正在爬取第" + str(m) + "页...")
    # NOTE(review): the trailing "/" after ".html" is kept from the original;
    # confirm the site really expects it.
    url = url_start + "/p" + str(m) + ".html" + "/"
    HaoFang(url).get_house()
    print("第" + str(m) + "页爬取完成")


if __name__ == '__main__':
    try:
        # Start a fresh output file containing only the header row.
        with open(CSV_PATH, 'w', encoding='utf-8', newline='') as fh:
            csv.writer(fh, lineterminator="\n").writerow(CSV_HEADER)

        # Page 1 is the bare index URL and is crawled on the main thread.
        HaoFang(url_start).get_house()
        print('第一页爬取完成')

        # Pages 2..100 are spread over a pool of 10 worker threads.
        pool = threadpool.ThreadPool(10)
        for work in threadpool.makeRequests(task, list(range(2, 101))):
            pool.putRequest(work)
        pool.wait()
    except Exception as e:
        print(str(e))

多线程----使用线程池爬取二手房信息相关推荐

使用selenium,xpath,线程池爬取斗鱼主播信息
使用xpath,线程池爬取斗鱼主播信息: 主要爬取主播昵称,直播内容分类,房间名称,房间号以及人气,共爬取了大概110多页数据,大概15000条,保存在txt文本中, import timefrom ...
【python爬虫学习记录持续更新】多线程多进程，带线程池爬取实例
文章目录简介多线程codingFrame 多进程codingFrame 线程池与进程池线程池爬取实例(主页url隐了主要看思路和如何使用线程池框架) 简介进程是资源单位线程是执行单位每 ...
Python爬虫——使用线程池爬取同程旅行景点数据并做数据可视化
大家好!我是霖hero 正所谓:有朋自远方来,不亦乐乎?有朋友来找我们玩,是一件很快乐的事情,那么我们要尽地主之谊,好好带朋友去玩耍!那么问题来了,什么时候去哪里玩最好呢,哪里玩的地方最多呢? 今天将 ...
基于requests模块的cookie,session和线程池爬取
基于requests模块的cookie,session和线程池爬取有些时候,我们在使用爬虫程序去爬取一些用户相关信息的数据(爬取张三"人人网"个人主页数据)时,如果使用之前req ...
【每日爬虫】：利用线程池爬取2万张装修效果图
文章目录一.前言二.需求三.技术路线四.线程池爬取2万张装修效果图五.其他一.前言 2020-04-08日爬虫练习每日一个爬虫小练习,学习爬虫的记得关注哦! 学习编程就像学习骑自行车一样 ...
使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中
import requests import time, random, csv from fake_useragent import UserAgent from bs4 import Beauti ...
爬虫项目代理操作和线程池爬取
代理操作代理操作的目的一些网站会有相应的反爬虫措施,例如很多网站会检测某一段时间某个IP的访问次数,如果访问频率太快以至于看起来不像正常访客,它可能就会禁止这个IP的访问.所以我们需要设置一些代 ...
Scrapy爬取二手房信息+可视化数据分析
本篇介绍一个scrapy的实战爬虫项目,并对爬取信息进行简单的数据分析.目标是北京二手房信息,下面开始分析. 网页结构分析采用安居客网页信息作为二手房的信息来源,直接点击进入二手房信息的页面. 每页 ...
python爬取二手房信息_刚刚接触Python&R？教你爬取分析赶集网北京二手房数据（附详细代码）...
原标题:刚刚接触Python&R?教你爬取分析赶集网北京二手房数据(附详细代码) 源 /数据森麟文 /徐涛前言: 本文主要分为两部分:Python爬取赶集网北京二手房数据&R对爬取的 ...
使用线程池爬取PPT模版
声明: 全网最菜,只是记录分享,虚心接受点评任务爬取目标网站PPT模版目标地址:http://www.1ppt.com/ 分析网站结构通过观察,我们找到模版导航,获取每一个模版导航的URL,可以 ...

多线程----使用线程池爬取二手房信息

多线程----使用线程池爬取二手房信息相关推荐

最新文章

热门文章