Python

# 加载模块
import csv
import time
import codecs
import random
import requests
from bs4 import BeautifulSoup# 伪装报头
# Request headers that make the scraper look like a regular desktop browser.
# The User-Agent is assembled from adjacent string literals so that no stray
# backslash (left over from a collapsed line continuation) ends up in the value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.84 Safari/537.36'
    )
}
# Create the CSV file at the given path; rows are written to it later
# Output file handle; 'utf-8_sig' prepends a BOM to the UTF-8 output.
f = codecs.open('D://Spyder/WD/xiaozhu.csv', mode='w', encoding='utf-8_sig')
# CSV writer shared by the scraping functions below.
writer = csv.writer(f)
# Header row naming the six scraped fields.
writer.writerow(['title', 'address', 'price', 'img', 'name', 'gender'])
# Define judgement_gender: decide the host's gender
def judgement_gender(class_name):
    """Map the CSS classes of the host's avatar badge to a gender label.

    Args:
        class_name: The badge element's ``class`` attribute as produced by
            ``Tag.get('class')`` -- normally a list of class names. ``None``
            (attribute absent) and a plain whitespace-separated string are
            accepted as well.

    Returns:
        '女' (female) when ``member_ico1`` is one of the classes,
        otherwise '男' (male).
    """
    if isinstance(class_name, str):
        class_name = class_name.split()
    # Membership rather than exact list equality: an extra class on the
    # element must not flip the result, and a missing attribute must not raise.
    if 'member_ico1' in (class_name or ()):
        return '女'
    return '男'
# Define get_links: collect the links to the listing detail pages
def get_links(url):
    """Fetch one search-result page and scrape every listing it links to.

    Args:
        url: URL of a search-result page.

    Every anchor matched by ``#page_list ul li > a`` is followed by passing
    its ``href`` to ``get_info``. Nothing is returned.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the request times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    destination = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(destination.text, 'lxml')
    for link in soup.select('#page_list ul li > a'):
        href = link.get('href')
        # An anchor without an href has no detail page to visit.
        if href:
            get_info(href)
# Define get_info: scrape the detail page fields: title, address, price, host photo, nickname, gender
def get_info(url):
    """Scrape one listing detail page and append it to the CSV as one row.

    Args:
        url: URL of a listing detail page.

    The row written through the module-level ``writer`` is
    ``(title, address, price, img, name, gender)``. The address falls back to
    the string 'NA' when no ``span.pr5`` element exists.

    Raises:
        IndexError: if any other selector matches nothing, because the first
            match is taken unconditionally.
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the request times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    destination = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(destination.text, 'lxml')

    title = soup.select('div.pho_info h4 em')[0].text
    # Reuse the nodes already selected instead of querying the page twice.
    addr = soup.select('span.pr5')
    address = addr[0].text if addr else 'NA'
    price = soup.select('div.day_l span')[0].text
    img = soup.select('div.member_pic a img')[0].get('src')
    name = soup.select('div.w_240 h6 a')[0].text
    gender = judgement_gender(soup.select('div.member_pic div')[0].get('class'))

    writer.writerow((title, address, price, img, name, gender))
# Program entry point
if __name__ == '__main__':
    # Search-result pages 1 through 13.
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number)
            for number in range(1, 14)]
    try:
        for single_url in urls:
            get_links(single_url)
            # Random 1-5 second pause between result pages.
            time.sleep(random.randint(1, 5))
    finally:
        # Stop writing: close the file even when scraping raises, so the rows
        # collected so far are flushed to disk.
        f.close()

R

# 加载包
library(rvest)
library(stringr)# 定义judgeFunc，判断房东性别
# Map the class attribute of the host's avatar badge to a gender label.
#
# Args:
#   class_name: character vector of class attribute values; only the first
#     element is inspected.
# Returns:
#   '女' (female) when the first element equals 'member_ico1', otherwise
#   '男' (male).
judgeFunc <- function(class_name) {
  # isTRUE() turns a NULL, empty or NA input into FALSE, and indexing the
  # first element keeps the condition at length one, so `if` never fails.
  if (isTRUE(class_name[1] == 'member_ico1')) {
    '女'
  } else {
    '男'
  }
}
# Define GetlinkFunc: collect the links to the listing detail pages
# Collect the detail-page links found on each search-result page.
#
# Args:
#   url: character vector of search-result page URLs.
# Returns:
#   A list with one element per input URL, each a character vector of the
#   href values matched by '#page_list ul li > a'. unlist() flattens it into
#   a single vector in page order.
GetlinkFunc <- function(url) {
  # One pre-allocated slot per page: pages with different numbers of links
  # are stored as-is instead of being recycled into matrix rows by rbind().
  result <- vector('list', length(url))
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')
    result[[i]] <- destination %>%
      html_nodes('#page_list ul li > a') %>%
      html_attr('href')
    cat(sprintf('第【%d】页抓取成功', i), sep = '\n')
    # Random 1-5 second pause between result pages.
    Sys.sleep(runif(1, 1, 5))
  }
  result
}
# Define GetinfoFunc: scrape the detail page fields: title, address, price, host photo, nickname, gender
# Scrape title, address, price, host photo, nickname and gender from each
# listing detail page.
#
# Args:
#   url: character vector of listing detail-page URLs.
# Returns:
#   A data frame with one row per URL and the columns title, address, price,
#   img, name and gender. A field whose selector matches nothing is NA; when
#   a selector matches several nodes the first match is used. An empty `url`
#   gives an empty data frame.
GetinfoFunc <- function(url) {
  # First scraped value, or NA when the selector matched nothing, so every
  # field has length one and data.frame() cannot fail on mismatched lengths.
  first_or_na <- function(x) {
    if (length(x) == 0) NA_character_ else x[[1]]
  }

  rows <- vector('list', length(url))
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')

    title <- destination %>%
      html_nodes('div.pho_info h4 em') %>%
      html_text() %>%
      first_or_na()
    address <- destination %>%
      html_nodes('span.pr5') %>%
      html_text() %>%
      str_trim() %>%
      first_or_na()
    price <- destination %>%
      html_nodes('div.day_l span') %>%
      html_text() %>%
      first_or_na()
    img <- destination %>%
      html_nodes('div.member_pic a img') %>%
      html_attr('src') %>%
      first_or_na()
    name <- destination %>%
      html_nodes('div.w_240 h6 a') %>%
      html_text() %>%
      first_or_na()
    badge <- destination %>%
      html_nodes('div.member_pic div') %>%
      html_attr('class') %>%
      first_or_na()
    # Only classify when a badge class was actually found.
    gender <- if (is.na(badge)) NA_character_ else judgeFunc(badge)

    rows[[i]] <- data.frame(title, address, price, img, name, gender,
                            stringsAsFactors = FALSE)
    cat(sprintf('第【%d】条房屋链接抓取成功', i), sep = '\n')
  }

  if (length(rows) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing the data frame inside the loop.
  do.call(rbind, rows)
}
# Run the functions
# Build the URLs of search-result pages 1 to 13, collect the detail-page
# links from them, then scrape every detail page.
base    <- 'http://bj.xiaozhu.com/search-duanzufang-p'
url     <- paste0(base, 1:13, '-0/')
link    <- GetlinkFunc(url) %>% unlist()
xiaozhu <- GetinfoFunc(link)
# Export the CSV file
# write.csv() doubles embedded quote characters, as CSV readers expect;
# write.table() backslash-escapes them by default.
write.csv(xiaozhu, file = 'xiaozhu.csv', row.names = FALSE)

【CSS Selector】小猪网短租房（Python R）相关推荐

爬取小猪网的短租房信息
爬取小猪网的短租房信息的实现 #小猪网爬虫2.0 #功能:实现爬取多页面,并将图片和CSV文件存入桌面文件夹 from PIL import Image import requests from bs ...
Python疫起学习·万丈高楼平地起Day09（精简版|浓缩就是精华）爬虫知识附上案例爬取北京地区短租房信息、爬取酷狗TOP500的数据以及爬取网易云音乐热歌榜单
爬虫知识 Requests库部分运行结果如下: 有时爬虫需要加入请求头来伪装成浏览器,以便更好地抓取数据.在Chrome浏览器中按F12键打开Chrome开发者工具,刷新网页后找到User-Agen ...
Python 中 xpath 语法与 lxml 库解析 HTML/XML 和 CSS Selector
The lxml.etree Tutorial :https://lxml.de/tutorial.html python3 解析 xml:https://www.cnblogs.com/deadwo ...
Python爬取北京地区短租房信息
本文利用Requests和BeautifulSoup第三方库,爬取小猪短租网北京地区短租房的信息.代码参考《从零开始学Python网络爬虫》. 完整代码如下: from bs4 impor ...
Python爬虫实战 [成都短租房项目]
Python爬虫实战[成都短租房项目] 一.项目需求二.需求分析三.爬虫部分 3.1 获取原始报文 3.2 数据清洗(re+string方法) 3.3 数据清洗(BeautifulSoup方法) ...
python 爬取链家网北京租房信息
刚学习了python,中途遇到很多问题,查了很多资料,最关键的就是要善于调试,div信息一定不要找错,下面就是我爬取租房信息的代码和运行结果: 链家的房租网站两个导入的包 1.requests 用来 ...
链家网北京市租房数据分析（二）——基于python的数据可视化
本次分析的数据为爬取链家网租房首页的3000余条整租房源数据.数据量较小,分析结果难免存在偏差,本分析报告仅作为实战项目展示.本报告中所描述的平均租金指单套房源租金的中位数. 数据源可至百度网盘提取, ...
链家网北京市租房数据分析（一）——基于python的数据清洗
作为北漂中的一员,我们都明白,租房是不能回避的问题.租房被坑,也是难以避免的.多数人都有那么一段不堪回首的与黑中介面对面的往事.其实,就是贪图便宜. 便宜可以占,但是我们要理性地占便宜.要有全局观.大 ...
python二手房价格预测_Python爬取赶集网北京二手房数据R对爬取的二手房房价做线性回归分析...
前言:本文主要分为两部分:Python爬取赶集网北京二手房数据&R对爬取的二手房房价做线性回归分析.文章思路清晰,代码详细,特别适合刚刚接触Python&R的同学学习参考. Part1 ...

【CSS Selector】小猪网短租房（Python R）

Python

R

【CSS Selector】小猪网短租房（Python R）相关推荐

最新文章

热门文章