主要的程序(Spider:Lianjiachen 爬虫)

# Spider that crawls Beijing rental listings from bj.lianjia.com:
# district index -> paginated listing pages -> detail page -> agent phone API.
import json

import scrapy

from ..items import LianjiatestchenItem


class LianjiachenSpider(scrapy.Spider):
    name = 'Lianjiachen'
    # allowed_domains = ['bj.lianjia.com/zufang']
    start_urls = ['http://bj.lianjia.com/zufang/']

    def parse(self, response, **kwargs):
        """Extract every district link and request its listing index."""
        # District anchors; position()>1 skips the leading "all" filter entry.
        area_bj_list = response.xpath(
            '//*[@id="filter"]/ul[2]//li[position()>1]//a/@href').extract()
        for href in area_bj_list:
            real_url = "https://bj.lianjia.com" + href
            yield scrapy.Request(url=real_url, callback=self.parse_page_url,
                                 dont_filter=True)

    def parse_page_url(self, response):
        """Read the total page count and request every result page of a district."""
        max_page_lianjia = response.xpath(
            '//*[@id="content"]/div[1]/div[2]/@data-totalpage').extract()
        for page in range(1, int(max_page_lianjia[0]) + 1):
            # Lianjia paginates with a trailing "pgN" path segment.
            url = response.url + "pg" + str(page)
            yield scrapy.Request(url=url, callback=self.parse_message,
                                 dont_filter=True)

    # BUG FIX: this callback was originally misspelled "paese_message";
    # renamed — it is only referenced internally as a callback.
    def parse_message(self, response):
        """Parse one listing page, build an item per house, then follow its detail page."""
        all_div = response.xpath('//*[@id="content"]/div[1]/div[1]//div')
        for house in all_div:
            # Title text looks like "name type orientation", space-separated
            # — TODO confirm against live markup.
            name_type_orientation = house.xpath(
                ".//p[@class='content__list--item--title']/a/text()"
            ).extract()[0].strip().split(' ')
            name = name_type_orientation[0]
            house_type = name_type_orientation[1]
            orientation = name_type_orientation[2]

            des = ".//p[@class='content__list--item--des']"
            area = house.xpath(des + "/a/text()").extract()[0]
            street = house.xpath(des + "/a[2]/text()").extract()[0]
            concrete = house.xpath(des + "/a[3]/text()").extract()[0]
            lease = house.xpath(".//span/em/text()").extract()[0]

            # Feature tags, joined with '-' (placeholder string when absent).
            characteristic = house.xpath(".//p[3]//i/text()").extract()
            characteristic = '-'.join(characteristic) if characteristic else '空的'

            maintenance_time = house.xpath(".//p[4]/span[2]/text()").extract()[0]

            item = LianjiatestchenItem()
            item['name'] = name
            item['house_type'] = house_type
            item['orientation'] = orientation
            item['street'] = street
            item['area'] = area
            item['concrete'] = concrete
            item['lease'] = lease
            item['characteristic'] = characteristic
            item['maintenance_time'] = maintenance_time

            detail_url = house.xpath(
                ".//p[@class='content__list--item--title']/a/@href").extract()[0]
            detail_url = 'https://bj.lianjia.com' + detail_url
            yield scrapy.Request(url=detail_url, callback=self.detail_page,
                                 meta={'item': item})

    def detail_page(self, response):
        """Extract floor info from the detail page, then call the phone-number API."""
        item = response.meta['item']
        loupan = response.xpath(
            '//*[@id="aside"]/ul/li[3]/span[2]/text()').extract()[0]
        # Text after the first space is "properties/count" — TODO confirm format.
        loupan = loupan.split(' ')[1].split('/')
        item['floor_properties'] = loupan[0]
        item['floor_num'] = loupan[1]

        # The agent 400 number is served by a separate JSON endpoint; its
        # request payload comes from the page's data-agent attribute.
        phone_url = 'https://ex.lianjia.com/sdk/phone400'
        data_dict = response.xpath(
            '//*[@id="aside"]/div[2]/div[1]/@data-agent').extract()[0]
        data_dict = json.loads(data_dict)
        ucId = data_dict['ucId']
        digV = data_dict['digV']
        adId = json.loads(digV)['adId']  # digV is itself a JSON string
        data = {"adId": str(adId),
                "digV": str(digV),
                "hdicCityId": "110000",
                "mediumId": "100000032",
                "mobileType": "AGENT",
                "required400": "true",
                "ucId": str(ucId)}
        yield scrapy.Request(url=phone_url, callback=self.phone_num,
                             method='POST', body=json.dumps(data),
                             headers={"Content-Type": "application/json"},
                             meta={'item': item})

    def phone_num(self, response):
        """Parse the phone-400 API response and emit the finished item."""
        item = response.meta['item']
        phone = json.loads(response.text)
        item['phone_num'] = phone['data'][0]['phone400']
        yield item

管道(Pipeline:数据写入 MySQL)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class LianjiaSpiderPipeline:
    """Persist each scraped rental item into the MySQL table `linajia_table`."""

    # Item fields written to the DB, in column order. These are the fields
    # LianjiatestchenItem declares and the spider actually populates.
    FIELDS = ('name', 'house_type', 'orientation', 'street', 'area',
              'concrete', 'lease', 'characteristic', 'maintenance_time',
              'floor_properties', 'floor_num', 'phone_num')

    def __init__(self):
        self.conn_mysql()

    def conn_mysql(self):
        """Open the MySQL connection and cursor used by process_item."""
        # NOTE(review): credentials are hard-coded; they belong in settings.py.
        self.db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                  password='123456',
                                  database='test_chen_Lianjia', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        """Insert one item; roll back (and keep crawling) on any DB error.

        BUG FIX: the original read keys (pic, title, city_area, road_area, ...)
        that LianjiatestchenItem never declares and the spider never sets, so
        every item raised KeyError before reaching the database. The insert now
        uses the declared fields; the table columns must be created to match.
        """
        sql = """insert into linajia_table
                     (name, house_type, orientation, street, area, concrete,
                      lease, characteristic, maintenance_time,
                      floor_properties, floor_num, phone_num)
                 values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        values = tuple(item[f] for f in self.FIELDS)
        try:
            # Parameterized execute — values are escaped by the driver.
            self.cursor.execute(sql, values)
            self.db.commit()
        except Exception as e:
            print(e)
            # Roll back the failed statement so later inserts can proceed.
            self.db.rollback()
        return item

    def close_spider(self, spider):
        """Release DB resources when the spider finishes (original leaked them)."""
        self.cursor.close()
        self.db.close()

ITEM(Item 字段定义)

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class LianjiatestchenItem(scrapy.Item):
    """Fields for one Lianjia rental listing scraped by the spider."""

    name = scrapy.Field()              # listing title
    house_type = scrapy.Field()        # room layout
    orientation = scrapy.Field()       # facing direction
    street = scrapy.Field()            # street / neighbourhood
    area = scrapy.Field()              # city district
    concrete = scrapy.Field()          # concrete location (community)
    lease = scrapy.Field()             # rent price
    characteristic = scrapy.Field()    # feature tags, '-' separated
    maintenance_time = scrapy.Field()  # last-maintained date
    floor_properties = scrapy.Field()  # floor position (from detail page)
    floor_num = scrapy.Field()         # floor count (from detail page)
    phone_num = scrapy.Field()         # agent 400 phone number

1111

2021-05-26--CHEN scary相关推荐

  1. 京东活动+自动运行脚本+签到2021.05.26更新

    https://wws.lanzoux.com/iu5UIphzwba 地址不行的把x改成i 密码:fxfx 5.26更新 添加了618等5个活动. 5.05更新 更新了最新脚本 移除过期脚本 太多改 ...

  2. [2021.05.26]AudioTrack流程分析

    转载自: Android深入浅出之Audio 第一部分 AudioTrack分析_阿拉神农的博客-CSDN博客 UML顺序图: AudioTrack.svg https://download.csdn ...

  3. 2021.05.27 发表自己第一篇技术文章

    从昨晚开始做家庭财经系统.不要以为有系统两个字就怎么复杂,都是大家都会的.建库.建表. 第一个使用软件是mysql 2021.05.26 数据库 lianxi 已建好. 表 members_basic ...

  4. 前端面试题笔记 2021.8.26

    2021.8.26学习笔记 如果需要匹配包含文本的元素,用下面哪种方法来实现? A. text() B. contains() C. input() D. attr(name) 正确答案: B tex ...

  5. 市面上主流编辑器介绍(2021/05/20)

    市面上主流编辑器介绍(2021/05/20) 背景 Markdown是一种有用的轻量级标记语言,后续Markdown简写为md. 富文本编辑器(Rich Text Editor,RTE)是一种可内嵌于 ...

  6. DS SIMULIA CST STUDIO SUITE 2021.05 SP5

    CST Studio Suite 2021.05 - 发行说明 此补丁是推荐更新,其中包括以下更正和改进. 许可 CST Studio Suite Frontend 包括 CST Studio Sui ...

  7. 2021.05.05青蛙过河

    2021.05.05青蛙过河 (题目来源:https://leetcode-cn.com/problems/frog-jump/) 题目描述 一只青蛙想要过河. 假定河流被等分为若干个单元格,并且在每 ...

  8. 2021.05.20最少数量的箭引爆气球

    2021.05.20最少数量的箭引爆气球 题目描述 在二维空间中有许多球形的气球.对于每个气球,提供的输入是水平方向上,气球直径的开始和结束坐标.由于它是水平的,所以纵坐标并不重要,因此只要知道开始和 ...

  9. 2021.1.26课程摘要(逻辑教育-王劲胜)

    2021.1.26课程摘要 逻辑教育-13期-Python基础班-王劲胜 一.循环控制 二.列表 三.99乘法表 四.作业讲解 逻辑教育-13期-Python基础班-王劲胜 一.循环控制 1.brea ...

  10. 【离散数学】 SEU - 24 - 2021/05/28 - Algebraic System

    Discrete Mathematical Structures (6th Edition) 2021/05/28 - Algebraic System Algebraic System Binary ...

最新文章

  1. SDWebImage内部实现过程
  2. python3.8还是3.7_选择 Python3.6 还是 Python 3.7
  3. 如何自动填充网页表单_iCab for Mac(web网页浏览器)
  4. ubuntu中的fi语法_Shell脚本语法--if/then/elif/else/fi
  5. Spring+EhCache缓存实例(详细讲解+源码下载)
  6. 【Vue2.0学习】—Vuex工作原理图(二十五)
  7. qqzoneQQ空间漏洞扫描器的设计attilax总结
  8. linux下开源电子设计软件
  9. 大学计算机基础应用教程ppt,大学计算机基础教程.ppt
  10. 计算机手机共享上网,上网教程_电脑wifi怎么实现手机共享上网 - 驱动管家
  11. 给自己职业发展的建议
  12. proxmox 控制台无法连接_Proxmox VE 5.4中的NAT网络-重置网络接口后,VM的连接消失了...
  13. 新闻闲话:低龄儿童,如何快快乐乐学英语
  14. hive 关于用户留存率的计算
  15. qq邮箱隐藏代码html,QQ邮箱原来这么好用,4个隐藏设置格调满满
  16. 从内盘外盘的变化中看趋势
  17. 【Web技术】959- JavaScript 如何在线解压 ZIP 文件?
  18. OpenCV中的相机失真、内外参、不失真图像、相机校准
  19. 他他他!! 都进了大公司。。。
  20. autohotkey-运算符

热门文章

  1. Blue Coat 庖丁解牛 云安全势在必行
  2. 应广PFS122单片机比较器测供电电源VDD电压带临界点消抖处理
  3. 计算机老是重启进不了桌面,电脑无限重启进不桌面
  4. html 磁贴自动布局,也来“玩”MetroUI之磁贴(一)_html/css_WEB-ITnose
  5. MT6625平台SP_META软件测试工具下载
  6. 一线明星纷纷失业,数据告诉你今年的演员有多难
  7. 无法启动此程序因为计算机中丢失msvcp140d,msvcp140.dll文件丢失修复工具
  8. 研报解读 | 中国云计算:从数据看未来
  9. 360wifi驱动linux驱动安装,ubuntu安装360随身wifi驱动
  10. css 大于号 标签_css里大于号表示什么