python爬取网页信息，用正则达到快刀斩乱麻的效果

# -*- coding: utf-8 -*-
"""
Created on Fri Jun 14 17:37:44 2019

@author: User
"""

import re
import sys
from bs4 import BeautifulSoup       #beautifulsoup4库使用时是简写的bs4
import requests
import numpy as np
#import string

# HTTP headers sent with every request; the User-Agent is a desktop Chrome string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

# Number of listing pages requested so far; incremented by get_page().
i_pg = 0

# Record layout of one scraped row: ten fixed-width unicode string fields.
area_dtype = np.dtype([
    ('bianhao', np.str_, 50),
    ('quyu', np.str_, 30),
    ('riqi', np.str_, 30),
    ('nian', np.str_, 30),
    ('yue', np.str_, 30),
    ('ri', np.str_, 30),
    ('zhuangtai', np.str_, 30),
    ('mianji', np.str_, 30),
    ('jiage', np.str_, 30),
    ('yongtu', np.str_, 30),
])

# Accumulator for every scraped record; the first row holds the column titles.
array_area_all = np.array(
    [('title', 'quyu', 'riqi', 'nian', 'yue', 'ri', 'zhuangtai', 'mianji', 'jiage', 'yongtu')],
    dtype=area_dtype,
)


def _row_cell(row, tag, field_id):
    """Return the first ``tag`` in ``row`` whose id matches ``j_id46:<digits>:<field_id>``."""
    return row.find(tag, {'id': re.compile(r"j_id46:\d+:" + field_id)})


def get_page(url):
    """Scrape one listing page into ``array_area_all`` and recurse to the next page.

    Each call increments the global page counter ``i_pg`` and prints it.  Once
    the counter exceeds 50 the call raises ``SystemExit`` via ``sys.exit()``.
    For every element whose class matches ``rich-table-row`` the row's cells
    are read, ``get_page_detail`` is called for the row's detail link, and one
    record is stacked onto the global ``array_area_all``.  The record takes
    its area and usage from the module globals ``area`` and ``use_1``.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The page's decoded text on success, or the string ``'产生异常!'`` when
        the request or the parsing raised, or when the nested call for the
        next page stopped at the page cap.
    """
    # The per-row names stay module globals so the module-level side effects
    # are unchanged; within this block only i_pg and array_area_all are read
    # back across calls.
    global i_pg, array_area_all, area_new
    global title, quyu, chengjiaojia, chengjiao_date, nian, yue, ri, chengjiao_state

    i_pg += 1
    print('页：', str(i_pg))
    # Hard cap on the number of listing pages requested in one run.
    if i_pg > 50:
        sys.exit()

    try:
        response = requests.get(url, timeout=30, headers=header)
        # Raise an HTTPError for an error status code.
        response.raise_for_status()
        # Decode the body with the encoding detected from its content.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        result_li = soup.find_all(class_=re.compile("rich-table-row.*?"))

        # Process every listing row on the current page.
        for row_text in result_li:
            # Serial number
            title = _row_cell(row_text, 'td', 'j_id92').text.lstrip()

            # District
            quyu = _row_cell(row_text, 'td', 'j_id98').text.lstrip()

            # Transaction price: drop the first '万元' unit, then all blanks.
            price_text = _row_cell(row_text, 'td', 'j_id107').text.lstrip()
            chengjiaojia = my_strip(price_text.replace('万元', '', 1))
            if not chengjiaojia:
                chengjiaojia = '0'

            # Transaction date, sliced by position into year / month / day.
            chengjiao_date = _row_cell(row_text, 'td', 'j_id116').text.lstrip()
            nian = chengjiao_date[0:4]
            yue = chengjiao_date[5:7]
            ri = chengjiao_date[8:10]

            # Transaction state
            chengjiao_state = _row_cell(row_text, 'td', 'j_id119').text.lstrip()

            # The detail page provides the globals `area` and `use_1`.
            detail_href = _row_cell(row_text, 'a', 'j_id124')
            detail_url = 'http://****' + detail_href.attrs['href']
            get_page_detail(detail_url)

            # Build the new record and stack it onto the accumulator.
            area_new = np.array(
                [(title, quyu, chengjiao_date, nian, yue, ri,
                  chengjiao_state, area, chengjiaojia, use_1)],
                dtype=area_dtype,
            )
            array_area_all = np.vstack((array_area_all, area_new))

        # Next page: the URL is always non-empty, so the crawl only ends at
        # the page cap or on an error.
        # NOTE(review): the first recursive request uses firstResult=0, which
        # may repeat the rows of the entry page - confirm against the site.
        result_next_page = ('http://****?firstResult=' + str((i_pg - 1) * 20)
                            + '&priceUnit=TotalPrice&logic=and')
        try:
            get_page(result_next_page)
        except SystemExit:
            # The nested call hit the page cap.  Stop here instead of ending
            # the program so the caller can still save what was collected.
            return '产生异常!'
        return response.text
    except Exception as exc:
        # Report what went wrong instead of hiding it; KeyboardInterrupt is
        # no longer trapped here.
        print('产生异常!', repr(exc))
        return '产生异常!'


# Remove spaces, newlines and tabs from a string, then strip both ends.
def my_strip(s):
    """Return ``str(s)`` without any space, newline or tab, stripped at both ends."""
    cleaned = str(s)
    for unwanted in (" ", "\n", "\t"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.strip()


# BeautifulSoup is used frequently, so wrap it in a helper.
def my_Beautifulsoup(response):
    """Parse ``str(response)`` with the built-in ``html.parser`` and return the soup."""
    markup = str(response)
    return BeautifulSoup(markup, 'html.parser')


# Detail-page crawling
def get_page_detail(url):
    """Fetch one detail page and store its land area and usage in module globals.

    On an HTTP 200 response the second ``span.layout`` inside the elements with
    id ``j_id242`` and ``j_id267`` is read.  The global ``area`` receives the
    first text with the first '平方米' and all blanks removed ('0' when nothing
    is left); the global ``use_1`` receives the second text with blanks removed.
    On any other status both globals are reset to their empty defaults, so a
    failed detail page cannot leak the previous row's values into the next record.

    Args:
        url: Address of the detail page.

    Raises:
        requests.RequestException: If the request fails or times out.
        AttributeError, IndexError: If the expected elements are missing.
    """
    global area, use_1

    # Same 30 s timeout as the listing request, so a stalled server cannot
    # hang the crawl indefinitely.
    response = requests.get(url, timeout=30, headers=header)
    if response.status_code != 200:
        area = '0'
        use_1 = ''
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Parcel area
    area_all = soup.find(attrs={'id': 'j_id242'})
    area_2 = area_all.find_all('span', {'class': 'layout'})[1]
    area = my_strip(area_2.text.lstrip().replace('平方米', '', 1))
    if not area:
        area = '0'

    # Land use
    use_all = soup.find(attrs={'id': 'j_id267'})
    use_2 = use_all.find_all('span', {'class': 'layout'})[1]
    use_1 = my_strip(use_2.text.lstrip())


# =============================================================================
# Script entry: crawl from the first listing page, then dump all records.
get_page('http://****')
print(array_area_all)
# NOTE(review): array_area_all is 2-D after np.vstack, so each output line may
# be the tuple repr of a whole record - confirm the CSV layout is as intended.
np.savetxt('data\\zl.csv', array_area_all, delimiter=',',fmt="%s")

python爬取网页信息，用正则达到快刀斩乱麻的效果相关推荐

Python 爬取网页信息并保存到本地爬虫爬取网页第一步【简单易懂，注释超级全，代码可以直接运行】
Python 爬取网页信息并保存到本地[简单易懂,代码可以直接运行] 功能:给出一个关键词,根据关键词爬取程序,这是爬虫爬取网页的第一步步骤: 1.确定url 2.确定请求头 3.发送请求 4.写入 ...
python爬取网页信息
最近在学习python,发现通过python爬取网页信息确实方便,以前用C++写了个简单的爬虫,爬取指定网页的信息,代码随便一写都几百行,而要用python完成相同的工作,代码量相当少.前几天看到了一 ...
[python] 常用正则表达式爬取网页信息及分析HTML标签总结
这篇文章主要是介绍Python爬取网页信息时,经常使用的正则表达式及方法.它是一篇总结性文章,实用性比较大,主要解决自己遇到的爬虫问题,也希望对你有所帮助~ 当然如果会Selenium基于自动化测试爬 ...
python正则表达式爬取网页数据_常用正则表达式爬取网页信息及HTML分析总结
Python爬取网页信息时,经常使用的正则表达式及方法. 1.获取标签之间内容2.获取超链接之间内容3.获取URL最后一个参数命名图片或传递参数4.爬取网页中所有URL链接5.爬取网页标题titl ...
常用正则表达式爬取网页信息及分析HTML标签总结
这篇文章主要是介绍Python爬取网页信息时,经常使用的正则表达式及方法.它是一篇总结性文章,实用性比较大,主要解决自己遇到的爬虫问题,也希望对你有所帮助~ 当然如果会Selenium基于自动化测试爬 ...
常用正则表达式爬取网页信息及HTML分析总结
Python爬取网页信息时,经常使用的正则表达式及方法. 1.获取<tr></tr>标签之间内容 2.获取<a href..></a>超链接之间内容 3 ...
Python爬虫：Xpath爬取网页信息（附代码）
Python爬虫:Xpath爬取网页信息(附代码) 上一次分享了使用Python简单爬取网页信息的方法.但是仅仅对于单一网页的信息爬取一般无法满足我们的数据需求.对于一般的数据需求,我们通常需要从一个 ...
python爬取网页表格数据匹配,python爬虫——数据爬取和具体解析
标签:pattern div mat txt 保存关于 json result with open 关于正则表达式的更多用法,可参考链接:https://blog.c ...
Python爬取网页所有小说
Python爬取网页所有小说 python 2.7.15 练习beautifulsoup的使用不了解bs的可以先看一下这个bs文档一.看URL的规律因为是要爬取网页上所有的小说,所以不仅要获取网 ...

python爬取网页信息，用正则达到快刀斩乱麻的效果

python爬取网页信息，用正则达到快刀斩乱麻的效果相关推荐

最新文章

热门文章