蛋壳公寓信息/信息网爬取/小猪短租/豆瓣/拉钩/人民邮电报/百度电视剧/加载更多-获取字符串中间任意内容
调试
import requests
from lxml import etree

# Debug snippet: fetch page 1 of the Danke (danke.com) Beijing rental listings
# and extract the fields of the FIRST listing card only, to validate the XPath
# expressions before reusing them in the threaded scraper.
base_url = 'https://www.danke.com/room/bj?page=1'
# Desktop-browser User-Agent so the site serves the normal HTML page.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'+'Chrome/62.0.3202.94 Safari/537.36'}
res = requests.get(base_url,headers = headers)
html = res.text
# Re-bind `html` from the raw text to the parsed lxml element tree.
html = etree.HTML(html)
# One <div class="r_lbx"> per listing card.
ii_list = html.xpath('//div[@class="r_lbx"]')
# Listing title.
title = ii_list[0].xpath('.//div[@class="r_lbx_cena"]/a/text()')[0].strip()
# Text node [4] appears to hold the subway (地铁) info — TODO confirm against live markup.
di_tie = ii_list[0].xpath('.//div[@class="r_lbx_cena"]/text()')[4].strip()
details = ii_list[0].xpath('.//div[@class="r_lbx_cenb"]/text()')[1].strip()
# All feature tag spans (balcony etc.) joined with '-'.
yangtai = '-'.join(ii_list[0].xpath('.//div[@class="r_lbx_cenc"]/span/text()'))
# Second feature span on its own (presumably self-heating 自采暖 — verify).
zicainuan = ii_list[0].xpath('.//div[@class="r_lbx_cenc"]/span[2]/text()')[0].strip()
# Numeric price text plus the '元/月' (yuan/month) unit suffix.
price = ii_list[0].xpath('.//div[@class="r_lbx_moneya"]/span/text()')[0].strip() + '元/月'
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl


def get_one_page(url):
    """Download one listing page; return its HTML text, or None on a non-200
    status or any network failure."""
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Append one worksheet row per listing card found in the parsed page.

    `html` is an lxml element tree; `sheet` is the module-level openpyxl
    worksheet shared by the worker threads.
    """
    for ii in html.xpath('//div[@class="r_lbx"]'):
        try:
            title = ii.xpath('.//div[@class="r_lbx_cena"]/a/text()')[0].strip()
            di_tie = ii.xpath('.//div[@class="r_lbx_cena"]/text()')[4].strip()
            details = ii.xpath('.//div[@class="r_lbx_cenb"]/text()')[1].strip().replace("\n", "")
            details = details.replace(" ", "")
            yangtai = '-'.join(ii.xpath('.//div[@class="r_lbx_cenc"]/span/text()'))
            price = ii.xpath('.//div[@class="r_lbx_moneya"]/span/text()')[0].strip() + '元/月'
            sheet.append([title, di_tie, details, yangtai, price])
        except Exception:
            # Deliberate best-effort: a card missing any expected field is skipped.
            pass


def main(offset):
    """Worker entry point: fetch result page `offset` and parse it."""
    url = 'https://www.danke.com/room/bj?page={}'.format(offset)
    page = get_one_page(url)
    if page is None:
        # Fix: the original passed None straight into etree.HTML(), which
        # raises ValueError inside the worker and aborts the whole Pool.map().
        return
    parse_one_page(etree.HTML(page))


if __name__ == '__main__':
    wb = openpyxl.Workbook()  # workbook object
    sheet = wb.active  # active worksheet, shared by all workers
    # Header row.
    sheet.append(['title', 'di_tie', 'details', 'yangtai', 'price'])
    # Request headers.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/62.0.3202.94 Safari/537.36'}
    # multiprocessing.dummy.Pool is a THREAD pool, so workers share `sheet`
    # and `headers` directly.
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)
    p.map(main, range(1, 3))
    # Fix: shut the pool down before saving so the workbook on disk is complete
    # (the original saved first, then closed/joined).
    p.close()
    p.join()
    wb.save(r'C:\Users\Administrator\Desktop\info.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
中国贸易救济信息网
调试
import requests
import json
from urllib.parse import urlencode

# Debug snippet for the China Trade Remedy Information site (cacs.mofcom.gov.cn):
# probe the notice-list endpoint once before wiring it into a threaded scraper.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'+'Chrome/62.0.3202.94 Safari/537.36'}
base_url = 'http://www.cacs.mofcom.gov.cn/cacscms/list/notice/ssqdc?'
# Query parameters: page number plus a respondent filter (code 97).
formdata ={'pageNumber': 1,'condition': 'respondent','conditionValue': 97}
url = base_url + urlencode(formdata)
# POST verb, but all arguments travel in the query string.
response = requests.request("POST",url, headers = headers)
# The endpoint answers JSON; re-bind `response` to the decoded dict.
# NOTE(review): relies on the module-level `json` import earlier in this file.
response = json.loads(response.text)
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
from fake_useragent import UserAgent


def get_one_page(url):
    # Fetch one page of the notice list; return the decoded JSON dict, or
    # None on a non-200 status or a network error.
    # NOTE(review): relies on `json` being imported further up this file.
    try:
        res = requests.request("POST",url, headers = headers)
        if res.status_code == 200:
            return json.loads(res.text)
        return None
    except RequestException:
        return None


def parse_one_page(response):
    # Walk the 'rows' list of the JSON payload and append one worksheet row
    # per notice.
    # NOTE(review): raises TypeError when get_one_page() returned None —
    # confirm whether that crash is acceptable for failed pages.
    rows = response['rows']
    time.sleep(2)  # crude rate limiting
    for i in rows:
        try:
            title = i['TITLE']
            hangye = i['INDUSTRY_TYPE']  # industry
            date = i['GGTIME']           # announcement time
            state = i['CASE_STATE']
            sheet.append([title, hangye, date,state])
        except Exception:
            # Best-effort: skip rows missing any expected key.
            pass


def main(offset):
    # Worker entry point: build the URL for page `offset` and scrape it
    # (respondent filter code 227).
    base_url = 'http://www.cacs.mofcom.gov.cn/cacscms/list/notice/ssqdc?'
    formdata ={'pageNumber': offset,'condition': 'respondent','conditionValue': 227}
    base_url = base_url + urlencode(formdata)
    time.sleep(2)
    response = get_one_page(base_url)
    parse_one_page(response)


if __name__ == '__main__':
    wb = openpyxl.Workbook()  # workbook object
    sheet = wb.active  # active worksheet, shared by the worker threads
    # Header row (title / industry / date / state).
    sheet.append(['标题', '行业','日期','状态'])
    # Random desktop User-Agent for this run.
    headers = {'User-Agent':UserAgent(verify_ssl=False).random}
    # Thread pool (multiprocessing.dummy = threads, so globals are shared).
    print('多线程爬取开始')
    start_time=time.time()
    p = Pool(8)
    p.map(main,[i for i in range(1,21)])
    # Save location.
    wb.save(r'C:\Users\Administrator\Desktop\台湾.xlsx')
    # Shut the pool down; join() waits for the workers so the main thread does
    # not exit before they finish.
    end_time=time.time()
    print('多线程爬取结束')
    print('耗时:',end_time-start_time)
    p.close()
    p.join()
小猪短租
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
import time
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
from fake_useragent import UserAgent
wb = openpyxl.Workbook()  # workbook object
sheet = wb.active  # active worksheet
# Header row: comment text / timestamp.
sheet.append(['评论', '时间'])
# city = ['bj','sh','cd','cq']#
# Crawl the first 4 search-result pages of Xiaozhu short-term rentals
# ('cd' = Chengdu, hard-coded below).
for c in range(1,5):
    base_url = 'https://'+'cd'+'.xiaozhu.com/search-duanzufang-p'+str(c)+'-0/'
    # Logged-in session: the cookie below is account-specific and will expire.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36','cookie': 'distinctId=177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba; abtest_ABTest4SearchDate=b; xzucode=7cc36c9920e5ea4af320c54a2a91bb1f; xzucode4im=22484cf244628897eac55751ca877be6; xzSessId4H5=36fa5293fc9ca5c3cfed7c6e255d6c05; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22154772273197%22%2C%22first_id%22%3A%22177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22177bdc1228d6b9-036524b7051d1d-73e356b-1049088-177bdc1228e314%22%7D; xzuuid=2448739e; wttXMuWwbC=ffd787eba9db1d78d87caeed4143592ff9e9858d; ATNgmRNkrw=1613911104; Hm_lvt_92e8bc890f374994dd570aa15afc99e1=1613796288,1613907686,1613910153,1613911108; xzuinfo=%7B%22user_id%22%3A154772273197%2C%22user_name%22%3A%2215223820758%22%2C%22user_nickName%22%3A%22tanguancc%22%2C%22login_time%22%3A%222021-02-21+20%3A43%3A58%22%2C%22user_key%22%3A%22e7f28964b91b%22%7D; xztoken=WyI1ODA4MDI0MzIxS0RMdSIseyJ1c2VyaWQiOjE1NDc3MjI3MzE5NywiZXhwaXJlIjoxNjE1MDk3NzYxLCJjIjoid2ViIn0sIjk3OTlhMzU1MjBlYTc2ZWEyZmIzYjBhZjZhNzAwMzllIl0%3D; rule_math=xoj0qewbfr; Hm_lpvt_92e8bc890f374994dd570aa15afc99e1=1613911787','referer':'https://bj.xiaozhu.com/search-duanzufang-p1-0/'}
    res = requests.get(base_url,headers = headers)
    html = res.text
    html = etree.HTML(html)
    # One <li> per listing on the search page.
    ii_list = html.xpath('//ul[@class="pic_list clearfix list_code"]/li')
    for j in ii_list:
        # Detail-page URL and the review-count text of this listing.
        url = j.xpath('.//a[@class="resule_img_a"]/@href')[0]
        com_num = j.xpath('.//span[@class="commenthref"]/text()')[0].strip()
        # The LAST number in the text is the review count.
        num = re.findall(r'\d+', com_num)[-1]
        print(num)
        if int(num)<=10:
            # 10 or fewer reviews: they all fit on the detail page itself.
            headers = {'authority': 'bj.xiaozhu.com','accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9','accept-encoding': 'gzip, deflate, br','accept-language': 'zh-CN,zh;q=0.9','cache-control': 'max-age=0','cookie': 'distinctId=177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba; abtest_ABTest4SearchDate=b; xzucode=7cc36c9920e5ea4af320c54a2a91bb1f; xzucode4im=22484cf244628897eac55751ca877be6; xzSessId4H5=36fa5293fc9ca5c3cfed7c6e255d6c05; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22154772273197%22%2C%22first_id%22%3A%22177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22177bdc1228d6b9-036524b7051d1d-73e356b-1049088-177bdc1228e314%22%7D; xzuuid=2448739e; wttXMuWwbC=ffd787eba9db1d78d87caeed4143592ff9e9858d; ATNgmRNkrw=1613911104; Hm_lvt_92e8bc890f374994dd570aa15afc99e1=1613796288,1613907686,1613910153,1613911108; xzuinfo=%7B%22user_id%22%3A154772273197%2C%22user_name%22%3A%2215223820758%22%2C%22user_nickName%22%3A%22tanguancc%22%2C%22login_time%22%3A%222021-02-21+20%3A43%3A58%22%2C%22user_key%22%3A%22e7f28964b91b%22%7D; xztoken=WyI1ODA4MDI0MzIxS0RMdSIseyJ1c2VyaWQiOjE1NDc3MjI3MzE5NywiZXhwaXJlIjoxNjE1MDk3NzYxLCJjIjoid2ViIn0sIjk3OTlhMzU1MjBlYTc2ZWEyZmIzYjBhZjZhNzAwMzllIl0%3D; rule_math=xoj0qewbfr; Hm_lpvt_92e8bc890f374994dd570aa15afc99e1=1613911787','referer': 'https://bj.xiaozhu.com/','sec-fetch-dest': 'document','sec-fetch-mode': 'navigate','sec-fetch-site': 'same-origin','sec-fetch-user': '?1','user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'}
            res1 = requests.get(url,headers = headers)
            html_ = res1.text
            html_ = etree.HTML(html_)
            # One review box per comment.
            comment = html_.xpath('//div[@class="dp_box clearfix mt_10"]')
            for i in comment:
                # Skip boxes whose comment text is empty.
                if ''.join(i.xpath('.//div[@class="dp_con"]/text()')).strip() != '':
                    com = ''.join(i.xpath('.//div[@class="dp_con"]/text()')).strip()
                    print(com)
                    # NOTE(review): this rebinds `time`, shadowing the imported
                    # time module for the rest of the script.
                    time = i.xpath('.//div[@class="dp_con"]/h6/if/text()')[0]
                    print(time)
                    sheet.append([com, time])
                    print('*'*20)
        else:
            # More than 10 reviews: page through the AJAX comment endpoint.
            headers = {'cookie': 'distinctId=177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba; abtest_ABTest4SearchDate=b; xzucode=7cc36c9920e5ea4af320c54a2a91bb1f; xzucode4im=22484cf244628897eac55751ca877be6; xzSessId4H5=36fa5293fc9ca5c3cfed7c6e255d6c05; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22154772273197%22%2C%22first_id%22%3A%22177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22177bdc1228d6b9-036524b7051d1d-73e356b-1049088-177bdc1228e314%22%7D; xzuuid=2448739e; wttXMuWwbC=ffd787eba9db1d78d87caeed4143592ff9e9858d; ATNgmRNkrw=1613911104; Hm_lvt_92e8bc890f374994dd570aa15afc99e1=1613796288,1613907686,1613910153,1613911108; xzuinfo=%7B%22user_id%22%3A154772273197%2C%22user_name%22%3A%2215223820758%22%2C%22user_nickName%22%3A%22tanguancc%22%2C%22login_time%22%3A%222021-02-21+20%3A43%3A58%22%2C%22user_key%22%3A%22e7f28964b91b%22%7D; xztoken=WyI1ODA4MDI0MzIxS0RMdSIseyJ1c2VyaWQiOjE1NDc3MjI3MzE5NywiZXhwaXJlIjoxNjE1MDk3NzYxLCJjIjoid2ViIn0sIjk3OTlhMzU1MjBlYTc2ZWEyZmIzYjBhZjZhNzAwMzllIl0%3D; rule_math=xoj0qewbfr; Hm_lpvt_92e8bc890f374994dd570aa15afc99e1=1613911787','referer': url,'accept': 'text/html, */*; q=0.01','accept-encoding': 'gzip, deflate, br','accept-language': 'zh-CN,zh;q=0.9','sec-fetch-dest': 'empty','sec-fetch-mode': 'cors','sec-fetch-site': 'same-origin','user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36','x-requested-with': 'XMLHttpRequest','x-tingyun-id': 'uxh10gyAidI;r=895770757','xsrf-token': '18f0b12fc8ab58385d5619e298601b4c'}
            # 10 comments per AJAX page.
            for h in range(int(num)//10+1):
                # The FIRST number in the detail URL is the lodge id.
                number = re.findall(r'\d+', url)[0]
                url_ = 'https://bj.xiaozhu.com/ajaxRequest/Ajax_GetDetailComment?lodgeId='+number+'&cityDomain=undefined&p='+str(h)
                print(url_)
                res2 = requests.get(url_,headers = headers)
                _html = res2.text
                #print(_html)
                _html = etree.HTML(_html)
                comment = _html.xpath('//div[@class="dp_box clearfix mt_10"]')
                for k in comment:
                    if ''.join(k.xpath('.//div[@class="dp_con"]/text()')).strip() != '':
                        com = ''.join(k.xpath('.//div[@class="dp_con"]/text()')).strip()
                        print(com)
                        time = k.xpath('.//div[@class="dp_con"]/h6/if/text()')[0]
                        sheet.append([com, time])
                        print('--'*20)
# Save location (Desktop, 成都 = Chengdu).
wb.save(r'C:\Users\Administrator\Desktop\成都.xlsx')
豆瓣
https://mp.weixin.qq.com/s/xKkGdXxmduv0uxr-54ZFBg
import time
import requests
import proxy2808
from bs4 import BeautifulSoup

# 2808proxy account credentials (placeholders: 用户名 = username, 密码 = password).
USERNAME = '用户名'
PASSWORD = '密码'
headers = {'Cookie': '你的Cookie值','User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


def get_comments(page, proxy_url_secured):
    """Fetch one page of Douban reviews for movie 26797690 and append each
    review to comments_douban_l.csv."""
    # "Hot" comments.
    url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P'
    # Positive comments:
    # url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P&percent_type=h'
    # Neutral comments:
    # url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P&percent_type=m'
    # Negative comments:
    # url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P&percent_type=l'
    # Request goes through the 2808proxy endpoint.
    response = requests.get(url=url, headers=headers, proxies={'http': proxy_url_secured, 'https': proxy_url_secured})
    soup = BeautifulSoup(response.text, 'html.parser')
    for div in soup.find_all(class_='comment-item'):
        time.sleep(3)  # throttle per comment
        # Comment metadata container.
        comment_info = div.find(class_='comment-info')
        # Reviewer name.
        user_name = comment_info.find('a').get_text()
        print(user_name)
        # Reviewer profile URL.
        user_url = comment_info.find('a').attrs['href']
        print(user_url)
        # Registration date of the reviewer (useful to spot shill accounts).
        registered_time = get_user(user_url, proxy_url_secured)
        print(registered_time)
        # Star rating: digit sliced out of the span's CSS class name.
        score = comment_info.find_all('span')[1].attrs['class'][0][-2:-1]
        print(score)
        # Rating label text.
        eva = comment_info.find_all('span')[1].attrs['title']
        print(eva)
        # "Useful" vote count.
        useful_num = div.find(class_='votes').get_text()
        print(useful_num)
        # Review date (first token of the title attribute).
        date = comment_info.find(class_='comment-time ').attrs['title'].split(' ')[0]
        print(date)
        # Review time-of-day (second token).
        comment_time = comment_info.find(class_='comment-time ').attrs['title'].split(' ')[1]
        print(comment_time)
        # Review body, with newlines stripped and full-width commas/spaces normalised
        # so the CSV columns stay intact.
        comment = div.find(class_='short').get_text().replace('\n', '').strip().replace(',', ',').replace(' ', '')
        print(comment)
        # Append one CSV row per review.
        with open('comments_douban_l.csv', 'a', encoding='utf-8-sig') as f:
            f.write(user_name + ',' + user_url + ',' + registered_time + ',' + score + ',' + date + ',' + comment_time + ',' + useful_num + ',' + comment + '\n')
        f.close()  # redundant after the with-block; kept from the original


def get_user(user_url, proxy_url_secured):
    """Return the reviewer's registration date, or 'unknow' if it cannot be parsed."""
    # Request through the 2808proxy endpoint.
    response = requests.get(url=user_url, headers=headers, proxies={'http': proxy_url_secured, 'https': proxy_url_secured})
    soup = BeautifulSoup(response.text, 'html.parser')
    user_message = soup.find(class_='basic-info')
    try:
        user_registered = user_message.find(class_='pl')
        # Strip the '加入' ("joined") suffix from e.g. '2015-01-01加入'.
        registered_time = user_registered.get_text().split(' ')[1].replace('加入', '')
    except:
        registered_time = 'unknow'
    return registered_time


def main():
    """Crawl reviews 0..499 in pages of 20, acquiring a fresh proxy per page."""
    num = 0
    for i in range(0, 500, 20):
        cli = proxy2808.Client(username=USERNAME, password=PASSWORD)
        cli.release_all()
        p = cli.get_proxies(amount=1, expire_seconds=300)[0]
        # Authenticated proxy URL: http://user:pass@ip:port.
        proxy_url_secured = "%s://%s:%s@%s:%d" % ('http', USERNAME, PASSWORD, p['ip'], p['http_port_secured'])
        print(proxy_url_secured)
        get_comments(i, proxy_url_secured)
        num += 1


if __name__ == '__main__':
    main()
拉钩
https://zhuanlan.zhihu.com/p/96073413
拉勾网的反爬介绍和解决方法(更新时间:2019/2/20)
import requests
import time
import random
import pymongo

# MongoDB sink: database "spider", collection "lagou".
client = pymongo.MongoClient(host = "127.0.0.1",port=27017)
db = client["spider"]
collection = db["lagou"]


def get_cookie():
    # Lagou's positionAjax API rejects requests whose cookies did not come from
    # a real page view, so first GET the search page to obtain fresh session
    # cookies, then merge them into a recorded cookie template.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    # Request the normal search page.
    response = requests.get('https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',headers=headers)
    # Cookies handed out by the page view.
    r = requests.utils.dict_from_cookiejar(response.cookies)
    # Recorded cookie template (session-specific values; will go stale).
    cookies = {'X_MIDDLE_TOKEN': '797bc148d133274a162ba797a6875817','JSESSIONID': 'ABAAABAAAIAACBI03F33A375F98E05C5108D4D742A34114','_ga': 'GA1.2.1912257997.1548059451','_gat': '1','Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1548059451','user_trace_token': '20190121163050-dbd72da2-1d56-11e9-8927-525400f775ce','LGSID': '20190121163050-dbd72f67-1d56-11e9-8927-525400f775ce','PRE_UTM': '','PRE_HOST': '','PRE_SITE': '','PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F%3F_from_mid%3D1','LGUID': '20190121163050-dbd73128-1d56-11e9-8927-525400f775ce','_gid': 'GA1.2.1194828713.1548059451','index_location_city': '%E5%85%A8%E5%9B%BD','TG-TRACK-CODE': 'index_hotjob','LGRID': '20190121163142-fb0cc9c0-1d56-11e9-8928-525400f775ce','Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1548059503','SEARCH_ID': '86ed37f5d8da417dafb53aa25cd6fbc0',}
    cookies.update(r)  # overlay the fresh cookies from the page view
    return cookies


def crawl(city = "", pn = 1, cookies = None):
    # Query one page of the positionAjax JSON API for the given city/page and
    # return the decoded JSON response.
    headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_java?px=new&city=%E4%B8%8A%E6%B5%B7',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'X-Anit-Forge-Token': 'None',
    }
    params = (('px', 'default'),('city',city),('needAddtionalResult', 'false'),)
    # Keyword '数据分析' (data analysis); `first` must be "true" only on page 1.
    data = {"first":"true",'kd': '数据分析','pn': pn}
    if pn>1:
        data["first"] = "false"
    # Hit the JSON API.
    response = requests.post('https://www.lagou.com/jobs/positionAjax.json', headers=headers, params=params,cookies=cookies, data=data)
    return response.json()


city_list = ["北京","上海","深圳","广州","杭州","成都","南京","武汉","西安","厦门","长沙","苏州","天津"]
for city in city_list:
    print("*"*60)
    print("{city} start".format(city=city))
    # Up to 30 pages per city; refresh cookies every 5 pages.
    for i in range(1,31):
        if (i-1)%5==0:
            cookies = get_cookie()
        time.sleep(random.random()+random.randint(1,2))  # randomised throttle
        response_json = crawl(city=city,pn=i,cookies=cookies)
        try:
            position_list = response_json["content"][ 'positionResult']["result"]
        except:
            # NOTE(review): on failure this only prints; `position_list` may be
            # unbound (first page) or stale on the next line — confirm intended.
            print(response_json)
        if len(position_list)<1:
            # Empty page: this city is exhausted.
            print("{city} start".format(city=city))
            print("*"*60)
            break
        collection.insert_many(position_list)
        print(cookies)
    # NOTE(review): indentation of the two lines below reconstructed from the
    # collapsed source — they appear to close the per-city loop body.
    print("{city} end".format(city=city))
    print("*"*60)
人民邮电报
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
from fake_useragent import UserAgent
import os


def GetDesktopPath():
    """Return the path of the current user's Desktop directory."""
    home_dir = os.path.expanduser("~")
    return os.path.join(home_dir, 'Desktop')
# Output folder: <Desktop>\新闻 ("news") — Windows path separators assumed.
DesktopPath = GetDesktopPath()+'\\'
os.makedirs(DesktopPath+'新闻')


def get_one_page(url):
    # Download one front page of the paper; return its HTML text, or None on
    # a non-200 status or network error.
    try:
        res = requests.get(url,headers = headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    # Split the page into parallel lists: one <h3> (edition header) and one
    # <ul> (its article list) per section.
    h3_list = html.xpath('//div[@class="text"]/h3')
    ul_list = html.xpath('//div[@class="text"]/ul')
    return h3_list,ul_list


def saveTxt(h3_list,ul_list,offset):
    # Write one text file per issue day ('4月<offset>日.txt'), fetching each
    # linked article page for its author, date and full body text.
    filename = '4月'+str(offset)+'日.txt'
    with open(DesktopPath+'新闻'+'\\'+filename,'a',encoding='utf-8') as f:
        for index,h3 in enumerate(h3_list):
            try:
                # Edition/section header.
                edition = h3.xpath('./text()')[0].strip()
                f.write(edition)
                f.write('\r\n')
                li_list = ul_list[index].xpath('./li')
                for li in li_list:
                    # Article title and absolute link.
                    title = li.xpath('./a/text()')[0].strip()
                    link = 'http://paper.cnii.com.cn'+li.xpath('./a/@href')[0].strip()
                    f.write('标题:')   # "title:"
                    f.write(title)
                    f.write('\n')
                    f.write('链接')    # "link"
                    f.write(link)
                    f.write('\n')
                    # Second-level page: title block, date and body text.
                    # time.sleep(2)
                    res1 = requests.get(link,headers = headers)
                    html_a = res1.text
                    html_a = etree.HTML(html_a)
                    title_container = html_a.xpath('//div[@class="title-container"]')[0]
                    author = title_container.xpath('./span[@class="date"]/text()')[0].strip()
                    f.write('作者:')   # "author:"
                    f.write(author)
                    f.write('\t')
                    date = title_container.xpath('./p[@class="date"]/text()')[0].strip()
                    f.write('日期')    # "date"
                    f.write(date)
                    f.write('\n')
                    # Full article text, whitespace-cleaned.
                    content = ''.join(html_a.xpath('//div[@class="text"]//text()'))
                    content = content.replace('\n','').replace('\r','').replace(' ','').replace('\u3000','')
                    f.write('内容:')   # "content:"
                    f.write(content)
                    f.write('\n')
                    f.write('-'*50)
                    f.write('\n')
                    f.write('='*150)
                    f.write('\n')
            except Exception:
                # Best-effort: skip any section/article that fails to parse.
                pass
    f.close()  # redundant after the with-block; kept from the original


def main(offset):
    # Worker entry point: fetch the April-2021 front page for day `offset`,
    # parse it and dump its articles to a text file.
    base_url = 'http://paper.cnii.com.cn/item/rmydb_2021_4_{}_1.html'
    url = base_url.format(offset)
    html = etree.HTML(get_one_page(url))
    h3_list,ul_list = parse_one_page(html)
    saveTxt(h3_list,ul_list,offset)


if __name__ == '__main__':
    # Random desktop User-Agent for this run.
    headers = {'User-Agent':UserAgent(verify_ssl=False).random}
    # Thread pool over days 7..8.
    print('多线程爬取开始')
    start_time=time.time()
    p = Pool(8)
    p.map(main,[i for i in range(7,9)])
    # Shut the pool down; join() waits for the workers so the main thread does
    # not exit before they finish.
    end_time=time.time()
    print('多线程爬取结束')
    print('耗时:',end_time-start_time)
    p.close()
    p.join()
百度电影
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 20 09:05:41 2021@author: ABC
"""# 导入包
import pandas as pd
import time
import requests
import json
from fake_useragent import UserAgent
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl


def main(i):
    # Worker: fetch result page i (8 titles per page, pn = i*8) from Baidu's
    # TV-series vertical-search API and append every series name to the shared
    # worksheet. Any failure (network, JSON shape) skips the page.
    try:
        url = 'https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=28287&from_mid=1&&format=json&ie=utf-8&oe=utf-8&query=%E7%94%B5%E8%A7%86%E5%89%A7&sort_key=16&sort_type=1&stat0=&stat1=&stat2=&stat3=&pn={}&rn=8&cb=jQuery1102040638565016800876_1618883195864&_=1618883195868'.format(i*8)
        # time.sleep(1)
        r = requests.get(url, headers=headers)
        # Strip the JSONP wrapper "jQuery...( ... )" — the callback name must
        # match the cb= parameter hard-coded in the URL above.
        data = r.text.replace('jQuery1102040638565016800876_1618883195864(','')[:-1]
        json_data = json.loads(data)
        comment_list = json_data['data'][0]['result']
        for j in comment_list:
            TV_name = j['name']
            # print(TV_name)
            sheet.append([TV_name])
    except Exception:
        # Best-effort: failed pages are silently skipped.
        pass


if __name__ == '__main__':
    wb = openpyxl.Workbook()  # workbook object
    sheet = wb.active  # active worksheet, shared by the worker threads
    # Header row.
    sheet.append(['TV_name'])
    # Random desktop User-Agent for this run.
    headers = {'user-agent': UserAgent().random}
    # Thread pool over pages 5800..6299.
    print('多线程爬取开始')
    start_time=time.time()
    p = Pool(8)
    p.map(main,[i for i in range(5800,6300)])
    # Save location.
    wb.save('info8.xlsx')
    # Shut the pool down; join() waits for the workers so the main thread does
    # not exit before they finish.
    end_time=time.time()
    print('多线程爬取结束')
    print('耗时:',end_time-start_time)
    p.close()
    p.join()
加载更多-获取字符串中间任意内容
import requests
from lxml import etree
from urllib.parse import urlencode
# "Load more" endpoint of the NBD auto column; paging is driven by the id of
# the last article already shown.
base_url = 'http://auto.nbd.com.cn/columns/511?'
# The load-more endpoint requires the X-CSRF-Token and X-Requested-With headers
# (values are session-specific and will go stale).
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'+'Chrome/62.0.3202.94 Safari/537.36','Cookie': '_nbd_session_id=96e21e926782b634e4edd013d3e79c2e','X-CSRF-Token': 'r4LEtPuhDCDEbquEfD46gcKc4rOTAKDVnQ2XJ5k1e4A=',
'X-Requested-With': 'XMLHttpRequest'}
# Cursor: id/position of the last article already displayed.
formdata ={'last_article_pos': 8289}
url = base_url + urlencode(formdata)
res = requests.get(url,headers = headers)
# Response is a JavaScript snippet that .append()s an HTML fragment, not plain HTML.
html = res.text
# Extract the arbitrary content between two markers in a string.
def get_str_btw(s, f, b):
    """Return the substring of `s` strictly between the first occurrence of
    `f` and the next occurrence of `b` after it.

    Follows str.partition semantics: if `f` is missing the result is '',
    and if `b` is missing the result is everything after `f`.
    """
    after_front = s.partition(f)[2]
    # Fix: dropped the original's trailing `[:]` — a redundant full-slice
    # copy of an immutable str.
    return after_front.partition(b)[0]


# The "load more" reply is JavaScript of the form $(...).append('<li>...\n');
# pull out the HTML fragment between append(' and \n'); then strip the
# escaped newlines and spaces before parsing it.
st1r= get_str_btw(html, 'append(\'', '\\n\');').strip().replace('\\n','').replace(' ','')
html = etree.HTML(st1r)
蛋壳公寓信息/信息网爬取/小猪短租/豆瓣/拉钩/人民邮电报/百度电视剧/加载更多-获取字符串中间任意内容相关推荐
- 疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息
疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息 随着时间的流逝,在中国共产党的领导,全国人民的共同努力下,疫情逐渐受到了控制,逐渐好转,复工,开学有望.最近在和女朋友的闲聊当中得知 ...
- Python爬虫入门 | 5 爬取小猪短租租房信息
小猪短租是一个租房网站,上面有很多优质的民宿出租信息,下面我们以成都地区的租房信息为例,来尝试爬取这些数据. 小猪短租(成都)页面:http://cd.xiaozhu.com/ 1.爬取租房标题 ...
- python爬取网上租房信息_Python爬虫入门 | 5 爬取小猪短租租房信息
小猪短租是一个租房网站,上面有很多优质的民宿出租信息,下面我们以成都地区的租房信息为例,来尝试爬取这些数据. 1.爬取租房标题 按照惯例,先来爬下标题试试水,找到标题,复制xpath. 多复制几个房屋 ...
- python3通过Beautif和XPath分别爬取“小猪短租-北京”租房信息,并对比时间效率(附源代码)...
爬虫思路分析: 1. 观察小猪短租(北京)的网页 首页:http://www.xiaozhu.com/?utm_source=baidu&utm_medium=cpc&utm_term ...
- 使用BeautifulSoup爬取小猪短租的租房信息
直接上代码 没有添加间隔时间 几页之后就被封了 #!/user/bin/env python #-*- coding:utf-8 -*- from bs4 import BeautifulSoup i ...
- [python爬虫] BeautifulSoup设置Cookie解决网站拦截并爬取蚂蚁短租
我们在编写Python爬虫时,有时会遇到网站拒绝访问等反爬手段,比如这么我们想爬取蚂蚁短租数据,它则会提示"当前访问疑似黑客攻击,已被网站管理员设置为拦截"提示,如下图所示.此时我 ...
- python爬虫cookie池 与ip绑定_Python爬虫:设置Cookie解决网站拦截并爬取蚂蚁短租
前言 文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: Eastmount PS:如有需要Python学习资料的小伙伴可以加 ...
- Python爬虫:设置Cookie解决网站拦截并爬取蚂蚁短租
我们在编写Python爬虫时,有时会遇到网站拒绝访问等反爬手段,比如这么我们想爬取蚂蚁短租数据,它则会提示"当前访问疑似黑客攻击,已被网站管理员设置为拦截"提示,如下图所示.此时我 ...
- 用pyton爬取某短租网信息
import requests #用于向网站服务器发起请求 from bs4 import BeautifulSoup #用于处理服务反馈回来的网页文件 import pymongo #用于连接Mon ...
最新文章
- diou diou_nms代码分享
- Micropython教程之TPYBoardv102 DIY蓝牙智能小车实例
- TDSQL 全时态数据库系统--核心技术
- 【快报】基于K2 BPM的新一代协同办公门户实践交流会
- Navicat for MySQL连接MySQL数据库时各种错误解决
- 漫画:什么是JVM的垃圾回收?
- SVN修改用户名与密码
- 计算机英语讲课笔记(2020-6-13)
- libevent 例子,从简单到复杂
- DEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade
- STM32+FreeRtos 移植letter-shell工具
- 高效记忆/形象记忆(14)110数字编码表 81-90
- html编辑器pp,在线轻设计工具之H5
- mysql不小心删除root恢复
- js判断手机是否安装了某个APP,如果安装了就打开,没安装就下载
- layDate 时间范围限制 开始时间小于结束时间
- 博途SCL模板项目实例,SCL学习资料,SCL详细资料,SCL教程
- J2EE基础之map集合框架
- latex初学者的经验
- pdd实现主图详情图片一键下载
热门文章
- 信号与传输介质和计算机进制转换
- 哈理工计算机组成原理,哈尔滨理工大学计算机组成原理课程设计.pdf
- 声音发生器、pwm、占空比
- 阿里云配置SSH密钥连接
- ECshop文件结构说明
- 计算机会计信息系统的要素,【会计信息论文】计算机会计信息系统的内部控制制度(共3879字)...
- Java中的参数传递,到底是值传递还是引用传递?
- 两年聚37亿美元,“庞氏骗局” 维卡币负责人在美被捕
- java net unicode / native2ascii / url decode / url encode / UTF8 / js url code
- 深入RecyclerView(一)