调试

# Debug script: fetch one danke.com listing page and extract the fields of the
# first room entry, to validate the XPath selectors before the batch crawl.
import requests
from lxml import etree

base_url = 'https://www.danke.com/room/bj?page=1'
# NOTE(review): the concatenation yields "...Gecko)Chrome/..." with no space —
# kept byte-identical so the header sent upstream is unchanged.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/62.0.3202.94 Safari/537.36'}

res = requests.get(base_url, headers=headers)
html = res.text
html = etree.HTML(html)

# All selectors probe only the first listing on the page.
ii_list = html.xpath('//div[@class="r_lbx"]')
title = ii_list[0].xpath('.//div[@class="r_lbx_cena"]/a/text()')[0].strip()
di_tie = ii_list[0].xpath('.//div[@class="r_lbx_cena"]/text()')[4].strip()
details = ii_list[0].xpath('.//div[@class="r_lbx_cenb"]/text()')[1].strip()
yangtai = '-'.join(ii_list[0].xpath('.//div[@class="r_lbx_cenc"]/span/text()'))
zicainuan = ii_list[0].xpath('.//div[@class="r_lbx_cenc"]/span[2]/text()')[0].strip()
price = ii_list[0].xpath('.//div[@class="r_lbx_moneya"]/span/text()')[0].strip() + '元/月'
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl


def get_one_page(url):
    """Fetch one listing page; return its HTML text, or None on any failure."""
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract every room entry from a parsed page and append it to `sheet`.

    `html` is an lxml element tree; `sheet` is the module-level worksheet.
    """
    ii_list = html.xpath('//div[@class="r_lbx"]')
    for ii in ii_list:
        try:
            title = ii.xpath('.//div[@class="r_lbx_cena"]/a/text()')[0].strip()
            di_tie = ii.xpath('.//div[@class="r_lbx_cena"]/text()')[4].strip()
            details = ii.xpath('.//div[@class="r_lbx_cenb"]/text()')[1].strip().replace("\n", "")
            details = details.replace(" ", "")
            yangtai = '-'.join(ii.xpath('.//div[@class="r_lbx_cenc"]/span/text()'))
            price = ii.xpath('.//div[@class="r_lbx_moneya"]/span/text()')[0].strip() + '元/月'
            sheet.append([title, di_tie, details, yangtai, price])
        except Exception:
            # Best-effort scraping: skip entries whose markup doesn't match.
            pass


def main(offset):
    """Crawl and parse one page, identified by its page number `offset`."""
    base_url = 'https://www.danke.com/room/bj?page={}'
    url = base_url.format(offset)
    html = etree.HTML(get_one_page(url))
    parse_one_page(html)


if __name__ == '__main__':
    wb = openpyxl.Workbook()    # workbook object
    sheet = wb.active           # active worksheet
    # Column headers.
    sheet.append(['title', 'di_tie', 'details', 'yangtai', 'price'])
    # Request headers (UA concatenation kept byte-identical to the original).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/62.0.3202.94 Safari/537.36'}
    # Thread pool fan-out over page numbers.
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)
    p.map(main, [i for i in range(1, 3)])
    # Output location.
    wb.save(r'C:\Users\Administrator\Desktop\info.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
    p.close()
    p.join()  # wait for pool workers so the main thread doesn't exit early

中国贸易救济信息网

调试

# Debug script: POST one page of the trade-remedy notice list and decode the
# JSON response, to validate the endpoint and parameters before batch crawling.
import requests
import json
from urllib.parse import urlencode

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/62.0.3202.94 Safari/537.36'}
base_url = 'http://www.cacs.mofcom.gov.cn/cacscms/list/notice/ssqdc?'

formdata = {'pageNumber': 1, 'condition': 'respondent', 'conditionValue': 97}
url = base_url + urlencode(formdata)

response = requests.request("POST", url, headers=headers)
response = json.loads(response.text)
import time
import json                         # fix: json.loads was used without import
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
from urllib.parse import urlencode  # fix: urlencode was used without import
import openpyxl
from fake_useragent import UserAgent


def get_one_page(url):
    """POST the list endpoint; return the decoded JSON, or None on failure."""
    try:
        res = requests.request("POST", url, headers=headers)
        if res.status_code == 200:
            return json.loads(res.text)
        return None
    except RequestException:
        return None


def parse_one_page(response):
    """Append every notice row of one JSON page to the module-level `sheet`."""
    rows = response['rows']
    time.sleep(2)
    for i in rows:
        try:
            title = i['TITLE']
            hangye = i['INDUSTRY_TYPE']
            date = i['GGTIME']
            state = i['CASE_STATE']
            sheet.append([title, hangye, date, state])
        except Exception:
            # Skip rows missing any of the expected keys.
            pass


def main(offset):
    """Crawl and parse one page, identified by its page number `offset`."""
    base_url = 'http://www.cacs.mofcom.gov.cn/cacscms/list/notice/ssqdc?'
    formdata = {'pageNumber': offset, 'condition': 'respondent', 'conditionValue': 227}
    base_url = base_url + urlencode(formdata)
    time.sleep(2)
    response = get_one_page(base_url)
    parse_one_page(response)


if __name__ == '__main__':
    wb = openpyxl.Workbook()    # workbook object
    sheet = wb.active           # active worksheet
    # Column headers.
    sheet.append(['标题', '行业', '日期', '状态'])
    # Request headers with a randomized User-Agent.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    # Thread pool fan-out over page numbers.
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)
    p.map(main, [i for i in range(1, 21)])
    # Output location.
    wb.save(r'C:\Users\Administrator\Desktop\台湾.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
    p.close()
    p.join()  # wait for pool workers so the main thread doesn't exit early

小猪短租

# -*- coding: utf-8 -*-
# Scrape guest comments for Chengdu listings on xiaozhu.com.
# Listings with <=10 comments are read straight off the detail page; larger
# ones go through the paginated AJAX comment endpoint.
import requests
from lxml import etree
import re
import time
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
from fake_useragent import UserAgent

wb = openpyxl.Workbook()    # workbook object
sheet = wb.active           # active worksheet
# Column headers.
sheet.append(['评论', '时间'])
# city = ['bj','sh','cd','cq']#

# Session cookie and UA shared by all three request profiles (the original
# repeated this literal three times; the value is byte-identical).
COOKIE = 'distinctId=177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba; abtest_ABTest4SearchDate=b; xzucode=7cc36c9920e5ea4af320c54a2a91bb1f; xzucode4im=22484cf244628897eac55751ca877be6; xzSessId4H5=36fa5293fc9ca5c3cfed7c6e255d6c05; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22154772273197%22%2C%22first_id%22%3A%22177bdc122a3198-0fd1056ccdbe9c-73e356b-1049088-177bdc122a47ba%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22177bdc1228d6b9-036524b7051d1d-73e356b-1049088-177bdc1228e314%22%7D; xzuuid=2448739e; wttXMuWwbC=ffd787eba9db1d78d87caeed4143592ff9e9858d; ATNgmRNkrw=1613911104; Hm_lvt_92e8bc890f374994dd570aa15afc99e1=1613796288,1613907686,1613910153,1613911108; xzuinfo=%7B%22user_id%22%3A154772273197%2C%22user_name%22%3A%2215223820758%22%2C%22user_nickName%22%3A%22tanguancc%22%2C%22login_time%22%3A%222021-02-21+20%3A43%3A58%22%2C%22user_key%22%3A%22e7f28964b91b%22%7D; xztoken=WyI1ODA4MDI0MzIxS0RMdSIseyJ1c2VyaWQiOjE1NDc3MjI3MzE5NywiZXhwaXJlIjoxNjE1MDk3NzYxLCJjIjoid2ViIn0sIjk3OTlhMzU1MjBlYTc2ZWEyZmIzYjBhZjZhNzAwMzllIl0%3D; rule_math=xoj0qewbfr; Hm_lpvt_92e8bc890f374994dd570aa15afc99e1=1613911787'
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'

for c in range(1, 5):
    base_url = 'https://' + 'cd' + '.xiaozhu.com/search-duanzufang-p' + str(c) + '-0/'
    headers = {
        'user-agent': UA,
        'cookie': COOKIE,
        'referer': 'https://bj.xiaozhu.com/search-duanzufang-p1-0/',
    }
    res = requests.get(base_url, headers=headers)
    html = res.text
    html = etree.HTML(html)
    ii_list = html.xpath('//ul[@class="pic_list clearfix list_code"]/li')
    for j in ii_list:
        url = j.xpath('.//a[@class="resule_img_a"]/@href')[0]
        com_num = j.xpath('.//span[@class="commenthref"]/text()')[0].strip()
        # Last number in the span text is the comment count.
        num = re.findall(r'\d+', com_num)[-1]
        print(num)
        if int(num) <= 10:
            # Few comments: they all fit on the detail page itself.
            headers = {
                'authority': 'bj.xiaozhu.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'max-age=0',
                'cookie': COOKIE,
                'referer': 'https://bj.xiaozhu.com/',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'user-agent': UA,
            }
            res1 = requests.get(url, headers=headers)
            html_ = res1.text
            html_ = etree.HTML(html_)
            comment = html_.xpath('//div[@class="dp_box clearfix mt_10"]')
            for i in comment:
                if ''.join(i.xpath('.//div[@class="dp_con"]/text()')).strip() != '':
                    com = ''.join(i.xpath('.//div[@class="dp_con"]/text()')).strip()
                    print(com)
                    # fix: renamed from `time`, which shadowed the time module
                    cmt_time = i.xpath('.//div[@class="dp_con"]/h6/if/text()')[0]
                    print(cmt_time)
                    sheet.append([com, cmt_time])
                    print('*' * 20)
        else:
            # Many comments: page through the AJAX comment endpoint.
            headers = {
                'cookie': COOKIE,
                'referer': url,
                'accept': 'text/html, */*; q=0.01',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': UA,
                'x-requested-with': 'XMLHttpRequest',
                'x-tingyun-id': 'uxh10gyAidI;r=895770757',
                'xsrf-token': '18f0b12fc8ab58385d5619e298601b4c',
            }
            # 10 comments per AJAX page.
            for h in range(int(num) // 10 + 1):
                # Lodge id is the first number in the detail-page URL.
                number = re.findall(r'\d+', url)[0]
                url_ = 'https://bj.xiaozhu.com/ajaxRequest/Ajax_GetDetailComment?lodgeId=' + number + '&cityDomain=undefined&p=' + str(h)
                print(url_)
                res2 = requests.get(url_, headers=headers)
                _html = res2.text
                # print(_html)
                _html = etree.HTML(_html)
                comment = _html.xpath('//div[@class="dp_box clearfix mt_10"]')
                for k in comment:
                    if ''.join(k.xpath('.//div[@class="dp_con"]/text()')).strip() != '':
                        com = ''.join(k.xpath('.//div[@class="dp_con"]/text()')).strip()
                        print(com)
                        cmt_time = k.xpath('.//div[@class="dp_con"]/h6/if/text()')[0]
                        sheet.append([com, cmt_time])
                        print('--' * 20)
wb.save(r'C:\Users\Administrator\Desktop\成都.xlsx')

豆瓣

https://mp.weixin.qq.com/s/xKkGdXxmduv0uxr-54ZFBg

# Scrape Douban movie comments (subject 26797690) through 2808proxy, writing
# one CSV row per comment including the commenter's registration date.
import time
import requests
import proxy2808
from bs4 import BeautifulSoup

USERNAME = '用户名'
PASSWORD = '密码'
headers = {
    'Cookie': '你的Cookie值',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


def get_comments(page, proxy_url_secured):
    """Fetch one page of comments and append each one to the CSV file."""
    # Hot comments.
    url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P'
    # Positive comments:
    # url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P&percent_type=h'
    # Neutral comments:
    # url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P&percent_type=m'
    # Negative comments:
    # url = 'https://movie.douban.com/subject/26797690/comments?start=' + str(page) + '&limit=20&sort=new_score&status=P&percent_type=l'
    # Route through the 2808proxy endpoint.
    response = requests.get(url=url, headers=headers, proxies={'http': proxy_url_secured, 'https': proxy_url_secured})
    soup = BeautifulSoup(response.text, 'html.parser')
    for div in soup.find_all(class_='comment-item'):
        time.sleep(3)
        # Comment metadata container.
        comment_info = div.find(class_='comment-info')
        # Commenter name.
        user_name = comment_info.find('a').get_text()
        print(user_name)
        # Commenter profile URL.
        user_url = comment_info.find('a').attrs['href']
        print(user_url)
        # Registration date — useful for spotting sockpuppet accounts.
        registered_time = get_user(user_url, proxy_url_secured)
        print(registered_time)
        # Rating digit extracted from the CSS class name.
        score = comment_info.find_all('span')[1].attrs['class'][0][-2:-1]
        print(score)
        # Rating label.
        eva = comment_info.find_all('span')[1].attrs['title']
        print(eva)
        # Upvote count.
        useful_num = div.find(class_='votes').get_text()
        print(useful_num)
        # Comment date and time come from the same title attribute.
        date = comment_info.find(class_='comment-time ').attrs['title'].split(' ')[0]
        print(date)
        comment_time = comment_info.find(class_='comment-time ').attrs['title'].split(' ')[1]
        print(comment_time)
        # Comment body, normalized for CSV (ASCII commas replaced).
        comment = div.find(class_='short').get_text().replace('\n', '').strip().replace(',', ',').replace(' ', '')
        print(comment)
        # Append to the CSV file; `with` closes it (redundant f.close() removed).
        with open('comments_douban_l.csv', 'a', encoding='utf-8-sig') as f:
            f.write(user_name + ',' + user_url + ',' + registered_time + ',' + score + ',' + date + ',' + comment_time + ',' + useful_num + ',' + comment + '\n')


def get_user(user_url, proxy_url_secured):
    """Fetch a user's profile and return their registration date string."""
    # Route through the 2808proxy endpoint.
    response = requests.get(url=user_url, headers=headers, proxies={'http': proxy_url_secured, 'https': proxy_url_secured})
    soup = BeautifulSoup(response.text, 'html.parser')
    user_message = soup.find(class_='basic-info')
    try:
        user_registered = user_message.find(class_='pl')
        registered_time = user_registered.get_text().split('  ')[1].replace('加入', '')
    except Exception:  # fix: was a bare except
        registered_time = 'unknow'
    return registered_time


def main():
    """Walk comment pages 0..480 in steps of 20, with a fresh proxy per page."""
    num = 0
    for i in range(0, 500, 20):
        cli = proxy2808.Client(username=USERNAME, password=PASSWORD)
        cli.release_all()
        p = cli.get_proxies(amount=1, expire_seconds=300)[0]
        proxy_url_secured = "%s://%s:%s@%s:%d" % ('http', USERNAME, PASSWORD, p['ip'], p['http_port_secured'])
        print(proxy_url_secured)
        get_comments(i, proxy_url_secured)
        num += 1


if __name__ == '__main__':
    main()

拉钩
https://zhuanlan.zhihu.com/p/96073413

拉勾网的反爬介绍和解决方法(更新时间:2019/2/20)

# Scrape lagou.com job postings ("数据分析") for a list of cities into MongoDB.
# The position API requires cookies obtained from the search landing page.
import requests
import time
import random
import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
db = client["spider"]
collection = db["lagou"]


def get_cookie():
    """Hit the search landing page and return cookies valid for the job API."""
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    # Request the original page to receive fresh session cookies.
    response = requests.get('https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=', headers=headers)
    r = requests.utils.dict_from_cookiejar(response.cookies)  # fresh cookies
    # Static cookie template captured from a browser session.
    cookies = {
        'X_MIDDLE_TOKEN': '797bc148d133274a162ba797a6875817',
        'JSESSIONID': 'ABAAABAAAIAACBI03F33A375F98E05C5108D4D742A34114',
        '_ga': 'GA1.2.1912257997.1548059451',
        '_gat': '1',
        'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1548059451',
        'user_trace_token': '20190121163050-dbd72da2-1d56-11e9-8927-525400f775ce',
        'LGSID': '20190121163050-dbd72f67-1d56-11e9-8927-525400f775ce',
        'PRE_UTM': '',
        'PRE_HOST': '',
        'PRE_SITE': '',
        'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F%3F_from_mid%3D1',
        'LGUID': '20190121163050-dbd73128-1d56-11e9-8927-525400f775ce',
        '_gid': 'GA1.2.1194828713.1548059451',
        'index_location_city': '%E5%85%A8%E5%9B%BD',
        'TG-TRACK-CODE': 'index_hotjob',
        'LGRID': '20190121163142-fb0cc9c0-1d56-11e9-8928-525400f775ce',
        'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1548059503',
        'SEARCH_ID': '86ed37f5d8da417dafb53aa25cd6fbc0',
    }
    cookies.update(r)  # overlay the fresh values onto the template
    return cookies


def crawl(city="", pn=1, cookies=None):
    """POST one page of the position API and return the decoded JSON."""
    headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_java?px=new&city=%E4%B8%8A%E6%B5%B7',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'X-Anit-Forge-Token': 'None',
    }
    params = (
        ('px', 'default'),
        ('city', city),
        ('needAddtionalResult', 'false'),
    )
    data = {"first": "true", 'kd': '数据分析', 'pn': pn}
    if pn > 1:
        data["first"] = "false"
    response = requests.post('https://www.lagou.com/jobs/positionAjax.json', headers=headers, params=params, cookies=cookies, data=data)
    return response.json()


city_list = ["北京", "上海", "深圳", "广州", "杭州", "成都", "南京", "武汉", "西安", "厦门", "长沙", "苏州", "天津"]
for city in city_list:
    print("*" * 60)
    print("{city} start".format(city=city))
    for i in range(1, 31):
        # Refresh cookies every 5 pages.
        if (i - 1) % 5 == 0:
            cookies = get_cookie()
        time.sleep(random.random() + random.randint(1, 2))
        response_json = crawl(city=city, pn=i, cookies=cookies)
        try:
            position_list = response_json["content"]['positionResult']["result"]
        except Exception:
            print(response_json)
            # fix: without this, position_list below would be unbound (or stale)
            continue
        if len(position_list) < 1:
            print("{city} start".format(city=city))
            print("*" * 60)
            break
        collection.insert_many(position_list)
        print(cookies)
    print("{city} end".format(city=city))
    print("*" * 60)

人民邮电报

# Scrape 人民邮电报 (paper.cnii.com.cn) daily editions: for each day, walk the
# section index, follow each article link, and write everything to a txt file
# in a "新闻" folder on the desktop.
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
from fake_useragent import UserAgent
import os


def GetDesktopPath():
    """Return the current user's desktop directory."""
    return os.path.join(os.path.expanduser("~"), 'Desktop')


DesktopPath = GetDesktopPath() + '\\'
# fix: exist_ok so a re-run doesn't crash with FileExistsError
os.makedirs(DesktopPath + '新闻', exist_ok=True)


def get_one_page(url):
    """Fetch one edition index page; return its HTML text or None on failure."""
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Return the (section headings, article lists) pairs of an index page."""
    h3_list = html.xpath('//div[@class="text"]/h3')
    ul_list = html.xpath('//div[@class="text"]/ul')
    return h3_list, ul_list


def saveTxt(h3_list, ul_list, offset):
    """Write every article of one day's edition to 4月<offset>日.txt."""
    filename = '4月' + str(offset) + '日.txt'
    with open(DesktopPath + '新闻' + '\\' + filename, 'a', encoding='utf-8') as f:
        for index, h3 in enumerate(h3_list):
            try:
                edition = h3.xpath('./text()')[0].strip()
                f.write(edition)
                f.write('\r\n')
                li_list = ul_list[index].xpath('./li')
                for li in li_list:
                    title = li.xpath('./a/text()')[0].strip()
                    link = 'http://paper.cnii.com.cn' + li.xpath('./a/@href')[0].strip()
                    f.write('标题:')
                    f.write(title)
                    f.write('\n')
                    f.write('链接')
                    f.write(link)
                    f.write('\n')
                    # Second-level page: title block, date, full article text.
                    # time.sleep(2)
                    res1 = requests.get(link, headers=headers)
                    html_a = res1.text
                    html_a = etree.HTML(html_a)
                    title_container = html_a.xpath('//div[@class="title-container"]')[0]
                    author = title_container.xpath('./span[@class="date"]/text()')[0].strip()
                    f.write('作者:')
                    f.write(author)
                    f.write('\t')
                    date = title_container.xpath('./p[@class="date"]/text()')[0].strip()
                    f.write('日期')
                    f.write(date)
                    f.write('\n')
                    content = ''.join(html_a.xpath('//div[@class="text"]//text()'))
                    # Strip layout whitespace from the article body.
                    content = content.replace('\n', '').replace('\r', '').replace(' ', '').replace('\u3000', '')
                    f.write('内容:')
                    f.write(content)
                    f.write('\n')
                    f.write('-' * 50)
                    f.write('\n')
                    f.write('=' * 150)
                    f.write('\n')
            except Exception:
                # Best-effort: skip sections/articles whose markup doesn't match.
                pass
    # `with` closes the file; the original's redundant f.close() was removed.


def main(offset):
    """Crawl one day's edition, identified by its day-of-month `offset`."""
    base_url = 'http://paper.cnii.com.cn/item/rmydb_2021_4_{}_1.html'
    url = base_url.format(offset)
    html = etree.HTML(get_one_page(url))
    h3_list, ul_list = parse_one_page(html)
    saveTxt(h3_list, ul_list, offset)


if __name__ == '__main__':
    # Request headers with a randomized User-Agent.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    # Thread pool fan-out over days.
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)
    p.map(main, [i for i in range(7, 9)])
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
    p.close()
    p.join()  # wait for pool workers so the main thread doesn't exit early

百度电影

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 20 09:05:41 2021

@author: ABC

Scrape TV-series names from Baidu's JSONP resource API (resource_id=28287)
into an Excel workbook, 8 entries per request.
"""
import pandas as pd
import time
import requests
import json
from fake_useragent import UserAgent
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl


def main(i):
    """Fetch page `i` (offset i*8) of the API and append names to `sheet`."""
    try:
        url = 'https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=28287&from_mid=1&&format=json&ie=utf-8&oe=utf-8&query=%E7%94%B5%E8%A7%86%E5%89%A7&sort_key=16&sort_type=1&stat0=&stat1=&stat2=&stat3=&pn={}&rn=8&cb=jQuery1102040638565016800876_1618883195864&_=1618883195868'.format(i * 8)
        # time.sleep(1)
        r = requests.get(url, headers=headers)
        # Strip the JSONP callback wrapper to leave bare JSON.
        data = r.text.replace('jQuery1102040638565016800876_1618883195864(', '')[:-1]
        json_data = json.loads(data)
        comment_list = json_data['data'][0]['result']
        for j in comment_list:
            TV_name = j['name']
            # print(TV_name)
            sheet.append([TV_name])
    except Exception:
        # Best-effort: skip pages that fail to fetch or decode.
        pass


if __name__ == '__main__':
    wb = openpyxl.Workbook()    # workbook object
    sheet = wb.active           # active worksheet
    # Column header.
    sheet.append(['TV_name'])
    # Request headers with a randomized User-Agent.
    headers = {'user-agent': UserAgent().random}
    # Thread pool fan-out over page indices.
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)
    p.map(main, [i for i in range(5800, 6300)])
    # Output location.
    wb.save('info8.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
    p.close()
    p.join()  # wait for pool workers so the main thread doesn't exit early

加载更多-获取字符串中间任意内容

# Fetch the "load more" AJAX fragment of an nbd.com.cn column and carve the
# HTML payload out of the JavaScript append(...) wrapper it arrives in.
import requests
from lxml import etree
from urllib.parse import urlencode

base_url = 'http://auto.nbd.com.cn/columns/511?'
# The "load more" endpoint requires X-CSRF-Token and X-Requested-With headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/62.0.3202.94 Safari/537.36',
    'Cookie': '_nbd_session_id=96e21e926782b634e4edd013d3e79c2e',
    'X-CSRF-Token': 'r4LEtPuhDCDEbquEfD46gcKc4rOTAKDVnQ2XJ5k1e4A=',
    'X-Requested-With': 'XMLHttpRequest'
}
formdata = {'last_article_pos': 8289}
url = base_url + urlencode(formdata)

res = requests.get(url, headers=headers)
html = res.text


def get_str_btw(s, f, b):
    """Return the substring of `s` between the first `f` and the following `b`.

    Empty string when `f` is absent or nothing precedes `b` after it.
    """
    par = s.partition(f)
    return (par[2].partition(b))[0][:]


# Extract the HTML between "append('" and "\n');", then strip escapes/padding.
st1r = get_str_btw(html, 'append(\'', '\\n\');').strip().replace('\\n', '').replace('                ', '')
html = etree.HTML(st1r)

蛋壳公寓信息/信息网爬取/小猪短租/豆瓣/拉钩/人民邮电报/百度电视剧/加载更多-获取字符串中间任意内容相关推荐

  1. 疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息

    疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息 随着时间的流逝,在中国共产党的领导,全国人民的共同努力下,疫情逐渐受到了控制,逐渐好转,复工,开学有望.最近在和女朋友的闲聊当中得知 ...

  2. Python爬虫入门 | 5 爬取小猪短租租房信息

    小猪短租是一个租房网站,上面有很多优质的民宿出租信息,下面我们以成都地区的租房信息为例,来尝试爬取这些数据. 小猪短租(成都)页面:http://cd.xiaozhu.com/   1.爬取租房标题 ...

  3. python爬取网上租房信息_Python爬虫入门 | 5 爬取小猪短租租房信息

    小猪短租是一个租房网站,上面有很多优质的民宿出租信息,下面我们以成都地区的租房信息为例,来尝试爬取这些数据. 1.爬取租房标题 按照惯例,先来爬下标题试试水,找到标题,复制xpath. 多复制几个房屋 ...

  4. python3通过Beautif和XPath分别爬取“小猪短租-北京”租房信息,并对比时间效率(附源代码)...

    爬虫思路分析: 1. 观察小猪短租(北京)的网页 首页:http://www.xiaozhu.com/?utm_source=baidu&utm_medium=cpc&utm_term ...

  5. 使用BeautifulSoup爬取小猪短租的租房信息

    直接上代码 没有添加间隔时间 几页之后就被封了 #!/user/bin/env python #-*- coding:utf-8 -*- from bs4 import BeautifulSoup i ...

  6. [python爬虫] BeautifulSoup设置Cookie解决网站拦截并爬取蚂蚁短租

    我们在编写Python爬虫时,有时会遇到网站拒绝访问等反爬手段,比如这么我们想爬取蚂蚁短租数据,它则会提示"当前访问疑似黑客攻击,已被网站管理员设置为拦截"提示,如下图所示.此时我 ...

  7. python爬虫cookie池 与ip绑定_Python爬虫:设置Cookie解决网站拦截并爬取蚂蚁短租

    前言 文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: Eastmount PS:如有需要Python学习资料的小伙伴可以加 ...

  8. Python爬虫:设置Cookie解决网站拦截并爬取蚂蚁短租

    我们在编写Python爬虫时,有时会遇到网站拒绝访问等反爬手段,比如这么我们想爬取蚂蚁短租数据,它则会提示"当前访问疑似黑客攻击,已被网站管理员设置为拦截"提示,如下图所示.此时我 ...

  9. 用pyton爬取某短租网信息

    import requests #用于向网站服务器发起请求 from bs4 import BeautifulSoup #用于处理服务反馈回来的网页文件 import pymongo #用于连接Mon ...

最新文章

  1. diou diou_nms代码分享
  2. Micropython教程之TPYBoardv102 DIY蓝牙智能小车实例
  3. TDSQL 全时态数据库系统--核心技术
  4. 【快报】基于K2 BPM的新一代协同办公门户实践交流会
  5. Navicat for MySQL连接MySQL数据库时各种错误解决
  6. 漫画:什么是JVM的垃圾回收?
  7. SVN修改用户名与密码
  8. 计算机英语讲课笔记(2020-6-13)
  9. libevent 例子,从简单到复杂
  10. DEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade
  11. STM32+FreeRtos 移植letter-shell工具
  12. 高效记忆/形象记忆(14)110数字编码表 81-90
  13. html编辑器pp,在线轻设计工具之H5
  14. mysql不小心删除root恢复
  15. js判断手机是否安装了某个APP,如果安装了就打开,没安装就下载
  16. layDate 时间范围限制 开始时间小于结束时间
  17. 博途SCL模板项目实例,SCL学习资料,SCL详细资料,SCL教程
  18. J2EE基础之map集合框架
  19. latex初学者的经验
  20. pdd实现主图详情图片一键下载

热门文章

  1. 信号与传输介质和计算机进制转换
  2. 哈理工计算机组成原理,哈尔滨理工大学计算机组成原理课程设计.pdf
  3. 声音发生器、pwm、占空比
  4. 阿里云配置SSH密钥连接
  5. ECshop文件结构说明
  6. 计算机会计信息系统的要素,【会计信息论文】计算机会计信息系统的内部控制制度(共3879字)...
  7. Java中的参数传递,到底是值传递还是引用传递?
  8. 两年聚37亿美元,“庞氏骗局” 维卡币负责人在美被捕
  9. java net unicode / native2ascii / url decode / url encode / UTF8 / js url code
  10. 深入RecyclerView(一)