from bs4 import BeautifulSoup
from MysqlTest import *
import requests
import time
import datetime
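# The wildcard import above pulls querysql, cmdsql and sqlfetchone from a local
# MysqlTest module that is not shown in this post. Below is a minimal sketch of
# what that module might look like, assuming PyMySQL; everything here, including
# the DB_CONFIG settings, is a placeholder, not the author's actual code.
"""
# MysqlTest.py (hypothetical sketch)
import pymysql

DB_CONFIG = dict(host="localhost", user="root", password="", database="caijidb", charset="utf8mb4")

def querysql(sql):
    # Run a SELECT and return all rows as a tuple of tuples.
    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        conn.close()

def sqlfetchone(sql):
    # Run a SELECT and return only the first row.
    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchone()
    finally:
        conn.close()

def cmdsql(sql):
    # Run an INSERT/UPDATE/DELETE statement and commit.
    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()
    finally:
        conn.close()
"""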
isprint = 0  # whether to print details for each record as it is collected


def caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype='%Y-%m-%d', datelen=0):
    """Collect structured records (one container node per item) and save them to the database."""
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        # If datelen is 0, use the date text as-is; otherwise truncate it to datelen characters first.
        if datelen == 0:
            if len(dateselect) > 0:
                cdate = sp.select(dateselect)[0].text.strip().replace(" ", "")
                if len(cdate) > 4:
                    cdate = str(datetime.datetime.strptime(cdate, datetype))
                else:
                    cdate = cdate.strip()
            else:
                # An empty dateselect means the page carries no date.
                cdate = ""
        else:
            if len(dateselect) > 0:
                cdate = sp.select(dateselect)[0].text.strip()[:datelen].strip()
            else:
                cdate = sp.text.strip()[:datelen].strip()
            cdate = str(datetime.datetime.strptime(cdate, datetype))
        if len(titleselect) > 0:
            ctitle = sp.select(titleselect)[0].text.strip()
        else:
            ctitle = sp.text.strip()
        if len(urlselect) > 0:
            myurl = urlbase(url, sp.select(urlselect)[0].attrs['href'].strip())
        else:
            myurl = sp.attrs['href'].strip()
        if isprint == 1:
            print(cid, cdate, ctitle, myurl)
        # Special case for USTR 301 (cid 6): recover the date embedded in "Exclusions Granted ..." titles.
        if cid == 6:
            if ctitle.find("Exclusions Granted") >= 0 and cdate == "":
                cdate = ctitle[19:].strip().replace(" ", "")
                cdate = str(datetime.datetime.strptime(cdate, "%B%d,%Y"))
        n += caiji_save(cid, cdate, ctitle, myurl)
    print_results(cid, n, t0)


def caiji_bynodes(cid, dateselect, titleselect, url, urlselect, datetype="%Y-%m-%d"):
    """Collect records whose dates, titles and links live in three parallel node lists."""
    n = 0
    t0 = time.perf_counter()
    soup = mysoup(url)
    dates = soup.select(dateselect)
    titles = soup.select(titleselect)
    urls = soup.select(urlselect)
    for cdate, title, urlt in zip(dates, titles, urls):
        cdate = str(datetime.datetime.strptime(cdate.text.replace(" ", ""), datetype))
        ctitle = title.text.strip()
        myurl = urlbase(url, urlt.get('href').strip())
        if isprint == 1:
            print(cid, cdate, ctitle, myurl)
        n += caiji_save(cid, cdate, ctitle, myurl)
    print_results(cid, n, t0)
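# The date parsing above strips all spaces from the scraped text and then uses
# space-free format strings (e.g. "%B%d,%Y" matches "January15,2020"). A small
# standalone check of that pattern; this helper is illustrative only and is
# never called by the collectors:
def _demo_date_parse():
    raw = "January 15, 2020"                    # as it might appear on a page
    compact = raw.strip().replace(" ", "")      # -> "January15,2020"
    print(datetime.datetime.strptime(compact, "%B%d,%Y"))  # 2020-01-15 00:00:00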
#     """通过leibie数据库采集信息"""
#     sql = "select * from leibie where cid=%d" % (cid)
#
#     print(sqlfetchone(sql))
#     # if(typeid==1):
#     #     caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype, datelen)
#     # else:
#     #     caiji_bynodes(cid,dateselect,titleselect,url,urlselect,datetype)


def caiji_save(cid, cdate, ctitle, url):
    """Insert the record if it is not already in the database; return 1 if inserted, 0 otherwise."""
    # Handle an empty date.
    if len(cdate) == 0:
        sql = "select count(*) from caiji where cid=%d and cdate is null and ctitle=%s " % (cid, repr(ctitle))
    else:
        # Handle a year-only date.
        if len(cdate) == 4:
            cdate = cdate + '-1-1'
        sql = "select count(*) from caiji where cid=%d and cdate=%s and ctitle=%s " % (cid, repr(cdate), repr(ctitle))
    cjdate = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    if int(querysql(sql)[0][0]) == 0:
        sql = 'insert into caiji(cid,cdate,ctitle,url,cjdate) values(%d,%s,%s,%s,%s)' % (
            cid, repr(cdate), repr(ctitle), repr(url), repr(cjdate))
        if len(cdate.strip()) == 0:
            sql = 'insert into caiji(cid,cdate,ctitle,url,cjdate) values(%d,null,%s,%s,%s)' % (
                cid, repr(ctitle), repr(url), repr(cjdate))
        cmdsql(sql)
        print(str(cid), cdate, ctitle, url)
        return 1
    else:
        return 0


def caiji_byurl(url, soupselect):
    """Fetch a page and return the list of nodes matching soupselect."""
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
    html = requests.get(url, headers=headers).content
    soup = BeautifulSoup(html, "lxml").select(soupselect)
    return soup


def mysoup(url):
    """Fetch a page and return the full BeautifulSoup document."""
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
    html = requests.get(url, headers=headers).content
    soup = BeautifulSoup(html, "lxml")
    return soup


def urlbase(url, urlrr):
    """Resolve a relative link against the scheme and host of the page URL."""
    if urlrr.find("http") < 0 and url.find("http") >= 0:
        urltmp = url.split("//")
        urlr = urltmp[1].split("/")[0]
        urlrr = urltmp[0] + "//" + urlr + "/" + urlrr
    # Absolute links pass through unchanged; relative ones are now rooted at the host.
    return urlrr


def print_results(cid, n, t0):
    """Print how many records were collected for a category and the time taken."""
    t2 = time.perf_counter()
    sql = "select ctitle from leibie where cid = %d" % (cid)
    leibie = querysql(sql)[0][0]
    print("cid", cid, leibie, "collected", n, "records,", "elapsed:", t2 - t0)
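# caiji_save above builds SQL by interpolating repr() strings, which is fragile
# and open to SQL injection. Below is a safer sketch using bound parameters,
# assuming PyMySQL; caiji_save_safe is hypothetical, is not used by the
# collectors, and its connection settings are placeholders.
def caiji_save_safe(cid, cdate, ctitle, url):
    """Insert the record with parameterized SQL if it does not already exist."""
    import pymysql  # local import so this sketch does not affect the rest of the script
    if len(cdate) == 4:        # year-only date
        cdate = cdate + '-1-1'
    cdate = cdate or None      # empty string becomes SQL NULL
    cjdate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    conn = pymysql.connect(host="localhost", user="root", password="", database="caijidb")
    try:
        with conn.cursor() as cur:
            if cdate is None:
                cur.execute("select count(*) from caiji where cid=%s and cdate is null and ctitle=%s",
                            (cid, ctitle))
            else:
                cur.execute("select count(*) from caiji where cid=%s and cdate=%s and ctitle=%s",
                            (cid, cdate, ctitle))
            if int(cur.fetchone()[0]) == 0:
                cur.execute("insert into caiji(cid,cdate,ctitle,url,cjdate) values(%s,%s,%s,%s,%s)",
                            (cid, cdate, ctitle, url, cjdate))
                conn.commit()
                return 1
            return 0
    finally:
        conn.close()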
"""Collectors for specific sites"""


def caiji_zhongguooumengshanghui():
    """Collect European Union Chamber of Commerce in China press releases."""
    cid = 1
    url = "https://www.europeanchamber.com.cn/en/press-releases"
    dateselect = ".chapter-category"
    titleselect = "h3 a"
    urlselect = "h3 a"
    soupselect = ".panel-default"
    datelen = 10
    caiji_byid(cid=cid, url=url, soupselect=soupselect, dateselect=dateselect,
               titleselect=titleselect, urlselect=urlselect, datelen=datelen)


def caiji_unctad():
    """Collect UNCTAD news."""
    cid = 3
    url = "https://unctad.org/en/Pages/Home.aspx"
    soupselect = "#container1 .row"
    dateselect = "div p b"
    titleselect = "a span"
    urlselect = "div div a"
    datetype = "%d%B%Y"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype)


def caiji_OECD_pre():
    """Collect OECD publication news."""
    cid = 4
    url = "https://www.oecd-ilibrary.org/search?value51=%27igo%2Foecd%27&sortDescending=false&value5=30191110114407&operator51=AND&value1=subtype%2Fissue+OR+subtype%2Fbook&value4=20191110114407&option5=sortDate_to&value3=status%2F50embargoDate&publisherId=%2Fcontent%2Figo%2Foecd&facetNames=pub_igoId_facet+pub_themeId_facet&option3=pub_contentStatus&sortField=prism_publicationDate&option4=sortDate_from&option1=dcterms_type&facetOptions=51+52&option51=pub_igoId_facet&operator52=AND&option52=pub_themeId_facet&value52=%27theme%2Foecd-79%27"
    soupselect = ".title_box"
    dateselect = ".search-metaitem + .comma_separated li"
    titleselect = ".search_title"
    urlselect = ".search_title a"
    datetype = "%d%b%Y"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype)


def caiji_imfnews():
    """Collect the latest IMF news; dates, titles and links are parallel node lists."""
    cid = 2
    url = "https://www.imf.org/external/what/whatsnewenglish/what.aspx"
    dateselect = "#content p span"
    titleselect = "h4 a"
    urlselect = "h4 a"
    datetype = "%B%d,%Y"
    caiji_bynodes(cid, dateselect, titleselect, url, urlselect, datetype)


def caiji_WorlBank():
    """Collect World Bank news."""
    cid = 5
    url = "https://openknowledge.worldbank.org/discover?scope=%2F&query=&submit="
    soupselect = ".item-metadata"
    dateselect = ".date-info a"
    titleselect = "h4"
    urlselect = "h4 a"
    datetype = "%b%d,%Y"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype)


def caiji_USTR301():
    """Collect USTR Section 301 news, then each linked tariff-action subpage."""
    cid = 6
    url = "https://ustr.gov/issue-areas/enforcement/section-301-investigations/tariff-actions"
    soupselect = ".content p"
    dateselect = ""
    titleselect = "a"
    urlselect = "a"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect)
    soupselect = ".content p a"
    for urls in caiji_byurl(url, soupselect):
        turl = urls.attrs['href'].strip()
        soupselect = ".content p a"
        dateselect = ""
        titleselect = ""
        urlselect = ""
        caiji_byid(cid, turl, soupselect, dateselect, titleselect, urlselect)


def caiji_USTR_News():
    """Collect USTR press releases."""
    cid = 7
    url = "https://ustr.gov/about-us/policy-offices/press-office/press-releases"
    soupselect = ".listing li"
    dateselect = ""
    titleselect = "a"
    urlselect = "a"
    datetype = "%m/%d/%Y"
    datelen = 10
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype, datelen)


def caiji_AmchamChina():
    """Collect AmCham China (American Chamber of Commerce in China) news."""
    cid = 8
    url = "https://www.amchamchina.org/about/press-center/amcham-statement/"
    soupselect = ".tag-news"
    dateselect = ".date"
    titleselect = ".tag-news-title h2 a"
    urlselect = ".tag-news-title h2 a"
    datetype = "%d%B,%Y"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype)


if __name__ == '__main__':
    # 1 European Union Chamber of Commerce in China news
    caiji_zhongguooumengshanghui()
    # 2 IMF news
    caiji_imfnews()
    # 3 UNCTAD news
    caiji_unctad()
    # 4 OECD news
    caiji_OECD_pre()
    # 5 World Bank news
    caiji_WorlBank()
    # 6 USTR Section 301 investigation news
    caiji_USTR301()
    # 7 USTR press releases
    caiji_USTR_News()
    # 8 AmCham China news
    caiji_AmchamChina()
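# The script assumes two MySQL tables that are never created here: caiji for
# collected records and leibie for category names. Below is a plausible
# one-time setup inferred from the queries in caiji_save and print_results;
# column types and lengths are guesses, and create_tables() is a hypothetical
# helper that this script never calls.
def create_tables():
    cmdsql("""create table if not exists leibie (
        cid    int primary key,   -- category id used by each collector
        ctitle varchar(100)       -- category name shown by print_results
    )""")
    cmdsql("""create table if not exists caiji (
        id     int auto_increment primary key,
        cid    int,               -- category id (matches leibie.cid)
        cdate  datetime null,     -- publication date; null when unknown
        ctitle varchar(500),      -- record title
        url    varchar(1000),     -- record link
        cjdate datetime           -- time the record was collected
    )""")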
