python必备源代码-Python 自用代码（某方标准类网页源代码清洗）

#coding=utf-8

import requests
from lxml import etree
from pymongo import MongoClient

# Labels of the metadata fields as they appear in the page's <span> elements.
# standard_dict() refers to them by position, so the order must not change.
s = [
    u'标准编号：',
    u'发布单位：',
    u'发布日期：',
    u'状态：',
    u'实施日期：',
    u'开本页数：',
    u'采用关系：',
    u'中图分类号：',
    u'中国标准分类号：',
    u'国际标准分类号：',
    u'国别：',
    u'关键词：',
    u'摘要：',
]

# Get the database.

def get_db():
    """Connect to MongoDB and return the authenticated ``wanfang`` database.

    The host and the credentials below look like placeholders and have to be
    replaced with real values before the script can run.
    """
    client = MongoClient('IP', 27017)
    db = client.wanfang
    # NOTE(review): Database.authenticate() was removed in PyMongo 4.0. On a
    # newer driver the credentials must be passed to MongoClient instead.
    db.authenticate("用户名", "密码")
    return db


# Fetch the num-th record.

def get_data(table, num):
    """Return the ``content`` of the *num*-th document of *table* (1-based).

    Documents are counted in the order ``table.find`` yields them. If the
    *num*-th document has no non-empty ``content``, the first later document
    that has one is returned instead.

    Returns ``None`` when *num* is smaller than 1 or no such document exists.
    """
    if num < 1:
        return None
    for position, item in enumerate(table.find({}, {"content": 1, "_id": 0}),
                                    start=1):
        if position < num:
            continue
        # dict.has_key() no longer exists on Python 3; .get() covers both the
        # "key present" and the "value non-empty" check.
        if item.get('content'):
            return item['content']
    return None

# Convert a list to a string.
def list_str(list):
    """Return the first element of *list*, or ``""`` when it is empty.

    The parameter name shadows the builtin; it is kept so that existing
    callers are not affected.
    """
    return list[0] if list else ""

# Extract classification codes.
def code_ls(list):
    """Split ``list[0]`` on whitespace and drop tokens containing parentheses.

    Both ASCII and full-width parentheses are recognised.

    Returns the list of remaining tokens, or ``""`` when *list* is empty
    (the mixed return type is what the original code produced).
    """
    if not list:
        return ""
    parentheses = u"()（）"
    return [token for token in list[0].split()
            if not any(mark in token for mark in parentheses)]

# Build the keyword list.
def keywords_ls(list):
    """Return *list* unchanged, or ``""`` when it is empty."""
    return list if list else ""

# Replaced standard.
def replace_str(replace):
    """Return the cleaned first text of *replace* without its 5-char prefix.

    The first element is stripped, all spaces are removed, and the first five
    characters are cut off (presumably the "替代标准：" label - verify against
    the page markup).

    Returns ``""`` when *replace* is empty.
    """
    if not replace:
        return ""
    # Only the first element is ever used, so only that one is cleaned.
    cleaned = replace[0].strip().replace(" ", "")
    return cleaned[5:]

# Extract the abstract.
def summary_str(list):
    """Return ``list[0]`` unless it starts with ``<``; otherwise ``""``.

    Also returns ``""`` when *list* is empty. An empty first string is handled
    without raising (indexing its first character would fail).
    """
    if list and not list[0].startswith("<"):
        return list[0]
    return ""

# Normalise the date format.
def date_str(list):
    """Convert a date such as ``2017年6月28日`` to ``2017-06-28``.

    Only ``list[0]`` is read. Single-digit months and days are zero-padded.
    The caller's list is left untouched.

    Returns ``""`` when *list* is empty.
    """
    if not list:
        return ""
    text = list[0]
    year = text.find(u'年')
    month = text.find(u'月')
    day = text.find(u'日')
    # A distance of 2 between two markers means exactly one digit in between.
    # Both paddings are decided from the original positions; inserting the
    # first "0" shifts month and day equally, so their distance is unaffected.
    if month - year == 2:
        text = text.replace(u"年", u"年0")
    if day - month == 2:
        text = text.replace(u"月", u"月0")
    return text.replace(u"日", "").replace(u"月", "-").replace(u"年", "-")

# Normalise the adoption-relation format.
def adopted_ls(string, ls):
    """Parse ``code,TYP`` pairs out of *string* and append them to *ls*.

    For every comma found, the text before it (stripped) becomes ``"code"``
    and the three characters right after it become ``"type"``; parsing then
    continues behind those three characters.

    *ls* is extended in place and also returned.
    """
    loc = string.find(',')
    while loc != -1:
        ls.append({
            "code": string[:loc].strip(),
            "type": string[loc + 1:loc + 4],
        })
        string = string[loc + 4:]
        loc = string.find(',')
    return ls


# Build the dict that is stored for one standard.

def standard_dict(html):
    """Parse one standard's page source into the dict that gets stored.

    Most fields are located through the ``<span>`` whose text equals one of
    the labels in the module-level list ``s``; the raw XPath result is then
    passed through the matching helper (``list_str``, ``date_str``,
    ``code_ls``, ...).

    :param html: page source handed to ``etree.HTML``.
    :return: dict with the keys ``title``, ``title_eng``, ``standard_number``,
        ``publishing_department``, ``release_date``, ``state``,
        ``enforcement_date``, ``pages``, ``adopted``, ``clc``, ``ccs``,
        ``ics``, ``country``, ``keywords``, ``summary`` and ``replace_for``.
    """
    tree = etree.HTML(html)

    sibling_text = "/following-sibling::*/text()"
    sibling_child_text = "/following-sibling::*/child::*/text()"

    def labelled(index, tail=sibling_text):
        # Text nodes reached via *tail* from the <span> labelled s[index].
        return tree.xpath(("//span[text()='%s']" + tail) % (s[index]))

    dc = {}
    # Standard title
    dc["title"] = list_str(tree.xpath("//h1/text()"))
    # Foreign-language title
    dc["title_eng"] = list_str(tree.xpath("//h2/text()"))
    # Standard number
    dc["standard_number"] = list_str(labelled(0))
    # Publishing body
    dc["publishing_department"] = list_str(labelled(1))
    # Release date
    dc["release_date"] = date_str(labelled(2))
    # Status
    dc["state"] = list_str(labelled(3))
    # Enforcement date
    dc["enforcement_date"] = date_str(labelled(4))
    # Format / page count
    dc["pages"] = list_str(labelled(5))
    # Adoption relation
    dc["adopted"] = adopted_ls(list_str(labelled(6)), [])
    # Chinese Library Classification number
    dc["clc"] = code_ls(labelled(7))
    # Chinese Classification for Standards (value sits one level deeper)
    dc["ccs"] = code_ls(labelled(8, sibling_child_text))
    # International Classification for Standards
    dc["ics"] = code_ls(labelled(9))
    # Country
    dc["country"] = list_str(labelled(10))
    # Keywords (values sit one level deeper)
    dc["keywords"] = keywords_ls(labelled(11, sibling_child_text))
    # Abstract (text lives in the siblings of the label's parent)
    dc["summary"] = summary_str(
        labelled(12, "/parent::*/following-sibling::*/text()"))
    # Replaced standard
    dc["replace_for"] = replace_str(
        tree.xpath("//div[@id='replaceStandard']//child::*//text()"))
    return dc


# Main function.

def main():
    """Clean every document of ``standard`` into ``standard_cleaned``.

    Each document with a non-empty ``content`` is parsed by
    ``standard_dict`` and the resulting dict is inserted into the
    ``standard_cleaned`` collection.
    """
    db = get_db()
    collection = db.standard
    collection2 = db.standard_cleaned
    for item in collection.find({}, {"content": 1, "_id": 0}):
        # dict.has_key() no longer exists on Python 3; .get() covers both the
        # "key present" and the "value non-empty" check.
        if item.get('content'):
            dc = standard_dict(item['content'])
            # NOTE(review): Collection.insert() was removed in PyMongo 4.0;
            # insert_one() is the replacement on newer drivers.
            collection2.insert(dc)


if __name__ == '__main__':
    main()

# The commented-out code below tests cleaning one specific record.

#db = get_db()

#collection=db.standard

#collection2 = db.standard_cleaned

#data = get_data(collection, 8)

#dc = standard_dict(data)

#collection2.insert(dc)

#for k,v in dc.items():

#print k,v

## 以下代码用于测试提取摘要

#data = requests.get('http://d.wanfangdata.com.cn/Standard/ISO%208528-5-2013')

#dc = standard_dict(data.text)

#for k,v in dc.items():

#print k,v

## 以下代码用于测试修改日期格式

#l1 = [u"2017年6月28日"]

#l2 = [u"2017年10月27日"]

#l3 = [u"2017年12月1日"]

#l4 = [u"2017年7月1日"]

#print date_str(l1)

#print date_str(l2)

#print date_str(l3)

#print date_str(l4)

python必备源代码-Python 自用代码（某方标准类网页源代码清洗）相关推荐

c# 获取html代码怎么写,C#获取网页源代码的方法
本文实例讲述了C#获取网页源代码的方法.分享给大家供大家参考.具体如下: public string GetPageHTML(string url) { try { HttpWebRequest wr ...
Python colorama 彩色打印实现代码
这篇文章主要介绍了Python colorama 彩色打印实现代码,将介绍的类为Back, 它实现了与 Fore 类相同的九个关键字:BLACK.RED.GREEN.YELLOW.BLUE.MAGEN ...
python骗局-1.python真的是万恶之源么?(初识python)
python真的是万恶之源么? 计算机基础及python了解 1.计算机基础知识 cpu : 相当于人类大脑,运算和处理问题 内存 : 临时存储数据,断电就消失,4G,8G,16G,32G 硬盘 : ...
php的源码怎么查看,php 查看页面源代码的实现代码(图文)
本节分享的这段php代码,可用于显示与查看网页的源代码. 代码: /** * 显示与查看网页源代码 * edit:www.jbxue.com */ // Page title $pagetitle = ...
使用 Python3 获取网页源代码
爬虫的数据爬取量非常大,显然不可能对每个页面都手动复制源代码,因此就有必要使用自动化的方式来获取网页源代码.requests是Python的一个第三方HTTP(Hypertext Transfer P ...
c#正则表达式取出数据库中带html标签的内容,C#用正则表达式获取网页源代码标签的属性或值...
1.有url获取到网页源代码: using System.Web; using System.IO; using System.Net; private void GetHtmlinfo(string ...
html历史查看器,网页源代码查看器
网页源代码查看器是一款可以查看手机浏览器网页的代码查看工具.平时我们都是在电脑IE上查看代码,现在手机上也可以查看了.网页源代码查看器app可以进行网页源代码的查看,并且拥有高亮显示,支持同步预览以及 ...
苹果手机怎么能查看网页源代码
苹果手机查看网页源代码的方法是:打开Safari浏览器――点击网址分享――点击书签――命名后点击存储――点击书签――点击编辑――点击看源代码即可查看. 苹果手机查看网页源代码的方法是: 1.打开Saf ...
python获取网站代码_python爬虫1——获取网站源代码(豆瓣图书top250信息)
# -*- coding: utf-8 -*- import requests import re import sys reload(sys) sys.setdefaultencoding('utf ...

python必备源代码-Python 自用代码（某方标准类网页源代码清洗）

python必备源代码-Python 自用代码（某方标准类网页源代码清洗）相关推荐

最新文章

热门文章