python 爬取财经新闻_Python光大证券中文云系统——爬取新浪财经新闻

【任务目标】

调通光大证券中文云系统

【任务进度】

依据Github光大证券中文云系统开源的说明文档，该系统分为爬虫模块、检索模块、统计模块、关键词频模块和关键词网络模块，整体非常庞大。目前的进度是深入研究了爬虫模块。爬虫模块的主要作用是将股票论坛、个股新闻、研究报告三个网站的网页数据通过网页解析的方式把文本内容爬下来，用于之后各模块的文本挖掘。爬虫模块将爬到的文本数据以【日期 + 股票代码】为单位存至相应的TXT文本文件当中，同时将文本文件所在的位置以及其他相关信息写入数据库。

对于每个数据源,都有一个独立的程序进行网页爬虫,他们分别是:

GetGuba_pylucene.py 股票论坛网页爬虫

GetMbReport_pylucene.py 研究报告网页爬虫

GetSinaNews_pylucene.py 个股新闻网页爬虫

目前重点研究了个股新闻网页爬虫。由于该系统已有一定的年代，采用Python 2.7版本编写，已经不能在Python 3以上的版本中运行，因为语法有一定程度的变动，且该模块调用了很多包，所以前期花了很长的时间在配置环境上。其次，由于个股新闻网页爬虫与数据库直接相连，而原本光大证券的数据库不可能给出，也无法再度连接，不得不去揣摩每张表的样貌、有哪些字段，从而在自己的数据库里建表模拟，否则根本无法运行。最后，要看懂代码，并对相关部分进行修改、调试，方便运行。

目前的进展是，我附上的GetSinaNews_pylucene.py这个代码已经可以直接运行，连接读取数据库也成功。但是无法出现Technical说明文档中，存储text文件的效果。经查证，似乎是调用的方式不对，GetSinaNews_pylucene.py似乎是后台文件被调用，要直接运行前端的某个文件，然而前端该文件似乎还调用了整个项目其他py代码，很复杂，因此搁置，未能调通。

具体的代码阅读理解，在注释中说明。

#coding=GBK

import Queue #一个队列的包，与thread配合使用，多线程运行，保证速度

import threading #多线程

#from pymmseg import mmseg ##load word-segmentation tool

#mmseg.dict_load_defaults() ##load the default word-segmentation dictionary

#mmseg.dict_load_words('stockname.dic') ##load the special stock-name and stock-number dictionary

import urllib2 #爬虫抓取网页内容常用的一个包，抓取下来后，利用包中自带的一系列函数，对内容进行类似文档一样的处理和调用

import time

import re

from bs4 import BeautifulSoup

#beatifulsoup一个爬虫解析网页的包

from urlparse import urljoin

import pyodbc #连接远程sql server数据库的包

#import MySQLdb

import string

import sys

reload(sys)

sys.setdefaultencoding('gbk') #对中文字符进行的编码处理

# Template listing page on Sina Finance; the symbol is swapped per stock below.
originalURL="http://money.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh600418&Page=1"

# Work queues shared by the worker threads: `queue` carries [url, stock_code]
# pairs to download, `out_queue` carries [url, html, stock_code] to parse.
queue = Queue.Queue()
out_queue = Queue.Queue()

# Seed list of [listing_url, six_digit_stock_code] pairs.
hosts=[]

# Server, database and credentials are blank in the published source and must
# be filled in before this can connect.
conn1=pyodbc.connect("DRIVER={SQL Server};SERVER=;DATABASE=;UID=;PWD=")
cur=conn1.cursor()

# Read every stock code together with the market it trades on.
# Filter that was disabled in the original:
#   where class like 'A%' and closedate is Null order by code
cur.execute("select s_info_code,market from ggg")
result=cur.fetchall()

# Prefix each code with the lower-case market tag Sina expects in its URLs.
stockCodes=[]
for row in result:
    if row[1]=='SZ'.encode('GBK'):
        stockCodes.append('sz'+row[0])
    if row[1]=='SH'.encode('GBK'):
        stockCodes.append('sh'+row[0])

# Build one listing URL per stock and remember it with the bare code.
for stock_code in stockCodes:
    # [:-15] drops the template's trailing "sh600418&Page=1";
    # [-7:] puts "&Page=1" back after the real symbol.
    oriURL=originalURL[:-15]+stock_code+originalURL[-7:]
    print(oriURL)
    #time.sleep(1)
    hosts.append([oriURL,stock_code[2:]])

class ThreadUrl(threading.Thread):
    """Worker thread that downloads pages.

    Takes ``[url, stock_code]`` items from ``queue``, fetches the URL with
    urllib2 and puts ``[url, raw_page, stock_code]`` on ``out_queue``.
    """

    def __init__(self, queue, out_queue):
        """Store the input queue of URLs and the output queue of page bodies."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Fetch pages forever; a failed fetch is reported and skipped."""
        while True:
            try:
                # Grab the next host from the queue.
                hostchunk = self.queue.get()
                host=hostchunk[0]
                queuestock_code=hostchunk[1]
                # Download the whole page body.
                url = urllib2.urlopen(host)
                chunk = url.read()
                # Hand the page and its stock code to the parser threads.
                self.out_queue.put([host,chunk,queuestock_code])
                # NOTE(review): the original has task_done() commented out, so
                # Queue.join() on this queue cannot complete once items exist.
                #self.queue.task_done()
            except Exception as e:
                print('There is Problem@@@@@@@@@@@@@@@@@@@GRABErro')
                if 'HTTP Error' in str(e):
                    # Back off, then rebuild listing URLs from stock_code.txt.
                    time.sleep(10)
                    with open('stock_code.txt','r') as filer:
                        stock_codes=filer.readlines()
                    for stock_code in stock_codes:
                        # host[71:79] is the 8-character symbol (e.g. sh600418)
                        # in the listing URL; [:-1] drops the line's last
                        # character (the newline).
                        oriURL=host[:71]+stock_code[:-1]+host[79:]
                        # NOTE(review): nothing visible re-reads `hosts` after
                        # start-up, so these entries may never be fetched.
                        hosts.append([oriURL,stock_code[:-1]])
                continue

class DatamineThread(threading.Thread):
    """Worker thread that parses downloaded Sina Finance pages.

    Takes ``[url, raw_page, stock_code]`` items from ``out_queue``. Listing
    pages (URL contains 'AllNews') have their article links pushed back onto
    ``queue``; every other page is treated as an article and stored.
    """

    def __init__(self,queue,out_queue):
        """Store the URL queue (to feed) and the page queue (to consume)."""
        threading.Thread.__init__(self)
        self.out_queue = out_queue
        self.queue = queue

    def run(self):
        """Parse pages forever; failures are appended to DatamineErro.txt."""
        crawler1=crawler(self.getName())
        while True:
            try:
                chunkUrl = self.out_queue.get()
                page=chunkUrl[0]
                chunk=chunkUrl[1]
                tempstock_code=chunkUrl[2]
                print('addtoindex::'+page)
                if 'Page' in page:
                    print(self.getName()+'Beginning Souping'+page)
                soup=BeautifulSoup(chunk,fromEncoding='GBK')
                crawler1.addtoindex(page,tempstock_code)
                if 'AllNews' in page:
                    self._queue_listing_links(crawler1,soup,page)
                else:
                    self._store_article(crawler1,soup,page,tempstock_code)
                crawler1.dbcommit()
            except Exception as e:
                print('There is Problem@@@@@@@@@@@@@@@@@@@DATAMINEErro')
                with open('DatamineErro.txt','a+') as writeindex2:
                    writeindex2.write(str(e)+'\n')
                continue

    def _store_article(self,crawler1,soup,page,stock_code):
        """Store an article page, or queue the page it redirects to."""
        # Some Sina pages only redirect to another page; handle that here.
        InCaseTransLink=soup.find('meta',{"http-equiv":"Refresh"})
        if InCaseTransLink is None:
            crawler1.insertTextInfo(soup,page,stock_code)
            return
        # Presumably skips a 6-character prefix such as "0;URL=" in the
        # refresh content -- TODO confirm against a live page.
        transLink=InCaseTransLink['content'][6:]
        if not crawler1.isindexed(transLink,stock_code):
            self.queue.put([transLink,stock_code])

    def _queue_listing_links(self,crawler1,soup,page):
        """Queue every not-yet-seen article link found on a listing page."""
        soup1=soup.find('table',{"class":"table2"})
        if soup1 is None:
            return
        is_early='Early' in page
        # The 6-digit stock code sits at a fixed offset in the listing URL;
        # "Early" listing URLs are five characters longer.
        code=page[78:84] if is_early else page[73:79]
        links=soup1.findAll('a',href=True)
        for link in links:
            href=link["href"]
            if not crawler1.isInserted(href,code) and not crawler1.isindexed(href,code):
                self.queue.put([href,code])
        # On Sina's early-news listings the next-page link points at the wrong
        # (non-Early) page, so rebuild it by hand.
        if is_early and links:
            lastlink=links[-1]["href"]
            if 'News' in lastlink and 'Early' not in lastlink:
                newlastlink=lastlink[:59]+'Early'+lastlink[59:]
                if not crawler1.isindexed(newlastlink,code):
                    self.queue.put([newlastlink,code])

###################################################################################

class crawler:
    """Database helper used by the parser threads.

    Wraps one pyodbc connection and cursor and offers the lookups and inserts
    needed to record crawled URLs (sinaNewsUrllist) and article text
    (sinaStockNews). Values are passed as query parameters; only table and
    column names, which come from the code itself, are formatted into SQL.
    """

    def __init__(self, name):
        """Open the database connection; ``name`` labels the owning thread."""
        # The published listing also showed sqlite and MySQLdb connection
        # attempts here. They rely on names the visible file never defines
        # (`sqlite`, `dbname`, `MySQLdb`) and on MySQL-only statements, so only
        # the pyodbc connection is kept.
        self.conn=pyodbc.connect("DRIVER={SQL Server};SERVER=;DATABASE=;UID=;PWD=")
        self.cursor=self.conn.cursor()
        self.name=name

    def __del__(self):
        """Close the cursor if one was ever opened."""
        cursor=getattr(self,'cursor',None)
        if cursor is not None:
            cursor.close()

    def dbcommit(self):
        """Commit the current transaction."""
        self.conn.commit()

    def getentryid(self,table,field1,value1,field2,value2,createnew=True):
        """Return the rowid of the row matching both fields, inserting if absent.

        ``table``, ``field1`` and ``field2`` are formatted into the SQL text
        and must be trusted identifiers. ``createnew`` is accepted for
        compatibility; as in the original, a missing row is always inserted.
        """
        self.cursor.execute("select rowid from %s where %s=? and %s=?"%(table,field1,field2),
                            (value1,value2))
        res=self.cursor.fetchone()
        if res is not None:
            return res[0]
        # SQL Server flavour: insert, then read back the highest rowid.
        self.cursor.execute("insert into %s values(?,?)"%(table),(value1,value2))
        self.conn.commit()
        self.cursor.execute("select max(rowid) from %s"%(table))
        LastRowID=self.cursor.fetchone()[0]
        return LastRowID

    def addtoindex(self,url,stockcode):
        """Record the (url, stockcode) pair in sinaNewsUrllist if it is new."""
        if not self.isindexed(url,stockcode):
            self.getentryid('sinaNewsUrllist','url',url,'stockcode',stockcode)

    def gettextonly(self,soup):
        """Return the text under ``soup``, each leaf string stripped."""
        v=soup.string
        if v is not None:
            return v.strip()
        # No single string: concatenate the text of every child in order.
        return ''.join([self.gettextonly(child) for child in soup.contents])

    def getContentsText(self,soup,url):
        """Return the text of the 'huifu' div for 'look' URLs, else None."""
        if url[26:30]=='look':
            content=soup.find('div',{"class":"huifu"})
            return self.gettextonly(content)
        return None

    def getTitleText(self,soup,url):
        """Return the text of the 'biaoti' div for 'look' URLs, else None."""
        if url[26:30]=='look':
            content1=soup.find('div',{"class":"biaoti"})
            return self.gettextonly(content1)
        return None

    def separatewords(self,text):
        """Split ``text`` into words with mmseg and return them GBK-encoded.

        NOTE(review): the mmseg import is commented out at the top of the
        file, so this fails with NameError until it is restored.
        """
        utf8text=text.encode('utf-8') #because mmseg can only deal with utf-8
        algor=mmseg.Algorithm(utf8text)
        resulttext=[]
        for tok in algor:
            resulttext.append(tok.text.decode('utf-8').encode('GBK'))
        return resulttext

    def isindexed(self,url,stockcode):
        """Return True if sinaNewsUrllist already holds (url, stockcode)."""
        self.cursor.execute("select rowid from sinaNewsUrllist where url=? and stockcode=?",
                            (url,stockcode))
        return self.cursor.fetchone() is not None

    def isInserted(self,url,stockcode):
        """Return True if sinaStockNews already holds (url, stockcode)."""
        self.cursor.execute("select * from sinaStockNews where url=? and stockcode=?",
                            (url,stockcode))
        return self.cursor.fetchone() is not None

    def insertTextInfo(self,soup,url,stock_code):
        """Store one article row: url, stock code, date, title, body text.

        Pages without the 'blkContainerSblk' container, or without any <p>
        inside it, are skipped.
        """
        print('@@@@@@@@@@@@@@@@@@@@@@@@')
        tempsoup=soup.find('div',{"class":"blkContainerSblk"})
        if tempsoup is None:
            # The original tested this only after dereferencing tempsoup.
            return
        title=tempsoup.find('h1').string.encode('GBK')
        timestring=soup.find('span',{"id":"pub_date"}).string
        # Keep characters 0-3, 5-6 and 8-9 of the timestamp (presumably
        # YYYY, MM and DD -- TODO confirm the page's date format).
        pub_date=timestring[0:4]+timestring[5:7]+timestring[8:10]
        print(pub_date)
        texts=tempsoup.findAll('p')
        if len(texts)>0:
            resultText=''
            for text in texts:
                tmptext=self.gettextonly(text)
                # Skip the paragraph that only carries Sina's inline script.
                if 'document.getElementByIdx_x("artibodyTitle").innerHTML' not in tmptext:
                    resultText=resultText+tmptext+'\n'
            # The original's quote-stripping replace() discarded its result;
            # bound parameters make quotes in the text safe without it.
            self.cursor.execute("insert into sinaStockNews values(?,?,?,?,?)",
                                (url,stock_code,str(pub_date),title,
                                 resultText.decode('GBK').encode('GBK')))

    def insertWordInfo(self,soup,url):
        """Store the segmented title and body words of a page, once per page.

        The text id is built from fixed character positions of ``url``.
        """
        text_id=int(url[31:37]+url[38:-5])
        self.cursor.execute("select * from wordlocation where textid=?",(text_id,))
        if self.cursor.fetchone() is not None:
            return
        contentsText=str(self.getContentsText(soup,url))
        titleText=str(self.getTitleText(soup,url))
        contentsWords=self.separatewords(contentsText)
        titleWords=self.separatewords(titleText)
        for word in contentsWords:
            self.cursor.execute("insert into wordlocation(textid,word,location) values(?,?,'contents')",
                                (text_id,word))
        for word in titleWords:
            self.cursor.execute("insert into wordlocation(textid,word,location) values(?,?,'title')",
                                (text_id,word))
        self.conn.commit()

    def createindextables(self):
        """Create the search database, tables and indexes.

        NOTE(review): the statements mix MySQL syntax (character set /
        default charset) with SQL Server syntax (identity) -- confirm which
        server this is meant for before running it.
        """
        self.cursor.execute('create database gubaSearch character set gbk')
        self.cursor.execute('create table urllist(rowid int identity(1,1) ,url varchar(100))default charset=gbk')
        self.cursor.execute('create table wordlocation(textid bigint,word varchar(100) ,location varchar(100))default charset=gbk')
        self.cursor.execute('create index urlidx on urllist(url)')
        self.cursor.execute('create index textidx on wordlocation(textid)')
        self.cursor.execute("create table text(id bigint Primary key,url varchar(100),pub_date date,topic_id int,reply_num int,access_num int ,subject varchar(100))default charset=gbk")
        self.cursor.execute('create index idx on text(id)')
        self.cursor.execute('create index pubdatex on text(pub_date)')
        self.conn.commit()

def main():
    """Start the downloader and parser threads and feed them the seed URLs."""
    # Spawn a pool of 15 downloader threads.
    for i in range(15):
        t = ThreadUrl(queue, out_queue)
        t.setDaemon(True)  # daemon threads do not keep the process alive
        t.start()

    # Populate the download queue with the seed listing pages.
    for host in hosts:
        queue.put(host)

    # Spawn 40 parser threads.
    for i in range(40):
        dt = DatamineThread(queue,out_queue)
        dt.setDaemon(True)
        dt.start()
    print('bbbb')

    # Wait on the queues until everything has been processed.
    # NOTE(review): no visible worker calls task_done(), so once any item has
    # been queued these joins block indefinitely while the daemon threads run.
    queue.join()
    out_queue.join()

main()

python 爬取财经新闻_Python光大证券中文云系统——爬取新浪财经新闻相关推荐

Python光大证券中文云系统——爬取新浪财经新闻
[任务目标] 调通光大证券中文云系统 [任务进度] 依据Github光大证券中文云系统开源的说明文档,应该是分爬虫模块.检索模块.统计模块.关键词频模块和关键词网络模块,是一个整体非常庞大的系统.现在 ...
光大证券“中文云”项目开源地址
光大"中文云"项目开源地址为带动国内金融文本研究,光大中文云系统现已开源.7人专业团队, 6年专注研究,2万行代码,爬虫.索引.检索.统计.热度.选股一气呵成.共享地址: htt ...
python爬取小说基本信息_Python爬虫零基础实例---爬取小说吧小说内容到本地
Python爬虫实例--爬取百度贴吧小说写在前面本篇文章是我在简书上写的第一篇技术文章,作为一个理科生,能把仅剩的一点文笔拿出来献丑已是不易,希望大家能在指教我的同时给予我一点点鼓励,谢谢. 一.介 ...
python爬网易新闻_Python爬虫实战教程：爬取网易新闻
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: Amauri PS:如有需要Python学习资料的小伙伴可以加点击 ...
python爬网易新闻_Python爬虫实战教程：爬取网易新闻；爬虫精选高手技巧
Python爬虫实战教程:爬取网易新闻:爬虫精选高手技巧发布时间:2020-02-21 17:42:43 前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有, ...
python爬取bilibili弹幕_python爬虫：bilibili弹幕爬取+词云生成
如果你懒得看下边的文字,我录了一个完整的教学视频在b站上. 我的B站教学:https://www.bilibili.com/video/av75377135?p=2 工作原理 b站是提供弹幕接口的,所 ...
python简单实践作业答案_python入门实践四：爬取牛客网面试专项练习题及答案
说明:个人练手python用. 操作系统:window10 x64 IDE:Pycharm 2017.2.2 Python版本:3.6.2 目标牛客网是一个IT笔试面试的平台,提供了很多题库,今天我 ...
python从网址爬图片协程_python 用 gevent 协程抓取海量网页
python作为爬虫利器,抓网页的方式简洁明了.爬成百上千的网页,都可以很快爬完,但是如果网页数量上万呢?速度就不能忍受了. 这是一段爬取页面的函数,用了requests库:1 2 3 4 5impo ...
python爬火车票是不是违法_python利用selenium+requests+beautifulsoup爬取12306火车票信息...
在高速发展的时代.乘车出远门是必不可少的,有些查询信息是要收费的.这里打造免费获取火车票信息想要爬取12306火车票信息,访问12306官方网站,输入出发地,目的地 ,时间之后点击确定,这是我 ...

python 爬取财经新闻_Python光大证券中文云系统——爬取新浪财经新闻

python 爬取财经新闻_Python光大证券中文云系统——爬取新浪财经新闻相关推荐

最新文章

热门文章