Python 爬取51VOA网站全部 Learn A Word 文本及mp3音频

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Crawl the text and mp3 audio of every "Learn A Word" lesson on the 51VOA website.
import os
import sys
import time
import urllib as req
from threading import Thread
import urllib2
import urllib
from threading import Thread
import xml
import re
class MyWorkThread(Thread, urllib.FancyURLopener):
    """Worker thread that downloads one byte range of a URL into a part file.

    The thread is also a ``urllib.FancyURLopener`` so it can attach its own
    ``Range`` request header. ``run()`` is the method ``Thread.start()``
    invokes in the new thread.

    Attributes:
        url: address to download from.
        filename: part file the received bytes are appended to.
        ranges: ``(first_byte, last_byte)`` pair, both offsets inclusive.
            The default ``0`` is not subscriptable, so a real pair must be
            supplied before ``run()`` executes.
        downloaded: bytes of this part present so far (bytes already in the
            part file plus bytes received by this thread).
    """

    def __init__(self, threadname, url, filename, ranges=0):
        Thread.__init__(self, name=threadname)
        urllib.FancyURLopener.__init__(self)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0

    def run(self):
        """Download the bytes of this part that are not yet in the part file."""
        # Bytes left in the part file by an earlier run count as downloaded.
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0

        # Resume right after the bytes that are already on disk.
        self.startpoint = self.ranges[0] + self.downloaded

        # ranges[1] is the inclusive offset of the last byte, so the part is
        # complete only once the start point has moved past it.  Using ">="
        # would skip one-byte parts and drop the final byte of a resumed part.
        if self.startpoint > self.ranges[1]:
            print('Part %s has been downloaded over.' % self.filename)
            return

        self.oneTimeSize = 8 * 1024  # read 8K bytes at a time
        print('task %s will download from %d to %d'
              % (self.name, self.startpoint, self.ranges[1]))
        self.addheader('Range',
                       'bytes=%d-%d' % (self.startpoint, self.ranges[1]))
        self.urlhandle = self.open(self.url)
        try:
            # Append mode keeps whatever a previous run already stored.
            with open(self.filename, 'ab') as part_file:
                data = self.urlhandle.read(self.oneTimeSize)
                while data:
                    part_file.write(data)
                    self.downloaded += len(data)
                    data = self.urlhandle.read(self.oneTimeSize)
        finally:
            # Release the connection even when a read or write fails.
            self.urlhandle.close()
def GetUrlFileSize(url):
    """Return the size in bytes that the server reports for ``url``.

    The value is taken from the ``Content-Length`` response header; ``0`` is
    returned when the response carries no such header.

    Raises:
        ValueError: if the ``Content-Length`` value is not an integer.
    """
    urlHandler = urllib.urlopen(url)
    try:
        headers = urlHandler.info().headers
    finally:
        # Only the headers are needed; do not keep the connection open.
        urlHandler.close()

    length = 0
    for header in headers:
        # Compare the header name exactly (case-insensitively) instead of
        # searching the whole line for 'Length', which would also match
        # unrelated headers and values.
        header_name, _, header_value = header.partition(':')
        if header_name.strip().lower() == 'content-length':
            length = int(header_value.strip())
    return length
def SpliteBlocks(totalsize, blocknumber):
    """Split ``totalsize`` bytes into ``blocknumber`` consecutive byte ranges.

    Returns a list of ``(first_byte, last_byte)`` tuples with inclusive
    offsets. Every range except the last is ``totalsize // blocknumber`` bytes
    long; the last range runs to ``totalsize - 1`` and so absorbs the
    remainder.
    """
    # Floor division keeps the offsets integral on every Python version.
    blocksize = totalsize // blocknumber
    ranges = [(index * blocksize, index * blocksize + blocksize - 1)
              for index in range(blocknumber - 1)]
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges
def isLive(tasks):
    """Return True if at least one thread in ``tasks`` is still running.

    Uses ``is_alive()``, which is available on both Python 2.6+ and Python 3;
    the camelCase ``isAlive()`` alias no longer exists on Python 3.9+.
    """
    return any(task.is_alive() for task in tasks)
def downLoadFile(url, output, blocks=6):
    """Download ``url`` with ``blocks`` parallel threads and save it as ``output``.

    Each thread stores its byte range in ``tmpfile_<n>`` in the current
    directory. Once no thread is alive the part files are concatenated in
    order into the sanitized ``output`` name and then deleted.

    Args:
        url: address of the file to download.
        output: target file name; passed through ``formatFileName`` first.
        blocks: number of download threads / part files.

    Raises:
        IOError: if the server does not report a positive file size, or if a
            part file cannot be read while merging.
    """
    sys.stdout.write('Begin to download from %s\n' % url)
    sys.stdout.flush()

    size = GetUrlFileSize(url)
    if size <= 0:
        # Without a size there is nothing to split into ranges; fail before
        # any thread starts or an empty output file is created.
        raise IOError('Cannot determine the size of %s' % url)

    ranges = SpliteBlocks(size, blocks)
    threadname = ["thread_%d" % i for i in range(0, blocks)]
    filename = ["tmpfile_%d" % i for i in range(0, blocks)]

    tasks = []
    for i in range(0, blocks):
        task = MyWorkThread(threadname[i], url, filename[i], ranges[i])
        # Daemon threads do not keep the interpreter alive on exit.
        task.daemon = True
        task.start()
        tasks.append(task)

    time.sleep(2)
    while isLive(tasks):
        downloaded = sum(task.downloaded for task in tasks)
        process = downloaded / float(size) * 100
        show = u'\rFilesize: %d Downloaded:%d Completed: %.2f%%' % (
            size, downloaded, process)
        sys.stdout.write(show)
        # flush() must actually be called for the progress line to appear.
        sys.stdout.flush()
        time.sleep(1)

    # Concatenate the part files in range order, removing each once copied.
    output = formatFileName(output)
    with open(output, 'wb') as merged:
        for part_name in filename:
            with open(part_name, 'rb') as part:
                merged.write(part.read())
            os.remove(part_name)

    sys.stdout.write("Completed!\n")
    sys.stdout.flush()
def formatFileName(filename):
    """Return ``filename`` with characters that are unsafe in file names blanked.

    Only the last path component is changed: each of ``\\ / : * ? " < > |``
    in it is replaced by a space. Anything that is not a ``str`` yields the
    literal string ``'None'``.
    """
    if not isinstance(filename, str):
        return 'None'
    header, tail = os.path.split(filename)
    if tail != '':
        for char in ('\\', '/', ':', '*', '?', '"', '<', '>', '|'):
            tail = tail.replace(char, ' ')
    return os.path.join(header, tail)


def remove_tags(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag removed."""
    return re.sub(r'<.*?>', '', raw_html)


def saveword(url, name):
    """Save one lesson: append its text to LearningWord.txt and fetch its mp3.

    The page at ``url`` is scanned line by line for the line containing
    ``id="mp3"`` (source of the mp3 address) and the line containing
    ``<div id="content">`` (source of the lesson text). ``name`` and the
    tag-stripped text are appended to ``LearningWord.txt``; when an mp3 line
    was found, the audio is downloaded to ``<sanitized name>.mp3``.

    Args:
        url: address of the lesson page.
        name: title of the lesson; also used to build the mp3 file name.
    """
    res = req.urlopen(url)
    data = res.readlines()
    res.close()

    startag = r'id="mp3"'
    endtag = r'</div>'
    data2 = ''  # line holding the mp3 player markup
    data3 = ''  # line opening the content <div>
    data4 = ''  # following <p> line, with its tags stripped

    # Same window as before: start at line 80 and ignore the last 10 lines.
    for k in range(80, len(data) - 10):
        if data[k].find(startag) != -1:
            data2 = data[k]
        if data[k].find('<div id="content">') != -1:
            data3 = data[k]
            if data[k + 1].find('<p>') != -1:
                data4 = remove_tags(data[k + 1])

    # The address runs from 'http' to the character before ' title="'.
    mp3url = data2[data2.find('http'):data2.find(' title="') - 1]

    # NOTE(review): if the marker phrase is missing, find() returns -1 and the
    # slice starts at the last character of the line.
    if data3.find(endtag) != -1:
        sent = data3[data3.find('今天我们要学'):data3.find(endtag)]
    else:
        # The text continues on the next line; join both pieces.
        sent = (data3[data3.find('今天我们要学'):].strip('\n').strip('\r')
                + data4.strip('\n'))
    sent = remove_tags(sent)

    with open('LearningWord.txt', 'a+') as f:
        f.write(name + '\n' + sent.strip('\r') + '\n')

    if data2.find(startag) != -1:
        downLoadFile(mp3url,
                     str(formatFileName(name.replace(':', ' '))) + '.mp3',
                     blocks=4)


def savepage(url):
    """Save every lesson linked from the listing page at ``url``.

    The markup between ``<ul><li>`` and ``</li></ul>`` is walked link by
    link; for each link the lesson address and title are cut out, printed,
    and handed to ``saveword``.
    """
    res = req.urlopen(url)
    data = res.read()
    res.close()

    startag = '<ul><li>'
    endtag = '</li></ul>'
    data = str(data)
    data2 = data[data.find(startag) + 12:data.find(endtag)]

    linestart = 'href'
    meddle = '" target'
    lineend = '</a>'

    i = data2.find(linestart)
    while i != -1:
        k = data2.find(meddle)
        j = data2.find(lineend)
        # i + 6 skips 'href="'; the address ends where '" target' begins.
        url = 'http://www.51voa.com/' + data2[i + 6:k]
        word = data2[k + 16:j]
        print('%s %s %s %s %s' % (i, k, j, word, url))
        # Drop the link just handled and look for the next one.
        data2 = data2[j + 3:]
        saveword(url, word)
        i = data2.find(linestart)
# Download all words: start with an empty LearningWord.txt, then crawl the
# listing pages numbered 53 through 54.
f = open('LearningWord.txt', 'w')
f.close()
i = 53
while i <= 54:
    url = 'http://www.51voa.com/Learn_A_Word_' + str(i) + '.html'
    savepage(url)
    i = i + 1

# To download one specified word instead:
# url = "http://www.51voa.com/Voa_English_Learning/Learn_A_Word_21951.html"
# name = '9:pop up'
# saveword(url, name)

下载单词文本示例:(全部单词文本下载地址:http://pan.baidu.com/s/1o8pmojS)

2650 endorse
今天我们要学的词是 endorse. Endorse 作为动词,有支持的意思。Senator Ted Cruz endorsed Donald Trump, but later said the decision was “agonizing.” 美国联邦参议员克鲁兹支持川普,但是后来又表示,他做出这一决定十分痛苦。The New York Times endorsed Hillary Clinton for president in a Saturday editorial, and dismissed Donald Trump as “the worst nominee put forward by a major party in modern American history.” 纽约时报在星期六的社论中支持希拉里.克林顿当总统,并批评说,川普是“美国现代史上主要政党推举出的最差劲的候选人”。好的,我们今天学习的词是 endorse, endorse, endorse...
2649 deportation
今天我们要学的词是 deportation. Deportation 名词,驱逐出境,递解出境。The Obama administration said it would fully resume deportations of undocumented Haitian immigrants. 奥巴马政府表示,将全面恢复对无证海地移民的遣返工作。China and Canada have reached a new border agreement that would speed up the deportation of Chinese nationals inadmissible in Canada. 中国和加拿大达成新的边境协议,加快遣返那些本不该被允许进入加拿大的中国公民。好的,我们今天学习的词是 deportation, deportation, deportation...
2648 voluntarily
今天我们要学的词是 voluntarily. Voluntarily 副词,自愿地。The International Organization for Migrants says that more people are voluntarily returning to their home countries. 国际移民组织说,越来越多的人开始自愿返回自己的祖国。A high-tech diagnostic company voluntarily withdrew its Zika virus blood test from FDA approval. 一家高科技诊断公司自愿撤回递交美国食品药物管理局的寨卡病毒血液检测批准申请。好的,我们今天学习的词是 voluntarily, voluntarily, voluntarily...
2647 guerrilla
今天我们要学的词是 guerrilla. Guerrilla 形容词,游击队的。The Columbian government signed a peace agreement on Monday with the Revolutionary Armed Forces of Columbia (FARC), a national guerrilla movement. 哥伦比亚政府星期一跟全国游击队运动“哥伦比亚革命武装力量”签署了和平协议。The agreement needs to be approved by an Oct. 2 referendum before roughly 7,000 guerrilla fighters start their transition to civilian life. 这项协议还需经过10月2号全民公决批准,大约七千名游击队员才会开始向平民生活过渡。好的,我们今天学习的词是 guerrilla, guerrilla, guerrilla...
2646 curfew
今天我们要学的词是 curfew. Curfew 名词,宵禁。The city of Charlotte in North Carolina has lifted its midnight curfew, but the state of emergency remains in effect. 北卡罗来纳州夏洛特市取消了午夜宵禁,但是紧急状态依旧生效。Authorities in an Austrian city imposed a curfew on young immigrants following a series of sexual attacks at a local beer and wine festival. 奥地利一个城市的有关当局对未成年移民实施宵禁,此前当地一个啤酒葡萄酒节期间发生了一系列性侵事件。 好的,我们今天学习的词是 curfew, curfew, curfew...
2645 estimate
今天我们要学的词是 estimate. Estimate 动词,估计。A recent study estimates that the Indonesian forest fires that created a smoky haze last year may have caused more than 100,000 premature deaths. 一项最新研究估计,去年印尼山火引发的雾霾可能造成了10万人过早死亡。A new survey estimates that Americans own 265 million guns, but half of these guns are in the hands of only 3% of Americans. 最新调查估计,美国人拥有枪支总数2.65亿支,但其中半数都集中在3%的人手中。好的,我们今天学习的词是 estimate, estimate, estimate...
2644 mercy killing
今天我们要学的词是 mercy killing. Mercy killing 名词,安乐死。A terminally ill 17-year-old has become the first minor to be euthanized in Belgium since the age restrictions on such mercy killings were lifted in 2014. 比利时一个17岁绝症男孩安乐死,他是比利时2014年取消对安乐死年龄限制以来第一个安乐死的未成年人。The United Arab Emirates passed a new law banning all mercy killings. 阿联酋通过新法律,禁止安乐死。好的,我们今天学习的词是 mercy killing, mercy killing, mercy killing...

  

转载于:https://www.cnblogs.com/huadongw/p/5947863.html

Python 爬取所有51VOA网站的Learn a words文本及mp3音频相关推荐

  1. 教你用Python爬取表情包网站下的全部表情图片

    教你用Python爬取表情包网站下的全部表情图片 又是我啦~~~ 最近上网的时候老看到有人用Python爬取表情包,心痒痒自己也整了一个. 使用到的扩展库:BeautifulSoup, request ...

  2. python 爬取猫眼电影网站数据

    完整代码下载:https://github.com/tanjunchen/SpiderProject/tree/master/maoyan python 爬取 movie.douban.com 网站 ...

  3. python爬取安居客网站上北京二手房数据

    目标:爬取安居客网站上前10页北京二手房的数据,包括二手房源的名称.价格.几室几厅.大小.建造年份.联系人.地址.标签等. 网址为:https://beijing.anjuke.com/sale/ B ...

  4. 手把手教你用python爬取人人贷网站借款人信息

    P2P是近年来很热的一个行业,由于这个行业在国内兴起才不久,国内的很多学者对这个行业都兴趣盎然,在大学研究互联网金融的学者更是有一大群.小编是学金融出身,深知数据在做学术研究的重要性,之前有不少学互联 ...

  5. [Python]爬取 游民星空网站 每周精选壁纸(1080高清壁纸) 网络爬虫

    一.检查 首先进入该网站的https://www.gamersky.com/robots.txt页面 给出提示: 弹出错误页面 注: 网络爬虫:自动或人工识别robots.txt,再进行内容爬取 约束 ...

  6. Python爬取某音乐网站

    本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理. 爬取某音乐网站,我们先搜索歌曲,然后随意点进一首歌,然后在新弹出的歌曲页面按F12开始抓包,并刷新 ...

  7. 利用python爬取实习僧网站上的数据

    最近在找实习,就顺便想到用python爬取一些职位信息看看,有哪些岗位比较缺人. #_*_coding:utf-8_*_import requests from bs4 import Beautifu ...

  8. Python爬取斗鱼直播网站信息

    一.需求 爬取斗鱼直播网站信息,如直播名字.主播名字.热度.图片和房间网址,将这些数据保存到csv文件中,并单独创建文件夹保存图片. 斗鱼直播网址:https://www.douyu.com/g_LO ...

  9. python爬取10个网站_十个Python爬虫武器库示例,十个爬虫框架,十种实现爬虫的方法!...

    一般比价小型的爬虫需求,我是直接使用requests库 + bs4就解决了,再麻烦点就使用selenium解决js的异步 加载问题.相对比较大型的需求才使用框架,主要是便于管理以及扩展等. 1.Scr ...

  10. python分支机构_基于Python爬取天眼查网站的企业信息!Python无所不能!

    注:这是一个在未登录的情况下,根据企业名称搜索,爬取企业页面数据的采集程序,是一个比较简单的爬虫,基本上只用到了代理,没有用到其他的反反爬技术,不过由于爬取的数据比较多, 适合刷解析技能的熟练度 . ...

最新文章

  1. 简单验证码识别 tessnet2
  2. 一次搞定:分布式缓存 Redis 集群搭建!
  3. mysql linux 客户端_MySQL—Linux查看客户端连接信息(连接数、进程等)
  4. python顺序结构实验_Python程序设计实验报告二:顺序结构程序设计
  5. 如何解决数据倾斜问题?
  6. 各类木材强度_凯狄解析各类抽芯铆钉的工作原理
  7. php 字符串含有下划线,PHP-我的会话ID中有下划线
  8. Spark集群试运行
  9. matlab管道泄漏定位,在Matlab中分析基于小波变换的管道泄漏定位方法
  10. JConsole详解
  11. 又读《孙子兵法》,谈领导力
  12. beyond compare 3 过期解决办法
  13. 儿童摄影html代码源,HTML5织梦dede儿童摄影/影楼/写真/摄影工作室网站模板
  14. find命令的基础用法以及按文件修改时间查找文件
  15. mysql用户信息表储存引擎,Mysql表引擎
  16. Python 入门 —— Python 面向对象:类的创建及其基本内置方法的使用
  17. SQL Server(六)-Transact-SQL语言
  18. python电子章_python二级电子教案 第2章 Python语言基本语法元素
  19. Latex引用参考文献-BibTex的使用
  20. IPad上windows远程桌面软件推荐

热门文章

  1. python解析_第107天: Python 解析 PDF
  2. 走进龙芯3A3000(二)安装Gentoo N64
  3. 原生PHP调用科大讯飞语音合成(流式版)WebAPI
  4. vba 涉及合并取消合并单元格
  5. html m3u8 flash,m3u8插件-支持在pc端(flashplayer)播放m3u8格式文件-ckplayer使用方法...
  6. KDiff3 merge conflict 窗口操作说明
  7. Servlet过滤器
  8. 【STM32f401学习之路-02】USART串口通信
  9. 将扫描文件放至ftp服务器,复印机扫描至FTP详细设置步骤!
  10. Qt5+vs2017 UI界面添加新控件后,提示没有类成员