python爬取大量数据报错_【Python】Python爬取FAERS数据报错

2024-04-26 00:49:07

问题描述

使用luigi框架爬取faers数据报错，IDE为pycharm

错误信息为

No task specified

Process finished with exit code 1

2.源代码

import os

import re

import shutil

import requests

from io import BytesIO

from zipfile import ZipFile

from urllib.request import urlretrieve

from urllib.request import urlopen

from bs4 import BeautifulSoup

import pandas as pd

import numpy as np

import warnings

import luigi

import sys

import logging

def extractZip(url, source_dir, data_dir):

logging.debug('In the Task : extractZip')

r = requests.get(url)

z = ZipFile(BytesIO(r.content))

z.extractall(source_dir)

deletePDF(source_dir)

copyFile(source_dir, data_dir)

def deletePDF(path):

logging.debug('In the Task : deletePDF')

for parent, dirnames, filenames in os.walk(source_dir):

for fn in filenames:

if fn.lower().endswith('.pdf'):

print("Deleteting" + fn)

os.remove(os.path.join(parent, fn))

if fn.lower().endswith('.doc'):

print("Deleteting" + fn)

os.remove(os.path.join(parent, fn))

if fn.startswith("RPSR"):

print("Deleteting" + fn)

os.remove(os.path.join(parent, fn))

if fn.startswith("INDI"):

print("Deleteting" + fn)

os.remove(os.path.join(parent, fn))

if fn.startswith("THER"):

print("Deleteting" + fn)

os.remove(os.path.join(parent, fn))

def copyFile(source_dir, data_dir):

logging.debug('In the Task : copyFiles')

RootDir1 = os.getcwd() + '/' + source_dir

TargetFolder = os.getcwd() + '/' + data_dir

for root, dirs, files in os.walk((os.path.normpath(RootDir1)), topdown=False):

for name in files:

if name.endswith('.txt'):

SourceFolder = os.path.join(root, name)

shutil.move(SourceFolder, TargetFolder)

class get_files_url(luigi.Task):

logging.debug('In the Task : getWebUrls')

def requires(self):

return []

def run(self):

source_dir = "FAERSsrc"

data_dir = "FAERSdata"

files = {}

url = {}

host_url = "http://www.fda.gov"

target_page = [

"http://www.fda.gov/Drugs/GuidanceComplianceRegulatoryInformation/Surveillance/AdverseDrugEffects/ucm082193.htm"]

for page_url in target_page:

try:

page_bs = BeautifulSoup(urlopen(page_url), "lxml")

except:

page_bs = BeautifulSoup(urlopen(page_url))

for url in page_bs.find_all("a"):

a_string = str(url.string)

if "ASCII" in a_string.upper():

files[a_string.encode("utf-8")] = host_url + url["href"]

url = host_url + url["href"]

extractZip(url, source_dir, data_dir)

for url in page_bs.find_all("linktitle"):

a_string = str(url.string)

if "ASCII" in a_string.upper():

files[a_string.encode("utf-8")] = host_url + url.parent["href"]

url = host_url + url.parent["href"]

extractZip(url, source_dir, data_dir)

with self.output().open('w') as f:

f.write("hello")

def output(self):

return luigi.LocalTarget('url.txt')

class mergeData(luigi.Task):

def requires(self):

return [get_files_url()]

def run(self):

directoryPath = os.getcwd() + "/FAERSdata"

demo = pd.DataFrame(

columns=['primaryid', 'caseid', 'mfr_dt', 'init_fda_dt', 'rept_cod', 'mfr_num', 'mfr_sndr', 'age',

'sex', 'wt', 'wt_cod', 'occp_cod', 'occr_country'])

drug = pd.DataFrame(columns=['primaryid', 'caseid', 'role_cod', 'drugname', 'route', 'dose_amt', 'dose_unit',

'dose_form', 'dose_freq'])

reaction = pd.DataFrame(columns=['primaryid', 'caseid', 'pt'])

outcome = pd.DataFrame(columns=['primaryid', 'caseid', 'outc_cod'])

print("in run")

for filename in os.listdir(directoryPath):

if "DEMO" in filename:

demo_df = pd.read_csv(directoryPath + "/" + filename, low_memory=False, sep="$", error_bad_lines=False)

demo_df.drop(

['caseversion', 'i_f_code', 'lit_ref', 'event_dt', 'auth_num', 'fda_dt', 'age_cod', 'age_grp',

'e_sub', 'rept_dt', 'to_mfr', 'reporter_country'], inplace=True, axis=1, errors='ignore')

demo_df = demo_df.loc[(demo_df['wt_cod'] == 'KG')]

demo_df = demo_df[pd.notnull(demo_df['age'])]

demo_df = demo_df[1:]

demo = demo.append(demo_df, ignore_index=True)

if "DRUG" in filename:

durg_df = pd.read_csv(directoryPath + "/" + filename, low_memory=False, sep="$", error_bad_lines=False)

durg_df.drop(['drug_seq', 'val_vbm', 'dose_vbm', 'cum_dose_chr', 'prod_ai', 'cum_dose_unit', 'dechal',

'rechal', 'lot_num', 'exp_dt', 'nda_num'], inplace=True, axis=1, errors='ignore')

durg_df = durg_df[pd.notnull(durg_df['dose_amt'])]

durg_df = durg_df[pd.notnull(durg_df['dose_unit'])]

durg_df = durg_df.loc[(durg_df['role_cod'] == 'PS')]

durg_df = durg_df[1:]

drug = drug.append(durg_df, ignore_index=True)

if "REAC" in filename:

reac_df = pd.read_csv(directoryPath + "/" + filename, low_memory=False, sep="$", error_bad_lines=False)

reac_df = reac_df.groupby('primaryid')

reac_df = reac_df.filter(lambda x: len(x) == 1)

reac_df = reac_df[1:]

reaction = reaction.append(reac_df, ignore_index=True)

if "OUTC" in filename:

out_df = pd.read_csv(directoryPath + "/" + filename, low_memory=False, sep="$", error_bad_lines=False)

out_df = out_df.groupby('primaryid')

out_df = out_df.filter(lambda x: len(x) == 1)

out_df = out_df[1:]

outcome = outcome.append(out_df, ignore_index=True)

demo['sex'] = np.where(pd.isnull(demo['sex']), demo['gndr_cod'], demo['sex'])

demo.drop(['gndr_cod'], inplace=True, axis=1, errors='ignore')

demo_durg_df = pd.merge(drug, demo, on=('primaryid', 'caseid'), how='left')

demodurgreact_df = pd.merge(demo_durg_df, reaction, on=('primaryid', 'caseid'), how='inner')

demodrugreactout_df = pd.merge(demodurgreact_df, outcome, on=('primaryid', 'caseid'), how='inner')

demodrugreactout_df.drop(['drug_rec_act'], inplace=True, axis=1, errors='ignore')

demodrugreactout_df['occp_cod'] = demodrugreactout_df['occp_cod'].fillna('OT')

demodrugreactout_df['rept_cod'] = demodrugreactout_df['rept_cod'].fillna('EXP')

demodrugreactout_df['mfr_sndr'] = demodrugreactout_df['mfr_sndr'].fillna('Others')

demodrugreactout_df['route'] = demodrugreactout_df['route'].fillna('Unknown')

demodrugreactout_df['dose_form'] = demodrugreactout_df['dose_form'].fillna('Others')

demodrugreactout_df['dose_freq'] = demodrugreactout_df['dose_freq'].fillna('Others')

demodrugreactout_df.to_csv(self.output().path, header=True, index=False);

def output(self):

return luigi.LocalTarget('MergedFile.csv')

if __name__ == '__main__':

source_dir = "FAERSsrc"

data_dir = "FAERSdata"

if not os.path.isdir(source_dir):

os.makedirs(source_dir)

if not os.path.isdir(data_dir):

os.makedirs(data_dir)

luigi.run()

回答

python爬取大量数据报错_【Python】Python爬取FAERS数据报错相关推荐

Python爬取网站用户手机号_用Python爬虫爬取学校网妹子QQ号，100行代码撩妹，用技术脱单...
前言: 其实这个项目没什么难度,稍微懂一点爬虫的人或者是已经就业的程序员都可以用自己学的编程语言写出来,但是正是这也原因,也间接证明现在网络很多安全问题的存在,简单的说就是这个网站的程序员偷懒,让用户 ...
python 爬取链家数据_用python爬取链家网的二手房信息
题外话:这几天用python做题,算是有头有尾地完成了.这两天会抽空把我的思路和方法,还有代码贴出来,供python的初学者参考.我python的实战经历不多,所以代码也是简单易懂的那种.当然过程中还 ...
python如何爬取网站所有目录_用python爬虫爬取网站的章节目录及其网址
认识爬虫网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本.另外一些不常使用的名字还有蚂蚁.自动索引.模拟 ...
用python爬取qq空间内容_用python爬取QQ空间
原博文 2016-11-18 17:19 − 好久没写博文了,最近捣鼓了一下python,好像有点上瘾了,感觉python比js厉害好多,但是接触不久,只看了<[大家网]Python基础教程(第 ...
python爬取国家男女比例_用python爬取3万多条评论，看韩国人如何评价韩国电影《寄生虫》？...
用python爬取3万多条评论,看韩国人如何评价韩国电影<寄生虫>? 朱小五凹凸数据大家好,我是朱小五今天给大家带来一个关于电影的数据分析文章. 别走啊,这次不是豆瓣,也不是猫眼真 ...
python 循环定时器 timer显示数据_【Python】多线程、定时循环爬取优信二手车信息...
爬虫爬取优信二手车:循环遍历每页,获取相应的有价值字段信息,这里不详细阐释了. 多线程 Python中,使用concurrent.futures模块下的ThreadPoolExecutor类来实现线 ...
python爬取微博用户正文_基于Python的新浪微博用户信息爬取与分析
基于 Python 的新浪微博用户信息爬取与分析邓文萍 [摘要] 摘要:本文设计并实现了一个微博用户信息爬取与分析系统 , 利用 Cookie 实现了用户的模拟登录 , 使用 Python 语言的 ...
python爬取数据案例分析_基于Python及webdriver的网页抓取案例
上次有朋友问怎么抓取交易所网站的数据,特别是历史数据,这里特别推荐使用selenium这一自动化测试框架. 原本selenium是用来完成大量基于浏览器的自动化测试的,但由于可以方便地执行JS代码,摸 ...
python爬取历史天气查询_历史天气爬取
历史天气爬取爬取来源:2345天气网爬取周期:月度,可自行更改爬取城市:部分城市,可自行更改具体代码: import requests import demjson import csv li ...

最新文章

热门文章