这个单子没什么新的技术，还是和那四个化学一样。不知道他们最后的分数怎么样，希望高一点吧，不然我也不好意思。这里面附带一个爬虫，就直接拿来用了。

#!/usr/bin/env python
# coding: utf-8
# In[1]:
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import time
import re
import csv
from bs4 import BeautifulSoup#设置URL固定部分
# Fixed part of the list-page URL; the year is appended for each request.
url = 'http://www.cbooo.cn/year?year='
# Request headers: send a desktop-browser User-Agent.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

FIRST_YEAR = 2009
LAST_YEAR = 2018
# Each movie contributes 8 consecutive fields to `mname`:
# index, name, type, zpf, mantimes, price, area, datatime.
FIELDS_PER_MOVIE = 8
# The year of a row is derived from its position, 25 movies per yearly page.
MOVIES_PER_YEAR = 25

# Fetch the list page of every year and concatenate the raw bytes so the
# pages can be parsed in one pass.
pages = []
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    r = requests.get(url=url + str(year), headers=headers, timeout=10)
    # A failed page would shift every later row into the wrong year, because
    # years are assigned by position below; stop instead of writing bad data.
    r.raise_for_status()
    pages.append(r.content)
    # Pause 0.5 s between requests.
    time.sleep(0.5)
html = b''.join(pages)

lj = BeautifulSoup(html, 'html.parser')
# Every table cell of every page, in document order.
result = lj.find_all('td')

title_pattern = re.compile(r'</span>(.*?)</p>', re.I | re.M)
cell_pattern = re.compile(r'<td>(.*?)</td>', re.I | re.M)

# Flat list of fields: a cell that carries a title starts a new movie and
# contributes (running index, title); every other cell contributes its text.
mname = []
index = 1
for cell in result:
    cell = str(cell)
    title = title_pattern.findall(cell)
    if title:
        mname.append(index)
        index = index + 1
        mname.append(title[0])
    else:
        info = cell_pattern.findall(cell)
        # A cell the pattern does not match still occupies one column; store
        # an empty value rather than raising IndexError, so the 8-field
        # alignment of the row is kept.
        mname.append(info[0] if info else '')

# Regroup the flat list into rows of 8 fields, then append the year and a
# constant mark of 1. Only complete rows are used.
data = []
complete = len(mname) - len(mname) % FIELDS_PER_MOVIE
for k in range(0, complete, FIELDS_PER_MOVIE):
    year = FIRST_YEAR + k // (FIELDS_PER_MOVIE * MOVIES_PER_YEAR)
    data.append(mname[k:k + FIELDS_PER_MOVIE] + [year, 1])
print(len(data))  # 250 rows expected (10 years x 25 movies)

# Save the result to a CSV file. newline='' lets the csv module control line
# endings itself, as the csv documentation recommends.
with open('data.csv', 'w', newline='') as fout:
    cin = csv.writer(fout, lineterminator='\n')
    # Header row.
    cin.writerow(["index","name","type","zpf","mantimes","price","area","datatime","year","mark"])
    cin.writerows(data)

import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong'] # Set the default font
mpl.rcParams['axes.unicode_minus'] = False # Work around the minus sign '-' being rendered as a box in saved figures

# Load the movie table from data.csv, decoding the file as GBK.
test=pd.read_csv('data.csv',encoding='gbk')
# Preview the first five rows.
test.head()

	id	影片名	类型	总票房	平均票价	场均人次	国家及地区	上映日期	年
0	1	2012世界末日	灾难	44745	32	68	美国	2009/11/13	2009
1	2	变形金刚2	科幻/动作	40364	32	53	美国	2009/6/24	2009
2	3	建国大业	剧情	39288	32	54	中国/中国香港	2009/9/16	2009
3	4	赤壁(下)	动作	24353	34	49	中国/中国香港	2009/1/7	2009
4	5	三枪拍案惊奇	喜剧	22011	33	49	中国	2009/12/10	2009

检查数据合理性，对数据进行清洗

缺失值查看

# Count the missing values in each column.
test.isnull().sum()

id       0
影片名      0
类型       1
总票房      0
平均票价     0
场均人次     0
国家及地区    1
上映日期     2
年        0
dtype: int64

# Remove every row that lacks a genre, a country/region or a release date.
# Selecting rows by null-ness instead of by the fixed labels 36 and 65 keeps
# the cell correct if the CSV changes, and makes it safe to re-run
# (drop([36]) raises KeyError once the label is gone).
test.dropna(subset=['类型','国家及地区','上映日期'],inplace=True)

# Re-count the missing values per column to confirm none remain.
test.isnull().sum()

id       0
影片名      0
类型       0
总票房      0
平均票价     0
场均人次     0
国家及地区    0
上映日期     0
年        0
dtype: int64

# Same per-column missing-value count, run once more.
test.isnull().sum()

id       0
影片名      0
类型       0
总票房      0
平均票价     0
场均人次     0
国家及地区    0
上映日期     0
年        0
dtype: int64

# Print the index range, per-column non-null counts and dtypes, and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248 entries, 0 to 249
Data columns (total 9 columns):
id       248 non-null int64
影片名      248 non-null object
类型       248 non-null object
总票房      248 non-null int64
平均票价     248 non-null int64
场均人次     248 non-null int64
国家及地区    248 non-null object
上映日期     248 non-null object
年        248 non-null int64
dtypes: int64(5), object(4)
memory usage: 19.4+ KB

# Draw one histogram per numeric column on a 20x10 figure.
test.hist(figsize=(20,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021685CAD710>,<matplotlib.axes._subplots.AxesSubplot object at 0x0000021685F6C898>],[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021685F94F28>,<matplotlib.axes._subplots.AxesSubplot object at 0x0000021685FC75C0>],[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021685FEDC50>,<matplotlib.axes._subplots.AxesSubplot object at 0x0000021685FEDC88>]],dtype=object)

![output_10_1](output_10_1.png)

对所得数据按日期排序

# Order the rows chronologically. The release date is stored as text such as
# '2009/6/24', so sorting the raw column would be lexicographic
# ('2009/11/13' before '2009/6/24'); sort on the parsed dates instead.
release_dates = pd.to_datetime(test['上映日期'])
test = test.iloc[release_dates.values.argsort(kind='mergesort')]

# Yearly totals of the numeric columns. numeric_only=True keeps the text
# columns (title, genre, region, date) out of the sum, which current pandas
# would otherwise try to add up as strings.
test_num = test.groupby(by=['年']).sum(numeric_only=True)
test_num

	id	总票房	平均票价	场均人次
年
2009	325	395890	797	1058
2010	913	648652	856	961
2011	1509	710355	856	824
2012	2200	1011515	931	803
2013	2825	1174380	939	727
2014	3450	1633415	913	749
2015	4075	2495002	900	799
2016	4700	2513007	861	655
2017	5325	3287129	882	558
2018	5950	3916309	894	544

# Line plot of the yearly box-office total (x axis: year).
test_num['总票房'].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x21685cad940>

![output_14_1](output_14_1.png)

# Line plot of the yearly sum of the per-film average ticket price
# (test_num holds sums, not means).
test_num['平均票价'].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x21686107cc0>

![output_15_1](output_15_1.png)

# Target: yearly box-office total. Features: the other aggregated columns
# once the target and the summed id column are removed.
y = test_num['总票房']
X = test_num.drop(['总票房','id'],axis=1)
# The "positive"/"negative" figures count rows where y == 1 and y == 0; with
# a continuous target such as box-office totals they carry no information.
print('data shape: {0}; no. positive: {1}; no. negative: {2}'.format(X.shape, y[y==1].shape[0], y[y==0].shape[0]))
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed seed makes the split, and
# therefore the reported scores, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

data shape: (10, 2); no. positive: 0; no. negative: 0

from sklearn import linear_model

# Fit an ordinary least-squares model on the training split.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the training split first, then on the held-out split.
train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)
report = 'train score: {train_score:.6f}; test score: {test_score:.6f}'
print(report.format(train_score=train_score, test_score=test_score))

train score: 0.785987; test score: 0.901816

20191126_1_电影票房分析相关推荐

基于机器学习的电影票房分析与预测系统
温馨提示:文末有 CSDN 平台官方提供的博主 Wechat / QQ 名片 :) 1. 项目简介票房作为衡量电影能否盈利的重要指标受诸多因素共同作用影响且其影响机制较为复杂,电影票房的准确预测是比 ...
用Python分析了十年电影票房，原来我错过了这么多好电影！
"玩电影票房数据,我教你啊" 3月8日妇女节,我很期待的超级英雄电影<惊奇队长>上映了,票房表现很快过亿,但大众口碑却让人失望. 一个有趣且常见的现象是,隔壁获奖无数, ...
十年电影票房数据爬取与分析 | 免费数据教程
3月8日妇女节,我很期待的超级英雄电影<惊奇队长>上映了,票房表现很快过亿,但大众口碑却让人失望. 一个有趣且常见的现象是,隔壁获奖无数,口碑爆炸的<绿皮书>,票房却远远不如& ...
【大数据分析专业毕设之基于python爬虫的电影票房大数据预测分析+大屏可视化分析
[大数据分析专业毕设之基于python爬虫的电影票房大数据预测分析+大屏可视化分析-哔哩哔哩https://b23.tv/saIKtBH flask web框架,数据使用requests模块爬取数据, ...
python爬虫实战三：近十年中国电影票房数据爬取与分析
近十年中国电影票房数据爬取与分析前言爬取分析十年top10 年度top5 每年电影数每年总票房二八原则代码与数据前言这篇文章主要讲述的是近十年(2010-2019)中国电影票房数据的 ...
项目三：近10年来中国电影票房数据爬取分析
近10年来中国电影票房数据爬取分析前言数据采集与存储数据清洗和简单分析引入库,导入数据近10年top 年度top5 每年电影数每年总票房结论二八原则 end 点击跳转到总目录前言这 ...
未明学院学员报告:「看电影攻略」之豆瓣电影票房与口碑分析，这几类电影不易踩雷！
要说有什么经济实惠.老少咸宜的娱乐休闲方式,当然非看电影莫属啦~~ 心血来潮,说看就看的你,如何盲选电影避免踩雷?今天就奉上未明学院L同学的数据报告之看电影攻略,教你最小化踩雷风险. 前言以下分析报 ...
2022-11-28-大数据可视化“可视化国产/进口电影票房榜单”分析，特征维度大于50
可视化国产/进口电影票房榜单前言数据分析数据可视化过程分析总结前言党的十八大以来,国产电影产业与事业快速发展,创作水平不断提高,题材类型丰富多元,受众口碑不断提升,在市场竞争中表现愈发突出 ...
python电视剧口碑分析_用Python分析IMDB历届电影票房数据（上）
一.我们需要什么结论? 我们首先要做的就是定义我们想要的结论.那么根据这份IMDB网站上提取的100年间,65个国家,4937部电影数据,我们想要分析什么呢? 根据我的框架我想知道的结论有以下几点:电 ...
L:python的Pandas模块:实例练习(泰坦尼克号数据集分析,电影票房统计,股票基本面统计)
实例练习泰坦尼克号数据集分析使用Seaborn库中包含的titanic数据集进行一些数据统计. Seaborn是一个图形库,Anaconda已包含此库.数据集参见: https://github. ...

20191126_1_电影票房分析

检查数据合理性，对数据进行清洗

缺失值查看

对所得数据按日期排序

20191126_1_电影票房分析相关推荐

最新文章

热门文章