This one was done for a buddy overseas, and it got awkward: the answer key only came out after I'd finished, the submission had to match it, so everything got reworked at the last minute, and on top of that the mouse and all sorts of other things acted up. Rough.

Perform polynomial regression to predict wage using age. Use cross-validation to select the
optimal degree d for the polynomial. What degree was chosen, and how does this compare to the
results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the
data.

%matplotlib inline
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

mpl.rcParams['font.sans-serif'] = [u'simHei']  # allow CJK glyphs in plot labels
mpl.rcParams['axes.unicode_minus'] = False
test=pd.read_csv('Wage.csv')
test.head()
   Unnamed: 0  year  age      sex            maritl      race        education              region        jobclass          health health_ins   logwage        wage
0      231655  2006   18  1. Male  1. Never Married  1. White     1. < HS Grad  2. Middle Atlantic   1. Industrial       1. <=Good      2. No  4.318063   75.043154
1       86582  2004   24  1. Male  1. Never Married  1. White  4. College Grad  2. Middle Atlantic  2. Information  2. >=Very Good      2. No  4.255273   70.476020
2      161300  2003   45  1. Male        2. Married  1. White  3. Some College  2. Middle Atlantic   1. Industrial       1. <=Good     1. Yes  4.875061  130.982177
3      155159  2003   43  1. Male        2. Married  3. Asian  4. College Grad  2. Middle Atlantic  2. Information  2. >=Very Good     1. Yes  5.041393  154.685293
4       11443  2005   50  1. Male       4. Divorced  1. White       2. HS Grad  2. Middle Atlantic  2. Information       1. <=Good     1. Yes  4.318063   75.043154
test.isnull().sum()
Unnamed: 0    0
year          0
age           0
sex           0
maritl        0
race          0
education     0
region        0
jobclass      0
health        0
health_ins    0
logwage       0
wage          0
dtype: int64
X=test['age'].values.reshape(-1,1)
Y=test['wage']
# Candidate pipelines: ridge and lasso on standardized polynomial features,
# each tuning its regularization strength over a log-spaced grid.
models = [
    Pipeline([('ss', StandardScaler()),
              ('poly', PolynomialFeatures()),
              ('linear', RidgeCV(alphas=np.logspace(-3, 1, 20)))]),
    Pipeline([('ss', StandardScaler()),
              ('poly', PolynomialFeatures()),
              ('linear', LassoCV(alphas=np.logspace(-3, 1, 20)))]),
]
parameters = {
    "poly__degree": [3, 2, 1],
    "poly__interaction_only": [True, False],
    "poly__include_bias": [True, False],
    "linear__fit_intercept": [True, False],
}
# Quick sanity check of interaction_only on a toy frame.
rf = PolynomialFeatures(2, interaction_only=True)
a = pd.DataFrame({'name': [1, 2, 3, 4, 5], 'score': [2, 3, 4, 4, 5]})
b = rf.fit_transform(a)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
for t in range(2):
    model = GridSearchCV(models[t], param_grid=parameters, cv=5, n_jobs=1)
    model.fit(x_train, y_train)
    print(model.best_params_)
    print("R=%.3f" % model.best_score_)
    y_predict = model.predict(x_test)
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': False, 'poly__interaction_only': False}
R=0.072
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': True, 'poly__interaction_only': False}
R=0.072
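Both searches settle on degree 3 (the low R² is expected; age alone explains little of the variance in wage). As a cross-check, a minimal sketch — reusing X and Y from above — that scores each candidate degree directly by 5-fold cross-validation and keeps the best mean R²:

from sklearn.model_selection import cross_val_score

# Plain-OLS pipeline per degree; pick the degree with the best mean CV R^2.
cv_scores = []
for d in range(1, 6):
    pipe = Pipeline([('ss', StandardScaler()),
                     ('poly', PolynomialFeatures(degree=d)),
                     ('linear', LinearRegression())])
    cv_scores.append(cross_val_score(pipe, X, Y, cv=5).mean())
print('best degree:', int(np.argmax(cv_scores)) + 1)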

models = [
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', LinearRegression(fit_intercept=False))]),
]
model = models[0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
t = np.arange(len(X_test))
N = 6
d_pool = np.arange(1, N, 1)  # degrees 1..5
m = d_pool.size
clrs = []
for c in np.linspace(16711680, 255, m):
    clrs.append('#%06x' % int(c))
line_width = 3
a = []
plt.figure(figsize=(12, 6), facecolor='w')
for i, d in enumerate(d_pool):
    model.set_params(Poly__degree=d)
    model.fit(X_train, Y_train)
    lin = model.get_params('Linear')['Linear']
    output = u'degree %d, coefficients: ' % d
    if hasattr(lin, 'alpha_'):
        idx = output.find(u'coefficients')
        output = output[:idx] + (u'alpha=%.6f, ' % lin.alpha_) + output[idx:]
    if hasattr(lin, 'l1_ratio_'):
        idx = output.find(u'coefficients')
        output = output[:idx] + (u'l1_ratio=%.6f, ' % lin.l1_ratio_) + output[idx:]
    y_hat = model.predict(X_test)
    s = model.score(X_test, Y_test)
    mse_predict = mean_squared_error(Y_test, y_hat)  # was y_test; use this split's Y_test
    a.append(mse_predict)
plt.plot(d_pool, a)  # test-set MSE by polynomial degree

# Refit the selected degree-3 polynomial on the full data to inspect its coefficients.
X = test['age'].values.reshape(-1, 1)
y=test['wage']
ss = StandardScaler()
X_train = ss.fit_transform(X)
poly = PolynomialFeatures(degree=3,interaction_only=False)
train1 = poly.fit_transform(X_train)
linear = LinearRegression()
linear.fit(train1,y)
print(linear.coef_)
print(linear.intercept_)
[ 0.          6.00961425 -7.98309085  1.30560185]
119.49381727555215
x_train=test['age'].values.reshape(-1,1)
y_train=test['wage']
regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(10, 80, 100)
xx = xx.reshape(xx.shape[0],1)
yy = regressor.predict(xx)
plt.scatter(x_train, y_train)
plt1, = plt.plot(xx, yy,'r',label="degree=1")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt1])
plt.show()


poly2 = PolynomialFeatures(degree=3)
x_train_poly2 = poly2.fit_transform(x_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(x_train_poly2, y_train)
xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)
plt.scatter(x_train, y_train)
plt2, = plt.plot(xx, yy_poly2, 'y', label="degree=3")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt2])
plt.show()

test.head()  # same first five rows as shown above
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
warnings.filterwarnings("ignore")
import itertools
# One-way ANOVA; note this formula treats wage as a categorical factor and tests age across its levels.
anova_reA = anova_lm(ols('age~C(wage)', data=test[['age', 'wage']]).fit())
print(anova_reA)
              df         sum_sq     mean_sq         F        PR(>F)
C(wage)    507.0   87692.862849  172.964227  1.382137  5.346952e-07
Residual  2492.0  311855.291817  125.142573       NaN           NaN
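This one-way ANOVA is not quite the comparison the exercise intends: the ISLR approach is a nested-model F-test, fitting polynomials in age of increasing degree and checking at which degree the added term stops being significant. A minimal sketch, reusing ols and anova_lm on the same test frame:

# Nested-model ANOVA: wage ~ poly(age, d) for d = 1..5; each row of the
# resulting table tests whether the extra degree significantly improves the fit.
fits = [ols('wage ~ age', data=test).fit(),
        ols('wage ~ age + I(age**2)', data=test).fit(),
        ols('wage ~ age + I(age**2) + I(age**3)', data=test).fit(),
        ols('wage ~ age + I(age**2) + I(age**3) + I(age**4)', data=test).fit(),
        ols('wage ~ age + I(age**2) + I(age**3) + I(age**4) + I(age**5)', data=test).fit()]
print(anova_lm(*fits))

Typically the cubic term is significant and the quartic is borderline, which agrees with the degree-3 choice from cross-validation.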

Fit a step function to predict wage using age, and perform cross-validation to choose the
optimal number of cuts. Make a plot of the fit obtained

test['age'].head()
0    18
1    24
2    45
3    43
4    50
Name: age, dtype: int64
# rough cut points: 18, 26, 34, 42, 50
def function1(a):  # two bins
    if a > 32:
        return 1
    else:
        return 2
def function2(a):  # four bins
    if a > 42:
        return 1
    elif 34 < a < 42:
        return 2
    elif 26 < a < 34:
        return 3
    else:
        return 4
def function3(a):  # three bins
    if a > 37:
        return 1
    elif 28 < a < 37:
        return 2
    else:
        return 3
# Column names 二/三/四 mean "two"/"three"/"four", i.e. the number of bins.
test['二'] = test.apply(lambda x: function1(x['age']), axis=1)
test['三'] = test.apply(lambda x: function3(x['age']), axis=1)
test['四'] = test.apply(lambda x: function2(x['age']), axis=1)
a=[]
for i in ['二', '三', '四']:
    print(i)
    x = test[i].values.reshape(-1, 1)
    y = test['age']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.8)
    model = Lasso()
    alpha_can = np.logspace(-3, 2, 10)
    np.set_printoptions(suppress=True)
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)
    print(':', lasso_model.best_params_)
    y_predict = lasso_model.predict(x_test)
    print('score:', explained_variance_score(y_test, y_predict))
    a.append(explained_variance_score(y_test, y_predict))
二
: {'alpha': 0.001}
score: 0.5294966544778303
三
: {'alpha': 0.001}
score: 0.6360385629505032
四
: {'alpha': 0.003593813663804626}
score: 0.6908722988300695
plt.plot([2, 3, 4], a)  # explained variance by number of cuts
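The hand-rolled bin functions can also be replaced by pd.cut, which makes it easy to cross-validate over many cut counts with wage as the response, as the exercise actually asks. A minimal sketch, assuming the same test frame:

from sklearn.model_selection import cross_val_score

# Step function: bin age into k intervals, one-hot the bins, score by 5-fold CV.
cut_scores = []
for k in range(2, 11):
    bins = pd.get_dummies(pd.cut(test['age'], k))
    cut_scores.append(cross_val_score(LinearRegression(), bins, test['wage'], cv=5).mean())
print('best number of cuts:', int(np.argmax(cut_scores)) + 2)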


The Wage data set contains a number of other features not explored in this chapter, such as
marital status (maritl), job class (jobclass), and others. Explore the relationships between some of
these other predictors and wage, and use non-linear fitting techniques in order to fit flexible
models to the data. Create plots of the results obtained, and write a summary of your findings.
(Note, this question is quite open ended. You should think about questions like this as a small
precursor to the final project. Be creative!)

test=pd.read_csv('Wage.csv')
test.head()  # same first five rows as shown at the top

sex

test['sex'] = test['sex'].apply(lambda x: x.split('.')[0])  # keep the numeric code before the dot
test['sex'].value_counts()
1    3000
Name: sex, dtype: int64

maritl

test['maritl']=test['maritl'].apply(lambda x:x.split('.')[0])
test['maritl'].value_counts()
2    2074
1     648
4     204
5      55
3      19
Name: maritl, dtype: int64

race

test['race']=test['race'].apply(lambda x:x.split('.')[0])
test['race'].value_counts()
1    2480
2     293
3     190
4      37
Name: race, dtype: int64
test['education']=test['education'].apply(lambda x:x.split('.')[0])
test['education'].value_counts()
2    971
4    685
3    650
5    426
1    268
Name: education, dtype: int64
test['jobclass']=test['jobclass'].apply(lambda x:x.split('.')[0])
test['jobclass'].value_counts()
1    1544
2    1456
Name: jobclass, dtype: int64
test['health']=test['health'].apply(lambda x:x.split('.')[0])
test['health'].value_counts()
2    2142
1     858
Name: health, dtype: int64
test['health_ins']=test['health_ins'].apply(lambda x:x.split('.')[0])
test['health_ins'].value_counts()
1    2083
2     917
Name: health_ins, dtype: int64
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  3000 non-null   int64
 1   year        3000 non-null   int64
 2   age         3000 non-null   int64
 3   sex         3000 non-null   object
 4   maritl      3000 non-null   object
 5   race        3000 non-null   object
 6   education   3000 non-null   object
 7   region      3000 non-null   object
 8   jobclass    3000 non-null   object
 9   health      3000 non-null   object
 10  health_ins  3000 non-null   object
 11  logwage     3000 non-null   float64
 12  wage        3000 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 304.8+ KB

wage vs. year, age, education

x=test[['year','age','education']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.06934098768375041

wage vs. year, age, education, maritl

x=test[['year','age','education','maritl']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.19806405061206722

wage vs. year, age, education, jobclass

x=test[['year','age','education','jobclass']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.05475790627923649

wage vs. year, age, education, maritl, jobclass

x=test[['year','age','education','jobclass','maritl']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.1757834299406874
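Taken together: adding maritl to year/age/education raises explained variance the most (about 0.07 → 0.20), jobclass adds essentially nothing, and throwing everything in (0.18) does no better than education plus maritl alone, so marital status looks like the most informative extra predictor of wage here. A quick visual check — a sketch using pandas' built-in boxplot on the cleaned numeric codes (1 = Never Married, 2 = Married, 3 = Widowed, 4 = Divorced, 5 = Separated in this data's coding):

# Distribution of wage within each marital-status code.
test.boxplot(column='wage', by='maritl', figsize=(8, 5))
plt.xlabel('maritl')
plt.ylabel('wage')
plt.title('wage by marital status')
plt.suptitle('')  # suppress pandas' automatic figure title
plt.show()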
