This one was done for a buddy overseas, and it got awkward: the answer key only came out after I'd finished, the submission had to match it, so everything got reworked at the last minute, and on top of that the mouse and all sorts of other things acted up. Rough.

Perform polynomial regression to predict wage using age. Use cross-validation to select the
optimal degree d for the polynomial. What degree was chosen, and how does this compare to the
results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the
data.

%matplotlib inline
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

mpl.rcParams['font.sans-serif'] = [u'simHei']  # allow CJK glyphs in plot labels
mpl.rcParams['axes.unicode_minus'] = False
test=pd.read_csv('Wage.csv')
test.head()
   Unnamed: 0  year  age      sex            maritl      race        education              region        jobclass          health health_ins   logwage        wage
0      231655  2006   18  1. Male  1. Never Married  1. White     1. < HS Grad  2. Middle Atlantic   1. Industrial       1. <=Good      2. No  4.318063   75.043154
1       86582  2004   24  1. Male  1. Never Married  1. White  4. College Grad  2. Middle Atlantic  2. Information  2. >=Very Good      2. No  4.255273   70.476020
2      161300  2003   45  1. Male        2. Married  1. White  3. Some College  2. Middle Atlantic   1. Industrial       1. <=Good     1. Yes  4.875061  130.982177
3      155159  2003   43  1. Male        2. Married  3. Asian  4. College Grad  2. Middle Atlantic  2. Information  2. >=Very Good     1. Yes  5.041393  154.685293
4       11443  2005   50  1. Male       4. Divorced  1. White       2. HS Grad  2. Middle Atlantic  2. Information       1. <=Good     1. Yes  4.318063   75.043154
test.isnull().sum()
Unnamed: 0    0
year          0
age           0
sex           0
maritl        0
race          0
education     0
region        0
jobclass      0
health        0
health_ins    0
logwage       0
wage          0
dtype: int64
X=test['age'].values.reshape(-1,1)
Y=test['wage']
# Candidate pipelines: ridge and lasso on standardized polynomial features,
# each tuning its regularization strength over a log-spaced grid.
models = [
    Pipeline([('ss', StandardScaler()),
              ('poly', PolynomialFeatures()),
              ('linear', RidgeCV(alphas=np.logspace(-3, 1, 20)))]),
    Pipeline([('ss', StandardScaler()),
              ('poly', PolynomialFeatures()),
              ('linear', LassoCV(alphas=np.logspace(-3, 1, 20)))]),
]
parameters = {
    "poly__degree": [3, 2, 1],
    "poly__interaction_only": [True, False],
    "poly__include_bias": [True, False],
    "linear__fit_intercept": [True, False],
}
# Quick sanity check of interaction_only on a toy frame.
rf = PolynomialFeatures(2, interaction_only=True)
a = pd.DataFrame({'name': [1, 2, 3, 4, 5], 'score': [2, 3, 4, 4, 5]})
b = rf.fit_transform(a)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
for t in range(2):
    model = GridSearchCV(models[t], param_grid=parameters, cv=5, n_jobs=1)
    model.fit(x_train, y_train)
    print(model.best_params_)
    print("R=%.3f" % model.best_score_)
    y_predict = model.predict(x_test)
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': False, 'poly__interaction_only': False}
R=0.072
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': True, 'poly__interaction_only': False}
R=0.072
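Both searches settle on degree 3 (the low R² is expected; age alone explains little of the variance in wage). As a cross-check, a minimal sketch — reusing X and Y from above — that scores each candidate degree directly by 5-fold cross-validation and keeps the best mean R²:

from sklearn.model_selection import cross_val_score

# Plain-OLS pipeline per degree; pick the degree with the best mean CV R^2.
cv_scores = []
for d in range(1, 6):
    pipe = Pipeline([('ss', StandardScaler()),
                     ('poly', PolynomialFeatures(degree=d)),
                     ('linear', LinearRegression())])
    cv_scores.append(cross_val_score(pipe, X, Y, cv=5).mean())
print('best degree:', int(np.argmax(cv_scores)) + 1)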

models = [
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', LinearRegression(fit_intercept=False))]),
]
model = models[0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
t = np.arange(len(X_test))
N = 6
d_pool = np.arange(1, N, 1)  # degrees 1..5
m = d_pool.size
clrs = []
for c in np.linspace(16711680, 255, m):
    clrs.append('#%06x' % int(c))
line_width = 3
a = []
plt.figure(figsize=(12, 6), facecolor='w')
for i, d in enumerate(d_pool):
    model.set_params(Poly__degree=d)
    model.fit(X_train, Y_train)
    lin = model.get_params('Linear')['Linear']
    output = u'degree %d, coefficients: ' % d
    if hasattr(lin, 'alpha_'):
        idx = output.find(u'coefficients')
        output = output[:idx] + (u'alpha=%.6f, ' % lin.alpha_) + output[idx:]
    if hasattr(lin, 'l1_ratio_'):
        idx = output.find(u'coefficients')
        output = output[:idx] + (u'l1_ratio=%.6f, ' % lin.l1_ratio_) + output[idx:]
    y_hat = model.predict(X_test)
    s = model.score(X_test, Y_test)
    mse_predict = mean_squared_error(Y_test, y_hat)  # was y_test; use this split's Y_test
    a.append(mse_predict)
plt.plot(d_pool, a)  # test-set MSE by polynomial degree

# Refit the selected degree-3 polynomial on the full data to inspect its coefficients.
X = test['age'].values.reshape(-1, 1)
y=test['wage']
ss = StandardScaler()
X_train = ss.fit_transform(X)
poly = PolynomialFeatures(degree=3,interaction_only=False)
train1 = poly.fit_transform(X_train)
linear = LinearRegression()
linear.fit(train1,y)
print(linear.coef_)
print(linear.intercept_)
[ 0.          6.00961425 -7.98309085  1.30560185]
119.49381727555215
x_train=test['age'].values.reshape(-1,1)
y_train=test['wage']
regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(10, 80, 100)
xx = xx.reshape(xx.shape[0],1)
yy = regressor.predict(xx)
plt.scatter(x_train, y_train)
plt1, = plt.plot(xx, yy,'r',label="degree=1")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt1])
plt.show()


poly2 = PolynomialFeatures(degree=3)
x_train_poly2 = poly2.fit_transform(x_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(x_train_poly2, y_train)
xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)
plt.scatter(x_train, y_train)
plt2, = plt.plot(xx, yy_poly2, 'y', label="degree=3")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt2])
plt.show()

test.head()  # same first five rows as shown above
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
warnings.filterwarnings("ignore")
import itertools
# One-way ANOVA; note this formula treats wage as a categorical factor and tests age across its levels.
anova_reA = anova_lm(ols('age~C(wage)', data=test[['age', 'wage']]).fit())
print(anova_reA)
              df         sum_sq     mean_sq         F        PR(>F)
C(wage)    507.0   87692.862849  172.964227  1.382137  5.346952e-07
Residual  2492.0  311855.291817  125.142573       NaN           NaN
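This one-way ANOVA is not quite the comparison the exercise intends: the ISLR approach is a nested-model F-test, fitting polynomials in age of increasing degree and checking at which degree the added term stops being significant. A minimal sketch, reusing ols and anova_lm on the same test frame:

# Nested-model ANOVA: wage ~ poly(age, d) for d = 1..5; each row of the
# resulting table tests whether the extra degree significantly improves the fit.
fits = [ols('wage ~ age', data=test).fit(),
        ols('wage ~ age + I(age**2)', data=test).fit(),
        ols('wage ~ age + I(age**2) + I(age**3)', data=test).fit(),
        ols('wage ~ age + I(age**2) + I(age**3) + I(age**4)', data=test).fit(),
        ols('wage ~ age + I(age**2) + I(age**3) + I(age**4) + I(age**5)', data=test).fit()]
print(anova_lm(*fits))

Typically the cubic term is significant and the quartic is borderline, which agrees with the degree-3 choice from cross-validation.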

Fit a step function to predict wage using age, and perform cross-validation to choose the
optimal number of cuts. Make a plot of the fit obtained

test['age'].head()
0    18
1    24
2    45
3    43
4    50
Name: age, dtype: int64
# rough cut points: 18, 26, 34, 42, 50
def function1(a):  # two bins
    if a > 32:
        return 1
    else:
        return 2
def function2(a):  # four bins
    if a > 42:
        return 1
    elif 34 < a < 42:
        return 2
    elif 26 < a < 34:
        return 3
    else:
        return 4
def function3(a):  # three bins
    if a > 37:
        return 1
    elif 28 < a < 37:
        return 2
    else:
        return 3
# Column names 二/三/四 mean "two"/"three"/"four", i.e. the number of bins.
test['二'] = test.apply(lambda x: function1(x['age']), axis=1)
test['三'] = test.apply(lambda x: function3(x['age']), axis=1)
test['四'] = test.apply(lambda x: function2(x['age']), axis=1)
a=[]
for i in ['二', '三', '四']:
    print(i)
    x = test[i].values.reshape(-1, 1)
    y = test['age']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.8)
    model = Lasso()
    alpha_can = np.logspace(-3, 2, 10)
    np.set_printoptions(suppress=True)
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)
    print(':', lasso_model.best_params_)
    y_predict = lasso_model.predict(x_test)
    print('score:', explained_variance_score(y_test, y_predict))
    a.append(explained_variance_score(y_test, y_predict))
二
: {'alpha': 0.001}
score: 0.5294966544778303
三
: {'alpha': 0.001}
score: 0.6360385629505032
四
: {'alpha': 0.003593813663804626}
score: 0.6908722988300695
plt.plot([2, 3, 4], a)  # explained variance by number of cuts
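The hand-rolled bin functions can also be replaced by pd.cut, which makes it easy to cross-validate over many cut counts with wage as the response, as the exercise actually asks. A minimal sketch, assuming the same test frame:

from sklearn.model_selection import cross_val_score

# Step function: bin age into k intervals, one-hot the bins, score by 5-fold CV.
cut_scores = []
for k in range(2, 11):
    bins = pd.get_dummies(pd.cut(test['age'], k))
    cut_scores.append(cross_val_score(LinearRegression(), bins, test['wage'], cv=5).mean())
print('best number of cuts:', int(np.argmax(cut_scores)) + 2)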


The Wage data set contains a number of other features not explored in this chapter, such as
marital status (maritl), job class (jobclass), and others. Explore the relationships between some of
these other predictors and wage, and use non-linear fitting techniques in order to fit flexible
models to the data. Create plots of the results obtained, and write a summary of your findings.
(Note, this question is quite open ended. You should think about questions like this as a small
precursor to the final project. Be creative!)

test=pd.read_csv('Wage.csv')
test.head()  # same first five rows as shown at the top

sex

test['sex'] = test['sex'].apply(lambda x: x.split('.')[0])  # keep the numeric code before the dot
test['sex'].value_counts()
1    3000
Name: sex, dtype: int64

maritl

test['maritl']=test['maritl'].apply(lambda x:x.split('.')[0])
test['maritl'].value_counts()
2    2074
1     648
4     204
5      55
3      19
Name: maritl, dtype: int64

race

test['race']=test['race'].apply(lambda x:x.split('.')[0])
test['race'].value_counts()
1    2480
2     293
3     190
4      37
Name: race, dtype: int64
test['education']=test['education'].apply(lambda x:x.split('.')[0])
test['education'].value_counts()
2    971
4    685
3    650
5    426
1    268
Name: education, dtype: int64
test['jobclass']=test['jobclass'].apply(lambda x:x.split('.')[0])
test['jobclass'].value_counts()
1    1544
2    1456
Name: jobclass, dtype: int64
test['health']=test['health'].apply(lambda x:x.split('.')[0])
test['health'].value_counts()
2    2142
1     858
Name: health, dtype: int64
test['health_ins']=test['health_ins'].apply(lambda x:x.split('.')[0])
test['health_ins'].value_counts()
1    2083
2     917
Name: health_ins, dtype: int64
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  3000 non-null   int64
 1   year        3000 non-null   int64
 2   age         3000 non-null   int64
 3   sex         3000 non-null   object
 4   maritl      3000 non-null   object
 5   race        3000 non-null   object
 6   education   3000 non-null   object
 7   region      3000 non-null   object
 8   jobclass    3000 non-null   object
 9   health      3000 non-null   object
 10  health_ins  3000 non-null   object
 11  logwage     3000 non-null   float64
 12  wage        3000 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 304.8+ KB

wage vs. year, age, education

x=test[['year','age','education']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.06934098768375041

wage vs. year, age, education, maritl

x=test[['year','age','education','maritl']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.19806405061206722

wage vs. year, age, education, jobclass

x=test[['year','age','education','jobclass']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.05475790627923649

wage vs. year, age, education, maritl, jobclass

x=test[['year','age','education','jobclass','maritl']]
y=test['wage']
x= pd.get_dummies(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
print('score:',explained_variance_score(y_test,y_predict))
score: 0.1757834299406874
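Taken together: adding maritl to year/age/education raises explained variance the most (about 0.07 → 0.20), jobclass adds essentially nothing, and throwing everything in (0.18) does no better than education plus maritl alone, so marital status looks like the most informative extra predictor of wage here. A quick visual check — a sketch using pandas' built-in boxplot on the cleaned numeric codes (1 = Never Married, 2 = Married, 3 = Widowed, 4 = Divorced, 5 = Separated in this data's coding):

# Distribution of wage within each marital-status code.
test.boxplot(column='wage', by='maritl', figsize=(8, 5))
plt.xlabel('maritl')
plt.ylabel('wage')
plt.title('wage by marital status')
plt.suptitle('')  # suppress pandas' automatic figure title
plt.show()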
