# 第一部分 (Part I)

# 第1章 (Chapter 1)

3*'Python is easy! '

abs(3*4+(-2/5)**2)/(4.5-76)+max(-34,9)

import os
print (os.getcwd()) #查看目前的工作目录
os.chdir('D:/Python work') #改变工作目录

# 第2章 (Chapter 2)

import os
os.getcwd()

import os
os.chdir('D:/work') #或者os.chdir('D:\\work')

import os
os.mkdir('work2')

import os
os.rmdir('work2')

import os
os.rename('fff.txt','fool.txt') #重命名
os.remove('h.txt')              #删除文件

ST='I am happy and you too'
print('length =',len(ST),'\n',ST[5:10],'\n',(ST+'! ')*2)

x=[list(range(5)),"Python is great!",["Program is art"],abs(-2.34),[[1,20],[-34,60]]]

y=(list(range(5)),"Python is great!",["Program is art"],abs(-2.34),[[1,20],[-34,60]])

Tup=3,4,6,[2,3],"Time"

z={'seq': list(range(5)), 'string': "Python is great!", 'ls': ["Program is art"], 'value': abs(-2.34), 'mat':  [[1,20],[-34,60]]}

s1='A great person'
s2=['you', 'I', 'they','we','you','he','they']
s3=(32,64,32,'He is the one',(2,3))
s4={'One': 234, 'Two': 45,'Three': 45}
print(set(s1),'\n',set(s2),'\n',set(s3),'\n',set(s4))

print(type(x),type(y),type(z),'\n',type('string'),type(3.5),type(7))

print(x[2:],'\n',y[-4:],'\n',z['mat'])

x=list(range(10))#=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
print(x[:3],x[7:],x[3:6],x[-3:],x[-1],x[:-4])

x='A poet can survive everything but a misprint.'
x[:10]+x[10:20]+x[20:30]+x[30:40]+x[40:]

# Nested containers used to demonstrate chained indexing and nested loops.
x=[[1,15,3],[['People'],' above all']]
y=("Good morning",[2,5,-1])
z={'a': 'A string', 'b': [[2,3],'yes'],'c':{'A': [3,'Three',4],'B':range(5)}}
print(x[0][:2],x[1][1][:3],'\n',y[0][:3],'\n', z['c']['B'][-3:],'\n',z['b'][1][1:] )

# Walk the nested list: print every element, then every item inside it.
for i in x:
    print(i)
    for k in i:
        print(k)

# Walk the dict: print every value, then its items; when the value is itself
# a dict, print the inner values rather than the inner keys.
for i in list(z):
    print(z[i])
    for j in z[i]:
        if isinstance(z[i], dict):
            j=z[i][j]
        print(j)

# x and y both hold exactly two elements, so each can be unpacked into (i, j).
for (i,j) in (x,y):
    print(i,j)
    for l in i:
        print(l)
    for m in j:
        print(m)

# Same traversal as the first loop, written with explicit integer indices.
for i in range(len(x)):
    print(x[i])
    for j in range(len(x[i])):
        print(x[i][j])

print(list(range(-1,11,2)), list(range(2,7)), list(range(10,-10,-3)))

'I'+' have to say:'+' "You are '+ 'very '*2 + 'good!"'

print(['Hi!'] * 2+['I am']+['here']+["Isn't It"])
print(('Tiger','Lion')*2+('Wolf','Cat')+([1,-3.9],'Good'))

# len() counts characters of a string, top-level elements of a list or tuple,
# and keys of a dict.
s='Good morning!'
x=[[1, 15, 3], [['People'], ' above all']]
y=('Good morning', [2, 5, -1])
z={'a': 'A string', 'b': [[2, 3], 'yes'], 'c': {'A': [3, 'Three', 4], 'B': range(0, 5)}}
print(len(s),len(x),len(y),len(z))

print(['People'] in x, ['People'] in x[1], 'Good' in s)
print('Good morning' in y, 'A string' in z, 'a' in z)

print(max('A', 'black', 'rose'),max([1,-5]), min(['people','leader']),min({"a":2,"b":4}))

x=[[3,7],'Oscar Wilde']
y=['save','the world',['is','impossible']]
x.append(y);print(x)

x=[[3,5,7],'Oscar Wilde']
y=['save','the world',['is','impossible']]
x.extend(y);print (x)

x=[[1,2],'Word',[3,5,7],'Oscar Wilde']
x.pop();print(x) #去掉最后一个
x=[[1,2],'Word',[3,5,7],'Oscar Wilde']
x.pop(2);print(x) #去掉下标为2的元素(即[3,5,7])

x=[[1,2],'Word',[3,5,7],'Oscar Wilde',[3,5,7]]
x.remove([3,5,7]);print(x)
x.remove([3,5,7]);print(x)
x.remove('Word');print(x)

y=('Efficiency', [2, [5, -1]])
print(type(y),'\n',list(y),type(list(y)),'\n',tuple(list(y)),type(tuple(list(y))))

z={'a': 'A string', 'b': [[2, 3], 'yes'], 'c': {'A': 'Why', 'B': 4}}
print('keys:\n',z.keys(),'\nget:\n',z.get('a'),'\nitems:\n',z.items(),'\nvalues:\n',z.values())

z.pop('c') #去掉'c'
print('pop last:',z.popitem()) #去掉剩下的最后一个('b')
print('after pop:',z) #还剩下'a'

z['new']=[[2,4],[5,7,9]];z

a={'a': (2,3),'b': ['word','sentence']}
b={2:[345,321],'a':("two","three")}
c={2:999,'b':'strong'}
print({**a,**b,**c})
print({**b,**a})

z['new']=34/56.2; z

del z['a'];z

y=zip(('100','A',1202,),'ABCDE',['I', 'like','apple','very much'])
print('y=',y)
print('list(y)=',list(y))
print('list(y)=',list(y))
y2=list(y)
print('y2=',y2)
print('y=',y)

A=(2,'5','Today');B=[30,'tax',[5,4]]
D=dict()
for i in range(len(A)):D[A[i]]=B[i]
print(D)

A=(2,'5','Today');B=[30,'tax',[5,4]]
print(dict(zip(A,B)))

A=(2,'5','Today')
print(list(zip(A)))

# Iterating a fresh zip() each time: once as whole pairs, once unpacked.
A=('What','is','this');B=[30,'tax',[5,4]]
for i in zip(A,B):
    print(i)
for i,j in zip(A,B):
    print(i)
    print(j)

# A zip object is a one-shot iterator: the first loop consumes it, so the
# second loop over the same object runs zero times and prints nothing.
A=('What','is','this');B=[30,'tax',[5,4]]
ZIP=zip(A,B)
for i in ZIP:
    print(i)
for i,j in ZIP:
    print(i)
    print(j)

A='ABCD';B=[1,2,3,4]
x=list(zip(A,B))
print(x)

A,B=zip(*x)
print('A=',A,'; B=',B)

year=[2017,2018,2019];inport=[2800,3496,4765];export=[3990,5023,8766]
for i,j,k in zip(year,export,inport):print('In year',i ,' red=',j-k)

height=[1.74,1.83,1.69];weight=[55, 62, 71];name=['Tom', 'Jack','Smith']
print('sort by height:',sorted(zip(height,weight,name)))
print('sort by weight:',sorted(zip(weight,height,name)))
print('sort by name:',sorted(zip(name,height,weight)))

A = set('geography');print(A)
v={(1,3,6),'world',((2,3),(1,7)),'world',('world')};v

for i in v:print(i)

A={1,4,'world',(3,4,'country'),'world',1}
print(list(A),type(list(A)),'\n',tuple(A),type(tuple(A)),'\n',set(list(A)),type(set(list(A))),'\n',set(tuple(A)),type(set(tuple(A))))

u={1.2,5.7,'word',(1,4),('key',5)}
u.add((2,6,1,'sun'));u

x=set(['I','you','he','I','they','we','we'])
x.remove('I');print(x)

A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
A|B #union 可试试 A|=B

A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
print(A&B,'\n',A,B) #可试试: A.intersection(B)

A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
print(A-B) #可试试 A -= B

A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
print(A ^ B,'\n',(A|B)-(A&B))

A = {1, 2, 3, 1, 2}
B = {3, 2, 3, 1}
print(A == B,A!=B,A<=B,B>A&B,{1,3,4}>={1,3},)

print(set.difference(set(['a',2,'5']),set(['a',7])))
print(set.union(set(['a',2,'5']),set(['a','a',7])))
print(set.intersection(set(['a',2,'5']),set(['a','a',7])))

from collections import Counter
s1='pneumonoultramicroscopicsilicovolcanoconiosis'
s2=['you', 'I', 'they','we','you','he','they']
s3=(32,64,32,'He is the one',(2,3))
s4={'One': 234, 'Two': 45,'Three': 45,'One': 299,'One': 23}
s5=set(s2)
print(Counter(s1),'\n',Counter(s2),'\n',Counter(s3),'\n',Counter(s4),'\n', Counter(s5))

s=[(1,2,4),'happy',('peace','and', 'war'),'happy']
u=['happy',(1,2,4),('peace','and', 'war')]
print(Counter(s)==Counter(u),set(s)==set(u))

print(Counter(s))
print(dict(Counter(s)))
print(list(Counter(s)))
print(tuple(Counter(s)))
print(set(Counter(s)))

def Positive(x):
    """Return a list of the elements of *x* that are greater than zero.

    *x* may be any iterable (list, tuple, set, ...). The result keeps the
    iteration order of *x* and is always a new list, empty when nothing
    qualifies.
    """
    return [i for i in x if i > 0]
print(Positive([-2,-2,3,5,7,3])) #对list
print(Positive((-2,-2,3,5,7,3))) #对tuple
print(Positive({-2,-2,3,5,7,3})) #对set

def f(x): return x**2-x
g=lambda x,y: max(x**2,y**3+x)
f(0.8),g(3.4,0.5) #把数值代入函数执行

print(list(map(lambda x: x**2+1-abs(x), [1.2,5.7,23.6,6,1.2])))
print(list(map(lambda x: x**2+1-abs(x), (1.2,5.7,23.6,6,1.2))))
print(list(map(lambda x: x**2+1-abs(x), {1.2,5.7,23.6,6,1.2})))

for i in map(lambda x: x**2+1-abs(x), [1.2,5.7,23.6,6,1.2]): print(i)
for j in map(lambda x: x**2+1-abs(x), (1.2,5.7,23.6,6,1.2)): print(j)
for j in map(lambda x: x**2+1-abs(x), {1.2,5.7,23.6,6,1.2}): print(j)

print(tuple(map(lambda x,y: x**2*y-abs(x)/y, [1.2,5.7],[-45,26])))
print(tuple(map(lambda x,y: x**2*y-abs(x)/y, (1.2,5.7),(-45,26))))
print(tuple(map(lambda x,y: x**2*y-abs(x)/y, {1.2,5.7},{-45,26})))

gg=lambda x,y: x**2*y-abs(x)/y
print(gg(1.2,-45),gg(5.7,26))#表面上的`次序`
print(gg(1.2,26),gg(5.7,-45))#实际次序

print(list(filter(lambda x: x>0,[-1,4,-5,7])))#滤去list的负值
print(list(filter(lambda x: x>0,(-1,4,-5,7,-5,8,7))))#滤去tuple的负值
print(list(filter(lambda x: x>0,{-1,4,-5,7,-5,8,7})))#滤去set的负值
print(list(filter(lambda x: abs(x)>5,range(-10,12,2))))#取绝对值大于5的值

list(filter(bool,('',(1,2,4),'happy',0,None,True,False,2020)))

def Age(ask=None):
    """Guess the user's age by bisection with six yes/no questions.

    The search interval starts as [0, 120]. Each round asks whether the age
    is greater than the current guess and keeps the matching half, so after
    six rounds the interval is 120/2**6 wide. The final guess is printed,
    truncated to an integer.

    ask: optional callable taking the prompt string and returning the answer;
         defaults to the builtin input(). Only 'Y' or 'y' counts as "yes".
    """
    if ask is None:
        ask = input
    x1 = 120.
    x0 = 0
    x = x1 / 2
    for _ in range(6):
        y = ask("Is your age greater than %s ? Input 'Y' or 'N':" % x)
        if y == 'Y' or y == 'y':
            x0 = x  # the age lies in the upper half
        else:
            x1 = x  # the age lies in the lower half
        x = x0 + (x1 - x0) / 2  # midpoint of the narrowed interval
    print('Your age is about {} years old'.format(int(x)))
Age() #执行上面函数的语句

x=input('Type your name please: ')
print('My name is',x)

x=eval(input('Type any number: '))
print('The square root of your number is',x**(1/2))

print('World'!='word')
print(34==34.0)
print(3>2 and 4>=3)
print(3<2 or 'c'>='a')
print(not 3<2)
print('A'<'a' and 'A'>'1')

def f(x):
    """Evaluate the cubic polynomial 2x^3 - 4x^2 + 5x - 20 at *x*."""
    return 2*x**3-4*x**2+5*x-20


def solf(f=f, max_iter=200):
    """Approximate a root of *f* by bisection and return it.

    The bracket is initialised to x0=2, x1=3 with a first guess of x1/2; a
    negative f(x) moves the lower end up to x, otherwise the upper end moves
    down to x. The loop stops once |f(x)| <= 1e-18.

    max_iter: upper bound on the number of bisection steps. The tolerance is
              far below double precision, so |f(x)| may never reach it; the
              cap makes the function always terminate.
    """
    x1 = 3.
    x0 = 2.
    x = x1 / 2.
    e = 10**(-18)  # requested precision
    for _ in range(max_iter):
        fx = f(x)
        if abs(fx) <= e:
            break
        if fx < 0:
            x0 = x
        else:
            x1 = x
        x = x0 + (x1 - x0) / 2
    return x


solf(f)  # run the solver on the polynomial above

# NOTE(review): eval() executes whatever expression the user types. That is
# tolerable only for trusted, interactive use; never feed it untrusted text.
x=eval(input('Enter a number'))
# Transform x according to its sign and remember which branch was taken.
if x<0:
    x=x**2
    w='x is negative and change to'
elif x==0:
    x=x+1.
    w='x=0 and change to'
else:
    x=x**3
    w='x>0 and change to'
print (w,x)

import random #输入模块
random.seed(1010) #设定随机种子使得这里产生的结果可以重复
print(random.randint(1,100)) #从1到100中随机选择一个数字
print(random.choice([1,2.0,4,'word'])) #从表中随机选择一个元素
print(random.sample(range(100),5)) #从[0,100)(不包含100)随机选择5个数字
print(random.sample([1,2.0,4,'word'],2)) #从[1,2.0,4,'word']随机选择2元素
print(random.random()) #产生区间[0.0,1)(不包含1)中的随机数
print(random.uniform(2,5)) #产生一个2,5之间的均匀分布随机数
print(random.gauss(3,5)) #产生一个均值为3, 标准差为5的正态分布随机数

x=99;y=x;print(x,y,id(x)==id(y))
y=10;print(x,y,id(x)==id(y))

x=[1,2,3];y=x;y[0]=10;print(x,y,id(x)==id(y))
x[2]='test';print(x,y,id(x)==id(y))

x=[1,2];y=x[:]
print(x,y,id(x)==id(y),id(x[0])==id(y[0]),id(x[1])==id(y[1]))
print(id(x),id(y),id(x[0]),id(y[0]),id(x[1]),id(y[1]))#位置(与电脑有关)

x=[1,2];y=x[:]
y[0]=33;print(x,y)

x=[1,2,5,2];y=x;y[0]=3;y[3]=99
print(x,y)
x=[1,2,5,2];y=x;x[0]=7;x[3]=88;y[2]='string'
print(x,y)
x=[1,2,5,2];y=x[:];y[0]=3;y[3]=99
print(x,y)
x=[1,2,5,2];y=x[:];x[0]=44;y[3]=77
print(x,y)

x=eval(input('Enter a number'))
print(x,type(x))
y=input('Enter a word')
print(y,type(y))

p=open('PYGMALION.txt','r') #打开文件
print('file name=',p.name)#打印文件名

print('Is file closed? ', p.closed) #是否关闭了
print('Access mode=',p.mode) #可访问的权限
print('position=', p.tell()) #指针位置

print(p.read(194))            #读取并打印头194字节(byte)
print('position=', p.tell())  #显示指针(读到哪里了)

import textwrap
p.seek(0,0)                 #指针位置归零
print('Position=',p.tell())
print("\n".join(textwrap.wrap(p.read(194),70)))
print('Position=',p.tell())

p.close()                   #关闭
print('Is file closed? ', p.closed)

a=open('fool.txt','w')
a.write('A message ')
a.write('and more.')
a.close()

b=open('fool.txt','a')
b.write(' OK?')
b.close()

b=open('fool.txt','r+')
print(b.read(100))
b.write(' OK?')
b.seek(0,0) #回到指针0, 再读取, 看加入的内容有没有.
print(b.read(100))
b.close()

# Inspect a few attributes of a text-file handle, then release it.
O=open("UN.txt")
print(O.name)
print(O.encoding)
print(O.mode)
O.close()

# Iterating a file object yields its lines one at a time; the with-block
# closes the file when the loop is done.
with open("UN.txt") as O:
    for line in O:
        print(line)

with open("UN.txt", "rt") as O:text = O.read()
print(text)

# Collect every whitespace-separated word of UN.txt that ends with 'lity'.
x=[]
with open("UN.txt") as src:
    for line in src:
        for word in line.split():
            if word.endswith('lity'):
                x.append(word)
print('There are', len(x), 'words ended with "lity", they are:\n',x)

# Count lines (b), non-empty lines (c), words (d) and characters inside
# words (e) of UN.txt.
b=0;c=0;d=0;e=0
with open("UN.txt") as src:
    for line in src:
        b+=1
        if len(line.split())>0:  # skip lines holding only whitespace
            c+=1
            for word in line.split():
                d+=1
                for char in word:
                    e+=1
print('Total {} lines with {} no-empty lines, {} words and {} characters'.format(b,c,d,e))

print('Integer: {:2d}, float: {:1.2f}, \
anything: {} and: {}'.format(234,21.5, 2.718, 'Hi!'))

print('Integer: %s, float: %s, anything: %s and: %s' %(234,21.5, 2.718, 'Hi!'))

# Count how many whitespace-separated words of UN.txt equal 'Whereas'.
b=0
with open("UN.txt") as src:
    for line in src:
        if len(line.split())>0:
            for word in line.split():
                if word=='Whereas':
                    b+=1
print('The count of word "Whereas" is %s' %b)

import textwrap
# Print the first three non-empty lines of OW.txt, each with its word count
# and wrapped to 70 columns.
c=0
with open("OW.txt") as src:
    for line in src:
        if c<3 and len(line.split())>0:
            c+=1
            print('The line {} has {} words:'.format(c,len(line.split())))
            print("\n".join(textwrap.wrap(line,70)))

import textwrap
# Print the first three lines of OW.txt that hold more than a bare newline,
# each with its character count and wrapped to 70 columns.
c=0
with open('OW.txt') as g:
    for line in g.readlines():
        if len(line)>1 and c<3:
            c+=1
            # the count must be taken on the current line
            print('Line {} has {} characters'.format(c,len(line)),'\n',"\n".join(textwrap.wrap(line,70)))

import textwrap
with open('OW.txt') as g:
    print('The 9th line:\n', "\n".join(textwrap.wrap(g.readlines()[8],70)) )
    g.seek(0,0)  # readlines() consumed the file; rewind before reading again
    print('The 8th words of the 9th line:\n',"\n".join(textwrap.wrap(g.readlines()[8].split()[7],70)))

# 第3章 (Chapter 3)

class Customer(object):
    """A customer of XXX Bank holding one account.

    Attributes:
        name: The customer's name.
        balance: The current balance.
        p: Penalty rate applied to an overdrawn balance (a fraction).
        r: Reward rate applied to a positive balance after a deposit
           (a fraction).
        withd: Amount of the most recent withdrawal (set by withdraw()).
        depos: Amount of the most recent deposit (set by deposit()).
    """

    def __init__(self, name, balance=0.0, penalty=0.3, reward=0.1):
        """Create a customer called *name* with starting *balance*, penalty
        rate *penalty* and reward rate *reward*."""
        self.name = name
        self.balance = balance
        self.p = penalty
        self.r = reward

    def withdraw(self, amount):
        """Return the balance after withdrawing *amount*.

        If the withdrawal leaves the balance negative, the negative balance
        is multiplied by (1 + penalty rate).
        """
        self.withd = amount
        self.balance = self.balance - self.withd
        if self.balance < 0:
            self.balance = self.balance * (1 + self.p)
        return self.balance

    def deposit(self, amount):
        """Return the balance after depositing *amount*.

        If the deposit leaves the balance positive, the balance is
        multiplied by (1 + reward rate).
        """
        self.depos = amount
        self.balance = self.balance + self.depos
        if self.balance > 0:
            self.balance = self.balance * (1 + self.r)
        return self.balance

print(Customer.__doc__)
print(Customer.withdraw.__doc__)
print(Customer.deposit.__doc__)

Jack=Customer('Jack',1000, 0.7, 0.25)
print('Name=', Jack.name)
print('Original balance=', Jack.balance)
Jack.withdraw(1500)
print('Withdraw {}, balance={}'.format(Jack.withd,Jack.balance))
print('Penalty rate={}, Reward rate={}'.format(Jack.p, Jack.r))
Jack.deposit(3700)
print('Deposite {}, balance={}'.format(Jack.depos,Jack.balance))
print('Penalty rate={}, Reward rate={}'.format(Jack.p, Jack.r))

June=Customer('Smith',30, 0.44, 0.13)
print('Name=', June.name)
print('Original balance=', June.balance)
June.withdraw(20)
print('Withdraw {}, balance={}'.format(June.withd,June.balance))
print('Penalty rate={}, Reward rate={}'.format(June.p, June.r))
June.deposit(125)
print('Deposite {}, balance={}'.format(June.depos,June.balance))
print('Penalty rate={}, Reward rate={}'.format(June.p, June.r))

print(Jack.__doc__)
print(Jack.withdraw.__doc__)
print(Jack.deposit.__doc__)

class Son(Customer):
    """A Customer with harsher overdraft and more generous deposit rules.

    The rates actually applied by the latest operation are kept in *p0*
    (penalty) and *r0* (reward); the base rates *p* and *r* never change.
    """

    def withdraw(self, amount):
        """Withdraw *amount* and return the tuple (balance, penalty rate used).

        When the balance falls below -30 the penalty rate is multiplied by
        ten and applied to the balance; otherwise no penalty is applied and
        the base rate is reported.
        """
        self.withd = amount
        self.r0 = self.r
        self.p0 = self.p
        self.balance = self.balance - self.withd
        if self.balance < -30:
            self.p0 = self.p0 * 10
            self.balance = self.balance * (1 + self.p0)
        return self.balance, self.p0

    def deposit(self, amount):
        """Deposit *amount* and return the tuple (balance, reward rate used).

        When the balance is positive after the deposit the reward rate is
        tripled and applied to the balance; otherwise no reward is applied
        and the base rate is reported.
        """
        self.depos = amount
        self.r0 = self.r
        self.p0 = self.p
        self.balance = self.balance + self.depos
        if self.balance > 0:
            self.r0 = self.r0 * 3
            self.balance = self.balance * (1 + self.r0)
        return self.balance, self.r0

# Exercise the Son account: one large withdrawal followed by two deposits,
# printing the balance and the rates actually applied after each step.
Jackson=Son('Jackson',30, 0.44, 0.13)
print('Name=', Jackson.name)
print('Original balance=', Jackson.balance)
print('Original Penalty rate={}, Reward rate={}'.format(Jackson.p,
                                                        Jackson.r))
Jackson.withdraw(250)
print('Withdraw {}, balance={}'.format(Jackson.withd,Jackson.balance))
print('Penalty rate={}, Reward rate={}'.format(Jackson.p0, Jackson.r0))
Jackson.deposit(5000)
print('Deposite {}, balance={}'.format(Jackson.depos,Jackson.balance))
print('Penalty rate={}, Reward rate={}'.format(Jackson.p0, Jackson.r0))
Jackson.deposit(50)
print('Deposite {}, balance={}'.format(Jackson.depos,Jackson.balance))
print('Penalty rate={}, Reward rate={}'.format(Jackson.p0, Jackson.r0))

# 第二部分 (Part II)

# 第4章 (Chapter 4)

import numpy as np

np.random.seed(1010) # fix the seed so the draws below are reproducible
np.random.rand(2,5,3) # 30 uniform draws from [0.0,1.0) arranged as a 2x5x3 array
np.random.randn(3,5) # 15 standard-normal draws arranged as a 3x5 array
np.random.normal(3,5,100) # 100 normal draws with mean 3 and standard deviation 5
np.random.uniform(3,7,100) # 100 uniform draws with bounds 3 and 7
np.random.randint(3,30,34) # 34 random integers from the half-open range [3,30)
# 34 random integers from the closed range [3,30]. The deprecated
# random_integers(3,30,34) included its upper bound; randint with the upper
# bound raised by one is the documented replacement.
np.random.randint(3,31,34)
x=[2,5,-7.6]
# draw 20 values from x with replacement, using the probabilities p
np.random.choice(x,20,replace=True,p=[0.1,0.3,0.6])
# draw 2 values from x without replacement, all equally likely
np.random.choice(x,2,replace=False)
np.random.permutation(range(10)) # random ordering of the integers 0..9

# The same heterogeneous data as a plain list (x0) and as a NumPy array (x).
x0=[[1,3,-5],[3,4],'It is a word',(2,6),{3:51,'I':(2,1)}]
# The elements differ in type and length, so the array has to store them as
# generic Python objects; current NumPy refuses ragged input unless
# dtype=object is requested explicitly.
x=np.array(x0,dtype=object)
print(x0,'\n', x)
print(x0[0][:2], x0[4][3],x0[2][3:5],x0[4]['I'],len(x0))
print(x[0][:2], x[4][3],x[2][3:5],x[4]['I'],x.shape,x.size)

# Arrays can be built from any mix of nested tuples and lists; report the
# shape, number of dimensions and element count of a 2-D and a 3-D array.
y=np.array(((2,1,-7),[5.5,21,32],(3,8.,1)))
z=np.array((((2,3),(1,43),[2,8]),[[2,3],[3,1],(9,5)]))
print(y,'\n',z,'\nshape of y ={}, shape of z ={}, \
\ndim of y={}, dim of z={}, size of y={}, size of z={}'.format(y.shape,
    z.shape,y.ndim,z.ndim,y.size,z.size))

np.arange(3.2),np.arange(3.2,7.8),np.arange(2.2,5.8,.5),np.arange(2.3,-9,-1.5)

np.linspace(-2.1,6,3),np.linspace(-2.5,-16,4)

a=np.array([[2,5,-1,2,10],(3,1,4.,6,34)])
print(np.zeros([2,3]),'\n',np.ones((2,4)),'\n',np.full((2,5),-np.inf),'\n',np.zeros_like(a),'\n',np.eye(3),'\n',np.identity(2))

a=np.array([[2,5,-1,2],(3,1,4.,6)])
print(np.empty((2,3)),'\n',np.empty_like(a))

np.fromfunction(lambda i, j: i**2 + i*j, (3, 4))

np.random.seed(1010);a=np.random.rand(3,4)
# Boolean array that is True where the row index equals the column index.
# It is named `mask` rather than `id` so the builtin id() stays usable.
mask=np.fromfunction(lambda i, j: i==j, (3, 4))
mask,a,a[mask]  # indexing with the mask picks the main-diagonal entries of a

x = np.random.randn(5,3) #产生标准正态随机数组成的5乘3矩阵
np.savetxt('tabs1.txt',x) #存成以制表符分隔的文件
np.savetxt('commas1.csv',x,delimiter=',') #存成以逗号分隔的文件(如csv)
u = np.loadtxt('commas1.csv',delimiter=',') #读取以逗号分隔的文件
v = np.loadtxt('tabs1.txt') #读取以制表符分隔的文件

print('Shape of x, u and v are: [%s, %s ,%s]'%(x.shape,u.shape,v.shape))
print('x has', x.ndim, 'dimensions')
print('x and u are identical? %s' %(np.sum(x!=u)==0))
print('x and v are identical? %s' %(np.sum(x!=v)==0))

# Build a 2x2x3 array and print slices taken along each of its three axes.
y = np.array([[[1,4,7],[2,5,8]],[[3,6,9],[10,100,1000]]])
# NOTE(review): the original note claimed this is equivalent to
# np.asmatrix([[[1,4,7],[2,5,8]],[[3,6,9],[10,100,1000]]]); np.matrix objects
# are presumably limited to 2-D, so that likely does not hold here - verify.
print('y=\n',y)
print('y[0,:,:]=\n',y[0,:,:])
print('y[1,:,:]=\n',y[1,:,:])
print('y[:,0,:]=\n',y[:,0,:])
print('y[:,1,:]=\n',y[:,1,:])
print('y[:,:,0]=\n',y[:,:,0])
print('y[:,:,1]=\n',y[:,:,1])
print('y[1,0,0]={}, y[0,1,:]={}'.format(y[1,0,0],y[0,1,:]))

print('shape of y=', np.shape(y),'\ndimension of y=', y.ndim)
print('"type(y)"=%s, "y.dtype"=%s' %(type(y),y.dtype))

x=np.arange(16).reshape(2,8);x

x.reshape(4,4),x.reshape(1,-1)

x.reshape(2,-1,4),x.reshape(4,-1,2).shape #shape (4,2,2)

# np.newaxis and reshape are two ways to turn a 1-D vector into a 2-D one.
x=np.arange(4)
print(x[np.newaxis,:],x.reshape(1,-1)) #row vector: x has 4 elements, so a 1x4 matrix
print(x[:,np.newaxis]==x.reshape(-1,1)) #column vector: a 4x1 matrix

x=np.arange(5)
print(np.resize(x,(2,8)),'\n',np.resize(x,(1,3)))

import numpy as np
u=np.array([0, 1, 2]);v=np.array([5,2,7]) #整型list转换成np.array
print('shape of u=%s; shape of v=%s' %(u.shape,v.shape)) #形状
print('type of u=%s, type of v=%s' %(u.dtype,v.dtype)) #输出u和v类型
print('type of (u+v) is %s, type of (u*v) is %s, \ntype of (u/v)is %s,\
type of (u**v)is %s' %((u+v).dtype,(u*v).dtype,(u/v).dtype,(u**v).dtype))
print("u+v,u*v,u/v:u**v:\n",u+v,u*v,u/v,u**v)

x=np.array([1,3,2.7]);y=np.array([2,-2.5,-1])
print(x+y,'\n',x-y,'\n',x/y,'\n',x**y)

x=np.array([[1,3,2],[2,3,1]])
#上式等价于 x=np.asmatrix([[1,3,2],[2,3,1]])
print('x=\n',x)
print('x**3=\n',x**3,'\n3**x=\n',3**x)

x=np.array([1,3,2]) #行矩阵
y=np.array((2.,-2)).reshape(-1,1) #变成列矩阵

x=np.ones((3,4))
y=np.arange(4)
z=np.arange(3)
print(x*y[np.newaxis,:])#等价于 x*y.reshape(1,-1), y*x 和 x*y
print(x*z[:,np.newaxis]) #等价于 x*z.reshape(-1,1))

x = np.array([ 123.858, 112.9652, -16.4278])
print(np.round(x,3),np.round(x, -2)) #四舍五入位数(负数为小数点前位数)
print(np.around(x,3),np.around(x,-2)) #同上
print(np.floor(x),np.ceil(x)) #比x小的最大整数及比x大的最小整数

x=np.array([-2,7,-1,9,6,-5]).reshape(2,3)
print('x=','\n', x)
print('np.max(x)=', np.max(x))
print('np.argmax(x)=', np.argmax(x))

print('x=','\n', x)
print('x.max(0)=' ,x.max(axis=0),'x.argmax(0)=' ,x.argmax(axis=0))
print('x.min(1)=' ,x.min(axis=1),'x.argmin(1)=', x.argmin(axis=1))

x = np.array([123.858, -23.6, 112.9652, -16.4278])
print('sum=', np.sum(x),'\ncumsum=', np.cumsum(x)) #和及累积和
print('prod=',np.prod(x),'\ncumprod=', np.cumprod(x)) #乘积及累积乘积
print('diff(x)=',np.diff(x)) #差分

x.shape=2,2 #把x转换成2乘2矩阵
print('x=\n',x)
print('diff by column =',np.diff(x,axis=0)) #按列(对不同的行元素)差分
print('diff by row =\n',np.diff(x,axis=1)) #按行(对不同的列元素)差分

y=np.arange(32).reshape(2,2,8)
y.sum(axis=0) # 2x8

y.sum(axis=(0,1))

print('sign(x)=\n' ,np.sign(x),'\nexp(x)=\n', np.exp(x))
print('log(abs(x))=\n', np.log(np.abs(x)),'\nx**2=\n', x**2)

x=np.arange(3,5,.5) #从3到5(不包含5)等间隔为0.5的数列
y=np.arange(4)
print(x,y,x.shape,y.shape)
print('np.dot(x,y)={}, np.sum(x*y)={}'.format(np.dot(x,y),np.sum(x*y)))

np.random.seed(1010)
x=np.random.randn(3,5)
y=np.random.randn(3,5)
print(x.dot(y.T)) #x 和 y 的转置做矩阵乘法
print(x.T.dot(y)) # x 转置和 y做矩阵乘法

x=np.arange(3);y=np.linspace(1,10,5)
x,y,np.outer(x,y)

x=np.array(['I', 'am', 'OK'], dtype=object);y=np.arange(5)
x,y,np.outer(x,y)

x = np.array([[1.0,2.0,4],[3.0,4.0,-1]])
y = np.array([[5.0,6.0,-2],[7.0,8.0,9]])
print('x.shape=',x.shape,'y.shape=',y.shape) #都是2乘3矩阵
print('x=\n',x,'\ny=\n',y)
z = np.vstack((x,y)) #x,y纵向叠加合并成4乘3矩阵
z1 = np.hstack((x,y)) ##x,y横向叠加合并成2乘6矩阵
print('z=\n',z,'\nz1=\n',z1, '\nz.shape=',z.shape,'z1.shape=', z1.shape)
z = np.concatenate((x,y),axis=0)#等同于 np.vstack((x,y))
z1 = np.concatenate((x,y),axis=1) #等同于 np.hstack((x,y))

x = np.arange(24).reshape(4, 6)
print('x= %s \n hsplit=\n%s'%(x,np.hsplit(x,2)))

np.vsplit(x,4)

x=np.arange(9)
np.split(x,(2,5,7,12))

u=np.array([[11,32,26],[47,54,89],[92,64,95]]);u

np.insert(u,1,0) #相当于np.insert(u.flatten(),1,0)

np.insert(u,1,0,axis=1)

np.insert(u,1,0,axis=0)

np.insert(u,1,[[1,2,3],[4,5,6]],axis=0)

np.insert(u,1,np.array([[1,2,3],[4,5,6]]),axis=1)

np.random.seed(1010)
x=np.arange(12).reshape(2,6)
y=x;z=x.copy()
print(y is x,'\n',y==x,'\n',z is x,'\n',z==x)

print(x)
y[0,0]=99;z[0,:]=-777
print(x,'\n',y,'\n',z)

x=np.zeros((4,5))+999 #产生全部元素为999的4乘5矩阵
print('x=\n',x)
x[0,:]=np.pi #第0行全部赋值为圆周率pi
print('x=\n',x)
x[0:2,0:2]=0 #0到1行及0到1列赋值为0
print('\nx=\n',x)
x[:,4]=np.arange(4) #第4列赋值为0,1,2,3
print('\nx=\n',x)
x[1:3,2:4]=np.array([[1,2],[3,4]]) #1到2行及2到3列用2乘2矩阵赋值
print('\nx=\n',x)

x=np.c_[0:12:4] #从0开始, 间隔4, 直到(但不包含)12为止的列向量
y=np.arange(0,12,4).reshape(-1,1) #等价语句
print('x=\n',x)
print('y=\n',y)
print('Is x and y identical? ',np.sum(x-y)==0)

x=np.c_[0:10:3j] #从0开始, 3个元素, 直到(包含)10为止的列向量
y=np.arange(0,11,10/(3-1)).reshape(-1,1) #等价语句
print('x=\n',x)
print('y=\n',y)
print('Is x and y identical? ',np.sum(x-y)==0)

x=np.r_[0:10:4] #从0开始, 间隔4, 直到(但不包含)10为止的行向量
y=np.arange(0,10,4) #等价语句
print('x=\n',x)
print('y=\n',y)
print('Is x and y equal? ',np.sum(x-y)==0)

x=np.c_[0:10:5j] #从0开始, 5个元素, 直到(包含)10为止的列向量
y=np.arange(0,12,10/(5-1))[:,np.newaxis] #等价语句
print('x=\n',x)
print('y=\n',y)

x = np.arange(-10,10,.2)
y = np.arange(-10,10,.2)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2
print(X.shape,Y.shape,Z.shape)

# Draw the surface Z over the X, Y grid as a 3-D wireframe.
# NOTE(review): axes3d is imported only for its side effect; presumably it is
# what makes projection='3d' available on older Matplotlib - confirm.
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,4))
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(X, Y, Z, rstride=10, cstride=10)
plt.show()

x = np.array([[10,2,7],[3,5,4],[45,76,100],[30,2,0]])
y=np.diag(x) #对角线元素
z=np.diag(y) #x的对角线元素组成的对角型方阵(非对角型元素为0)
print('x=\n{}\ny=diag(x)=\n{}\nz=diag(y)=\n{}'.format(x,y,z))

x = np.array([[10,2,7],[3,5,4],[45,76,100],[30,2,0]])
print('np.triu(x)=\n' ,np.triu(x)) #x上三角阵
print('np.tril(x)=\n',np.tril(x)) #x下三角阵

np.random.seed(1010)
x = np.random.randn(50,5)
va,ve=np.linalg.eig(np.corrcoef(x.T))
print('eigen values=\n{}\neigen vectors=\n{}'.format(va,ve))

import numpy as np
np.random.seed(1010)
x=np.random.randn(3,4)
print('x=\n',x)
u,d,v= np.linalg.svd(x) # singular value decomposition
print('u=\n',u)
print('D=\n',np.diag(d))
print('v=\n',v)
print('condition number=',np.linalg.cond(x)) # condition number
# Check that the condition number equals the ratio of the largest to the
# smallest singular value. The comparison uses a floating-point tolerance;
# the earlier test `difference < 10**15` was true for any input.
print('Are they equal?',np.isclose(np.max(d)/np.min(d),np.linalg.cond(x)))

Z=np.array([[1,-2j],[2j,5]])
print('Z=\n',Z)
L=np.linalg.cholesky(Z) #Cholsky分解
print('L=\n',L)          #L
L1=L.T.conj()
print('L.T.conj()=\n',L1) #L的共轭转置
print(np.sum(np.dot(L,L1)-Z)) #验证其等于Z (差的元素总和为0)

np.random.seed(1010)
A=np.random.randn(3,3)#产生一个标准正态随机数的矩阵A
print('inverse of A=\n',np.linalg.inv(A)) #A的逆
print('determinant of A=\n',np.linalg.det(A)) #行列式|A|
b=np.random.randn(3)
print('solution of Ax=b:\n',np.linalg.solve(A,b)) #解联立方程Ax=b

np.random.seed(1010)
X = np.random.randn(100,3)         #无截距项的自变量
X1=np.hstack((np.ones((100,1)),X)) #有截距项的自变量
y = X.dot(np.array([1,2,3]))+np.random.randn(100)

print('OLS without intercept:')
beta, SSR, rank, sv= np.linalg.lstsq(X,y,rcond=None)#无截距最小二乘法
print('beta={}\nSSR={}\nrank={}\nsv={}'.format(beta, SSR, rank, sv))

print('OLS with intercept:')
beta, SSR, rank, sv= np.linalg.lstsq(X1,y,rcond=None)#有截距最小二乘法
print('beta={}\nSSR={}\nrank={}\nsv={}'.format(beta, SSR, rank, sv))

A=np.eye(3)
B=np.array([[1,2],[3,4]])
print('A=\n{}\n B=\n{}'.format(A,B))
Z = np.kron(A,B) #A和B矩阵的Kronecker积
print('Z=np.kron(A,B)=\n{}\nz.shape={}'.format(Z,Z.shape))
print('trace(Z)={}, rank(Z)={}'.format(np.trace(Z),np.linalg.matrix_rank(Z)))

# Build date, time and datetime objects from their components and subtract
# two datetimes.
import datetime as dt
yr, mo, dd = 2016, 8, 30
hr, mm, ss, ms= 10, 32, 10, 11

print('dt.date(yr, mo, dd)=',dt.date(yr, mo, dd)) #prints the date in year-month-day form
print('dt.time(hr, mm, ss, ms)=',dt.time(hr, mm, ss, ms))#the 4th positional argument of time() is microseconds
d1=dt.datetime(yr, mo, dd, hr, mm, ss, ms)#date and time of day combined
print(d1)

d2 = dt.datetime(yr + 1, mo+2, dd+1, hr-1, mm, ss, ms)
print('time difference d2-d1=', d2-d1)

dates = np.array(['2016-09-01','2017-09-02'],dtype='datetime64')
print('dates=\n',dates,'\ntype of dates=',dates.dtype)
print('dates[0]=',dates[0],'dates[1]=',dates[1])

x=np.array([2+3j])
y=np.array([4-13j])
z=np.array(-20-4j)
print(x/z*y+x**2*z/y)

coef = [3.2, 12, 1, 4, -15, 28]
np.roots(coef)

p=np.poly1d([3,-4,6,2])
p1=np.poly1d([2,4])
print ('p=\n',p) #打印p
print ('p1=\n',p1) #打印p1

print ('p(1:9)=',p(np.arange(1,10,1)))#计算x取1,2,. . .,9时p的值
print ('p*p1=\n',p*p1) #打印p*p1

pi27=p.integ(m=2,k=7)
print ('p.integ(m=2,k=7)=\n',pi27)
pd1=p.deriv(m=1)
print ('p.deriv(m=1)=\n',pd1)

def mine(a, b, c):
    """Piecewise scalar function of *a*, *b* and *c*.

    Returns log(a - 2b) * c     when a > 2b,
            log(2b - a) * c**2  when a < 2b,
            pi                  when a == 2b.

    The comparisons are written for scalar arguments; wrap the function with
    np.vectorize to apply it element-wise to sequences.
    """
    if a > 2*b:
        return np.log(a-2*b)*c
    elif a < 2*b:
        return np.log(2*b-a)*c**2
    else:
        return np.pi

mine(3,7,8)

mine([3,5,9,0],[7,-5,7,8],8)

vmine = np.vectorize(mine)

print (vmine([3,5,9,0],[7,-5,7,8],[8,8,8,8]))
print (vmine([3,5,9,0],[7,-5,7,8],8))

# 第5章 (Chapter 5)

import pandas as pd

d0={'x':5,'y':989}
d1={'y':np.arange(3), 'x':([4.5,9],8),'z': (2,4,2)}
d2={'y': {'a':4,'b': 90}, 'x':([4.5],[9,8])}
z=pd.DataFrame([d0,d1,d2])
print(z)

print('Use "iloc" with indices:\n' ,z.iloc[2,0][1])
print('Use "loc" with indices and names:\n',z.loc[2,'y']['b'])
print('Use column names and indices:\n',z['z'][:1])
print('Use column names and indices:\n',z.y[1])

d3={'x':[-5,7,9,-2.5],'y':[1,-2,9.8,6.4]}
u=pd.DataFrame(d3)
print('u=\n',u)
d4=np.array([[-5,7,9,-2.5],[1,-2,9.8,6.4]]).T
v=pd.DataFrame(d4,columns=['x','y'])
print('v=\n',v)

import numpy as np
np.random.seed(1010)
name1=['X1','X2','X3','Y']
w=pd.DataFrame(np.random.randn(7,4),columns=name1)
w['sex']=['Femal']*3+['Male']*4
print(w)

print(w.head(2)) #前2行(默认值是5行)
print(w.tail(3)) #最后3行(默认值是5行)

print(w.describe())

print(w.columns)
print(w.index)

w.index=['A','B','C','D','E','F','G']
print(w[w.columns[2:]][:2]) #输出最后3个变量的头2行

print('size of w=',w.size)
print('shape of w=',w.shape)

df=pd.DataFrame({'price': [12,34,10],'tax': [0.12,0.4,0.5]})
df.columns

df.rename(columns={'price':'P','tax':'T'},inplace=True)
df.columns

# Build a small frame, then replace its row labels in two different ways.
np.random.seed(1010)
w={'X':np.random.randn(7),'Y':np.random.randn(7),'Year':np.arange(2014,2021,1)}
df=pd.DataFrame(w)
print(df)

# Overwrite the default 0..6 row labels with 10..16.
df.index=np.arange(10,17)
print(df)

# Promote the 'Year' column to be the index; the index is then named 'Year'.
df1=df.set_index('Year')
print(df1)

# Drop the index name. Assigning None is used because `del df1.index.name`
# raises on current pandas, where the name cannot be deleted.
df1.index.name=None
print(df1)

new_index = df.index[::-1]
print(df.reindex(new_index))

print(df1.reset_index())

print(df1.reset_index(drop=True))

import numpy as np
np.random.seed(1010)
name1=['X1','X2','X3','Y']
w=pd.DataFrame(np.random.randn(7,4),columns=name1)
w['sex']=['Femal']*3+['Male']*4
w.index=['A','B','C','D','E','F','G']
v=pd.DataFrame(np.random.randn(5,3),columns=['X1','X2','Y'])
print(w,'\n',v)

w.to_csv('Test.csv',index=False) #index=False意味着文件不置行名字
w.to_csv('Test2.txt',index=True) #index=True在文件中增加了一列

w1=pd.read_csv('Test.csv')
w2=pd.read_table('Test.csv',sep=',')
w3=pd.read_table('Test2.txt',sep=',')
print('w1:\n',w1,'\nw2==w1:\n',w2==w1,'\nw3:\n',w3)

df=w.to_csv(sep=';',index=True)
print(df)

# Write w and v to two sheets of one workbook. The with-block saves and
# closes the file on exit; ExcelWriter.save() no longer exists in pandas 2.
with pd.ExcelWriter('Test1.xlsx') as writer:
    w.to_excel(writer,sheet_name='Sheet1',index=True)
    # v is placed with its top-left corner at row 2, column 3 (both 0-based)
    v.to_excel(writer,sheet_name='Sheet2',startrow=2,startcol=3,index=False)

W=pd.read_excel('Test1.xlsx','Sheet1',index_col=0)
print(W)

V=pd.read_excel('Test1.xlsx','Sheet2',usecols=range(3,6),skiprows=2)
# 下式和上式等价
#V=pd.read_excel('Test1.xlsx','Sheet2',usecols='D:F',skiprows=2)
print(V)

w.to_pickle("test.pkl")
w_pkl=pd.read_pickle('test.pkl')
print(w_pkl.head(2))

w.to_json('test_index.json',orient='index')
w_index_json=pd.read_json('test_index.json')
print(w_index_json)

w.to_json('test_records.json',orient='records')
w_records_json=pd.read_json('test_records.json')
print(w_records_json.head(2))

# Write the frame in the 'table' layout before reading it back; without this
# step test_table.json does not exist and read_json fails.
w.to_json('test_table.json',orient='table')
w_table_json=pd.read_json('test_table.json',orient='table')
print(w_table_json.tail(2))

w.to_hdf('data.h5', key='w', mode='w')
w_h5=pd.read_hdf('data.h5', key='w')
print(w_h5.tail(3))

w.to_parquet('w.parquet.gzip', compression='gzip')
w_parq=pd.read_parquet('w.parquet.gzip')
print(w_parq.head(3))

import feather
feather.write_dataframe(w, 'data.feather')
w_feather = feather.read_dataframe('data.feather')
print(w_feather.head(2))

w.to_stata('test.dta')
w_dta=pd.read_stata('test.dta')
print(w_dta.head(3))

print(w[['X1','Y']][:2]) #X1和Y的前2行
print(w[:2]) #所有变量的前两行
print(w[w.columns[3:]][-3:]) #第3个变量及后面变量的最后3行

print(w.sex[:4]) #sex变量的前4个元素

print(w.loc['A':'C','X3':'sex']) #index'A'到'C', 变量'X3'到'sex'
print(w.loc[['G','A','F'],['sex','Y','X1']]) # 随意选择的行名和变量名

print(w.iloc[[1,0,3],[0,4,2]])
print(w.iloc[[3,2,0],-3:])
print(w.iloc[:2,-3:])

import pandas as pd
np.random.seed(8888)
name1=['X1','X2','X3','Y']
u=pd.DataFrame(np.random.randn(7,4),columns=name1)
print('u.head(2)=\n',u.head(2))
print('u.shape=',u.shape)
v=pd.DataFrame(np.random.randn(5,3),columns=['X1','X2','Y'])
print('v.head(2)=\n',v.head(2))
print('v.shape=',v.shape)
# 3x4 frame with row labels s/u/t and column labels w/u/v/x
x=pd.DataFrame(np.random.randn(3,4),index=['s','u','t'])
x.columns=['w','u','v','x']
print('x.head(2)=\n',x.head(2))
print('x.shape=',x.shape)  # label now names the object actually printed
s=pd.DataFrame({'sex':['Male','Female','Male','Female','Male'],'X1': range(5)})
print('s.head(2)=\n',s.head(2))
print('s.shape=',s.shape)
# 7x4 frame of standard normal draws plus a categorical 'sex' column
np.random.seed(1010)
name1=['X1','X2','X3','Y']
w=pd.DataFrame(np.random.randn(7,4),columns=name1)
w['sex']=['Female']*3+['Male']*4  # spelled 'Female' so it agrees with the other frames

np.random.seed(1010)
df=pd.DataFrame(np.random.randn(7,2),columns=('X1','X2'))
df['sex']=['Female']*4+['Male']*3
print(df,'\n',df.T) #或 df.transpose()

print(s+w)

print('w*v/u=\n',w*v/u,'\nw**u=\n',w**u)

print('v**2+v*5+2*np.exp(v)=\n',v**2+v*5+2*np.exp(v)) #简单运算
print('v-v.iloc[0]=\n',v-v.iloc[0]) #v的每一行减去第0行
print('x-x[index=t]=\n',x-x.loc['t']) #x的每一行减去标签为't'的行
print('x.T.dot(x)=\n',x.T.dot(x)) #用numpy的矩阵转置及矩阵乘法函数

print(x.sum(axis=0),"\n",x.sum(axis=1),"\n",x.mean(axis=0))
print(x.std(axis=0),"\n",x.prod(axis=0),"\n",x.count(axis=0),"\n",x.cumsum(axis=0))

# 7x4 frame of standard normal draws, a 'sex' column and row labels A..G
np.random.seed(1010)
w=pd.DataFrame(np.random.randn(7,4),columns=['X1','X2','X3','Y'])
# Must be spelled 'Female': the selections that follow compare w.sex with 'Female'
w['sex']=['Female']*3+['Male']*4
w.index=['A','B','C','D','E','F','G']

print(w.loc[(w['X1']<0) | (w.sex=='Female'),['sex','X1','Y','X3']])

w[(w['X1']<0) | (w.sex=='Female')][['sex','X1','Y','X3']]

print(w.sort_values(by='X1', ascending=False))

print(w.sort_values(by=['sex','Y'], ascending=[False,True]))

np.random.seed(1010)
Grade = {'score': np.random.choice(range(30,100),size=6)}
df = pd.DataFrame(Grade)
print(df.T)

df.loc[df.score<60,'result']='fail'
df.loc[df.score>=60,'result']='pass'
print(df)

df.insert(loc=0,column='name', value=['Tom','John','Jane','Ted',"Bob",'Lee'])
print(df)

df.insert(3,'extra',0)
print(df)

df.insert(3,'extra',np.arange(6)[::-1],allow_duplicates=True)
print(df)

v=np.random.choice(np.arange(60,100),(12,3))
name=np.repeat(['Tom','Bob','June'],4).reshape(-1,1)
year=np.array([2014,2015,2016,2017]*3).reshape(-1,1)
dd=np.hstack((name,year,v))
u=pd.DataFrame(data=dd,columns=['name','year','Math','Pys','Lit'])
# Use (name, year) as a two-level row index
u3=u.set_index(['name','year'])
print('u=\n',u,'\nu3=\n',u3)

u3.drop(['Lit','Math'],axis=1) #等价于 u3.drop(columns=['Lit','Math'])

print(u.drop([0,4,3]))

u3.drop(index='2014',level=1) #这里level=1标明'2014'是第1列index

u3.drop(index='June',level=0)

print(u.drop(index=[0,4,3],columns='Math'))

u3.rename_axis([None,None],axis=0).drop(index='June',columns='Math')

Df=pd.DataFrame({'Math':[67,83,98],'Pys': [98,25,37]},index=['Tom','Bob','June'])
Df

new_index=['Tom', 'June', 'John']
Df.reindex(new_index)

Df.reindex(index=new_index,columns=['Math','Hist'],fill_value=999)

Gd=np.array([[87,79,80],[98,65,72],[69,88,86]])
w=pd.DataFrame(data=Gd,index=['Tom','Bob','June'],columns=['Math','Phy','Lit'])
print(w)

w1=w.stack() #等同于w.stack(0)
print(w1)

w2=pd.DataFrame(w1)
w2.reset_index(inplace=True)
w2.columns=('name','class','grade')
w2

w1.unstack() #等同于w1.unstack(level=-1)

w1.unstack(0)

v=np.random.choice(np.arange(60,100),(12,3))
name=np.repeat(['Tom','Bob','June'],4).reshape(-1,1)
year=np.array([2014,2015,2016,2017]*3).reshape(-1,1)
dd=np.hstack((name,year,v))
u=pd.DataFrame(data=dd,columns=['name','year','Math','Pys','Lit'])
print(u)

u.pivot(index ='year',columns ='name',values =['Math','Pys','Lit'])

u1=pd.DataFrame(u.set_index(['name','year']).stack())
u1.reset_index(inplace=True)
u1.columns=['name','year','class','grade']
print(u1)

Tom=u1[u1['name']=='Tom'].pivot(index='year', columns='class',values='grade')
print(Tom)

Tom.rename_axis(None,axis=1).rename_axis(None,axis=0)

Tom.rename_axis(None,axis=1).reset_index('year')

y2014=u1[u1['year']=='2014'].pivot(index='name',columns='class',values='grade')
y2014.reset_index(level="name").rename_axis(None,axis=1)

Math=u1[u1['class']=='Math'].pivot(index='year',columns='name',values='grade')
Math.reset_index(level='year').rename_axis(None,axis=1)

df1=pd.DataFrame({'X1': [1, 3., 2],'X2': [-2., -1, 9]},index=[0, 1, 2])
df2=pd.DataFrame({'X1': [1/2, 3.5, 12, 43],'X2': [6., -5, 4, 7]},index=[0, 1, 2, 3])
print(df1,'\n',df2)

print(pd.concat((df1,df2))) #相当于 pd.concat((df1,df2),axis=0)

print(pd.concat((df1,df2),ignore_index=True))

df3=pd.DataFrame({'X3': ['Male', 'Female', 'Female', 'Male'],'X4': ['H','P','G', 'H']},index=[5, 6, 7, 8])
print(df3)

print(pd.concat((df1,df2,df3),axis=1))

print(pd.concat((df1,df2,df3),ignore_index=True,axis=1))

s1=pd.Series([1, 2, 3], name='H')
s2=pd.Series([6, 5, 4], name='A')
s3=pd.Series([8, 9, 7], name='C')
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

# Three small grade tables used by the merge/join examples
df1=pd.DataFrame({'id': [1,3,5,2],'m_grade': [98,60,81,70]})
df2=pd.DataFrame({'id':[1,2,5,6],'s_grade':[50,90,78,60],'m_grade':[99,75,60,78]})
df3=pd.DataFrame({'xid': [6,1,2,4],'c_grade': [20,65,83,98]})
print(df1,'\n',df2,'\n',df3)  # only the three frames defined above are printed

pd.merge(df1,df2[['id','s_grade']])#默认 how='inner'

pd.merge(df1,df2[['id','s_grade']],how='outer')#默认 how='inner'

pd.merge(df1,df2,left_on='id', right_on='id',suffixes=('_first', '_second'))

pd.merge(left=df2,right=df3,left_on='id',right_on='xid',how='outer')

pd.merge(left=df2,right=df3,left_on='id',right_on='xid',how='left')

df2.join(df3,how='inner')

df1.join(df2,lsuffix='_l', rsuffix='_r',how='inner')

df1.join(df2,on='id', lsuffix='_l', rsuffix='_r',how='inner')

df1.join(df2,on='id', lsuffix='_l', rsuffix='_r',how='outer')

df1=pd.DataFrame({'id': [1,3,5,2],'m_grade': [98,60,81,70]})
df2=pd.DataFrame({'id':[1,2,5,6],'s_grade':[50,90,78,60],'m_grade':[99,75,60,78]})

df1.append(df2,sort=True,ignore_index=False)

df1.append(df2,sort=False,ignore_index=True)

# Append two rows given as a list of dicts
d = [{'id':6,'m_grade':100},{'id':8,'m_grade':50}]
print(df1.append(d,ignore_index=True))

s = pd.Series([6, 100], index=['id', 'm_grade'])
df1.append(s,ignore_index=True)

np.random.seed(1010)
s=pd.Series(np.random.randn(4),index=['a','b','c','d'])
print('s=\n',s)
d=pd.Series({'a':2.7,'b':-3.6})
print('d=\n',d)

print('s[:3]=\n',s[:3])
print('s[[0,3]]=\n',s[[0,3]])

print("s[s.index>'b'=\n",s[s.index>'b'])
print("s[(s>-1.2) & (s<1.5)]=\n",s[(s>-1.2) & (s<1.5)])

print('s*2+np.exp(s)-abs(s**3)=\n',s*2+np.exp(s)-abs(s**3))
print('s[:2]+s[1:]=\n',s[:2]+s[1:])

import pandas as pd
import numpy as np
import matplotlib
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.random.seed(1010)
dates = pd.date_range('1989-01', periods=100, freq='M')
s1=pd.Series(np.random.randn(100).cumsum(), index=dates)
fig=plt.figure(figsize=(15,4))
plt.plot(s1)

import pandas as pd
diamonds=pd.read_csv("diamonds.csv")
print(diamonds.head()) #打印前几行
print('diamonds.columns=\n',diamonds.columns) #变量名字
print('sample shape=', diamonds.shape) #样本形状(行, 列数目)

print(diamonds.iloc[:,:7].describe()) #对除最后3个之外的数量变量进行描述

cut=diamonds.groupby("cut") # group the rows by the levels of variable cut
# Median of every numeric variable within each level of cut; numeric_only
# keeps the string-valued columns out of the aggregation.
print('cut.median()=\n',cut.median(numeric_only=True))

print('Cross table=\n',pd.crosstab(diamonds.cut,diamonds.color))

np.random.seed(1010)
n=1000
x=pd.Series(np.random.randn(n),
index=pd.date_range('1/1/2014',periods=n,freq='D'))
x=x.cumsum()
x5=pd.DataFrame(np.random.randn(n,5),index=x.index,
columns=['One','Two','Three','Four','Five'])
x5=x5.cumsum()
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,4))
x.plot(ax=axes[0])
x5.plot(ax=axes[1])

xw=pd.get_dummies(adult['workclass']).sum(axis=0) #转换成哑元再求和
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,3)) #两个图的排列
xw.plot(kind='barh',ax=axes[0]) #条形图
xw.plot(kind='pie',ax=axes[1]) #饼图

w=adult
M=w['hours_per_week'][w['sex']=='Male']
F=w['hours_per_week'][w['sex']=='Female']
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,4))
w[['hours_per_week']].plot(kind='hist',orientation='horizontal',
alpha=0.5,ax=axes[0])
M.plot(kind='hist',alpha=0.5,ax=axes[1],label='Male')
F.plot(kind='hist',alpha=0.5,ax=axes[1],label='Female')
plt.legend()

np.random.seed(1010)
fig,axes=plt.subplots(nrows=1,ncols=3,figsize=(12,4))
x3=pd.DataFrame(np.random.rand(10,3),columns=['ABC','NBC','CBS'])
x3.plot(kind='bar',ax=axes[0])
x3.plot(kind='bar',stacked=True,ax=axes[1])
x3.plot(kind='barh',stacked=True,ax=axes[2])#水平叠放条形图

diamonds=pd.read_csv("diamonds.csv")
diamonds.boxplot(column='carat',by='cut',figsize=(12,4))

diamonds.boxplot(column=['price'],by=['color','cut'],figsize=(12,5))

x=np.sin(np.arange(0,5,.2))+1
y=np.cos(np.arange(0,5,.2))+1
w=np.stack((x,y),axis=1)
w=pd.DataFrame(w,columns=['sin','cos'])
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,3.5))
w.plot(kind='area',ax=axes[0])
w.plot(kind='area',stacked=False,ax=axes[1])

X=pd.DataFrame(np.random.rand(60, 4), columns=['One', 'Two', 'Three','Four'])
X.plot(kind='scatter',x='One',y='Two',c='Four', s=X.Three*500,figsize=(12,4))

第6章

import matplotlib
%matplotlib inline
#如果输入上面一行, 则会产生在输出结果之间的插图(不是独立的图)
import matplotlib.pyplot as plt

x=np.arange(0.1,4,.01)
plt.figure(figsize=(20,7))
plt.plot(x,np.sin(x*50)/x,linewidth=3) #实线形式的曲线, 默认'b-'
plt.plot(x,np.zeros(len(x)),'g--',linewidth=2) #虚线形式的绿色水平线
plt.title('Curve $y=\sin(50x)/x$',fontsize=40,y=1.04)
plt.xlabel('$x$',fontsize=30)
plt.ylabel('$y=\sin(50x)/x$',fontsize=30)
plt.grid(True)
plt.savefig('mplsin.pdf') #存入文件

x=np.arange(0.,2.,.05)
plt.figure(figsize=(20,7))
plt.plot(x,np.cos(x),'r:',x,np.cos(x**2),'b^',
x,np.cos(x**3),'g-.',x,np.cos(x**4),'mo',
linewidth=15,markersize=30)
plt.ylim((-1.5,1.5))
plt.title('4 curves in one figure',fontsize=30)

x=np.arange(0.,2.,.05)
plt.figure(figsize=(20,7))
plt.plot(x,np.cos(x),'r:',linewidth=15,markersize=30)
plt.plot(x,np.cos(x**2),'b^',markersize=30)
plt.plot(x,np.cos(x**3),'g-.',linewidth=15)
plt.plot(x,np.cos(x**4),'mo',markersize=30)
plt.ylim((-1.5,1.5)) #确定图形的纵向空间范围
plt.title('4 curves in one figure',fontsize=30)

import scipy.stats as stats
# Six different plot types arranged on a 2x3 grid.
# Labels containing TeX commands are raw strings so that, e.g., '\tau' is not
# read as a TAB character followed by 'au'.
plt.figure(figsize=(27,9))

# Panel 1: simulated price path
plt.subplot(2, 3, 1) # 1st cell of the 2x3 grid
y = 50*np.exp(.0004 + np.cumsum(.01*np.random.randn(100)))
plt.plot(y) # default: solid blue line
plt.xlabel(r'time ($\tau$)') # x-axis label
plt.ylabel('Price',fontsize=16) # y-axis label
plt.title(r'Random walk: $d\ln p_t = \mu dt + \sigma dW_t$',fontsize=16)

# Panel 2: horizontal bar chart
y = np.random.rand(5)
x = np.arange(5)
plt.subplot(2, 3, 2) # 2nd cell of the 2x3 grid
colors = ['#FF0000','#FFFF00','#00FF00','#00FFFF','#0000FF'] # colour codes
plt.barh(x, y, height = 0.5, color = colors,
         edgecolor = '#000000', linewidth = 5) # horizontal bars (barh)
plt.title('Bar plot')

# Panel 3: pie chart
y = np.random.rand(5)
y = y / sum(y)
y[y < .05] = .05
plt.subplot(2, 3, 3)
plt.pie(y) # pie chart
plt.title('Pie plot')

# Panel 4: scatter plot; the second column is built from the first
z = np.random.randn(100, 2)
z[:, 1] = 0.5 * z[:, 0] + np.sqrt(0.5) * z[:, 1]
x = z[:, 0]
y = z[:, 1]
plt.subplot(2, 3, 4)
plt.scatter(x, y)
plt.title('Scatter plot')

# Panel 5: histogram with the standard normal density rescaled to its height
plt.subplot(2, 3, 5)
x = np.random.randn(100)
plt.hist(x, bins=30, label='Empirical') # histogram
xlim = plt.xlim()
ylim = plt.ylim()
pdfx = np.linspace(xlim[0], xlim[1], 200)
pdfy = stats.norm.pdf(pdfx) # standard normal density from scipy
pdfy = pdfy / pdfy.max() * ylim[1]
plt.plot(pdfx, pdfy,'r-',label='PDF')
plt.ylim((ylim[0], 1.2 * ylim[1]))
plt.legend()
plt.title('Histogram')

# Panel 6: four cumulative-sum series
plt.subplot(2, 3, 6)
x = np.cumsum(np.random.randn(100,4), axis = 0)
plt.plot(x[:,0],'b-',label = 'Series 1')
plt.plot(x[:,1],'g-.',label = 'Series 2')
plt.plot(x[:,2],'r:',label = 'Series 3')
plt.plot(x[:,3],'h--',label = 'Series 4')
plt.legend()
plt.title('Random lines')

fig=plt.figure(figsize=(10,3))
f1=fig.add_subplot(1,2,1)
x=np.linspace(0.1,1)
f1.plot(x,stats.chi2.pdf(x,1),'-', label='$\chi^2(1)$')
f1.plot(x,stats.chi2.pdf(x,2),'-.', label='$\chi^2(2)$')
f1.plot(x,stats.chi2.pdf(x,3),'--', label='$\chi^2(3)$')
f1.plot(x,stats.chi2.pdf(x,4),':', label='$\chi^2(4)$')
f1.set_title('$\chi^2$ density functions')
f1.legend()
f2=fig.add_subplot(1,2,2)
x=np.linspace(0.01,.5,50)
f2.plot(x,stats.f.pdf(x,1,2),'-', label='$F(1,2)$')
f2.plot(x,stats.f.pdf(x,2,1),'-', label='$F(2,1)$')
f2.plot(x,stats.f.pdf(x,2,2),'-.', label='$F(2,2)$')
f2.plot(x,stats.f.pdf(x,1,1),'--', label='$F(1,1)$')
f2.plot(x,stats.f.pdf(x,1,3),':', label='$F(1,3)$')
f2.set_title('$F$ density functions')
f2.legend()

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
# Surface z = sin(sqrt(x^2 + y^2)) drawn as a triangulated 3-D surface
X=np.arange(-5,5,0.25)
Y=np.arange(-5,5,0.25)
X,Y=np.meshgrid(X,Y) # X has identical rows; Y is the transpose of X
Z=np.sin(np.sqrt(X**2+Y**2))
x=X.ravel() # flatten each matrix into one long vector
y=Y.ravel()
z=Z.ravel()
fig=plt.figure()
# Create the 3-D axes explicitly; Figure.gca() no longer accepts keyword
# arguments in recent Matplotlib releases.
ax=fig.add_subplot(111,projection='3d')
ax.plot_trisurf(x,y,z,cmap=cm.jet,linewidth=0.3)

z=np.linspace(-1,1,1000)
x=z*np.sin(100*z)
y=z*np.cos(100*z)
plt.figure(figsize=(30,10))
plt.axes(projection='3d')
plt.plot(x,y,z,'-b')

第7章

import scipy.stats as stats
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

from scipy import io as sio
np.random.seed(789)
data = np.random.randn(5, 4)
sio.savemat("randn.mat", {'normal': data})
data = sio.loadmat('randn.mat', struct_as_record=True)
data['normal']

plt.figure(figsize=(18,4))
plt.subplots_adjust(top=1.5) #调节每个图四周的空间
x=np.arange(-4,4,.01)
plt.subplot(2,2,1)
plt.plot(x,stats.norm.cdf(x))
plt.title('cdf of $N(0,1): \Phi(x)$')
plt.subplot(2,2,2)
plt.plot(x,stats.norm.pdf(x))
plt.title('pdf of $N(0,1): \phi(x)$')
plt.subplot(2,2,3)
plt.plot(x,stats.norm.sf(x))
plt.title('sf of $N(0,1): 1-\Phi(x)$')
x=np.arange(.01,.99,.01)
plt.subplot(2,2,4)
plt.plot(x,stats.norm.ppf(x))
plt.title('ppf of $N(0,1): \Phi^{-1}(x)$')

np.random.seed(999)
stats.norm.rvs(size=10,loc=5,scale=2)

stats.norm.rvs(size=10,random_state=999,loc=5,scale=2)

# Freeze the normal distribution with loc=5, scale=2 into fr; the frozen
# object then supplies every related quantity without repeating the parameters
fr=stats.norm(loc=5,scale=2)
print('rvs(size=3,random=999): %s \nmean: %s \nstd: %s'
      '\ncdf(5.97): %s\npdf([-0.5,2.96]): %s \nkwds %s'
      % (fr.rvs(size=3,random_state=999),fr.mean(),fr.std(),
         fr.cdf(5.97),fr.pdf([-0.5,2.96]),fr.kwds))

stats.norm.isf([0.1,0.05,0.025,0.01,0.001])
#等价代码: -stats.norm.ppf([0.1,0.05,0.025,0.01,0.001])

stats.t.isf([0.1,0.05,0.025,0.01,0.001],[[2],[5],[500]])
#等价代码: -stats.t.ppf([0.1,0.05,0.025,0.01,0.001],[[2],[5],[500]])

from scipy.stats import rv_continuous
class exponential_gen(rv_continuous):
    '''Exponential distribution with rate L.

    Only the density and the distribution function are supplied; the other
    methods are derived from them by rv_continuous.  Construct the
    distribution with a=0 so that its support is x >= 0.
    '''
    def _pdf(self,x,L):
        # density L*exp(-L*x)
        return L*np.exp(-x*L)
    def _cdf(self,x,L):
        # distribution function 1-exp(-L*x)
        return 1-np.exp(-x*L)

# a=0 sets the lower end of the support. Without it the support is the whole
# real line and mean/var/std would integrate the density over x<0, where
# L*exp(-L*x) grows without bound.
Exp=exponential_gen(a=0,name='exponential')
print('Exp.cdf:\n',Exp.cdf(np.arange(1,4,.3),.5))
print('Exp.pdf:\n',Exp.pdf(np.arange(1,4,.3),.6))
print('Exp.ppf:\n',Exp.ppf([0.1,0.05,0.01],.6))
print('Exp.rvs:\n',Exp.rvs(.6,size=7))
print('Exp.mean(.6):\n',Exp.mean(.6), Exp.var(.7),Exp.std(.7))

from scipy.stats import rv_continuous
class gaussian_gen(rv_continuous):'''Gaussian distribution'''def _pdf(self,x,m,s):return np.exp(-(x-m)**2/2./s**2)/np.sqrt(2.0*s**2*np.pi)

Gaussian=gaussian_gen(name='gaussian')
print('Gaussian.cdf:\n',Gaussian.cdf(np.arange(-4,4,1),.01,3))
print('Gaussian.pdf:\n',Gaussian.pdf(np.arange(-4,4,1),0.01,2))
print('Gaussian.rvs:\n',Gaussian.rvs(0.001,2,size=3))
print('Gaussian.mean:\n',Gaussian.mean(0.001,2),Gaussian.var(0.1,2))
print('Gaussian.ppf:\n',Gaussian.ppf([.1,.2,.5,.9],2,4))

print(Gaussian.cdf([2,0.1],0,2))

import math
from scipy.stats import rv_discrete
from scipy.special import gammaln
class pois_gen(rv_discrete):
    '''Poisson distribution with mean m.'''
    def _pmf(self,k,m):
        # exp(-m)*m**k/k!, evaluated on the log scale. rv_discrete passes k
        # as an array, which math.factorial cannot handle; gammaln(k+1)
        # equals log(k!) elementwise and does not overflow for large k.
        return np.exp(k*np.log(m)-m-gammaln(k+1))

x=np.arange(1,6,1)
p=np.array([.1,.2,.3,.3,.1])
mydf=rv_discrete(name='mydf',values=(x,p))

plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.xlim((0,6))
plt.ylim((0,.4))
plt.plot(x,mydf.pmf(x),'bo',ms=12,mec='r')
plt.title('PMF')
#上面ms为markersize简写, mec为markeredgecolor简写
plt.vlines(x,0,mydf.pmf(x),colors='k',lw=5)
plt.subplot(1,2,2)
plt.xlim((0,5))
plt.ylim((0,1.5))
plt.step(x,mydf.cdf(x),'b--',lw=4)
plt.title('CDF')

from scipy import integrate
f = lambda x: 6*x**3-2*x**2+x-1 #定义被积函数f
integrate.quad(f, 0, 4)   #做f的从0到4的积分

def g(t, n, x):
    """Integrand exp(-x*t) / t**n."""
    decay = np.exp(-x*t)
    return decay / t**n


def gint(n, x):
    """Integrate g(t, n, x) over t from 1 to infinity; return the value only."""
    value, _abserr = integrate.quad(g, 1, np.inf, args=(n, x))
    return value


# Elementwise version of gint, so n and/or x may be sequences
vec_gint = np.vectorize(gint)

vec_gint(5, [4.3,3.1,0.2,0.21])

integrate.quad(lambda x: gint(4, x), 0, np.inf)

integrate.dblquad(lambda x, y: x*y, 0, 1/3.,lambda x: 0,lambda x: 1-3.*x)

def f(x, y):
    """Integrand x*y."""
    product = x*y
    return product


def by():
    """Integration limits of y (the outer variable)."""
    lower, upper = 0, 1/3.
    return [lower, upper]


def bx(y):
    """Integration limits of x (the inner variable) for a given y."""
    lower, upper = 0, 1-3.*y
    return [lower, upper]


integrate.nquad(f, [bx, by])

def f(t, x):return np.exp(-x*t) / t**4
integrate.nquad(f, [[1, np.inf],[0, np.inf]])

第三部分

第8章

(np.arange(2.5),np.arange(3,8),np.arange(2,-4.6,-.5))

def Arange(x,y=None,step=None):
    """Hand-written counterpart of np.arange that returns a list.

    Arange(x)         -> 0, 1, 2, ... while the value is < x
    Arange(x, y)      -> from x towards y (exclusive) in steps of +1 or -1
    Arange(x, y, step)-> from x towards y (exclusive) in steps of `step`

    Prints 'Error1' and returns [] when the sign of `step` does not match
    the direction from x to y; prints 'Error2' (and returns None) when a
    step is given without y.
    """
    if y is None and step is None:
        z=[]
        i=0
        while i< x:
            z.append(i)
            i+=1
        return(z)
    elif y is not None and step is None:
        z=[]
        i=x
        if y>x:
            while i< y:
                z.append(i)
                i+=1
        else:
            while i>y:
                z.append(i)
                i-=1
        return(z)
    elif y is not None and step is not None:
        z=[]
        i=x
        if y>x and step>0:
            while i< y:
                z.append(i)
                i+=step
        elif y<x and step<0:
            while i> y:
                z.append(i)
                i+=step
        else:
            print('Error1')
        return(z)
    else:
        print('Error2')
(Arange(2.5),Arange(3,8),Arange(2,-4.6,-.5),Arange(2,-5))

def Length(x):
    """Return the number of elements of x, or 1 if x is not iterable."""
    if not hasattr(x, '__iter__'):
        return 1
    else:
        n=0
        for i in x:
            n+=1
        return n
x=[-6.3830e-01, -9.0400e-02, 2.2958e+00, -6.2140e-01, 8.5560e-01,
-9.6000e-03, 5.4290e-01, 3.4290e-01, 1.5519e+00, -8.4850e-01,
1.0000e-04, 1.9846e+00, 1.2267e+00, 1.6071e+00, 2.4000e-03,
6.4780e-01, 2.4260e-01, -1.5200e+00, 2.4870e-01, -7.4300e-01,
-5.8180e-01, -1.6385e+00, -4.3300e-02, 1.5950e+00, -7.1700e-01]
y='I am a student'
z={1:[2,4,6],'b':'list'}
(Length(x),Length(-2.3),Length('a'),Length(y),Length(z))

def Dim(x):
    """Return (n1, n2): the length of x and the length of its first element.

    n2 is 1 when the first element is not iterable, so a plain vector of
    numbers is reported as (n, 1).  x must be indexable and non-empty.
    """
    n1=Length(x)
    n2=0
    if hasattr(x[0], '__iter__'):
        for i in x[0]:
            n2+=1
    else:
        n2=1
    return(n1,n2)
# 测试
import numpy as np
x=np.random.randn(137,560)
Dim(x),Dim([[3,2],[3,4],[3,3]]),Dim([2,-1,5]),Dim(np.random.randn(12))

def DimList(x):
    """Return (k, n): the number of elements of x and a list of their lengths.

    A non-iterable element counts as length 1.
    """
    n=[]
    k=0
    for i in x:
        j=0
        k+=1
        if not hasattr(i, '__iter__'):
            n.append(1)
        else:
            for m in i:
                j+=1
            n.append(j)
    return (k,n)
# Test:
x=[{2:1,'s':3},'a',['I am', 'a', 'student'],'probability',(1,'a')]
DimList(x)

def SumV(x):
    """Sum of the elements of a vector."""
    s=0
    for i in x:
        s=s+i
    return s

def SumAll(x):
    """Sum of all elements of a two-level list or a matrix."""
    z=0
    for i in x:
        if Length(i)>1:
            for j in i:
                z+=j
        else:
            z+=i
    return z

def Mean(x):
    """Mean of a vector."""
    n=Length(x)
    return SumV(x)/n

def Prod(x):
    """Product of the elements of a vector."""
    s=1
    for i in x:
        s=s*i
    return s

def Var(x):
    """Sample variance (divisor n-1)."""
    s=SumV((np.array(x)-Mean(x))**2)/(Length(x)-1)
    return s

def Sd(x):
    """Sample standard deviation."""
    s=Var(x)**(.5)
    return s

def Max(x):
    """Largest element of a non-empty vector."""
    m=x[0]
    for i in x:
        if i>m:
            m=i
    return m

def Min(x):
    """Smallest element of a non-empty vector."""
    m=x[0]
    for i in x:
        if i<m:
            m=i
    return m

def CumSum(x):
    """Cumulative sums of a vector, as a list."""
    C=[]
    c=0
    for i in x:
        c=c+i
        C.append(c)
    return C

def CumMean(x):
    """Cumulative means of a vector, as a list."""
    M=[]
    c=0
    k=0
    for i in x:
        k+=1
        c=c+i
        M.append(c/k)
    return M

def Scale(x):
    """Standardise a vector: (element - sample mean) / sample sd."""
    s=[]
    m=Mean(x)
    sd=Sd(x)
    for i in x:
        s.append((i-m)/sd)
    return s
# Test
x=[1,4,2,6,-1,.9,-.3]
print('sum =',SumV(x),'prod =',Prod(x),'mean =',Mean(x),'sd =',Sd(x),'var =',Var(x),'\nMax =',Max(x),'Min =',Min(x),'\ncum sum =\n', CumSum(x),'\ncum mean =\n', CumMean(x),'\nScale =\n',Scale(x),'\nSumAll=', SumAll([x,x]))

def FApply(M,axis=0,fun=Mean):
    """Apply fun to every column (axis=0) or every row (axis=1) of M.

    M is converted to a numpy array first.  Returns a list with one result
    per column/row; any other axis value yields an empty list.
    """
    R=[]
    X=np.array(M)
    r,c=Dim(X)
    if axis==0:
        j=0
        while j<c:
            R.append(fun(X[:,j]))
            j+=1
    if axis==1:
        i=0
        while i<r:
            R.append(fun(X[i,:]))
            i+=1
    return R

x=[[-1.,4.,2.,7.],[12,6,-1,.9],[5,16,-11,5.9]]
print('\ncolumn sum:',FApply(x,0,SumV),'\nrow sum:',FApply(x,1,SumV))
print('\ncolumn max:',FApply(x,0,Max),'\nrow max:',FApply(x,1,Max))
print('\ncolumn min:',FApply(x,0,Min),'\nrow min:',FApply(x,1,Min))
print('\ncolumn mean:',FApply(x,0,Mean),'\nrow mean:',FApply(x,1,Mean))
print('\ncolumn var:',FApply(x,0,Var),'\nrow var:',FApply(x,1,Var))
# Standard deviations are computed with Sd (the line is labelled 'sd')
print('\ncolumn sd:',FApply(x,0,Sd),'\nrow sd:',FApply(x,1,Sd))
print('\ncolumn scale:\n',Trans(np.array(FApply(x,0,Scale))),'\nrow scale:\n',np.array(FApply(x,1,Scale)))
print('\ncolumn cummean:\n',Trans(np.array(FApply(x,0,CumMean))),'\nrow cummean:\n',np.array(FApply(x,1,CumMean)))
print('\ncolumn cumsum:\n',Trans(np.array(FApply(x,0,CumSum))),'\nrow cumsum:\n',np.array(FApply(x,1,CumSum)))

def AllConst(c,n,m=None):
    """Return a numpy array filled with the constant c.

    With m omitted the result is a vector of length n, otherwise an
    n-by-m matrix.
    """
    if m is None:
        z=[]
        i=0
        while i<n:
            z.append(c)
            i+=1
    else:
        z=[]
        i=0
        while i<n:
            z0=[]
            j=0
            while j<m:
                z0.append(c)
                j+=1
            z.append(z0)
            i+=1
    return np.array(z)
print(AllConst(1.,3,5),'\n',AllConst(0.,10))

def DM(x):
    """Return the main diagonal of matrix x as a numpy array.

    For a non-square matrix a notice is printed and the diagonal of the
    leading square part (min(rows, columns) elements) is returned.
    """
    n,m=Dim(x)
    if n!=m:
        print('Not square matrix')
    if n>m:
        w=m
    else:
        w=n
    z=[]
    i=0
    while i<w:
        z.append(x[i,i])
        i+=1
    return np.array(z)
# Test
np.random.seed(1010)
y=np.random.randn(5,13)
DM(y)

def Diag(n):
    """Return the n-by-n identity matrix as a numpy array of 0s and 1s."""
    i=0
    Z=[]
    while i<n:
        j=0
        z=[]
        while j<n:
            if i==j:
                z.append(1)
            else:
                z.append(0)
            j+=1
        i+=1
        Z.append(z)
    return np.array(Z)
# Test:
Diag(5)

def MD(x):
    """Return the square matrix with vector x on the diagonal, zeros elsewhere."""
    n=Length(x)
    i=0
    Z=[]
    while i<n:
        j=0
        z=[]
        while j<n:
            if i==j:
                z.append(x[i])
            else:
                z.append(0)
            j+=1
        i+=1
        Z.append(z)
    return np.array(Z)
# Test:
np.random.seed(1010)
y=np.random.randn(4)
MD(y)

# Ages by name
a={'Ray':2,'Tom': 64,'Babara':99,'Ted':47,'John':53,'Jane':30,'Titi':21,
'Baby': 10,'Lucy': 5}

# Split the names into three groups: under 30, 30 to 69, 70 and over
old=[];middle=[];young=[]
for k in a:
    if a[k]<30:
        young.append(k)
    elif a[k]<70:
        middle.append(k)
    else:
        old.append(k)
print('old:',old,'middle:',middle,'young:',young)

def Straighten(y):
    """Flatten matrix y row by row into a 1-D numpy array.

    When Dim reports a single column (e.g. y is already a vector), y is
    returned unchanged.
    """
    z=[]; k=0
    if Dim(y)[1]==1:
        return y
    while (k<Dim(y)[0]):
        z.extend(y[k,:])
        k+=1
    return np.array(z)
# Test:
np.random.seed(1010)
y=np.random.choice(range(50),24,replace=True)
Y=y.reshape(4,6)
Straighten(Y),Straighten(y)

def Reshape(y,r=3,c=8):
    """Rearrange the elements of y, row by row, into an r-by-c numpy array.

    Prints 'Wrong dimension!' and returns None when y does not hold exactly
    r*c elements.
    """
    if Prod(Dim(y))!=r*c:
        return print('Wrong dimension!')
    if Dim(y)[1]!=1:
        y1=Straighten(y)
    else:
        y1=y
    m=[]
    k=1
    while (k<=r):
        m.append(y1[(k-1)*c:k*c])  # k-th run of c consecutive elements
        k+=1
    return np.array(m)
# Test
Reshape(Y),Reshape(y) #使用前面生成的Y和y

def Stack(x,y,axis=0):
    """Join two numpy matrices via Python lists.

    axis=1 works like hstack (side by side, equal row counts required);
    axis=0 works like vstack (one above the other, equal column counts
    required).  Returns the string 'Dimension wrong' on a shape mismatch and
    None for any other axis value.
    """
    rx,cx=Dim(x)
    ry,cy=Dim(y)
    if axis==1:
        if rx!=ry:
            return('Dimension wrong')
        i=0
        res=[]
        while (i<=rx-1):
            res0=[]
            res0.extend(x[i,:].tolist())
            res0.extend(y[i,:].tolist())
            res.append(res0)
            i+=1
        return np.array(res)
    elif axis==0:
        if cx!=cy:
            return('Dimension wrong')
        res=x.tolist()
        res.extend(y.tolist())
        return np.array(res)
# Test:
np.random.seed(1010)
x=np.random.rand(3,2)
y=np.random.rand(3,3)
z=np.random.rand(2,5)
w=np.random.rand(3,5)
xy=Stack(x,y,axis=1)
zw=Stack(z,w,axis=0)
(xy,zw)

def Stack2(x,y,axis=0):
    """Join two matrices by filling a pre-allocated array of zeros.

    axis=1 works like hstack (equal row counts required); axis=0 works like
    vstack (equal column counts required).  Returns the string
    'Dimension wrong' on a shape mismatch and None for any other axis value.
    """
    rx,cx=Dim(x)
    ry,cy=Dim(y)
    if axis==1:
        if rx!=ry:
            return('Dimension wrong')
        z=AllConst(0.,rx,cx+cy)
        z[:,:cx]=x
        z[:,cx:]=y
        return z
    elif axis==0:
        if cx!=cy:
            return('Dimension wrong')
        z=AllConst(0.,rx+ry,cx)
        z[:rx,:]=x
        z[rx:,:]=y
        return z
# Generate data to test the function:
np.random.seed(1010)
x=np.random.randn(2,3)
y=np.random.randn(2,2)
z=np.random.randn(3,3)
xy=Stack2(x,y,axis=1)
xz=Stack2(x,z,axis=0)
(xy,xz)

def Trans(x):
    """Return the transpose of matrix x as a new numpy array."""
    r,c=Dim(x)
    i=0
    R=[]
    while i<c:
        e=[]
        j=0
        while j<r:
            e.append(x[j,i])  # column i of x becomes row i of the result
            j+=1
        R.append(e)
        i+=1
    return np.array(R)
# Test:
np.random.seed(1010)
x=np.random.rand(3,2)
(Trans(x),Trans(Trans(x)))

def Outer(x,y,math=["+","-","*","/","%","**",">","<"]):
    """Generalised outer product: R[i,j] = x[i] <op> y[j].

    `math` names the operator.  Pass it as a string such as "*" or "**";
    when the default list is left in place its first entry ("+") is used.
    Returns a float numpy array of shape (len(x), len(y)); comparison
    results are stored as 1.0/0.0.  Raises ValueError for an unknown operator.
    """
    import operator
    ops={"+":operator.add,"-":operator.sub,"*":operator.mul,
         "/":operator.truediv,"//":operator.floordiv,"%":operator.mod,
         "**":operator.pow,">":operator.gt,"<":operator.lt,
         ">=":operator.ge,"<=":operator.le,"==":operator.eq,"!=":operator.ne}
    # Taking math[0] of a string would cut "**" down to "*", so a string is
    # used whole; only a list/tuple is indexed.
    op=math if isinstance(math,str) else math[0]
    if op not in ops:
        raise ValueError('Unsupported operator: '+str(op))
    fun=ops[op]
    nx=Length(x)
    ny=Length(y)
    R=AllConst(0.,nx,ny)
    i=0
    while i < nx:
        j=0
        while j < ny:
            # Applying the operator to the values themselves (instead of
            # evaluating a built string) keeps a negative left operand
            # intact, e.g. for "**".
            R[i,j]=fun(x[i],y[j])
            j+=1
        i+=1
    return R
# Test
np.random.seed(8888)
x=np.random.randn(3)
y=np.random.randn(5)
Outer(x,y,"*"),Outer(x,y,"%"),Outer(x,y,"+"),Outer(x,y,"/"),Outer(x,y,"<")

def Sweep(M,V,axis=0,math=["+","-","*","/","%","**",">","<"]):
    """Combine every row or column of matrix M with vector V.

    axis=0: R[i,j] = M[i,j] <op> V[j]  (V needs one entry per column)
    axis=1: R[i,j] = M[i,j] <op> V[i]  (V needs one entry per row)
    `math` names the operator.  Pass it as a string such as "/" or "**";
    when the default list is left in place its first entry ("+") is used.
    Returns a float numpy array shaped like M, or the string
    'Wrong dimension!' when V has the wrong length.  Raises ValueError for
    an unknown operator.
    """
    import operator
    ops={"+":operator.add,"-":operator.sub,"*":operator.mul,
         "/":operator.truediv,"//":operator.floordiv,"%":operator.mod,
         "**":operator.pow,">":operator.gt,"<":operator.lt,
         ">=":operator.ge,"<=":operator.le,"==":operator.eq,"!=":operator.ne}
    # Taking math[0] of a string would cut "**" down to "*", so a string is
    # used whole; only a list/tuple is indexed.
    op=math if isinstance(math,str) else math[0]
    if op not in ops:
        raise ValueError('Unsupported operator: '+str(op))
    fun=ops[op]
    r,c=Dim(M);n=Length(V)
    if axis==0:
        if c!=n: return ('Wrong dimension!')
    else:
        if r!=n: return ('Wrong dimension!')
    R=AllConst(0.,r,c)
    i=0
    while i < r:
        j=0
        while j < c:
            if axis==0:
                R[i,j]=fun(M[i,j],V[j])
            else:
                R[i,j]=fun(M[i,j],V[i])
            j+=1
        i+=1
    return R
# Test
np.random.seed(1010)
M=np.random.randn(3,4); V=np.random.randn(4)
W=np.random.choice(np.arange(10),3)
Sweep(M,V,0,"%"),Sweep(M,W,1,"**"),Sweep(M,V,0,"/"),Sweep(M,W,1,"-")

def MProd(x,y):
    """Matrix product of x and y computed with three nested loops.

    Returns the string 'Wrong dimension' when the column count of x differs
    from the row count of y.
    """
    rx,cx=Dim(x)
    ry,cy=Dim(y)
    if cx!=ry:
        return ('Wrong dimension')
    z=AllConst(0.,rx,cy)
    for i in Arange(rx):
        for j in Arange(cy):
            for k in Arange(cx):
                z[i,j]+=x[i,k]*y[k,j]
    return z
# Test
np.random.seed(1010)
x=np.random.rand(3,20)
y=np.random.rand(4,20)
MProd(x,Trans(y))

def Inv(w):
    """Invert square matrix w by row reduction of the augmented matrix [w | I].

    Rows are never swapped, so a zero on the diagonal during elimination
    leads to a division by zero.
    """
    n=Dim(w)
    I=Diag(n[0])
    W=Stack(w.astype('float'),I,1)  # augmented matrix [w | I]
    for i in Arange(n[1]):
        W[i,:]=W[i,:]/W[i,i]  # scale row i so that its pivot becomes 1
        for j in Arange(n[0]):
            if j!=i:
                # clear column i in every other row
                W[j,:]=W[j,:]-W[i,:]*W[j,i]
    return W[:,n[0]:]  # the right-hand half now holds the inverse
# Test:
np.random.seed(1010)
w=np.random.rand(5,5)
np.round(MProd(Inv(w),w),12)

def SimpleSort(z, increasing=True):
    """Return a sorted copy of z using repeated swaps of neighbours.

    z itself is left untouched.  increasing=False sorts from large to small.
    """
    x=z.copy()
    n = Length(x)
    for i in Arange(n-1):
        # after pass i the last i+1 positions hold their final values
        for j in Arange(0, n-i-1):
            if increasing:
                if x[j] > x[j+1] :
                    x[j], x[j+1] = x[j+1], x[j]
            else:
                if x[j] < x[j+1] :
                    x[j], x[j+1] = x[j+1], x[j]
    return x
# Test
x = [64, 34, 25, -2,12, 22, 11, 90,25,-1]
SimpleSort(x,increasing=False),SimpleSort(x,True),x

def SORT(x,decreasing=False):
    """Sort by inserting the elements one at a time into a growing sorted list.

    x is expected to be a numpy array with at least two elements (the first
    two are taken with x[:2].tolist()).  Returns a list, ascending by
    default, descending when decreasing is True.
    """
    if x[0]<x[1]:
        s=x[:2].tolist()
    else:
        s=[x[1],x[0]]
    for i in Arange(2,Length(x)):
        j=0
        while j<Length(s):
            if x[i]<s[j]:
                s.insert(j,x[i])
                break
            # >= (not >): an element equal to the current maximum must be
            # appended too, otherwise it would never be inserted
            elif j==Length(s)-1 and x[i]>=s[j]:
                s.insert(j+1,x[i])
                break
            else:
                j+=1
    if decreasing==True:
        s=s[::-1]
    return s
# Test:
np.random.seed(1010)
x=np.random.randn(10)
print(np.array(SORT(x)))
print(np.array(SORT(x,decreasing=True)))
y=np.array(['I', 'am', 'a', 'student', 'and', 'you','are', 'a', 'teacher'])
print(SORT(y))
print(SORT(y,decreasing=True))

def Sort(x, decreasing=False):
    """Sort x by counting, for every element, how many elements exceed it.

    Returns a float numpy array, ascending by default, descending when
    decreasing is True.
    """
    m=Length(x)
    s=AllConst(0.,m,m)
    for i in range(m):
        for j in Arange(m):
            s[i,j]=(x[i]<x[j])*1  # 1 when x[j] is larger than x[i]
    ind=AllConst(0.,m)
    for i in Arange(m):
        ind[i]=SumAll(s[i,:])  # number of elements larger than x[i]
    m=len(ind);k=0;z=AllConst(0.,m)
    # The element with the most larger elements is the smallest, so the
    # counts are visited from m-1 down to 0.
    for i in Arange(m)[::-1]:
        for j in Arange(m):
            if ind[j]==i:
                z[k]=x[j]
                k+=1
    if decreasing==True:
        return z[::-1]
    else:
        return z
# Test:
np.random.seed(1010)
x=np.random.randn(5)
Sort(x),Sort(x,decreasing=True)

def Order(x, decreasing=False):
    """Return the positions that would sort x.

    Works like Sort but records the index of each element instead of its
    value.  Returns an integer numpy array, for ascending order by default
    and descending order when decreasing is True.
    """
    m=Length(x)
    s=AllConst(0.,m,m)
    for i in range(m):
        for j in Arange(m):
            s[i,j]=(x[i]<x[j])*1  # 1 when x[j] is larger than x[i]
    ind=AllConst(0.,m)
    for i in Arange(m):
        ind[i]=SumAll(s[i,:])  # number of elements larger than x[i]
    m=len(ind);k=0;z=AllConst(0.,m)
    # The element with the most larger elements is the smallest, so the
    # counts are visited from m-1 down to 0.
    for i in Arange(m)[::-1]:
        for j in Arange(m):
            if ind[j]==i:
                z[k]=j
                k+=1
    if decreasing==True:
        return z[::-1].astype('int')
    else:
        return z.astype('int')
# Test:
np.random.seed(1010)
x=np.random.randn(5)
Order(x),Order(x,decreasing=True),x

def ORDERSORT(x,decreasing=False):
    """Insertion sort that also tracks the original positions.

    x is expected to be a numpy array with at least two elements.  Returns
    (O, s): O lists the original indices in sorted order and s the sorted
    values, ascending by default, descending when decreasing is True.
    """
    if x[0]<x[1]:
        s=x[:2].tolist()
        O=[0,1]
    else:
        s=[x[1],x[0]]
        O=[1,0]
    for i in Arange(2,Length(x)):
        j=0
        while j<Length(s):
            if x[i]<s[j]:
                s.insert(j,x[i])
                O.insert(j,i)
                break
            # >= (not >): an element equal to the current maximum must be
            # appended too, otherwise it would never be inserted
            elif j==Length(s)-1 and x[i]>=s[j]:
                s.insert(j+1,x[i])
                O.insert(j+1,i)
                break
            else:
                j+=1
    if decreasing==True:
        s=s[::-1]
        O=O[::-1]
    return O,s
# Test:
np.random.seed(1010)
x=np.random.randn(10)
y=np.array(['I', 'am', 'a', 'student', 'and', 'you','are', 'a', 'teacher'])
print(np.array(ORDERSORT(x)))
print(np.array(ORDERSORT(x,decreasing=True)))
print(ORDERSORT(y)[0],'\n',ORDERSORT(y)[1])
print(ORDERSORT(y,decreasing=True)[0],'\n',ORDERSORT(y,decreasing=True)[1])

import random
import matplotlib.pyplot as plt
# Two-dimensional random walk: 1000 steps, each coordinate moved by an
# independent N(0,1) increment; X and Y record the path.
x=0;y=0;X=[];Y=[]
random.seed(1010)
for i in range(1000):
    x=x+random.normalvariate(0,1)
    y=y+random.normalvariate(0,1)
    X.append(x)
    Y.append(y)
plt.figure(figsize=(20,7))
plt.plot(X,Y,'b.-')

np.random.seed(1010)
X=np.hstack((np.ones(200).reshape(-1,1),np.random.normal(20,3,(200,10))))
y=X.dot(np.array([29,10,-7,8,2,9,-2,-12,23,3,6]))+np.random.randn(200)
np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

def f(x):
    """Function whose root is sought: log(x) + 3/(16 - 2*exp(x))."""
    return np.log(x)+3/(16-2*np.exp(x))

def df(x):
    """Derivative of f."""
    return 1/x+6*np.exp(x)/(16-2*np.exp(x))**2

# Newton iteration from x0=0.5; stop once |f(x0)| <= 1e-15 or after 10000 steps
ep=30;x0=.5;k=0
while abs(ep) > 10**-15 and k<10000:
    x0=x0-f(x0)/df(x0)
    ep=f(x0)
    k+=1
print('root=',x0,'\nf(x)=',f(x0),'\nafter',k,'iterations')

def ee(x):
    """Objective function (exp(x)+exp(-x))/2."""
    y=(np.exp(x)+np.exp(-x))/2
    return y

def ee1(x):
    """Derivative of ee: (exp(x)-exp(-x))/2."""
    y=(np.exp(x)-np.exp(-x))/2
    return y

def p3(a,b):
    """Next trial point inside [a,b], computed from ee and ee1 at both ends."""
    U=ee1(b);V=ee1(a);Z=3*(ee(b)-ee(a))/(b-a)-U-V
    W=np.sqrt(Z**2-U*V)
    y=a+(b-a)*(1-(U+W+Z)/(U-V+2*W))
    return y

k=0
a=-200;b=300
y=p3(a,b)
# Shrink [a,b] around the point where the derivative ee1 vanishes:
# a positive trial point replaces b, any other replaces a.
while abs(ee1(y))>10**-15 and k<100:
    if y>0:
        b=y
        y=p3(a,b)
    else:
        a=y
        y=p3(a,b)
    k+=1
print('y=',y,'\nf(x)=',ee(y),'f(x)=',ee1(y),'k=',k)

def f(x):
    """Objective function (exp(x)+exp(-x))/2."""
    y=(np.exp(x)+np.exp(-x))/2
    return y

# Golden-section search for the minimum of f on [a1, a2]
G=(np.sqrt(5)-1)/2
a1=-200;a2=300
a3=a2-G*(a2-a1)
a4=a1+G*(a2-a1)
f3=f(a3);f4=f(a4)
k=0
while abs(a3-a4)>10**-15 and k<1000:
    if f3<f4:
        # keep [a1, a4]; the old a3 is reused as the new a4
        a2=a4
        a4=a3
        f4=f3
        a3=a2-G*(a2-a1)
        f3=f(a3)
    else:
        # keep [a3, a2]; the old a4 is reused as the new a3
        a1=a3
        a3=a4
        f3=f4
        a4=a1+G*(a2-a1)
        f4=f(a4)
    k+=1
print('abs(a3-a4)=',np.abs(a3-a4),'abs(f3-f4)=',np.abs(f3-f4),'f(a_3)=',f3,'k=',k)

def Rand(n,seed):
    """Return a list of n pseudo-random numbers in [0, 1).

    Uses the recurrence R <- (2053*R + 13849) mod 2**16, started from
    `seed`; each output is R / 2**16.
    """
    U=2053.;V=13849.;R=seed
    a=[]
    i=0
    S=2**16
    while i<n:
        R=(U*R+V)%S
        a.append(R/S)
        i+=1
    return a
# 测试: 画散点图及直方图
import matplotlib.pyplot as plt
import seaborn as sns
y=Rand(10000,1010)
fig = plt.figure(figsize=(20,5))
plt.subplot(121)
plt.scatter(range(len(y)),y,s=5)
plt.subplot(122)
sns.distplot(y, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})

def RandN(n,loc,sd,seed=1010):
    """Return a list of n approximately normal numbers with mean loc and sd sd.

    Each value is loc + sd*(sum of 12 Rand numbers - 6).  The 12 numbers for
    draw i are generated from their own seed, taken from a first Rand
    sequence scaled by 10000.
    """
    A=[]
    i=0
    Seed=(np.array(Rand(n,seed))*10000)  # one seed per output value
    while i<n:
        RN=np.array(Rand(12,Seed[i]))
        A.append(loc+sd*(SumV(RN[:12])-6))
        i+=1
    return A
# Test and plot
import seaborn as sns
y=RandN(10000,0.,1.)
fig = plt.figure(figsize=(20,5))
plt.subplot(121)
plt.scatter(range(len(y)),y,s=5)
plt.subplot(122)
sns.distplot(y, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})

import random
random.seed(1010)
np.random.seed(1010)
n=50000
x_np=np.random.randn(n)
y=[]
for i in range(n):y.append(random.normalvariate(0,1))
x_sq=CumMean(x_np)
y_sq=CumMean(y)
print(x_sq[-1],y_sq[-1])
plt.figure(figsize=(20,6))
plt.subplot(121)
plt.plot(x_sq[1000:],'b-',label='numpy.random',linewidth=1)
plt.plot(y_sq[1000:],'g-',label='random',linewidth=1)
plt.plot(np.zeros(n-1000),'k--',label='zeros',linewidth=1)
plt.legend(loc='best')
plt.subplot(122)
plt.plot(x_sq[-50:],'b-o',label='numpy.random',linewidth=5)
plt.plot(y_sq[-50:],'g-s',label='random',linewidth=5)
plt.plot(np.zeros(50),'k--',label='zeros',linewidth=5)
plt.legend(loc='best')

print('numpy.random: ',np.mean(np.array(x_sq[-3000:])>0),'\nrandom: ',np.mean(np.array(y_sq[-3000:])>0))

import os
path = '/users/data/站点监测数据/'
files = []
file_names=[]
# Collect the full path and the bare name of every file under `path` whose
# name contains '.xlsx'.
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if '.xlsx' in file:
            files.append(os.path.join(r, file))
            file_names.append(file)

CP=[]
for i in pd.Series(file_names):CP.append(i.split('_')[0])
CP=np.unique(CP)
CP

#产生与CP对应的dict() FD相应于CP的元素 各元素为相应省市文件地址
FD=dict()
for i in CP:
    FD[i]=list()
for i in CP:
    for j in files:
        # The prefix is taken from the file name itself (not from a fixed
        # position in the path), so the match does not depend on how deeply
        # the data directory is nested.
        if os.path.basename(j).split('_')[0]==i:
            FD[i].append(j)

def DY(w):
    """Collect the date, year, month and hour keys of a table.

    w must have a column f_datetime of strings shaped like
    'year/month/day hour:...' (date and time separated by one space).
    Date and Year list each value once, in order of first appearance of a
    change.  Returns two dicts with keys 'Date', 'Year', 'Month', 'Hour':
    in the first, Month and Hour are search strings ('/1/'..'/12/' and
    ' 0:'..' 23:'); in the second they are the integer arrays 1..12 and
    0..23.
    """
    Date=[]
    Year=[]
    y=w.f_datetime[0].split(' ')[0].split('/')[0]
    s=w.f_datetime[0].split(' ')[0]
    Date.append(s)
    Year.append(y)
    for i in w.f_datetime:
        if i.split(' ')[0] !=s:
            s=i.split(' ')[0]
            Date.append(s)
        if i.split(' ')[0].split('/')[0]!=y:
            y=i.split(' ')[0].split('/')[0]
            Year.append(y)
    Month=list(map(lambda x: '/'+str(x)+'/', range(1,13)))
    Hour=list(map(lambda x: ' '+str(x)+':', range(24)))
    month=np.arange(1,13)
    hour=np.arange(24)
    return ({'Date':Date,'Year':Year,'Month':Month,'Hour':Hour},
            {'Date':Date,'Year':Year,'Month':month,'Hour':hour})

w=pd.read_excel(files[0])
B,B0=DY(w)
print(B.keys(),B['Year'],B['Month'],B['Hour'],B['Date'][:10])
print(B0.keys(),'\n',B0['Hour'],'\n',B0['Month'])

def EX(w):
    """Build the four summaries (date, year, month, hour) for one file (station).

    Returns a dict keyed 'Date', 'Year', 'Month', 'Hour'; each value is a
    DataFrame with one row of column means per key value.
    NOTE(review): relies on DataFrame.append, which recent pandas releases
    no longer provide.
    """
    from astropy.time import Time
    B,B0=DY(w) # dates and years found in the file (strings); months and hours are regular
    nm=w.columns # names of all variables in the file
    X=w[nm[7:]] # measurements only: columns from position 7 on
    X=pd.get_dummies(X, dummy_na=False) # turn the qualitative variable(s) into dummies
    # cast the last six columns to float
    # (presumably the dummy columns created above -- TODO confirm)
    X.iloc[:,-6:]=X.iloc[:,-6:].astype('float')
    X=w[nm[1:7]].join(X) # put columns 1..6 of w back in front
    W=dict() # will hold the year, month, hour and date averages
    for b in B:
        df=pd.DataFrame() # start from an empty DataFrame
        for i in B[b]:
            # mean of all rows whose f_datetime contains the key string i
            A=X[X['f_datetime'].str.contains(i)].iloc[:,6:].mean()
            df=df.append(pd.DataFrame(data=A.values.reshape(1,-1),columns=A.index),
                         sort=None)
        df.insert(0,value=B0[b],column=b) # key column goes first
        for i in np.arange(1,6)[::-1]:
            # columns 1..5 of w, filled with their first-row value
            df.insert(0,value=w[nm[i]][0],column=nm[i])
        if b=='Date': # convert the date format
            df['Year']=list(map(lambda x: x.split('/')[0],df.Date))
            df.Date=list(map(lambda x: (Time(x.replace('/','-')).value)
                             .split(' ')[0],df.Date))
            df['Month']=list(map(lambda x: x.split('-')[1],df.Date))
            df['YearMonth']=list(map(lambda x: x.split('-')[0]+'-'+x.split('-')[1],
                                     df.Date))
        W[b]=df
    return W

RS=dict()
for cp in CP: # province/city name
    print(cp) # progress: name of the province/city
    U=dict() # one empty DataFrame per summary kind
    for i in B:
        U[i]=pd.DataFrame()
    for i in FD[cp]: # file paths
        print(i) # progress: path of the file being read
        u=pd.read_excel(i) # data of one station of province/city cp
        W=EX(u)
        for k in U: # merge the stations separately for year, month, date and hour
            U[k]=pd.concat([U[k],W[k]],ignore_index=True)
    RS[cp]=U # dict with one entry per province/city

# Show the shape and type of every summary table
for i in RS:
    print(i)
    for j in RS[i]:
        print(j, RS[i][j].shape, type(RS[i][j]))

# One workbook per entry of RS, one sheet per summary table
for i in RS:
    print(i)
    with pd.ExcelWriter(i+'.xlsx') as writer:
        for j in RS[i]:
            print(j)
            RS[i][j].to_excel(writer,sheet_name=j)

v1=RS['北京']['Year'].pivot(index='Year',columns='c_station',values='g_level_一级')
st=list(map(lambda x: 'Station-'+str(x),range(12)));st
for i in range(len(st)):print(st[i],'=',v1.columns[i])

Marker=['o','^','s','P','p','*','D','v','<','>','X','H']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(12):plt.plot(v1.iloc[:,i],marker=Marker[i], linestyle='dashed',linewidth=2, markersize=12)
plt.legend(loc='best',ncol=6, shadow=True,labels=st)
plt.title('Beijing level-1 percentage for 12 stations from 2014 to 2019')

H=dict()
for i in CP:H[i]=RS[i]['Hour'].groupby(['Hour']).mean().reset_index()['g_level_一级']
H=pd.DataFrame(H)
print(H.head())

Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(Marker)):plt.plot(H[CP[i]],'.-',marker=Marker[i], linestyle='dashed',linewidth=2, markersize=12)
plt.legend(loc='best',ncol=4, shadow=True,labels=CP0)
plt.title('Level-1 percentage of 24 hours for 4 places')

# D[place]: the PM2.5 column of the 'Date' table, indexed by Date and
# renamed to the place name.
D = dict()
for i in CP:
    # Chained calls return new frames, so nothing is modified in place on a
    # .loc selection.
    D1 = (RS[i]['Date'].loc[:, ['Date', 'i_gkd_pm25(μg/m3)']]
          .rename(columns={'i_gkd_pm25(μg/m3)': i})
          .set_index('Date'))
    D[i] = D1

Marker = ['o', '^', 's', 'P']
CP0 = ['Beijing', 'Anhui', 'Hebei', 'Chongqing']
import matplotlib.pyplot as plt
from astropy.time import Time  # Time is used below and must exist at module level

plt.figure(figsize=(20, 6))
origin = np.datetime64('2014-01-01', 'D')
for i in range(len(CP)):
    # Horizontal position: whole days elapsed since 2014-01-01.
    t = [np.datetime64(Time(x.replace('/', '-')).value, 'D') - origin
         for x in D[CP[i]].index]
    t = [x.astype('int') for x in t]
    plt.scatter(t, D[CP[i]].values, marker=Marker[i])
plt.legend(loc='best', ncol=4, shadow=True, labels=CP0)
plt.title('Daily PM-2.5 in 4 places for 6 years')

M=RS['北京']['Month'].loc[:,['Month','i_gkd_pm25(μg/m3)']].groupby('Month').mean()
M.rename(columns={'i_gkd_pm25(μg/m3)':'北京'},inplace=True)
for i in CP[1:]:M1=RS[i]['Month'].loc[:,['Month','i_gkd_pm25(μg/m3)']].groupby('Month').mean()M1.rename(columns={'i_gkd_pm25(μg/m3)':i},inplace=True)M=pd.merge(M,M1,left_on='Month', right_on='Month')
print(M.head())

Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(Marker)):plt.plot(M[CP[i]],'.-',marker=Marker[i], linestyle='dashed',linewidth=2, markersize=12)
plt.legend(loc='best',ncol=4, shadow=True,labels=CP0)
plt.title('Monthly PM-2.5 in 4 places for 6 year mean')

vc=RS['重庆']['Month'].pivot(index='Month',columns='c_station',values='i_gkd_pm25(μg/m3)')
st=list(map(lambda x: 'Station-'+str(x),range(len(vc.columns))));st
for i in range(len(st)):print(st[i],'=',vc.columns[i])

Marker=['o','^','s','P','p','*','D','v','<','X']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(vc.columns)):plt.plot(vc.iloc[:,i],marker=Marker[i], linestyle='dashed',linewidth=2, markersize=12)
plt.legend(loc='best',ncol=5, shadow=True,labels=st)
plt.title('Chongqing monthly mean pm2.5 for 10 stations')

Y=RS['北京']['Year'].loc[:,['Year','i_gkd_pm25(μg/m3)']].groupby('Year').mean()
Y.rename(columns={'i_gkd_pm25(μg/m3)':'北京'},inplace=True)for i in CP[1:]:Y1=RS[i]['Year'].loc[:,['Year','i_gkd_pm25(μg/m3)']].groupby('Year').mean()Y1.rename(columns={'i_gkd_pm25(μg/m3)':i},inplace=True)Y=pd.concat((Y,Y1),axis=1,sort=False)
print(Y)

Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(CP)):plt.plot(Y.index,Y.iloc[:,i],marker=Marker[i], linestyle='dashed',linewidth=2, markersize=12)
plt.legend(loc='best',ncol=len(CP), shadow=True,labels=CP0)
plt.title('Yearly PM-2.5 in 4 places')

Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for j in range(len(CP)):pm=RS[CP[j]]['Date'].groupby('YearMonth').mean()plt.plot(pm.index,pm['i_gkd_pm25(μg/m3)'],marker=Marker[j], linestyle='dashed',linewidth=1, markersize=5)
plt.legend(loc='best',shadow=True,labels=CP0)
plt.xticks(rotation=45)
plt.title('Monthly mean PM-2.5 in 4 places through years')

Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
plt.subplot(121)
for j in range(len(CP)):pm=RS[CP[j]]['Date'].groupby('YearMonth').mean()plt.plot(pm.index,pm['i_gkd_pm25(μg/m3)'],marker=Marker[j], linestyle='dashed',linewidth=1, markersize=5)
plt.legend(loc='best',shadow=True,labels=CP0)
plt.xticks(rotation=45,fontsize=5)
plt.ylim(top=180)
plt.title('Monthly mean PM-2.5 in 4 places through years')
plt.subplot(122)
for j in range(len(CP)):pm=RS[CP[j]]['Date'].groupby('YearMonth').median()plt.plot(pm.index,pm['i_gkd_pm25(μg/m3)'],marker=Marker[j], linestyle='dashed',linewidth=1, markersize=5)
plt.legend(loc='best',shadow=True,labels=CP0)
plt.xticks(rotation=45,fontsize=5)
plt.ylim(top=180)
plt.title('Monthly median PM-2.5 in 4 places through years')

第9章

def iter(x, P, n):
    """Return the distributions x, xP, xP^2, ..., xP^n as rows of an array.

    Note: the name shadows the builtin ``iter``; it is kept because the
    following listings call it.

    Parameters
    ----------
    x : 1-D array
        Starting row vector.
    P : 2-D array
        Matrix applied on the right at every step.
    n : int
        Number of steps.

    Returns
    -------
    numpy.ndarray of shape (n + 1, len(x))
        Row 0 is ``x``; row ``k`` is ``x`` multiplied ``k`` times by ``P``.
    """
    res = np.zeros((n + 1, len(x)))
    res[0] = x
    for k in range(n):
        x = x.dot(P)
        res[k + 1] = x
    return res


P = np.array([[.5, .23, .27], [.3, .25, .45], [.1, .5, .4]])
x = np.identity(3)
n = 10
y = {}
for i in range(x.shape[0]):  # one trajectory per starting row of x
    y[i] = iter(x[i, :], P, n)
# Left eigenvector of P for the eigenvalue closest to 1.  numpy does not
# order eigenvalues, so the column is selected explicitly instead of
# assuming it is column 0.
vals, vecs = np.linalg.eig(P.T)
ev = np.real(vecs[:, np.argmin(np.abs(vals - 1))])
ev = ev / ev.sum()  # scale the entries to sum to 1
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20, 6))
for i in range(len(y)):
    plt.plot(y[i])
plt.plot([n] * len(ev), ev, 'p')  # mark the normalised eigenvector at step n

def run(i, P, n):
    """Simulate ``n`` steps of a chain with transition matrix ``P``.

    Parameters
    ----------
    i : int (or 1-element array)
        Starting state.
    P : 2-D array
        Transition matrix; row ``k`` holds the probabilities of leaving
        state ``k``.
    n : int
        Number of steps.

    Returns
    -------
    list
        ``n`` arrays of length 1 (the output of ``np.random.choice`` with
        ``size=1``), one visited state per step.
    """
    res = []
    # Track the state as a plain int: int() on a 1-element array is
    # deprecated in recent NumPy.
    state = int(np.ravel(i)[0])
    for _ in range(n):
        draw = np.random.choice(P.shape[0], size=1, p=P[state])
        state = int(draw[0])
        res.append(draw)
    return res


np.random.seed(1010)
import matplotlib.pyplot as plt

# Flatten the visited states to a 1-D array so that `s == i` is a plain
# element-wise comparison.
s = np.ravel(run(0, P, 100))
fig = plt.figure(figsize=(20, 6))
plt.step(range(len(s)), s)

fig = plt.figure(figsize=(20, 8))
for i in np.unique(s):
    # Running share of the steps spent in state i.
    ss = np.cumsum(s == i) / np.arange(1, len(s) + 1)
    plt.plot(ss, label='stat={}'.format(int(i)))
plt.legend(loc='best')

np.random.seed(1010)
s = np.ravel(run(0, P, 5000))
fig = plt.figure(figsize=(20, 6))
for i in np.unique(s):
    ss = np.cumsum(s == i) / np.arange(1, len(s) + 1)
    plt.plot(ss, label='stat={}'.format(int(i)))
plt.legend(loc='best')

# summary statistics of sample
n = 30
ybar = 15
s2 = 3
N = 11000
# sample from the joint posterior (mu, tau | data)
mu = np.zeros(N)
tau = np.zeros(N)
T = 1000    # burnin
tau[0] = 1  # initialisation
for i in range(1, N):
    # Alternate the two conditional draws.  Scalar draws are used because
    # storing a size-1 array into a single array slot is deprecated in NumPy.
    mu[i] = np.random.normal(loc=ybar, scale=np.sqrt(1 / (n * tau[i - 1])))
    tau[i] = np.random.gamma(shape=n / 2,
                             scale=2 / ((n - 1) * s2 + n * (mu[i] - ybar) ** 2))
mu = mu[T:]   # remove burnin
tau = tau[T:] # remove burninimport matplotlib.pyplot as plt
import seaborn as sns
fig = plt.figure(figsize=(20,6))
ax = fig.add_subplot(221)
ax.plot(mu)
ax = fig.add_subplot(222)
ax.plot(tau)
ax = fig.add_subplot(223)
sns.distplot(mu, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})
ax = fig.add_subplot(224)
sns.distplot(tau, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})

coin = [1, 2, 1, 2, 2, 1, 1, 2, 1, 2]
heads = [55, 20, 57, 14, 13, 57, 56, 10, 49, 16]
heads = np.array(heads).astype(float)
coin = np.array(coin)
# NOTE(review): n (tosses per trial) is not set in this listing.  The nearest
# earlier assignment in this file is n = 30 from another example, which is
# smaller than the largest head count -- confirm the intended value.
if n < heads.max():
    raise ValueError('n = %s is smaller than the largest head count %s'
                     % (n, heads.max()))
# MLE per coin: heads observed with that coin / tosses made with that coin.
p1MLE = heads[coin == 1].sum() / ((coin == 1).sum() * n)
p2MLE = heads[coin == 2].sum() / ((coin == 2).sum() * n)
print('MLE of p1 =', p1MLE, '\nMLE of p2 =', p2MLE)

np.random.seed(1010)
p1ME = np.random.uniform(0, 1, 1)  # initial guess for p1, drawn from a uniform
p2ME = np.random.uniform(0, 1, 1)  # initial guess for p2, drawn from a uniform
P1 = 0  # holds the estimate of the previous iteration
P2 = 0
from scipy.stats import binom

tol = 10 ** -15
max_iter = 10000  # safety stop in case the estimates keep moving by rounding noise
it = 0
# Keep iterating while EITHER estimate still moves; stopping as soon as one
# of them is stable would leave the other unconverged.
while ((np.abs(p1ME - P1) > tol).any() or (np.abs(p2ME - P2) > tol).any()) \
        and it < max_iter:
    it += 1
    P1 = p1ME  # keep the previous estimates so both iterations can be compared
    P2 = p2ME
    den1 = binom.pmf(heads, n, p1ME)  # probability mass of each trial under p1(t)
    den2 = binom.pmf(heads, n, p2ME)
    # E-step: share each trial between the two coins
    w1 = den1 / (den1 + den2)
    w2 = den2 / (den1 + den2)
    h1 = w1 * heads        # expected heads attributed to coin 1
    h2 = w2 * heads        # expected heads attributed to coin 2
    t1 = w1 * (n - heads)  # expected tails attributed to coin 1
    t2 = w2 * (n - heads)
    # M-step: updated estimates p1(t+1), p2(t+1)
    p1ME = np.sum(h1) / np.sum((h1, t1))
    p2ME = np.sum(h2) / np.sum((h2, t2))
# MLE estimates computed earlier
print("MLE estimates: p1MLE=%s, p2MLE=%s" % (p1MLE, p2MLE))
# EM estimates
print("EM estimates: p1EM=%s, p2EM=%s" % (np.round(p1ME, 10), np.round(p2ME, 10)))

p = 0.4
mu = (-1, 2)
sd = (.5, 2)
from scipy.stats import norm
import numpy as np


def f(x, p=p, mu=mu, sd=sd):
    """Density of the two-component normal mixture with weight ``p`` on component 0."""
    return p * norm.pdf(x, mu[0], sd[0]) + (1 - p) * norm.pdf(x, mu[1], sd[1])


def q(x, sd=4):
    """Draw one proposal from a normal centred at ``x`` with standard deviation ``sd``."""
    return np.random.normal(x, sd, 1)[0]


def step(x, f, q, sd=4):
    """Do one accept/reject move from ``x`` and return the new position."""
    xp = q(x, sd)  # candidate drawn around the current point
    alpha = min(1, f(xp) / f(x))  # acceptance probability (<= 1)
    if np.random.uniform(0, 1, 1)[0] < alpha:  # accept the candidate with probability alpha
        x = xp
    return x


def run(x, f, q, sd, nsteps):
    """Run ``nsteps`` moves starting at ``x``; return the positions as an array.

    ``sd`` is the proposal standard deviation.  It is forwarded to ``step``;
    previously it was dropped, so every chain used the default of 4.
    """
    res = np.zeros(nsteps)
    for i in range(nsteps):
        x = step(x, f, q, sd)
        res[i] = x
    return res


res = run(0, f, q, 4, 5000)
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20,6))
plt.plot(res)

import matplotlib.pyplot as plt
import seaborn as sns
fig = plt.figure(figsize=(20,6))
sns.distplot(res, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})

# Two chains of 5000 steps that differ only in the sd value passed to run.
resfast = run(0, f, q, 33, 5000)
resslow = run(0, f, q, .3, 5000)
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20,6))
plt.subplot(221)
plt.plot(resfast)
plt.title('Trace for fast sampling')
plt.subplot(222)
sns.distplot(resfast, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})
plt.title('Histogram and density for fast sampling')
plt.subplot(223)
plt.plot(resslow)
plt.title('Trace for slow sampling')
plt.subplot(224)
sns.distplot(resslow, hist=True, kde=True, bins=int(300/5), color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})
plt.title('Histogram and density for slow sampling')

resfast=run(0, f, q, 33,1000)
resslow=run(0, f, q, .3,1000)

# The function below keeps sampled parameters from leaving the interval [0, 1].
def watch(p):
    """Return 0. when ``p`` is below 0 or above 1, otherwise 1."""
    if p < 0 or p > 1:
        return 0.
    return 1.
# Likelihood function
def lh(p, f, nAA, nAa, naa):
    """Return the product of the three class terms raised to their counts.

    With ``p`` and ``f`` as parameters, the three terms are
    ``f*p + (1-f)*p**2``, ``(1-f)*2*p*(1-p)`` and ``f*(1-p) + (1-f)*(1-p)**2``;
    they are raised to ``nAA``, ``nAa`` and ``naa`` respectively.
    """
    term_AA = f * p + (1 - f) * p * p
    term_Aa = (1 - f) * 2 * p * (1 - p)
    term_aa = f * (1 - p) + (1 - f) * (1 - p) * (1 - p)
    return term_AA ** nAA * term_Aa ** nAa * term_aa ** naa
# Main sampling routine
def fp(nAA, nAa, naa, niter, f0, p0, fsd, psd):
    """Sample ``(f, p)`` with random-walk proposals, one component at a time.

    Parameters
    ----------
    nAA, nAa, naa : int
        Counts passed to the likelihood ``lh``.
    niter : int
        Length of the returned chains.
    f0, p0 : float
        Starting values, stored at index 0.
    fsd, psd : float
        Standard deviations of the normal proposal steps for f and p.

    Returns
    -------
    (f, p) : two numpy arrays of length ``niter``

    Fixes over the earlier version: the loop starts at index 1 so the chain
    really starts from ``f0``/``p0``; the proposal steps use ``fsd``/``psd``
    instead of a fixed sd of 1; each acceptance ratio changes only the
    component being updated.
    """
    f = np.ones(niter) * 0.5
    p = np.ones(niter) * 0.5
    f[0] = f0
    p[0] = p0
    for i in range(1, niter):
        oldf = f[i - 1]
        oldp = p[i - 1]
        newf = oldf + np.random.normal(0, fsd)
        newp = oldp + np.random.normal(0, psd)
        # Update f while p stays at its current value; watch() gives ratio 0
        # to candidates outside [0, 1].
        Af = watch(newf) * lh(oldp, newf, nAA, nAa, naa) / \
            lh(oldp, oldf, nAA, nAa, naa)
        if np.random.uniform(0, 1) < Af:  # accept the candidate with probability Af
            f[i] = newf
        else:
            f[i] = oldf
        # Update p given the f value just stored.
        Ap = watch(newp) * lh(newp, f[i], nAA, nAa, naa) / \
            lh(oldp, f[i], nAA, nAa, naa)
        if np.random.uniform(0, 1) < Ap:  # accept the candidate with probability Ap
            p[i] = newp
        else:
            p[i] = oldp
    return f, p

# 执行抽样
f,p=fp(8,7,12,50000,0.5,0.5,0.01,0.01)
# 画痕迹图及直方图
import scipy.stats as stats
import seaborn as sns
import scipy.stats as stats
import seaborn as sns
plt.figure(figsize=(20,7))
plt.subplot(221)
plt.plot(f)
plt.title('Trace plot of f')
plt.subplot(222)
sns.distplot(f, hist=True, kde=True, bins=15, color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})
plt.title('Histogram of f')
plt.subplot(223)
plt.plot(p)
plt.title('Trace plot of p')
plt.subplot(224)
sns.distplot(p, hist=True, kde=True, bins=15, color = 'darkblue', hist_kws={'edgecolor':'black'},kde_kws={'linewidth': 4})
plt.title('Histogram of p')

第四部分

第10章

import pandas as pd
import numpy as np
w=pd.read_csv('SYB58_35_Index of industrial production.csv',skiprows=1)
w.head()

import pandas as pd
import numpy as np
w=pd.read_csv('SYB58_35_Index of industrial production.csv',skiprows=1)
w.head()

w.rename(columns={'Unnamed: 1':'CountryArea'},inplace=True)
w.columns

print('w.shape =',w.shape)
for i in [0,1,2,3]:print('Number of',w.columns[i],'=',len(set(w.iloc[:,i])))
print('Series;\n')
for i in set(w.iloc[:,2]):print(i)

for i in w.columns[[0,5,6]]:del w[i]
w.columns

for i in set(list(w['Series'])):w["Series"]= w["Series"].replace(i, i.split(': ')[1].split(' (')[0]) set(w['Series'])

w.to_csv('II.csv',index=False)

import pandas as pd
import numpy as np
u=pd.read_csv('II.csv', thousands=',')
print(u.head())

G=u[u['CountryArea']=='Germany'].pivot(index='Year',columns='Series',values='Value')
G

import matplotlib.pyplot as plt
G.plot(style='.-',figsize=(20,6))
plt.title('Index of industrial production of Germany')
plt.show()

T=u[u["Series"]=="Manufacturing"].pivot(index='Year',columns='CountryArea', values='Value')
print(T[["Denmark","Finland", "Sweden", 'Norway','Japan']].head())

import matplotlib.pyplot as plt
T[["Denmark","Finland", "Sweden", 'Norway','Japan']].plot(style='.-',figsize=(20,6))
plt.show()

I2014=u[u.Year==2014].pivot(index='Series',columns='CountryArea',values='Value')
I2014[["Denmark","Finland", "Sweden", 'Norway','Japan']].plot(style='.-',figsize=(20,6))
plt.show()

I14=u[u.Year==2014].pivot(index='CountryArea',columns='Series',values='Value')
I14.loc[["Denmark","Finland", "Sweden", 'Norway', 'Japan'],:].\plot(style='.-', figsize=(20,6))
plt.show()

import pandas as pd
import numpy as np;import scipy.stats as stats
adult=pd.read_csv("adult.csv",header=None)
names=["age","workclass","fnlwgt","education","education_nnum",
"marital_status","occupation","relationship","race",
"sex","capital_gain","capital_loss","hours_per_week",
"native_location","income"]
adult.columns=names
print(adult.head())

print(adult.columns,'\n',adult.shape)

adult.describe()

print(stats.describe(adult.age))

adult.occupation.dtype

cat_cols = [adult.columns.get_loc(col) \for col in adult.select_dtypes(['object']).columns.tolist()]
print(cat_cols, '\n',adult.columns[cat_cols])

workclass=adult.groupby("workclass")
print(len(workclass))
workclass.mean()

adult.isna().sum(axis=0)#默认值是axis=0

print(pd.crosstab(adult.income,adult.race))

pd.crosstab([adult.income,adult.sex],adult.occupation)

xtb=pd.crosstab(adult.race,adult.marital_status)
print(xtb)

import matplotlib.pyplot as plt
%matplotlib inline
fig=plt.figure(figsize=(10,4.5))
plt.subplot(1,2,1)
plt.pie(xtb.sum(0),labels=xtb.columns,autopct='%1.2f%%') #7
plt.title('marital status')
plt.subplot(1,2,2)
plt.pie(xtb.sum(1),labels=xtb.index,autopct='%1.1f') #5
plt.title('race')

print('1000*pi={:1.4f},\n1000*pi={:20.5f}'.format(np.pi*1000,np.pi*1000))

fig=plt.figure(figsize=(10,4.5))
plt.subplot(1,1,1)
plt.barh(y=range(len(xtb.columns)),width=xtb.sum(0),tick_label=xtb.columns)
plt.title('marital status')

fig=plt.figure(figsize=(10,3))
plt.hist(adult.age,density=True,bins=15)
kde=stats.gaussian_kde(adult.age)
x=np.sort(adult.age)
plt.plot(x,kde(x),'k-')
plt.title('Age histogram and density estimation')

import numpy as np
import pandas as pd
import seaborn as sns
w = pd.read_csv('iris.csv')
print(w.head())

L=set(w['Species']);L

u=w.iloc[[1,3,5,51,53,55,101,103,105],:]
print(u)

np.random.seed(999)
u_nan=u.mask(np.random.random(u.shape)<0.3)
print(u_nan)

print(u_nan.isna().sum())#按列计算, 和u_nan.isna().sum(axis=0)
print('Total number of missing values =',u_nan.isna().sum().sum())

MM=list(set(u_nan['Species']))#[nan, 'setosa', 'virginica', 'versicolor']
S=u_nan['Species']
SS=np.zeros(len(S))>0
for i in np.arange(1,len(MM)):SS=SS+(S==MM[i])*i
u_nan['Species']=SS
u_nan

u_nan['Species']=u_nan['Species'].mask(SS==0) #只有data frame 有mask函数
print(u_nan)

from missingpy import MissForest
imputer = MissForest(random_state=1010)
imputed = imputer.fit_transform(u_nan, cat_vars=4)#标明第4个是分类变量
imputed #得到的是np.array

u2=pd.DataFrame(imputed,columns=w.columns,)
print(u2)

Y=pd.DataFrame({'sex':[1,0,0,0,1,1,1,0]})
print('type before:', Y.sex.dtype)
Y['sex']=Y['sex'].astype('category') #改变类型
print('type after:', Y.sex.dtype)

u2["Species"] = u2["Species"].astype('category')

u2=pd.get_dummies(u2,drop_first=False)
print(u2.iloc[:,4:])

on=u2.columns[4:];on
nm=dict()
for i in range(len(on)):nm[on[i]]='Species_'+MM[i+1]
u2=u2.rename(columns=nm)
print(u2.iloc[:,4:])#只显示哑元化的几列

import numpy as np
import pandas as pd
adult=pd.read_csv("adult.csv",header=None)
names=["age","workclass","fnlwgt","education","education_nnum",
"marital_status","occupation","relationship","race",
"sex","capital_gain","capital_loss","hours_per_week",
"native_location","income"]
adult.columns=names
adult.isna().sum() #查看缺失值情况
print(adult.isna().sum())
print('Ratio of NaN =', adult.isna().sum().sum()/adult.size)

cat_cols=[]
for i in range(len(adult.columns)):if adult.iloc[:,i].dtype=='O':cat_cols.append(i)
print(cat_cols, '\n',adult.columns[cat_cols])

mm=[]
for i in cat_cols:l=list(set(adult.iloc[:,i]))mm.append([x for x in l if type(x) != float])
print(mm,len(mm))

v=adult.copy()for i in range(len(cat_cols)):S=v.iloc[:,cat_cols[i]]SS=np.zeros(len(S))>0for j in np.arange(len(mm[i])):SS=SS+(S==mm[i][j])*(j+1)v.iloc[:,cat_cols[i]]=SSv.iloc[:,cat_cols[i]]=v.iloc[:,cat_cols[i]].mask(SS==0)
v.head() #不显示, 所有字符型水平已经转换成数字

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
w=pd.read_csv('iris.csv')
le.fit(w['Species'])
le.classes_

S1=le.transform(w.Species);S1

le.inverse_transform(S1)

from missingpy import MissForest
imputer = MissForest(random_state=1010)
imputed = imputer.fit_transform(v, cat_vars=cat_cols)
imputed #得到的是np.array
v2=pd.DataFrame(imputed,columns=v.columns,)

for i in cat_cols:v2.iloc[:,i] = v2.iloc[:,i].astype('category')
v3=pd.get_dummies(v2,drop_first=False)
v3.columns

k=0;d=dict()
for i in cat_cols:print(v2.columns[i])for j in range(len(set(v2.iloc[:,i]))):no=v2.columns[i]+'_'+str(list(set(v2.iloc[:,i]))[j])nn=v2.columns[i]+'_'+mm[k][j]d[no]=nnk=k+1v3 = v3.rename(columns=d)

def ImpDum(df, drop=False):
    """Impute missing values with MissForest, then dummy-code the text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns of dtype ``object`` are treated as categorical; the others
        are passed to the imputer unchanged.
    drop : bool, default False
        Forwarded to ``pandas.get_dummies`` as ``drop_first``.

    Returns
    -------
    pandas.DataFrame
        The imputed numeric columns followed by one ``<column>_<level>``
        indicator per level of every categorical column.
    """
    import numpy as np
    import pandas as pd
    from missingpy import MissForest

    cat_cols = [i for i in range(len(df.columns)) if df.iloc[:, i].dtype == 'O']
    v = df.copy()
    levels = {}
    for c in cat_cols:
        # Codes 1..k in order of first appearance, NaN where the value is
        # missing.  factorize gives a reproducible coding, unlike iterating
        # over a set of strings.
        codes, labels = pd.factorize(df.iloc[:, c])
        levels[c] = np.asarray(labels, dtype=object)
        v[v.columns[c]] = np.where(codes < 0, np.nan, codes + 1.0)
    imputer = MissForest(random_state=1010)
    # cat_vars stays None when there is no categorical column.
    v2 = imputer.fit_transform(v, cat_vars=cat_cols if cat_cols else None)
    v2 = pd.DataFrame(v2, columns=v.columns)
    for c in cat_cols:
        # Turn the imputed codes back into the original level names so that
        # get_dummies names the indicator columns directly.
        codes = np.rint(v2.iloc[:, c].to_numpy()).astype(int) - 1
        v2[v2.columns[c]] = levels[c][codes]
    return pd.get_dummies(v2, drop_first=drop)

w=ImpDum(df=adult)
print('w.shape =',w.shape,'\nw.columns =',w.columns)

def Imp(df):
    """Impute missing values with MissForest; categorical columns stay coded.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns of dtype ``object`` are treated as categorical and replaced
        by the codes 1..k (in order of first appearance) before imputation.

    Returns
    -------
    pandas.DataFrame
        Same column names as ``df``, all values numeric, no missing entries;
        the index is a fresh default index.
    """
    import numpy as np
    import pandas as pd
    from missingpy import MissForest

    cat_cols = [i for i in range(len(df.columns)) if df.iloc[:, i].dtype == 'O']
    v = df.copy()
    for c in cat_cols:
        # factorize marks missing values with -1; they become NaN so the
        # imputer fills them in.
        codes, _ = pd.factorize(df.iloc[:, c])
        v[v.columns[c]] = np.where(codes < 0, np.nan, codes + 1.0)
    imputer = MissForest(random_state=1010)
    # cat_vars stays None when there is no categorical column.
    v2 = imputer.fit_transform(v, cat_vars=cat_cols if cat_cols else None)
    v2 = pd.DataFrame(v2, columns=v.columns)
    return v2

w = pd.read_csv('iris.csv')
u=w.iloc[[1,3,5,51,53,55,101,103,105],:]
np.random.seed(999)
u_nan=u.mask(np.random.random(u.shape)<0.3)
u_imp=Imp(u_nan)
print(u_imp) #打印结果

def Dum(df, drop=False):
    """Dummy-code the categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    drop : bool, default False
        Forwarded to ``pandas.get_dummies`` as ``drop_first``.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns followed by one ``<column>_<level>``
        indicator per level.  Rows whose categorical value is missing get 0
        in every indicator of that column.
    """
    import pandas as pd

    # get_dummies already names the indicators '<column>_<level>', so no
    # intermediate integer coding or renaming pass is needed.
    return pd.get_dummies(df, drop_first=drop)

w = pd.read_csv('iris.csv')
w_dum=Dum(w)
print(w_dum.iloc[[0,1,51,52,101,102],4:])

第11章

import pandas as pd
import numpy as npw=pd.read_csv('Boston.csv')
print(w.head(3))
y=w.MEDV  #因变量
n=len(y)  #样本量
X=w.iloc[:,:-1] #自变量
print(X.columns)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegressionnames = ['HGBoost',"Adaboost","Bagging", "Random Forest",\"Linear Model"]
regressors = [HistGradientBoostingRegressor(random_state=1010),AdaBoostRegressor(random_state=1010, n_estimators=100),BaggingRegressor(n_estimators=100),RandomForestRegressor(n_estimators=500,random_state=1010),LinearRegression()]
REG=dict(zip(names,regressors))

def Rfold(n, Z, seed):
    """Assign ``n`` observations at random to ``Z`` folds of near-equal size.

    Parameters
    ----------
    n : int
        Number of observations.
    Z : int
        Number of folds; labels are 0 .. Z-1.
    seed : int
        Seed passed to ``np.random.seed``.  The global NumPy random state is
        reseeded as a side effect.

    Returns
    -------
    numpy.ndarray of length ``n`` holding the fold label of each observation.
    """
    # Repeat 0..Z-1 often enough, cut to n entries, then shuffle: the fold
    # sizes differ by at most one.
    zid = (list(range(Z)) * int(n / Z + 1))[:n]
    np.random.seed(seed)
    np.random.shuffle(zid)
    return np.array(zid)

Z = 10
zid = Rfold(n, Z, 1010)
YPred = dict()
for i in REG:  # i is the model name (a key of REG)
    Y_pred = np.zeros(n)
    for j in range(Z):
        reg = REG[i]
        # Fit on every fold except j, predict fold j.
        reg.fit(X[zid != j], y[zid != j])
        Y_pred[zid == j] = reg.predict(X[zid == j])
    # Store under the model name itself; names[i] would index a list with a string.
    YPred[i] = Y_pred
R = pd.DataFrame(YPred)

M=np.sum((y-np.mean(y))**2)
A=dict()
for i in REG:A[i]=np.sum((y-YPred[i])**2)/M

import matplotlib.pyplot as plt
plt.figure(figsize = (12,4))
plt.barh(range(len(A)), A.values(), color = 'navy', height = 0.6)
plt.xlabel('NMSE')
plt.ylabel('Model')
plt.title('Normalized MSE for 5 Models')
plt.yticks(np.arange(len(A)),A.keys())
for v,u in enumerate(A.values()): plt.text(u, v, str(round(u,4)), va = 'center')
plt.show()

def RegCV(X, y, regress, Z=10, seed=8888, trace=True):
    """Cross-validate several regressors on the same random folds.

    Parameters
    ----------
    X, y : predictors and response; both are indexed with boolean fold masks.
    regress : dict
        Maps a model name to an object with ``fit`` and ``predict``.
    Z : int, default 10
        Number of folds.
    seed : int, default 8888
        Seed handed to ``Rfold``.
    trace : bool, default True
        Print each model name with a timestamp when it starts, and a final
        timestamp when all models are done.

    Returns
    -------
    R : pandas.DataFrame
        One column of out-of-fold predictions per model.
    A : dict
        Model name -> sum of squared prediction errors divided by the total
        sum of squares of ``y`` about its mean.
    """
    from datetime import datetime

    n = len(y)
    zid = Rfold(n, Z, seed)
    YPred = dict()
    M = np.sum((y - np.mean(y)) ** 2)
    A = dict()
    for name, reg in regress.items():
        if trace:
            print(name, '\n', datetime.now())
        Y_pred = np.zeros(n)
        for j in range(Z):
            # Fit on every fold except j, predict fold j.
            reg.fit(X[zid != j], y[zid != j])
            Y_pred[zid == j] = reg.predict(X[zid == j])
        YPred[name] = Y_pred
        A[name] = np.sum((y - Y_pred) ** 2) / M
    if trace:
        print(datetime.now())
    R = pd.DataFrame(YPred)
    return R, A

R,A=RegCV(X,y,REG)

def BarPlot(A, xlab='', ylab='', title='', size=(None, None, None, None, None)):
    """Draw a horizontal bar chart of the values of ``A`` and show it.

    Parameters
    ----------
    A : dict
        Label -> numeric value; one bar per entry, in dict order.
    xlab, ylab, title : str
        Axis labels and title.
    size : sequence of five font sizes
        Used, in order, for the x label, the y label, the title, the tick
        labels and the value printed next to each bar.  ``None`` keeps the
        matplotlib default.
    """
    import matplotlib.pyplot as plt

    # Materialise the dict views: matplotlib expects sequences.
    labels = list(A.keys())
    values = list(A.values())
    plt.figure(figsize=(12, 4))
    plt.barh(range(len(values)), values, color='navy')
    plt.xlabel(xlab, size=size[0])
    plt.ylabel(ylab, size=size[1])
    plt.title(title, size=size[2])
    plt.yticks(np.arange(len(labels)), labels, size=size[3])
    for v, u in enumerate(values):
        # Print each value, rounded to 4 decimals, at the end of its bar.
        plt.text(u, v, str(round(u, 4)), va='center', color='navy', size=size[4])
    plt.show()

BarPlot(A,'NMSE','Model','Normalized MSE for 5 Models')

import pandas as pd
import numpy as npw=pd.read_csv("DNA.csv")
X=w.iloc[:,:-1];y=w.iloc[:,-1];n=len(y)
y=pd.get_dummies(y).dot(np.arange(1,4))

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifiernames = ["Bagging", "Linear SVM", "RBF SVM", "Decision Tree","Random Forest", "AdaBoost", "Naive Bayes",'HGboost']
classifiers = [BaggingClassifier(n_estimators=100,random_state=1010),SVC(kernel="linear", C=0.025,random_state=0),SVC(gamma='auto', C=1,random_state=0),DecisionTreeClassifier(max_depth=5,random_state=0),RandomForestClassifier(n_estimators=500,random_state=0),AdaBoostClassifier(n_estimators=100,random_state=0),GaussianNB(),HistGradientBoostingClassifier(random_state=0)]CLS=dict(zip(names,classifiers))

def Fold(u, Z=10, seed=8888):
    """Assign fold labels 0 .. Z-1 separately within each class of ``u``.

    Parameters
    ----------
    u : array-like
        Class label of every observation; flattened to 1-D.
    Z : int, default 10
        Number of folds.
    seed : int, default 8888
        Seed passed to ``np.random.seed``.  The global NumPy random state is
        reseeded as a side effect.

    Returns
    -------
    numpy.ndarray
        Fold label of each observation, in the original order of ``u``.
    """
    u = np.array(u).reshape(-1)
    positions = np.arange(len(u))  # avoids shadowing the builtin id()
    zid = []
    ID = []
    np.random.seed(seed)
    for cls in np.unique(u):
        member = u == cls
        n = int(np.count_nonzero(member))
        ID.extend(positions[member])
        # Shuffle a pool of repeated fold labels, then keep the first n.
        k = list(range(Z)) * int(n / Z + 1)
        np.random.shuffle(k)
        zid.extend(k[:n])
    zid = np.array(zid)
    ID = np.array(ID)
    # Labels were collected class by class; put them back in input order.
    zid = zid[np.argsort(ID)]
    return zid

from datetime import datetime  # datetime.now() is used for the progress output

Z = 10
Zid = Fold(y, Z=Z, seed=8888)
YCPred = dict()
for i in CLS:  # i is the classifier name (a key of CLS)
    print(i, '\n', datetime.now())
    Y_pred = np.zeros(len(y))
    for j in range(Z):
        clf = CLS[i]
        # Fit on every fold except j, predict fold j.
        clf.fit(X[Zid != j], y[Zid != j])
        Y_pred[Zid == j] = clf.predict(X[Zid == j])
    YCPred[i] = Y_pred
print(datetime.now())
R = pd.DataFrame(YCPred)

A=dict()
for i in CLS:A[i]=np.mean(y!=R[i])

BarPlot(A,'Error rate','Model','Error rates of 8 models')

def ClaCV(X, y, CLS, Z=10, seed=8888, trace=True):
    """Cross-validate several classifiers on the same class-wise folds.

    Parameters
    ----------
    X, y : predictors and class labels; both are indexed with boolean fold masks.
    CLS : dict
        Maps a model name to an object with ``fit`` and ``predict``.
    Z : int, default 10
        Number of folds.
    seed : int, default 8888
        Seed handed to ``Fold``.
    trace : bool, default True
        Print each model name with a timestamp when it starts, and a final
        timestamp when all models are done.

    Returns
    -------
    R : pandas.DataFrame
        One column of out-of-fold predictions per model.
    A : dict
        Model name -> share of observations whose prediction differs from ``y``.
    """
    from datetime import datetime

    n = len(y)
    Zid = Fold(y, Z, seed=seed)
    YCPred = dict()
    A = dict()
    for name, clf in CLS.items():
        if trace:
            print(name, '\n', datetime.now())
        Y_pred = np.zeros(n)
        for j in range(Z):
            # Fit on every fold except j, predict fold j.
            clf.fit(X[Zid != j], y[Zid != j])
            Y_pred[Zid == j] = clf.predict(X[Zid == j])
        YCPred[name] = Y_pred
        A[name] = np.mean(y != Y_pred)
    if trace:
        print(datetime.now())
    R = pd.DataFrame(YCPred)
    return R, A

R,A=ClaCV(X,y,CLS)

import pandas as pd
import numpy as np
w=pd.read_csv('diamonds.csv') u=Dum(w)
u1=Dum(w,drop=True)
y=w.price
n=len(y)
X=u.copy();del X['price']
X1=u1.copy();del X1['price']Z=10
zid=Rfold(n,Z,1010)

from sklearn.linear_model import LinearRegression
lm=LinearRegression()lm_pred=np.zeros(n)
for j in range(Z):lm.fit(X1[zid!=j],y[zid!=j])lm_pred[zid==j]=lm.predict(X1[zid==j])
lm_NMSE=((y-lm_pred)**2).sum()/np.sum((y-y.mean())**2)
lm_NMSE

def SRCV(X, y, REG, Z=10, seed=1010):
    """Cross-validate a single regressor.

    Parameters
    ----------
    X, y : predictors and response; both are indexed with boolean fold masks,
        and ``y`` must provide ``.mean()``.
    REG : object with ``fit`` and ``predict``.
    Z : int, default 10
        Number of folds.
    seed : int, default 1010
        Seed handed to ``Rfold``.

    Returns
    -------
    NMSE : float
        Sum of squared prediction errors divided by the total sum of squares
        of ``y`` about its mean.
    pred : numpy.ndarray
        Out-of-fold prediction for every observation.
    """
    n = len(y)
    zid = Rfold(n, Z, seed)
    pred = np.zeros(n)
    for j in range(Z):
        held_out = zid == j
        # Fit on every fold except j, predict fold j.
        REG.fit(X[~held_out], y[~held_out])
        pred[held_out] = REG.predict(X[held_out])
    NMSE = ((y - pred) ** 2).sum() / np.sum((y - y.mean()) ** 2)
    return NMSE, pred

NMSE, pred=SRCV(X1,y,lm);print(NMSE)

lm.fit(X1,y)
print('Coef:\n',lm.coef_,'\nIntercept =',lm.intercept_)

df=pd.read_csv('commun123.csv')
df_y=df['ViolentCrimesPerPop'];df_X=df.iloc[:,:-1]
LM=LinearRegression(fit_intercept=False, normalize=False)M_coef=LM.fit(df_X,df_y).coef_
S_coef=[]
for i in range(df_X.shape[1]):S_coef.extend(LM.fit(np.array(df_X.iloc[:,i]).reshape(-1,1),df_y).coef_)
S_coef=np.array(S_coef)plt.style.use('ggplot')
n = 122
fig, ax = plt.subplots(figsize=(18,6))
index = np.arange(n)
bar_width = 0.35
opacity = 0.9
ax.bar(index, M_coef, bar_width, alpha=opacity, color='r',\label='Coefficients of multiple regressions')
ax.bar(index+bar_width, S_coef, bar_width, alpha=opacity, color='b',\label='Coefficients of univariate regressions')
ax.set_xlabel('Covariates')
ax.set_ylabel('Coefficients')
ax.set_title('Coefficient comparison between multiple and univariate regression\
without constant term')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(df_X.columns,rotation=90)
ax.legend(loc='upper left')
plt.show()

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
w=pd.read_csv('trans.csv')X=w.iloc[:,:3]
y=w['Donate']
n=len(y);Z=10
Zid=Fold(y,Z=10,seed=1010)

from sklearn.metrics import confusion_matrix

pred = np.zeros(len(y))
clf = LogisticRegression(solver='lbfgs')
for j in range(Z):
    # Fit on every fold except j, predict fold j.
    clf.fit(X[Zid != j], y[Zid != j])
    pred[Zid == j] = clf.predict(X[Zid == j])
print("confusion matrix:\n", confusion_matrix(y, pred))
print('error rate =', np.mean(y != pred))

def SCCV(X, y, CLS, Z=10, seed=1010):
    """Cross-validate a single classifier.

    Parameters
    ----------
    X, y : predictors and class labels; both are indexed with boolean fold
        masks.  Predictions are stored in a float array, so the labels must
        be numeric.
    CLS : object with ``fit`` and ``predict``.
    Z : int, default 10
        Number of folds.
    seed : int, default 1010
        Seed handed to ``Fold``.

    Returns
    -------
    error : float
        Share of observations whose out-of-fold prediction differs from ``y``.
    pred : numpy.ndarray
        Out-of-fold prediction for every observation.
    """
    n = len(y)
    Zid = Fold(y, Z, seed)
    pred = np.zeros(n)
    for j in range(Z):
        held_out = Zid == j
        # Fit on every fold except j, predict fold j.
        CLS.fit(X[~held_out], y[~held_out])
        pred[held_out] = CLS.predict(X[held_out])
    error = np.mean(y != pred)
    return error, pred

error, pred=SCCV(X,y,clf);print(error)

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz w=pd.read_csv("mushroom.csv")
w.columns

X=pd.get_dummies(w.iloc[:,1:],drop_first=False)
X.columns

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(w['type'])
y=le.transform(w['type'])
print('original levels =',le.classes_,'\nafter transform: y =',y)
print('inverse transform back =',le.inverse_transform(y))

clf=DecisionTreeClassifier(random_state=0, max_depth=None) #'gini'准则
clf=clf.fit(X,y)
dot_data=tree.export_graphviz(clf,out_file=None, feature_names = X.columns,rounded=True, filled=True)
graph = graphviz.Source(dot_data)
graph.render("mushroom") #输出图到mushroom.pdf文件
graph #显示图

clf=DecisionTreeClassifier(random_state=0, max_depth=None)
error, pred=SCCV(X,y,clf)
print('confusion matrix:\n',confusion_matrix(y, pred))
print('error rate = ', error)

from sklearn.model_selection import cross_val_score
clf=DecisionTreeClassifier(random_state=0, max_depth=None)#'gini'准则
cross_val_score(clf, X, y, cv=10)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
import graphviz w=pd.read_csv('Boston.csv')
y=w.MEDV
n=len(y)
X=w.iloc[:,:-1]

reg = DecisionTreeRegressor(random_state=0,max_depth=2)
reg=reg.fit(X,y)
dot_data=tree.export_graphviz(reg,out_file=None, feature_names = X.columns,rounded=True, filled=True)
graph = graphviz.Source(dot_data)
graph.render("Bostontree") #输出图到Bostontree.pdf文件
graph #显示图

print(((y[w.RM <= 6.941]-y[w.RM <= 6.941].mean())**2).mean(),
((y[w.RM > 6.941]-y[w.RM > 6.941].mean())**2).mean())

reg = DecisionTreeRegressor(random_state=0)
NMSE, pred=SRCV(X,y,reg,seed=1010);print(NMSE)

from sklearn.model_selection import cross_val_score
reg = DecisionTreeRegressor(random_state=0)
cross_val_score(reg, X, y, cv=10)

import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegressionw=pd.read_csv('diamonds.csv')
u=Dum(w)y=w.price
n=len(y)
X=u.copy();del X['price']

names = ["Bagging", "Random Forest", "HGboost",'Linear Model']
regressors = [BaggingRegressor(n_estimators=100,random_state=1010),RandomForestRegressor(n_estimators=500,random_state=1010),HistGradientBoostingRegressor(random_state=1010),LinearRegression()]
REG=dict(zip(names,regressors))

R,A=RegCV(X,y,REG,seed=1010)
xlab='NMSE'
ylab='Model'
title='Normalized MSE for 4 Models'
BarPlot(A,xlab,ylab,title)

import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifierv= pd.read_csv('pendigits.csv',index_col=False)
X=v[v.columns[:16]]#自变量
y=v[v.columns[16]]#因变量names = ["Bagging", "Random Forest", "HGBoost"]
classifiers = [BaggingClassifier(n_estimators=100,random_state=1010),RandomForestClassifier(n_estimators=500,random_state=1010),HistGradientBoostingClassifier(random_state=1010)]
CLS=dict(zip(names,classifiers))R,A=ClaCV(X,y,CLS,seed=1010)xlab='Error rate';ylab='Model';title='Error rate for 3 models'
BarPlot(A,xlab,ylab,title)

import pandas as pd
import numpy as np
w = pd.read_csv('pima.csv')
X=w.iloc[:,:-1]
y=w.iloc[:,-1]from sklearn.neural_network import MLPClassifier
CLS=MLPClassifier()error,pred=SCCV(X,y,CLS,seed=1010)from sklearn.metrics import confusion_matrix
print("confusion matrix:\n",confusion_matrix(y, pred))
print('error rate =',error)

import pandas as pd
import numpy as npw=pd.read_csv('Boston.csv')
y=w.MEDV
X=w.iloc[:,:-1]from sklearn.neural_network import MLPRegressor
REG=MLPRegressor(max_iter=1000)
NMSE, pred=SRCV(X,y,REG)NMSE

import pandas as pd
import numpy as np
w = pd.read_csv('pima.csv')
X=w.iloc[:,:-1]
y=w.iloc[:,-1]from sklearn.neighbors import KNeighborsClassifier
CLS=KNeighborsClassifier(n_neighbors=50)error,pred=SCCV(X,y,CLS,seed=1010)
from sklearn.metrics import confusion_matrix
print("confusion matrix:\n",confusion_matrix(y, pred))
print('error rate =',error)

import pandas as pd
import numpy as npw=pd.read_csv('Boston.csv')
y=w.MEDV
X=w.iloc[:,:-1]from sklearn.neighbors import KNeighborsRegressor
REG=KNeighborsRegressor(n_neighbors=3)
NMSE, pred=SRCV(X,y,REG)NMSE

import pandas as pd
import numpy as np
w = pd.read_csv('pima.csv')
X=w.iloc[:,:-1]
y=w.iloc[:,-1]from sklearn.svm import SVC
names=['Linear SVM','RBF SVM']
Cls=[SVC(kernel="linear", C=0.025,random_state=0),
SVC(gamma='auto', C=1,random_state=0)]
CLS=dict(zip(names,Cls))R,A=ClaCV(X,y,CLS, seed=8888)xlab='Error Rate';ylab='Model';title='Error rate of 2 SVM models'
BarPlot(A,xlab,ylab,title)

import pandas as pd
import numpy as np
w = pd.read_csv('pima.csv')
X=w.iloc[:,:-1]
y=w.iloc[:,-1]from sklearn.naive_bayes import GaussianNB
CLS = GaussianNB()error,pred=SCCV(X,y,CLS,seed=1010)
from sklearn.metrics import confusion_matrix
print("confusion matrix:\n",confusion_matrix(y, pred))
print('error rate =',error)

Python —— 数据科学的手段 (第2版)配套代码相关推荐

《Python数据科学实践指南》——0.2节如何成为数据科学家
本节书摘来自华章社区<Python数据科学实践指南>一书中的第0章,第0.2节如何成为数据科学家,作者纪路,更多章节内容可以访问云栖社区"华章社区"公众号查看 0.2 ...
python 数据科学书籍_您必须在2020年阅读的数据科学书籍
python 数据科学书籍 "We're entering a new world in which data may be more important than software.&qu ...
python书籍推荐：Python数据科学手册
所属网站分类: 资源下载 > python电子书作者:today 链接:http://www.pythonheidong.com/blog/article/448/ 来源:python黑洞网 ...
Python 数据科学入门教程：机器学习：回归
Python 数据科学入门教程:机器学习:回归原文:Regression - Intro and Data 译者:飞龙协议:CC BY-NC-SA 4.0 引言和数据欢迎阅读 Python 机器 ...
使用python构建向量空间_使用Docker构建Python数据科学容器
人工智能(AI)和机器学习(ML)最近真的火了,并驱动了从自动驾驶汽车到药物发现等等应用领域的快速发展.AI和ML的前途一片光明. 另一方面,Docker通过引入临时轻量级容器彻底改变了计算世界.通过 ...
visio 科学图形包_【数据科学的python系列3】Python数据科学环境设置
1. 用Python搭建数据科学环境今天,在本篇Python数据科学教程中,我们将看到Python的数据科学环境设置.此外,我们将告诉你数据科学环境设置需要安装的所有内容,如Python.Anaco ...
Python数据科学学习进阶
Python菜鸟到Python Kaggler 如果你梦想成为一名数据科学家,或者已然是数据科学家的你想扩展自己的工具库,那么,你找对地方啦.本文旨在为做数据分析的Python人提供一条全方位的学习之 ...
python数据科学实践常象宇_python数据科学
Python语言拥有大量可用于存储.操作和洞察数据的程序库,已然成为深受数据科学研究人员推崇的工具.本书以IPython.NumPy.Pandas.Matplotlib和Scikit-Learn这5个 ...
python数据科学手册pdf中文版百度云_书籍推荐：《Python数据科学手册》（高清官方中文版PDF）...
内容简介: 本书是对以数据深度需求为中央的科学.研究以及针对盘算和统计方式的参考书.本书共五章,每章先容一到两个Python数据科学中的重点工具包.首先从IPython和Jupyter最先,它们提供了 ...
Mozilla将Python数据科学带入浏览器
Mozilla的实验性Pyodide项目通过将Python数据科学堆栈编译为WebAssembly ,将其带入Web浏览器. Pyodide是Python的Numpy科学计算库,Pandas数据分析库 ...

Python —— 数据科学的手段 (第2版)配套代码

第一部分

第1章

第2章

第3章

第二部分

第4章

第5章

第6章

第7章

第三部分

第8章

第9章

第四部分

第10章

第11章

Python —— 数据科学的手段 (第2版)配套代码相关推荐

最新文章

热门文章