Sorry, your browser cannot access this site
This page requires browser support (enable) JavaScript
Learn more >

first.py

机器学习的 Hello World:用暴力搜索(遍历参数网格)的方法对线性模型 y=kx+b(代码中斜率 k 记作 w)进行拟合,拟合的结果如下:

暴力拟合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from matplotlib import pyplot as plot
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# Linear model, solved by brute force.
# Step size of the (w, b) grid that the search walks over.
拟合精度 = 0.1

# Training samples: inputs 1.0 .. 10.0 and their observed targets.
standard_x = [float(value) for value in range(1, 11)]
standard_y = [
    3.0, 5.0, 6.0, 8.0, 11.0,
    13.0, 14.0, 16.0, 19.0, 22.0,
]

def forward(w, b, x):
    """Evaluate the linear model ``w * x + b``.

    Args:
        w: Slope of the line.
        b: Intercept of the line.
        x: Input value.

    Returns:
        The prediction of the model for ``x``.
    """
    return w * x + b

def loss(w, b, x, y):
    """Return the squared error of the linear model on one sample.

    Args:
        w: Slope of the line.
        b: Intercept of the line.
        x: Input value of the sample.
        y: Observed target of the sample.

    Returns:
        ``(y - forward(w, b, x)) ** 2``.
    """
    y_hat = forward(w, b, x)
    return (y - y_hat) ** 2

# Brute-force search: evaluate the mean squared error of every (w, b) pair on
# a regular grid, keep the best pair, and plot both the error surface and the
# fitted line.
mse_list = []  # z axis: mean squared error of each (w, b) pair
w_list = []    # never filled; kept so the module still exposes the name
b_list = []    # y axis: intercept b of each pair
x_list = []    # x axis: slope w of each pair
fig = plot.figure()
# Axes3D(fig) no longer attaches itself to the figure in current Matplotlib
# releases, which leaves the 3D plot empty; ask the figure for the axes.
ax = fig.add_subplot(projection='3d')
ax.set_xlabel('w')
ax.set_ylabel('b')
ax.set_zlabel('MSE')

grid = np.arange(-8, 8.1, 拟合精度)  # the same range is scanned for w and b
for w in grid:
    for b in grid:
        b_list.append(b)
        x_list.append(w)
        errSum = sum(loss(w, b, x, y) for x, y in zip(standard_x, standard_y))
        mse_list.append(errSum / len(standard_y))  # mean error of this pair

Z = np.array(mse_list)
# argmin returns the first minimum, matching list.index(min(...)), and the
# position is looked up once instead of once per parameter.
best = int(np.argmin(Z))
minMSE = mse_list[best]
print("Minimum MSE:" + str(minMSE))  # the best model
w = x_list[best]
b = b_list[best]
print("最优w:" + str(w))
print("最优b:" + str(b))

pred_y = []
for x, y in zip(standard_x, standard_y):  # visualise the best model
    pred_y_ = forward(w, b, x)
    print("拟合结果:", pred_y_, y)
    pred_y.append(pred_y_)

plot.figure()
plot.plot(standard_x, standard_y)
plot.plot(standard_x, pred_y)
plot.legend(["standard", "predict"])
ax.scatter(x_list, b_list, Z)
plot.show()

backpropagation.py

用反向传播求梯度、再用梯度下降的方法手动更新参数进行拟合,减小参数拟合的时间复杂度,拟合的结果如下:

反向传播

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import torch
from matplotlib import pyplot as plot

# Linear model, solved with backpropagation and steepest descent.
standard_x = [float(value) for value in range(1, 11)]
standard_y = [
    3.0, 5.0, 6.0, 8.0, 11.0,
    13.0, 14.0, 16.0, 19.0, 22.0,
]
# Trainable slope and intercept, both starting at 1.0 and tracked by autograd.
w = torch.tensor([1.0], requires_grad=True)
b = torch.tensor([1.0], requires_grad=True)
def forward(w, b, x):
    """Evaluate the linear model ``w * x + b``.

    Args:
        w: Slope; a tensor during training, a plain number when plotting.
        b: Intercept; same kind of value as ``w``.
        x: Input value.

    Returns:
        The prediction of the model for ``x``.
    """
    return w * x + b
def loss(w, b, x, y):
    """Return the squared error of the linear model on one sample.

    Args:
        w: Slope of the line.
        b: Intercept of the line.
        x: Input value of the sample.
        y: Observed target of the sample.

    Returns:
        ``(y - forward(w, b, x)) ** 2``.
    """
    y_hat = forward(w, b, x)
    return (y - y_hat) ** 2


# Stochastic gradient descent written by hand: one parameter update per sample.
learning_rate = 0.01
for epoch in range(100):
    for x, y in zip(standard_x, standard_y):
        l = loss(w, b, x, y)  # forward pass
        l.backward()          # backpropagation fills w.grad and b.grad
        # Update the parameters in place with autograd recording switched off,
        # rather than rebinding the legacy ``.data`` attribute.
        with torch.no_grad():
            w -= learning_rate * w.grad
            b -= learning_rate * b.grad
        # Reset the gradients; PyTorch accumulates them otherwise.
        w.grad.zero_()
        b.grad.zero_()
    # Loss of the last sample seen in this epoch.
    print("progress:\teproch", epoch, "loss", l.item())
print("done!")
print("w =", w.item(), "b =", b.item())

pred_y = []
for x, y in zip(standard_x, standard_y):
    pred_y_ = forward(w.item(), b.item(), x)
    print("拟合结果:", pred_y_, y)
    pred_y.append(pred_y_)

plot.figure()
plot.plot(standard_x, standard_y)
plot.plot(standard_x, pred_y)
plot.legend(["standard", "predict"])
plot.show()

PytorchLinearModel.py

用Pytorch库函数提供的反向传播+求导梯度下降的方法进行参数自动优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import torch
from matplotlib import pyplot as plot

# Training samples stored as (10, 1) column tensors: one row per sample.
standard_x = torch.tensor(
    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
).unsqueeze(1)
standard_y = torch.tensor(
    [3.0, 5.0, 6.0, 8.0, 11.0, 13.0, 14.0, 16.0, 19.0, 22.0]
).unsqueeze(1)
class LinearModel(torch.nn.Module):
    """Single-feature linear regression: one ``Linear(1, 1)`` layer."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        """Return the prediction of the linear layer for ``x``.

        Args:
            x: Input tensor whose last dimension has size 1.

        Returns:
            Tensor with the same leading shape and a last dimension of 1.
        """
        return self.linear(x)


# Full-batch gradient descent using PyTorch's built-in loss and optimizer.
model = LinearModel()
# ``size_average`` is deprecated; ``reduction='mean'`` is its replacement and
# averages the squared error over all elements in the same way.
criteria = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # steepest descent
for epoch in range(100):
    y_pred = model(standard_x)            # forward pass: predictions
    loss = criteria(y_pred, standard_y)   # forward pass: loss
    optimizer.zero_grad()                 # reset so gradients do not accumulate
    loss.backward()                       # backpropagation
    optimizer.step()                      # update the parameters
    print("epoch:", epoch, "loss:", loss.item())
print("done!")
print("w =", model.linear.weight.item(), "b =", model.linear.bias.item())

# Visualise the fitted model; no graph is needed for plotting.
with torch.no_grad():
    predict_y = model(standard_x)
plot.figure()
plot.plot(standard_x.detach().numpy(), standard_y.detach().numpy())
plot.plot(standard_x.detach().numpy(), predict_y.detach().numpy())
plot.legend(["standard", "predict"])
plot.show()

LogisitcModel.py

逻辑斯蒂回归模型,用于分类,该模型用于给糖尿病数据集进行分类

常用的激活函数有ReLU、Tanh、Sigmoid

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import torch
import numpy
from matplotlib import pyplot as plot

class LogisticModel(torch.nn.Module):
    """Six-layer binary classifier: 8 inputs narrowed down to 1 probability.

    Every hidden layer is followed by Tanh; the output layer is followed by a
    sigmoid so the result lies strictly between 0 and 1.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(8, 7)
        self.linear2 = torch.nn.Linear(7, 6)
        self.linear3 = torch.nn.Linear(6, 5)
        self.linear4 = torch.nn.Linear(5, 4)
        self.linear5 = torch.nn.Linear(4, 2)
        self.linear6 = torch.nn.Linear(2, 1)
        self.activate = torch.nn.Tanh()  # activation for the hidden layers

    def forward(self, x):
        """Return the predicted probability for each row of ``x``.

        Args:
            x: Tensor whose last dimension has size 8.

        Returns:
            Tensor with a last dimension of 1 holding values in (0, 1).
        """
        # The activation after every layer adds the non-linearity.
        hidden_layers = (self.linear1, self.linear2, self.linear3,
                         self.linear4, self.linear5)
        for layer in hidden_layers:
            x = self.activate(layer(x))
        # Functional sigmoid: avoids constructing a Sigmoid module per call.
        return torch.sigmoid(self.linear6(x))


model = LogisticModel()
dataSet = numpy.loadtxt("diabetes.csv", delimiter=',', dtype=numpy.float32)

# The last column is the label; indexing with [-1] keeps it as a column.
input_Y = torch.from_numpy(dataSet[:, [-1]])
criteria = torch.nn.BCELoss(reduction='mean')  # binary cross entropy
optimizer = torch.optim.SGD(model.parameters(), lr=0.08)  # gradient descent

# Min-max normalisation of every feature column, done in one vectorised step
# instead of writing element by element through views that alias dataSet.
features = dataSet[:, :-1]
column_min = features.min(axis=0)
column_span = features.max(axis=0) - column_min
column_span[column_span == 0] = 1  # constant column: avoid 0/0 -> NaN
data_X = (features - column_min) / column_span
input_X = torch.from_numpy(data_X)  # train on the normalised data

for epoch in range(15000):
    pred_Y = model(input_X)
    # Step 1: compute the loss.
    loss = criteria(pred_Y, input_Y)
    # Step 2: reset the gradients, then backpropagate.
    optimizer.zero_grad()
    loss.backward()
    # Step 3: update the parameters.
    optimizer.step()
    print("epoch:", epoch, "loss:", loss.item())

# Final predictions; no graph is needed for plotting and scoring.
with torch.no_grad():
    pred_Y = model(input_X)
X_Line = list(range(len(dataSet)))  # sample index used as the x axis
plot.figure()
plot.plot(X_Line, input_Y.detach().numpy())
plot.plot(X_Line, pred_Y.detach().numpy())
plot.legend(["standard", "predict"])

# Score the predictions: a probability above 0.65 counts as class 1.
Y_check = dataSet[:, -1]
correctNumber = 0
wrongNumber = 0
for probability, truth in zip(pred_Y.detach().numpy().ravel(), Y_check):
    label = 1 if probability > 0.65 else 0
    print(str(label), truth)
    if truth == label:
        correctNumber += 1
    else:
        wrongNumber += 1

print("预测正确数量:",correctNumber,"预测错误数量",wrongNumber,"正确率:",correctNumber*100/(correctNumber+wrongNumber),"%")

plot.show()

TitanicSurviverPredict.py

这是Kaggle上的比赛(Titanic 生存预测),只拿了55分;最难做的是数据处理

相比于之前的模型的改进是:能自动筛选在训练集上表现最好的模型,但是无法筛掉过拟合的模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import torch
import numpy
from torch.utils.data import Dataset
import csv
from torch.utils.data import DataLoader
import copy
from matplotlib import pyplot as plot
#torch.Tensor.view()
class TitanicDataset(Dataset):
    """Titanic passengers loaded from a CSV file as normalised feature rows.

    Each passenger becomes a 10-element float32 vector in the column order of
    ``_FEATURES``. Text columns are replaced by the index of first appearance
    of their value in this file, empty ``Age``/``Fare`` cells become 0, and
    every column is then min-max scaled to [0, 1].

    NOTE(review): the encodings and the scaling are derived from each file
    separately, so the same text value can map to different numbers in the
    training and the test set.

    Attributes are ``X`` (features), ``Y`` (labels, only when ``isTest`` is
    false) and ``isTest``.
    """

    # Text columns that are replaced by integer codes.
    _CATEGORICAL = ("Name", "Sex", "Ticket", "Cabin", "Embarked")
    # Columns that may be empty and are then read as 0.
    _BLANK_AS_ZERO = ("Age", "Fare")
    # Order of the values inside one feature vector.
    _FEATURES = ("Pclass", "Name", "Sex", "Age", "SibSp", "Parch",
                 "Ticket", "Fare", "Cabin", "Embarked")

    def __init__(self, path, isTest):
        """Read ``path`` and build the feature (and label) tensors.

        Args:
            path: Path of a CSV file with a header row containing the
                ``_FEATURES`` columns, plus ``Survived`` unless ``isTest``.
            isTest: When true no labels are read and items are features only.

        Raises:
            KeyError: If a required column is missing.
            ValueError: If a numeric cell cannot be converted to a float.
        """
        self.isTest = isTest
        # One value -> code dictionary per text column; dictionary lookups
        # replace the linear ``in`` / ``index`` scans over growing lists.
        codes = {column: {} for column in self._CATEGORICAL}
        data_X = []
        data_Y = []
        # newline="" is what the csv module expects from the file object.
        with open(path, 'r', encoding="UTF-8", newline="") as file:
            for row in csv.DictReader(file):
                features = []
                for column in self._FEATURES:
                    value = row[column]
                    if column in codes:
                        mapping = codes[column]
                        # A new value gets the next free code; a known value
                        # keeps the code of its first appearance.
                        value = mapping.setdefault(value, len(mapping))
                    elif value == "" and column in self._BLANK_AS_ZERO:
                        value = 0  # avoid empty cells
                    features.append(numpy.float32(value))
                data_X.append(features)
                if not isTest:
                    data_Y.append([numpy.float32(row["Survived"])])

        matrix = numpy.array(data_X, dtype=numpy.float32)
        matrix = matrix.reshape(-1, len(self._FEATURES))  # also fixes 0 rows
        if len(matrix):
            # Min-max normalisation of every column.
            column_min = matrix.min(axis=0)
            column_span = matrix.max(axis=0) - column_min
            column_span[column_span == 0] = 1  # constant column: no 0/0 NaN
            matrix = (matrix - column_min) / column_span
        self.X = torch.from_numpy(matrix)
        if not isTest:
            labels = numpy.array(data_Y, dtype=numpy.float32).reshape(-1, 1)
            self.Y = torch.from_numpy(labels)

    def __getitem__(self, item):
        """Return ``(features, label)``, or only ``features`` in test mode."""
        if self.isTest:
            return self.X[item]
        return self.X[item], self.Y[item]

    def __len__(self):
        """Return the number of passengers."""
        return len(self.X)

class LogisticModel(torch.nn.Module):
    """Six-layer binary classifier: 10 inputs narrowed down to 1 probability.

    Hidden layers use ReLU and the output layer uses a sigmoid. The model also
    carries a ``loss`` value (initially 1000) that the training script uses to
    remember how well this particular copy performed.
    """

    def __init__(self):
        super().__init__()
        self.loss = 1000  # placeholder until the first real loss is recorded
        self.linear1 = torch.nn.Linear(10, 8)
        self.linear2 = torch.nn.Linear(8, 6)
        self.linear3 = torch.nn.Linear(6, 5)
        self.linear4 = torch.nn.Linear(5, 4)
        self.linear5 = torch.nn.Linear(4, 2)
        self.linear6 = torch.nn.Linear(2, 1)
        self.activate = torch.nn.ReLU()  # activation for the hidden layers

    def forward(self, x):
        """Return the predicted probability for each row of ``x``.

        Args:
            x: Tensor whose last dimension has size 10.

        Returns:
            Tensor with a last dimension of 1 holding values in [0, 1].
        """
        # The activation after every layer adds the non-linearity.
        hidden_layers = (self.linear1, self.linear2, self.linear3,
                         self.linear4, self.linear5)
        for layer in hidden_layers:
            x = self.activate(layer(x))
        # Functional sigmoid: avoids constructing a Sigmoid module per call.
        return torch.sigmoid(self.linear6(x))

    def setloss(self, loss):
        """Record ``loss`` as the loss associated with this model."""
        self.loss = loss

    def getloss(self):
        """Return the loss most recently recorded with ``setloss``."""
        return self.loss


model = LogisticModel()
dataset_train = TitanicDataset("titanic_train.csv", isTest=False)
dataset_test = TitanicDataset("test.csv", isTest=True)
criteria = torch.nn.BCELoss(reduction="mean")  # binary cross entropy, averaged
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # steepest descent
train_loader = DataLoader(dataset=dataset_train, batch_size=32, shuffle=True)
# The test set must keep the file order: PassengerIds are assigned to the
# predictions sequentially below, so shuffling would pair each prediction
# with the wrong passenger.
test_loader = DataLoader(dataset=dataset_test, batch_size=32, shuffle=False)

bestmodel = copy.deepcopy(model)
for epoch in range(3000):
    for x, y in train_loader:
        y_hat = model(x)
        loss = criteria(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Keep a copy of the model whenever the epoch ends with a lower loss.
    # NOTE(review): the value compared is the loss of the last mini-batch of
    # the epoch, not an average over the epoch.
    model.setloss(loss.item())
    if bestmodel.getloss() > model.getloss():
        bestmodel = copy.deepcopy(model)
    print("epoch:", epoch, "loss:", loss.item())
model = bestmodel
print("表现最好的模型:", "loss", model.getloss())

# Predict the test set batch by batch; no gradients are needed here.
result = []
with torch.no_grad():
    for x in test_loader:
        y_hat = model(x)
        result.append(y_hat.detach().numpy())

# presumably 892 is the first PassengerId of test.csv — TODO confirm
index = 892
head = ("PassengerId", "Survived")
output = []
for batch in result:
    for probability in batch:
        # A probability above 0.65 counts as "survived".
        output.append({head[1]: 1 if probability > 0.65 else 0,
                       head[0]: index})
        index += 1
print(output)
# newline='' stops the csv module from emitting blank rows on Windows.
with open("output.csv", 'w', encoding='UTF-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=head)
    writer.writeheader()
    writer.writerows(output)

HandWritingNumbers.py

多输入多输出的分类模型(784 个像素输入、10 个类别输出,损失函数为交叉熵)。对训练好的模型进行持久化,多线程结合socket与Java上的SpringBoot后端连接,Android应用作为前端,接收用户绘制的图像,模型返回预测值。不过安卓前端和SpringBoot的Bug还很多,主要是一直在优化对IO的使用,导致项目到现在还没完成。

单看模型,能达到大概97%的正确率

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import torch
import socket
import numpy as np
import cv2
import time
from threading import Thread # 多线程实现Socket通信
# Under high concurrency, TCP "sticky packets" (coalesced messages) are a pain.
# The socket only wakes the program up to read a file, although doing it this
# way costs some disk I/O.
train_new_model, should_i_test = False, False
host, port = 'localhost', 14159
training_epoch = 13
client = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM)
datas = list()
class MNISTModel(torch.nn.Module):
    """Fully connected classifier: 784 inputs narrowed down to 10 scores.

    Hidden layers use ReLU. The last layer returns raw scores with no
    activation applied.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(784, 512)
        self.linear2 = torch.nn.Linear(512, 256)
        self.linear3 = torch.nn.Linear(256, 128)
        self.linear4 = torch.nn.Linear(128, 64)
        self.linear5 = torch.nn.Linear(64, 32)
        self.linear6 = torch.nn.Linear(32, 10)
        self.activate = torch.nn.ReLU()

    def forward(self, x):
        """Return the 10 class scores for every sample in ``x``.

        Args:
            x: Tensor whose element count is a multiple of 784; it is
                flattened to rows of 784 values.

        Returns:
            Tensor of shape ``(rows, 10)``.
        """
        x = x.view(-1, 784)
        hidden_layers = (self.linear1, self.linear2, self.linear3,
                         self.linear4, self.linear5)
        for layer in hidden_layers:
            x = self.activate(layer(x))
        return self.linear6(x)

def connect():
    """Connect the module-level ``client`` socket to ``(host, port)``.

    Keeps trying every three seconds until the connection succeeds. A socket
    whose connect attempt failed (or that was connected before) is closed and
    replaced by a new one, because such a socket cannot be relied on to
    connect again.
    """
    global client
    while True:
        try:
            # Enable keep-alive probes on the client side.
            client.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
            client.connect((host, port))
        except OSError:
            print("连接失败,三秒后重连")
            time.sleep(3)
            client.close()
            client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        else:
            print("模型已成功连接至服务器")
            return


def receive(model, datas):  # receive data and send the prediction back
    """Read one chunk from the server and answer every completed frame.

    Each non-whitespace character of the chunk is converted to a float32 and
    appended to ``datas``. The text ``OK`` marks the end of a frame: the
    buffered values are then reshaped to rows of 784, classified by ``model``
    and the predicted labels are sent back as text. Values that arrive in the
    same chunk as ``OK`` are kept instead of being dropped.

    NOTE(review): the values are passed to the model exactly as received; the
    Normalize transform used for the training data is not applied here —
    confirm that the sender already provides matching values.

    Args:
        model: Callable that maps a ``(rows, 784)`` float32 tensor to class
            scores of shape ``(rows, classes)``.
        datas: Mutable list used as the receive buffer between calls.

    Raises:
        ConnectionError: If the server closed the connection.
        ValueError: If a received character is not a number.
        RuntimeError: If a frame does not hold a multiple of 784 values.
    """
    chunk = client.recv(1024)
    if not chunk:
        # recv returns b"" once the peer has closed; raising an OSError
        # subclass lets a caller that handles socket errors reconnect instead
        # of spinning on an empty read.
        raise ConnectionError("connection closed by the server")

    text = chunk.decode()
    while True:
        payload, marker, text = text.partition("OK")
        datas.extend(np.float32(ch) for ch in payload if not ch.isspace())
        if not marker:
            return
        print("数据接收完成")
        try:
            # view(-1, 784) is what the model applies anyway, so any whole
            # number of images works instead of one hard-coded element count.
            x = torch.from_numpy(np.asarray(datas, dtype=np.float32)).view(-1, 784)
            with torch.no_grad():
                y_hat = model(x)
        finally:
            # Never leave a stale or broken frame in the shared buffer.
            datas.clear()
        _, predited = torch.max(y_hat, dim=1)
        labels = predited.detach().numpy()
        for y_pred in labels.tolist():
            print(y_pred)
        # sendall keeps writing until the whole reply has been handed over.
        client.sendall(str(labels).encode("UTF-8"))

# Dataset initialisation.
batch_size = 64
# Compose chains the listed transforms: convert the image to a tensor, then
# standardise it with the given mean and standard deviation.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

train_dataset = datasets.MNIST(
    root='../dataset/mnist', train=True, download=True, transform=transform
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = datasets.MNIST(
    root='../dataset/mnist', train=False, download=True, transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
# End of dataset initialisation.

# 模型/训练


if train_new_model:
    model = MNISTModel()
    criteria = torch.nn.CrossEntropyLoss()  # cross entropy
    # Momentum helps the optimiser get out of local minima faster.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.15)

    # Use the configured epoch count instead of a second hard-coded 13.
    for epoch in range(training_epoch):
        for batch_id, (x, y) in enumerate(train_loader):  # feed the data
            y_hat = model(x)
            loss = criteria(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:", epoch, "batch:", batch_id, "loss:", loss.item())
    torch.save(model, "HandWritingNumberModel.pkl")  # persist the model
else:
    print("加载模型...")
    # The file holds a whole pickled module, which current PyTorch refuses to
    # load unless weights_only=False is passed explicitly.
    # SECURITY: unpickling executes code from the file — only load a model
    # file that you created yourself.
    model = torch.load("HandWritingNumberModel.pkl", weights_only=False)
    print("加载模型完成!")
# Evaluation on the test set.
if should_i_test:
    total = 0
    correct = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for x, y in test_loader:
            y_hat = model(x)
            _, predited = torch.max(y_hat.data, dim=1)
            total += y.size(0)
            # Convert both tensors once per batch, not once per sample.
            for y_true, y_pred in zip(y.tolist(), predited.tolist()):
                print("真实值", y_true, "预测值", y_pred)
                if y_true == y_pred:
                    correct += 1
    print("正确率:", 100 * correct / total)
# End of evaluation.
connect()
while True:
    try:
        # Thread(target=receive(...)) ran receive() right here and then
        # started a thread with no target, so the thread never did any work.
        # Calling receive() directly keeps the behaviour that actually
        # happened and stops creating an idle thread per message.
        receive(model=model, datas=datas)
    except socket.error:
        print("模型与服务器断开连接,正在重新连接")
        connect()  # reconnect automatically

# threading.Thread(target=receive);

评论