# numpy完成手写字体识别(机器学习02)
> 参考代码:[mnielsen/neural-networks-and-deep-learning:](https://github.com/mnielsen/neural-networks-and-deep-learning)
>
> 参考讲解:[深度学习多分类任务的损失函数详解 - 知乎 (zhihu.com)](https://zhuanlan.zhihu.com/p/100921909)
>
> softmax推导:[【深度学习】:超详细的Softmax求导](https://blog.csdn.net/bqw18744018044/article/details/83120425)
## 第一题
![](https://www.writebug.com/myres/static/uploads/2022/4/27/e78763a208ada745629ca724c81d273f.writebug)
推导过程如下:
![](https://www.writebug.com/myres/static/uploads/2022/4/27/59c38d3b810be73d0b2a197b430b9465.writebug)
![](https://www.writebug.com/myres/static/uploads/2022/4/27/296ae12ef5e198408a6c703acede51b6.writebug)
![](https://www.writebug.com/myres/static/uploads/2022/4/27/ddfeb1d7b11d1256189850c63d73c532.writebug)
![](https://www.writebug.com/myres/static/uploads/2022/4/27/aca3bc6af156c1366ccba9fe397be8d6.writebug)
![](https://www.writebug.com/myres/static/uploads/2022/4/27/1a3d225025bc4e3b203c371ef49c3679.writebug)
![](https://www.writebug.com/myres/static/uploads/2022/4/27/24e8aa3a721a78ee079b77b77dff477b.writebug)
## 第二题
![](https://www.writebug.com/myres/static/uploads/2022/4/27/951d86468c6c1351cb4dcb43b0a385c9.writebug)
### 数据预处理
首先拿到的数据是经过处理的二进制文件,需要先对数据进行预处理。原始数据分为训练集、验证集和测试集三部分,大小分别是 50000、10000、10000 条;每张手写字体图像需要做归一化处理(原始文件中已经做好了归一化),这里只需要将二维的图片向量展平成一维数据,然后添加对应的 one-hot 标签即可,具体的代码如下:
```python
import pickle
import gzip
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
# Unpack the gzipped pickle archive holding the three dataset splits.
def load_data():
    """Load the raw MNIST splits from data/mnist.pkl.gz.

    Returns:
        (train_data, val_data, test_data) exactly as stored in the pickle;
        each split is a (images, labels) pair. Per the surrounding notes,
        pixel values are already normalized to [0, 1].
    """
    # `with` guarantees the file is closed even if unpickling raises
    # (the original leaked the handle on error).
    with gzip.open('data/mnist.pkl.gz', 'rb') as f:
        train_data, val_data, test_data = pickle.load(f, encoding='bytes')
    return train_data, val_data, test_data
# Reshape the raw image matrices and pair each one with its one-hot label.
def raw_data_preprocess():
    """Return the three splits as lists of (784x1 input, 10x1 one-hot label)."""
    raw_train, raw_val, raw_test = load_data()

    def _to_samples(split, size_msg):
        # Flatten every image into a column vector and one-hot encode its label.
        inputs = [np.reshape(img, (784, 1)) for img in split[0]]
        labels = [one_hot(lbl) for lbl in split[1]]
        print(size_msg.format(len(labels)))
        return list(zip(inputs, labels))

    train_data = _to_samples(raw_train, "训练数据大小:{}")
    val_data = _to_samples(raw_val, "验证数据大小:{}")
    test_data = _to_samples(raw_test, "测试数据大小:{}")
    print("mnist数据加载完毕")
    return train_data, val_data, test_data
# Encode a digit label as a one-hot column vector.
def one_hot(j):
    """Return a (10, 1) float vector that is 1.0 at row j and 0.0 elsewhere."""
    encoded = np.zeros((10, 1))
    encoded[j, 0] = 1.0
    return encoded
def show_img(img_array):
    """Display an image array with matplotlib and save a grayscale JPEG copy.

    NOTE(review): pillow keeps the raw integer pixel values (no normalization);
    matplotlib does not render this as a grayscale image because of the
    color-channel handling.
    """
    pil_img = Image.fromarray(img_array)
    gray = pil_img.convert("L")
    plt.figure("Image")
    plt.imshow(pil_img)
    plt.axis("on")
    plt.title("mnist image")
    gray.save("test.jpg", quality=95, subsampling=0)
    plt.show()
```
一起来看看数据集里面的图片和数据集处理情况
```python
# Demo: load the raw splits, display the first test image, then run preprocessing.
train_data, val_data, test_data = load_data()
# test_data is a (images, labels) pair; take the first flattened image (784 floats).
img_array = test_data[0][0]
# Scale the [0, 1] floats back to 0-255 pixel values for display.
img_28_28 = img_array.reshape(28, 28) * 255
img_28_28_int = img_28_28.astype("int")
show_img(img_28_28_int)
t,v,test = raw_data_preprocess()
```
![](https://www.writebug.com/myres/static/uploads/2022/4/27/b15bc3f297b44b85fbb17651a3967c43.writebug)
## 网络搭建
按照题目要求,中间隐藏层的激活函数选择relu,输出层的激活函数为softmax,一定要注意,使用了relu作为激活函数,最后一层是不使用relu的,直接对z进行softmax即可,代码和效果如下:
```python
import random
import numpy as np
from PIL import Image
def activate_function(z, name='relu'):
    """Apply the named activation element-wise.

    Args:
        z: numpy array of pre-activations.
        name: activation name; only 'relu' is currently implemented
              (the sigmoid branch in the original was commented out).

    Returns:
        A new array — the caller's `z` is no longer mutated in place.

    Raises:
        ValueError: for an unsupported activation name (the original
        silently returned None).
    """
    if name == 'relu':
        # np.maximum allocates a fresh array instead of clobbering z.
        return np.maximum(z, 0)
    raise ValueError("unsupported activation: {}".format(name))
def activate_function_prime(z, name='relu'):
    """Element-wise derivative of the named activation.

    For relu the derivative is 1 where z > 0 and 0 elsewhere (the
    subgradient at 0 is taken as 0, matching the original).

    Returns:
        A new array — the caller's `z` is no longer mutated in place.

    Raises:
        ValueError: for an unsupported activation name (the original
        silently returned None).
    """
    if name == 'relu':
        return np.where(z > 0, 1.0, 0.0)
    raise ValueError("unsupported activation: {}".format(name))
def softmax(z):
    """Numerically stable softmax over all entries of z.

    Subtracting max(z) before exponentiating prevents overflow for large
    logits; the result is mathematically identical. The original relied on
    np.nan_to_num, which silently turned inf/inf overflow into wrong zeros.
    """
    shifted = np.exp(z - np.max(z))
    return shifted / np.sum(shifted)
class Network(object):
# sizes describes the layout: input layer, hidden layers, output layer.
def __init__(self, sizes):
    """Build a fully-connected net with layer widths given by `sizes`.

    Biases and weights are drawn from the standard normal distribution.
    """
    self.num_layers = len(sizes)
    self.sizes = sizes
    # One bias column vector per non-input layer.
    self.biases = [np.random.randn(n, 1) for n in sizes[1:]]
    # Weight matrix for layer l has shape (sizes[l+1], sizes[l]) so the
    # forward pass is simply W @ a + b.
    self.weights = [np.random.randn(n_out, n_in)
                    for n_in, n_out in zip(sizes[:-1], sizes[1:])]
# Run one forward pass; returns the output activations a.
def feedforward(self, a):
    """Propagate input column vector `a` through the network.

    Hidden layers use relu; the final layer applies softmax directly to z
    (no relu on the output layer, as the lecture slides specify).
    NOTE(review): the original carried a TODO saying this should arguably
    return z rather than a — kept as returning the activations.
    """
    last = self.num_layers - 2  # index of the output layer in the (b, w) lists
    for layer, (b, w) in enumerate(zip(self.biases, self.weights)):
        z = np.dot(w, a) + b
        a = softmax(z) if layer == last else activate_function(z)
    return a
def SGD(self, train_data, epochs, mini_batch_size, eta, test_data=None):
    """Train with mini-batch stochastic gradient descent.

    Args:
        train_data: list of (input, one-hot label) pairs; shuffled in place.
        epochs: number of full passes over the training data.
        mini_batch_size: samples per gradient step.
        eta: learning rate.
        test_data: optional evaluation set; when given, accuracy is printed
            after every epoch.
    """
    n = len(train_data)
    for epoch in range(epochs):
        random.shuffle(train_data)
        # Slice the shuffled data into consecutive mini-batches.
        batches = (train_data[start:start + mini_batch_size]
                   for start in range(0, n, mini_batch_size))
        for batch in batches:
            self.update_mini_batch(batch, eta)
        if test_data:
            print("Epoch {} : {} / {}".format(
                epoch, self.evaluate(test_data), len(test_data)))
        else:
            print("Epoch {} complete".format(epoch))
# TODO: persist a good set of weights once training looks stable.
def update_mini_batch(self, mini_batch, eta):
    """Apply one gradient-descent step averaged over `mini_batch`.

    The grad_* lists accumulate the per-sample partial derivatives
    returned by backprop for every layer.
    """
    grad_b = [np.zeros_like(b) for b in self.biases]
    grad_w = [np.zeros_like(w) for w in self.weights]
    for x, y in mini_batch:
        delta_b, delta_w = self.backprop(x, y)
        grad_b = [gb + db for gb, db in zip(grad_b, delta_b)]
        grad_w = [gw + dw for gw, dw in zip(grad_w, delta_w)]
    # Average the accumulated gradients and take the descent step.
    step = eta / len(mini_batch)
    self.weights = [w - step * gw for w, gw in zip(self.weights, grad_w)]
    self.biases = [b - step * gb for b, gb in zip(self.biases, grad_b)]
def backprop(self, x, y):
# 初始化w和b的微分
nabla_b =