前回の続きで、手書き数字(MNIST)を学習したPyTorchの畳み込みニューラルネットワーク(CNN)のパラメータを利用した自前畳み込み演算プログラムを、Cythonで作成して、手書き数字認識ライブラリ化しました。
前回と違い、モデルの定義を含め、殆どをCython内で記述しており、CNNとしての柔軟性はなく、目的である手書き数字認識ライブラリとしてしか使用できません。生Pythonでは、1文字1.609秒かかった処理が、0.001秒になり、およそ160倍高速化しました。前回のライブラリからは、およそ6倍の高速化です。
ダウンロードするなら、こちらから。
動作確認環境:Python3.6以降、Windows64bit
MNISTでの正解率を確認すると、
0:99.90 % 1:99.21 % 2:99.13 % 3:99.60 % 4:98.68 % 5:98.65 % 6:98.85 % 7:98.54 % 8:99.79 % 9:99.01 % total: 99.14 %
微妙にPyTorchの時の正解率と異なるのですが、、、数値誤差と信じよう。
自分の文字も9割くらいは正解するので、認識率的にはまぁまぁ実用的なんじゃないかなぁと。でも、2値化やサイズ・位置調整等の画像の前処理は必要です。
まぁただ、エラー処理はまったくしていないので、そのへんが実用に耐えうるのかはわかりませんが。
使い方
from PIL import Image, ImageOps
import hand_num_reader as reader
# Open the image to classify (image_path must be defined by the caller).
img = Image.open(image_path)
# Convert to grayscale and resize to the 28x28 input size the recognizer expects.
img = img.convert('L').resize((28,28))
# Invert so the digit is white on black; needed when the source is black-on-white.
img = ImageOps.invert(img)
# Run the recognizer on the prepared image.
result = reader.recognize(img)
パラメータファイルであるparam.datが必要です。コード上は相対パス'param.dat'で開いているため、実行時のカレントディレクトリ(通常はhand_num_reader.pydと同じフォルダ)に置いてください。
判別させたい画像は、PIL(またはPillow)で開いて渡します。画像サイズは28×28で、グレースケール。
1画像に1文字で、文字を中央に配置して、画像の80%くらいの大きさに調整する前処理をすると正解率が高いです。
黒ベースの画像に白で数字を描いた画像を想定しており、白ベースで黒い数字を認識させる場合は、invertで反転が必要です。
ソースコード:hand_num_reader.pyx
# -*- coding: utf-8 -*-
import os,pickle
from cpython cimport bool
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
cdef struct Conv2d:
    # State of one 2-D convolution layer (no padding, stride 1 as used here).
    # Layer geometry.
    int in_channels, out_channels, kernel_size
    int input_h, input_w
    int output_h, output_w
    # Parameters: bias[out_channels] and
    # weight[out_channels][in_channels][kernel_size][kernel_size].
    double *bias
    double ****weight
    # Result buffer: output[out_channels][output_h][output_w].
    double ***output
cdef struct MaxPool2d:
    # State of one 2-D max-pooling layer.
    # Pooling window size and step between windows.
    int kernel_size, stride
    # Number of channels (pooling is applied per channel).
    int channels
    int input_h, input_w
    int output_h, output_w
    # Result buffer: output[channels][output_h][output_w].
    double ***output
cdef struct Linear:
    # State of one fully connected layer.
    int in_features, out_features
    # Parameters: bias[out_features] and weight[out_features][in_features].
    double *bias
    double **weight
    # Result buffer: output[out_features].
    double *output
cdef struct Data1d:
    # Flat buffer of doubles together with its element count.
    double *array
    # shape[0] holds the number of elements.
    int shape[1]
cdef struct Data3d:
    # Three-level buffer of doubles indexed as array[channel][row][column].
    double ***array
    # shape = (channels, height, width).
    int shape[3]
# ---------------------------------------------------------------------------
# Module-level network construction.
#
# The layers below are created once at import time and reused for every
# call.  Architecture: conv1 -> maxpool1 -> conv2 -> maxpool2 -> fc1 -> fc2.
# PyMem_Malloc returns void*, which Cython does not convert implicitly, so
# every allocation is cast explicitly and checked for failure.
# ---------------------------------------------------------------------------

cdef void* _alloc_or_raise(size_t nbytes) except NULL:
    """Return nbytes of PyMem_Malloc memory; raise MemoryError on failure."""
    cdef void* block = PyMem_Malloc(nbytes)
    if block == NULL:
        raise MemoryError()
    return block


# First convolution: 1 x 28 x 28 -> 20 x 24 x 24.
cdef Conv2d *conv1
conv1 = <Conv2d*>_alloc_or_raise(sizeof(Conv2d))
conv1.in_channels = 1
conv1.out_channels = 20
conv1.kernel_size = 5
conv1.input_w = 28
conv1.input_h = 28
conv1.output_w = 24  # input_w - kernel_size + 1
conv1.output_h = 24  # input_h - kernel_size + 1
init_conv(conv1)

# First pooling: 20 x 24 x 24 -> 20 x 12 x 12.
cdef MaxPool2d *maxpool1
maxpool1 = <MaxPool2d*>_alloc_or_raise(sizeof(MaxPool2d))
maxpool1.kernel_size = 2
maxpool1.stride = 2
maxpool1.channels = 20   # conv1.out_channels
maxpool1.input_w = 24    # conv1.output_w
maxpool1.input_h = 24    # conv1.output_h
maxpool1.output_w = 12   # ceil((input_w - (kernel_size - 1)) / stride)
maxpool1.output_h = 12   # ceil((input_h - (kernel_size - 1)) / stride)
init_maxpool(maxpool1)

# Second convolution: 20 x 12 x 12 -> 50 x 8 x 8.
cdef Conv2d *conv2
conv2 = <Conv2d*>_alloc_or_raise(sizeof(Conv2d))
conv2.in_channels = 20   # conv1.out_channels
conv2.out_channels = 50
conv2.kernel_size = 5
conv2.input_w = 12       # maxpool1.output_w
conv2.input_h = 12       # maxpool1.output_h
conv2.output_w = 8       # input_w - kernel_size + 1
conv2.output_h = 8       # input_h - kernel_size + 1
init_conv(conv2)

# Second pooling: 50 x 8 x 8 -> 50 x 4 x 4.
cdef MaxPool2d *maxpool2
maxpool2 = <MaxPool2d*>_alloc_or_raise(sizeof(MaxPool2d))
maxpool2.kernel_size = 2
maxpool2.stride = 2
maxpool2.channels = 50   # conv2.out_channels
maxpool2.input_w = 8     # conv2.output_w
maxpool2.input_h = 8     # conv2.output_h
maxpool2.output_w = 4    # ceil((input_w - (kernel_size - 1)) / stride)
maxpool2.output_h = 4    # ceil((input_h - (kernel_size - 1)) / stride)
init_maxpool(maxpool2)

# First fully connected layer: 800 -> 500.
cdef Linear *fc1
fc1 = <Linear*>_alloc_or_raise(sizeof(Linear))
fc1.in_features = 4*4*50  # output_h * output_w * out_channels
fc1.out_features = 500
# weight/bias are allocated later by fc_set_param; keep them well-defined
# until then instead of leaving uninitialised pointers in the struct.
fc1.weight = NULL
fc1.bias = NULL
fc1.output = <double*>_alloc_or_raise(fc1.out_features * sizeof(double))

# Second fully connected layer: 500 -> 10 (one output per digit).
cdef Linear *fc2
fc2 = <Linear*>_alloc_or_raise(sizeof(Linear))
fc2.in_features = 500     # fc1.out_features
fc2.out_features = 10
fc2.weight = NULL
fc2.bias = NULL
fc2.output = <double*>_alloc_or_raise(fc2.out_features * sizeof(double))

# Input image buffer: 1 channel x 28 x 28.
cdef Data3d *in_data
in_data = <Data3d*>_alloc_or_raise(sizeof(Data3d))
init_data3d(in_data,1,28,28)

# Flattened maxpool2 output that feeds fc1.
cdef Data1d *lines
lines = <Data1d*>_alloc_or_raise(sizeof(Data1d))
init_data1d(lines,4*4*50)

# Set to True by set_param() once all parameters have been loaded.
cdef bool ready = False
cdef init():
    """Module initialisation hook: load the network parameters via set_param()."""
    set_param()
cdef init_conv(Conv2d* conv):
    """Allocate the weight, bias and output buffers of a convolution layer.

    The geometry fields of ``conv`` (channel counts, kernel size, output
    size) must already be set.  Buffers are allocated but not initialised.

    Raises:
        MemoryError: if any allocation fails (earlier allocations are not
            released in that case).
    """
    cdef int oc, ic, row
    # PyMem_Malloc returns void*; Cython requires an explicit cast.
    conv.weight = <double****>PyMem_Malloc(conv.out_channels * sizeof(double***))
    conv.bias = <double*>PyMem_Malloc(conv.out_channels * sizeof(double))
    conv.output = <double***>PyMem_Malloc(conv.out_channels * sizeof(double**))
    if conv.weight == NULL or conv.bias == NULL or conv.output == NULL:
        raise MemoryError()
    for oc in range(conv.out_channels):
        # weight[oc][ic][ky][kx]
        conv.weight[oc] = <double***>PyMem_Malloc(conv.in_channels * sizeof(double**))
        if conv.weight[oc] == NULL:
            raise MemoryError()
        for ic in range(conv.in_channels):
            conv.weight[oc][ic] = <double**>PyMem_Malloc(conv.kernel_size * sizeof(double*))
            if conv.weight[oc][ic] == NULL:
                raise MemoryError()
            for row in range(conv.kernel_size):
                conv.weight[oc][ic][row] = <double*>PyMem_Malloc(conv.kernel_size * sizeof(double))
                if conv.weight[oc][ic][row] == NULL:
                    raise MemoryError()
        # output[oc][y][x]
        conv.output[oc] = <double**>PyMem_Malloc(conv.output_h * sizeof(double*))
        if conv.output[oc] == NULL:
            raise MemoryError()
        for row in range(conv.output_h):
            conv.output[oc][row] = <double*>PyMem_Malloc(conv.output_w * sizeof(double))
            if conv.output[oc][row] == NULL:
                raise MemoryError()
cdef init_maxpool(MaxPool2d* pool):
    """Allocate the output buffer of a max-pooling layer.

    ``pool.channels``, ``pool.output_h`` and ``pool.output_w`` must already
    be set.  The buffer is laid out as output[channel][row][column] and is
    not initialised.

    Raises:
        MemoryError: if any allocation fails.
    """
    cdef int ch, row
    # PyMem_Malloc returns void*; Cython requires an explicit cast.
    pool.output = <double***>PyMem_Malloc(pool.channels * sizeof(double**))
    if pool.output == NULL:
        raise MemoryError()
    for ch in range(pool.channels):
        pool.output[ch] = <double**>PyMem_Malloc(pool.output_h * sizeof(double*))
        if pool.output[ch] == NULL:
            raise MemoryError()
        for row in range(pool.output_h):
            pool.output[ch][row] = <double*>PyMem_Malloc(pool.output_w * sizeof(double))
            if pool.output[ch][row] == NULL:
                raise MemoryError()
cdef init_data3d(Data3d* data,int channel,int height,int width):
    """Allocate a channel x height x width buffer and record its shape.

    The buffer is indexed as data.array[channel][row][column]; its contents
    are not initialised.

    Raises:
        MemoryError: if any allocation fails.
    """
    cdef int ch, row
    # PyMem_Malloc returns void*; Cython requires an explicit cast.
    data.array = <double***>PyMem_Malloc(channel * sizeof(double**))
    if data.array == NULL:
        raise MemoryError()
    for ch in range(channel):
        data.array[ch] = <double**>PyMem_Malloc(height * sizeof(double*))
        if data.array[ch] == NULL:
            raise MemoryError()
        for row in range(height):
            data.array[ch][row] = <double*>PyMem_Malloc(width * sizeof(double))
            if data.array[ch][row] == NULL:
                raise MemoryError()
    data.shape[0] = channel
    data.shape[1] = height
    data.shape[2] = width
cdef init_data1d(Data1d* data,elem_num):
    """Allocate a flat buffer of ``elem_num`` doubles and record its length.

    The buffer size now follows ``elem_num`` instead of a hard-coded
    4*4*50, so the recorded shape always matches the allocation.

    Raises:
        ValueError: if ``elem_num`` is negative.
        MemoryError: if the allocation fails.
    """
    cdef int count = elem_num
    if count < 0:
        raise ValueError('elem_num must not be negative')
    # PyMem_Malloc returns void*; Cython requires an explicit cast.
    data.array = <double*>PyMem_Malloc(count * sizeof(double))
    if data.array == NULL:
        raise MemoryError()
    data.shape[0] = count
cdef set_param():
    """Load the trained parameters from 'param.dat' into the layers.

    The file is opened by the relative path 'param.dat' and must unpickle
    to a mapping with the keys 'conv1.weight', 'conv1.bias',
    'conv2.weight', 'conv2.bias', 'fc1.weight', 'fc1.bias', 'fc2.weight'
    and 'fc2.bias'.  On success the module-level ``ready`` flag is set to
    True.  If the file does not exist a message is printed and ``ready``
    stays False.
    """
    global conv1,conv2,fc1,fc2,ready
    if not os.path.exists('param.dat'):
        print('Cannot find param.dat')
        return
    # SECURITY NOTE: pickle.load can execute arbitrary code contained in the
    # file.  Only use a param.dat that comes from a trusted source.
    # The with-block closes the file even if unpickling raises.
    with open('param.dat','rb') as f:
        state = pickle.load(f)
    conv_set_param(conv1,state['conv1.weight'],state['conv1.bias'])
    conv_set_param(conv2,state['conv2.weight'],state['conv2.bias'])
    fc_set_param(fc1,state['fc1.weight'],state['fc1.bias'])
    fc_set_param(fc2,state['fc2.weight'],state['fc2.bias'])
    ready = True
cdef conv_set_param(Conv2d* conv,weights,bias):
    """Copy convolution parameters from Python containers into ``conv``.

    ``weights`` is indexed as weights[out][in][ky][kx] and ``bias`` as
    bias[out]; each element is converted to a C double.
    """
    cdef int oc, ic, ky, kx
    cdef double* dst_row
    for oc in range(conv.out_channels):
        for ic in range(conv.in_channels):
            for ky in range(conv.kernel_size):
                # Look up one kernel row on each side, then copy it across.
                src_row = weights[oc][ic][ky]
                dst_row = conv.weight[oc][ic][ky]
                for kx in range(conv.kernel_size):
                    dst_row[kx] = src_row[kx]
    for oc in range(conv.out_channels):
        conv.bias[oc] = bias[oc]
cdef fc_set_param(Linear* fc,weights,bias):
    """Allocate and fill the weight and bias buffers of a linear layer.

    ``weights`` is indexed as weights[out][in] and ``bias`` as bias[out];
    each element is converted to a C double.  ``fc.in_features`` and
    ``fc.out_features`` must already be set.

    Raises:
        MemoryError: if any allocation fails.
    """
    cdef int out_idx, in_idx
    # PyMem_Malloc returns void*; Cython requires an explicit cast.
    fc.weight = <double**>PyMem_Malloc(fc.out_features * sizeof(double*))
    if fc.weight == NULL:
        raise MemoryError()
    for out_idx in range(fc.out_features):
        fc.weight[out_idx] = <double*>PyMem_Malloc(fc.in_features * sizeof(double))
        if fc.weight[out_idx] == NULL:
            raise MemoryError()
    fc.bias = <double*>PyMem_Malloc(fc.out_features * sizeof(double))
    if fc.bias == NULL:
        raise MemoryError()
    for out_idx in range(fc.out_features):
        for in_idx in range(fc.in_features):
            fc.weight[out_idx][in_idx] = weights[out_idx][in_idx]
    for out_idx in range(fc.out_features):
        fc.bias[out_idx] = bias[out_idx]
cdef forward(pil_image):
    """Run the whole network on ``pil_image`` and return the winning class.

    Returns the index chosen by max_ch over the 10 outputs of fc2, or None
    (after printing an error) when the parameters have not been loaded.
    """
    global ready,conv1,conv2,fc1,fc2,maxpool1,maxpool2,in_data,lines
    if ready:
        # Copy the pixels into the input buffer.
        img_to_data(pil_image,in_data)
        # Feature extraction: conv -> pool -> conv -> pool.
        forward_conv2d(conv1,in_data.array)
        max_pool2d(maxpool1,conv1.output)
        forward_conv2d(conv2,maxpool1.output)
        max_pool2d(maxpool2,conv2.output)
        # Flatten 50 x 4 x 4 into a vector for the fully connected layers.
        trans3dto1d(lines,maxpool2.output,50,4,4)
        # Classifier: fc1 with ReLU, fc2 without.
        forward_fc(fc1,lines.array,True)
        forward_fc(fc2,fc1.output)
        return max_ch(fc2.output,10)
    print('Error: Cannot set parameters.')
    return None
cdef forward_conv2d(Conv2d* conv,double*** in_data):
    """Apply the convolution plus ReLU to ``in_data``, writing ``conv.output``.

    ``in_data`` is indexed as in_data[channel][row][column].  For every
    output position the kernel window is summed over all input channels,
    the bias is added and negative results are clamped to 0.
    """
    cdef int oc, ic, out_y, out_x, ky, kx
    cdef double acc
    cdef double* src_row
    cdef double* kernel_row
    for oc in range(conv.out_channels):
        for out_y in range(conv.output_h):
            for out_x in range(conv.output_w):
                acc = 0.0
                for ic in range(conv.in_channels):
                    for ky in range(conv.kernel_size):
                        # Rows touched by this kernel line.
                        src_row = in_data[ic][out_y + ky]
                        kernel_row = conv.weight[oc][ic][ky]
                        for kx in range(conv.kernel_size):
                            acc += src_row[out_x + kx] * kernel_row[kx]
                acc += conv.bias[oc]
                if acc < 0.0:  # ReLU
                    acc = 0.0
                conv.output[oc][out_y][out_x] = acc
cdef max_pool2d(MaxPool2d* maxpool,double*** in_data):
    """Max-pool ``in_data`` per channel, writing ``maxpool.output``.

    ``in_data`` is indexed as in_data[channel][row][column].  Each output
    element is the maximum over a kernel_size x kernel_size window whose
    top-left corner is at (row * stride, column * stride).

    The running maximum is seeded with the first element of the window
    rather than 0.0, so windows that contain only negative values yield
    their true maximum.  For non-negative input the result is unchanged.
    """
    cdef int ch, out_y, out_x, ky, kx, top, left
    cdef double v, best
    for ch in range(maxpool.channels):
        for out_y in range(maxpool.output_h):
            top = out_y * maxpool.stride
            for out_x in range(maxpool.output_w):
                left = out_x * maxpool.stride
                best = in_data[ch][top][left]
                for ky in range(maxpool.kernel_size):
                    for kx in range(maxpool.kernel_size):
                        v = in_data[ch][top + ky][left + kx]
                        if v > best:
                            best = v
                maxpool.output[ch][out_y][out_x] = best
cdef trans3dto1d(Data1d* out_data,double*** in_data,int channel,int height,int width):
    """Flatten ``in_data[channel][height][width]`` into ``out_data.array``.

    Elements are written in channel-major, then row, then column order.
    """
    cdef int c, y, x, base
    cdef double* src_row
    for c in range(channel):
        for y in range(height):
            # Offset of this row inside the flat destination buffer.
            base = (c * height + y) * width
            src_row = in_data[c][y]
            for x in range(width):
                out_data.array[base + x] = src_row[x]
cdef forward_fc(Linear* fc,double* in_data,bool use_relu=False):
    """Compute the fully connected layer for ``in_data`` into ``fc.output``.

    Each output is the dot product of ``in_data`` with one weight row plus
    the bias.  When ``use_relu`` is true, negative results are clamped to 0.
    """
    cdef int out_idx, in_idx
    cdef double acc
    cdef double* w_row
    for out_idx in range(fc.out_features):
        w_row = fc.weight[out_idx]
        acc = 0.0
        for in_idx in range(fc.in_features):
            acc += in_data[in_idx] * w_row[in_idx]
        acc += fc.bias[out_idx]
        if use_relu and acc < 0.0:
            acc = 0.0
        fc.output[out_idx] = acc
cdef max_ch(double* data,int lengh):
    """Return the index of the largest of the first ``lengh`` values of ``data``.

    The search is seeded with data[0] rather than 0.0.  The values passed in
    are the raw fc2 outputs (no ReLU), which can all be negative; a 0.0 seed
    would then always report index 0.  Ties keep the lowest index.  Returns
    0 when ``lengh`` is not positive.
    """
    cdef int i, best_idx
    cdef double best
    best_idx = 0
    if lengh <= 0:
        return best_idx
    best = data[0]
    for i in range(1, lengh):
        if data[i] > best:
            best = data[i]
            best_idx = i
    return best_idx
cdef img_to_data(pil_img,Data3d* data):
    """Copy the pixels of ``pil_img`` into channel 0 of ``data``.

    ``pil_img`` must provide ``size`` as (width, height) and
    ``getpixel((x, y))`` returning a single number per pixel.  Pixels are
    stored as data.array[0][y][x].

    Raises:
        ValueError: if the image size differs from the buffer size recorded
            in ``data.shape``.  Without this check a larger image would be
            written past the end of the allocated rows.
    """
    cdef int x, y, w, h
    w,h = pil_img.size
    if h != data.shape[1] or w != data.shape[2]:
        raise ValueError('Image size must be %dx%d (got %dx%d).'
                         % (data.shape[2], data.shape[1], w, h))
    for x in range(w):
        for y in range(h):
            data.array[0][y][x] = pil_img.getpixel((x,y))
def recognize(pil_img):
    """Recognize the handwritten digit in ``pil_img``.

    Public entry point of the module; it passes the image to forward() and
    returns whatever forward() returns.
    """
    digit = forward(pil_img)
    return digit
# Load the trained parameters once, when the module is imported.
init()
