前回の続きで、手書き数字(MNIST)を学習したPyTorchの畳み込みニューラルネットワーク(CNN)のパラメータを利用した自前畳み込み演算プログラムを、Cythonで作成して、手書き数字認識ライブラリ化しました。
前回と違い、モデルの定義を含め、殆どをCython内で記述しており、CNNとしての柔軟性はなく、目的である手書き数字認識ライブラリとしてしか使用できません。生Pythonでは、1文字1.609秒かかった処理が、0.001秒になり、およそ160倍高速化しました。前回のライブラリからは、およそ6倍の高速化です。
ダウンロードするなら、こちらから。
動作確認環境:Python3.6以降、Windows64bit
MNISTでの正解率を確認すると、
0:99.90 % 1:99.21 % 2:99.13 % 3:99.60 % 4:98.68 % 5:98.65 % 6:98.85 % 7:98.54 % 8:99.79 % 9:99.01 % total: 99.14 %
微妙にPyTorchの時の正解率と異なるのですが……数値誤差だと信じましょう。
自分の文字も9割くらいは正解するので、認識率的にはまぁまぁ実用的なんじゃないかなぁと。でも、2値化やサイズ・位置調整等の画像の前処理は必要です。
まぁただ、エラー処理はまったくしていないので、そのへんが実用に耐えうるのかはわかりませんが。
使い方
from PIL import Image, ImageOps

import hand_num_reader as reader

# Load the picture, reduce it to 8-bit grayscale and scale it to 28x28.
grayscale = Image.open(image_path).convert('L')
img = grayscale.resize((28, 28))

# Flip black and white before recognition.
img = ImageOps.invert(img)

# Hand the prepared image to the recogniser.
result = reader.recognize(img)
hand_num_reader.pydと同フォルダにパラメータファイルであるparam.datが必要です。
判別させたい画像は、PIL(またはPillow)で開いて渡します。画像サイズは28×28で、グレースケール。
1画像に1文字で、文字を中央に配置して、画像の80%くらいの大きさに調整する前処理をすると正解率が高いです。
黒ベースの画像に白で数字を描いた画像を想定しており、白ベースで黒い数字を認識させる場合は、invertで反転が必要です。
ソースコード:hand_num_reader.pyx
# -*- coding: utf-8 -*-
"""Handwritten-digit recogniser with a fixed CNN written in Cython.

The network layout is hard-wired at import time:

    conv1 (1->20, 5x5) + ReLU -> maxpool 2x2
    conv2 (20->50, 5x5) + ReLU -> maxpool 2x2
    fc1 (800->500) + ReLU -> fc2 (500->10) -> argmax

Weights are read once, at import, from a pickle file named ``param.dat``
in the current working directory.  The only public entry point is
``recognize``.
"""
import os
import pickle

from cpython cimport bool
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free


# ---------------------------------------------------------------------------
# Layer / buffer descriptions
# ---------------------------------------------------------------------------

# Convolution layer: geometry, parameters and a pre-allocated output buffer.
cdef struct Conv2d:
    int in_channels
    int out_channels
    int kernel_size
    int input_h
    int input_w
    int output_h
    int output_w
    double *bias          # [out_channels]
    double ****weight     # [out_channels][in_channels][kernel_size][kernel_size]
    double ***output      # [out_channels][output_h][output_w]

# Max-pooling layer: geometry and a pre-allocated output buffer.
cdef struct MaxPool2d:
    int kernel_size
    int stride
    int channels
    int input_h
    int input_w
    int output_h
    int output_w
    double ***output      # [channels][output_h][output_w]

# Fully connected layer: parameters and a pre-allocated output buffer.
cdef struct Linear:
    int in_features
    int out_features
    double *bias          # [out_features]
    double **weight       # [out_features][in_features]
    double *output        # [out_features]

# Flat buffer of doubles together with its length.
cdef struct Data1d:
    double *array
    int shape[1]

# Channel x height x width buffer of doubles together with its shape.
cdef struct Data3d:
    double ***array
    int shape[3]


# ---------------------------------------------------------------------------
# Module-level state (allocated and initialised at the bottom of the file)
# ---------------------------------------------------------------------------
cdef Conv2d *conv1
cdef MaxPool2d *maxpool1
cdef Conv2d *conv2
cdef MaxPool2d *maxpool2
cdef Linear *fc1
cdef Linear *fc2
cdef Data3d *in_data
cdef Data1d *lines
# Becomes True once set_param() has loaded every layer's parameters.
cdef bool ready = False


# ---------------------------------------------------------------------------
# Allocation helpers
# ---------------------------------------------------------------------------

# Allocate `nbytes` with PyMem_Malloc and raise MemoryError instead of
# handing a NULL pointer back to the caller.
cdef void* _alloc(size_t nbytes) except NULL:
    cdef void* block = PyMem_Malloc(nbytes)
    if block == NULL:
        raise MemoryError()
    return block


# Allocate the weight, bias and output buffers of a convolution layer whose
# geometry fields have already been filled in.
cdef init_conv(Conv2d* conv):
    cdef int i, j, k
    conv.weight = <double****>_alloc(conv.out_channels * sizeof(double***))
    conv.bias = <double*>_alloc(conv.out_channels * sizeof(double))
    conv.output = <double***>_alloc(conv.out_channels * sizeof(double**))
    for i in range(conv.out_channels):
        conv.weight[i] = <double***>_alloc(conv.in_channels * sizeof(double**))
        for j in range(conv.in_channels):
            conv.weight[i][j] = <double**>_alloc(conv.kernel_size * sizeof(double*))
            for k in range(conv.kernel_size):
                conv.weight[i][j][k] = <double*>_alloc(conv.kernel_size * sizeof(double))
        conv.output[i] = <double**>_alloc(conv.output_h * sizeof(double*))
        for j in range(conv.output_h):
            conv.output[i][j] = <double*>_alloc(conv.output_w * sizeof(double))


# Allocate the output buffer of a pooling layer whose geometry fields have
# already been filled in.
cdef init_maxpool(MaxPool2d* pool):
    cdef int i, j
    pool.output = <double***>_alloc(pool.channels * sizeof(double**))
    for i in range(pool.channels):
        pool.output[i] = <double**>_alloc(pool.output_h * sizeof(double*))
        for j in range(pool.output_h):
            pool.output[i][j] = <double*>_alloc(pool.output_w * sizeof(double))


# Allocate a channel x height x width buffer and record its shape.
cdef init_data3d(Data3d* data, int channel, int height, int width):
    cdef int i, j
    data.array = <double***>_alloc(channel * sizeof(double**))
    for i in range(channel):
        data.array[i] = <double**>_alloc(height * sizeof(double*))
        for j in range(height):
            data.array[i][j] = <double*>_alloc(width * sizeof(double))
    data.shape[0] = channel
    data.shape[1] = height
    data.shape[2] = width


# Allocate a flat buffer of `elem_num` doubles and record its length.
# The buffer is sized from `elem_num` rather than a hard-coded 4*4*50 so the
# recorded shape always matches the allocation.
cdef init_data1d(Data1d* data, int elem_num):
    data.array = <double*>_alloc(elem_num * sizeof(double))
    data.shape[0] = elem_num


# ---------------------------------------------------------------------------
# Parameter loading
# ---------------------------------------------------------------------------

# Module initialisation hook: load the network parameters.
cdef init():
    set_param()


# Load all layer parameters from 'param.dat' and mark the module ready.
# Prints a message and leaves `ready` False when the file is missing.
cdef set_param():
    global ready
    # The path is relative, so the file is looked up in the current working
    # directory of the process.
    if not os.path.exists('param.dat'):
        print('Cannot find param.dat')
        return
    # NOTE(security): pickle.load can execute arbitrary code; only ship a
    # param.dat that comes from a trusted source.
    with open('param.dat', 'rb') as f:
        state = pickle.load(f)
    conv_set_param(conv1, state['conv1.weight'], state['conv1.bias'])
    conv_set_param(conv2, state['conv2.weight'], state['conv2.bias'])
    fc_set_param(fc1, state['fc1.weight'], state['fc1.bias'])
    fc_set_param(fc2, state['fc2.weight'], state['fc2.bias'])
    ready = True


# Copy convolution weights and biases from indexable Python objects into the
# layer's C buffers.
cdef conv_set_param(Conv2d* conv, weights, bias):
    cdef int i, j, k, l
    for i in range(conv.out_channels):
        for j in range(conv.in_channels):
            for k in range(conv.kernel_size):
                for l in range(conv.kernel_size):
                    conv.weight[i][j][k][l] = weights[i][j][k][l]
    for i in range(conv.out_channels):
        conv.bias[i] = bias[i]


# Allocate the weight/bias buffers of a fully connected layer and fill them
# from indexable Python objects.
cdef fc_set_param(Linear* fc, weights, bias):
    cdef int i, j
    fc.weight = <double**>_alloc(fc.out_features * sizeof(double*))
    for i in range(fc.out_features):
        fc.weight[i] = <double*>_alloc(fc.in_features * sizeof(double))
    fc.bias = <double*>_alloc(fc.out_features * sizeof(double))
    for i in range(fc.out_features):
        for j in range(fc.in_features):
            fc.weight[i][j] = weights[i][j]
    for i in range(fc.out_features):
        fc.bias[i] = bias[i]


# ---------------------------------------------------------------------------
# Forward pass
# ---------------------------------------------------------------------------

# Run the whole network on one image and return the index of the largest
# fc2 output.  Prints an error and returns None when the parameters were
# never loaded.
cdef forward(pil_image):
    if not ready:
        print('Error: Cannot set parameters.')
        return
    img_to_data(pil_image, in_data)
    forward_conv2d(conv1, in_data.array)
    max_pool2d(maxpool1, conv1.output)
    forward_conv2d(conv2, maxpool1.output)
    max_pool2d(maxpool2, conv2.output)
    trans3dto1d(lines, maxpool2.output, 50, 4, 4)
    forward_fc(fc1, lines.array, True)
    forward_fc(fc2, fc1.output)
    return max_ch(fc2.output, 10)


# Unpadded, stride-1 convolution followed by ReLU; the result is written to
# conv.output.
cdef forward_conv2d(Conv2d* conv, double*** in_data):
    cdef int ch0, ch1, i, j, k, l
    cdef double v
    for ch1 in range(conv.out_channels):
        for i in range(conv.output_h):
            for j in range(conv.output_w):
                v = 0.0
                for ch0 in range(conv.in_channels):
                    for k in range(conv.kernel_size):
                        for l in range(conv.kernel_size):
                            v += in_data[ch0][i + k][j + l] * conv.weight[ch1][ch0][k][l]
                v += conv.bias[ch1]
                if v < 0.0:  # relu
                    v = 0.0
                conv.output[ch1][i][j] = v


# Max pooling; the result is written to maxpool.output.
cdef max_pool2d(MaxPool2d* maxpool, double*** in_data):
    cdef int ch, i, j, k, l, y0, x0
    cdef double v, v_max
    for ch in range(maxpool.channels):
        for i in range(maxpool.output_h):
            y0 = i * maxpool.stride
            for j in range(maxpool.output_w):
                x0 = j * maxpool.stride
                # Seed with the first element of the window (not 0.0) so the
                # result is the true maximum even for negative inputs.
                v_max = in_data[ch][y0][x0]
                for k in range(maxpool.kernel_size):
                    for l in range(maxpool.kernel_size):
                        v = in_data[ch][y0 + k][x0 + l]
                        if v > v_max:
                            v_max = v
                maxpool.output[ch][i][j] = v_max


# Flatten a channel x height x width buffer into out_data.array in
# channel-major, then row, then column order.
cdef trans3dto1d(Data1d* out_data, double*** in_data, int channel, int height, int width):
    cdef int i, j, k, n
    n = 0
    for i in range(channel):
        for j in range(height):
            for k in range(width):
                out_data.array[n] = in_data[i][j][k]
                n += 1


# Fully connected layer (weight * input + bias), optionally followed by
# ReLU; the result is written to fc.output.
cdef forward_fc(Linear* fc, double* in_data, bool use_relu=False):
    cdef int i, j
    cdef double value
    for i in range(fc.out_features):
        value = 0.0
        for j in range(fc.in_features):
            value += in_data[j] * fc.weight[i][j]
        value += fc.bias[i]
        if use_relu:
            if value < 0.0:
                value = 0.0
        fc.output[i] = value


# Return the index of the largest of the first `lengh` values in `data`
# (the first such index on ties, 0 for an empty range).
cdef max_ch(double* data, int lengh):
    cdef int i
    cdef int ch = 0
    cdef double max_v
    if lengh <= 0:
        return ch
    # Seed with the first element instead of 0.0: the values passed in from
    # forward() are raw fc2 outputs with no ReLU and may all be negative, in
    # which case a 0.0 seed would always report class 0.
    max_v = data[0]
    for i in range(1, lengh):
        if data[i] > max_v:
            max_v = data[i]
            ch = i
    return ch


# Copy channel 0 of the input buffer from a PIL-style image
# (`size` attribute and `getpixel((x, y))` method).
# Only the region that fits the buffer is read; buffer cells not covered by
# the image are set to 0.0.  This prevents out-of-bounds writes for oversized
# images and stale pixels from a previous call for undersized ones.
cdef img_to_data(pil_img, Data3d* data):
    cdef int x, y, w, h
    cdef int buf_h = data.shape[1]
    cdef int buf_w = data.shape[2]
    w, h = pil_img.size
    for y in range(buf_h):
        for x in range(buf_w):
            if x < w and y < h:
                data.array[0][y][x] = pil_img.getpixel((x, y))
            else:
                data.array[0][y][x] = 0.0


def recognize(pil_img):
    """Classify one image and return the index of the winning output.

    ``pil_img`` must provide ``size`` and ``getpixel((x, y))`` returning a
    single number per pixel.  The input buffer is 28x28; pixels outside that
    area are ignored and missing ones are treated as 0.

    Returns an int in ``range(10)``, or ``None`` (after printing an error)
    when the parameters could not be loaded at import time.
    """
    return forward(pil_img)


# ---------------------------------------------------------------------------
# Network construction (runs once at import)
# ---------------------------------------------------------------------------
conv1 = <Conv2d*>_alloc(sizeof(Conv2d))
conv1.in_channels = 1
conv1.out_channels = 20
conv1.kernel_size = 5
conv1.input_w = 28
conv1.input_h = 28
conv1.output_w = 24  # input_w - kernel_size + 1
conv1.output_h = 24  # input_h - kernel_size + 1
init_conv(conv1)

maxpool1 = <MaxPool2d*>_alloc(sizeof(MaxPool2d))
maxpool1.kernel_size = 2
maxpool1.stride = 2
maxpool1.channels = 20   # conv1.out_channels
maxpool1.input_w = 24    # conv1.output_w
maxpool1.input_h = 24    # conv1.output_h
maxpool1.output_w = 12   # ceil((input_w - (kernel_size - 1)) / stride)
maxpool1.output_h = 12   # ceil((input_h - (kernel_size - 1)) / stride)
init_maxpool(maxpool1)

conv2 = <Conv2d*>_alloc(sizeof(Conv2d))
conv2.in_channels = 20   # conv1.out_channels
conv2.out_channels = 50
conv2.kernel_size = 5
conv2.input_w = 12       # maxpool1.output_w
conv2.input_h = 12       # maxpool1.output_h
conv2.output_w = 8       # input_w - kernel_size + 1
conv2.output_h = 8       # input_h - kernel_size + 1
init_conv(conv2)

maxpool2 = <MaxPool2d*>_alloc(sizeof(MaxPool2d))
maxpool2.kernel_size = 2
maxpool2.stride = 2
maxpool2.channels = 50   # conv2.out_channels
maxpool2.input_w = 8     # conv2.output_w
maxpool2.input_h = 8     # conv2.output_h
maxpool2.output_w = 4    # ceil((input_w - (kernel_size - 1)) / stride)
maxpool2.output_h = 4    # ceil((input_h - (kernel_size - 1)) / stride)
init_maxpool(maxpool2)

fc1 = <Linear*>_alloc(sizeof(Linear))
fc1.in_features = 4*4*50  # output_h * output_w * out_channels
fc1.out_features = 500
fc1.output = <double*>_alloc(fc1.out_features * sizeof(double))

fc2 = <Linear*>_alloc(sizeof(Linear))
fc2.in_features = 500     # fc1.out_features
fc2.out_features = 10
fc2.output = <double*>_alloc(fc2.out_features * sizeof(double))

in_data = <Data3d*>_alloc(sizeof(Data3d))
init_data3d(in_data, 1, 28, 28)

lines = <Data1d*>_alloc(sizeof(Data1d))
init_data1d(lines, 4*4*50)

init()