Creating AI Images with chainer-gogh

Let's use chainer-gogh to synthesize an image in the manner of a style image (so-called neural style transfer).
(It takes quite a long time...)

First, create the directory that the generated images will be written to.

!mkdir result
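
If you prefer plain Python over a shell command, an equivalent (and idempotent) alternative is:

import os
os.makedirs('result', exist_ok=True)  # same effect as !mkdir result, but no error if the directory already exists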

Next, define the model classes used for the synthesis (each one wraps a pretrained network and exposes the feature maps that the losses are computed on).

import chainer
from chainer import cuda
import chainer.functions as F
from chainer.links import caffe
from chainer import Variable, optimizers

class NIN:
    def __init__(self, fn="nin_imagenet.caffemodel", alpha=[0,0,1,1], beta=[1,1,1,1]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
    def forward(self, x):
        y0 = F.relu(self.model.conv1(x))
        y1 = self.model.cccp2(F.relu(self.model.cccp1(y0)))
        x1 = F.relu(self.model.conv2(F.average_pooling_2d(F.relu(y1), 3, stride=2)))
        y2 = self.model.cccp4(F.relu(self.model.cccp3(x1)))
        x2 = F.relu(self.model.conv3(F.average_pooling_2d(F.relu(y2), 3, stride=2)))
        y3 = self.model.cccp6(F.relu(self.model.cccp5(x2)))
        x3 = F.relu(getattr(self.model, "conv4-1024")(F.dropout(F.average_pooling_2d(F.relu(y3), 3, stride=2))))
        return [y0, x1, x2, x3]

class VGG:
    def __init__(self, fn="VGG_ILSVRC_16_layers.caffemodel", alpha=[0,0,1,1], beta=[1,1,1,1]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
    def forward(self, x):
        y1 = self.model.conv1_2(F.relu(self.model.conv1_1(x)))
        x1 = F.average_pooling_2d(F.relu(y1), 2, stride=2)
        y2 = self.model.conv2_2(F.relu(self.model.conv2_1(x1)))
        x2 = F.average_pooling_2d(F.relu(y2), 2, stride=2)
        y3 = self.model.conv3_3(F.relu(self.model.conv3_2(F.relu(self.model.conv3_1(x2)))))
        x3 = F.average_pooling_2d(F.relu(y3), 2, stride=2)
        y4 = self.model.conv4_3(F.relu(self.model.conv4_2(F.relu(self.model.conv4_1(x3)))))
        return [y1, y2, y3, y4]

class VGG_chainer:
    def __init__(self, alpha=[0,0,1,1], beta=[1,1,1,1]):
        from chainer.links import VGG16Layers
        print("load model... vgg_chainer")
        self.model = VGG16Layers()
        self.alpha = alpha
        self.beta = beta
    def forward(self, x):
        feature = self.model(x, layers=["conv1_2", "conv2_2", "conv3_3", "conv4_3"])
        return [feature["conv1_2"], feature["conv2_2"], feature["conv3_3"], feature["conv4_3"]]

class I2V:
    def __init__(self, fn="illust2vec_tag_ver200.caffemodel", alpha=[0,0,0,1,10,100], beta=[0.1,1,1,10,100,1000]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
        self.pool_func = F.average_pooling_2d

    def forward(self, x):
        y1 = self.model.conv1_1(x)
        x1 = self.pool_func(F.relu(y1), 2, stride=2)
        y2 = self.model.conv2_1(x1)
        x2 = self.pool_func(F.relu(y2), 2, stride=2)
        y3 = self.model.conv3_2(F.relu(self.model.conv3_1(x2)))
        x3 = self.pool_func(F.relu(y3), 2, stride=2)
        y4 = self.model.conv4_2(F.relu(self.model.conv4_1(x3)))
        x4 = self.pool_func(F.relu(y4), 2, stride=2)
        y5 = self.model.conv5_2(F.relu(self.model.conv5_1(x4)))
        x5 = self.pool_func(F.relu(y5), 2, stride=2)
        y6 = self.model.conv6_4(F.relu(F.dropout(self.model.conv6_3(F.relu(self.model.conv6_2(F.relu(self.model.conv6_1(x5))))), train=False)))
        return [y1, y2, y3, y4, y5, y6]

class GoogLeNet:
    def __init__(self, fn="bvlc_googlenet.caffemodel", alpha=[0,0,0,0,1,10], beta=[0.00005, 5, 50, 50, 5000, 500000]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
        self.pool_func = F.average_pooling_2d

    def forward(self, x):
        y1 = self.model['conv1/7x7_s2'](x)
        h = F.relu(y1)
        h = F.local_response_normalization(self.pool_func(h, 3, stride=2), n=5)
        h = F.relu(self.model['conv2/3x3_reduce'](h))
        y2 = self.model['conv2/3x3'](h)
        h = F.relu(y2)
        h = self.pool_func(F.local_response_normalization(h, n=5), 3, stride=2)
        out1 = self.model['inception_3a/1x1'](h)
        out3 = self.model['inception_3a/3x3'](F.relu(self.model['inception_3a/3x3_reduce'](h)))
        out5 = self.model['inception_3a/5x5'](F.relu(self.model['inception_3a/5x5_reduce'](h)))
        pool = self.model['inception_3a/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y3 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y3)

        out1 = self.model['inception_3b/1x1'](h)
        out3 = self.model['inception_3b/3x3'](F.relu(self.model['inception_3b/3x3_reduce'](h)))
        out5 = self.model['inception_3b/5x5'](F.relu(self.model['inception_3b/5x5_reduce'](h)))
        pool = self.model['inception_3b/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y4 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y4)

        h = self.pool_func(h, 3, stride=2)

        out1 = self.model['inception_4a/1x1'](h)
        out3 = self.model['inception_4a/3x3'](F.relu(self.model['inception_4a/3x3_reduce'](h)))
        out5 = self.model['inception_4a/5x5'](F.relu(self.model['inception_4a/5x5_reduce'](h)))
        pool = self.model['inception_4a/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y5 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y5)

        out1 = self.model['inception_4b/1x1'](h)
        out3 = self.model['inception_4b/3x3'](F.relu(self.model['inception_4b/3x3_reduce'](h)))
        out5 = self.model['inception_4b/5x5'](F.relu(self.model['inception_4b/5x5_reduce'](h)))
        pool = self.model['inception_4b/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y6 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y6)

        return [y1, y2, y3, y4, y5, y6]
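
Before the full run, a quick sanity check can help. The following minimal sketch (my addition; it assumes nin_imagenet.caffemodel is already in the working directory) instantiates NIN and prints the shapes of the four feature maps that the content and style losses below are computed on:

import numpy as np
from chainer import Variable

nn_test = NIN()  # loads nin_imagenet.caffemodel
dummy = np.zeros((1, 3, 435, 435), dtype=np.float32)  # one 435x435 BGR image
for i, feat in enumerate(nn_test.forward(Variable(dummy))):
    print(i, feat.data.shape)  # the feature maps fed into the losses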

Now run the synthesis itself.
The args dictionary near the end of the script (under the "# Set the parameters" comment) specifies the input images and the other parameters.

import argparse
import os
import sys

import numpy as np
from PIL import Image

import chainer
from chainer import cuda
import chainer.functions as F
import chainer.links
from chainer.links import caffe
from chainer import Variable, optimizers

import pickle

def subtract_mean(x0):
    x = x0.copy()
    x[0,0,:,:] -= 120
    x[0,1,:,:] -= 120
    x[0,2,:,:] -= 120
    return x

def add_mean(x0):
    x = x0.copy()
    x[0,0,:,:] += 120
    x[0,1,:,:] += 120
    x[0,2,:,:] += 120
    return x

def image_resize(img_file, width):
    gogh = Image.open(img_file)
    orig_w, orig_h = gogh.size[0], gogh.size[1]
    if orig_w > orig_h:
        new_w = width
        new_h = width*orig_h//orig_w
        gogh = np.asarray(gogh.resize((new_w,new_h)))[:,:,:3].transpose(2, 0, 1)[::-1].astype(np.float32)
        gogh = gogh.reshape((1,3,new_h,new_w))
        print("image resized to: ", gogh.shape)
        hoge = np.zeros((1,3,width,width), dtype=np.float32)
        hoge[0,:,width-new_h:,:] = gogh[0,:,:,:]
        gogh = subtract_mean(hoge)
    else:
        new_w = width*orig_w//orig_h
        new_h = width
        gogh = np.asarray(gogh.resize((new_w,new_h)))[:,:,:3].transpose(2, 0, 1)[::-1].astype(np.float32)
        gogh = gogh.reshape((1,3,new_h,new_w))
        print("image resized to: ", gogh.shape)
        hoge = np.zeros((1,3,width,width), dtype=np.float32)
        hoge[0,:,:,width-new_w:] = gogh[0,:,:,:]
        gogh = subtract_mean(hoge)
    return xp.asarray(gogh), new_w, new_h

def save_image(img, width, new_w, new_h, it):
    def to_img(x):
        im = np.zeros((new_h,new_w,3))
        im[:,:,0] = x[2,:,:]
        im[:,:,1] = x[1,:,:]
        im[:,:,2] = x[0,:,:]
        def clip(a):
            return 0 if a<0 else (255 if a>255 else a)
        im = np.vectorize(clip)(im).astype(np.uint8)
        Image.fromarray(im).save(args['out_dir']+"/im_%05d.png"%it)

    if args['gpu'] >= 0:
        img_cpu = add_mean(img.get())
    else:
        img_cpu = add_mean(img)
    if width == new_w:
        to_img(img_cpu[0,:,width-new_h:,:])
    else:
        to_img(img_cpu[0,:,:,width-new_w:])

def get_matrix(y):
    ch = y.data.shape[1]
    wd = y.data.shape[2]
    gogh_y = F.reshape(y, (ch,wd**2))
    gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True)/np.float32(ch*wd**2)
    return gogh_matrix
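
# Note (comment added for clarity): for a feature map y of shape
# (1, ch, wd, wd), get_matrix flattens it to (ch, wd**2) and returns the
# (ch, ch) Gram matrix G = y_flat y_flat^T / (ch * wd**2). G records which
# channels co-activate, which is what "style" means in this method; the
# spatial layout is deliberately discarded.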

class Clip(chainer.Function):
    def forward(self, x):
        x = x[0]
        ret = cuda.elementwise(
            'T x', 'T ret',
            '''
                ret = x<-120?-120:(x>136?136:x);
            ''', 'clip')(x)
        return ret

def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    mid_orig = nn.forward(Variable(img_orig))
    style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style))]

    if img_gen is None:
        if args['gpu'] >= 0:
            img_gen = xp.random.uniform(-20,20,(1,3,width,width),dtype=np.float32)
        else:
            img_gen = np.random.uniform(-20,20,(1,3,width,width)).astype(np.float32)
    img_gen = chainer.links.Parameter(img_gen)
    optimizer = optimizers.Adam(alpha=lr)
    optimizer.setup(img_gen)
    for i in range(max_iter):
        img_gen.zerograds()

        x = img_gen.W
        y = nn.forward(x)

        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            ch = y[l].data.shape[1]
            wd = y[l].data.shape[2]
            gogh_y = F.reshape(y[l], (ch,wd**2))
            gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True)/np.float32(ch*wd**2)

            L1 = np.float32(args['lam']) * np.float32(nn.alpha[l])*F.mean_squared_error(y[l], Variable(mid_orig[l].data))
            L2 = np.float32(nn.beta[l])*F.mean_squared_error(gogh_matrix, Variable(style_mats[l].data))/np.float32(len(y))
            L += L1+L2

            if i%100==0:
                print(i,l,L1.data,L2.data)

        L.backward()
        img_gen.W.grad = x.grad
        optimizer.update()

        tmp_shape = x.data.shape
        if args['gpu'] >= 0:
            img_gen.W.data += Clip().forward(img_gen.W.data).reshape(tmp_shape) - img_gen.W.data
        else:
            def clip(x):
                return -120 if x<-120 else (136 if x>136 else x)
            img_gen.W.data += np.vectorize(clip)(img_gen.W.data).reshape(tmp_shape) - img_gen.W.data

        if i%50==0:
            save_image(img_gen.W.data, W, nw, nh, i)
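
# For reference (comment added): each iteration minimizes
#   L = sum over layers l of
#       lam * alpha[l] * MSE(y[l], content_features[l])
#       + beta[l] * MSE(Gram(y[l]), Gram(style_features[l])) / n_layers
# so lam trades content fidelity against style strength.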

# Set the parameters
args = {}
args['orig_img'] = 'cat.png'                 # original (content) image
args['style_img'] = 'style_6.png'            # style image
args['out_dir'] = 'result'                   # output directory
args['model'] = 'nin_imagenet.caffemodel'    # pretrained model file
args['width'] = 435                          # output image width
args['iter'] = 5000                          # number of iterations
args['gpu'] = -1                             # GPU device id (-1 = CPU)
args['lam'] = 0.005                          # content-loss weight
args['lr'] = 4.0                             # learning rate (Adam alpha)

if args['gpu'] >= 0:
    cuda.check_cuda_available()
    chainer.Function.type_check_enable = False
    cuda.get_device(args['gpu']).use()
    xp = cuda.cupy
else:
    xp = np

# Dispatch on the model name by substring; 'vgg_chainer' must be tested
# before 'vgg' so that it is not swallowed by the broader match.
if 'nin' in args['model']:
    nn = NIN()
elif 'vgg_chainer' in args['model']:
    nn = VGG_chainer()
elif 'vgg' in args['model']:
    nn = VGG()
elif 'i2v' in args['model']:
    nn = I2V()
elif 'googlenet' in args['model']:
    nn = GoogLeNet()
else:
    print('invalid model name. you can use (nin, vgg, vgg_chainer, i2v, googlenet)')
if args['gpu'] >= 0:
    nn.model.to_gpu()

W = args['width']
img_content, nw, nh = image_resize(args['orig_img'], W)
img_style, _, _ = image_resize(args['style_img'], W)

generate_image(img_content, img_style, W, nw, nh, img_gen=None, max_iter=args['iter'], lr=args['lr'])
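
If the result looks too abstract or too literal, lam is the first knob to try. A hedged sketch of a follow-up run (the values are illustrative, not from the original post):

args['lam'] = 0.05   # larger lam keeps the output closer to the content image
args['iter'] = 1000  # shorter run for a quick comparison
generate_image(img_content, img_style, W, nw, nh, img_gen=None,
               max_iter=args['iter'], lr=args['lr'])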

[Input files] * Upload these to Google Colaboratory in advance.

Input file                Description
cat.png                   original (content) image
style_6.png               style image (the content image is re-rendered to resemble this)
nin_imagenet.caffemodel   pretrained model file (can be found online)
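
One way to get these files onto the runtime is Colab's standard upload helper (an optional sketch; google.colab is available on Colaboratory runtimes):

from google.colab import files
uploaded = files.upload()  # choose cat.png, style_6.png and nin_imagenet.caffemodel in the dialog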

[Generated images] * The run saves an image every 50 iterations (100 files over 5000 iterations); five of them are picked out below.

im_01000.png
im_02000.png
im_03000.png
im_04000.png
im_04950.png
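
To preview the outputs inline in the notebook, something like this works (my addition, using IPython's display helpers):

from IPython.display import Image as IPImage, display
for it in (1000, 2000, 3000, 4000, 4950):
    display(IPImage('result/im_%05d.png' % it))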

Synthesis takes a while, but since it can turn a photo into something manga-like or Gothic-looking, it feels like it could be put to some use...

(Verified on Google Colaboratory.)