Creating AI Images with chainer-gogh

Let's use chainer-gogh to synthesize an image in the manner of a style image (so-called neural style transfer).
(It takes quite a long time...)

First, create the directory that the generated images will be written to.

!mkdir result
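
If you prefer plain Python over a shell command, an equivalent (and idempotent) alternative is:

import os
os.makedirs('result', exist_ok=True)  # same effect as !mkdir result, but no error if the directory already exists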

Next, define the model classes used for the synthesis (each one wraps a pretrained network and exposes the feature maps that the losses are computed on).

import chainer
from chainer import cuda
import chainer.functions as F
from chainer.links import caffe
from chainer import Variable, optimizers

class NIN:
    def __init__(self, fn="nin_imagenet.caffemodel", alpha=[0,0,1,1], beta=[1,1,1,1]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
    def forward(self, x):
        y0 = F.relu(self.model.conv1(x))
        y1 = self.model.cccp2(F.relu(self.model.cccp1(y0)))
        x1 = F.relu(self.model.conv2(F.average_pooling_2d(F.relu(y1), 3, stride=2)))
        y2 = self.model.cccp4(F.relu(self.model.cccp3(x1)))
        x2 = F.relu(self.model.conv3(F.average_pooling_2d(F.relu(y2), 3, stride=2)))
        y3 = self.model.cccp6(F.relu(self.model.cccp5(x2)))
        x3 = F.relu(getattr(self.model, "conv4-1024")(F.dropout(F.average_pooling_2d(F.relu(y3), 3, stride=2))))
        return [y0, x1, x2, x3]

class VGG:
    def __init__(self, fn="VGG_ILSVRC_16_layers.caffemodel", alpha=[0,0,1,1], beta=[1,1,1,1]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
    def forward(self, x):
        y1 = self.model.conv1_2(F.relu(self.model.conv1_1(x)))
        x1 = F.average_pooling_2d(F.relu(y1), 2, stride=2)
        y2 = self.model.conv2_2(F.relu(self.model.conv2_1(x1)))
        x2 = F.average_pooling_2d(F.relu(y2), 2, stride=2)
        y3 = self.model.conv3_3(F.relu(self.model.conv3_2(F.relu(self.model.conv3_1(x2)))))
        x3 = F.average_pooling_2d(F.relu(y3), 2, stride=2)
        y4 = self.model.conv4_3(F.relu(self.model.conv4_2(F.relu(self.model.conv4_1(x3)))))
        return [y1, y2, y3, y4]

class VGG_chainer:
    def __init__(self, alpha=[0,0,1,1], beta=[1,1,1,1]):
        from chainer.links import VGG16Layers
        print("load model... vgg_chainer")
        self.model = VGG16Layers()
        self.alpha = alpha
        self.beta = beta
    def forward(self, x):
        feature = self.model(x, layers=["conv1_2", "conv2_2", "conv3_3", "conv4_3"])
        return [feature["conv1_2"], feature["conv2_2"], feature["conv3_3"], feature["conv4_3"]]

class I2V:
    def __init__(self, fn="illust2vec_tag_ver200.caffemodel", alpha=[0,0,0,1,10,100], beta=[0.1,1,1,10,100,1000]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
        self.pool_func = F.average_pooling_2d

    def forward(self, x):
        y1 = self.model.conv1_1(x)
        x1 = self.pool_func(F.relu(y1), 2, stride=2)
        y2 = self.model.conv2_1(x1)
        x2 = self.pool_func(F.relu(y2), 2, stride=2)
        y3 = self.model.conv3_2(F.relu(self.model.conv3_1(x2)))
        x3 = self.pool_func(F.relu(y3), 2, stride=2)
        y4 = self.model.conv4_2(F.relu(self.model.conv4_1(x3)))
        x4 = self.pool_func(F.relu(y4), 2, stride=2)
        y5 = self.model.conv5_2(F.relu(self.model.conv5_1(x4)))
        x5 = self.pool_func(F.relu(y5), 2, stride=2)
        y6 = self.model.conv6_4(F.relu(F.dropout(self.model.conv6_3(F.relu(self.model.conv6_2(F.relu(self.model.conv6_1(x5))))), train=False)))
        return [y1, y2, y3, y4, y5, y6]

class GoogLeNet:
    def __init__(self, fn="bvlc_googlenet.caffemodel", alpha=[0,0,0,0,1,10], beta=[0.00005, 5, 50, 50, 5000, 500000]):
        print("load model... %s" % fn)
        self.model = caffe.CaffeFunction(fn)
        self.alpha = alpha
        self.beta = beta
        self.pool_func = F.average_pooling_2d

    def forward(self, x):
        y1 = self.model['conv1/7x7_s2'](x)
        h = F.relu(y1)
        h = F.local_response_normalization(self.pool_func(h, 3, stride=2), n=5)
        h = F.relu(self.model['conv2/3x3_reduce'](h))
        y2 = self.model['conv2/3x3'](h)
        h = F.relu(y2)
        h = self.pool_func(F.local_response_normalization(h, n=5), 3, stride=2)
        out1 = self.model['inception_3a/1x1'](h)
        out3 = self.model['inception_3a/3x3'](F.relu(self.model['inception_3a/3x3_reduce'](h)))
        out5 = self.model['inception_3a/5x5'](F.relu(self.model['inception_3a/5x5_reduce'](h)))
        pool = self.model['inception_3a/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y3 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y3)

        out1 = self.model['inception_3b/1x1'](h)
        out3 = self.model['inception_3b/3x3'](F.relu(self.model['inception_3b/3x3_reduce'](h)))
        out5 = self.model['inception_3b/5x5'](F.relu(self.model['inception_3b/5x5_reduce'](h)))
        pool = self.model['inception_3b/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y4 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y4)

        h = self.pool_func(h, 3, stride=2)

        out1 = self.model['inception_4a/1x1'](h)
        out3 = self.model['inception_4a/3x3'](F.relu(self.model['inception_4a/3x3_reduce'](h)))
        out5 = self.model['inception_4a/5x5'](F.relu(self.model['inception_4a/5x5_reduce'](h)))
        pool = self.model['inception_4a/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y5 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y5)

        out1 = self.model['inception_4b/1x1'](h)
        out3 = self.model['inception_4b/3x3'](F.relu(self.model['inception_4b/3x3_reduce'](h)))
        out5 = self.model['inception_4b/5x5'](F.relu(self.model['inception_4b/5x5_reduce'](h)))
        pool = self.model['inception_4b/pool_proj'](self.pool_func(h, 3, stride=1, pad=1))
        y6 = F.concat((out1, out3, out5, pool), axis=1)
        h = F.relu(y6)

        return [y1, y2, y3, y4, y5, y6]
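
Before the full run, a quick sanity check can help. The following minimal sketch (my addition; it assumes nin_imagenet.caffemodel is already in the working directory) instantiates NIN and prints the shapes of the four feature maps that the content and style losses below are computed on:

import numpy as np
from chainer import Variable

nn_test = NIN()  # loads nin_imagenet.caffemodel
dummy = np.zeros((1, 3, 435, 435), dtype=np.float32)  # one 435x435 BGR image
for i, feat in enumerate(nn_test.forward(Variable(dummy))):
    print(i, feat.data.shape)  # the feature maps fed into the losses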

Now run the synthesis itself.
The args dictionary near the end of the script (under the "# Set the parameters" comment) specifies the input images and the other parameters.

import argparse
import os
import sys

import numpy as np
from PIL import Image

import chainer
from chainer import cuda
import chainer.functions as F
import chainer.links
from chainer.links import caffe
from chainer import Variable, optimizers

import pickle

def subtract_mean(x0):
    x = x0.copy()
    x[0,0,:,:] -= 120
    x[0,1,:,:] -= 120
    x[0,2,:,:] -= 120
    return x

def add_mean(x0):
    x = x0.copy()
    x[0,0,:,:] += 120
    x[0,1,:,:] += 120
    x[0,2,:,:] += 120
    return x

def image_resize(img_file, width):
    gogh = Image.open(img_file)
    orig_w, orig_h = gogh.size[0], gogh.size[1]
    if orig_w > orig_h:
        new_w = width
        new_h = width*orig_h//orig_w
        gogh = np.asarray(gogh.resize((new_w,new_h)))[:,:,:3].transpose(2, 0, 1)[::-1].astype(np.float32)
        gogh = gogh.reshape((1,3,new_h,new_w))
        print("image resized to: ", gogh.shape)
        hoge = np.zeros((1,3,width,width), dtype=np.float32)
        hoge[0,:,width-new_h:,:] = gogh[0,:,:,:]
        gogh = subtract_mean(hoge)
    else:
        new_w = width*orig_w//orig_h
        new_h = width
        gogh = np.asarray(gogh.resize((new_w,new_h)))[:,:,:3].transpose(2, 0, 1)[::-1].astype(np.float32)
        gogh = gogh.reshape((1,3,new_h,new_w))
        print("image resized to: ", gogh.shape)
        hoge = np.zeros((1,3,width,width), dtype=np.float32)
        hoge[0,:,:,width-new_w:] = gogh[0,:,:,:]
        gogh = subtract_mean(hoge)
    return xp.asarray(gogh), new_w, new_h

def save_image(img, width, new_w, new_h, it):
    def to_img(x):
        im = np.zeros((new_h,new_w,3))
        im[:,:,0] = x[2,:,:]
        im[:,:,1] = x[1,:,:]
        im[:,:,2] = x[0,:,:]
        def clip(a):
            return 0 if a<0 else (255 if a>255 else a)
        im = np.vectorize(clip)(im).astype(np.uint8)
        Image.fromarray(im).save(args['out_dir']+"/im_%05d.png"%it)

    if args['gpu'] >= 0:
        img_cpu = add_mean(img.get())
    else:
        img_cpu = add_mean(img)
    if width == new_w:
        to_img(img_cpu[0,:,width-new_h:,:])
    else:
        to_img(img_cpu[0,:,:,width-new_w:])

def get_matrix(y):
    ch = y.data.shape[1]
    wd = y.data.shape[2]
    gogh_y = F.reshape(y, (ch,wd**2))
    gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True)/np.float32(ch*wd**2)
    return gogh_matrix
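
# Note (comment added for clarity): for a feature map y of shape
# (1, ch, wd, wd), get_matrix flattens it to (ch, wd**2) and returns the
# (ch, ch) Gram matrix G = y_flat y_flat^T / (ch * wd**2). G records which
# channels co-activate, which is what "style" means in this method; the
# spatial layout is deliberately discarded.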

class Clip(chainer.Function):
    def forward(self, x):
        x = x[0]
        ret = cuda.elementwise(
            'T x', 'T ret',
            '''
                ret = x<-120?-120:(x>136?136:x);
            ''', 'clip')(x)
        return ret

def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    mid_orig = nn.forward(Variable(img_orig))
    style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style))]

    if img_gen is None:
        if args['gpu'] >= 0:
            img_gen = xp.random.uniform(-20,20,(1,3,width,width),dtype=np.float32)
        else:
            img_gen = np.random.uniform(-20,20,(1,3,width,width)).astype(np.float32)
    img_gen = chainer.links.Parameter(img_gen)
    optimizer = optimizers.Adam(alpha=lr)
    optimizer.setup(img_gen)
    for i in range(max_iter):
        img_gen.zerograds()

        x = img_gen.W
        y = nn.forward(x)

        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            ch = y[l].data.shape[1]
            wd = y[l].data.shape[2]
            gogh_y = F.reshape(y[l], (ch,wd**2))
            gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True)/np.float32(ch*wd**2)

            L1 = np.float32(args['lam']) * np.float32(nn.alpha[l])*F.mean_squared_error(y[l], Variable(mid_orig[l].data))
            L2 = np.float32(nn.beta[l])*F.mean_squared_error(gogh_matrix, Variable(style_mats[l].data))/np.float32(len(y))
            L += L1+L2

            if i%100==0:
                print(i,l,L1.data,L2.data)

        L.backward()
        img_gen.W.grad = x.grad
        optimizer.update()

        tmp_shape = x.data.shape
        if args['gpu'] >= 0:
            img_gen.W.data += Clip().forward(img_gen.W.data).reshape(tmp_shape) - img_gen.W.data
        else:
            def clip(x):
                return -120 if x<-120 else (136 if x>136 else x)
            img_gen.W.data += np.vectorize(clip)(img_gen.W.data).reshape(tmp_shape) - img_gen.W.data

        if i%50==0:
            save_image(img_gen.W.data, W, nw, nh, i)
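
# For reference (comment added): each iteration minimizes
#   L = sum over layers l of
#       lam * alpha[l] * MSE(y[l], content_features[l])
#       + beta[l] * MSE(Gram(y[l]), Gram(style_features[l])) / n_layers
# so lam trades content fidelity against style strength.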

# Set the parameters
args = {}
args['orig_img'] = 'cat.png'                 # original (content) image
args['style_img'] = 'style_6.png'            # style image
args['out_dir'] = 'result'                   # output directory
args['model'] = 'nin_imagenet.caffemodel'    # pretrained model file
args['width'] = 435                          # output image width
args['iter'] = 5000                          # number of iterations
args['gpu'] = -1                             # GPU device id (-1 = CPU)
args['lam'] = 0.005                          # content-loss weight
args['lr'] = 4.0                             # learning rate (Adam alpha)

if args['gpu'] >= 0:
    cuda.check_cuda_available()
    chainer.Function.type_check_enable = False
    cuda.get_device(args['gpu']).use()
    xp = cuda.cupy
else:
    xp = np

# Dispatch on the model name by substring; 'vgg_chainer' must be tested
# before 'vgg' so that it is not swallowed by the broader match.
if 'nin' in args['model']:
    nn = NIN()
elif 'vgg_chainer' in args['model']:
    nn = VGG_chainer()
elif 'vgg' in args['model']:
    nn = VGG()
elif 'i2v' in args['model']:
    nn = I2V()
elif 'googlenet' in args['model']:
    nn = GoogLeNet()
else:
    print('invalid model name. you can use (nin, vgg, vgg_chainer, i2v, googlenet)')
if args['gpu'] >= 0:
    nn.model.to_gpu()

W = args['width']
img_content, nw, nh = image_resize(args['orig_img'], W)
img_style, _, _ = image_resize(args['style_img'], W)

generate_image(img_content, img_style, W, nw, nh, img_gen=None, max_iter=args['iter'], lr=args['lr'])
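
If the result looks too abstract or too literal, lam is the first knob to try. A hedged sketch of a follow-up run (the values are illustrative, not from the original post):

args['lam'] = 0.05   # larger lam keeps the output closer to the content image
args['iter'] = 1000  # shorter run for a quick comparison
generate_image(img_content, img_style, W, nw, nh, img_gen=None,
               max_iter=args['iter'], lr=args['lr'])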

[Input files] * Upload these to Google Colaboratory in advance.

Input file                Description
cat.png                   original (content) image
style_6.png               style image (the content image is re-rendered to resemble this)
nin_imagenet.caffemodel   pretrained model file (can be found online)
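
One way to get these files onto the runtime is Colab's standard upload helper (an optional sketch; google.colab is available on Colaboratory runtimes):

from google.colab import files
uploaded = files.upload()  # choose cat.png, style_6.png and nin_imagenet.caffemodel in the dialog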

[Generated images] * The run saves an image every 50 iterations (100 files over 5000 iterations); five of them are picked out below.

im_01000.png
im_02000.png
im_03000.png
im_04000.png
im_04950.png
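
To preview the outputs inline in the notebook, something like this works (my addition, using IPython's display helpers):

from IPython.display import Image as IPImage, display
for it in (1000, 2000, 3000, 4000, 4950):
    display(IPImage('result/im_%05d.png' % it))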

Synthesis takes a while, but since it can turn a photo into something manga-like or Gothic-looking, it feels like it could be put to some use...

(Verified on Google Colaboratory.)