图像标记器 - 基于卷积神经网络的图像分类器

Huseyin Atasoy

4.95/5（16 票）

2019 年 4 月 3 日

Apache

3分钟阅读

35090

1407

一个基于卷积神经网络的图像分类器/标签器。现在借助 Intel MKL 支持，速度提高了 10 倍以上。

我们将编写一个应用程序，让我们可以按关键字搜索图像。我不喜欢库依赖项或“黑盒”，因此我们不会使用任何第三方 API 或库。一切都将用纯 C# 编写，并保持简单。（使用 CeNiN v0.2；在 Intel MKL 支持可用时，速度提高了 10 倍以上。）

引言

深度卷积神经网络是图像处理领域的热门话题之一。存在各种语言的不同实现。但是，如果您试图了解这些想法背后的逻辑，那么大型实现并不总是有帮助的。因此，我以最简形式将卷积神经网络的前馈阶段实现为一个 .NET 库；CeNiN.dll。

我们将使用 CeNiN 对图像进行分类，并使用关键字标记它们，以便我们可以搜索一组图像中的对象或场景。例如，我们将能够在一个我们选择的文件夹中搜索并找到包含猫、汽车或我们想要的任何内容的图像。

CeNiN 不包含反向传播的实现，反向传播是训练神经网络模型所必需的。我们将使用预训练的模型。我们将使用的原始模型 (imagenet-matconvnet-vgg-f) 以及格式与 CeNiN 兼容的同一模型可以在这里和这里找到。该模型包含 19+2（输入和输出）层和 60824256 个权重，并已针对 1000 个类别的图像进行了训练...

准备模型

首先，我们使用构造函数加载模型。由于从模型文件中加载数百万个参数可能需要一段时间，因此我们在单独的线程中调用构造函数，以避免阻塞 UI：

// Load the pre-trained model on a worker thread: reading the model file can
// take a while and must not block the UI thread.
Thread t = new Thread(() =>
{
    try
    {
        cnn = new CNN("imagenet-matconvnet-vgg-f.cenin");

        // Controls are updated through Invoke so that the change runs on the
        // thread that owns ddLabel.
        ddLabel.Invoke((MethodInvoker)delegate ()
        {
            cbClasses.Items.AddRange(cnn.outputLayer.classes);
            dropToStart();
        });
    }
    catch (Exception)
    {
        // NOTE(review): any exception raised inside the try block (including
        // one thrown by the UI update above) is reported as a missing model
        // file. Narrow the catch once the exceptions thrown by the CNN
        // constructor are confirmed.
        ddLabel.Invoke((MethodInvoker)delegate ()
        {
            ddLabel.Text = "Missing model file!";

            // The message must be a single-line literal; a regular C# string
            // cannot contain a raw line break.
            DialogResult answer = MessageBox.Show(
                this,
                "Couldn't find model file. Do you want to be redirected to download page?",
                "Missing Model File",
                MessageBoxButtons.YesNo,
                MessageBoxIcon.Error);

            if (answer == DialogResult.Yes)
            {
                Process.Start("http://huseyinatasoy.com/y.php?bid=71");
            }
        });
    }
});
t.Start();

分类图像

我们需要一个结构来保存结果：

/// <summary>
/// A single classification result: one keyword string attached to one image,
/// together with the probability reported for it.
/// </summary>
private struct Match
{
    /// <summary>Index of the image, as supplied to the constructor.</summary>
    public int ImageIndex { get; set; }

    /// <summary>Keyword text associated with the image.</summary>
    public string Keywords { get; set; }

    /// <summary>Probability value stored for this keyword.</summary>
    public float Probability { get; set; }

    /// <summary>Display name of the image.</summary>
    public string ImageName { get; set; }

    /// <summary>Creates a match with every property set from the arguments.</summary>
    /// <param name="imageIndex">Value stored in <see cref="ImageIndex"/>.</param>
    /// <param name="keywords">Value stored in <see cref="Keywords"/>.</param>
    /// <param name="probability">Value stored in <see cref="Probability"/>.</param>
    /// <param name="imageName">Value stored in <see cref="ImageName"/>.</param>
    public Match(int imageIndex, string keywords, float probability, string imageName)
        : this()
    {
        this.ImageName = imageName;
        this.Probability = probability;
        this.Keywords = keywords;
        this.ImageIndex = imageIndex;
    }
}

CeNiN 将各层以层链的形式加载到内存中。该链是一个链表，其首尾节点分别是 Input 层和 Output 层。为了对图像进行分类，先将图像设置为输入，然后逐层迭代，在每一步调用 feedNext() 将数据馈送到下一层。数据到达 Output 层时，其形式为概率向量。调用 getDecision() 会将概率按从高到低排序，之后每个概率都可以视为一个 Match。这些调用同样要放在单独的线程中执行，以避免阻塞 UI。此外，由于工作线程不能直接修改 UI 元素，修改 UI 元素的代码（向 lv_KeywordList 添加新行、更新 ddLabel.Text）应交由 GUI 线程执行。

// Classify every image in imageFullPaths on a worker thread so the UI stays
// responsive. Every access to a control goes through Invoke so that it runs
// on the thread that owns the control.
Thread t = new Thread(() =>
{
    int imCount = imageFullPaths.Length;
    for (int j = 0; j < imCount; j++)
    {
        // The using block releases the bitmap even when setInput throws.
        using (Bitmap b = (Bitmap)Image.FromFile(imageFullPaths[j]))
        {
            ddLabel.Invoke((Action<int, int>)delegate (int y, int n)
            {
                ddLabel.Text = "Processing [" + (y + 1) + "/" + n + "]...\n\n" +
                                getImageName(imageFullPaths[y]);
            }, j, imCount);

            // Application.DoEvents() was dropped here: it pumps the message
            // queue of the calling thread, and this worker thread owns no
            // windows, so it had nothing to process.

            cnn.inputLayer.setInput(b, Input.ResizingMethod.ZeroPad);
        }

        // Walk the layer chain from the input layer, feeding each layer's
        // result to its successor, until the last layer is reached.
        Layer currentLayer = cnn.inputLayer;
        while (currentLayer.nextLayer != null)
        {
            currentLayer.feedNext();
            currentLayer = currentLayer.nextLayer;
        }
        Output outputLayer = (Output)currentLayer;
        outputLayer.getDecision();

        // Invoke is synchronous, so reading the loop variable j inside the
        // delegate observes the value of the current iteration.
        lv_KeywordList.Invoke((MethodInvoker)delegate ()
        {
            // Keep every class above the 5% threshold. The index check stops
            // the loop at the end of the array when every class of a small
            // model exceeds the threshold.
            // NOTE(review): assumes probabilities is an array (uses .Length) - confirm.
            int k = 0;
            while (k < outputLayer.probabilities.Length && outputLayer.probabilities[k] > 0.05)
            {
                Match m = new Match(
                    j,
                    outputLayer.sortedClasses[k],
                    (float)Math.Round(outputLayer.probabilities[k], 3),
                    getImageName(imageFullPaths[j])
                );
                matches.Add(m);
                k++;
            }
        });
    }

    lv_KeywordList.Invoke((MethodInvoker)delegate ()
    {
        groupBox2.Enabled = true;
        btnFilter.PerformClick();

        // Width -2 auto-sizes a column to its header text, -1 to its longest
        // item. Column 1 is left untouched and the last column gets -1.
        int k;
        for (k = 0; k < lv_KeywordList.Columns.Count - 1; k++)
        {
            if (k != 1)
            {
                lv_KeywordList.Columns[k].Width = -2;
            }
        }
        lv_KeywordList.Columns[k].Width = -1;

        dropToStart();
    });
});
t.Start();

现在所有图像都已使用关键字标记，这些关键字实际上就是所用模型的类别描述。最后，我们遍历 matches，找出包含用户所输入关键字的每个 Match。

// Rebuild the keyword list from the stored matches, keeping only those that
// reach the probability threshold and (when a search text is given) contain
// that text.
float probThresh = (float)numericUpDown1.Value;
// Invariant lower-casing avoids culture-specific mappings (e.g. the Turkish
// dotless i) changing the search text.
string str = cbClasses.Text.ToLowerInvariant();
lv_KeywordList.Items.Clear();
pictureBox1.Image = null;

List<int> imagesToShow = new List<int>();
// Mirrors imagesToShow for constant-time "already added" checks.
HashSet<int> seenImages = new HashSet<int>();

int j = 0; // row index of the next entry added to the list

bool stringFilter = (str != "");

for (int i = 0; i < matches.Count; i++)
{
    Match m = matches[i];

    bool cond = (m.Probability >= probThresh);
    if (stringFilter)
    {
        // Case-insensitive search: the stored keywords are not lower-cased,
        // so a case-sensitive Contains would miss capitalised descriptions.
        cond = cond && m.Keywords.IndexOf(str, StringComparison.OrdinalIgnoreCase) >= 0;
    }

    if (cond)
    {
        addMatchToList(j, m);
        if (seenImages.Add(m.ImageIndex))
        {
            imagesToShow.Add(m.ImageIndex);
        }
        j++;
    }
}

if (lv_KeywordList.Items.Count > 0)
{
    lv_KeywordList.Items[0].Selected = true;
}

就这么简单！

为 ImageTagger 训练您自己的模型

您可以使用 MatConvNet 之类的工具训练自己的神经网络，并将其转换为 CeNiN 格式，以便在 ImageTagger 中使用。下面是一个将 VGG 网络转换为 CeNiN 兼容格式的 MATLAB 脚本：

function vgg2cenin(vggMatFile) % vgg2cenin('imagenet-matconvnet-vgg-f.mat')
% VGG2CENIN  Convert a VGG-style .mat network file into a .cenin file.
%   vgg2cenin(vggMatFile) loads the network stored in vggMatFile and writes
%   a binary file with the same name and the extension '.cenin' next to it.
%   The output holds, in order: a text header, the layer count, the input
%   dimensions, three average-image values, one record per layer and a
%   closing 'EOF' record.
%
%   vggMatFile - path of the .mat file; it must provide 'layers' and 'meta'.
%
%   Raises an error when the output file cannot be opened for writing.
  fprintf('Loading mat file...\n');
  net=load(vggMatFile);
  lc=size(net.layers,2);

  vggMatFile(find(vggMatFile=='.',1,'last'):end)=[]; % remove extension

  outFile=strcat(vggMatFile,'.cenin');
  f=fopen(outFile,'w');   % Open an empty file with the same name
  if f<0
    error('vgg2cenin:fileOpen','Could not open "%s" for writing.',outFile);
  end
  closer=onCleanup(@() fclose(f)); %#ok<NASGU> closes the file on return and on error

  fprintf(f,'CeNiN NEURAL NETWORK FILE');   % Header
  fwrite(f,lc,'int');             % Layer count
  if(isfield(net.meta,'inputSize'))
    s=net.meta.inputSize;
  else
    s=net.meta.inputs.size(1:3);
  end
  for i=1:length(s)
    fwrite(f,s(i),'int'); % Input dimensions (height, width and number of channels (depth))
  end

  % Three average values are written, one per channel. When the model stores
  % a full H x W x 3 average image, reduce it to per-channel means first;
  % linear indices 1..3 would otherwise pick three pixels of channel 1.
  avg=net.meta.normalization.averageImage;
  if ndims(avg)==3 && size(avg,3)==3 && numel(avg)>3
    avg=mean(mean(avg,1),2);
  end
  for i=1:3
    fwrite(f,avg(i),'single');
  end

  for i=1:lc % For each layer
    l=net.layers{i};
    t=l.type;
    s=length(t);
    fwrite(f,s,'int8');   % String length
    fprintf(f,'%s',t);    % Layer type (string), written as data, not as a format

    fprintf('Writing layer %d (%s)...\n',i,l.type);

    if strcmp(t,'conv') % Convolution layers
      st=l.stride;
      p=l.pad;

      % We need 4 padding values for CeNiN (top, bottom, left, right)
      % In vgg format if there are one value, all padding values are
      % the same and if there are two values, these are for top-bottom
      % and left-right paddings.
      % NOTE(review): a two-value pad [a b] is expanded to [a b a b] below,
      % which does not match the comment above (that would be [a a b b]).
      % Left unchanged; confirm the intended order against the CeNiN reader.
      if size(st,2)<2 , st(2)=st(1); end
      if size(p,2)<2 , p(2)=p(1); end
      if size(p,2)<3 , p(3:4)=[p(1) p(2)]; end

      % Four padding values
      fwrite(f,p(1),'int8');
      fwrite(f,p(2),'int8');
      fwrite(f,p(3),'int8');
      fwrite(f,p(4),'int8');

      % Dimensions (height, width, number of channels (depth),
      % number of filters)
      s=size(l.weights{1});
      for j=1:length(s)
        fwrite(f,s(j),'int');
      end

      % Vertical and horizontal stride values (StrideY and StrideX)
      fwrite(f,st(1),'int8');
      fwrite(f,st(2),'int8');

      % Weight values, written in one call; writing them one by one takes
      % a long time because there are many of them.
      fwrite(f,l.weights{1}(:),'single');

      % And biases
      fwrite(f,l.weights{2}(:),'single');

    elseif strcmp(t,'relu') % ReLu layers
      % Layer type ('relu') has been written above. There are no extra
      % parameters to be written for this layer..

    elseif strcmp(t,'pool') % Pooling layers
      st=l.stride;
      p=l.pad;
      po=l.pool;
      if size(st,2)<2 , st(2)=st(1); end
      if size(p,2)<2 , p(2)=p(1); end
      if size(p,2)<3 , p(3:4)=[p(1) p(2)]; end
      if size(po,2)<2 , po(2)=po(1); end

      % Four padding values (top, bottom, left, right)
      fwrite(f,p(1),'int8');
      fwrite(f,p(2),'int8');
      fwrite(f,p(3),'int8');
      fwrite(f,p(4),'int8');

      % Vertical and horizontal pooling values (PoolY and PoolX)
      fwrite(f,po(1),'int8');
      fwrite(f,po(2),'int8');

      % Vertical and horizontal stride values (StrideY and StrideX)
      fwrite(f,st(1),'int8');
      fwrite(f,st(2),'int8');

    elseif strcmp(t,'softmax') % SoftMax layer (this is the last layer)
      s=size(net.meta.classes.description,2);
      fwrite(f,s,'int'); % Number of classes
      for j=1:size(net.meta.classes.description,2) % For each class description
        s=size(net.meta.classes.description{j},2);
        fwrite(f,s,'int8'); % String length
        fprintf(f,'%s',net.meta.classes.description{j}); % Class description (string)
      end
    end

  end

  fwrite(f,3,'int8'); % Length of "EOF" as if it is a layer type.
  fprintf(f,'EOF');   % And the "EOF" string itself...
  % The file is closed by the onCleanup object when the function returns.

end

有用链接

cuDNN：用于深度学习的高效原语
预训练模型（它们不直接与 CeNiN 兼容）

历史

2019 年 4 月 3 日：初始版本