SQL Server 风格的通配符匹配

TFB_PLC

4.20/5 (4投票s)

2009年7月5日

CPOL

4分钟阅读

31123

138

在客户端代码中匹配 SQL Server 风格的通配符。

下载源代码 - 19.19 KB

引言

我被要求突出显示 CListCtrl 中与用于检索记录的 SQL 筛选器的一部分匹配的字段。与其为每个可能匹配的筛选条件多次访问数据库，最简洁的解决方案似乎是记录 LIKE 模式并在客户端针对列表中的每一行进行匹配。此代码是该想法的实现。

背景

当然，您可以将 LIKE 语法转换为正则表达式语法，并通过您最喜欢的正则表达式引擎运行它，但要做好这一点，无论如何都需要仔细标记该模式（例如，SQL Server 不支持 C 风格的转义），那么为什么不编写一个类来完成整个工作呢？

由于我已经编写了一个词法分析器生成器类，我有一个代码库可以用来工作，事实上，我只需要大大简化我已经写好的东西。

第一个任务是标记通配符模式。 SQL Server 支持 '%' 代表任意字符的零次或多次重复，'_' 代表任意单个字符，以及字符范围，字符范围可以选择取反，例如，[a-z], [^0-9]。

以下代码标记了通配符

enum e_Token {eEOF, eZeroOrMore, eAny, eCharSet};

typedef basic_string_token<CharT> string_token;

static e_Token next(const CharT * &curr_, const CharT *end_,
    string_token &chars_, const bool icase_,
    const std::locale &locale_)
{
    e_Token eToken = eCharSet;

    if (curr_ >= end_) return eEOF;

    switch (*curr_)
    {
    case '%':
#if defined _MSC_VER && _MSC_VER <= 1200
        chars_._charset.erase();
#else
        chars_._charset.clear();
#endif
        chars_._negated = true;
        eToken = eZeroOrMore;
        break;
    case '_':
#if defined _MSC_VER && _MSC_VER <= 1200
        chars_._charset.erase();
#else
        chars_._charset.clear();
#endif
        chars_._negated = true;
        eToken = eAny;
        break;
    case '[':
        charset(curr_, end_, chars_, icase_, locale_);
        chars_.remove_duplicates();
        chars_.normalise();
        break;
    default:
        if (icase_ && (std::isupper(*curr_, locale_) ||
            std::islower(*curr_, locale_)))
        {
            const CharT upper_ = std::toupper(*curr_, locale_);
            const CharT lower_ = std::tolower(*curr_, locale_);

            chars_._charset = upper_;
            chars_._charset += lower_;
        }
        else
        {
            chars_._charset = *curr_;
        }

        chars_._negated = false;
        break;
    }

    ++curr_;
    return eToken;
}

如您所见，主标记化过程没有什么惊天动地的，尽管字符集处理变得更加复杂。在正则表达式实现中，此例程要复杂得多！

字符范围的标记化如下所示

static void charset(const CharT * &curr_, const CharT *end_,
       string_token &chars_, const bool icase_,
       const std::locale &locale_)
{
    CharT prev_ = 0;

    ++curr_;

    if (curr_ >= end_)
    {
        // Pointless returning index if at end of string
        throw runtime_error("Unexpected end of wildcard "
            "following '['.");
    }

    chars_._negated = *curr_ == '^';

    if (chars_._negated)
    {
        ++curr_;

        if (curr_ >= end_)
        {
            // Pointless returning index if at end of string
            throw runtime_error("Unexpected end of wildcard "
                "following '^'.");
        }
    }

    while (*curr_ != ']')
    {
        prev_ = *curr_;
        ++curr_;

        if (curr_ >= end_)
        {
            // Pointless returning index if at end of string
            throw runtime_error("Unexpected end of wildcard "
                "(missing ']').");
        }

        if (*curr_ == '-')
        {
            charset_range(prev_, curr_, end_, chars_, icase_, locale_);
        }
        else
        {
            if (icase_ && (std::isupper(prev_, locale_) ||
                std::islower(prev_, locale_)))
            {
                const CharT upper_ = std::toupper(prev_, locale_);
                const CharT lower_ = std::tolower(prev_, locale_);

                chars_._charset += upper_;
                chars_._charset += lower_;
            }
            else
            {
                chars_._charset += prev_;
            }
        }
    }

    if (!chars_._negated && chars_._charset.empty())
    {
        throw runtime_error("Empty charsets not allowed.");
    }
}

static void charset_range(const CharT prev_, const CharT * &curr_,
    const CharT *end_, string_token &chars_, const bool icase_,
    const std::locale &locale_)
{
    ++curr_;

    if (curr_ >= end_)
    {
        // Pointless returning index if at end of string
        throw runtime_error("Unexpected end of wildcard "
            "following '-'.");
    }

    std::size_t range_end_ = 
      static_cast<typename Traits::index_type>(*curr_);

    ++curr_;

    if (curr_ >= end_)
    {
        // Pointless returning index if at end of string
        throw runtime_error("Unexpected end of wildcard "
            "(missing ']').");
    }

    std::size_t range_start_ = 
      static_cast<typename Traits::index_type>(prev_);

    // Semanic check
    if (range_end_ < range_start_)
    {
        throw runtime_error("Invalid range in charset.");
    }

    chars_._charset.reserve(chars_._charset.size() +
        (range_end_ + 1 - range_start_));

    for (; range_start_ <= range_end_; ++range_start_)
    {
        const CharT ch_ = static_cast<CharT>(range_start_);

        if (icase_ && (std::isupper(ch_, locale_) ||
            std::islower(ch_, locale_)))
        {
            const CharT upper_ = std::toupper(ch_, locale_);
            const CharT lower_ = std::tolower(ch_, locale_);

            chars_._charset += upper_;
            chars_._charset += lower_;
        }
        else
        {
            chars_._charset += ch_;
        }
    }
}

同样，由于通配符范围的简单性，这都是非常基本的东西。

对通配符进行标记后，我们现在可以构建一个令牌的语法树，这反过来又很简短且易于理解

static node *parse(const CharT * &curr_, const CharT *end_,
       node_ptr_vector &node_ptr_vector_, const bool icase_,
       const std::locale &locale_)
{
    node *root_ = 0;
    typename tokeniser::string_token chars_;

    while (curr_ < end_)
    {
        tokeniser::e_Token eToken = tokeniser::next(curr_, end_, chars_,
            icase_, locale_);
        node *node_ = 0;

        switch (eToken)
        {
        case tokeniser::eZeroOrMore:
            node_ptr_vector_->push_back(0);
            node_ptr_vector_->back() = new leaf_node(chars_);
            node_ = node_ptr_vector_->back();
            node_ptr_vector_->push_back(0);
            node_ptr_vector_->back() = new iteration_node(node_);
            node_ = node_ptr_vector_->back();
            break;
        case tokeniser::eAny:
        case tokeniser::eCharSet:
            node_ptr_vector_->push_back(0);
            node_ptr_vector_->back() = new leaf_node(chars_);
            node_ = node_ptr_vector_->back();
            break;
        }

        if (root_)
        {
            node *lhs_ = root_;

            node_ptr_vector_->push_back(0);
            node_ptr_vector_->back() = new sequence_node(lhs_, node_);
            root_ = node_ptr_vector_->back();
        }
        else
        {
            root_ = node_;
        }
    }

    if (root_)
    {
        node *rhs_ = 0;

        node_ptr_vector_->push_back(0);
        node_ptr_vector_->back() = new end_node;
        rhs_ = node_ptr_vector_->back();
        node_ptr_vector_->push_back(0);
        node_ptr_vector_->back() = new sequence_node(root_, rhs_);
        root_ = node_ptr_vector_->back();
    }

    return root_;
}

在生成语法树之后，我们真的需要更进一步来生成 DFA。使用的方法直接来自著名的“龙书”。尽管树生成代码看起来很简单，但实际上幕后还有更多的事情发生！如果您阅读《龙书》，它将解释说，firstpos、lastpos 和 followpos 集的计算可以在事后通过单个树遍历完成。实际上，由于树的构造性质，没有必要将其作为单独的步骤来完成。因此，我们在构建每个子树时构建这些集合...

以下是执行此操作的一些快速片段的树节点

basic_leaf_node(const string_token &token_) :
    node(false),
    _token(token_)
{
    if (!_nullable)
    {
        _firstpos.push_back(this);
        _lastpos.push_back(this);
    }
}

basic_sequence_node(node *left_, node *right_) :
    node(left_->nullable() && right_->nullable()),
    _left(left_),
    _right(right_)
{
    _left->append_firstpos(_firstpos);

    if (_left->nullable())
    {
        _right->append_firstpos(_firstpos);
    }

    if (_right->nullable())
    {
        _left->append_lastpos(_lastpos);
    }

    _right->append_lastpos(_lastpos);

    node_vector &lastpos_ = _left->lastpos();
    const node_vector &firstpos_ = _right->firstpos();

    for (node_vector::iterator iter_ = lastpos_.begin(),
        end_ = lastpos_.end(); iter_ != end_; ++iter_)
    {
        (*iter_)->append_followpos(firstpos_);
    }
}

basic_iteration_node(node *next_) :
    node(true),
    _next(next_)
{
    node_vector::iterator iter_;
    node_vector::iterator end_;

    _next->append_firstpos(_firstpos);
    _next->append_lastpos(_lastpos);

    for(iter_ = _lastpos.begin(), end_ = _lastpos.end();
        iter_ != end_; ++iter_)
    {
        (*iter_)->append_followpos(_firstpos);
    }
}

basic_end_node() : 
    node(false)
{
    node::_firstpos.push_back(this);
    node::_lastpos.push_back(this);
}

basic_node(const bool nullable_) :
     _nullable(nullable_)
{
}

以下是我们如何将树转换为 DFA

void build(const CharT *curr_, const CharT *end_)
{
    typename parser::node_ptr_vector node_ptr_vector_;
    node *root_ = parser::parse(curr_, end_, node_ptr_vector_, _locale);
    typename const node::node_vector *followpos_ = &root_->firstpos();
    node_set_vector seen_sets_;
    size_t_vector hash_vector_;

    closure(followpos_, seen_sets_, hash_vector_);

    for (std::size_t index_ = 0; index_ < seen_sets_->size(); ++index_)
    {
        equivset_list equiv_list_;

        build_equiv_list(seen_sets_[index_], equiv_list_);

        for (typename equivset_list::list::const_iterator iter_ =
            equiv_list_->begin(), end_ = equiv_list_->end();
            iter_ != end_; ++iter_)
        {
            equivset *equivset_ = *iter_;
            const std::size_t transition_ = closure
                (&equivset_->_followpos, seen_sets_,
                hash_vector_);

            if (transition_ != npos)
            {
                _dfa[index_]._transitions.push_back
                    (transition(equivset_->_token, transition_));
            }
        }
    }
}

closure() 函数构建一个 followpos 集的向量，并识别何时同一个集合出现不止一次，返回现有条目的索引而不是创建一个新的条目。 build_equiv_list() 在给定的树节点处相交字符范围，并基本上解决任何歧义。由于我们没有“或”或独立的重复运算符来处理，因此唯一可能的歧义来自 '%' 通配符的使用。假设我们有序列 '%a'。我们基本上会将“任意字符”与“a”相交，给我们留下两个字符范围：'[^a]' 和 '[a]'。现在，在相交过程中，这些转换会更新其 followpos 集以反映相交。在此示例中，'[^a]' 将在自身上循环回来（状态 0），而 '[a]' 将转到 DFA 状态 1。当状态 1 发生闭包时，我们最终会得到相同的两个字符范围，但这一次，'[^a]' 将返回到状态 0（这意味着第一个 'a' 实际上是 '%' 的一部分），而 '[a]' 在自身上循环到状态 1（也意味着第一个 'a' 是 '%' 的一部分）。此示例中的 DFA 状态 1 是结束状态。

我将字符范围保留为字符串，而不是像我的词法分析器生成器那样转换为等价类，因为对于一个简单的通配符类来说，性能应该不是一个问题。希望这也使代码更容易理解，因为在逐步执行代码以查看正在发生的事情时，等价类的引入会使事情变得相当不透明！

最后，这里是匹配例程

bool match(const string &str_) const
{
    bool match_ = true;
    const CharT *curr_ = str_.c_str();
    const CharT *end_ = curr_ + str_.size();
    std::size_t state_ = 0;

    for (; curr_ < end_; ++curr_)
    {
        if (!_dfa[state_].match(*curr_, state_))
        {
            match_ = false;
            break;
        }
    }

    if (match_)
    {
        match_ = _dfa[state_]._end;
    }

    return match_;
}

DFA 状态匹配例程

bool match(const CharT c_, std::size_t &next_) const
{
    bool match_ = false;
    typename transition_deque::const_iterator iter_ =
        _transitions.begin();
    typename transition_deque::const_iterator end_ =
        _transitions.end();

    for (; iter_ != end_; ++iter_)
    {
        if (iter_->match(c_, next_))
        {
            match_ = true;
            break;
        }
    }

    return match_;
}

转换匹配例程

bool match(const CharT c_, std::size_t &next_) const
{
    bool match_ = false;

    match_ = _chars._charset.find(c_) != string_token::string::npos;
    match_ ^= _chars._negated;

    if (match_)
    {
        next_ = _next;
    }

    return match_;
}

Using the Code

zip 文件带有一个简单的例子，如下所示

#include <iostream>
#include "tfbWildcard/wildcard.hpp"

int main(int argc, char* argv[])
{
    std::string test_("%blah%");

    try
    {
        tfb::wildcard wc;

        wc.assign(test_, true);

        bool match = wc.match("blah");

        std::cout << "Match: " << match << std::endl;
    }
    catch (const std::exception &e)
    {
        std::cout << e.what();
    }

    return 0;
}

关注点

请注意，tfb::wildcard 是一个 typedef。您可以使用 tfb::wwildcard 创建一个宽字符类。您可以使用 string 构造该类而不是 assign，但除此之外，这就是整个 API！

历史

2009-07-03：第一个版本发布
2009-07-03：修复了单个字符的大小写不敏感性，并将大小写敏感性设为选项
2010-04-29：更新了源代码 - 此版本与 C++0x 兼容
2010-06-17：此版本处理空字符串作为通配符