65.9K
CodeProject 正在变化。 阅读更多。
Home

简单 XML 解析器 - 极简方法

starIconstarIconstarIconemptyStarIconemptyStarIcon

3.00/5 (6投票s)

2007 年 12 月 31 日

CPOL

2分钟阅读

viewsIcon

19248

无需验证的纯 C++ XML 解析器辅助类。

引言

在纯 C++ 程序中解析 XML 文档可能需要庞大的类库,或者通过 COM 连接到 MSXML 解析器。但有时,客户端程序升级为使用 XML 格式来发送和接收简短的消息。本文总结了一种极简的 XML 解析方法。我们把所有的验证都交给服务器和传输层,试图提供一套纯 C++、小巧便捷的类,用于处理小型和简单的 XML 消息。

代码

下面是单个 XmlParser.h 文件的全部内容,你的项目只需要包含这一个文件。如果可用,它会使用 Visual Studio 2005 引入的 Microsoft 安全 C 运行时库扩展。此外我们还使用了 STL 的 string 类。

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>

/// Non-owning view of one attribute value inside an XML start tag.
///
/// find() locates ` name="` inside the start tag and remembers where the value
/// begins; the conversion operators then parse the text found there. Nothing
/// is copied, so the buffer handed to find() must outlive every conversion.
/// Only double-quoted values preceded by a single space are recognised.
class XmlAttribute
{
  const char* pstr; // first character of the value, or 0 when nothing is selected

  // Returns a pointer to the '>' that closes the start tag beginning at
  // `element` (a '>' inside a double-quoted value is skipped), or to the
  // terminating NUL when the tag is never closed.
  static const char* tagEnd(const char* element)
  {
    bool quoted = false;
    const char* p = element;
    for (; *p != '\0'; ++p)
    {
      if (*p == '"')
        quoted = !quoted;
      else if (*p == '>' && !quoted)
        break;
    }
    return p;
  }

  // Number of characters in the value: up to the closing quote, or up to the
  // end of the buffer when the closing quote is missing.
  size_t valueLength() const { return strcspn(pstr, "\""); }

  public:
    XmlAttribute() : pstr(0) {}

    /// Selects attribute `name` of the element whose start tag begins at
    /// `element`. Returns false (and deselects any previous attribute) when
    /// either argument is null or the start tag has no such attribute.
    bool find(const char* element, const char* name)
    {
      pstr = 0;
      if (element == 0 || name == 0)
        return false;

      std::string attr = " ";
      attr += name;
      attr += "=\"";

      // A hit beyond the end of the start tag belongs to a child or sibling
      // element and must not be reported for this element.
      const char* hit = strstr(element, attr.c_str());
      if (hit == 0 || hit >= tagEnd(element))
        return false;

      pstr = hit + attr.size();
      return true;
    }

    // Conversions return a zero/empty value when no attribute is selected.
    // `unsigned long` is the underlying type of the Win32 DWORD typedef, so
    // this operator still serves DWORD targets without needing <windows.h>.
    operator unsigned long() const { return pstr == 0 ? 0 : strtoul(pstr, 0, 10); }
    operator int() const { return pstr == 0 ? 0 : atoi(pstr); }

    /// True when the value starts with "true", compared case-insensitively.
    operator bool() const
    {
      if (pstr == 0)
        return false;
      static const char kTrue[] = "true";
      for (size_t i = 0; i < 4; ++i)
      {
        // A NUL in the value never equals a letter, so the scan stops
        // before it can run off the end of the buffer.
        if (tolower(static_cast<unsigned char>(pstr[i])) != kTrue[i])
          return false;
      }
      return true;
    }

    operator float() const { return pstr == 0 ? 0.0f : static_cast<float>(atof(pstr)); }
    operator std::string() const
    {
      return pstr == 0 ? std::string() : std::string(pstr, valueLength());
    }

    /// Copies the value into the fixed-size array `dest`, truncating if needed,
    /// and returns `dest`. The array is zero-filled first, so the result is
    /// always NUL-terminated. operator= cannot be overloaded as a non-member,
    /// hence the use of operator&=.
    template<size_t size> friend
      const char* operator&=(char (&dest)[size], XmlAttribute attr)
    {
      memset(dest, 0, size);
      if (attr.pstr != 0)
      {
        size_t len = attr.valueLength();
        if (len > size - 1)
          len = size - 1; // keep room for the terminating NUL
        memcpy(dest, attr.pstr, len);
      }
      return &(dest[0]);
    }
};

/// Non-owning view of an XML element that starts at a '<' inside a
/// caller-supplied, NUL-terminated buffer. The buffer must outlive the view.
class XmlElement
{
  const char* content;      // points at the element's '<', or 0
  XmlAttribute anAttribute; // is reused by every GetAttribute() call

  public:
    /// Element name extracted from the start tag; empty when the text does
    /// not begin with '<'.
    class namestring: private std::string
    {
    public:
      namestring(const char* str)
      {
        if (str != 0 && str[0] == '<')
        {
          // The name ends at the first blank, line break, '>', '<' or '/'.
          assign(str+1, strcspn(str+1, " ></\t\r\n"));
        }
      }
      operator const char*() const { return c_str(); }
      bool operator==(const char* str) const { return 0 == strcmp(c_str(), str); }
      bool operator!=(const char* str) const { return 0 != strcmp(c_str(), str); }
    };

    const namestring Name;

    XmlElement(const char* str) : content(str), Name(str) {}

    /// Looks up attribute `name` on this element. The returned reference is
    /// to a single reused member, so the next GetAttribute() call overwrites
    /// it: convert the result before asking for another attribute.
    const XmlAttribute& GetAttribute(const char* name)
    {
      if (content == 0)
        anAttribute = XmlAttribute(); // nothing to search: report "not found"
      else
        anAttribute.find(content, name);
      return anAttribute;
    }

    /// With no name: returns the text just past the first '>' (the start of
    /// the element's content), or 0 when there is no '>'.
    /// With a name: returns the next occurrence of "<name " after this
    /// element's own '<', or 0 when there is none. An element is only
    /// matched when its name is followed by a space.
    const char* GetChild(const char* name = 0) const
    {
      if (content == 0 || *content == '\0')
        return 0;

      if (name == 0)
      {
        const char* close = strchr(content, '>');
        return close != 0 ? close + 1 : 0;
      }

      std::string child = "<";
      child += name;
      child += " ";
      return strstr(content+1, child.c_str());
    }

    /// Same forward search as GetChild(); kept as a separate name so calling
    /// code can express whether it means a nested or a following element.
    const char* GetSibling(const char* name = 0) const { return GetChild(name); }
};

我们定义了两个用户需要了解的类:XmlElement 和 XmlAttribute。

这里有一个解析简单 XML 消息的例子。它也展示了如何在没有大型 XML 库的情况下生成这样的消息。

/// Common base of the example message types.
struct Base
{
  int b;

  // Messages are handed out and destroyed through Base pointers, which
  // requires a virtual destructor.
  virtual ~Base() {}

  /// Appends the XML representation of the message to `s`.
  virtual void ToXML(std::string &s) const = 0;
};

/// Example message carried entirely in the attributes of one element.
struct DerivedOne : public Base
{
  int d11;
  char d12[10];
  float d13;

  /// Appends the message as a <DerivedOne ... /> element.
  void ToXML(std::string &s) const;
  /// Appends the message using `name` as the opening-tag text.
  void ToXML(std::string &s, const char* name) const;
  /// Fills the fields from the attributes of `xmlmsg`.
  void LoadXML(XmlElement& xmlmsg);
};

/// Example message with attributes plus a text body (d23).
struct DerivedTwo : public Base
{
  DWORD d21;
  bool d22;
  char d23[200]; // element text, NUL-terminated

  /// Appends the message as a <DerivedTwo ...>text</DerivedTwo> element.
  void ToXML(std::string &s) const;
};

/// Example message that nests three DerivedOne values as <d31> child elements.
struct DerivedThree: public Base
{
  DerivedOne d31[3];
  bool d32;

  /// Appends the message as a <DerivedThree ...> element with its children.
  void ToXML(std::string &s) const;
};

/// Parses the first element found in `msgstring` and returns a newly
/// allocated message object of the matching type, or NULL when the input is
/// null, contains no '<', or names an unknown element. The caller owns the
/// returned object and is responsible for deleting it.
const Base* CreateTypedMessage(const char* msgstring)
{
  _ASSERT(msgstring != 0 && strstr(msgstring, "<") != 0);

  // The assertion vanishes in release builds, so check again at run time.
  const char* start = (msgstring != 0) ? strchr(msgstring, '<') : 0;
  if (start == 0)
    return NULL;

  XmlElement xmlmsg(start);

  if (xmlmsg.Name == "DerivedOne")
  {
    DerivedOne* one = new DerivedOne();
    one->LoadXML(xmlmsg);
    return one;
  }

  if (xmlmsg.Name == "DerivedTwo")
  {
    DerivedTwo* two = new DerivedTwo();
    two->b = xmlmsg.GetAttribute("b");
    two->d21 = xmlmsg.GetAttribute("d21");
    two->d22 = xmlmsg.GetAttribute("d22");

    // Copy only the element text (everything up to the next '<') and never
    // more than d23 can hold; new DerivedTwo() zero-filled the array.
    const char* text = xmlmsg.GetChild();
    if (text != 0)
    {
      size_t len = strcspn(text, "<");
      if (len >= sizeof(two->d23))
        len = sizeof(two->d23) - 1;
      memcpy(two->d23, text, len);
      two->d23[len] = '\0';
    }
    return two;
  }

  if (xmlmsg.Name == "DerivedThree")
  {
    DerivedThree* three = new DerivedThree();
    three->b = xmlmsg.GetAttribute("b");
    three->d32 = xmlmsg.GetAttribute("d32");

    // Walk the <d31 ...> elements in document order. Entries without a
    // matching element keep the zero values from new DerivedThree().
    const size_t count = sizeof(three->d31) / sizeof(three->d31[0]);
    const char* pos = xmlmsg.GetChild("d31");
    for (size_t i = 0; i < count && pos != 0; ++i)
    {
      XmlElement child(pos);
      three->d31[i].LoadXML(child);
      pos = child.GetSibling("d31");
    }
    return three;
  }

  return NULL; // unknown element name
}

/// Fills every field from the same-named attribute of `xmlmsg`. A missing
/// attribute yields 0 for the numeric fields and an empty string for d12.
void DerivedOne::LoadXML(XmlElement& xmlmsg)
{
    b = xmlmsg.GetAttribute("b");
    d11 = xmlmsg.GetAttribute("d11");
    d12 &= xmlmsg.GetAttribute("d12"); // note use of operator &= to fill a char array
    d13 = xmlmsg.GetAttribute("d13");
}

// Appends this object to s as a <DerivedOne ... /> element.
void DerivedOne::ToXML(string& s) const
{
  // The two-argument overload does the real work; only the opening-tag text
  // is specific to this entry point.
  const char* const openingTag = "<DerivedOne ";
  ToXML(s, openingTag);
}

void DerivedOne::ToXML(string& s, const char* name) const
{
  char buf[20];

  s += name;
  s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
  s += "d11="; sprintf(buf, "\"%d\" ", d11); s += buf;
  s += "d12=\""; s += d12; s += "\" ";
  s += "d13="; sprintf(buf, "\"%f\" ", d13); s += buf;
  s += " />";
}

void DerivedTwo::ToXML(string& s) const
{
  char buf[20];

  s += "<DerivedTwo ";
  s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
  s += "d21="; sprintf(buf, "\"%u\" ", d21); s += buf;

  s += "d22="; s += (d22 ? "\"true\"" : "\"false\""); s += buf;
  s += " />";
  s += d23;
  s += "</DerivedTwo>";
}
 
void DerivedThree::ToXML(string& s) const
{
  char buf[20];

  s += "<DerivedThree ";
  s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
  s += "d32="; s += (d32 ? "\"true\"" : "\"false\""); s += buf;
  s += " />";
  d31[0].ToXML(s, "d31 idx=\"0\" ");
  d31[1].ToXML(s, "d31 idx=\"1\" ");
  d31[2].ToXML(s, "d31 idx=\"2\" ");
  s += "</DerivedThree>";
}

这里是 XML 格式的消息,供参考

<DerivedOne b="1211" d11="-12" d12="abcd" d13="31.2" />

<DerivedTwo b="1212" d21="1212" d22="True">this is the text for d23</DerivedTwo>

<DerivedThree b="1213" d32="True">
  <d31 idx="0" b="1221" d11="-12" d12="abcd" d13="31.2" />
  <d31 idx="1" b="1231" d11="-12" d12="abcd" d13="31.2" />
  <d31 idx="2" b="1241" d11="-12" d12="abcd" d13="31.2" />
 </DerivedThree>

关注点

从我们的角度来看,GetSibling() 和 GetChild() 是同义词。我强烈建议在使用这个类的代码中,不要忘记它们的语义区别。C++ 的规则使得我们无法以所需的方式重载 operator =()(赋值运算符只能作为成员函数重载,而 char 数组不是类类型),因此我使用了重载的 operator &=()。

我强烈建议定义 LoadXML(XmlElement& xmlmsg) 方法(如我们为类 DerivedOne 所做的那样),而不是像对 DerivedTwo 那样直接访问数据成员。

XML 通常以 UTF-8 编码。但你也应该注意将 < 编码为 &lt; 并将其解码回去。这里提供了一个使用 Win32 API 处理 wchar_t C 字符串的编码器和解码器片段。

/// Appends `content` to `s` as UTF-8, replacing '<', '>' and '&' with the
/// entities &lt;, &gt; and &amp;. A character that cannot be converted is
/// written as '?'. A null `content` appends nothing.
void Encoder(const wchar_t* content, std::string& s)
{
  if (content == 0)
    return;

  for (const wchar_t* pc = content; *pc; pc++)
  {
    if (!iswascii(*pc))
    {
      // A UTF-16 surrogate pair encodes one code point and has to be
      // converted as a unit; converting the halves separately loses it.
      int units = 1;
      if (*pc >= 0xD800 && *pc <= 0xDBFF && pc[1] >= 0xDC00 && pc[1] <= 0xDFFF)
        units = 2;

      char buf[8]; // one code point is at most 4 bytes in UTF-8
      const int written =
        WideCharToMultiByte(CP_UTF8, 0, pc, units, buf, sizeof(buf), NULL, NULL);
      if (written > 0)
        s.append(buf, written);
      else
        s += '?';

      pc += units - 1; // skip the low surrogate that was consumed above
    }
    else if (*pc == L'<')
      s += "&lt;";
    else if (*pc == L'>')
      s += "&gt;";
    else if (*pc == L'&')
      s += "&amp;";
    else
      s += static_cast<char>(*pc);
  }
}

/// Converts the text content of `xmlmsg` (everything between the end of its
/// start tag and the next '<') from UTF-8 to a wide string and decodes the
/// entities &amp;, &lt; and &gt;.
/// Returns a buffer allocated with new[] that the caller must delete[], or 0
/// when the element has no text or the conversion fails.
const wchar_t* Decoder(const XmlElement& xmlmsg)
{
  const char* pc = xmlmsg.GetChild();
  if (pc == 0)
    return 0;

  // strcspn stops at the end of the buffer when no '<' follows, where
  // pointer arithmetic on a failed strchr would not.
  const int bytes = static_cast<int>(strcspn(pc, "<"));
  if (bytes == 0)
    return 0;

  const int size = MultiByteToWideChar(CP_UTF8, 0, pc, bytes, 0, 0);
  if (size <= 0)
    return 0;

  wchar_t* content = new wchar_t[size+1];
  if (size != MultiByteToWideChar(CP_UTF8, 0, pc, bytes, content, size))
  {
    // Do not leak the buffer or return it half-filled.
    delete[] content;
    return 0;
  }
  content[size] = 0;

  // Decode in place: the write position never overtakes the read position
  // because every entity is longer than the character it stands for.
  wchar_t* pdst = content;
  for (const wchar_t* psrc = content; *psrc != 0; psrc++, pdst++)
  {
    if (wcsncmp(psrc, L"&amp;", 5) == 0)
    {
      *pdst = L'&';
      psrc += 4;
    }
    else if (wcsncmp(psrc, L"&lt;", 4) == 0)
    {
      *pdst = L'<';
      psrc += 3;
    }
    else if (wcsncmp(psrc, L"&gt;", 4) == 0)
    {
      *pdst = L'>';
      psrc += 3;
    }
    else
    {
      *pdst = *psrc;
    }
  }
  *pdst = 0; // cut off the leftover tail after the text became shorter

  return content;
}

历史

  • 2008 年 1 月 3 日 - 添加了 Encoder 和 Decoder 片段
© . All rights reserved.