简单 XML 解析器 - 极简方法
无需验证的纯 C++ XML 解析器辅助类。
引言
在纯 C++ 程序中解析 XML 文档可能需要庞大的类库,或者通过 COM 连接到 MSXML 解析器。但有时,客户端程序升级为使用 XML 格式来发送和接收简短的消息。本文总结了一种极简的 XML 解析方法。我们把所有的验证都交给服务器和传输层,试图提供一套纯 C++、小巧便捷的类,用于处理小型和简单的 XML 消息。
代码
你的项目只需要下面这一个 XmlParser.h 文件。在编译器支持的情况下,它会使用 Visual Studio 2005 引入的 Microsoft 安全 C 运行时库扩展。此外,我们还使用了 STL 的 string 类。
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <string>
/**
 * Non-owning view of one attribute value inside an XML start tag.
 *
 * find() locates `name="value"` inside the start tag that begins at the given
 * pointer; the conversion operators then interpret the located value.  The
 * object only stores a pointer into the caller's buffer, so that buffer must
 * outlive every use of the attribute.  Only double-quoted values are
 * recognised, and no entity decoding is performed.
 */
class XmlAttribute
{
    const char* pstr; // first character of the located value, or 0 when not found

    // Returns the end of the start tag that begins at `element`: the first
    // '>' that is not inside a quoted value, or the terminating NUL.
    static const char* tagEnd(const char* element)
    {
        char quote = 0;
        const char* p = element;
        for (; *p != '\0'; ++p)
        {
            if (quote != 0)
            {
                if (*p == quote)
                    quote = 0;
            }
            else if (*p == '"' || *p == '\'')
                quote = *p;
            else if (*p == '>')
                break;
        }
        return p;
    }

    // Length of the located value, i.e. up to the closing quote (or to the
    // end of the buffer when the closing quote is missing).
    size_t length() const { return strcspn(pstr, "\""); }

public:
    XmlAttribute() : pstr(0) {}

    /**
     * Looks up attribute `name` in the start tag beginning at `element`.
     *
     * The search is confined to the start tag, so attributes of child or
     * following elements are never matched.  The name must be preceded by
     * whitespace and directly followed by `="`.
     *
     * @return true when found; otherwise false, and every conversion
     *         afterwards yields its "empty" value (0, false, "").
     */
    bool find(const char* element, const char* name)
    {
        pstr = 0;
        if (element == 0 || name == 0)
            return false;

        std::string key = name;
        key += "=\"";

        const std::string tag(element, tagEnd(element));
        for (std::string::size_type pos = tag.find(key);
             pos != std::string::npos;
             pos = tag.find(key, pos + 1))
        {
            // Requiring leading whitespace keeps "id" from matching "uid".
            if (pos > 0 && isspace(static_cast<unsigned char>(tag[pos - 1])))
            {
                pstr = element + pos + key.size();
                return true;
            }
        }
        return false;
    }

    // `unsigned long` is the type behind the Win32 DWORD typedef.
    operator unsigned long() const { return pstr == 0 ? 0 : strtoul(pstr, NULL, 10); }
    operator int() const { return pstr == 0 ? 0 : atoi(pstr); }

    // True when the value starts with "true", compared case-insensitively.
    operator bool() const
    {
        if (pstr == 0)
            return false;
        static const char expected[] = "true";
        for (size_t i = 0; i < sizeof(expected) - 1; ++i)
        {
            // A NUL in the value mismatches and stops the scan in time.
            if (tolower(static_cast<unsigned char>(pstr[i])) != expected[i])
                return false;
        }
        return true;
    }

    operator float() const { return pstr == 0 ? 0.0f : static_cast<float>(atof(pstr)); }
    operator std::string() const { return pstr == 0 ? std::string() : std::string(pstr, length()); }

    /**
     * Copies the value into a fixed-size char array (plain assignment cannot
     * be overloaded for arrays, hence operator&=).  The destination is
     * zero-filled first and always NUL-terminated; values that do not fit are
     * truncated to size-1 characters.
     *
     * @return pointer to the first element of `dest`.
     */
    template<size_t size> friend
    const char* operator&=(char (&dest)[size], XmlAttribute attr)
    {
        memset(dest, 0, size);
        if (attr.pstr != 0)
        {
            size_t len = attr.length();
            if (len > size - 1)
                len = size - 1;
            memcpy(dest, attr.pstr, len);
        }
        return &(dest[0]);
    }
};
/**
 * Non-owning cursor positioned on the '<' of an XML element inside a
 * NUL-terminated document buffer.
 *
 * Nothing is copied except the element name, so the buffer must outlive the
 * object.  A null pointer is accepted and produces an element with an empty
 * name, no attributes and no children.
 */
class XmlElement
{
    const char* content;
    XmlAttribute anAttribute; // reused by every GetAttribute() call
public:
    /** Tag name extracted from the text following '<'. */
    class namestring: private std::string
    {
    public:
        // Leaves the name empty unless `str` points at '<'.  A closing tag
        // ("</x>") also yields an empty name because '/' is a delimiter.
        namestring(const char* str)
        {
            if (str != 0 && str[0] == '<')
            {
                assign(str + 1, strcspn(str + 1, " ></\t\r\n"));
            }
        }
        operator const char*() const { return c_str(); }
        bool operator==(const char* str) const { return str != 0 && 0 == strcmp(c_str(), str); }
        bool operator!=(const char* str) const { return !(*this == str); }
    };
    const namestring Name;

    // Intentionally not explicit: callers write `XmlElement e = pointer;`.
    XmlElement(const char* str) : content(str), Name(str) {}

    /**
     * Looks up attribute `name` on this element.
     *
     * The returned reference aliases a single member that is overwritten by
     * the next call, so convert or copy the result before calling again.
     */
    const XmlAttribute& GetAttribute(const char* name)
    {
        // An empty string stands in for a null element so the lookup simply fails.
        anAttribute.find(content != 0 ? content : "", name);
        return anAttribute;
    }

    /**
     * Without a name: returns the text just past the first '>' after the
     * element start (its content), or 0 when there is no '>'.
     *
     * With a name: returns the next "<name" after this element's own '<'
     * where the name is followed by whitespace, '/', '>' or the end of the
     * buffer, or 0 when there is none.  The search is not limited to this
     * element's subtree.
     */
    const char* GetChild(const char* name = 0) const
    {
        if (content == 0)
            return 0;
        if (name == 0)
        {
            const char* close = strchr(content, '>');
            return close != 0 ? close + 1 : 0;
        }
        if (*content == '\0')
            return 0; // nothing after the cursor to search

        std::string child = "<";
        child += name;
        for (const char* hit = strstr(content + 1, child.c_str());
             hit != 0;
             hit = strstr(hit + 1, child.c_str()))
        {
            // Reject longer names sharing the prefix, e.g. "<d310" for "d31".
            const char next = hit[child.size()];
            if (next == '\0' || strchr(" \t\r\n/>", next) != 0)
                return hit;
        }
        return 0;
    }

    // Same search as GetChild(); the separate name only documents intent at the call site.
    const char* GetSibling(const char* name = 0) const { return GetChild(name); }
};
我们定义了两个用户需要了解的类:XmlElement 和 XmlAttribute。
这里有一个解析简单 XML 消息的例子。它也展示了如何在没有大型 XML 库的情况下生成这样的消息。
/**
 * Common base of the example messages.
 *
 * Messages are created with `new` and handed around as `Base*`, so the
 * destructor is virtual to make deletion through the base pointer
 * well-defined.
 */
struct Base
{
    int b;

    Base() : b(0) {}
    virtual ~Base() {}

    /** Appends the XML representation of the message to `s`. */
    virtual void ToXML(std::string &s) const = 0;
};
/**
 * Example message carrying only attributes: an int, a short fixed-size
 * string and a float.
 */
struct DerivedOne : public Base
{
    int d11;
    char d12[10]; // NUL-terminated text
    float d13;

    // Start from a defined state so ToXML() never reads indeterminate data.
    DerivedOne() : d11(0), d13(0.0f) { memset(d12, 0, sizeof(d12)); }

    /** Appends this message as a <DerivedOne .../> element (Base interface). */
    void ToXML(std::string &s) const;
    /** Same, but with the caller-supplied opening tag text `name`. */
    void ToXML(std::string &s, const char* name) const;
    /** Fills the fields from the attributes of `xmlmsg`. */
    void LoadXML(XmlElement& xmlmsg);
};
/**
 * Example message with two attributes and a text body.
 */
struct DerivedTwo : public Base
{
    unsigned long d21; // the type behind the Win32 DWORD typedef
    bool d22;
    char d23[200];     // NUL-terminated element text

    // Start from a defined state so ToXML() never reads indeterminate data.
    DerivedTwo() : d21(0), d22(false) { memset(d23, 0, sizeof(d23)); }

    /** Appends this message as a <DerivedTwo ...>text</DerivedTwo> element. */
    void ToXML(std::string &s) const;
};
/**
 * Example message containing three nested DerivedOne records, serialised as
 * <d31> child elements.
 */
struct DerivedThree: public Base
{
    DerivedOne d31[3];
    bool d32;

    DerivedThree() : d32(false) {}

    /** Appends this message as a <DerivedThree> element with its <d31> children. */
    void ToXML(std::string &s) const;
};
/**
 * Parses the first element found in `msgstring` and builds the matching
 * message object.
 *
 * @param msgstring NUL-terminated XML text; may be null.
 * @return a heap-allocated message that the caller must `delete`, or NULL
 *         when the input is null, contains no '<', or names an unknown
 *         message type.
 */
const Base* CreateTypedMessage(const char* msgstring)
{
    const char* start = (msgstring != 0) ? strchr(msgstring, '<') : 0;
    if (start == 0)
        return NULL;

    XmlElement xmlmsg(start);

    if (xmlmsg.Name == "DerivedOne")
    {
        DerivedOne* one = new DerivedOne();
        one->LoadXML(xmlmsg);
        return one;
    }

    if (xmlmsg.Name == "DerivedTwo")
    {
        DerivedTwo* two = new DerivedTwo();
        two->b = xmlmsg.GetAttribute("b");
        two->d21 = xmlmsg.GetAttribute("d21");
        two->d22 = xmlmsg.GetAttribute("d22");

        // The element text ends at the next '<' (the closing tag); copy at
        // most what fits and keep the buffer NUL-terminated.
        memset(two->d23, 0, sizeof(two->d23));
        const char* text = xmlmsg.GetChild();
        if (text != 0)
        {
            size_t len = strcspn(text, "<");
            if (len > sizeof(two->d23) - 1)
                len = sizeof(two->d23) - 1;
            memcpy(two->d23, text, len);
        }
        return two;
    }

    if (xmlmsg.Name == "DerivedThree")
    {
        DerivedThree* three = new DerivedThree();
        three->b = xmlmsg.GetAttribute("b");
        three->d32 = xmlmsg.GetAttribute("d32");

        // Walk the <d31> children in document order; stop early when the
        // message holds fewer of them than the array has slots.
        const size_t slots = sizeof(three->d31) / sizeof(three->d31[0]);
        const char* cursor = xmlmsg.GetChild("d31");
        for (size_t i = 0; i < slots && cursor != 0; ++i)
        {
            XmlElement child(cursor);
            three->d31[i].LoadXML(child);
            cursor = child.GetSibling("d31");
        }
        return three;
    }

    return NULL; // unknown message type
}
/**
 * Fills this record from the attributes b, d11, d12 and d13 of `xmlmsg`.
 * Attributes that are absent leave the field at 0 / empty.
 */
void DerivedOne::LoadXML(XmlElement& xmlmsg)
{
    b = xmlmsg.GetAttribute("b");
    d11 = xmlmsg.GetAttribute("d11");
    d12 &= xmlmsg.GetAttribute("d12"); // operator&= copies text into the char array
    d13 = xmlmsg.GetAttribute("d13");
}
void DerivedOne::ToXML(string& s) const
{
return ToXML(s, "<DerivedOne ");
}
void DerivedOne::ToXML(string& s, const char* name) const
{
char buf[20];
s += name;
s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
s += "d11="; sprintf(buf, "\"%d\" ", d11); s += buf;
s += "d12=\""; s += d12; s += "\" ";
s += "d13="; sprintf(buf, "\"%f\" ", d13); s += buf;
s += " />";
}
void DerivedTwo::ToXML(string& s) const
{
char buf[20];
s += "<DerivedTwo ";
s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
s += "d21="; sprintf(buf, "\"%u\" ", d21); s += buf;
s += "d22="; s += (d22 ? "\"true\"" : "\"false\""); s += buf;
s += " />";
s += d23;
s += "</DerivedTwo>";
}
void DerivedThree::ToXML(string& s) const
{
char buf[20];
s += "<DerivedThree ";
s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
s += "d32="; s += (d32 ? "\"true\"" : "\"false\""); s += buf;
s += " />";
d31[0].ToXML(s, "d31 idx=\"0\" ");
d31[1].ToXML(s, "d31 idx=\"1\" ");
d31[2].ToXML(s, "d31 idx=\"2\" ");
s += "</DerivedThree>";
}
这里是 XML 格式的消息,供参考
<DerivedOne b="1211" d11="-12" d12="abcd" d13="31.2" />
<DerivedTwo b="1212" d21="1212" d22="True">this is the text for d23</DerivedTwo>
<DerivedThree b="1213" d32="True">
<d31 idx="0" b="1221" d11="-12" d12="abcd" d13="31.2" />
<d31 idx="1" b="1231" d11="-12" d12="abcd" d13="31.2" />
<d31 idx="2" b="1241" d11="-12" d12="abcd" d13="31.2" />
</DerivedThree>
关注点
从实现的角度来看,GetSibling() 和 GetChild() 是同义词。但我强烈建议在使用这个类的代码中不要忘记它们在语义上的区别。C++ 的规则使得我们无法以所需的方式重载 operator =(),因此我使用了重载的 operator &=()。
我强烈建议定义 LoadXML(XmlElement& xmlmsg) 方法(如我们为类 DerivedOne 所做的那样),而不是像对 DerivedTwo 那样直接访问数据成员。
XML 通常以 UTF-8 编码。但你还应该注意把 < 这样的特殊字符编码为 &lt;,并在读取时将其解码回去。这里提供了处理 wchar_t C 字符串的编码器和解码器片段,其中用到了 Win32 API。
/**
 * Appends `content` to `s` as UTF-8 text that is safe to use as XML element
 * content: '<', '>' and '&' become &lt;, &gt; and &amp;.
 *
 * Works for both 16-bit wchar_t (UTF-16; surrogate pairs are combined) and
 * 32-bit wchar_t (UTF-32).  A lone surrogate or a value beyond U+10FFFF is
 * written as '?'.
 *
 * @param content NUL-terminated wide string; null is treated as empty.
 * @param s       output string, appended to.
 */
void Encoder(const wchar_t* content, std::string& s)
{
    if (content == 0)
        return;

    for (const wchar_t* pc = content; *pc; pc++)
    {
        // A negative value (signed wchar_t) wraps to a huge number and is
        // rejected by the range check below.
        unsigned long cp = static_cast<unsigned long>(*pc);

        if (cp == '<')
            s += "&lt;";
        else if (cp == '>')
            s += "&gt;";
        else if (cp == '&')
            s += "&amp;";
        else if (cp < 0x80)
            s += static_cast<char>(cp);
        else
        {
            if (cp >= 0xD800 && cp <= 0xDBFF)
            {
                // High surrogate: valid only with a low surrogate right after
                // it.  pc[1] is at worst the terminating NUL.
                const unsigned long lo = static_cast<unsigned long>(pc[1]);
                if (lo < 0xDC00 || lo > 0xDFFF)
                {
                    s += '?';
                    continue;
                }
                cp = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
                ++pc;
            }
            else if (cp >= 0xDC00 && cp <= 0xDFFF)
            {
                s += '?'; // low surrogate without a preceding high one
                continue;
            }

            if (cp < 0x800)
            {
                s += static_cast<char>(0xC0 | (cp >> 6));
                s += static_cast<char>(0x80 | (cp & 0x3F));
            }
            else if (cp < 0x10000)
            {
                s += static_cast<char>(0xE0 | (cp >> 12));
                s += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
                s += static_cast<char>(0x80 | (cp & 0x3F));
            }
            else if (cp <= 0x10FFFF)
            {
                s += static_cast<char>(0xF0 | (cp >> 18));
                s += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
                s += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
                s += static_cast<char>(0x80 | (cp & 0x3F));
            }
            else
                s += '?';
        }
    }
}
/**
 * Converts the UTF-8 text content of `xmlmsg` (everything between the end of
 * its start tag and the next '<') to a wide string, and decodes the entities
 * &amp;, &lt; and &gt;.
 *
 * @return a NUL-terminated buffer allocated with new[] that the caller must
 *         release with delete[]; 0 when the element has no text or the
 *         conversion fails.
 */
const wchar_t* Decoder(const XmlElement& xmlmsg)
{
    const char* pc = xmlmsg.GetChild();
    if (pc == 0)
        return 0;

    // The text ends at the next '<', or at the end of the buffer when the
    // closing tag is missing.
    // NOTE(review): assumes the text is shorter than INT_MAX bytes.
    const int bytes = static_cast<int>(strcspn(pc, "<"));
    if (bytes == 0)
        return 0;

    const int size = MultiByteToWideChar(CP_UTF8, 0, pc, bytes, 0, 0);
    if (size <= 0)
        return 0;

    wchar_t* content = new wchar_t[size + 1];
    if (size != MultiByteToWideChar(CP_UTF8, 0, pc, bytes, content, size))
    {
        delete[] content; // never hand back a partially filled buffer
        return 0;
    }
    content[size] = 0;

    // Decode entities in place; the write cursor never overtakes the read
    // cursor because every entity shrinks to a single character.
    wchar_t* pdst = content;
    for (const wchar_t* psrc = content; *psrc != 0; psrc++, pdst++)
    {
        if (wcsncmp(psrc, L"&amp;", 5) == 0)
        {
            *pdst = L'&';
            psrc += 4;
        }
        else if (wcsncmp(psrc, L"&lt;", 4) == 0)
        {
            *pdst = L'<';
            psrc += 3;
        }
        else if (wcsncmp(psrc, L"&gt;", 4) == 0)
        {
            *pdst = L'>';
            psrc += 3;
        }
        else
        {
            *pdst = *psrc;
        }
    }
    *pdst = 0; // terminate the (possibly shorter) decoded string

    return content;
}
历史
- 2008 年 1 月 3 日 - 添加了 Encoder 和 Decoder 片段