MeanFileCompare - 文本文件比较






4.50/5 (7投票s)
我参与了 Code Lean and Mean 文件比较竞赛。
引言
这是我参与 Code Lean and Mean 竞赛的作品。竞赛的目的是创建一个精简高效的文件差异程序,能够显示 Grid1.html 和 Grid2.html 的差异,尽可能快地运行并使用最少的内存。这篇文章显然是针对 C++ 竞赛的,这个类别似乎比 C# 类别的参赛作品少。
我能够在小于 3ms 的时间内完成这项工作,同时使用高达 2K 的文件缓冲区大小,从 1K 的缓冲区大小开始。
Using the Code
我使用字节对字节的比较来进行差异分析,因为我想显示行号以及文本,所以我需要能够计算文件中的换行符数量。这是文件比较的主循环
void Compare::CompareFiles(LPCWSTR file1, LPCWSTR file2)
{
//
// Load the files to compare
//
if (!_compareFile1.LoadFile(file1))
{
wprintf(L"Could not open:%s", file1);
return;
}
if (!_compareFile2.LoadFile(file2))
{
wprintf(L"Could not open:%s", file2);
return;
}
//
// Keep comparing until both files have been processed
//
while (true)
{
//
// Compare until we have a mismatch
//
if (_compareFile1._buffer[_compareFile1._currentBufferIndex] ==
_compareFile2._buffer[_compareFile2._currentBufferIndex])
{
//
// Check for a new line so we can increment the line number and
// save the position for the beginning of the line(just in case the line
// is different).
//
if (_compareFile1._buffer[_compareFile1._currentBufferIndex] == '\n')
{
_compareFile1._currentLineNumber++;
_compareFile2._currentLineNumber++;
_compareFile1._startOfLine = _compareFile1._currentBufferIndex + 1;
_compareFile2._startOfLine = _compareFile2._currentBufferIndex + 1;
}
_compareFile1._currentBufferIndex++;
_compareFile2._currentBufferIndex++;
//
// Check to see if we have reached the end of the buffers
// and read more if necessary. Both buffers need to be
// checked independently because they can get out of sync
// with differences
//
if (_compareFile1._currentBufferIndex == _compareFile1._totalBytesInBuffer)
{
_compareFile1.ReadMoreData();
}
if (_compareFile2._currentBufferIndex == _compareFile2._totalBytesInBuffer)
{
_compareFile2.ReadMoreData();
}
//
// No need to check if we have processed all bytes until
// we know we've read both complete files
//
if (_compareFile1._fileCompletelyRead || _compareFile2._fileCompletelyRead)
{
if (_compareFile1.EndOfFile() || _compareFile2.EndOfFile())
{
//
// Do one of the files have extra data?
//
if (!_compareFile1.EndOfFile() || !_compareFile2.EndOfFile())
{
FindDifferences();
}
//
// We're finished
//
break;
}
}
}
else
{
_compareFile1.ReadMoreData();
_compareFile2.ReadMoreData();
FindDifferences();
if (_compareFile1.EndOfFile() && _compareFile2.EndOfFile())
{
//
// We're done
//
break;
}
}
}
_output.Display();
}
一旦我发现差异,我就开始查找行首和行长,然后开始比较每个文件的行
void Compare::FindDifferences()
{
_compareFile1._totalDiffPositions = 0;
_compareFile2._totalDiffPositions = 0;
long nextLineStartingPosition1 = _compareFile1._startOfLine;
long nextLineStartingPosition2 = _compareFile2._startOfLine;
_compareFile1.SetNextDiffPosition(nextLineStartingPosition1);
_compareFile2.SetNextDiffPosition(nextLineStartingPosition2);
int count = 0;
while (true)
{
if (nextLineStartingPosition1 != -1)
{
nextLineStartingPosition1 = _compareFile1.FindNextLine(nextLineStartingPosition1);
}
if (nextLineStartingPosition2 != -1)
{
nextLineStartingPosition2 = _compareFile2.FindNextLine(nextLineStartingPosition2);
}
if (nextLineStartingPosition1 != -1)
{
_compareFile1.SetNextDiffPosition(nextLineStartingPosition1);
}
if (nextLineStartingPosition2 != -1)
{
_compareFile2.SetNextDiffPosition(nextLineStartingPosition2);
}
//
// No more matches for the rest of the file
//
if (nextLineStartingPosition1 == -1 && nextLineStartingPosition2 == -1)
{
_output.AddText("\t----------------\n");
_compareFile1.ReportDifferences(&_output, _compareFile1._diffPositions,
_compareFile1._totalDiffPositions - 1, '+');
_compareFile2.ReportDifferences(&_output, _compareFile2._diffPositions,
_compareFile2._totalDiffPositions-1, '-');
_compareFile1._currentBufferIndex = _compareFile1._totalBytesInBuffer;
_compareFile2._currentBufferIndex = _compareFile2._totalBytesInBuffer;
return;
}
if (count > 2)
{
if (nextLineStartingPosition1 != -1)
{
int index = CompareLinesWithOtherFileLines(&_compareFile1, &_compareFile2);
if (index != -1)
{
//
// Output the results (to _output)
//
_output.AddText("\t----------------\n");
_compareFile1.ReportDifferences(&_output, _compareFile1._diffPositions,
count - 1, '-');
_compareFile2.ReportDifferences(&_output, _compareFile2._diffPositions,
index, '+');
_compareFile1._startOfLine =
_compareFile1._currentBufferIndex =
nextLineStartingPosition1;
_compareFile2._startOfLine =
_compareFile2._currentBufferIndex =
_compareFile2._diffPositions[index + 2];
break;
}
}
if (nextLineStartingPosition1 != -1)
{
int index = CompareLinesWithOtherFileLines(&_compareFile2,
&_compareFile1);
if (index != -1)
{
//
// Output the results
//
_output.AddText("\t----------------\n");
_compareFile1.ReportDifferences(&_output,
_compareFile1._diffPositions, index, '-');
_compareFile2.ReportDifferences(&_output,
_compareFile2._diffPositions, count - 1, '+');
_compareFile1._startOfLine = _compareFile1._currentBufferIndex =
_compareFile1._diffPositions[index + 2];
_compareFile2._startOfLine = _compareFile2._currentBufferIndex =
nextLineStartingPosition2;
break;
}
}
}
count++;
}
_compareFile1.RestoreNewlines();
_compareFile2.RestoreNewlines();
}
精简高效优化
小缓冲区大小
我为每个文件创建了一个仅为 1K 的缓冲区,用于读取和比较。我增加这个缓冲区大小的唯一情况是我有一条非常长的行超过了长度,或者我正在比较的当前差异超过了缓冲区大小。
#define MEMORY_BLOCK_SIZE 1024
void CompareFile::IncreaseBuffer()
{
char* newBuffer = new char[_bufferLength + MEMORY_ALLOCATION_SIZE];
//
// Copy over the bytes from the existing buffer
//
if (_totalBytesInBuffer > 0)
{
CopyMemory(newBuffer, _buffer, _totalBytesInBuffer);
}
//
// Replace the existing buffer
//
delete [] _buffer;
_buffer = newBuffer;
_bufferLength += MEMORY_ALLOCATION_SIZE;
DWORD bytesToRead = _bufferLength - _totalBytesInBuffer;
DWORD bytesRead = 0;
::ReadFile(_fileHandle, &_buffer[_totalBytesInBuffer],
bytesToRead, &bytesRead, NULL);
if (bytesRead != bytesToRead)
{
newBuffer[bytesRead] = 0;
}
_totalBytesRead += bytesRead;
_totalBytesInBuffer += bytesRead;
if (_totalBytesRead == _totalFileSize)
{
_fileCompletelyRead = true;
}
}
///////////////////////////////////////////////////////////////////////////////
//
// Compare::ReadMoreData
//
// Description: Loads more data to the buffer
//
void CompareFile::ReadMoreData()
{
if (_startOfLine == 0 && _totalBytesInBuffer == 0)
{
_startOfLine = _bufferLength;
}
if (_startOfLine == 0)
{
//
// This line exceeds the current buffer, so we need to increase it
//
IncreaseBuffer();
return;
}
//
// calculate the number of bytes at the end of the line from the current line
// being processed, just in case there is a difference in the rest of the
// line that we are comparing.
//
int bytesToCopyToBeginning = (int)(_bufferLength - _startOfLine);
if (bytesToCopyToBeginning > 0)
{
RtlMoveMemory(_buffer, &_buffer[_startOfLine], bytesToCopyToBeginning);
}
DWORD bytesRead = 0;
::ReadFile(_fileHandle, &_buffer[bytesToCopyToBeginning],
_startOfLine, &bytesRead, NULL);
if (bytesRead != (DWORD)_startOfLine)
{
_buffer[bytesToCopyToBeginning + bytesRead] = 0;
}
_totalBytesRead += bytesRead;
_totalBytesInBuffer = (int)bytesToCopyToBeginning + bytesRead;
_startOfLine = 0;
_currentBufferIndex = bytesToCopyToBeginning;
if (_totalBytesRead == _totalFileSize)
{
_fileCompletelyRead = true;
}
}
避免使用 CRT 文件函数
CRT 函数,例如 fopen
和 fgets
,适用于一般的 C/C++。但是,如果您正在编程以实现精简高效,最好避免使用这些函数。例如,CRT 的 fopen
函数在最终调用 Win32 API CreateFile
函数之前会执行一堆其他操作
_tfopen // fopen
{
_tfsopen(file, mode, _SH_DENYNO)
{
_getstream();
_openfile(file,mode,shflag,stream)
{
// validate and convert flags to CreateFile flags
_tsopen_s(&filedes, filename, modeflag, shflag, _S_IREAD | _S_IWRITE)
{
_tsopen_helper(path, oflag, shflag, pmode, pfh, 1)
{
_tsopen_nolock( &unlock_flag, pfh, path,...)
{
// More conversion of flags to CreateFile flags
CreateFile() // <-----------------
}
}
}
}
_unlock_str(stream);
}
}
fgets
函数甚至更加臃肿。读取文件需要进行大量处理,而读取文件的一行需要进行所有处理。这只是调用 fgets
函数时发生的情况的简要总结
_TSCHAR * __cdecl _fgetts (_TSCHAR *string, int count, FILE *str) // fgets
{
_lock_str(stream);
_fgettc_nolock(stream)
{
anybuf(stream);
_getbuf(stream);
_read(_fileno(stream), stream->_base, stream->_bufsiz)
{
/* Try to get a big buffer */
_malloc_crt(_INTERNAL_BUFSIZ); // <- this value is 4096
_read_nolock(fh, buf, cnt)
{
_textmode(fh);
_lseeki64_nolock(fh, 0, FILE_CURRENT);
ReadFile( (HANDLE)_osfhnd(fh), buffer, cnt, (LPDWORD)&os_read, NULL )
}
}
}
_unlock_str(stream);
}
这些函数可以节省大量的开发时间,因为它们为您完成了所有解析行的工作,但它们对使您的程序精简高效没有任何作用。
使用 Unicode 函数
我使用 CreateFileW
来专门使用 API 的 Unicode 版本。在内部,Windows 将所有 ANSI Windows API 调用转换为 Unicode。当您可以直接调用 Unicode 版本时,为什么要调用 ANSI 版本?这是 CreateFileA
的反汇编代码,它将文件名转换为 Unicode,然后调用 Unicode CreateFileW
函数。
我可以只使用 CreateFile
并使用 /UNICODE 标志进行编译,但我这样做是为了展示如何指定 UNICODE 版本。
分析
当我认为我已经尽可能地做到了精简高效,并将总时间加快到 8 到 12 毫秒时,我在 Visual Studio 中运行了分析器。我发现 wprintf
占用了超过 96% 的工作时间,而占用时间最多的下一个函数是我的主要字节比较函数 Compare::CompareFiles()
,仅占 1.8%。
我知道输出是一个非常昂贵的函数,但我有点惊讶于它占用了多少时间。我决定我需要一种方法来降低这个值,所以我创建了一个类 OutputText
,它最初创建一个 1K 缓冲区并通过 OutputText::AddText()
函数存储输出。然后,当我完成比较时,我对 OutputText::Display()
进行最终调用,该调用仅对 printf
进行一次调用。
这使总跟踪时间降低到 42%。这仍然不是很好,但比 96% 好得多。它将 Compare::CompareFiles()
提高到 16.9%,这要好得多。我本来可以作弊稍微优化一下,在计时时不显示结果,但我认为那是不对的。;)
历史
- 2009 年 8 月 29 日 - 初始文章。
- 2009 年 9 月 1 日 - 匹配我为 C# 版本所做的代码。