C# 将 DataTable 分割成多个 DataTable






4.64/5 (4投票s)
这个辅助类的目的是将一个大型数据表分割成多个小型数据表。
背景
几天前,我需要处理大型数据表,这些数据表可能包含超过 48000000 条记录。我需要将它们分割成固定大小的独立数据表,并根据需要进行处理。我发现了几种分割方法,今天我将分享所有可能的效率更高的方案。
DataTable 分割器
我们即将创建几个分割器类,将在我们的辅助类中使用。
基础分割器
using System;
using System.CodeDom;
using System.Collections.Generic;
using System.Data;
public abstract class DataTableSplitter
{
protected DataTable SourceTable;
protected int BatchSize;
protected string TableNamePrefix;
protected DataTableSplitter(DataTable table, int size, string namePrefix = "")
{
SourceTable = table;
BatchSize = size;
TableNamePrefix = namePrefix;
}
protected DataTable NewSplitTable(int tableCount)
{
DataTable dataTable = SourceTable.Clone();
dataTable.TableName = !string.IsNullOrEmpty(TableNamePrefix)
? TableNamePrefix + "_" + tableCount
: String.Empty;
dataTable.Clear();
return dataTable;
}
public abstract IEnumerable<DataTable> Split();
}
常规分割器(不修改源数据)
此分割器将创建新的数据表,而不会对源数据表进行任何修改,源数据表将保持不变。
public class DataTableRegularSplitter : DataTableSplitter
{
public DataTableRegularSplitter
(DataTable table, int size, string namePrefix = "") : base(table, size, namePrefix)
{
}
public override IEnumerable<DataTable> Split()
{
int tableCount = 0;
int rowCount = 0;
DataTable dataTable = null;
foreach (DataRow row in SourceTable.Rows)
{
/*create new table*/
if (dataTable == null)
{
tableCount++;
dataTable = NewSplitTable(tableCount);
}
/*row add to new table/remove from source*/
DataRow newRow = dataTable.NewRow();
newRow.ItemArray = row.ItemArray;
dataTable.Rows.Add(newRow);
rowCount++;
/*return batch sized new table*/
if (rowCount == BatchSize)
{
yield return dataTable;
rowCount = 0;
dataTable = null;
}
}
/*return new table with remaining rows*/
if (dataTable != null && dataTable.Rows.Count > 0)
{
yield return dataTable;
}
}
}
移除分割器(使用 Remove 功能从源数据中移除行)
此分割器将创建新的数据表,并包含对源 datatable
的修改。在处理结束时,源数据表中将没有行。Remove()
方法基本上会立即移除该行。
public class DataTableRemoveSplitter : DataTableSplitter
{
public DataTableRemoveSplitter
(DataTable table, int size, string namePrefix = "") : base(table, size, namePrefix)
{
}
public override IEnumerable<DataTable> Split()
{
int tableCount = 0;
int rowCount = 0;
DataTable dataTable = null;
while (SourceTable.Rows.Count > 0)
{
DataRow row = SourceTable.Rows[0]; /*first row*/
/*create new table*/
if (dataTable == null)
{
tableCount++;
dataTable = NewSplitTable(tableCount);
}
/*row add to new table/remove from source*/
DataRow newRow = dataTable.NewRow();
newRow.ItemArray = row.ItemArray;
dataTable.Rows.Add(newRow);
SourceTable.Rows.Remove(row);
rowCount++;
/*return batch sized new table*/
if (rowCount == BatchSize)
{
yield return dataTable;
rowCount = 0;
dataTable = null;
}
}
/*return new table with remaining rows*/
if (dataTable != null && dataTable.Rows.Count > 0)
{
yield return dataTable;
}
}
}
移除分割器(使用 Delete 功能从源数据中移除行)
此分割器将执行与上述相同操作。通过调用 Delete()
方法,行状态将变为已删除,并保留在 datatable
中,直到我们调用 AcceptChanges()
。
public class DataTableDeleteSplitter : DataTableSplitter
{
public DataTableDeleteSplitter(DataTable table,
int size, string namePrefix = "") : base(table, size, namePrefix)
{
}
public override IEnumerable<DataTable> Split()
{
SourceTable.AcceptChanges(); /*important*/
int tableCount = 0;
int rowCount = 0;
DataTable dataTable = null;
for (int sourceTableIndex = 0; SourceTable.Rows.Count > 0; sourceTableIndex++)
{
DataRow row = SourceTable.Rows[sourceTableIndex];
/*create new table*/
if (dataTable == null)
{
tableCount++;
dataTable = NewSplitTable(tableCount);
}
/*row add to new table/remove from source*/
DataRow newRow = dataTable.NewRow();
newRow.ItemArray = row.ItemArray;
dataTable.Rows.Add(newRow);
row.Delete();
rowCount++;
/*return batch sized new table*/
if (rowCount == BatchSize)
{
SourceTable.AcceptChanges();
yield return dataTable;
rowCount = 0;
dataTable = null;
sourceTableIndex = -1; /*loop need to start again from 0*/
}
}
/*return new table with remaining rows*/
if (dataTable != null && dataTable.Rows.Count > 0)
{
SourceTable.AcceptChanges();
yield return dataTable;
}
}
}
移除 vs 删除
Remove()
方法会立即移除dataRow
,并且无法恢复。Delete()
方法等待Acceptchanges()
调用。可以使用Rejectchanges()
进行恢复。Remove()
比Delete()
更快。Delete()
更适合与数据库关联的数据表。
分割辅助类
在这里,我们将使用我们的分割器类。我们还可以包括有关如何从源数据表中移除行的选项
using System;
using System.Collections.Generic;
using System.Data;
public class DataTableHelper
{
public enum RowRemoveType
{
Remove,
Delete
}
public static IEnumerable<DataTable>
Split(DataTable sourceTable, int batchSize, string tableNamePrefix = "")
{
return new DataTableRegularSplitter(sourceTable, batchSize, tableNamePrefix).Split();
}
public static IEnumerable<DataTable> SplitAndRemoveFromSource
(DataTable sourceTable, int batchSize, string tableNamePrefix = "",
RowRemoveType rowRemoveType = RowRemoveType.Remove)
{
switch (rowRemoveType)
{
case RowRemoveType.Remove:
return new DataTableRemoveSplitter
(sourceTable, batchSize, tableNamePrefix).Split();
case RowRemoveType.Delete:
return new DataTableDeleteSplitter
(sourceTable, batchSize, tableNamePrefix).Split();
default:
throw new Exception("Unknown remove type");
}
}
}
使用分割辅助类
常规分割(不修改源数据)
/*no change in source table*/
DataTable table = Data.Table(3);
List<DataTable> tables = DataTableHelper.Split(table, 1).ToList();
移除分割(使用 Remove 功能从源数据中移除行)
/*change in source table: using remove*/
DataTable table1 = Data.Table(3);
List<DataTable> tables1 = DataTableHelper.SplitAndRemoveFromSource(table1, 1).ToList();
移除分割(使用 Delete 功能从源数据中移除行)
/*change in source table: using delete, good after db operations*/
DataTable table2 = Data.Table(3);
List<DataTable> tables2 = DataTableHelper.SplitAndRemoveFromSource
(table2, 1, "", DataTableHelper.RowRemoveType.Delete).ToList();
通用分割器
此通用分割器可以分割任何 IEnumerable
源对象,包括任何 datatable
。此分割器不会对源对象进行任何修改。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
public static class EnumerableExtension
{
/*https://stackoverflow.com/a/34665432/2948523*/
public static IEnumerable<IEnumerable<T>> ToBatch<T>
(this IEnumerable<T> enumerable, int batchSize)
{
int itemsReturned = 0;
var list = enumerable.ToList();
int count = list.Count;
while (itemsReturned < count)
{
int currentChunkSize = Math.Min(batchSize, count - itemsReturned);
yield return list.GetRange(itemsReturned, currentChunkSize);
itemsReturned += currentChunkSize;
}
}
}
使用通用分割器
在这里,我们使用通用分割器来分割 datatable
和 list
。
/*generic split*/
/*table*/
List<DataTable> tables3 = Data.Table(3).AsEnumerable()
.ToBatch(1)
.Select(x => x.CopyToDataTable()).ToList();
/*list*/
List<List<Student>> lists1 = Data.List(3).ToBatch(1).Select(x => x.ToList()).ToList();
值得了解!!!
DataTable
可以存储的最大行数是 16,777,216yield return
是你的好帮手- DataTable 最大限制:https://stackoverflow.com/questions/11123074/what-is-the-maximum-size-of-a-datatable-i-can-load-into-memory-in-net
- 移除 vs 删除
- 通用分割:https://stackoverflow.com/a/34665432/2948523
请在附件中找到 Visual Studio 2017 解决方案。如果我遗漏了任何内容或您有任何建议,请告诉我。
历史
- 2019年5月19日:初始版本