最佳化
在上一篇留下的 Dapper AOT 還有什麼特別最佳化點的問題
在仔細閱讀生成程式碼和原始碼之後,終於得到了答案
個人之前一直以為 Dapper AOT 只是用迭代器去實現,所以理應差不多的實現程式碼,效能卻有極大差距,思維陷入了僵局,一度以為有什麼黑魔法
結果 Dapper AOT 沒有用迭代器去實現!!! 靠北啦,還以為迭代器有新姿勢可以最佳化了
不再使用迭代器
List<BenchmarkTest.Dog> results = new();
try
{
while (reader.Read())
{
results.Add(ReadOne(reader, readOnlyTokens));
}
return results;
}
當然,這樣就只能要求使用者必須使用 AsList 方法,因為 ToList 會把 list 再複製一次,導致負最佳化,
像這樣
connection.Query<Dog>("select * from dog").AsList();
// AsList 實現
/// <summary>
/// Returns <paramref name="source"/> as a <see cref="List{T}"/> without copying when it
/// already is one; otherwise materializes it with <see cref="Enumerable.ToList{TSource}"/>.
/// </summary>
/// <typeparam name="T">Element type of the sequence.</typeparam>
/// <param name="source">The sequence to expose as a list; may be null.</param>
/// <returns>
/// Null when <paramref name="source"/> is null, the same instance when it is a
/// <see cref="List{T}"/>, or a newly created list for any other sequence.
/// </returns>
public static List<T> AsList<T>(this IEnumerable<T>? source)
{
    if (source is null)
    {
        // Null is passed through unchanged rather than replaced by an empty list.
        return null!;
    }

    if (source is List<T> existing)
    {
        // Already a list: hand back the same instance so no copy is made.
        return existing;
    }

    return Enumerable.ToList(source);
}
使用 span
再沒有了迭代器方法限制, span 就可以放飛自我,隨意使用了
public static BenchmarkTest.Dog ReadOne(this IDataReader reader, ref ReadOnlySpan<int> ss)
{
var d = new BenchmarkTest.Dog();
for (int j = 0; j < ss.Length; j++)
{
使用 ArrayPool 減少記憶體佔用
/// <summary>
/// Returns a span of exactly <c>FieldCount</c> ints backed by an array leased from
/// <see cref="ArrayPool{T}.Shared"/>, renting a larger array when the current lease is
/// missing or too small.
/// </summary>
/// <returns>A span over the first <c>FieldCount</c> elements of the leased array.</returns>
public Span<int> GetTokens()
{
    FieldCount = Reader!.FieldCount;

    // An existing lease that is too small is handed back before a new one is rented.
    var leaseTooSmall = Tokens is not null && Tokens.Length < FieldCount;
    if (leaseTooSmall)
    {
        ArrayPool<int>.Shared.Return(Tokens!);
    }

    if (Tokens is null || leaseTooSmall)
    {
        // Rent may return an array longer than requested; the span below trims it.
        Tokens = ArrayPool<int>.Shared.Rent(FieldCount);
    }

    ref int first = ref MemoryMarshal.GetArrayDataReference(Tokens);
    return MemoryMarshal.CreateSpan(ref first, FieldCount);
}
資料小時使用棧分配
var s = reader.FieldCount <= 64 ? MemoryMarshal.CreateSpan(ref MemoryMarshal.GetReference(stackalloc int[reader.FieldCount]), reader.FieldCount) : state.GetTokens();
提前生成部分 hashcode 進行比較
因為比較現在也並不耗時了, 所以 快取也沒有必要了, 也一併移除
/// <summary>
/// Writes one mapping token per column of <paramref name="reader"/> into
/// <paramref name="s"/>, chosen from the normalized hash of the column name and the
/// column's field type.
/// </summary>
/// <param name="reader">Reader whose column names and field types are inspected.</param>
/// <param name="s">
/// Destination buffer; must hold at least <c>reader.FieldCount</c> elements. Every one of
/// the first <c>reader.FieldCount</c> slots is overwritten.
/// </param>
/// <remarks>
/// Odd tokens (1, 3, 5) mean the column type equals the type tested for that hash
/// (int, string, float); even tokens (2, 4, 6) mean it differs. Columns whose hash
/// matches none of the cases get token 0.
/// NOTE(review): the hashes presumably correspond to the Age / Name / Weight members of
/// the mapped entity, and the consumer is assumed to ignore token 0 — confirm against the
/// generated reader code.
/// </remarks>
public static void GenerateReadTokens(this IDataReader reader, Span<int> s)
{
    // The column count does not change while tokens are generated; read it once.
    var fieldCount = reader.FieldCount;
    for (int i = 0; i < fieldCount; i++)
    {
        var name = reader.GetName(i);
        var type = reader.GetFieldType(i);
        s[i] = EntitiesGenerator.NormalizedHash(name) switch
        {
            742476188U => type == typeof(int) ? 1 : 2,
            2369371622U => type == typeof(string) ? 3 : 4,
            1352703673U => type == typeof(float) ? 5 : 6,
            // The buffer may be a pooled array that still holds tokens from a previous
            // use, so an unmatched column must be reset explicitly instead of skipped.
            _ => 0,
        };
    }
}
效能測試說明
BenchmarkDotNet
這裡特別說明一下
使用的 BenchmarkDotNet ,其本身已經考慮了 jit最佳化等等方面, 有預熱,超多次執行,
結果值也是按照統計學有考慮結果集分佈情況處理,移除變差大的值(比如少數的孤立的極大極小值), 差異不大情況,一般顯示平均值,有大差異時還會顯示 中位值
感興趣的童鞋可以去 https://github.com/dotnet/BenchmarkDotNet 瞭解
Chloe 有點棘手,為了方便 mock,所以 copy 了部分原始碼,只比較實體對映部分
測試資料
測試資料 正如之前說過, 採用 手動 mock 方式,避免 db 驅動 、db 執行、mock庫 等等 帶來的執行差異影響
class
非常簡單的類,當然不能代表所有情況,不過簡單測試夠用了
/// <summary>
/// Simple entity with one nullable int, one string and one nullable float property,
/// used as the mapping target in the benchmarks.
/// </summary>
public class Dog
{
/// <summary>Age of the dog; nullable.</summary>
public int? Age { get; set; }
/// <summary>Name of the dog.</summary>
public string Name { get; set; }
/// <summary>Weight of the dog; nullable.</summary>
public float? Weight { get; set; }
}
mock 資料
/// <summary>
/// Mock connection that hands out <see cref="TestDbCommand"/> instances configured with
/// the same <see cref="RowCount"/>.
/// </summary>
public class TestDbConnection : DbConnection
{
    /// <summary>Row count forwarded to every command created by this connection.</summary>
    public int RowCount { get; set; }

    /// <summary>Creates a new mock command carrying the current <see cref="RowCount"/>.</summary>
    /// <returns>A fresh <see cref="TestDbCommand"/>.</returns>
    public IDbCommand CreateCommand() => new TestDbCommand { RowCount = RowCount };
}
/// <summary>
/// Mock command whose reader is a <see cref="TestDbDataReader"/> configured with the
/// same <see cref="RowCount"/>.
/// </summary>
public class TestDbCommand : DbCommand
{
    /// <summary>Row count forwarded to the reader created by this command.</summary>
    public int RowCount { get; set; }

    /// <summary>Parameter collection created once per command instance.</summary>
    public IDataParameterCollection Parameters { get; } = new TestDataParameterCollection();

    /// <summary>Creates a new mock parameter.</summary>
    /// <returns>A fresh <c>TestDataParameter</c>.</returns>
    public IDbDataParameter CreateParameter() => new TestDataParameter();

    /// <summary>Creates the mock reader; <paramref name="behavior"/> is not used.</summary>
    /// <param name="behavior">Requested command behavior (ignored).</param>
    /// <returns>A <see cref="TestDbDataReader"/> carrying the current <see cref="RowCount"/>.</returns>
    protected override DbDataReader ExecuteDbDataReader(CommandBehavior behavior)
        => new TestDbDataReader { RowCount = RowCount };
}
/// <summary>
/// Mock reader exposing three fixed columns — ordinal 0 "Name" (string "XX"),
/// ordinal 1 "Age" (int 2), ordinal 2 "Weight" (float 3.3f) — and reporting
/// <see cref="RowCount"/> rows.
/// </summary>
public class TestDbDataReader : DbDataReader
{
    /// <summary>Number of times <see cref="Read"/> returns true.</summary>
    public int RowCount { get; set; }

    // Number of Read() invocations made so far.
    private int calls = 0;

    /// <summary>Boxed fixed value for the column, or null for an unknown ordinal.</summary>
    public override object this[int ordinal] => ordinal switch
    {
        0 => (object)"XX",
        1 => 2,
        2 => 3.3f,
        _ => null,
    };

    /// <summary>The mock always exposes three columns.</summary>
    public override int FieldCount => 3;

    /// <summary>CLR type of the column, or null for an unknown ordinal.</summary>
    public override Type GetFieldType(int ordinal) => ordinal switch
    {
        0 => typeof(string),
        1 => typeof(int),
        2 => typeof(float),
        _ => null,
    };

    /// <summary>Returns 3.3f for ordinal 2 and 0 for every other ordinal.</summary>
    public override float GetFloat(int ordinal) => ordinal == 2 ? 3.3f : 0f;

    /// <summary>Returns 2 for ordinal 1 and 0 for every other ordinal.</summary>
    public override int GetInt32(int ordinal) => ordinal == 1 ? 2 : 0;

    /// <summary>Column name for the ordinal, or null for an unknown ordinal.</summary>
    public override string GetName(int ordinal) => ordinal switch
    {
        0 => "Name",
        1 => "Age",
        2 => "Weight",
        _ => null,
    };

    /// <summary>Returns "XX" for ordinal 0 and null for every other ordinal.</summary>
    public override string GetString(int ordinal) => ordinal == 0 ? "XX" : null;

    /// <summary>Boxed fixed value for the column, or null for an unknown ordinal.</summary>
    public override object GetValue(int ordinal) => ordinal switch
    {
        0 => (object)"XX",
        1 => 2,
        2 => 3.3f,
        _ => null,
    };

    /// <summary>
    /// Counts the call and reports whether that count is still within <see cref="RowCount"/>.
    /// </summary>
    public override bool Read()
    {
        return ++calls <= RowCount;
    }
}
Benchmark 程式碼
/// <summary>
/// BenchmarkDotNet suite comparing several ways of mapping rows from the mock
/// <c>TestDbConnection</c> into <c>Dog</c> instances. Memory allocation is reported and
/// results are ordered fastest to slowest.
/// </summary>
[MemoryDiagnoser, Orderer(summaryOrderPolicy: SummaryOrderPolicy.FastestToSlowest), GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory), CategoriesColumn]
public class ObjectMappingTest
{
/// <summary>Number of rows the mock reader yields for each benchmark run.</summary>
[Params(1, 1000, 10000, 100000, 1000000)]
public int RowCount { get; set; }
/// <summary>
/// Baseline: hand-written mapping that reads each column with the typed getters and
/// assigns the properties directly.
/// </summary>
[Benchmark(Baseline = true)]
public void SetClass()
{
var connection = new TestDbConnection() { RowCount = RowCount };
var dogs = new List<Dog>();
try
{
connection.Open();
var cmd = connection.CreateCommand();
cmd.CommandText = "select ";
using (var reader = cmd.ExecuteReader(CommandBehavior.Default))
{
while (reader.Read())
{
// The instance is added to the list first and populated afterwards.
var dog = new Dog();
dogs.Add(dog);
dog.Name = reader.GetString(0);
dog.Age = reader.GetInt32(1);
dog.Weight = reader.GetFloat(2);
}
}
}
finally
{
connection.Close();
}
}
/// <summary>
/// Maps through <c>connection.Query&lt;Dog&gt;(...)</c> followed by <c>AsList()</c>.
/// Unlike the other benchmarks, this method does not call Open/Close itself.
/// NOTE(review): presumably the Query call is the Dapper AOT path — confirm.
/// </summary>
[Benchmark]
public void DapperAOT()
{
var connection = new TestDbConnection() { RowCount = RowCount };
var dogs = connection.Query<Dog>("select * from dog").AsList();
}
/// <summary>
/// Maps through the <c>ReadTo&lt;Dog&gt;()</c> reader extension followed by <c>AsList()</c>.
/// NOTE(review): presumably ReadTo is the source-generated mapper — confirm.
/// </summary>
[Benchmark]
public void SourceGenerator()
{
var connection = new TestDbConnection() { RowCount = RowCount };
List<Dog> dogs;
try
{
connection.Open();
var cmd = connection.CreateCommand();
cmd.CommandText = "select ";
using (var reader = cmd.ExecuteReader(CommandBehavior.Default))
{
dogs = reader.ReadTo<Dog>().AsList();
}
}
finally
{
connection.Close();
}
}
/// <summary>
/// Maps through <c>InternalSqlQuery&lt;Dog&gt;</c> built on the mock command, followed by
/// <c>AsList()</c>.
/// </summary>
[Benchmark]
public void Chloe()
{
var connection = new TestDbConnection() { RowCount = RowCount };
try
{
connection.Open();
var cmd = connection.CreateCommand();
var dogs = new InternalSqlQuery<Dog>(cmd, "select").AsList();
}
finally
{
connection.Close();
}
}
}
完整程式碼可以參考 https://github.com/fs7744/SlowestEM
測試結果
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3880/23H2/2023Update/SunValley3)
13th Gen Intel Core i9-13900KF, 1 CPU, 32 logical and 24 physical cores
.NET SDK 9.0.100-preview.6.24328.19
[Host] : .NET 8.0.7 (8.0.724.31311), X64 RyuJIT AVX2
DefaultJob : .NET 8.0.7 (8.0.724.31311), X64 RyuJIT AVX2
Method | RowCount | Mean | Error | StdDev | Ratio | RatioSD | Gen0 | Gen1 | Gen2 | Allocated | Alloc Ratio |
---|---|---|---|---|---|---|---|---|---|---|---|
DapperAOT | 1 | 294.1 ns | 5.79 ns | 7.73 ns | 0.63 | 0.02 | 0.0234 | 0.0229 | - | 440 B | 1.00 |
Dapper | 1 | 321.7 ns | 6.40 ns | 5.99 ns | 0.70 | 0.02 | 0.0405 | 0.0401 | - | 768 B | 1.75 |
SourceGenerator | 1 | 408.7 ns | 6.38 ns | 5.33 ns | 0.89 | 0.01 | 0.0234 | 0.0229 | - | 440 B | 1.00 |
SetClass | 1 | 460.6 ns | 4.82 ns | 4.51 ns | 1.00 | 0.00 | 0.0234 | 0.0229 | - | 440 B | 1.00 |
Chloe | 1 | 498.9 ns | 8.99 ns | 12.31 ns | 1.09 | 0.03 | 0.0453 | 0.0448 | - | 856 B | 1.95 |
SetClass | 1000 | 4,751.0 ns | 84.12 ns | 86.38 ns | 1.00 | 0.00 | 3.0212 | 1.2894 | - | 56912 B | 1.00 |
SourceGenerator | 1000 | 11,402.9 ns | 220.27 ns | 244.83 ns | 2.39 | 0.04 | 3.0212 | 1.2817 | - | 56912 B | 1.00 |
DapperAOT | 1000 | 11,421.3 ns | 121.00 ns | 113.18 ns | 2.41 | 0.05 | 3.0212 | 0.6409 | - | 56912 B | 1.00 |
Dapper | 1000 | 29,601.8 ns | 447.50 ns | 396.69 ns | 6.25 | 0.15 | 5.5542 | 1.0986 | - | 105192 B | 1.85 |
Chloe | 1000 | 66,872.0 ns | 150.27 ns | 133.21 ns | 14.12 | 0.27 | 2.9297 | 0.9766 | - | 57328 B | 1.01 |
SetClass | 10000 | 106,271.3 ns | 2,111.19 ns | 3,468.75 ns | 1.00 | 0.00 | 41.6260 | 41.6260 | 41.6260 | 662782 B | 1.00 |
DapperAOT | 10000 | 172,867.7 ns | 2,079.77 ns | 1,945.42 ns | 1.65 | 0.05 | 41.5039 | 41.5039 | 41.5039 | 662782 B | 1.00 |
SourceGenerator | 10000 | 181,916.1 ns | 1,653.15 ns | 1,465.47 ns | 1.74 | 0.05 | 41.5039 | 41.5039 | 41.5039 | 662782 B | 1.00 |
Dapper | 10000 | 705,883.0 ns | 8,517.90 ns | 7,550.89 ns | 6.74 | 0.19 | 82.0313 | 81.0547 | 41.0156 | 1143062 B | 1.72 |
Chloe | 10000 | 746,825.0 ns | 3,067.25 ns | 2,869.11 ns | 7.15 | 0.21 | 41.0156 | 41.0156 | 41.0156 | 663198 B | 1.00 |
SetClass | 100000 | 1,191,303.2 ns | 20,831.95 ns | 19,486.22 ns | 1.00 | 0.00 | 498.0469 | 498.0469 | 498.0469 | 6098016 B | 1.00 |
DapperAOT | 100000 | 1,794,197.8 ns | 17,937.20 ns | 16,778.47 ns | 1.51 | 0.03 | 498.0469 | 498.0469 | 498.0469 | 6098016 B | 1.00 |
SourceGenerator | 100000 | 1,973,894.9 ns | 26,063.73 ns | 24,380.03 ns | 1.66 | 0.03 | 496.0938 | 496.0938 | 496.0938 | 6098016 B | 1.00 |
Dapper | 100000 | 4,357,237.9 ns | 85,065.76 ns | 83,545.95 ns | 3.66 | 0.09 | 492.1875 | 492.1875 | 492.1875 | 10898296 B | 1.79 |
Chloe | 100000 | 7,524,264.2 ns | 91,289.38 ns | 85,392.15 ns | 6.32 | 0.14 | 492.1875 | 492.1875 | 492.1875 | 6098432 B | 1.00 |
SetClass | 1000000 | 49,990,270.7 ns | 987,172.66 ns | 1,829,787.82 ns | 1.00 | 0.00 | 3300.0000 | 3300.0000 | 1400.0000 | 56778489 B | 1.00 |
DapperAOT | 1000000 | 56,473,264.7 ns | 995,473.43 ns | 1,427,678.25 ns | 1.13 | 0.05 | 3555.5556 | 3555.5556 | 1777.7778 | 56779066 B | 1.00 |
SourceGenerator | 1000000 | 58,368,836.3 ns | 1,153,542.14 ns | 2,080,074.43 ns | 1.17 | 0.06 | 3555.5556 | 3555.5556 | 1777.7778 | 56779066 B | 1.00 |
Chloe | 1000000 | 110,416,752.0 ns | 1,562,298.26 ns | 1,461,374.77 ns | 2.19 | 0.10 | 3400.0000 | 3400.0000 | 1600.0000 | 56781312 B | 1.00 |
Dapper | 1000000 | 138,433,886.4 ns | 2,765,190.70 ns | 4,385,885.48 ns | 2.77 | 0.14 | 6250.0000 | 6250.0000 | 2000.0000 | 104779052 B | 1.85 |
SourceGenerator 基本等同 DapperAOT 了, 除了沒有使用 Interceptor, 以及各種情況細節沒有考慮之外, 兩者效能一樣
SourceGenerator 肯定是現在效能最佳化的最佳方式,畢竟可以生成程式碼檔案,上手難度其實比 emit 之類小多了