我做了一些优化。
/// <summary>
/// Converts a short string of ASCII decimal digits to a <see cref="uint"/> with SSE intrinsics.
/// </summary>
/// <param name="text">
/// The digit string. Nothing here validates it: no null check, no length check and no
/// check that every character is a digit. The weights below cover eight digit positions.
/// NOTE(review): behaviour for more than eight characters is not handled explicitly — confirm callers.
/// </param>
/// <returns>The value accumulated from the eight 16-bit lanes after shifting and weighting.</returns>
/// <remarks>
/// Sixteen bytes are always loaded starting at the first character, so for inputs shorter
/// than eight characters the load extends beyond the characters of <paramref name="text"/>;
/// those extra lanes are shifted out before use.
/// NOTE(review): the byte-shift count is a runtime value rather than a literal; this may keep
/// the JIT from emitting the shift as a single instruction — verify if performance matters.
/// </remarks>
public unsafe uint ParseUint2(string text)
{
    fixed (char* chars = text)
    {
        // Unaligned 128-bit load: eight UTF-16 code units starting at the first character.
        Vector128<ushort> lanes = Sse3.LoadDquVector128((ushort*)chars);

        // Two bytes per missing character: moves the last character into the top lane
        // and fills the vacated low lanes with zeros.
        int padBytes = (8 - text.Length) << 1;
        lanes = Sse2.ShiftLeftLogical128BitLane(lanes, (byte)padBytes);

        // '0' - saturating subtract turns digit characters into 0..9 and leaves zero lanes at zero.
        Vector128<ushort> asciiZero = Vector128.Create('0');
        lanes = Sse2.SubtractSaturate(lanes, asciiZero);

        // Combine neighbouring lanes into two-digit numbers: 10 * even lane + odd lane.
        Vector128<short> pairWeights = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
        Vector128<int> pairs = Sse2.MultiplyAddAdjacent(lanes.AsInt16(), pairWeights);

        // Scale each two-digit group by its decimal position.
        Vector128<int> groupWeights = Vector128.Create(1000000, 10000, 100, 1);
        Vector128<int> scaled = Sse41.MultiplyLow(pairs, groupWeights);

        // Two horizontal adds leave the sum of all four lanes in lane 0.
        Vector128<int> sums = Ssse3.HorizontalAdd(scaled, scaled);
        sums = Ssse3.HorizontalAdd(sums, sums);
        return (uint)sums.ToScalar();
    }
}
减少了类型转换的次数,并且最终的求和改用 `vphaddd` 完成。结果,它快了约 10%。
但是……`imm8` 必须是编译时常量。这意味着您不能在 `imm8` 参数所在的位置使用变量,否则 JIT 编译器不会为该操作生成内在(intrinsic)指令,而是会在此处生成对外部方法的 `call`(也许有一些解决方法)。感谢 @PeterCordes 的帮助。
下面这个“怪物”虽然提升并不明显,但无论 `text.Length` 是多少,它都比上面那个版本要快。
/// <summary>
/// Converts a short string of ASCII decimal digits to a <see cref="uint"/> with SSE intrinsics,
/// selecting the byte shift through a switch so that every shift count is a literal.
/// </summary>
/// <param name="text">
/// The digit string. Nothing here validates it: no null check, no length check and no
/// check that every character is a digit. Lengths 0 through 7 get a dedicated shift;
/// any other length uses the loaded lanes unshifted.
/// </param>
/// <returns>The value accumulated from the eight 16-bit lanes after shifting and weighting.</returns>
/// <remarks>
/// Sixteen bytes are always loaded starting at the first character, so for inputs shorter
/// than eight characters the load extends beyond the characters of <paramref name="text"/>;
/// those extra lanes are shifted out (or replaced by zero for an empty string) before use.
/// </remarks>
public unsafe uint ParseUint3(string text)
{
    fixed (char* chars = text)
    {
        // Unaligned 128-bit load: eight UTF-16 code units starting at the first character.
        Vector128<ushort> loaded = Sse3.LoadDquVector128((ushort*)chars);

        // Shift by two bytes per missing character so the last character ends up in the
        // top lane. Each arm passes a literal count to the shift intrinsic.
        Vector128<ushort> lanes = text.Length switch
        {
            0 => Vector128<ushort>.Zero,
            1 => Sse2.ShiftLeftLogical128BitLane(loaded, 14),
            2 => Sse2.ShiftLeftLogical128BitLane(loaded, 12),
            3 => Sse2.ShiftLeftLogical128BitLane(loaded, 10),
            4 => Sse2.ShiftLeftLogical128BitLane(loaded, 8),
            5 => Sse2.ShiftLeftLogical128BitLane(loaded, 6),
            6 => Sse2.ShiftLeftLogical128BitLane(loaded, 4),
            7 => Sse2.ShiftLeftLogical128BitLane(loaded, 2),
            _ => loaded,
        };

        // '0' - saturating subtract turns digit characters into 0..9 and leaves zero lanes at zero.
        Vector128<ushort> asciiZero = Vector128.Create('0');
        lanes = Sse2.SubtractSaturate(lanes, asciiZero);

        // Combine neighbouring lanes into two-digit numbers: 10 * even lane + odd lane.
        Vector128<short> pairWeights = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
        Vector128<int> pairs = Sse2.MultiplyAddAdjacent(lanes.AsInt16(), pairWeights);

        // Scale each two-digit group by its decimal position.
        Vector128<int> groupWeights = Vector128.Create(1000000, 10000, 100, 1);
        Vector128<int> scaled = Sse41.MultiplyLow(pairs, groupWeights);

        // Two horizontal adds leave the sum of all four lanes in lane 0.
        Vector128<int> sums = Ssse3.HorizontalAdd(scaled, scaled);
        sums = Ssse3.HorizontalAdd(sums, sums);
        return (uint)sums.ToScalar();
    }
}
同样,@PeterCordes 不允许我编写慢速代码。以下版本做了 2 项改进。第一项:加载字符串时直接从已经偏移好的地址读取(相当于加载的数据已经移位),然后减去一个按相同偏移量读取的掩码。这样就避免了以变量作为移位计数调用 `ShiftLeftLogical128BitLane` 时的缓慢回退路径。
第二项改进是将 `vphaddd` 替换为 `pshufd` + `paddd`。
/// <summary>
/// Converts a short string of ASCII decimal digits to a <see cref="uint"/> with SSE intrinsics,
/// loading the characters already right-aligned instead of shifting them afterwards.
/// </summary>
/// <param name="text">
/// The digit string. Nothing here validates it: no null check, no length check and no
/// check that every character is a digit. The subtrahend table holds sixteen characters,
/// so for more than eight characters the table load would extend past the literal's characters.
/// </param>
/// <returns>The value accumulated from the eight 16-bit lanes after weighting.</returns>
/// <remarks>
/// The 16-byte character load ends at the last character of <paramref name="text"/>, so it
/// starts up to 14 bytes before the character data (16 bytes for an empty string).
/// This might or might not make it possible to read from an unmapped page and fault, beware.
/// </remarks>
public unsafe uint ParseUint4(string text)
{
    // Eight 0xFFFF entries followed by eight '0' characters. Reading eight entries at offset
    // `Length` yields 0xFFFF for the lanes in front of the digits and '0' for the digit lanes.
    const string subtrahendTable = "\xffff\xffff\xffff\xffff\xffff\xffff\xffff\xffff00000000";

    fixed (char* chars = text)
    fixed (char* table = subtrahendTable)
    {
        int length = text.Length;

        // Window of eight code units whose last lane is the last character of the string.
        Vector128<ushort> lanes = Sse3.LoadDquVector128((ushort*)chars + (length - 8));
        Vector128<ushort> subtrahend = Sse3.LoadDquVector128((ushort*)table + length);

        // Saturating subtract: 0xFFFF clears the lanes in front of the digits,
        // '0' turns digit characters into 0..9.
        lanes = Sse2.SubtractSaturate(lanes, subtrahend);

        // Combine neighbouring lanes into two-digit numbers: 10 * even lane + odd lane.
        Vector128<short> pairWeights = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
        Vector128<int> pairs = Sse2.MultiplyAddAdjacent(lanes.AsInt16(), pairWeights);

        // Scale each two-digit group by its decimal position.
        Vector128<int> groupWeights = Vector128.Create(1000000, 10000, 100, 1);
        Vector128<int> scaled = Sse41.MultiplyLow(pairs, groupWeights);

        // 0x1b reverses the four lanes (3 2 1 0); adding gives lane0 = s0 + s3, lane1 = s1 + s2.
        Vector128<int> reversed = Sse2.Shuffle(scaled, 0x1b);
        Vector128<int> halves = Sse2.Add(reversed, scaled);

        // 0x41 picks source lanes 1, 0, 0, 1; only lane 0 matters here, and it receives
        // lane 1, so the add leaves the sum of all four lanes in lane 0.
        Vector128<int> swapped = Sse2.Shuffle(halves, 0x41);
        Vector128<int> total = Sse2.Add(swapped, halves);
        return (uint)total.ToScalar();
    }
}
比初始解决方案快约两倍 (o_O),至少在我的 Haswell i7 上是如此。