我的另一个答案相当长并且与这个答案完全不同,所以这里有别的东西。
这个程序不只是过滤掉前两个素数的倍数,或者将所有相关的素数编码到一个字节中,而是过滤掉所有适合八位的素数的倍数,特别是 2 到 211。所以不是通过 33%数字,这会将大约 10% 传递给除法运算符。
素数保存在三个 SSE 寄存器中,它们与运行计数器的模数保存在另外三个中。如果任何素数与计数器的模数为零,则计数器不能是素数。此外,如果任何模数等于 1,则 (counter+2) 不能是素数,依此类推,直到 (counter+30)。偶数会被忽略,并且像 +3、+6 和 +5 这样的偏移会被跳过。矢量处理允许一次更新 16 个字节大小的变量。
在对厨房水槽进行了全面的微优化(但没有比内联指令更具体的平台)之后,我的性能提升了 1.78 倍(我的笔记本电脑上是 24 秒对 13.4 秒)。如果使用 bignum 库(甚至是非常快的库),则优势更大。请参阅下文以获得更易读的预优化版本。
/*
* factorize_sse.cpp
* Filter out multiples of the first 47 primes while factorizing a number.
*
* Created by David Krauss on 10/14/10.
*
*/
#include <cmath>
#include <sstream>
#include <iostream>
#include <xmmintrin.h>
using namespace std;
inline void remove_factor( unsigned long &n, unsigned long factor ) __attribute__((always_inline));
void remove_factor( unsigned long &n, unsigned long factor ) {
while ( n % factor == 0 ) {
n /= factor;
cout << factor << " ";
}
}
int main( int argc, char *argv[] ) {
unsigned long n;
if ( argc != 2
|| ( istringstream( argv[1] ) >> n >> ws ).rdstate() != ios::eofbit ) {
cerr << "Usage: " << argv[0] << " <number>\n";
return 1;
}
int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211 };
for ( int *p = primes; p < primes + sizeof primes/sizeof *primes; ++ p ) {
remove_factor( n, * p );
}
//int histo[8] = {}, total = 0;
enum { bias = 15 - 128 };
__m128i const prime1 = _mm_set_epi8( 21, 21, 21, 22, 22, 26, 26, 17, 19, 23, 29, 31, 37, 41, 43, 47 ),
prime2 = _mm_set_epi8( 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127 ),
prime3 = _mm_set_epi8( 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211 ),
vbias = _mm_set1_epi8( bias ),
v3 = _mm_set1_epi8( 3+bias ), v5 = _mm_set1_epi8( 5+bias ), v6 = _mm_set1_epi8( 6+bias ), v8 = _mm_set1_epi8( 8+bias ),
v9 = _mm_set1_epi8( 9+bias ), v11 = _mm_set1_epi8( 11+bias ), v14 = _mm_set1_epi8( 14+bias ), v15 = _mm_set1_epi8( 15+bias );
__m128i mod1 = _mm_add_epi8( _mm_set_epi8( 3, 10, 17, 5, 16, 6, 19, 8, 9, 11, 14, 15, 18, 20, 21, 23 ), vbias ),
mod2 = _mm_add_epi8( _mm_set_epi8( 26, 29, 30, 33, 35, 36, 39, 41, 44, 48, 50, 51, 53, 54, 56, 63 ), vbias ),
mod3 = _mm_add_epi8( _mm_set_epi8( 65, 68, 69, 74, 75, 78, 81, 83, 86, 89, 90, 95, 96, 98, 99, 105 ), vbias );
for ( unsigned long factor = 1, limit = sqrtl( n ); factor <= limit + 30; factor += 30 ) {
if ( n == 1 ) goto done;
// up to 2^32, distribution of number candidates produced (0 up to 7) is
// 0.010841 0.0785208 0.222928 0.31905 0.246109 0.101023 0.0200728 0.00145546
unsigned candidates[8], *cand_pen = candidates;
* cand_pen = 6;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v3 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v3 ), _mm_cmpeq_epi8( mod3, v3 ) ) ) );
* cand_pen = 10;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v5 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v5 ), _mm_cmpeq_epi8( mod3, v5 ) ) ) );
* cand_pen = 12;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v6 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v6 ), _mm_cmpeq_epi8( mod3, v6 ) ) ) );
* cand_pen = 16;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v8 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v8 ), _mm_cmpeq_epi8( mod3, v8 ) ) ) );
* cand_pen = 18;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v9 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v9 ), _mm_cmpeq_epi8( mod3, v9 ) ) ) );
* cand_pen = 22;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v11 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v11 ), _mm_cmpeq_epi8( mod3, v11 ) ) ) );
* cand_pen = 28;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v14 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v14 ), _mm_cmpeq_epi8( mod3, v14 ) ) ) );
* cand_pen = 30;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v15 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v15 ), _mm_cmpeq_epi8( mod3, v15 ) ) ) );
/*++ total;
++ histo[ cand_pen - candidates ];
cout << "( ";
while ( cand_pen != candidates ) cout << factor + * -- cand_pen << " ";
cout << ")" << endl; */
mod1 = _mm_sub_epi8( mod1, _mm_set1_epi8( 15 ) ); // update residuals
__m128i mask1 = _mm_cmplt_epi8( mod1, _mm_set1_epi8( 1+bias ) );
mask1 = _mm_and_si128( mask1, prime1 ); // residual goes to zero or negative?
mod1 = _mm_add_epi8( mask1, mod1 ); // combine reset into zero or negative
mod2 = _mm_sub_epi8( mod2, _mm_set1_epi8( 15 ) );
__m128i mask2 = _mm_cmplt_epi8( mod2, _mm_set1_epi8( 1+bias ) );
mask2 = _mm_and_si128( mask2, prime2 );
mod2 = _mm_add_epi8( mask2, mod2 );
mod3 = _mm_sub_epi8( mod3, _mm_set1_epi8( 15 ) );
__m128i mask3 = _mm_cmplt_epi8( mod3, _mm_set1_epi8( 1+bias ) );
mask3 = _mm_and_si128( mask3, prime3 );
mod3 = _mm_add_epi8( mask3, mod3 );
if ( cand_pen - candidates == 0 ) continue;
remove_factor( n, factor + candidates[ 0 ] );
if ( cand_pen - candidates == 1 ) continue;
remove_factor( n, factor + candidates[ 1 ] );
if ( cand_pen - candidates == 2 ) continue;
remove_factor( n, factor + candidates[ 2 ] );
if ( cand_pen - candidates == 3 ) continue;
remove_factor( n, factor + candidates[ 3 ] );
if ( cand_pen - candidates == 4 ) continue;
remove_factor( n, factor + candidates[ 4 ] );
if ( cand_pen - candidates == 5 ) continue;
remove_factor( n, factor + candidates[ 5 ] );
if ( cand_pen - candidates == 6 ) continue;
remove_factor( n, factor + candidates[ 6 ] );
}
cout << n;
done:
/*cout << endl;
for ( int hx = 0; hx < 8; ++ hx ) cout << (float) histo[hx] / total << " ";*/
cout << endl;
}
.
dkrauss$ /usr/local/bin/g++ main.cpp -o factorize_sse -O3 --profile-use
dkrauss$ time ./factorize_sse 18446743721522234449
4294967231 4294967279
real 0m13.437s
user 0m13.393s
sys 0m0.011s
以下是上述内容的初稿。包括优化
- 使循环计数器无条件合并(避免分支)。
- 通过将循环展开 15 倍来获得 ILP,将步幅增加到 30。
- 灵感来自您的优化。
- 30 似乎是一个甜蜜点,因为它可以免费删除 2、3 和 5 的倍数。
- 5 到 15 之间的素数可能有多个倍数,因此在向量中以不同的相位放置多个副本。
- 排除因素
remove_factor
。
- 将有条件的、不可预测
remove_factor
的调用更改为非分支数组写入。
- 使用 call 完全展开最终循环,
remove_factor
并确保函数始终是内联的。
- 消除最终展开的迭代,因为候选中总是有 7 的倍数。
- 添加另一个向量,其中包含所有剩余的足够小的素数。
- 通过向计数器添加偏差来腾出更多空间,并添加另一个向量。现在只有 6 个素数可以在不增加 16 位的情况下被过滤掉,而且我的寄存器也用完了:循环需要 3 个素数向量、3 个模数向量、8 个要搜索的常数,以及每个常数递增并进行范围检查。这使得 16。
- 在此应用中增益最小(但为正),但该技术的最初目的是在另一个答案中过滤筛子的素数。敬请期待……</li>
可读版本:
/*
* factorize_sse.cpp
* Filter out multiples of the first 17 primes while factorizing a number.
*
* Created by David Krauss on 10/14/10.
*
*/
#include <cmath>
#include <sstream>
#include <iostream>
#include <xmmintrin.h>
using namespace std;
int main( int argc, char *argv[] ) {
unsigned long n;
if ( argc != 2
|| ( istringstream( argv[1] ) >> n >> ws ).rdstate() != ios::eofbit ) {
cerr << "Usage: " << argv[0] << " <number>\n";
return 1;
}
int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59 };
for ( int *p = primes; p < primes + sizeof primes/sizeof *primes; ++ p ) {
while ( n % * p == 0 ) {
n /= * p;
cout << * p << " ";
}
}
if ( n != 1 ) {
__m128i mod = _mm_set_epi8( 1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 18, 20, 21, 23, 26, 29 );
__m128i const prime = _mm_set_epi8( 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59 ),
one = _mm_set1_epi8( 1 );
for ( unsigned long factor = 1, limit = sqrtl( n ); factor < limit; ) {
factor += 2;
__m128i mask = _mm_cmpeq_epi8( mod, one ); // residual going to zero?
mod = _mm_sub_epi8( mod, one ); // update other residuals
if ( _mm_movemask_epi8( mask ) ) {
mask = _mm_and_si128( mask, prime ); // reset cycle if going to zero
mod = _mm_or_si128( mask, mod ); // combine reset into zeroed position
} else while ( n % factor == 0 ) {
n /= factor;
cout << factor << " ";
if ( n == 1 ) goto done;
}
}
cout << n;
}
done:
cout << endl;
}