/*--------------------------------------------------------------------
This source distribution is placed in the public domain by its author,
Jason Papadopoulos. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.
Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may
benefit from your work.
--jasonp@boo.net 6/3/07
--------------------------------------------------------------------*/
#include <common.h>
#include "mpqs.h"
#if BLOCK_KB == 32
#define SIEVE_BLOCK_SIZE 32768
#define LOG2_SIEVE_BLOCK_SIZE 15
#elif BLOCK_KB == 64
#define SIEVE_BLOCK_SIZE 65536
#define LOG2_SIEVE_BLOCK_SIZE 16
#else
#error "unsupported sieve block size"
#endif
/*--------------------------------------------------------------------*/
static void add_to_hashtable(hashtable_t *entry,
uint32 sieve_offset,
uint32 mask,
uint32 prime_index,
uint32 logprime) {
/* add a 'sieve update' to the hashtable bin pointed
to by 'entry'. Hashing is based on the top few
bits of sieve_offset, and the range of sieve values
represented by one hash bin is given by 'mask'.
Note that after the first polynomial it's unlikely
that any hash bin will need to grow in size. */
uint32 i = entry->num_used;
hash_entry_t new_entry;
/* always prefetch; we can't rely on the processor
automatically knowing when to do this */
if (!(i % 8))
PREFETCH(entry->list + i + 8);
if (i == entry->num_alloc) {
entry->num_alloc = 2 * i;
entry->list = (hash_entry_t *)realloc(entry->list,
2 * i * sizeof(hash_entry_t));
}
new_entry.logprime = logprime;
new_entry.prime_index = prime_index;
new_entry.sieve_offset = sieve_offset & mask;
entry->list[i] = new_entry;
entry->num_used++;
}
/*--------------------------------------------------------------------*/
static void fill_sieve_block(sieve_conf_t *conf,
hashtable_t *hash_bucket) {
/* Routine for conventional sieving
Each successive call to fill_sieve_block() reuses the root
values that were computed in the previous call */
uint32 i;
uint8 *sieve_array = conf->sieve_array;
uint32 sieve_small_fb_start = conf->sieve_small_fb_start;
uint32 sieve_large_fb_start = conf->sieve_large_fb_start;
packed_fb_t *packed_fb = conf->packed_fb;
hash_entry_t *list;
uint32 num_large;
/* First update the sieve block with values corresponding
to the small primes in the factor base. Assuming the
entire block will fit in cache, each factor base prime
will update many locations within the block, and this
phase will run very fast */
TIME1(sieve_small_time)
for (i = sieve_small_fb_start; i < sieve_large_fb_start; i++) {
packed_fb_t *pfbptr = packed_fb + i;
uint32 prime = pfbptr->prime;
uint32 root1 = pfbptr->next_loc1;
uint32 root2 = pfbptr->next_loc2;
uint8 logprime = pfbptr->logprime;
/* We assume that the starting positions for each
root (i.e. 'next_loc1' and 'next_loc2') are
stored in ascending order; thus, if next_loc2
is within the sieve block then next_loc1
automatically is too. This allows both root
updates to be collapsed into a single loop,
effectively unrolling by two.
Note that no exception is made for sieve roots
that are invalid; they skip the loop below
since INVALID_ROOT is a large number */
while (root2 < SIEVE_BLOCK_SIZE) {
sieve_array[root1] -= logprime;
sieve_array[root2] -= logprime;
root1 += prime;
root2 += prime;
}
/* once next_loc2 exceeds the sieve block size,
next_loc1 may still have one update left. In
that case, perform the update and switch
next_loc1 and next_loc2, since next_loc1 is now
the larger root */
if (root1 < SIEVE_BLOCK_SIZE) {
uint32 tmp = root2;
sieve_array[root1] -= logprime;
root2 = root1 + prime;
root1 = tmp;
}
pfbptr->next_loc1 = root1 - SIEVE_BLOCK_SIZE;
pfbptr->next_loc2 = root2 - SIEVE_BLOCK_SIZE;
}
TIME2(sieve_small_time)
/* Now update the sieve block with the rest of the
factor base. All of the offsets to update have
previously been collected into a hash table.
While the updates are to random offsets in the
sieve block, the whole block was cached by the
previous loop, and memory access to the hashtable
entry is predictable and can be prefetched */
TIME1(sieve_large_time)
list = hash_bucket->list;
num_large = hash_bucket->num_used;
for (i = 0; i < (num_large & (uint32)(~7)); i += 8) {
#ifdef MANUAL_PREFETCH
PREFETCH(list + i + 16);
#endif
sieve_array[list[i ].sieve_offset] -= list[i ].logprime;
sieve_array[list[i+1].sieve_offset] -= list[i+1].logprime;
sieve_array[list[i+2].sieve_offset] -= list[i+2].logprime;
sieve_array[list[i+3].sieve_offset] -= list[i+3].logprime;
sieve_array[list[i+4].sieve_offset] -= list[i+4].logprime;
sieve_array[list[i+5].sieve_offset] -= list[i+5].logprime;
sieve_array[list[i+6].sieve_offset] -= list[i+6].logprime;
sieve_array[list[i+7].sieve_offset] -= list[i+7].logprime;
}
for (; i < num_large; i++) {
sieve_array[list[i].sieve_offset] -= list[i].logprime;
}
TIME2(sieve_large_time)
}
/*--------------------------------------------------------------------*/
#define PACKED_SIEVE_MASK ((uint64)0x80808080 << 32 | 0x80808080)
#if defined(__GNUC__) && defined(__i386__)
#if defined(HAS_SSE2)
#define SCAN_SSE2
#elif defined(HAS_AMD_MMX) || defined(HAS_SSE)
#define SCAN_MMX
#endif
#endif
static uint32 scan_sieve_block(sieve_conf_t *conf,
mp_t *a, mp_t *b, mp_t *c,
int32 block_start,
uint32 cutoff1,
uint32 poly_index,
hashtable_t *hashtable) {
uint32 i, j;
uint8 *sieve_array = conf->sieve_array;
uint64 *packed_sieve = (uint64 *)conf->sieve_array;
uint32 relations_found = 0;
TIME1(tf_plus_scan_time)
for (i = 0; i < SIEVE_BLOCK_SIZE / 8; i += 8) {
/* test 32 sieve values at a time for large
logarithms. The sieve code actually
decrements by the logarithms of primes, so
if a value is smooth its top bit will be
set. We can easily test for this condition
Note that for really big jobs the cutoff
itself will exceed 128, so that the test
below will have many false alarms */
#if defined(SCAN_SSE2)
uint32 compare_result = 0;
asm volatile (
"movdqa (%1), %%xmm0 \n\t"
"orpd 16(%1), %%xmm0 \n\t"
"orpd 32(%1), %%xmm0 \n\t"
"orpd 48(%1), %%xmm0 \n\t"
"pmovmskb %%xmm0, %0 \n\t"
: "=r"(compare_result)
: "r"(packed_sieve + i), "0"(compare_result)
: "%xmm0");
if (compare_result == 0)
continue;
#elif defined(SCAN_MMX)
uint32 compare_result = 0;
asm volatile (
"movq (%1), %%mm0 \n\t"
"por 8(%1), %%mm0 \n\t"
"por 16(%1), %%mm0 \n\t"
"por 24(%1), %%mm0 \n\t"
"por 32(%1), %%mm0 \n\t"
"por 40(%1), %%mm0 \n\t"
"por 48(%1), %%mm0 \n\t"
"por 56(%1), %%mm0 \n\t"
"pmovmskb %%mm0, %0 \n\t"
: "=r"(compare_result)
: "r"(packed_sieve + i), "0"(compare_result)
: "%mm0");
if (compare_result == 0)
continue;
#else
if (((packed_sieve[i] | packed_sieve[i+1] |
packed_sieve[i+2] | packed_sieve[i+3] |
packed_sieve[i+4] | packed_sieve[i+5] |
packed_sieve[i+6] | packed_sieve[i+7]) &
PACKED_SIEVE_MASK) == (uint64)(0))
continue;
#endif
/* one or more of the 32 values is probably
smooth. Test one at a time and attempt
to factor them */
for (j = 0; j < 64; j++) {
uint32 bits = sieve_array[8 * i + j];
if (bits > cutoff1) {
TIME1(tf_total_time)
relations_found +=
check_sieve_val(conf,
block_start + (int32)(8*i+j),
cutoff1 + 257 - bits,
a, b, c, poly_index,
hashtable);
TIME2(tf_total_time)
}
}
}
TIME2(tf_plus_scan_time)
#if defined(SCAN_MMX)
asm volatile("emms");
#endif
return relations_found;
}
/*--------------------------------------------------------------------*/
uint32 ROUTINE_NAME(sieve_conf_t *conf,
uint32 poly_start, uint32 num_poly) {
/* The core of the sieving code. This routine performs
the sieving and relation collection for 'num_poly'
polynomials at a time.
Because this is the self-initializing quadratic sieve,
groups of polynomials are related. There are 'total_poly'
polynomials that share the same 'a' value; we number them
from 0 to total_poly-1. poly_start refers to the index
of the initial polynomial of the group of num_poly.
Ordinarily, you initialize one polynomial, sieve with it,
initialize the next polynomial, sieve with it, etc. However,
we want the sieve interval to be small, and when it's very
small but the factor base is large we will spend a lot
of time initializing polynomials and only a little time
sieving. Self-initialization improves that a lot, but when
the factor base exceeds the cache size, even for previous
versions of this program, polynomial initialization would
take 50% of the total sieving stage runtime when the sieve
interval is small. We could make the interval larger, but
that means smooth relations occur less often; sieve time
does not scale well but poly initialization time does.
This routine takes a novel approach: for small factor base
primes it sieves one polynomial at a time. However, for
all the other factor base primes (90+% of the factor base,
in a big factorization) it does not sieve at all, but fills
a hashtable with the locations of which prime updates which
sieve offset. There is a separate hashtable for each polynomial,
and a separate hashtable for positive and negative sieve
values. Because these hashtables do not interact with each other,
we can do all the sieving for a few factor base primes in one
polynomial, then switch to the next polynomial and do the
sieving for the same primes. For these few primes all of the
sieving for one poly happens at once, and so the roots of these
primes can then be updated to become roots of the next polynomial.
The upshot is that we deal with a block of primes at a time,
and for that block do all the sieving for num_poly polynomials;
the roots for each prime get updated whenever sieving for a
polynomial has finished. This in effect performs a tiling of
the sieve interval, the factor base, and the auxiliary information
needed to switch polynomials. When the tile size of these three
quantities is chosen small enough, everything fits in cache
for the duration of the sieving.
Note that putting all the updates into a hashtable uses up
much more space than sieving with it; however, when filling up
a hashtable linearly only one cache line per hash bin is active
at any given time, and if the number of bins is small enough
the processor can fit all the updates into store buffers as
they happen. Contrast that with filling up a conventional
sieve interval, where all the cache lines in the interval
are active.
All of the hashtables are filled in first; then we proceed one
polynomial at a time. For each polynomial we sieve with the
small factor base primes, add in the sieve values from the
appropriate set of hash bins (the bin size matches the sieve
block size for this implementation), then do trial factoring
on presumed smooth sieve values. There's one more perk to doing
things this way: since a hash bin can be made to contain the
actual primes as well as the log values of those primes,
trial factoring can be sped up an order of magnitude by reusing
the hash bins to detect primes that divide sieve values.
In short: black magic, awful mess, really fast. */
uint32 i, j, k, m, n;
uint32 relations_found = 0;
uint32 num_sieve_blocks = conf->num_sieve_blocks;
uint32 sieve_size;
uint32 fb_size = conf->fb_size;
fb_t *factor_base = conf->factor_base;
uint32 num_factors = conf->num_poly_factors;
uint32 total_poly = 1 << (num_factors - 1);
hashtable_t *hashtable;
uint32 poly_index;
uint32 *poly_b_array;
/* all hash bins start off empty */
for (i = 0; i < num_poly * num_sieve_blocks; i++) {
conf->hashtable[i].num_used = 0;
}
poly_b_array = conf->poly_b_array;
sieve_size = num_sieve_blocks * SIEVE_BLOCK_SIZE;
i = conf->sieve_large_fb_start;
/* for each block of factor base primes */
while (i < fb_size) {
uint32 fb_block = MIN(conf->fb_block, fb_size - i);
fb_t *fb_start = factor_base + i;
hashtable = conf->hashtable;
poly_index = poly_start;
/* for polynomial j */
for (j = 0; j < num_poly; j++) {
uint32 next_action;
uint32 *poly_b_start;
TIME1(bucket_time)
for (k = 0; k < fb_block; k++) {
fb_t *fbptr = fb_start + k;
uint32 prime = fbptr->prime;
uint32 root1, root2;
uint8 logprime = fbptr->logprime;
#ifdef MANUAL_PREFETCH
if (k % 4 == 0)
PREFETCH(fbptr + 4);
#endif
/* grab the two roots of each prime in
the block, and do all the sieving. For each
sieve value, use the upper bits of the
value to determine the hash bin of
polynomial j to update */
root1 = fbptr->root1;
while (root1 < sieve_size) {
add_to_hashtable(hashtable +
(root1>>LOG2_SIEVE_BLOCK_SIZE),
root1, SIEVE_BLOCK_SIZE - 1,
i + k, logprime);
root1 += prime;
}
root2 = fbptr->root2;
while (root2 < sieve_size) {
add_to_hashtable(hashtable +
(root2>>LOG2_SIEVE_BLOCK_SIZE),
root2, SIEVE_BLOCK_SIZE - 1,
i + k, logprime);
root2 += prime;
}
}
TIME2(bucket_time)
/* this block has finished sieving for polynomial
j; now select the new roots for polynomial j+1.
Note that there is no polynomial with index
'total_poly', so no update is needed after the
last polynomial has updated its hashtable */
if (++poly_index == total_poly)
continue;
k = poly_index;
next_action = conf->next_poly_action[k];
poly_b_start = poly_b_array;
n = next_action & 0x7f;
/* Update the two roots for each prime in the block.
Do not sort them in ascending order; the loop
above does not require it, and the unpredictable
branch required to do so takes a large fraction
of the total poly initialization time! */
TIME1(next_poly_large_time)
if (next_action & 0x80) {
for (k = 0; k < fb_block;
k++, poly_b_start += num_factors) {
fb_t *fbptr = fb_start + k;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
m = poly_b_start[n];
root1 = mp_modadd_1(root1, m, prime);
root2 = mp_modadd_1(root2, m, prime);
fbptr->root1 = root1;
fbptr->root2 = root2;
}
}
else {
for (k = 0; k < fb_block;
k++, poly_b_start += num_factors) {
fb_t *fbptr = fb_start + k;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
m = poly_b_start[n];
root1 = mp_modsub_1(root1, m, prime);
root2 = mp_modsub_1(root2, m, prime);
fbptr->root1 = root1;
fbptr->root2 = root2;
}
}
TIME2(next_poly_large_time)
/* polynomial j is finished; point to the
hash bins for polynomial j+1 */
hashtable += num_sieve_blocks;
}
/* current block has updated all hashtables;
go on to the next block */
i += fb_block;
poly_b_array += fb_block * num_factors;
}
/* All of the hashtables have been filled; now proceed
one polynomial at a time and sieve with the small
factor base primes */
hashtable = conf->hashtable;
poly_index = poly_start;
/* for each polynomial */
for (i = 0; i < num_poly; i++) {
mp_t *a, *b;
mp_t c, tmp;
uint32 cutoff1;
uint32 block;
packed_fb_t *packed_fb = conf->packed_fb;
uint32 next_action;
/* make temporary copies of all of the roots; sieving
cannot take place all at once like the large FB primes
got to do, so we must update the temporary roots as
we go from sieve block to sieve block */
TIME1(plus_init_time)
for (j = MIN_FB_OFFSET + 1;
j < conf->sieve_large_fb_start; j++) {
fb_t *fbptr = factor_base + j;
packed_fb_t *pfbptr = packed_fb + j;
pfbptr->prime = fbptr->prime;
pfbptr->logprime = fbptr->logprime;
pfbptr->next_loc1 = fbptr->root1;
pfbptr->next_loc2 = fbptr->root2;
}
/* Form 'c', equal to (b * b - n) / a (exact
division). Having this quantity available
makes trial factoring a little faster.
Also double the 'b' value, since nobody
else uses it and the trial factoring code
prefers 2*b */
a = &conf->curr_a;
b = conf->curr_b + poly_start + i;
mp_mul(b, b, &tmp);
mp_sub(conf->n, &tmp, &tmp);
mp_div(&tmp, a, &c);
mp_add(b, b, b);
/* choose the first sieving cutoff */
cutoff1 = mp_bits(&c);
if (cutoff1 >= conf->cutoff1)
cutoff1 -= conf->cutoff1;
else
cutoff1 = 0;
TIME2(plus_init_time)
/* for each sieve block */
for (block = 0; block < num_sieve_blocks; block++) {
int32 block_start = -(sieve_size / 2) +
(block * SIEVE_BLOCK_SIZE);
/* initialize the sieve array */
memset(conf->sieve_array, (int8)(cutoff1 - 1),
(size_t)SIEVE_BLOCK_SIZE);
/* do the sieving and add in the values from the
hash bin corresponding to this sieve block */
fill_sieve_block(conf, hashtable + block);
/* trial factor sieve values whose logarithm is
sufficiently large */
relations_found += scan_sieve_block(conf, a, b, &c,
block_start, cutoff1,
poly_index,
hashtable + block);
}
/* update the roots for each of the small factor
base primes to correspond to the next polynomial */
if (++poly_index == total_poly)
break;
k = poly_index;
next_action = conf->next_poly_action[k];
poly_b_array = conf->poly_b_small[next_action & 0x7f];
k = conf->sieve_large_fb_start;
TIME1(next_poly_small_time)
if (next_action & 0x80) {
for (j = MIN_FB_OFFSET + 1; j < k; j++) {
fb_t *fbptr = factor_base + j;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
if (root1 == INVALID_ROOT)
continue;
m = poly_b_array[j];
root1 = mp_modadd_1(root1, m, prime);
root2 = mp_modadd_1(root2, m, prime);
if (root2 > root1) {
fbptr->root1 = root1;
fbptr->root2 = root2;
}
else {
fbptr->root1 = root2;
fbptr->root2 = root1;
}
}
}
else {
for (j = MIN_FB_OFFSET + 1; j < k; j++) {
fb_t *fbptr = factor_base + j;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
if (root1 == INVALID_ROOT)
continue;
m = poly_b_array[j];
root1 = mp_modsub_1(root1, m, prime);
root2 = mp_modsub_1(root2, m, prime);
if (root2 > root1) {
fbptr->root1 = root1;
fbptr->root2 = root2;
}
else {
fbptr->root1 = root2;
fbptr->root2 = root1;
}
}
}
TIME2(next_poly_small_time)
hashtable += num_sieve_blocks;
}
return relations_found;
}
syntax highlighted by Code2HTML, v. 0.9.1