/*-------------------------------------------------------------------- This source distribution is placed in the public domain by its author, Jason Papadopoulos. You may use it for any purpose, free of charge, without having to notify anyone. I disclaim any responsibility for any errors. Optionally, please be nice and tell me if you find this source to be useful. Again optionally, if you add to the functionality present here please consider making those additions public too, so that others may benefit from your work. --jasonp@boo.net 6/3/07 --------------------------------------------------------------------*/ #include #include "mpqs.h" #if BLOCK_KB == 32 #define SIEVE_BLOCK_SIZE 32768 #define LOG2_SIEVE_BLOCK_SIZE 15 #elif BLOCK_KB == 64 #define SIEVE_BLOCK_SIZE 65536 #define LOG2_SIEVE_BLOCK_SIZE 16 #else #error "unsupported sieve block size" #endif /*--------------------------------------------------------------------*/ static void add_to_hashtable(hashtable_t *entry, uint32 sieve_offset, uint32 mask, uint32 prime_index, uint32 logprime) { /* add a 'sieve update' to the hashtable bin pointed to by 'entry'. Hashing is based on the top few bits of sieve_offset, and the range of sieve values represented by one hash bin is given by 'mask'. Note that after the first polynomial it's unlikely that any hash bin will need to grow in size. */ uint32 i = entry->num_used; hash_entry_t new_entry; /* always prefetch; we can't rely on the processor automatically knowing when to do this */ if (!(i % 8)) PREFETCH(entry->list + i + 8); if (i == entry->num_alloc) { entry->num_alloc = 2 * i; entry->list = (hash_entry_t *)realloc(entry->list, 2 * i * sizeof(hash_entry_t)); } new_entry.logprime = logprime; new_entry.prime_index = prime_index; new_entry.sieve_offset = sieve_offset & mask; entry->list[i] = new_entry; entry->num_used++; } /*--------------------------------------------------------------------*/ static void fill_sieve_block(sieve_conf_t *conf, hashtable_t *hash_bucket) { /* Routine for conventional sieving Each successive call to fill_sieve_block() reuses the root values that were computed in the previous call */ uint32 i; uint8 *sieve_array = conf->sieve_array; uint32 sieve_small_fb_start = conf->sieve_small_fb_start; uint32 sieve_large_fb_start = conf->sieve_large_fb_start; packed_fb_t *packed_fb = conf->packed_fb; hash_entry_t *list; uint32 num_large; /* First update the sieve block with values corresponding to the small primes in the factor base. Assuming the entire block will fit in cache, each factor base prime will update many locations within the block, and this phase will run very fast */ TIME1(sieve_small_time) for (i = sieve_small_fb_start; i < sieve_large_fb_start; i++) { packed_fb_t *pfbptr = packed_fb + i; uint32 prime = pfbptr->prime; uint32 root1 = pfbptr->next_loc1; uint32 root2 = pfbptr->next_loc2; uint8 logprime = pfbptr->logprime; /* We assume that the starting positions for each root (i.e. 'next_loc1' and 'next_loc2') are stored in ascending order; thus, if next_loc2 is within the sieve block then next_loc1 automatically is too. This allows both root updates to be collapsed into a single loop, effectively unrolling by two. Note that no exception is made for sieve roots that are invalid; they skip the loop below since INVALID_ROOT is a large number */ while (root2 < SIEVE_BLOCK_SIZE) { sieve_array[root1] -= logprime; sieve_array[root2] -= logprime; root1 += prime; root2 += prime; } /* once next_loc2 exceeds the sieve block size, next_loc1 may still have one update left. In that case, perform the update and switch next_loc1 and next_loc2, since next_loc1 is now the larger root */ if (root1 < SIEVE_BLOCK_SIZE) { uint32 tmp = root2; sieve_array[root1] -= logprime; root2 = root1 + prime; root1 = tmp; } pfbptr->next_loc1 = root1 - SIEVE_BLOCK_SIZE; pfbptr->next_loc2 = root2 - SIEVE_BLOCK_SIZE; } TIME2(sieve_small_time) /* Now update the sieve block with the rest of the factor base. All of the offsets to update have previously been collected into a hash table. While the updates are to random offsets in the sieve block, the whole block was cached by the previous loop, and memory access to the hashtable entry is predictable and can be prefetched */ TIME1(sieve_large_time) list = hash_bucket->list; num_large = hash_bucket->num_used; for (i = 0; i < (num_large & (uint32)(~7)); i += 8) { #ifdef MANUAL_PREFETCH PREFETCH(list + i + 16); #endif sieve_array[list[i ].sieve_offset] -= list[i ].logprime; sieve_array[list[i+1].sieve_offset] -= list[i+1].logprime; sieve_array[list[i+2].sieve_offset] -= list[i+2].logprime; sieve_array[list[i+3].sieve_offset] -= list[i+3].logprime; sieve_array[list[i+4].sieve_offset] -= list[i+4].logprime; sieve_array[list[i+5].sieve_offset] -= list[i+5].logprime; sieve_array[list[i+6].sieve_offset] -= list[i+6].logprime; sieve_array[list[i+7].sieve_offset] -= list[i+7].logprime; } for (; i < num_large; i++) { sieve_array[list[i].sieve_offset] -= list[i].logprime; } TIME2(sieve_large_time) } /*--------------------------------------------------------------------*/ #define PACKED_SIEVE_MASK ((uint64)0x80808080 << 32 | 0x80808080) #if defined(__GNUC__) && defined(__i386__) #if defined(HAS_SSE2) #define SCAN_SSE2 #elif defined(HAS_AMD_MMX) || defined(HAS_SSE) #define SCAN_MMX #endif #endif static uint32 scan_sieve_block(sieve_conf_t *conf, mp_t *a, mp_t *b, mp_t *c, int32 block_start, uint32 cutoff1, uint32 poly_index, hashtable_t *hashtable) { uint32 i, j; uint8 *sieve_array = conf->sieve_array; uint64 *packed_sieve = (uint64 *)conf->sieve_array; uint32 relations_found = 0; TIME1(tf_plus_scan_time) for (i = 0; i < SIEVE_BLOCK_SIZE / 8; i += 8) { /* test 32 sieve values at a time for large logarithms. The sieve code actually decrements by the logarithms of primes, so if a value is smooth its top bit will be set. We can easily test for this condition Note that for really big jobs the cutoff itself will exceed 128, so that the test below will have many false alarms */ #if defined(SCAN_SSE2) uint32 compare_result = 0; asm volatile ( "movdqa (%1), %%xmm0 \n\t" "orpd 16(%1), %%xmm0 \n\t" "orpd 32(%1), %%xmm0 \n\t" "orpd 48(%1), %%xmm0 \n\t" "pmovmskb %%xmm0, %0 \n\t" : "=r"(compare_result) : "r"(packed_sieve + i), "0"(compare_result) : "%xmm0"); if (compare_result == 0) continue; #elif defined(SCAN_MMX) uint32 compare_result = 0; asm volatile ( "movq (%1), %%mm0 \n\t" "por 8(%1), %%mm0 \n\t" "por 16(%1), %%mm0 \n\t" "por 24(%1), %%mm0 \n\t" "por 32(%1), %%mm0 \n\t" "por 40(%1), %%mm0 \n\t" "por 48(%1), %%mm0 \n\t" "por 56(%1), %%mm0 \n\t" "pmovmskb %%mm0, %0 \n\t" : "=r"(compare_result) : "r"(packed_sieve + i), "0"(compare_result) : "%mm0"); if (compare_result == 0) continue; #else if (((packed_sieve[i] | packed_sieve[i+1] | packed_sieve[i+2] | packed_sieve[i+3] | packed_sieve[i+4] | packed_sieve[i+5] | packed_sieve[i+6] | packed_sieve[i+7]) & PACKED_SIEVE_MASK) == (uint64)(0)) continue; #endif /* one or more of the 32 values is probably smooth. Test one at a time and attempt to factor them */ for (j = 0; j < 64; j++) { uint32 bits = sieve_array[8 * i + j]; if (bits > cutoff1) { TIME1(tf_total_time) relations_found += check_sieve_val(conf, block_start + (int32)(8*i+j), cutoff1 + 257 - bits, a, b, c, poly_index, hashtable); TIME2(tf_total_time) } } } TIME2(tf_plus_scan_time) #if defined(SCAN_MMX) asm volatile("emms"); #endif return relations_found; } /*--------------------------------------------------------------------*/ uint32 ROUTINE_NAME(sieve_conf_t *conf, uint32 poly_start, uint32 num_poly) { /* The core of the sieving code. This routine performs the sieving and relation collection for 'num_poly' polynomials at a time. Because this is the self-initializing quadratic sieve, groups of polynomials are related. There are 'total_poly' polynomials that share the same 'a' value; we number them from 0 to total_poly-1. poly_start refers to the index of the initial polynomial of the group of num_poly. Ordinarily, you initialize one polynomial, sieve with it, initialize the next polynomial, sieve with it, etc. However, we want the sieve interval to be small, and when it's very small but the factor base is large we will spend a lot of time initializing polynomials and only a little time sieving. Self-initialization improves that a lot, but when the factor base exceeds the cache size, even for previous versions of this program, polynomial initialization would take 50% of the total sieving stage runtime when the sieve interval is small. We could make the interval larger, but that means smooth relations occur less often; sieve time does not scale well but poly initialization time does. This routine takes a novel approach: for small factor base primes it sieves one polynomial at a time. However, for all the other factor base primes (90+% of the factor base, in a big factorization) it does not sieve at all, but fills a hashtable with the locations of which prime updates which sieve offset. There is a separate hashtable for each polynomial, and a separate hashtable for positive and negative sieve values. Because these hashtables do not interact with each other, we can do all the sieving for a few factor base primes in one polynomial, then switch to the next polynomial and do the sieving for the same primes. For these few primes all of the sieving for one poly happens at once, and so the roots of these primes can then be updated to become roots of the next polynomial. The upshot is that we deal with a block of primes at a time, and for that block do all the sieving for num_poly polynomials; the roots for each prime get updated whenever sieving for a polynomial has finished. This in effect performs a tiling of the sieve interval, the factor base, and the auxiliary information needed to switch polynomials. When the tile size of these three quantities is chosen small enough, everything fits in cache for the duration of the sieving. Note that putting all the updates into a hashtable uses up much more space than sieving with it; however, when filling up a hashtable linearly only one cache line per hash bin is active at any given time, and if the number of bins is small enough the processor can fit all the updates into store buffers as they happen. Contrast that with filling up a conventional sieve interval, where all the cache lines in the interval are active. All of the hashtables are filled in first; then we proceed one polynomial at a time. For each polynomial we sieve with the small factor base primes, add in the sieve values from the appropriate set of hash bins (the bin size matches the sieve block size for this implementation), then do trial factoring on presumed smooth sieve values. There's one more perk to doing things this way: since a hash bin can be made to contain the actual primes as well as the log values of those primes, trial factoring can be sped up an order of magnitude by reusing the hash bins to detect primes that divide sieve values. In short: black magic, awful mess, really fast. */ uint32 i, j, k, m, n; uint32 relations_found = 0; uint32 num_sieve_blocks = conf->num_sieve_blocks; uint32 sieve_size; uint32 fb_size = conf->fb_size; fb_t *factor_base = conf->factor_base; uint32 num_factors = conf->num_poly_factors; uint32 total_poly = 1 << (num_factors - 1); hashtable_t *hashtable; uint32 poly_index; uint32 *poly_b_array; /* all hash bins start off empty */ for (i = 0; i < num_poly * num_sieve_blocks; i++) { conf->hashtable[i].num_used = 0; } poly_b_array = conf->poly_b_array; sieve_size = num_sieve_blocks * SIEVE_BLOCK_SIZE; i = conf->sieve_large_fb_start; /* for each block of factor base primes */ while (i < fb_size) { uint32 fb_block = MIN(conf->fb_block, fb_size - i); fb_t *fb_start = factor_base + i; hashtable = conf->hashtable; poly_index = poly_start; /* for polynomial j */ for (j = 0; j < num_poly; j++) { uint32 next_action; uint32 *poly_b_start; TIME1(bucket_time) for (k = 0; k < fb_block; k++) { fb_t *fbptr = fb_start + k; uint32 prime = fbptr->prime; uint32 root1, root2; uint8 logprime = fbptr->logprime; #ifdef MANUAL_PREFETCH if (k % 4 == 0) PREFETCH(fbptr + 4); #endif /* grab the two roots of each prime in the block, and do all the sieving. For each sieve value, use the upper bits of the value to determine the hash bin of polynomial j to update */ root1 = fbptr->root1; while (root1 < sieve_size) { add_to_hashtable(hashtable + (root1>>LOG2_SIEVE_BLOCK_SIZE), root1, SIEVE_BLOCK_SIZE - 1, i + k, logprime); root1 += prime; } root2 = fbptr->root2; while (root2 < sieve_size) { add_to_hashtable(hashtable + (root2>>LOG2_SIEVE_BLOCK_SIZE), root2, SIEVE_BLOCK_SIZE - 1, i + k, logprime); root2 += prime; } } TIME2(bucket_time) /* this block has finished sieving for polynomial j; now select the new roots for polynomial j+1. Note that there is no polynomial with index 'total_poly', so no update is needed after the last polynomial has updated its hashtable */ if (++poly_index == total_poly) continue; k = poly_index; next_action = conf->next_poly_action[k]; poly_b_start = poly_b_array; n = next_action & 0x7f; /* Update the two roots for each prime in the block. Do not sort them in ascending order; the loop above does not require it, and the unpredictable branch required to do so takes a large fraction of the total poly initialization time! */ TIME1(next_poly_large_time) if (next_action & 0x80) { for (k = 0; k < fb_block; k++, poly_b_start += num_factors) { fb_t *fbptr = fb_start + k; uint32 prime = fbptr->prime; uint32 root1 = fbptr->root1; uint32 root2 = fbptr->root2; m = poly_b_start[n]; root1 = mp_modadd_1(root1, m, prime); root2 = mp_modadd_1(root2, m, prime); fbptr->root1 = root1; fbptr->root2 = root2; } } else { for (k = 0; k < fb_block; k++, poly_b_start += num_factors) { fb_t *fbptr = fb_start + k; uint32 prime = fbptr->prime; uint32 root1 = fbptr->root1; uint32 root2 = fbptr->root2; m = poly_b_start[n]; root1 = mp_modsub_1(root1, m, prime); root2 = mp_modsub_1(root2, m, prime); fbptr->root1 = root1; fbptr->root2 = root2; } } TIME2(next_poly_large_time) /* polynomial j is finished; point to the hash bins for polynomial j+1 */ hashtable += num_sieve_blocks; } /* current block has updated all hashtables; go on to the next block */ i += fb_block; poly_b_array += fb_block * num_factors; } /* All of the hashtables have been filled; now proceed one polynomial at a time and sieve with the small factor base primes */ hashtable = conf->hashtable; poly_index = poly_start; /* for each polynomial */ for (i = 0; i < num_poly; i++) { mp_t *a, *b; mp_t c, tmp; uint32 cutoff1; uint32 block; packed_fb_t *packed_fb = conf->packed_fb; uint32 next_action; /* make temporary copies of all of the roots; sieving cannot take place all at once like the large FB primes got to do, so we must update the temporary roots as we go from sieve block to sieve block */ TIME1(plus_init_time) for (j = MIN_FB_OFFSET + 1; j < conf->sieve_large_fb_start; j++) { fb_t *fbptr = factor_base + j; packed_fb_t *pfbptr = packed_fb + j; pfbptr->prime = fbptr->prime; pfbptr->logprime = fbptr->logprime; pfbptr->next_loc1 = fbptr->root1; pfbptr->next_loc2 = fbptr->root2; } /* Form 'c', equal to (b * b - n) / a (exact division). Having this quantity available makes trial factoring a little faster. Also double the 'b' value, since nobody else uses it and the trial factoring code prefers 2*b */ a = &conf->curr_a; b = conf->curr_b + poly_start + i; mp_mul(b, b, &tmp); mp_sub(conf->n, &tmp, &tmp); mp_div(&tmp, a, &c); mp_add(b, b, b); /* choose the first sieving cutoff */ cutoff1 = mp_bits(&c); if (cutoff1 >= conf->cutoff1) cutoff1 -= conf->cutoff1; else cutoff1 = 0; TIME2(plus_init_time) /* for each sieve block */ for (block = 0; block < num_sieve_blocks; block++) { int32 block_start = -(sieve_size / 2) + (block * SIEVE_BLOCK_SIZE); /* initialize the sieve array */ memset(conf->sieve_array, (int8)(cutoff1 - 1), (size_t)SIEVE_BLOCK_SIZE); /* do the sieving and add in the values from the hash bin corresponding to this sieve block */ fill_sieve_block(conf, hashtable + block); /* trial factor sieve values whose logarithm is sufficiently large */ relations_found += scan_sieve_block(conf, a, b, &c, block_start, cutoff1, poly_index, hashtable + block); } /* update the roots for each of the small factor base primes to correspond to the next polynomial */ if (++poly_index == total_poly) break; k = poly_index; next_action = conf->next_poly_action[k]; poly_b_array = conf->poly_b_small[next_action & 0x7f]; k = conf->sieve_large_fb_start; TIME1(next_poly_small_time) if (next_action & 0x80) { for (j = MIN_FB_OFFSET + 1; j < k; j++) { fb_t *fbptr = factor_base + j; uint32 prime = fbptr->prime; uint32 root1 = fbptr->root1; uint32 root2 = fbptr->root2; if (root1 == INVALID_ROOT) continue; m = poly_b_array[j]; root1 = mp_modadd_1(root1, m, prime); root2 = mp_modadd_1(root2, m, prime); if (root2 > root1) { fbptr->root1 = root1; fbptr->root2 = root2; } else { fbptr->root1 = root2; fbptr->root2 = root1; } } } else { for (j = MIN_FB_OFFSET + 1; j < k; j++) { fb_t *fbptr = factor_base + j; uint32 prime = fbptr->prime; uint32 root1 = fbptr->root1; uint32 root2 = fbptr->root2; if (root1 == INVALID_ROOT) continue; m = poly_b_array[j]; root1 = mp_modsub_1(root1, m, prime); root2 = mp_modsub_1(root2, m, prime); if (root2 > root1) { fbptr->root1 = root1; fbptr->root2 = root2; } else { fbptr->root1 = root2; fbptr->root2 = root1; } } } TIME2(next_poly_small_time) hashtable += num_sieve_blocks; } return relations_found; }