// // This file is part of the libecc package. // //* \file square.cc //* \brief Implementation of internal polynomial functions in order to efficiently calculate the square. // // This code has been derived from the program 'irred' version 1.50. // // Copyright (C) 2000 R. P. Brent. // // Author: Richard Brent // with assistance and contributions from // Samuli Larvala and Paul Zimmermann // // Dates: 20000528..20000817 // See http://web.comlab.ox.ac.uk/oucl/work/richard.brent/irred.html. // // // Copyright (C) 2002, by // // Carlo Wood, Run on IRC // RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt // Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61 // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // #include "sys.h" #include "debug.h" #include namespace libecc { #if ECC_BITS == 32 #if (defined(__i386) || defined(__sgi)) static bitset_digit_t tabw[256] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555, }; // square using 8-bit -> 16-bit table lookup for 32-bit machines. // (works on both big-endian and little-endian machines). // void square(bitset_digit_t const* input, bitset_digit_t* output, unsigned int digits) { for (int j = digits - 1; j >= 0; --j) { bitset_digit_t tmp = input[j]; output[2 * j] = tabw[(unsigned char)tmp] | (tabw[(unsigned char)(tmp >> 8)] << 16); output[2 * j + 1] = tabw[(unsigned char)(tmp >> 16)] | (tabw[(unsigned char)(tmp >> 24)] << 16); } } #else #ifdef __i386 // Don't use unrolled version of cpu with only a few registers. // 32-bit version of square. // // Squares polynomial a of degree r - 1 (mod 2), without table lookup, assuming 32-bit words. // (1.load + 2.store).q1 memory ops. and about 24.q1 REGISTER ops. */ // void square(bitset_digit_t const* input, bitset_digit_t* output, unsigned int digits) { register bitset_digit_t t, u, next; register bitset_digit_t const c1 = 0x0000FFFFL; // Some 32-bit constants. register bitset_digit_t const c2 = 0x00FF00FFL; register bitset_digit_t const c3 = 0x0F0F0F0FL; register bitset_digit_t const c4 = 0x33333333L; register bitset_digit_t const c5 = 0x22222222L; next = input[--digits]; for (register unsigned int j = digits; j >= 0; --j) { u = next >> 16; // High order 16 bits of input[j]. t = next & c1; // Low order 16 bits. u = (u | u << 8) & c2; t = (t | t << 8) & c2; next = input[j - 1]; // Load in advance to overcome latency. May access input[-1]. u = (u | u << 4) & c3; t = (t | t << 4) & c3; u = (u | u << 2) & c4; t = (t | t << 2) & c4; output[2 * j + 1] = u + (u & c5); output[2 * j] = t + (t & c5); } } #else // 32-bit unrolled version of square. // // Squares polynomial a of degree r-1 (mod 2), result -> a, // without table lookup, assuming 64-bit words. // (1.load + 2.store).q1 memory ops. and about 24.q1 register ops. // // May be faster than "un-unrolled" version if more than two register ops can // be performed concurrently. Faster than table lookup on word-oriented // machines (e.g. SPARC) but not on IBM PC. // // Because of a loop optimisation, it should be valid to access input[-2]. // void square(bitset_digit_t const* input, bitset_digit_t* output, unsigned int digits) { register int j, low; register bitset_digit_t t, u, v, w, next1, next2; register bitset_digit_t const c1 = 0x0000FFFFL; // Some 32-bit constants. register bitset_digit_t const c2 = 0x00FF00FFL; register bitset_digit_t const c3 = 0x0F0F0F0FL; register bitset_digit_t const c4 = 0x33333333L; register bitset_digit_t const c5 = 0x22222222L; low = 2 * (digits / 2) - 1; if ((digits & 1)) // Do an iteration of loop if q1 even. { t = input[--digits]; u = t >> 16; // High order 16 bits of input[q1]. t &= c1; // Low order 16 bits. u = (u | u << 8) & c2; t = (t | t << 8) & c2; u = (u | u << 4) & c3; t = (t | t << 4) & c3; u = (u | u << 2) & c4; t = (t | t << 2) & c4; output[2 * digits + 1] = u + (u & c5); output[2 * digits] = t + (t & c5); } next1 = input[low]; next2 = input[low - 1]; for (j = low; j >= 0; j -= 2) // Here low and j are odd. { u = next1 >> 16; // High order 16 bits of input[j]. w = next2 >> 16; // Ditto of input[j - 1]. t = next1 & c1; // Low order 16 bits of input[j]. v = next2 & c1; // Ditto of input[j - 1]. u = (u | u << 8) & c2; t = (t | t << 8) & c2; next1 = input[j - 2]; // Perform loads in advance to overcome latency. w = (w | w << 8) & c2; v = (v | v << 8) & c2; next2 = input[j - 3]; // Ditto (may access input[-2]). u = (u | u << 4) & c3; t = (t | t << 4) & c3; w = (w | w << 4) & c3; v = (v | v << 4) & c3; u = (u | u << 2) & c4; t = (t | t << 2) & c4; w = (w | w << 2) & c4; v = (v | v << 2) & c4; output[2 * j + 1] = u + (u & c5); output[2 * j] = t + (t & c5); output[2 * j - 1] = w + (w & c5); output[2 * j - 2] = v + (v & c5); } } #endif // !__i386 #endif #elif ECC_BITS == 64 #ifdef ECC_USEUNUNROLLED // not defined // 64-bit version of square. // // Squares polynomial input of degree r - 1 (mod 2). // (1.load + 2.store).q1 memory ops. and about 30.q1 REGISTER ops. // NB: Because of an optimisation in the loop, it should be valid to read input[-2]. */ // void square(bitset_digit_t const* input, bitset_digit_t* output, unsigned int digits) { register bitset_digit_t const c0 = 0x00000000FFFFFFFFL; // Some 64-bit constants. register bitset_digit_t const c1 = 0x0000FFFF0000FFFFL; register bitset_digit_t const c2 = 0x00FF00FF00FF00FFL; register bitset_digit_t const c3 = 0x0F0F0F0F0F0F0F0FL; register bitset_digit_t const c4 = 0x3333333333333333L; register bitset_digit_t const c5 = 0x2222222222222222L; register bitset_digit_t t, u, next; next = input[--digits]; for (register int j = digits; j >= 0; --j) { u = next >> 32; // High order 32 bits of input[j]. t = next & c0; // Low order 32 bits. u = (u | u << 16) & c1; // Operations on t and u can be t = (t | t << 16) & c1; // done in parallel. next = input[j - 1]; // Do load early to overcome memory latency. May access input[-1]! u = (u | u << 8) & c2; t = (t | t << 8) & c2; u = (u | u << 4) & c3; t = (t | t << 4) & c3; u = (u | u << 2) & c4; t = (t | t << 2) & c4; output[2 * j + 1] = u + (u & c5); output[2 * j] = t + (t & c5); } } #else // 64-bit unrolled version of square. // // Squares polynomial a of degree r-1 (mod 2), result -> a, // without table lookup, assuming 64-bit words. // (1.load + 2.store).q1 memory ops. and about 30.q1 REGISTER ops. // // May be faster than "un-unrolled" version because more than two // register ops can be performed concurrently on machines such as // SGI R10000 and DEC alpha. // // Because of a loop optimisation, it should be valid to access input[-2]. // // squarem() is the IBM PC assembler version of this using MMX instruction. // void square(bitset_digit_t const* input, bitset_digit_t* output, unsigned int digits) { register int j, low; register bitset_digit_t t, u, v, w, next1, next2; register bitset_digit_t c0, c1, c2, c3, c4, c5; c0 = 0x00000000FFFFFFFFL; /* Some 64-bit constants */ c1 = 0x0000FFFF0000FFFFL; c2 = 0x00FF00FF00FF00FFL; c3 = 0x0F0F0F0F0F0F0F0FL; c4 = 0x3333333333333333L; c5 = 0x2222222222222222L; low = 2 * (digits / 2) - 1; if ((digits & 1)) /* Do an iteration of loop if q1 even */ { t = input[--digits]; u = t >> 32; /* High order 32 bits of input[q1] */ t &= c0; /* Low order 32 bits */ u = (u | u << 16) & c1; /* Operations on t and u can be */ t = (t | t << 16) & c1; /* done in parallel */ u = (u | u << 8) & c2; t = (t | t << 8) & c2; u = (u | u << 4) & c3; t = (t | t << 4) & c3; u = (u | u << 2) & c4; t = (t | t << 2) & c4; input[2 * digits + 1] = u + (u & c5); input[2 * digits] = t + (t & c5); } next1 = input[low]; next2 = input[low - 1]; for (j = low; j >= 0; j -= 2) // Here low and j are odd. { u = next1 >> 32; // High order 32 bits of input[j]. w = next2 >> 32; // Ditto of input[j - 1]. t = next1 & c0; // Low order 32 bits of input[j]. v = next2 & c0; // Ditto of input[j - 1]. u = (u | u << 16) & c1; /* Operations on t,u,v,w */ t = (t | t << 16) & c1; /* can be done in parallel */ w = (w | w << 16) & c1; v = (v | v << 16) & c1; next1 = input[j - 2]; // Perform loads in advance // to overcome latency. u = (u | u << 8) & c2; t = (t | t << 8) & c2; w = (w | w << 8) & c2; v = (v | v << 8) & c2; next2 = input[j - 3]; // Ditto (may access input[-2]). u = (u | u << 4) & c3; t = (t | t << 4) & c3; w = (w | w << 4) & c3; v = (v | v << 4) & c3; u = (u | u << 2) & c4; t = (t | t << 2) & c4; w = (w | w << 2) & c4; v = (v | v << 2) & c4; output[2 * j + 1] = u + (u & c5); output[2 * j] = t + (t & c5); output[2 * j - 1] = w + (w & c5); output[2 * j - 2] = v + (v & c5); } } #endif #else #error No support for non- 32 or 64 bits machines. If you have a 128 bits machine then please contact the author. #endif // ECC_BITS } // namespace libecc