/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */

/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *          Vincent Rijmen
 *
 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "rijndael-alg-ref.h"
#include <cspdebugging.h>

#define SC	((BC - 4) >> 1)

#include "boxes-ref.h"

static const word8 shifts[3][4][2] = {
	{ { 0, 0 },
	  { 1, 3 },
	  { 2, 2 },
	  { 3, 1 } },

	{ { 0, 0 },
	  { 1, 5 },
	  { 2, 4 },
	  { 3, 3 } },

	{ { 0, 0 },
	  { 1, 7 },
	  { 3, 5 },
	  { 4, 4 } }
};

/* 128 bit key/word shift table in bits */
static const word8 shifts128[4][2] = {
	{  0,  0 },
	{  8, 24 },
	{ 16, 16 },
	{ 24,  8 }
};

#if !AES_MUL_BY_LOOKUP
/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent. Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256-byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because
 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */
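/*
 * Illustrative sketch (not in the original source): one way the six const
 * multiply tables in boxes-ref.h could be regenerated offline, by tabulating
 * the log/antilog mul() below for each fixed first operand. genMulTable() is
 * a hypothetical helper, shown only to document where the tables come from.
 */
#if 0
static void genMulTable(word8 c, word8 table[256])
{
	unsigned i;

	/* table[i] = c * i in GF(2^8), computed via the log/antilog mul() */
	for(i = 0; i < 256; i++) {
		table[i] = mul(c, (word8)i);
	}
}
/* e.g. genMulTable(0x02, t); ...repeated for 0x03, 0x09, 0x0b, 0x0d, 0x0e */
#endif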
/*
 * The mod 255 op in mul is really expensive...
 *
 * We know that b <= (254 * 2), so there are only two cases. Either return b,
 * or return b-255.
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
static inline word8 mod255(word32 b)
{
	if(b >= 255) {
		b -= 255;
	}
	return b;
}

word8 mul(word8 a, word8 b)
{
	/* multiply two elements of GF(2^m)
	 * needed for MixColumn and InvMixColumn
	 */
	if (a && b)
		return Alogtable[mod255(Logtable[a] + Logtable[b])];
	else
		return 0;
}
#endif	/* !AES_MUL_BY_LOOKUP */

void KeyAddition(
	word8 a[4][MAXBC],
	word8 rk[4][MAXBC],
	word8 BC)
{
	/* Exor corresponding text input and round key input bytes */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			a[i][j] ^= rk[i][j];
}

void ShiftRow(
	word8 a[4][MAXBC],
	word8 d,
	word8 BC)
{
	/* Row 0 remains unchanged
	 * The other three rows are shifted a variable amount
	 */
	word8 tmp[MAXBC];
	int i, j;

	for(i = 1; i < 4; i++) {
		for(j = 0; j < BC; j++)
			tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
		for(j = 0; j < BC; j++)
			a[i][j] = tmp[j];
	}
}

void Substitution(
	word8 a[4][MAXBC],
	const word8 box[256],
	word8 BC)
{
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			a[i][j] = box[a[i][j]];
}

void MixColumn(word8 a[4][MAXBC], word8 BC)
{
	/* Mix the four bytes of every column in a linear way */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++)
			a[i][j] = b[i][j];
	}
}

void InvMixColumn(word8 a[4][MAXBC], word8 BC)
{
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++)
			a[i][j] = b[i][j];
	}
}
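/*
 * Illustrative check (not in the original source): InvMixColumn() undoes
 * MixColumn() on the same state. checkMixColumnInverse() is a hypothetical
 * helper, written here for a 4-column (128-bit) state.
 */
#if 0
static int checkMixColumnInverse(void)
{
	word8 orig[4][MAXBC], a[4][MAXBC];
	int i, j;
	const word8 BC = 4;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			orig[i][j] = a[i][j] = (word8)rand();
	MixColumn(a, BC);
	InvMixColumn(a, BC);
	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			if(a[i][j] != orig[i][j])
				return 0;	/* mismatch */
	return 1;			/* round trip OK */
}
#endif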
int rijndaelKeySched(
	word8 k[4][MAXKC],
	int keyBits,
	int blockBits,
	word8 W[MAXROUNDS+1][4][MAXBC])
{
	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	word8 tk[4][MAXKC];

	switch (keyBits) {
		case 128: KC = 4; break;
		case 192: KC = 6; break;
		case 256: KC = 8; break;
		default : return (-1);
	}
	switch (blockBits) {
		case 128: BC = 4; break;
		case 192: BC = 6; break;
		case 256: BC = 8; break;
		default : return (-2);
	}
	switch (keyBits >= blockBits ? keyBits : blockBits) {
		case 128: ROUNDS = 10; break;
		case 192: ROUNDS = 12; break;
		case 256: ROUNDS = 14; break;
		default : return (-3); /* this cannot happen */
	}

	for(j = 0; j < KC; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++)
			W[t / BC][i][t % BC] = tk[i][j];

	while (t < (ROUNDS+1)*BC) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++)
			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
		tk[0][0] ^= rcon[rconpointer++];

		if (KC != 8) {
			for(j = 1; j < KC; j++)
				for(i = 0; i < 4; i++)
					tk[i][j] ^= tk[i][j-1];
		}
		else {
			for(j = 1; j < KC/2; j++)
				for(i = 0; i < 4; i++)
					tk[i][j] ^= tk[i][j-1];
			for(i = 0; i < 4; i++)
				tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
			for(j = KC/2 + 1; j < KC; j++)
				for(i = 0; i < 4; i++)
					tk[i][j] ^= tk[i][j-1];
		}

		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++)
				W[t / BC][i][t % BC] = tk[i][j];
	}

	return 0;
}

int rijndaelEncrypt(
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block, general case. */
	int r, BC, ROUNDS;

	switch (blockBits) {
		case 128: BC = 4; break;
		case 192: BC = 6; break;
		case 256: BC = 8; break;
		default : return (-2);
	}
	switch (keyBits >= blockBits ? keyBits : blockBits) {
		case 128: ROUNDS = 10; break;
		case 192: ROUNDS = 12; break;
		case 256: ROUNDS = 14; break;
		default : return (-3); /* this cannot happen */
	}

	/* begin with a key addition */
	KeyAddition(a,rk[0],BC);

	/* ROUNDS-1 ordinary rounds */
	for(r = 1; r < ROUNDS; r++) {
		Substitution(a,S,BC);
		ShiftRow(a,0,BC);
		MixColumn(a,BC);
		KeyAddition(a,rk[r],BC);
	}

	/* Last round is special: there is no MixColumn */
	Substitution(a,S,BC);
	ShiftRow(a,0,BC);
	KeyAddition(a,rk[ROUNDS],BC);

	return 0;
}

int rijndaelDecrypt(
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
		case 128: BC = 4; break;
		case 192: BC = 6; break;
		case 256: BC = 8; break;
		default : return (-2);
	}
	switch (keyBits >= blockBits ? keyBits : blockBits) {
		case 128: ROUNDS = 10; break;
		case 192: ROUNDS = 12; break;
		case 256: ROUNDS = 14; break;
		default : return (-3); /* this cannot happen */
	}

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition(a,rk[ROUNDS],BC);
	Substitution(a,Si,BC);
	ShiftRow(a,1,BC);

	/* ROUNDS-1 ordinary rounds */
	for(r = ROUNDS-1; r > 0; r--) {
		KeyAddition(a,rk[r],BC);
		InvMixColumn(a,BC);
		Substitution(a,Si,BC);
		ShiftRow(a,1,BC);
	}

	/* End with the extra key addition */
	KeyAddition(a,rk[0],BC);

	return 0;
}
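/*
 * Illustrative usage (not in the original source): a one-block encrypt/decrypt
 * round trip through the general-case routines with a 128-bit key and block.
 * exampleRoundTrip() is a hypothetical helper; note the column-major state
 * layout, where byte n of the key/block lands in element [n % 4][n / 4].
 */
#if 0
static void exampleRoundTrip(void)
{
	word8 key[4][MAXKC] = {{0}};		/* all-zero 128-bit key */
	word8 block[4][MAXBC] = {{0}};		/* all-zero 128-bit plaintext */
	word8 sched[MAXROUNDS+1][4][MAXBC];

	if(rijndaelKeySched(key, 128, 128, sched) == 0) {
		rijndaelEncrypt(block, 128, 128, sched);	/* block is now ciphertext */
		rijndaelDecrypt(block, 128, 128, sched);	/* back to all-zero plaintext */
	}
}
#endif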
#if !GLADMAN_AES_128_ENABLE

/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */

static inline void KeyAddition128(
	word8 a[4][BC_128_OPT],
	word8 rk[4][MAXBC])
{
	/* these casts are endian-independent */
	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
}

static void Substitution128(
	word8 a[4][BC_128_OPT],
	const word8 box[256])
{
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	/* still to be optimized - larger S boxes? */
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC_128_OPT; j++) {
			a[i][j] = box[a[i][j]];
		}
	}
}

#if defined(__ppc__) && defined(__GNUC__)

static inline void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word32 lword = *((word32 *)word);
	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
	*((word32 *)word) = lword;
}

#else

/*
 * Insert your machine/compiler dependent code here,
 * or just use this, which works on any platform and compiler
 * which supports the __attribute__((aligned(4))) directive.
 */
static void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
	unsigned bytes = rotCount / 8;

	tmp[0] = word[bytes     & (BC_128_OPT-1)];
	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
	*((word32 *)word) = *((word32 *)tmp);
}

#endif

static inline void ShiftRow128(
	word8 a[4][BC_128_OPT],
	word8 d)
{
	/* Row 0 remains unchanged
	 * The other three rows are shifted (actually rotated) a variable amount
	 */
	int i;

	for(i = 1; i < 4; i++) {
		rotateWordLeft(a[i], shifts128[i][d]);
	}
}
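/*
 * Illustrative check (not in the original source): rotating a row word left
 * by 8 bits moves each byte up one position, e.g. {1,2,3,4} -> {2,3,4,1},
 * matching the 8-bit encrypt entry for row 1 in shifts128. checkRotate() is
 * a hypothetical helper; the PPC asm path gives the same byte result because
 * PPC is big-endian, while the generic path is byte-order independent.
 */
#if 0
static int checkRotate(void)
{
	word8 w[BC_128_OPT] __attribute__((aligned(4))) = { 1, 2, 3, 4 };

	rotateWordLeft(w, 8);
	return (w[0] == 2) && (w[1] == 3) && (w[2] == 4) && (w[3] == 1);
}
#endif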
/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
static void MixColumn128(word8 a[4][BC_128_OPT])
{
	/* Mix the four bytes of every column in a linear way */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}

static void InvMixColumn128(word8 a[4][BC_128_OPT])
{
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}

int rijndaelKeySched128(
	word8 k[4][KC_128_OPT],
	word8 W[MAXROUNDS+1][4][MAXBC])
{
	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int i, j, t, rconpointer = 0;
	word8 tk[4][KC_128_OPT];
	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;

	for(j = 0; j < KC_128_OPT; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
		for(i = 0; i < 4; i++) {
			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
		}
	}

	while (t < numSchedRows) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++) {
			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
		}
		tk[0][0] ^= rcon[rconpointer++];

		for(j = 1; j < KC_128_OPT; j++) {
			for(i = 0; i < 4; i++) {
				tk[i][j] ^= tk[i][j-1];
			}
		}

		/* copy values into round key array */
		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
			for(i = 0; i < 4; i++) {
				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
			}
		}
	}

	return 0;
}

int rijndaelEncrypt128(
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block. */
	int r;

	/* begin with a key addition */
	KeyAddition128(a,rk[0]);

	/* ROUNDS-1 ordinary rounds */
	for(r = 1; r < ROUNDS_128_OPT; r++) {
		Substitution128(a,S);
		ShiftRow128(a,0);
		MixColumn128(a);
		KeyAddition128(a,rk[r]);
	}

	/* Last round is special: there is no MixColumn */
	Substitution128(a,S);
	ShiftRow128(a,0);
	KeyAddition128(a,rk[ROUNDS_128_OPT]);

	return 0;
}

int rijndaelDecrypt128(
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r;

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition128(a,rk[ROUNDS_128_OPT]);
	Substitution128(a,Si);
	ShiftRow128(a,1);

	/* ROUNDS-1 ordinary rounds */
	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
		KeyAddition128(a,rk[r]);
		InvMixColumn128(a);
		Substitution128(a,Si);
		ShiftRow128(a,1);
	}

	/* End with the extra key addition */
	KeyAddition128(a,rk[0]);

	return 0;
}

#endif	/* !GLADMAN_AES_128_ENABLE */
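/*
 * Illustrative usage (not in the original source): the 128-bit optimized path.
 * The state and key arrays must be 4-byte aligned, per the comment above
 * KeyAddition128(). example128() is a hypothetical helper and would belong
 * inside the !GLADMAN_AES_128_ENABLE block above.
 */
#if 0
static void example128(void)
{
	word8 key[4][KC_128_OPT] __attribute__((aligned(4))) = {{0}};
	word8 block[4][BC_128_OPT] __attribute__((aligned(4))) = {{0}};
	word8 sched[MAXROUNDS+1][4][MAXBC] __attribute__((aligned(4)));

	rijndaelKeySched128(key, sched);
	rijndaelEncrypt128(block, sched);	/* encrypt one 16-byte block */
	rijndaelDecrypt128(block, sched);	/* and recover the plaintext */
}
#endif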