/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */

/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *          Vincent Rijmen
 *
 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "rijndael-alg-ref.h"
#include <cspdebugging.h>

#define SC	((BC - 4) >> 1)

#include "boxes-ref.h"

static const word8 shifts[3][4][2] = {
	{ { 0, 0 },
	  { 1, 3 },
	  { 2, 2 },
	  { 3, 1 } },

	{ { 0, 0 },
	  { 1, 5 },
	  { 2, 4 },
	  { 3, 3 } },

	{ { 0, 0 },
	  { 1, 7 },
	  { 3, 5 },
	  { 4, 4 } }
};

/* 128 bit key/word shift table in bits */
static const word8 shifts128[4][2] = {
	{  0,  0 },
	{  8, 24 },
	{ 16, 16 },
	{ 24,  8 }
};

#if !AES_MUL_BY_LOOKUP
/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent. Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256-byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because
 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */
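/*
 * Illustrative sketch (not in the original source): one way the six const
 * multiply tables in boxes-ref.h could be regenerated offline, by tabulating
 * the log/antilog mul() below for each fixed first operand. genMulTable() is
 * a hypothetical helper, shown only to document where the tables come from.
 */
#if 0
static void genMulTable(word8 c, word8 table[256])
{
	unsigned i;

	/* table[i] = c * i in GF(2^8), computed via the log/antilog mul() */
	for(i = 0; i < 256; i++) {
		table[i] = mul(c, (word8)i);
	}
}
/* e.g. genMulTable(0x02, t); ...repeated for 0x03, 0x09, 0x0b, 0x0d, 0x0e */
#endif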
/*
 * The mod 255 op in mul is really expensive...
 *
 * We know that b <= (254 * 2), so there are only two cases. Either return b,
 * or return b-255.
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
static inline word8 mod255(word32 b)
{
	if(b >= 255) {
		b -= 255;
	}
	return b;
}

word8 mul(word8 a, word8 b)
{
	/* multiply two elements of GF(2^m)
	 * needed for MixColumn and InvMixColumn
	 */
	if (a && b)
		return Alogtable[mod255(Logtable[a] + Logtable[b])];
	else
		return 0;
}
#endif	/* !AES_MUL_BY_LOOKUP */

void KeyAddition(
	word8 a[4][MAXBC],
	word8 rk[4][MAXBC],
	word8 BC)
{
	/* Exor corresponding text input and round key input bytes */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			a[i][j] ^= rk[i][j];
}

void ShiftRow(
	word8 a[4][MAXBC],
	word8 d,
	word8 BC)
{
	/* Row 0 remains unchanged
	 * The other three rows are shifted a variable amount
	 */
	word8 tmp[MAXBC];
	int i, j;

	for(i = 1; i < 4; i++) {
		for(j = 0; j < BC; j++)
			tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
		for(j = 0; j < BC; j++)
			a[i][j] = tmp[j];
	}
}

void Substitution(
	word8 a[4][MAXBC],
	const word8 box[256],
	word8 BC)
{
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			a[i][j] = box[a[i][j]];
}

void MixColumn(word8 a[4][MAXBC], word8 BC)
{
	/* Mix the four bytes of every column in a linear way */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++)
			a[i][j] = b[i][j];
	}
}

void InvMixColumn(word8 a[4][MAXBC], word8 BC)
{
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++)
			a[i][j] = b[i][j];
	}
}
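/*
 * Illustrative check (not in the original source): InvMixColumn() undoes
 * MixColumn() on the same state. checkMixColumnInverse() is a hypothetical
 * helper, written here for a 4-column (128-bit) state.
 */
#if 0
static int checkMixColumnInverse(void)
{
	word8 orig[4][MAXBC], a[4][MAXBC];
	int i, j;
	const word8 BC = 4;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			orig[i][j] = a[i][j] = (word8)rand();
	MixColumn(a, BC);
	InvMixColumn(a, BC);
	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++)
			if(a[i][j] != orig[i][j])
				return 0;	/* mismatch */
	return 1;			/* round trip OK */
}
#endif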
int rijndaelKeySched(
	word8 k[4][MAXKC],
	int keyBits,
	int blockBits,
	word8 W[MAXROUNDS+1][4][MAXBC])
{
	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	word8 tk[4][MAXKC];

	switch (keyBits) {
		case 128: KC = 4; break;
		case 192: KC = 6; break;
		case 256: KC = 8; break;
		default : return (-1);
	}
	switch (blockBits) {
		case 128: BC = 4; break;
		case 192: BC = 6; break;
		case 256: BC = 8; break;
		default : return (-2);
	}
	switch (keyBits >= blockBits ? keyBits : blockBits) {
		case 128: ROUNDS = 10; break;
		case 192: ROUNDS = 12; break;
		case 256: ROUNDS = 14; break;
		default : return (-3); /* this cannot happen */
	}

	for(j = 0; j < KC; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++)
			W[t / BC][i][t % BC] = tk[i][j];

	while (t < (ROUNDS+1)*BC) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++)
			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
		tk[0][0] ^= rcon[rconpointer++];

		if (KC != 8) {
			for(j = 1; j < KC; j++)
				for(i = 0; i < 4; i++)
					tk[i][j] ^= tk[i][j-1];
		}
		else {
			for(j = 1; j < KC/2; j++)
				for(i = 0; i < 4; i++)
					tk[i][j] ^= tk[i][j-1];
			for(i = 0; i < 4; i++)
				tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
			for(j = KC/2 + 1; j < KC; j++)
				for(i = 0; i < 4; i++)
					tk[i][j] ^= tk[i][j-1];
		}

		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++)
				W[t / BC][i][t % BC] = tk[i][j];
	}

	return 0;
}

int rijndaelEncrypt(
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block, general case. */
	int r, BC, ROUNDS;

	switch (blockBits) {
		case 128: BC = 4; break;
		case 192: BC = 6; break;
		case 256: BC = 8; break;
		default : return (-2);
	}
	switch (keyBits >= blockBits ? keyBits : blockBits) {
		case 128: ROUNDS = 10; break;
		case 192: ROUNDS = 12; break;
		case 256: ROUNDS = 14; break;
		default : return (-3); /* this cannot happen */
	}

	/* begin with a key addition */
	KeyAddition(a,rk[0],BC);

	/* ROUNDS-1 ordinary rounds */
	for(r = 1; r < ROUNDS; r++) {
		Substitution(a,S,BC);
		ShiftRow(a,0,BC);
		MixColumn(a,BC);
		KeyAddition(a,rk[r],BC);
	}

	/* Last round is special: there is no MixColumn */
	Substitution(a,S,BC);
	ShiftRow(a,0,BC);
	KeyAddition(a,rk[ROUNDS],BC);

	return 0;
}

int rijndaelDecrypt(
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
		case 128: BC = 4; break;
		case 192: BC = 6; break;
		case 256: BC = 8; break;
		default : return (-2);
	}
	switch (keyBits >= blockBits ? keyBits : blockBits) {
		case 128: ROUNDS = 10; break;
		case 192: ROUNDS = 12; break;
		case 256: ROUNDS = 14; break;
		default : return (-3); /* this cannot happen */
	}

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition(a,rk[ROUNDS],BC);
	Substitution(a,Si,BC);
	ShiftRow(a,1,BC);

	/* ROUNDS-1 ordinary rounds */
	for(r = ROUNDS-1; r > 0; r--) {
		KeyAddition(a,rk[r],BC);
		InvMixColumn(a,BC);
		Substitution(a,Si,BC);
		ShiftRow(a,1,BC);
	}

	/* End with the extra key addition */
	KeyAddition(a,rk[0],BC);

	return 0;
}
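/*
 * Illustrative usage (not in the original source): a one-block encrypt/decrypt
 * round trip through the general-case routines with a 128-bit key and block.
 * exampleRoundTrip() is a hypothetical helper; note the column-major state
 * layout, where byte n of the key/block lands in element [n % 4][n / 4].
 */
#if 0
static void exampleRoundTrip(void)
{
	word8 key[4][MAXKC] = {{0}};		/* all-zero 128-bit key */
	word8 block[4][MAXBC] = {{0}};		/* all-zero 128-bit plaintext */
	word8 sched[MAXROUNDS+1][4][MAXBC];

	if(rijndaelKeySched(key, 128, 128, sched) == 0) {
		rijndaelEncrypt(block, 128, 128, sched);	/* block is now ciphertext */
		rijndaelDecrypt(block, 128, 128, sched);	/* back to all-zero plaintext */
	}
}
#endif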
#if !GLADMAN_AES_128_ENABLE

/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */

static inline void KeyAddition128(
	word8 a[4][BC_128_OPT],
	word8 rk[4][MAXBC])
{
	/* these casts are endian-independent */
	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
}

static void Substitution128(
	word8 a[4][BC_128_OPT],
	const word8 box[256])
{
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	/* still to be optimized - larger S boxes? */
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC_128_OPT; j++) {
			a[i][j] = box[a[i][j]];
		}
	}
}

#if defined(__ppc__) && defined(__GNUC__)

static inline void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word32 lword = *((word32 *)word);
	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
	*((word32 *)word) = lword;
}

#else

/*
 * Insert your machine/compiler dependent code here,
 * or just use this, which works on any platform and compiler
 * which supports the __attribute__((aligned(4))) directive.
 */
static void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
	unsigned bytes = rotCount / 8;

	tmp[0] = word[bytes     & (BC_128_OPT-1)];
	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
	*((word32 *)word) = *((word32 *)tmp);
}

#endif

static inline void ShiftRow128(
	word8 a[4][BC_128_OPT],
	word8 d)
{
	/* Row 0 remains unchanged
	 * The other three rows are shifted (actually rotated) a variable amount
	 */
	int i;

	for(i = 1; i < 4; i++) {
		rotateWordLeft(a[i], shifts128[i][d]);
	}
}
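/*
 * Illustrative check (not in the original source): rotating a row word left
 * by 8 bits moves each byte up one position, e.g. {1,2,3,4} -> {2,3,4,1},
 * matching the 8-bit encrypt entry for row 1 in shifts128. checkRotate() is
 * a hypothetical helper; the PPC asm path gives the same byte result because
 * PPC is big-endian, while the generic path is byte-order independent.
 */
#if 0
static int checkRotate(void)
{
	word8 w[BC_128_OPT] __attribute__((aligned(4))) = { 1, 2, 3, 4 };

	rotateWordLeft(w, 8);
	return (w[0] == 2) && (w[1] == 3) && (w[2] == 4) && (w[3] == 1);
}
#endif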
/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
static void MixColumn128(word8 a[4][BC_128_OPT])
{
	/* Mix the four bytes of every column in a linear way */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}

static void InvMixColumn128(word8 a[4][BC_128_OPT])
{
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}

int rijndaelKeySched128(
	word8 k[4][KC_128_OPT],
	word8 W[MAXROUNDS+1][4][MAXBC])
{
	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int i, j, t, rconpointer = 0;
	word8 tk[4][KC_128_OPT];
	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;

	for(j = 0; j < KC_128_OPT; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
		for(i = 0; i < 4; i++) {
			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
		}
	}

	while (t < numSchedRows) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++) {
			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
		}
		tk[0][0] ^= rcon[rconpointer++];

		for(j = 1; j < KC_128_OPT; j++) {
			for(i = 0; i < 4; i++) {
				tk[i][j] ^= tk[i][j-1];
			}
		}

		/* copy values into round key array */
		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
			for(i = 0; i < 4; i++) {
				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
			}
		}
	}

	return 0;
}

int rijndaelEncrypt128(
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block. */
	int r;

	/* begin with a key addition */
	KeyAddition128(a,rk[0]);

	/* ROUNDS-1 ordinary rounds */
	for(r = 1; r < ROUNDS_128_OPT; r++) {
		Substitution128(a,S);
		ShiftRow128(a,0);
		MixColumn128(a);
		KeyAddition128(a,rk[r]);
	}

	/* Last round is special: there is no MixColumn */
	Substitution128(a,S);
	ShiftRow128(a,0);
	KeyAddition128(a,rk[ROUNDS_128_OPT]);

	return 0;
}

int rijndaelDecrypt128(
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r;

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition128(a,rk[ROUNDS_128_OPT]);
	Substitution128(a,Si);
	ShiftRow128(a,1);

	/* ROUNDS-1 ordinary rounds */
	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
		KeyAddition128(a,rk[r]);
		InvMixColumn128(a);
		Substitution128(a,Si);
		ShiftRow128(a,1);
	}

	/* End with the extra key addition */
	KeyAddition128(a,rk[0]);

	return 0;
}

#endif	/* !GLADMAN_AES_128_ENABLE */
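/*
 * Illustrative usage (not in the original source): the 128-bit optimized path.
 * The state and key arrays must be 4-byte aligned, per the comment above
 * KeyAddition128(). example128() is a hypothetical helper and would belong
 * inside the !GLADMAN_AES_128_ENABLE block above.
 */
#if 0
static void example128(void)
{
	word8 key[4][KC_128_OPT] __attribute__((aligned(4))) = {{0}};
	word8 block[4][BC_128_OPT] __attribute__((aligned(4))) = {{0}};
	word8 sched[MAXROUNDS+1][4][MAXBC] __attribute__((aligned(4)));

	rijndaelKeySched128(key, sched);
	rijndaelEncrypt128(block, sched);	/* encrypt one 16-byte block */
	rijndaelDecrypt128(block, sched);	/* and recover the plaintext */
}
#endif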