/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 * 
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */


/*
 *  vRijndael-alg-ref.c
 *
 *  Created by Robert A. Murley on Mon Jan 22 2001.
 *  Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
 *
 */

#include "rijndaelApi.h"
#include "rijndael-alg-ref.h"
#include "boxes-ref.h"
#include <string.h>

/*
 * Debug tracing.
 *
 * debugger seems to have trouble with this code, so tracing is done with
 * printf.  VAES_DEBUG defaults to 1 (tracing on) but may be predefined by
 * the build (e.g. -DVAES_DEBUG=0) to compile every vdprintf() away.
 *
 * Usage: vdprintf(("format %d\n", value));   -- note the double parentheses;
 * the inner pair becomes printf's argument list.
 */
#ifndef VAES_DEBUG
#define VAES_DEBUG	1
#endif
#if		VAES_DEBUG
#include <stdio.h>
#define vdprintf(s)		printf s
#else
#define vdprintf(s)
#endif

/*
 * First index into shifts[][][], derived from the number of state columns.
 * Expands to an expression on an identifier named BC, so it can only be used
 * where a variable or parameter called BC is in scope; BC of 4, 6 and 8
 * yield 0, 1 and 2 respectively.
 */
#define SC	((BC - 4) >> 1)

#if defined(__ppc__) && defined(ALTIVEC_ENABLE)

/*
 * Overlay of two vectors with byte and word views of the same storage.
 * vRijndaelKeySched loads the key through v[] and then works on it through
 * s[row][column] (4 rows of 8 bytes; each vector therefore holds two rows).
 * The l[] view is not referenced by the code in this file.
 */
typedef union {
	unsigned char		s[4][8];
	unsigned long		l[8];
	vector unsigned char 	v[2];
} doubleVec;

/*
 * Four scalars overlaid on one vector.  vShiftRow uses only the s[] view, as
 * scratch space for the per-row shift counts; the v view is not referenced
 * by the code in this file.
 */
typedef union {
	unsigned long		s[4];
	vector unsigned long	v;
} vecLong;

/*
 * Row shift amounts, indexed as shifts[SC][row][d]:
 *   SC  - 0, 1 or 2 for a state of 4, 6 or 8 columns (see the SC macro),
 *   row - state row 0..3,
 *   d   - 0 on the encryption path, 1 on the decryption path
 *         (vRijndaelEncrypt passes 0 to vShiftRow, vRijndaelDecrypt passes 1).
 * The table is read-only, hence const.
 */
static const word8 shifts[3][4][2] = {
 { { 0, 0 },
   { 1, 3 },
   { 2, 2 },
   { 3, 1 }
 },
 { { 0, 0 },
   { 1, 5 },
   { 2, 4 },
   { 3, 3 }
 },
 { { 0, 0 },
   { 1, 7 },
   { 3, 5 },
   { 4, 4 }
 }
};

/*
 * Expand a cipher key into the round-key array.
 *
 * vk        - the key in state layout: vk[0] holds rows 0 and 1, vk[1] holds
 *             rows 2 and 3, eight bytes per row (only the first KC bytes of
 *             each row are used).
 * keyBits   - key length in bits: 128, 192 or 256.
 * blockBits - block length in bits: 128, 192 or 256.
 * W         - receives (ROUNDS+1) round keys as W[round][row][column].
 *
 * Returns 0 on success, -1 for an unsupported keyBits, -2 for an unsupported
 * blockBits, -3 if the round count cannot be derived.  W is not written when
 * an error is returned.
 */
int vRijndaelKeySched ( vector unsigned char vk[2], int keyBits, int blockBits, 
                unsigned char W[MAXROUNDS+1][4][MAXBC])
{
	int KC, BC, ROUNDS;
	int i, j, t, total, rconpointer = 0;
	doubleVec tk;
	const word8 *sbox = (const word8 *)S;

	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* view the incoming key as a 4 x 8 byte array */
	tk.v[0] = vk[0];
	tk.v[1] = vk[1];

	total = (ROUNDS + 1) * BC;		/* round-key columns required */
	t = 0;

	/* the first KC columns of round-key material are the key itself */
	for (j = 0; (j < KC) && (t < total); j++, t++)
		for (i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk.s[i][j];

	while (t < total) { /* while not enough round key material calculated */
		/* column 0: S-box of the last column rotated up one row, plus rcon */
		for (i = 0; i < 4; i++)
			tk.s[i][0] ^= sbox[tk.s[(i + 1) % 4][KC - 1]];
		tk.s[0][0] ^= rcon[rconpointer++];

		/*
		 * Every further column is XORed with its left neighbour *after* that
		 * neighbour has been updated (a running XOR along each row).  The
		 * dependency chain is inherently serial, so it is done bytewise; a
		 * single parallel "xor with previous byte" would use stale neighbours.
		 */
		if (KC != 8) {
			for (j = 1; j < KC; j++)
				for (i = 0; i < 4; i++) tk.s[i][j] ^= tk.s[i][j - 1];
		}
		else {
			for (j = 1; j < KC / 2; j++)
				for (i = 0; i < 4; i++) tk.s[i][j] ^= tk.s[i][j - 1];
			/* 256-bit keys: the middle column gets an extra S-box step, in all four rows */
			for (i = 0; i < 4; i++)
				tk.s[i][KC / 2] ^= sbox[tk.s[i][KC / 2 - 1]];
			for (j = KC / 2 + 1; j < KC; j++)
				for (i = 0; i < 4; i++) tk.s[i][j] ^= tk.s[i][j - 1];
		}

		/* copy values into round key array */
		for (j = 0; (j < KC) && (t < total); j++, t++)
			for (i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk.s[i][j];
	}
	return 0;
}


/*
 * Build the round-key schedule in key->keySched from raw key bytes.
 *
 * keyMaterial - the key; exactly key->keyLen / 8 bytes are read, with no
 *               alignment requirement.
 * key         - keyLen and blockLen (both in bits) must already be set;
 *               keySched receives the schedule.
 *
 * The function has no way to report failure: if keyLen is not 128, 192 or
 * 256 it returns without touching keySched, and an unsupported blockLen is
 * rejected inside vRijndaelKeySched with the same effect.
 */
void vMakeKey(BYTE *keyMaterial, keyInstance *key)
{
        /* the vector member forces 16-byte alignment of the staging buffer */
        union {
                BYTE                    b[32];
                vector unsigned char    v[2];
        } km;
        vector unsigned char	  vk[2];
        unsigned keyBytes;

        switch (key->keyLen) {
        case 128:
        case 192:
        case 256:
                keyBytes = (unsigned) (key->keyLen / 8);
                break;
        default:
                return;		/* vRijndaelKeySched would reject this length anyway */
        }

        /*
         * Stage the key in an aligned, zero-padded buffer so that only the
         * bytes that belong to the key are ever read from the caller.
         */
        memset( km.b, 0, sizeof km.b );
        memcpy( km.b, keyMaterial, keyBytes );

        /* parse input stream into rectangular array: key byte n goes to row n%4, column n/4 */
        vk[0] = vec_perm( km.v[0], km.v[1], (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        vk[1] = vec_perm( km.v[0], km.v[1], (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );
        (void) vRijndaelKeySched (vk, key->keyLen, key->blockLen, key->keySched);

        /* do not leave key bytes behind on the stack */
        memset( (char *) vk, 0, sizeof vk );
        memset( km.b, 0, sizeof km.b );
}


/*	This routine does 16 simultaneous lookups in a 256-byte table.	*/
vector unsigned char rimskyKorsakov ( vector unsigned char v, vector unsigned char * table )
{
	/*
	 * v     - sixteen byte-wide indices.
	 * table - the 256-byte table, read as sixteen consecutive vectors.
	 * Returns the sixteen table bytes selected by the bytes of v.
	 */
	vector unsigned char	slice[8];		/* candidate from each 32-byte slice of the table */
	vector unsigned char	quarter[4];		/* candidates after resolving index bit 0x20 */
	vector unsigned char	half[2];		/* candidates after resolving index bit 0x40 */
	vector unsigned char	pick32, pick64, pick128;
	vector unsigned char	seven;
	int			k;

	seven = vec_splat_u8 ( 7 );

	/* One permute per pair of table vectors; slice k covers table bytes 32k .. 32k+31. */
	for ( k = 0; k < 8; k++ )
		slice[k] = vec_perm ( table[2 * k], table[2 * k + 1], v );

	/*
	 * Each selector below moves one index bit to the top of its byte and
	 * smears it across the byte with an arithmetic shift, giving a per-byte
	 * all-ones / all-zeros mask for vec_sel.
	 */

	/* index bit 0x20 chooses between neighbouring slices */
	pick32 = vec_sra ( vec_sl ( v, vec_splat_u8 ( 2 ) ), seven );
	for ( k = 0; k < 4; k++ )
		quarter[k] = vec_sel ( slice[2 * k], slice[2 * k + 1], pick32 );

	/* index bit 0x40 chooses between neighbouring quarters */
	pick64 = vec_sra ( vec_sl ( v, vec_splat_u8 ( 1 ) ), seven );
	half[0] = vec_sel ( quarter[0], quarter[1], pick64 );
	half[1] = vec_sel ( quarter[2], quarter[3], pick64 );

	/* index bit 0x80 chooses between the two halves of the table */
	pick128 = vec_sra ( v, seven );

	return vec_sel ( half[0], half[1], pick128 );
}

/*
 * Byte-wise multiply of a and b through the log / antilog tables:
 * result = Alogtable[(Logtable[a] + Logtable[b]) % 255], forced to 0 in every
 * lane where a or b is 0.  vMixColumn and vInvMixColumn use it as their
 * mul() primitive.
 */
vector unsigned char vmul(vector unsigned char a, vector unsigned char b)
{
	vector unsigned char	zeroBytes, logA, logB, prod;
	vector unsigned short	k255, k254, sum, wrapped, modHi, modLo;

	zeroBytes = vec_splat_u8( 0 );

	/* 255 is out of splat-immediate range: interleave 0x00 with 0xff bytes to get 0x00ff halfwords */
	k255 = vec_splat_u16( -1 );
	k255 = (vector unsigned short) vec_mergeh( zeroBytes, (vector unsigned char) k255 );
	k254 = vec_sub( k255, vec_splat_u16( 1 ) );

	logA = rimskyKorsakov( a, (vector unsigned char *)Logtable );	// Logtable[a]
	logB = rimskyKorsakov( b, (vector unsigned char *)Logtable );	// Logtable[b]

	/* Upper 8 lanes: widen to halfwords, add, and reduce modulo 255 with one conditional subtract */
	sum = vec_add( (vector unsigned short) vec_mergeh( zeroBytes, logA ),
	               (vector unsigned short) vec_mergeh( zeroBytes, logB ) );
	wrapped = vec_sub( sum, k255 );
	modHi = vec_sel( sum, wrapped, vec_cmpgt( sum, k254 ) );

	/* Lower 8 lanes: same again */
	sum = vec_add( (vector unsigned short) vec_mergel( zeroBytes, logA ),
	               (vector unsigned short) vec_mergel( zeroBytes, logB ) );
	wrapped = vec_sub( sum, k255 );
	modLo = vec_sel( sum, wrapped, vec_cmpgt( sum, k254 ) );

	/* Narrow back to bytes and take the antilog */
	prod = vec_pack( modHi, modLo );
	prod = rimskyKorsakov( prod, (vector unsigned char *)Alogtable );	// Alogtable[sum % 255]

	/* a zero operand always gives a zero product */
	prod = vec_sel( prod, zeroBytes, vec_cmpeq( a, zeroBytes ) );
	prod = vec_sel( prod, zeroBytes, vec_cmpeq( b, zeroBytes ) );
	return prod;
}

/*
 * XOR a round key into the state, in place.
 * Both v and rk hold rows 0 and 1 in element 0 and rows 2 and 3 in element 1.
 */
void vKeyAddition(vector unsigned char v[2], vector unsigned char rk[2])
{
	int	half;

	for ( half = 0; half < 2; half++ )
		v[half] = vec_xor( v[half], rk[half] );
}


/*
 * Rotate each row of the state by its shift amount, in place.
 *
 * v  - the state: each vector holds two rows of eight bytes.
 * d  - second index into shifts[]; the encrypt path passes 0, the decrypt
 *      path passes 1.
 * BC - number of columns in use (4, 6 or 8).
 */
void vShiftRow(vector unsigned char v[2], word8 d, word8 BC)
{
	vecLong			rot;		/* shift count per row, already reduced mod BC */
	vector unsigned char	idxEven, idxOdd, idx, wrapped, rowBase;
	vector bool char	over;
	int			row, half;

	rot.s[0] = 0;				/* row 0 never moves */
	for (row = 1; row < 4; row++)
		rot.s[row] = shifts[SC][row][d] % BC;

	/* the second row of each vector starts at byte 8 */
	rowBase = vec_sld( vec_splat_u8( 0 ), vec_splat_u8( 8 ), 8 );

	for ( half = 0; half < 2; half++ ) {
		/*
		 * vec_lvsl is handed the shift count in place of an address, purely
		 * to obtain the index ramp count, count+1, ... ; nothing is loaded
		 * through the pointer.
		 */
		idxEven = vec_lvsl( 0, (int *) rot.s[2 * half] );
		idxOdd = vec_lvsl( 0, (int *) rot.s[2 * half + 1] );

		/* put both ramps side by side and wrap every index back into 0..BC-1 */
		switch (BC) {
		case 4:
			idx = vec_sld( idxEven, idxOdd, 8 );
			idx = vec_and( idx, vec_splat_u8( 3 ) );
			break;
		case 6:
			/* 6 is not a power of two: wrap with compare-and-subtract instead of a mask */
			idx = vec_sld( idxEven, idxEven, 8 );
			idx = vec_sld( idx, idxOdd, 8 );
			wrapped = vec_sub( idx, vec_splat_u8( 6 ) );
			over = vec_cmpgt( idx, vec_splat_u8( 5 ) );
			idx = vec_sel( idx, wrapped, over );
			break;
		default:
			idx = vec_sld( idxEven, idxOdd, 8 );
			idx = vec_and( idx, vec_splat_u8( 7 ) );
			break;
		}

		idx = vec_add( idx, rowBase );
		v[half] = vec_perm( v[half], v[half], idx );	/* rotate both rows at once */
	}
}

/*
 * Replace every byte of the state with its entry in the 256-byte table box
 * (sixteen vectors), in place.
 */
void vSubstitution( vector unsigned char v[2], vector unsigned char box[16] )
{
	int	half;

	/* element 0 carries rows 0 and 1, element 1 carries rows 2 and 3 */
	for ( half = 0; half < 2; half++ )
		v[half] = rimskyKorsakov( v[half], box );
}
   
/*
 * Column mix of the encryption round, in place:
 *   out[i][j] = mul(2,a[i][j]) ^ mul(3,a[(i+1)%4][j]) ^ a[(i+2)%4][j] ^ a[(i+3)%4][j]
 *
 * v[0] holds row 0 in bytes 0-7 and row 1 in bytes 8-f;
 * v[1] holds row 2 in bytes 0-7 and row 3 in bytes 8-f.
 */
void vMixColumn(vector unsigned char v[2])
{
	vector unsigned char	rows01, rows12, rows23, rows30;
	vector unsigned char	out01, out23;
	vector unsigned char	two, three;

	two = vec_splat_u8( 2 );
	three = vec_splat_u8( 3 );

	/* every pairing of vertically adjacent rows, named by the rows it holds */
	rows01 = v[0];
	rows23 = v[1];
	rows12 = vec_sld( rows01, rows23, 8 );
	rows30 = vec_sld( rows23, rows01, 8 );

	/* output rows 0 and 1: rows i+1, i+2, i+3 are rows12, rows23, rows30 */
	out01 = vmul( two, rows01 );
	out01 = vec_xor( out01, vmul( three, rows12 ) );
	out01 = vec_xor( out01, rows23 );
	out01 = vec_xor( out01, rows30 );

	/* output rows 2 and 3: rows i+1, i+2, i+3 are rows30, rows01, rows12 */
	out23 = vmul( two, rows23 );
	out23 = vec_xor( out23, vmul( three, rows30 ) );
	out23 = vec_xor( out23, rows01 );
	out23 = vec_xor( out23, rows12 );

	v[0] = out01;
	v[1] = out23;
}

/*
 * Column mix of the decryption round, in place:
 *   out[i][j] = mul(0xe,a[i][j])       ^ mul(0xb,a[(i+1)%4][j])
 *             ^ mul(0xd,a[(i+2)%4][j]) ^ mul(0x9,a[(i+3)%4][j])
 *
 * v[0] holds row 0 in bytes 0-7 and row 1 in bytes 8-f;
 * v[1] holds row 2 in bytes 0-7 and row 3 in bytes 8-f.
 */
void vInvMixColumn(vector unsigned char v[2])
{
	vector unsigned char	rows01, rows12, rows23, rows30;
	vector unsigned char	out01, out23;
	vector unsigned char	k9, kB, kD, kE;

	k9 = vec_splat_u8( 0x9 );
	kB = vec_splat_u8( 0xb );
	kD = vec_splat_u8( 0xd );
	kE = vec_splat_u8( 0xe );

	/* every pairing of vertically adjacent rows, named by the rows it holds */
	rows01 = v[0];
	rows23 = v[1];
	rows12 = vec_sld( rows01, rows23, 8 );
	rows30 = vec_sld( rows23, rows01, 8 );

	/* output rows 0 and 1: rows i+1, i+2, i+3 are rows12, rows23, rows30 */
	out01 = vmul( kE, rows01 );
	out01 = vec_xor( out01, vmul( kB, rows12 ) );
	out01 = vec_xor( out01, vmul( kD, rows23 ) );
	out01 = vec_xor( out01, vmul( k9, rows30 ) );

	/* output rows 2 and 3: rows i+1, i+2, i+3 are rows30, rows01, rows12 */
	out23 = vmul( kE, rows23 );
	out23 = vec_xor( out23, vmul( kB, rows30 ) );
	out23 = vec_xor( out23, vmul( kD, rows01 ) );
	out23 = vec_xor( out23, vmul( k9, rows12 ) );

	v[0] = out01;
	v[1] = out23;
}

/*
 * Encrypt one block in place.
 *
 * a         - the state (two vectors, two rows each).
 * keyBits   - key length in bits.
 * blockBits - block length in bits: 128, 192 or 256.
 * rk        - round keys, rk[0] .. rk[ROUNDS].
 *
 * Returns 0 on success, -2 if blockBits is unsupported, -3 if the larger of
 * keyBits and blockBits is not 128, 192 or 256.  The state is untouched when
 * an error is returned.
 */
int vRijndaelEncrypt (vector unsigned char a[2], int keyBits, int blockBits, vector unsigned char rk[MAXROUNDS+1][2])
{
	int	BC, ROUNDS, widest, r;

	/* number of state columns */
	if (blockBits == 128)
		BC = 4;
	else if (blockBits == 192)
		BC = 6;
	else if (blockBits == 256)
		BC = 8;
	else
		return (-2);

	/* the round count follows whichever of key and block is wider */
	widest = (keyBits < blockBits) ? blockBits : keyBits;
	if (widest == 128)
		ROUNDS = 10;
	else if (widest == 192)
		ROUNDS = 12;
	else if (widest == 256)
		ROUNDS = 14;
	else
		return (-3);

	/* initial key addition, then ROUNDS-1 full rounds */
	vKeyAddition( a, rk[0] );
	r = 1;
	while (r < ROUNDS) {
		vSubstitution( a, (vector unsigned char *)S );
		vShiftRow( a, 0, BC );
		vMixColumn( a );
		vKeyAddition( a, rk[r] );
		r++;
	}

	/* the final round has no column mix */
	vSubstitution( a, (vector unsigned char *)S );
	vShiftRow( a, 0, BC );
	vKeyAddition( a, rk[ROUNDS] );

	return 0;
}

/*
 * Decrypt one block in place; the rounds of vRijndaelEncrypt are undone in
 * reverse order using the inverse table Si and shift direction 1.
 *
 * a         - the state (two vectors, two rows each).
 * keyBits   - key length in bits.
 * blockBits - block length in bits: 128, 192 or 256.
 * rk        - round keys, rk[0] .. rk[ROUNDS].
 *
 * Returns 0 on success, -2 if blockBits is unsupported, -3 if the larger of
 * keyBits and blockBits is not 128, 192 or 256.  The state is untouched when
 * an error is returned.
 */
int vRijndaelDecrypt (vector unsigned char a[2], int keyBits, int blockBits, vector unsigned char rk[MAXROUNDS+1][2])
{
	int	BC, ROUNDS, widest, r;

	/* number of state columns */
	if (blockBits == 128)
		BC = 4;
	else if (blockBits == 192)
		BC = 6;
	else if (blockBits == 256)
		BC = 8;
	else
		return (-2);

	/* the round count follows whichever of key and block is wider */
	widest = (keyBits < blockBits) ? blockBits : keyBits;
	if (widest == 128)
		ROUNDS = 10;
	else if (widest == 192)
		ROUNDS = 12;
	else if (widest == 256)
		ROUNDS = 14;
	else
		return (-3);

	/* undo the final round, which had no column mix */
	vKeyAddition( a, rk[ROUNDS] );
	vSubstitution( a, (vector unsigned char *)Si );
	vShiftRow( a, 1, BC );

	/* undo the full rounds, last to first */
	r = ROUNDS - 1;
	while (r > 0) {
		vKeyAddition( a, rk[r] );
		vInvMixColumn( a );
		vSubstitution( a, (vector unsigned char *)Si );
		vShiftRow( a, 1, BC );
		r--;
	}

	/* undo the initial key addition */
	vKeyAddition( a, rk[0] );

	return 0;
}

#if 0
/* Murley's code, to be deleted */
/*
 * NOTE: compiled out by the enclosing "#if 0" and marked for deletion.
 *
 * Encrypt one block: load 32 bytes from input, rearrange them into the
 * row-per-8-bytes state layout, XOR with cipher->chainBlock when the mode is
 * MODE_CBC, store the state into cipher->chainBlock, encrypt it there, then
 * rearrange the result back into byte-stream order and store 32 bytes to
 * outBuffer.  inputLen is not used.
 */
void vBlockEncrypt(cipherInstance *cipher, keyInstance *key, BYTE *input, int inputLen, BYTE *outBuffer)
{
        register vector unsigned char v1, v2, v3, v4, mask;
        register vector bool char cmp;

        /* load and align input */
        v1 = vec_ld( 0, (vector unsigned char *) input );
        v2 = vec_ld( 16, (vector unsigned char *) input );
        if ( (long) input & 0x0fL )
        {	// this is required if input is not on a 16-byte boundary
                v3 = vec_ld( 32, (vector unsigned char *) input );
                mask = vec_lvsl( 0, input );
                v1 = vec_perm( v1, v2, mask );
                v2 = vec_perm( v2, v3, mask );
        }

        /* parse input stream into rectangular array */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );

        /* store into cipher structure */
        if (cipher->mode == MODE_CBC) {
                v3 = vec_xor( v3, *((vector unsigned char *) cipher->chainBlock ) );
                v4 = vec_xor( v4, *((vector unsigned char *) cipher->chainBlock + 1 ) );
        }
        vec_st( v3, 0, (vector unsigned char *) cipher->chainBlock );
        vec_st( v4, 16, (vector unsigned char *) cipher->chainBlock );

        /* the block is encrypted in place inside chainBlock */
        vRijndaelEncrypt((vector unsigned char *) cipher->chainBlock, key->keyLen, cipher->blockLen, (vector unsigned char *) key->keySched);

        v1 = vec_ld( 0, (vector unsigned char *) cipher->chainBlock );
        v2 = vec_ld( 16, (vector unsigned char *) cipher->chainBlock );

        /* parse rectangular array into output ciphertext bytes */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 10, 18, 26,  3, 11, 19, 27 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31 ) );

        if ( (long) outBuffer & 0x0fL )
        {
                /* store output data into a non-aligned buffer */
                /* three vector stores (offsets 0, 16, 32) merged with data loaded from outBuffer at offsets 0 and 32 */
                mask = vec_lvsr( 0, outBuffer );
                cmp = vec_cmpgt( mask, vec_splat_u8( 0x0f ) );
                v1 = vec_perm( v3, v3, mask );
                v2 = vec_perm( v4, v4, mask );
                v3 = vec_ld( 0, (vector unsigned char *) outBuffer );
                v4 = vec_sel( v3, v1, cmp );
                vec_st( v4, 0, (vector unsigned char *) outBuffer );
                v1 = vec_sel( v1, v2, cmp );
                vec_st( v1, 16, (vector unsigned char *) outBuffer );
                v3 = vec_ld( 32, (vector unsigned char *) outBuffer );
                v2 = vec_sel( v2, v3, cmp );
                vec_st( v2, 32, (vector unsigned char *) outBuffer );
        } else {
                // store output data into an aligned buffer
                vec_st( v3, 0, (vector unsigned char *) outBuffer );
                vec_st( v4, 16, (vector unsigned char *) outBuffer );
        }
        return;
}

/*
 * NOTE: compiled out by the enclosing "#if 0" and marked for deletion.
 *
 * Decrypt one block: load 32 bytes from input, rearrange them into the
 * row-per-8-bytes state layout, decrypt a local copy, and in MODE_CBC XOR the
 * result with cipher->chainBlock before replacing chainBlock with the
 * incoming ciphertext (still in state layout).  The result is rearranged
 * back into byte-stream order and 32 bytes are stored to outBuffer.
 * inputLen is not used.
 */
void vBlockDecrypt(cipherInstance *cipher, keyInstance *key, BYTE *input, int inputLen, BYTE *outBuffer)
{
        // for vector machines
        register vector unsigned char v1, v2, v3, v4, mask;
        register vector bool char cmp;
        vector unsigned  char	block[2], cblock[2];

        /* load and align input */
        v1 = vec_ld( 0, (vector unsigned char *) input );
        v2 = vec_ld( 16, (vector unsigned char *) input );
        if ( (long) input & 0x0fL )
        {	// this is required if input is not on a 16-byte boundary
                v3 = vec_ld( 32, (vector unsigned char *) input );
                mask = vec_lvsl( 0, input );
                v1 = vec_perm( v1, v2, mask );
                v2 = vec_perm( v2, v3, mask );
        }

        /* parse input stream into rectangular array */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );
        block[0] = v3;
        block[1] = v4;

        /* save a copy of incoming ciphertext for later chain */
        if (cipher->mode == MODE_CBC) {
                cblock[0] = v3;
                cblock[1] = v4;
        }
                
        vRijndaelDecrypt ((vector unsigned char *) block, key->keyLen, cipher->blockLen, (vector unsigned char *) key->keySched);
        
        v1 = block[0];
        v2 = block[1];

        /* exor with last ciphertext */
        if (cipher->mode == MODE_CBC) {
                v1 = vec_xor( v1, *((vector unsigned char *) cipher->chainBlock) );
                v2 = vec_xor( v2, *((vector unsigned char *) cipher->chainBlock + 1) );
                /* this block's ciphertext becomes the chain value for the next call */
                vec_st( cblock[0], 0, (vector unsigned char *) cipher->chainBlock );
                vec_st( cblock[1], 16, (vector unsigned char *) cipher->chainBlock );
        }
        
        /* parse rectangular array into output ciphertext bytes */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 10, 18, 26,  3, 11, 19, 27 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31 ) );

        if ( (long) outBuffer & 0x0fL )
        {	/* store output data into a non-aligned buffer */
                /* three vector stores (offsets 0, 16, 32) merged with data loaded from outBuffer at offsets 0 and 32 */
                mask = vec_lvsr( 0, outBuffer );
                cmp = vec_cmpgt( mask, vec_splat_u8( 0x0f ) );
                v1 = vec_perm( v3, v3, mask );
                v2 = vec_perm( v4, v4, mask );
                v3 = vec_ld( 0, (vector unsigned char *) outBuffer );
                v4 = vec_sel( v3, v1, cmp );
                vec_st( v4, 0, (vector unsigned char *) outBuffer );
                v1 = vec_sel( v1, v2, cmp );
                vec_st( v1, 16, (vector unsigned char *) outBuffer );
                v3 = vec_ld( 32, (vector unsigned char *) outBuffer );
                v2 = vec_sel( v2, v3, cmp );
                vec_st( v2, 32, (vector unsigned char *) outBuffer );
        } else {
                // store output data into an aligned buffer
                vec_st( v3, 0, (vector unsigned char *) outBuffer );
                vec_st( v4, 16, (vector unsigned char *) outBuffer );
        }
}
#endif	/* Murley's code, to be deleted */

/* 
 * dmitch addenda 4/11/2001: 128-bit only encrypt/decrypt with no CBC
 */
/*
 * Encrypt a single 128-bit block (no chaining).
 *
 * key       - key->keySched must already hold the round keys; key->keyLen
 *             is the key length in bits.
 * input     - 16 bytes of plaintext, any alignment.
 * outBuffer - receives 16 bytes of ciphertext, any alignment.
 *
 * The status returned by vRijndaelEncrypt is not propagated (this function
 * returns void).
 */
void vBlockEncrypt128(
	keyInstance *key, 
	BYTE *input, 
	BYTE *outBuffer)
{
	/*
	 * Bounce buffer for unaligned callers.  vec_ld / vec_st ignore the low
	 * four address bits, so the buffer must be 16-byte aligned; the vector
	 * member of the union guarantees that, which a bare BYTE array does not.
	 */
	union {
		BYTE			b[16];
		vector unsigned char	v;
	} bounce;
	vector unsigned char block[2];
	vector unsigned char v1, zero;

	zero = vec_splat_u8( 0 );

	if ( (long) input & 0x0fL ) {
		vdprintf(("vBlockEncrypt128: unaligned input\n"));
		memmove(bounce.b, input, 16);
		v1 = bounce.v;
	}
	else {
		vdprintf(("vBlockEncrypt128: aligned input\n"));
		v1 = vec_ld( 0, (vector unsigned char *) input );
	}

	/*
	 * parse input stream into rectangular array.  A 128-bit block fills only
	 * columns 0-3 of each row; permute indices 16-31 (columns 4-7) read from
	 * a zero vector so that no uninitialised value is ever used.
	 */
	block[0] = vec_perm(v1, zero,
		(vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  
		5,  9, 13, 17, 21, 25, 29 ) );
	block[1] = vec_perm( v1, zero, 
		(vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  
		7, 11, 15, 19, 23, 27, 31 ) );

	vRijndaelEncrypt(block, key->keyLen, 128, (vector unsigned char *) key->keySched);

	/* parse rectangular array into output ciphertext bytes (columns 0-3 only) */
	v1 = vec_perm(block[0], block[1], 
		(vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 
		10, 18, 26,  3, 11, 19, 27 ) );

	if ( (long) outBuffer & 0x0fL )
	{
		/* store output data into a non-aligned buffer */
		bounce.v = v1;
		memmove(outBuffer, bounce.b, 16);
	} else {
		/* store output data into an aligned buffer */
		vec_st( v1, 0, (vector unsigned char *) outBuffer );
	}
	return;
}

/*
 * Decrypt a single 128-bit block (no chaining).
 *
 * key       - key->keySched must already hold the round keys; key->keyLen
 *             is the key length in bits.
 * input     - 16 bytes of ciphertext, any alignment.
 * outBuffer - receives 16 bytes of plaintext, any alignment.
 *
 * The status returned by vRijndaelDecrypt is not propagated (this function
 * returns void).
 */
void vBlockDecrypt128(
	keyInstance *key, 
	BYTE *input, 
	BYTE *outBuffer)
{
	/*
	 * Bounce buffer for unaligned callers.  vec_ld / vec_st ignore the low
	 * four address bits, so the buffer must be 16-byte aligned; the vector
	 * member of the union guarantees that, which a bare BYTE array does not.
	 */
	union {
		BYTE			b[16];
		vector unsigned char	v;
	} bounce;
	vector unsigned char block[2];
	vector unsigned char v1, zero;

	zero = vec_splat_u8( 0 );

	if ( (long) input & 0x0fL ) {
		vdprintf(("vBlockDecrypt128: unaligned input\n"));
		memmove(bounce.b, input, 16);
		v1 = bounce.v;
	}
	else {
		vdprintf(("vBlockDecrypt128: aligned input\n"));
		v1 = vec_ld( 0, (vector unsigned char *) input );
	}

	/*
	 * parse input stream into rectangular array.  A 128-bit block fills only
	 * columns 0-3 of each row; permute indices 16-31 (columns 4-7) read from
	 * a zero vector so that no uninitialised value is ever used.
	 */
	block[0] = vec_perm(v1, zero,
		(vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  
		5,  9, 13, 17, 21, 25, 29 ) );
	block[1] = vec_perm( v1, zero, 
		(vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  
		7, 11, 15, 19, 23, 27, 31 ) );

	vRijndaelDecrypt(block, key->keyLen, 128, (vector unsigned char *) key->keySched);

	/* parse rectangular array into output plaintext bytes (columns 0-3 only) */
	v1 = vec_perm(block[0], block[1], 
		(vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 
		10, 18, 26,  3, 11, 19, 27 ) );

	if ( (long) outBuffer & 0x0fL ) {
		/* store output data into a non-aligned buffer */
		bounce.v = v1;
		memmove(outBuffer, bounce.b, 16);
	} else {
		/* store output data into an aligned buffer */
		vec_st( v1, 0, (vector unsigned char *) outBuffer );
	}
	return;
}

#endif	/* defined(__ppc__) && defined(ALTIVEC_ENABLE) */