/* K=7 r=1/2 Viterbi decoder with optional Intel SIMD
 * Dec 2001, Phil Karn, KA9Q
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include "viterbi27.h"
#include "parity.h"

/* Nonzero once create_viterbi27() has run its one-time CPU feature check
 * and filled in the branch metric tables
 */
static int V27_init;
/* Defined elsewhere. create_viterbi27() tests bits 23, 25 and 26 of the
 * returned value for MMX, SSE and SSE2 support respectively
 */
int cpu_features(void);

/* Identification string naming the decoder variant chosen at compile time.
 * SSE2 takes precedence over SSE, which takes precedence over MMX; with none
 * of them defined the portable C variant is reported.
 */
#if defined(SSE2)
#define V27_VARIANT "SSE2"
#elif defined(SSE)
#define V27_VARIANT "SSE"
#elif defined(MMX)
#define V27_VARIANT "MMX"
#else
#define V27_VARIANT "portable C"
#endif
char id_viterbi27[] = "k=7 r=1/2 Viterbi decoder, " V27_VARIANT " version";
#undef V27_VARIANT

#if defined(MMX)
/* MMX build: each decoded bit stores one byte per state (64 bytes);
 * the decision is the low bit of that byte
 */
typedef union { long long p; char c[64]; } decision_t;
#define EXTRACT_DECISION(d,state) ((d)->c[(state)] & 1)

/*
  Branch metric lookup tables, indexed by input symbol and state
  Mettab27_1 is for the first symbol, Mettab27_2 for the second
  The first index is the input symbol (0-15)
  The second index is the state number
*/
unsigned char Mettab27_1[16][32] __attribute__((aligned(32)));
unsigned char Mettab27_2[16][32] __attribute__((aligned(32)));

#else
/* Other builds: each decoded bit stores one decision bit per state, packed
 * into two 32-bit words (8 bytes). The words must be exactly 32 bits wide:
 * EXTRACT_DECISION and the butterfly code index them with /32 and %32.
 * A plain "unsigned long" is 64 bits on LP64 systems, which would make each
 * entry 16 bytes and no longer match an 8-bytes-per-entry layout.
 */
typedef union { long long p; uint32_t w[2]; } decision_t;
#define EXTRACT_DECISION(d,state) (((d)->w[(state)/32] >> ((state)%32)) & 1)

/* Branch tables, one entry per butterfly: 15 where the expected encoder
 * output symbol is a 1, 0 where it is a 0 (filled in by create_viterbi27)
 */
unsigned char Branchtab27_1[32] __attribute__ ((aligned(32)));
unsigned char Branchtab27_2[32] __attribute__ ((aligned(32)));
#endif

/* State info for instance of Viterbi decoder
 * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
 * NOTE(review): this file is the K=7 decoder, so the "29" in the assembly
 * file names above may be a copy-paste slip for "27" -- confirm.
 *
 * The two metric buffers come first; create_viterbi27() places the structure
 * on a 32-byte boundary so that both buffers are 32-byte aligned.
 */
struct v27 {
  unsigned char metrics1[64]; /* path metric buffer 1 */
  unsigned char metrics2[64]; /* path metric buffer 2 */
  decision_t *dp;              /* Pointer to decision output for current bit */
  unsigned char *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
  decision_t *decisions;       /* Beginning of decisions for block */
  void *alloc_blk;             /* Return value from malloc; this, not the struct address, is passed to free() */
};

/* Create a new instance of a Viterbi decoder */
void *create_viterbi27(int len){
  void *blk;
  struct v27 *vp;
  int state;

  if(!V27_init){
    /* Check that CPU has required features */
#if defined(SSE2)
    if(!(cpu_features() & (1 << 26))){
      fprintf(stderr,"viterbi27: CPU does not support SSE2 instructions\n");
      exit(1);
    }
#elif defined(SSE)
    if(!(cpu_features() & (1 << 25))){
      fprintf(stderr,"viterbi27: CPU does not support SSE instructions\n");
      exit(1);
    }
#elif defined(MMX)
    if(!(cpu_features() & (1 << 23))){
      fprintf(stderr,"viterbi27: CPU does not support MMX instructions\n");
      exit(1);
    }
#endif
    /* Initialize metric tables */
    for(state=0;state < 32;state++){
#ifdef MMX
      int symbol;
      for(symbol = 0;symbol < 16;symbol++){
	Mettab27_1[symbol][state] = parity((2*state) & V27POLYA) ? (15-symbol):symbol;
	Mettab27_2[symbol][state] = parity((2*state) & V27POLYB) ? (15-symbol):symbol;
      }
#else
      Branchtab27_1[state] = parity((2*state) & V27POLYA) ? 15:0;
      Branchtab27_2[state] = parity((2*state) & V27POLYB) ? 15:0;
#endif
    }
    V27_init = 1;
  }
  /* Malloc only guarantees 8-byte alignment, but we want to ensure that
   * the path metric arrays are on 32-byte boundaries. At least 16-byte
   * alignment is mandatory in the SSE2 version, but the Pentium III
   * cache line size is 32 bytes
   */
  blk = malloc(sizeof(struct v27)+32);
  if((int)blk & 31){
    /* Not on 32-byte boundary; shift up */
    vp = (struct v27 *)(((int)blk + 32) & ~31);
  } else {
    vp = (struct v27 *)blk;
  }
  vp->alloc_blk = blk; /* Record original pointer from malloc for use by free() */

  /* The decisions only need be 32-bit aligned */
#if defined(MMX)
  vp->dp = vp->decisions = malloc((len+6)*64);
#else
  vp->dp = vp->decisions = malloc((len+6)*8);
#endif

  vp->old_metrics = vp->metrics1;
  vp->new_metrics = vp->metrics2;
  return vp;
}

/* Prepare a decoder instance for the start of a new frame
 *
 * p is the handle returned by create_viterbi27(). The decision pointer is
 * rewound to the start of the decision buffer, metric buffer 1 becomes the
 * current ("old") metric set with every state set to 60, and the state
 * selected by the low 6 bits of starting_state is given metric 0 so that
 * it is favoured as the known start state.
 *
 * Always returns 0.
 */
int init_viterbi27(void *p,int starting_state){
  struct v27 *dec = p;
  unsigned char *current = dec->metrics1;

  /* Rewind output and select which buffer is read and which is written */
  dec->dp = dec->decisions;
  dec->new_metrics = dec->metrics2;
  dec->old_metrics = current;

  /* All states start equally unlikely, except the known start state */
  memset(current,60,sizeof dec->metrics1);
  current[starting_state & 63] = 0;
  return 0;
}

/* Viterbi chainback: walk the stored decisions backwards from the final
 * encoder state and write the decoded bits into data[]
 *
 * p        - handle returned by create_viterbi27()
 * data     - output buffer; byte nbits>>3 is written for each bit, so the
 *            buffer needs room for the bytes covering nbits bits
 * nbits    - number of data bits to recover
 * endstate - terminal encoder state (reduced modulo 64)
 *
 * Always returns 0.
 */
int chainback_viterbi27(
      void *p,
      unsigned char *data, /* Decoded output data */
      unsigned int nbits, /* Number of data bits */
      unsigned int endstate){ /* Terminal encoder state */

  struct v27 *dec = p;
  decision_t *tail_skipped = dec->decisions + 6; /* Look past tail */
  /* Keep the 6-bit state in the top of an 8-bit register so that a full
   * byte of decoded data accumulates below it as bits are shifted in
   */
  unsigned int reg = (endstate % 64) << 2;
  unsigned int bit;

  for(bit = nbits; bit-- != 0; ){
    int k = EXTRACT_DECISION(&tail_skipped[bit],reg >> 2);

    reg = (reg >> 1) | (k << 7);
    /* Storing on every bit rather than every eighth avoids a conditional
     * branch; the last store to each byte holds the finished value
     */
    data[bit >> 3] = reg;
  }
  return 0;
}

/* Destroy a decoder instance created by create_viterbi27()
 *
 * Releases the decision buffer and the block originally returned by malloc
 * (which may differ from p because of the alignment adjustment).
 * Passing NULL is allowed and does nothing.
 */
void delete_viterbi27(void *p){
  struct v27 *dec = p;

  if(dec == NULL)
    return;

  free(dec->decisions);
  free(dec->alloc_blk);
}
#if !defined(MMX) && !defined(SSE) && !defined(SSE2)
/* Portable C version */
/* C-language butterfly
 *
 * Processes old states i and i+32 and produces new states 2*i and 2*i+1.
 * For each new state the smaller of the two candidate path metrics is kept
 * and the matching decision bit is OR-ed into the current decision word
 * (state 2*i lives in word i/16, bit (2*i)&31).
 *
 * The decision is widened to unsigned long before shifting: as an unsigned
 * char it would be promoted to signed int, and shifting a 1 into bit 31 of
 * a signed int (which happens for i == 15 and i == 31) is undefined.
 *
 * Expects vp, sym1 and sym2 to be in scope at the point of use.
 */
#define BFLY(i) do {\
    unsigned char metric,m0,m1,decision;\
    metric = ((Branchtab27_1[i] ^ sym1) + (Branchtab27_2[i] ^ sym2) + 1)/2;\
    m0 = vp->old_metrics[i] + metric;\
    m1 = vp->old_metrics[(i)+32] + (15 - metric);\
    decision = (m0-m1) >= 0;\
    vp->new_metrics[2*(i)] = decision ? m1 : m0;\
    vp->dp->w[(i)/16] |= (unsigned long)decision << ((2*(i))&31);\
    m0 -= (metric+metric-15);\
    m1 += (metric+metric-15);\
    decision = (m0-m1) >= 0;\
    vp->new_metrics[2*(i)+1] = decision ? m1 : m0;\
    vp->dp->w[(i)/16] |= (unsigned long)decision << ((2*(i)+1)&31);\
} while(0)

/* Feed one pair of received symbols (one data bit) to the decoder
 *
 * p          - handle returned by create_viterbi27()
 * sym1, sym2 - the two received symbols for this bit; presumably 4-bit soft
 *              decisions in the range 0-15, matching the 0/15 entries of the
 *              branch tables -- confirm against the callers
 *
 * Runs all 32 butterflies, records the 64 decisions at vp->dp, advances
 * vp->dp and swaps the old/new metric buffers.
 *
 * Returns the amount subtracted from every path metric if the metrics were
 * renormalized on this call, otherwise 0.
 */
int update_viterbi27(void *p,unsigned char sym1,unsigned char sym2){
  struct v27 *vp = p;
  unsigned char *tmp;
  int normalize = 0;

  /* Decision bits are OR-ed in by the butterflies, so start from zero */
  vp->dp->w[0] = vp->dp->w[1] = 0;

  /* Kept unrolled so that every index and shift count is a constant */
  BFLY(0);
  BFLY(1);
  BFLY(2);
  BFLY(3);
  BFLY(4);
  BFLY(5);
  BFLY(6);
  BFLY(7);
  BFLY(8);
  BFLY(9);
  BFLY(10);
  BFLY(11);
  BFLY(12);
  BFLY(13);
  BFLY(14);
  BFLY(15);
  BFLY(16);
  BFLY(17);
  BFLY(18);
  BFLY(19);
  BFLY(20);
  BFLY(21);
  BFLY(22);
  BFLY(23);
  BFLY(24);
  BFLY(25);
  BFLY(26);
  BFLY(27);
  BFLY(28);
  BFLY(29);
  BFLY(30);
  BFLY(31);

  /* Renormalize metrics: once state 0 grows past 150, subtract the smallest
   * metric from all of them so the 8-bit values stay in range
   */
  if(vp->new_metrics[0] > 150){
    int i;
    unsigned char minmetric = 255;

    for(i=0;i<64;i++)
      if(vp->new_metrics[i] < minmetric)
        minmetric = vp->new_metrics[i];
    for(i=0;i<64;i++)
      vp->new_metrics[i] -= minmetric;
    normalize = minmetric;
  }

  /* Advance to the next decision slot and swap the metric buffers */
  vp->dp++;
  tmp = vp->old_metrics;
  vp->old_metrics = vp->new_metrics;
  vp->new_metrics = tmp;

  return normalize;
}
#endif

/* Execute the x86 "emms" instruction in the MMX and SSE builds; does nothing
 * in the other builds. (Per the Intel manuals, emms clears the MMX state so
 * that x87 floating point code can run afterwards.)
 *
 * __asm__ is used rather than asm because the plain keyword is not
 * recognised when compiling in a strict ISO C mode such as -std=c99.
 */
void emms_viterbi27(void){
#if defined(MMX) || defined(SSE)
  __asm__ __volatile__("emms");
#endif
}

/* Dump current decoder state for debugging
 *
 * Writes to stderr: the index of the current bit (number of decisions stored
 * so far), the 64 current path metrics, and the 64 decisions recorded for
 * the most recently processed bit. If no bit has been processed yet the
 * decisions line is left empty, since there is no previous entry to read.
 */
void debug_viterbi27(void *p){
  struct v27 *vp = p;
  int i,j;

  /* A pointer difference has type ptrdiff_t, which does not match %d on
   * 64-bit targets; convert explicitly
   */
  fprintf(stderr,"viterbi27 @ bit %d:\n",(int)(vp->dp - vp->decisions));
  fprintf(stderr,"metrics:");
  for(i=0;i<64;i++){
    fprintf(stderr," %3d",vp->old_metrics[i]);
  }
  fprintf(stderr,"\n");
  fprintf(stderr,"decisions: ");
  if(vp->dp > vp->decisions){
    /* vp->dp points at the slot for the next bit; the last one written is dp-1 */
    for(i=0;i<64;i++){
      j = EXTRACT_DECISION(vp->dp-1,i);
      fprintf(stderr,"%d",j);
    }
  }
  fprintf(stderr,"\n");
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */