/* K=9 r=1/2 Viterbi decoder with Intel SIMD
 * May 2001, Phil Karn, KA9Q
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include "viterbi29.h"
#include "parity.h"

/* Declared here, defined outside this file. create_viterbi29() tests
 * bits 23, 25 and 26 of the returned value in the MMX, SSE and SSE2 builds.
 */
int cpu_features(void);

/* Nonzero once create_viterbi29() has filled in the metric tables */
static int V29_init;

/* Human-readable identification of the build variant. SSE2 takes
 * precedence over SSE, which takes precedence over MMX.
 */
#if defined(SSE2)
#define V29_FLAVOR "SSE2"
#elif defined(SSE)
#define V29_FLAVOR "SSE"
#elif defined(MMX)
#define V29_FLAVOR "MMX"
#else
#define V29_FLAVOR "portable C"
#endif
char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, " V29_FLAVOR " version";
#undef V29_FLAVOR

#if defined(MMX)

/* MMX build: one decision byte per state (256 states). Only bit 0 of
 * each byte is examined when a decision is extracted.
 */
typedef union { long long p; char c[256]; } decision_t;
#define EXTRACT_DECISION(d,state) ((d)->c[(state)] & 1)

/* Combined tables used by mmxbfly */
unsigned char Mettab29_1[16][128] __attribute__ ((aligned(32)));
unsigned char Mettab29_2[16][128] __attribute__ ((aligned(32)));

#else

/* All other builds: one decision bit per state, packed into eight 32-bit
 * words (256 bits = 32 bytes per decoded bit). A fixed-width type is used
 * because "unsigned long" is 64 bits on LP64 targets, which would double
 * the size of each entry while the indexing below still assumes 32 bits
 * per word.
 */
typedef union { long long p; uint32_t w[8]; } decision_t;
#define EXTRACT_DECISION(d,state) (((d)->w[(state)/32] >> ((state)%32)) & 1)

/* Symbol branch table used by ssebfly and sse2bfly (and by the
 * C-language butterfly in the portable build)
 */
unsigned char Branchtab29_1[128] __attribute__ ((aligned(32)));
unsigned char Branchtab29_2[128] __attribute__ ((aligned(32)));

#endif

/* State info for instance of Viterbi decoder
 * Don't change this without also changing references in (mmx|sse|sse2)bfly29.s!
 *
 * create_viterbi29() places this structure on a 32-byte boundary inside a
 * slightly larger malloc'ed block, so the two metric buffers at the start
 * of the structure are 32-byte aligned as well.
 */
struct v29 {
  unsigned char metrics1[256]; /* path metric buffer 1, one byte per state */
  unsigned char metrics2[256]; /* path metric buffer 2, one byte per state */
  decision_t *dp;              /* Pointer to decision output for current bit; reset to
                                * 'decisions' by init_viterbi29() */
  unsigned char *old_metrics,*new_metrics; /* Pointers to path metrics (metrics1/metrics2), swapped on every bit */
  decision_t *decisions;       /* Beginning of decisions for block; separately malloc'ed,
                                * released by delete_viterbi29() */
  void *alloc_blk;             /* Return value from malloc for the block holding this
                                * structure; this, not the structure address, is what gets freed */
};


/* Create a new instance of a Viterbi decoder */
void *create_viterbi29(int len){
  void *blk;
  struct v29 *vp;
  int state;

  if(!V29_init){
#if defined(SSE2)
    if(!(cpu_features() & (1 << 26))){
      fprintf(stderr,"viterbi29: CPU does not support SSE2 instructions\n");
      exit(1);
    }
#elif defined(SSE)
    if(!(cpu_features() & (1 << 25))){
      fprintf(stderr,"viterbi29: CPU does not support SSE instructions\n");
      exit(1);
    }
#elif defined(MMX)
    if(!(cpu_features() & (1 << 23))){
      fprintf(stderr,"viterbi29: CPU does not support MMX instructions\n");
      exit(1);
    }
#endif
    /* Initialize metric tables */
    for(state=0;state < 128;state++){
#if defined(MMX)
      int symbol;
      for(symbol = 0;symbol < 16;symbol++){
	Mettab29_1[symbol][state] = parity((2*state) & V29POLYA) ? (15-symbol):symbol;
	Mettab29_2[symbol][state] = parity((2*state) & V29POLYB) ? (15-symbol):symbol;
      }
#else
      Branchtab29_1[state] = parity((2*state) & V29POLYA) ? 15:0;
      Branchtab29_2[state] = parity((2*state) & V29POLYB) ? 15:0;
#endif
    }
    V29_init = 1;
  }
  /* Malloc only guarantees 8-byte alignment, but we want to ensure that
   * the path metric arrays are on 32-byte boundaries. At least 16-byte
   * alignment is mandatory in the SSE2 version, but the Pentium III
   * cache line size is 32 bytes
   */
  blk = malloc(sizeof(struct v29)+32);
  if((int)blk & 31){
    /* Not on 32-byte boundary; shift up */
    vp = (struct v29 *)(((int)blk + 32) & ~31);
  } else {
    vp = (struct v29 *)blk;
  }
  vp->alloc_blk = blk; /* Record original pointer from malloc for use by free() */

  /* The decisions only need be 32-bit aligned */
#if defined(MMX)
  vp->dp = vp->decisions = malloc((len+8)*256);
#else
  vp->dp = vp->decisions = malloc((len+8)*32);
#endif

  vp->old_metrics = vp->metrics1;
  vp->new_metrics = vp->metrics2;
  return vp;
}

/* Initialize Viterbi decoder for start of new frame.
 *
 * p is a handle from create_viterbi29(); starting_state is the known
 * initial encoder state (only its low 8 bits are used).
 *
 * Every path metric in buffer 1 is set to 60 except the starting state,
 * which is set to 0 so that paths leaving it are favoured. The decision
 * pointer is rewound to the start of the decision buffer.
 *
 * Returns 0 on success, -1 if p is NULL.
 */
int init_viterbi29(void *p,int starting_state){
  struct v29 *vp = p;

  if(vp == NULL)
    return -1;

  memset(vp->metrics1,60,sizeof vp->metrics1);
  vp->old_metrics = vp->metrics1;
  vp->new_metrics = vp->metrics2;
  vp->dp = vp->decisions;
  vp->old_metrics[starting_state & 255] = 0; /* Bias known start state */
  return 0;
}

/* Do Viterbi chainback.
 *
 * Walks the stored decisions backwards from endstate and writes the
 * decoded bits into data[], packed eight per byte with the earliest bit
 * of each byte in the most significant position. data must have room for
 * (nbits+7)/8 bytes.
 *
 * Returns 0 on success, -1 if p is NULL.
 */
int chainback_viterbi29(
      void *p,
      unsigned char *data, /* Decoded output data */
      unsigned int nbits, /* Number of data bits */
      unsigned int endstate){ /* Terminal encoder state */

  struct v29 *vp = p;
  int k;
  decision_t *decisions;

  if(vp == NULL)
    return -1;

  /* The encoder state is 8 bits wide, which is already a full byte of
   * decoded data, so it only needs to be reduced modulo the number of
   * states.
   */
  endstate %= 256;
  decisions = vp->decisions + 8; /* Look past tail */

  while(nbits-- != 0){
    k = EXTRACT_DECISION(&decisions[nbits],endstate);
    /* Step back to the predecessor state: the decision bit becomes the
     * new top bit, so endstate always holds the 8 most recently
     * recovered bits.
     *
     * The store into data[] only needs to be done every 8 bits.
     * But this avoids a conditional branch, and the writes will
     * combine in the cache anyway
     */
    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
  }
  return 0;
}

/* Destroy a decoder instance obtained from create_viterbi29().
 * Passing NULL is allowed and does nothing.
 */
void delete_viterbi29(void *p){
  struct v29 *inst = p;

  if(inst == NULL)
    return;

  /* The instance itself lives inside alloc_blk, so the decision buffer
   * has to be released while the structure is still readable.
   */
  free(inst->decisions);
  free(inst->alloc_blk);
}

#if !defined(MMX) && !defined(SSE) && !defined(SSE2)

/* C-language butterfly.
 *
 * Processes old states i and i+128, which are the two predecessors of new
 * states 2*i and 2*i+1. For each new state the smaller candidate metric
 * survives and the choice is recorded as one bit in the current decision
 * word (bit set = the path from state i+128 was chosen).
 *
 * Expects vp, sym1 and sym2 to be in scope at the point of use. The
 * decision bit is widened to unsigned long before shifting because the
 * shift count reaches 31, which is undefined for a (promoted) signed int.
 */
#define BFLY(i) do {\
    unsigned char metric,m0,m1,decision;\
    metric = ((Branchtab29_1[(i)] ^ sym1) + (Branchtab29_2[(i)] ^ sym2) + 1)/2;\
    m0 = vp->old_metrics[(i)] + metric;\
    m1 = vp->old_metrics[(i)+128] + (15 - metric);\
    decision = (m0-m1) >= 0;\
    vp->new_metrics[2*(i)] = decision ? m1 : m0;\
    vp->dp->w[(i)/16] |= (unsigned long)decision << ((2*(i))&31);\
    m0 -= (metric+metric-15);\
    m1 += (metric+metric-15);\
    decision = (m0-m1) >= 0;\
    vp->new_metrics[2*(i)+1] = decision ? m1 : m0;\
    vp->dp->w[(i)/16] |= (unsigned long)decision << ((2*(i)+1)&31);\
} while(0)

/* Update the decoder with one received symbol pair (portable C version).
 *
 * sym1 and sym2 are the two received soft symbols; the branch tables hold
 * 0 or 15, so values in the range 0..15 are expected.
 *
 * Runs all 128 butterflies to produce the 256 new path metrics and the
 * decisions for this bit, renormalizes the metrics if they have grown
 * large, advances the decision pointer and swaps the metric buffers.
 *
 * Returns the amount subtracted from every metric by renormalization, or
 * 0 if no renormalization was done on this call.
 */
int update_viterbi29(void *p,unsigned char sym1,unsigned char sym2){
  int i;
  struct v29 *vp = p;
  unsigned char *tmp;
  int normalize = 0;

  /* Decision bits are OR'ed in below, so start from all zeros */
  for(i=0;i<8;i++)
    vp->dp->w[i] = 0;

  for(i=0;i<128;i++)
    BFLY(i);

  /* Renormalize metrics. All 256 states must be searched and adjusted;
   * touching only a subset would distort the metrics relative to each
   * other.
   */
  if(vp->new_metrics[0] > 150){
    unsigned char minmetric = 255;

    for(i=0;i<256;i++)
      if(vp->new_metrics[i] < minmetric)
	minmetric = vp->new_metrics[i];
    for(i=0;i<256;i++)
      vp->new_metrics[i] -= minmetric;
    normalize = minmetric;
  }

  vp->dp++;
  tmp = vp->old_metrics;
  vp->old_metrics = vp->new_metrics;
  vp->new_metrics = tmp;

  return normalize;
}
#endif

/* Execute the x86 "emms" instruction in the MMX and SSE builds.
 * Compiles to an empty function in every other configuration.
 */
void emms_viterbi29(void){
#if defined(SSE) || defined(MMX)
  asm("emms");
#endif
}



/* syntax highlighted by Code2HTML, v. 0.9.1 */