/*******************************************************
                        PFTOOLS
 *******************************************************
  Sep 30, 2011 xali1_sse41.c
 *******************************************************
 (C) 2011 Swiss Institute of Bioinformatics
     Thierry Schuepbach (thierry.schuepbach@isb-sib.ch)
 *******************************************************/

#include <stdlib.h>
#include <inttypes.h> 
#include <smmintrin.h>
#include "profile.h"

/*
 * MAX(a,b): yields the larger of a and b.
 * Every argument and the whole expansion are parenthesized so the macro can be
 * embedded in a larger expression (e.g. 2 * MAX(x, y)) without precedence
 * surprises. NOTE: the selected argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
 
/*
 * Load one tuple of four signed 16-bit scores (64 bits) and sign-extend it to
 * four signed 32-bit lanes. Exactly 8 bytes are read from memory, so the tuple
 * needs neither 16-byte alignment nor 8 readable bytes behind it.
 */
static inline __m128i xali1_load_tuple_epi32(const void * const Tuple)
{
  return _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *) Tuple));
}

/*
 * xali1_sse41 - SSE 4.1 scan of a sequence against a profile, returning a score.
 *
 * Parameters:
 *   prf      profile providing the match / insertion alphabets, the transition
 *            scores and the first / last sequence protein tuples.
 *            Both profile loops are do/while loops and therefore always process
 *            profile position 1: the profile is expected to have Length >= 1.
 *   Sequence residues of the sequence; each byte is used as a column index into
 *            the Match and Insertion score rows.
 *   WORK     scratch memory holding two rows of (Match, Insertion) pairs.
 *            WARNING: for the SSE version, WORK should be 4 times
 *            (profile size + 1)*sizeof(int) + 63 bytes, the extra 63 bytes
 *            allowing the second row to be aligned on a 64 byte boundary.
 *   BSEQ     index of the first sequence position to process.
 *   LSEQ     one past the index of the last sequence position to process;
 *            position LSEQ-1 is scored with the LastSequenceProtein tuples.
 *   CutOff   score threshold used for the early exit.
 *   LOPT     when false, the function returns as soon as the running score
 *            reaches CutOff; when true the whole sequence is always processed.
 *
 * Returns the running best score (lScore). It starts at NLOW; NLOW is also what
 * is returned for an empty sequence (LSEQ == 0).
 *
 * Vector lane convention: the 32-bit lanes of the score vectors are addressed
 * with the MATCH, INSERTION, DELETION and DUMMY constants of profile.h.
 * StoreMatchInsertion (profile.h) is used to save the match and insertion
 * values of such a vector into a work row element.
 */
int xali1_sse41(const struct Profile * const restrict prf, const unsigned char * const restrict Sequence,
          int * const WORK, const size_t BSEQ, const size_t LSEQ, const int CutOff, const _Bool LOPT)
{
  int KOPD, lScore = (int) NLOW;

  /* Nothing to align: avoids LSEQ-1 wrapping around and reading Sequence[-1]. */
  if (LSEQ == 0) return lScore;

  const sIOP * restrict IOP_R;
  sIOP * restrict IOP_W = (sIOP*) WORK;

  const TransitionScores * const restrict Transitions = prf->Scores.Insertion.Transitions;
  const short int * const restrict Match = prf->Scores.Match.Alphabet;
  const short int * const restrict Insertion = prf->Scores.Insertion.Alphabet;
  const size_t AlignStep = prf->Scores.Match.AlignStep;

  /* NOTE: The following part could be replaced and performed only once for a profile as it
   *       is profile dependent. Nevertheless it does a good job loading Match and Transition
   *       matrices into the cache hierarchy.
   *
   * It fills the first work row, walking the profile along the deletion path.
   */
  {
    const short int * restrict lMatch = (const short int *) &Match[_D];
    const ScoreTuple * restrict FirstSequenceProtein = prf->Scores.Insertion.FirstSequenceProtein;

    IOP_W[0].Element.M = (int) FirstSequenceProtein[0].To[MATCH];
    IOP_W[0].Element.I = (int) FirstSequenceProtein[0].To[INSERTION];
    KOPD               = (int) FirstSequenceProtein[0].To[DELETION];
    FirstSequenceProtein++;

    const TransitionScores * restrict pTransitions = &Transitions[1];
    sIOP * restrict pIOP = &IOP_W[1];
    /* Counts up from -Length to 0; the body runs at least once. */
    int Remaining = - (int) prf->Length;

    do {
      const int KD = KOPD + (int) *lMatch;
      lMatch += AlignStep;

      /* Deletion transitions of this profile position, shifted by KD */
      const __m128i vFromDeletion = _mm_add_epi32(xali1_load_tuple_epi32(&(pTransitions->From[DELETION].mm)),
                                                  _mm_set1_epi32(KD));
      pTransitions++;

      /* Scores for starting the alignment at this profile position */
      const __m128i vFirst = xali1_load_tuple_epi32(&(FirstSequenceProtein[0].mm));
      FirstSequenceProtein++;

      /* Lane-wise maximum ( this is SSE 4.1 ) */
      const __m128i vBest = _mm_max_epi32(vFromDeletion, vFirst);

      /* Store IOPI and IOPM */
      StoreMatchInsertion( &(pIOP->mm), (__m128) vBest);
      pIOP++;

      /* The deletion lane feeds the next profile position ( this is SSE 4.1 ) */
      KOPD = _mm_extract_epi32(vBest, DELETION);

      Remaining++;
    } while (Remaining < 0);
  }

  /* The row just filled becomes the read row; the write row is the second half
   * of WORK, rounded up to a 64 byte boundary. */
  IOP_R = IOP_W;
  IOP_W = (sIOP*) (((uintptr_t) &WORK[2*(prf->Length+1)] + 63) & ~63);

  /* All sequence positions but the last one; LSEQ >= 1 is guaranteed here. */
  const size_t LastIndex = LSEQ - 1;
  for (size_t iseq = BSEQ; iseq < LastIndex; ++iseq) {
    const size_t j1 = (size_t) Sequence[iseq];
    int KOPM = IOP_R[0].Element.M;
    const short int * restrict lInsertion = Insertion;

    /* Profile position 0: only the insertion state and the EXTRA transitions contribute. */
    {
      const int KI = IOP_R[0].Element.I + (int) lInsertion[j1];

      /* Insertion transitions shifted by KI */
      const __m128i vFromInsertion = _mm_add_epi32(xali1_load_tuple_epi32(&(Transitions[0].From[INSERTION].mm)),
                                                   _mm_set1_epi32(KI));

      /* EXTRA transitions, with the running score placed in the DUMMY lane */
      __m128i vFromExtra = xali1_load_tuple_epi32(&(Transitions[0].From[EXTRA].mm));
      vFromExtra = _mm_insert_epi32(vFromExtra, lScore, DUMMY);

      /* Lane-wise maximum ( this is SSE 4.1 ) */
      const __m128i vBest = _mm_max_epi32(vFromInsertion, vFromExtra);

      /* Store IOPI and IOPM */
      StoreMatchInsertion( &(IOP_W[0].mm), (__m128) vBest);

      KOPD   = _mm_extract_epi32(vBest, DELETION);
      lScore = _mm_extract_epi32(vBest, DUMMY);
    }

    lInsertion += AlignStep;
    const short int * restrict lMatch = Match;

    size_t iprf = 1;
    do {
      /* Scores entering this cell from the match, insertion and deletion states */
      const int KM = KOPM                  + (int) lMatch[j1];
      const int KI = IOP_R[iprf].Element.I + (int) lInsertion[j1];
      const int KD = KOPD                  + (int) lMatch[_D];

      const __m128i vKM = _mm_set1_epi32(KM);
      const __m128i vKI = _mm_set1_epi32(KI);
      const __m128i vKD = _mm_set1_epi32(KD);

      /* Load the four transition tuples, converting signed WORD into signed DWORD */
      __m128i vFromMatch     = xali1_load_tuple_epi32(&(Transitions[iprf].From[MATCH].mm));
      __m128i vFromInsertion = xali1_load_tuple_epi32(&(Transitions[iprf].From[INSERTION].mm));
      __m128i vFromDeletion  = xali1_load_tuple_epi32(&(Transitions[iprf].From[DELETION].mm));
      __m128i vFromExtra     = xali1_load_tuple_epi32(&(Transitions[iprf].From[EXTRA].mm));

      /* Move to the score rows of the next profile position */
      lMatch     += AlignStep;
      lInsertion += AlignStep;

      vFromMatch     = _mm_add_epi32(vFromMatch, vKM);
      vFromInsertion = _mm_add_epi32(vFromInsertion, vKI);
      vFromDeletion  = _mm_add_epi32(vFromDeletion, vKD);
      /* The running score competes through the DUMMY lane */
      vFromExtra     = _mm_insert_epi32(vFromExtra, lScore, DUMMY);

      /* Match value of the previous row, needed by the next profile position */
      KOPM = IOP_R[iprf].Element.M;

      /* Lane-wise maximum of the four candidates ( this is SSE 4.1 ) */
      const __m128i vBest = _mm_max_epi32(_mm_max_epi32(vFromMatch, vFromInsertion),
                                          _mm_max_epi32(vFromDeletion, vFromExtra));

      /* Store IOPI and IOPM */
      StoreMatchInsertion( &IOP_W[iprf].mm, (__m128) vBest);

      /* Set KOPD and the running score ( this is SSE 4.1 ) */
      KOPD   = _mm_extract_epi32(vBest, DELETION);
      lScore = _mm_extract_epi32(vBest, DUMMY);

    } while (++iprf <= (size_t) prf->Length);

    /* Swap Read and Write pointers */
    sIOP * const Swap = IOP_W;
    IOP_W = (sIOP*) IOP_R;
    IOP_R = Swap;

    /* Early exit once the cutoff is reached, unless the caller disabled it with LOPT */
    if ( ! LOPT && lScore >= CutOff) return lScore;
  }

  /* Last sequence position: scalar code, scored with the LastSequenceProtein
   * tuples. No work row is written as nothing is read afterwards. */
  {
    const short int * restrict lInsertion = Insertion;
    const int j1 = (int) Sequence[LSEQ-1];
    int KOPM     = IOP_R[0].Element.M;
    int KI       = IOP_R[0].Element.I + (int) lInsertion[j1];

    KOPD   = MAX( KI + (int) Transitions[0].Element[_ID],      (int) Transitions[0].Element[_XD] );
    const ScoreTuple * const restrict LastSequenceProtein = prf->Scores.Insertion.LastSequenceProtein;
    lScore = MAX( lScore, KI + (int) LastSequenceProtein[0].From[INSERTION] );

    const short int * restrict lMatch = Match;
    lInsertion += AlignStep;

    for (int iprf=1; iprf<=prf->Length; ++iprf) {
      const int KM = KOPM                  + lMatch[j1];
      KI           = IOP_R[iprf].Element.I + lInsertion[j1];
      const int KD = KOPD                  + lMatch[_D];

      lMatch     += AlignStep;
      lInsertion += AlignStep;

      KOPM = IOP_R[iprf].Element.M;

      /* Best score reaching the deletion state of this profile position */
      const int tIOPD1 = MAX( KM + (int) Transitions[iprf].Element[_MD],      (int) Transitions[iprf].Element[_XD] );
      const int tIOPD2 = MAX( KI + (int) Transitions[iprf].Element[_ID], KD + (int) Transitions[iprf].Element[_DD] );
      KOPD             = MAX( tIOPD1, tIOPD2);

      /* Best score when the alignment ends at this profile position */
      const int tIOPT1 = MAX( KM + (int) LastSequenceProtein[iprf].From[MATCH], KI + (int) LastSequenceProtein[iprf].From[INSERTION] );
      const int tIOPT2 = MAX( lScore                                          , KD + (int) LastSequenceProtein[iprf].From[DELETION] );
      lScore           = MAX( tIOPT1, tIOPT2);
    }
  }
  return lScore;
}

/* Remove the MAX helper macro defined at the top of this file so that its
 * definition does not extend past this point. */
#undef MAX
