/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: SSE block w reexpansion of fibonacci             */
/**********************************************************/

#include <iostream>
#include <fstream>
#include <math.h>

#include "harness.h"
#include "block-sse.h"
#include "simd.h"
//#define NOSC

// Stride of the strip-mined loops in fib0/fib1/fib_expand_bf; the *_opt SIMD
// kernels consume this many 8-bit entries per call.
const int MY_SIMD_WIDTH = 16; //Set simd width
using namespace std;

// Incremented every time fib0/fib1 hand a small block over to fib_expand_bf.
// fib_expand_bf also reads it to print the buffer-size message only while it is still 0.
int dynamic_reexpand_count = 0;
// The block allocated by app_main; fib_expand_bf deletes it when it runs at
// depth 0 with g_is_partial still 0.
_Block * g_initial_block = NULL;//For memory release
// Set to 1 by fib0/fib1 when they trigger a re-expansion; while it is non-zero
// fib_expand_bf skips its "free old stack space" step.
int g_is_partial = 0;

/*
 * Reference recursion matching our language spec: instead of returning a
 * value, every leaf call (n equal to 0 or 1) adds one to *sum.
 *   n   : index to evaluate
 *   sum : accumulator, incremented once per leaf reached
 */
void fib(int n, int *sum) {
#ifdef BLOCK_PROFILE
  profiler.record_single();
#endif

  // Inner node: descend into both children and let the leaves do the counting.
  if (n != 0 && n != 1) {
    fib(n - 1, sum);
    fib(n - 2, sum);
    return;
  }

  // Leaf.
  ++*sum;
}

//int _expandDepth = 0;
// Block buffer size. Starts at D_MAX_BLOCK_SIZE; app_main may replace it with a
// power of two taken from the command line. fib0/fib1 switch to breadth-first
// re-expansion when a block holds at most _expandSize / 2 entries.
int _expandSize = D_MAX_BLOCK_SIZE;

// Forward declarations: fib0 calls both of these before their definitions appear.
void fib_expand_bf(class _BlockStack *_stack, int *_depth, int *sum);
int fib1(class _BlockStack *_stack,int _depth, int *sum);

/*
 * Scalar handling of one left-child entry (n0[_bi]) of _block:
 * a value of 0 or 1 adds one to *sum, anything else appends the pair
 * (value - 1, value - 2) to _nextBlock0.
 */
inline void process_point0(_Block *_block, _Block *_nextBlock0, int _bi, int *sum) {
  const int value = _block->n0[_bi];

  // Not a base case yet: queue both children for the next level.
  if (value != 0 && value != 1) {
    _nextBlock0->add(value - 1, value - 2);
    return;
  }

  *sum += 1;
}

/*
 * SIMD processing for left children; n is stored as 8-bit values in the block.
 *
 * Handles the 16 consecutive entries _block->n0[_si .. _si+15] in one step:
 *   - entries equal to 0 or 1 are base cases and each add one to *sum;
 *   - every other entry v contributes (v - 1, v - 2) to _nextBlock0.
 *
 * _block      : block being consumed (only n0 is read here)
 * _nextBlock0 : block receiving the children
 * _si         : index of the first of the 16 entries; the aligned load below
 *               requires &_block->n0[_si] to be 16-byte aligned
 * sum         : accumulator for finished base cases
 *
 * With NOSC defined the lanes are walked one by one; otherwise the surviving
 * lanes are compacted with a byte shuffle and stored with two vector stores.
 */
inline void process_simd0_opt(_Block *_block, _Block *_nextBlock0, int _si, int *sum) {
  __m128i vec_n = _mm_load_si128((__m128i*)&_block->n0[_si]);
  __m128i vec_1 = _mm_set1_epi8(1);
  __m128i vec_0 = _mm_setzero_si128();
  __m128i vec_n_eq_1 = _mm_cmpeq_epi8(vec_n, vec_1);
  __m128i vec_n_eq_0 = _mm_cmpeq_epi8(vec_n, vec_0);
  __m128i vec_cond = _mm_or_si128(vec_n_eq_1, vec_n_eq_0);
  // Bit i of mask is set when lane i holds 0 or 1, i.e. is a base case.
  int mask = _mm_movemask_epi8(vec_cond);

#ifdef NOSC
  //Sequential Processing
  // Lane 0 is handled before the loop; the loop shifts the mask first and then tests bit 0.
  if (mask & 1){
    *sum += 1;
  } else {
    char n = _block->n0[_si];
    _nextBlock0->add( n - 1, n - 2);
  }

  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    mask >>= 1;
    if (mask & 1){
      *sum += 1;
    } else {
      char n = _block->n0[_si + i];
      _nextBlock0->add(n - 1, n - 2);
    }
  }
#else// Streaming Compaction
  // tmp becomes the control vector for _mm_shuffle_epi8. It is assembled from two
  // 8-byte table entries, one per half of the 16-bit mask.
  // Presumably g_shuffletable[m] lists the lane indices whose bit in m is clear,
  // packed to the front, and g_advanceNextPtrCounts[m] is how many there are —
  // TODO confirm against the table definitions.
  __attribute__((aligned(16))) unsigned char tmp[16];
  unsigned index = 0;
  //do first 8
  // NOTE(review): writing through an __int64* into an unsigned char array relies
  // on the compiler tolerating this type pun.
  *((__int64*)tmp) = g_shuffletable[mask & 0x000000FF];
  index += g_advanceNextPtrCounts[mask & 0x000000FF];
  // now second 8
  // The upper-half entry is written right behind the lanes kept so far; adding 8 to
  // every byte turns its indices 0..7 into lanes 8..15.
  // Assumes index <= 8 here so the 8-byte write stays inside tmp — TODO confirm.
  *((__int64*)&tmp[index]) = 0x0808080808080808 + g_shuffletable[(mask & 0x0000FF00) >> 8];
  index += g_advanceNextPtrCounts[(mask & 0x0000FF00) >> 8];
  // fill rest with 0xFF
  // Everything from tmp[index] on is overwritten here, so leftover bytes of the two
  // table entries never reach the shuffle. A control byte with its top bit set
  // makes _mm_shuffle_epi8 produce 0 in that lane.
  memset(&tmp[index], 0xFF, 16 - index);

  __m128i vec_shuffleTable =  _mm_load_si128((const __m128i *) tmp);
  vec_n = _mm_shuffle_epi8(vec_n, vec_shuffleTable);

  // n - 1 goes to n0 and n - 2 to n1 of the next block. Both stores always write a
  // full 16 bytes starting at the current size, but size only advances by index,
  // so the bytes behind the kept lanes are not counted as entries.
  // Assumes n0/n1 have at least 16 bytes of room past size — TODO confirm.
  vec_n = _mm_sub_epi8(vec_n, vec_1);
  _mm_storeu_si128((__m128i*)&_nextBlock0->n0[_nextBlock0->size], vec_n);
  vec_n = _mm_sub_epi8(vec_n, vec_1);
  _mm_storeu_si128((__m128i*)&_nextBlock0->n1[_nextBlock0->size], vec_n);

  _nextBlock0->size += index;

  // The lanes that were not forwarded are counted as finished base cases.
  *sum += 16 -  index;
#endif
}


/*
 * Depth-first step over the left children (n0) of the block stored at _depth.
 * Keeping the traversal depth-first limits the memory consumption.
 *
 * The children produced here are collected in this level's nextBlock0, which
 * then becomes the block of level _depth + 1 and is processed recursively
 * (left children first, then right children).
 *
 * Returns 1 when the block was small enough (at most _expandSize / 2 entries)
 * to be handed to fib_expand_bf instead, 0 otherwise. fib_expand_bf processes
 * both the left and the right entries of the block, so a caller that sees 1
 * skips its own fib1 call for that level.
 */
int fib0(class _BlockStack *_stack,int _depth, int *sum) {
  class _BlockSet *level = _stack->get(_depth);
  class _Block *current = level->block;
  class _Block *children = &level->_BlockSet::nextBlock0;
  children->recycle();

  const int current_size = current->size;
  if (current_size <= _expandSize / 2) {
    // Do dynamic reexpansion
    ++dynamic_reexpand_count;
    g_is_partial = 1;
    fib_expand_bf(_stack, &_depth, sum);
    return 1;
  }

  // Strip-mined main loop: one full group of MY_SIMD_WIDTH entries per iteration.
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
#ifdef SIMD_NONE
    for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane) {
      process_point0(current, children, pos + lane, sum);
    }
#else
    process_simd0_opt(current, children, pos, sum);
#endif
    pos += MY_SIMD_WIDTH;
  }

  // Entries left over after the last full group.
#ifdef SIMD_ALL
  int valid_mask = get_valid_mask(pos, current->size);
  if (valid_mask == 1) {
    process_point0(current, children, pos, sum);
  } else if (valid_mask > 1) {
    process_simd(current, children, pos, valid_mask);
  }
#else
  for (; pos < current->size; ++pos) {
    process_point0(current, children, pos, sum);
  }
#endif

  if (children->_Block::size > 0) {
    _stack->get(_depth + 1)->_BlockSet::block = children;
    // A non-zero result means the next level was re-expanded breadth first,
    // which already covered its right children.
    if (!fib0(_stack, _depth + 1, sum)) {
      fib1(_stack, _depth + 1, sum);
    }
  }

  return 0;
}

/*
 * Scalar handling of one right-child entry (n1[_bi]) of _block:
 * a value of 0 or 1 adds one to *sum, anything else appends the pair
 * (value - 1, value - 2) to _nextBlock0.
 */
inline void process_point1(_Block *_block, _Block *_nextBlock0, int _bi, int *sum) {
  const int value = _block->n1[_bi];

  // Not a base case yet: queue both children for the next level.
  if (value != 0 && value != 1) {
    _nextBlock0->add(value - 1, value - 2);
    return;
  }

  *sum += 1;
}

/*
 * SIMD processing for right children; n is stored as 8-bit values in the block.
 *
 * Handles the 16 consecutive entries _block->n1[_si .. _si+15] in one step:
 *   - entries equal to 0 or 1 are base cases and each add one to *sum;
 *   - every other entry v contributes (v - 1, v - 2) to _nextBlock0.
 *
 * _block      : block being consumed (only n1 is read here)
 * _nextBlock0 : block receiving the children
 * _si         : index of the first of the 16 entries; the aligned load below
 *               requires &_block->n1[_si] to be 16-byte aligned
 * sum         : accumulator for finished base cases
 *
 * With NOSC defined the lanes are walked one by one; otherwise the surviving
 * lanes are compacted with a byte shuffle and stored with two vector stores.
 */
inline void process_simd1_opt(_Block *_block, _Block *_nextBlock0, int _si, int *sum) {
  __m128i vec_n = _mm_load_si128((__m128i*)&_block->n1[_si]);
  __m128i vec_1 = _mm_set1_epi8(1);
  __m128i vec_0 = _mm_setzero_si128();
  __m128i vec_n_eq_1 = _mm_cmpeq_epi8(vec_n, vec_1);
  __m128i vec_n_eq_0 = _mm_cmpeq_epi8(vec_n, vec_0);
  __m128i vec_cond = _mm_or_si128(vec_n_eq_1, vec_n_eq_0);
  // Bit i of mask is set when lane i holds 0 or 1, i.e. is a base case.
  int mask = _mm_movemask_epi8(vec_cond);

#ifdef NOSC
  //Sequential Processing
  // Lane 0 is handled before the loop; the loop shifts the mask first and then tests bit 0.
  if (mask & 1){
    *sum += 1;
  } else {
    char n = _block->n1[_si];
    _nextBlock0->add( n - 1, n - 2);
  }

  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    mask >>= 1;
    if (mask & 1){
      *sum += 1;
    } else {
      char n = _block->n1[_si + i];
      _nextBlock0->add(n - 1, n - 2);
    }
  }
#else//Streaming Compaction
  // tmp becomes the control vector for _mm_shuffle_epi8. It is assembled from two
  // 8-byte table entries, one per half of the 16-bit mask.
  // Presumably g_shuffletable[m] lists the lane indices whose bit in m is clear,
  // packed to the front, and g_advanceNextPtrCounts[m] is how many there are —
  // TODO confirm against the table definitions.
  __attribute__((aligned(16))) unsigned char tmp[16];
  unsigned index = 0;
  //do first 8
  // NOTE(review): writing through an __int64* into an unsigned char array relies
  // on the compiler tolerating this type pun.
  *((__int64*)tmp) = g_shuffletable[mask & 0x000000FF];
  index += g_advanceNextPtrCounts[mask & 0x000000FF];
  // now second 8
  // The upper-half entry is written right behind the lanes kept so far; adding 8 to
  // every byte turns its indices 0..7 into lanes 8..15.
  // Assumes index <= 8 here so the 8-byte write stays inside tmp — TODO confirm.
  *((__int64*)&tmp[index]) = 0x0808080808080808 + g_shuffletable[(mask & 0x0000FF00) >> 8];
  index += g_advanceNextPtrCounts[(mask & 0x0000FF00) >> 8];
  // fill rest with 0xFF
  // Everything from tmp[index] on is overwritten here, so leftover bytes of the two
  // table entries never reach the shuffle. A control byte with its top bit set
  // makes _mm_shuffle_epi8 produce 0 in that lane.
  memset(&tmp[index], 0xFF, 16 - index);

  __m128i vec_shuffleTable =  _mm_load_si128((const __m128i *) tmp);
  vec_n = _mm_shuffle_epi8(vec_n, vec_shuffleTable);

  // n - 1 goes to n0 and n - 2 to n1 of the next block. Both stores always write a
  // full 16 bytes starting at the current size, but size only advances by index,
  // so the bytes behind the kept lanes are not counted as entries.
  // Assumes n0/n1 have at least 16 bytes of room past size — TODO confirm.
  vec_n = _mm_sub_epi8(vec_n, vec_1);
  _mm_storeu_si128((__m128i*)&_nextBlock0->n0[_nextBlock0->size], vec_n);
  vec_n = _mm_sub_epi8(vec_n, vec_1);
  _mm_storeu_si128((__m128i*)&_nextBlock0->n1[_nextBlock0->size], vec_n);

  _nextBlock0->size += index;

  // The lanes that were not forwarded are counted as finished base cases.
  *sum += 16 -  index;
#endif
}

/*
 * Depth-first step over the right children (n1) of the block stored at _depth.
 * Keeping the traversal depth-first limits the memory consumption.
 *
 * The children produced here are collected in this level's nextBlock0, which
 * then becomes the block of level _depth + 1 and is processed recursively
 * (left children first, then right children).
 *
 * Returns 1 when the block was small enough (at most _expandSize / 2 entries)
 * to be handed to fib_expand_bf instead, 0 otherwise.
 */
int fib1(class _BlockStack *_stack,int _depth, int *sum) {
  class _BlockSet *level = _stack->get(_depth);
  class _Block *current = level->block;
  class _Block *children = &level->_BlockSet::nextBlock0;
  children->recycle();

  const int current_size = current->size;
  if (current_size <= _expandSize / 2) {
    // Do dynamic reexpansion
    ++dynamic_reexpand_count;
    g_is_partial = 1;
    fib_expand_bf(_stack, &_depth, sum);
    return 1;
  }

  // Strip-mined main loop: one full group of MY_SIMD_WIDTH entries per iteration.
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
#ifdef SIMD_NONE
    for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane) {
      process_point1(current, children, pos + lane, sum);
    }
#else
    process_simd1_opt(current, children, pos, sum);
#endif
    pos += MY_SIMD_WIDTH;
  }

  // Entries left over after the last full group.
#ifdef SIMD_ALL
  int valid_mask = get_valid_mask(pos, current->size);
  if (valid_mask == 1) {
    process_point1(current, children, pos, sum);
  } else if (valid_mask > 1) {
    process_simd(current, children, pos, valid_mask);
  }
#else
  for (; pos < current->size; ++pos) {
    process_point1(current, children, pos, sum);
  }
#endif

  if (children->_Block::size > 0) {
    _stack->get(_depth + 1)->_BlockSet::block = children;
    // A non-zero result means the next level was re-expanded breadth first,
    // which already covered its right children.
    if (!fib0(_stack, _depth + 1, sum)) {
      fib1(_stack, _depth + 1, sum);
    }
  }

  return 0;
}


/*
 * Breadth First execution to expand the number of tasks in software block.
 *
 * Processes first the left (n0) and then the right (n1) entries of the block
 * stored at *_depth. Every entry equal to 0 or 1 adds one to *sum; every other
 * entry puts its two children into this level's nextBlock0. Afterwards:
 *   - if nextBlock0 is non-empty but still holds at most _expandSize / 2
 *     entries, it becomes the block of the next level and is expanded again;
 *   - otherwise, if it is non-empty, the next level is handed to the
 *     depth-first pair fib0 / fib1.
 *
 * _stack : block stack holding one block set per level
 * _depth : in/out; incremented once for every level expanded here
 * sum    : accumulator for finished base cases
 */
void fib_expand_bf(class _BlockStack *_stack, int *_depth, int *sum){
  class _BlockSet *_set = _stack ->  get (*_depth);
  class _Block *_block = _set -> block;
  class _Block *_nextBlock0 = &_set -> _BlockSet::nextBlock0;
  _nextBlock0 ->  recycle ();

  //Add left
  int _si = 0;
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
#ifdef SIMD_NONE
    for (int _bi = _si; _bi < _si + MY_SIMD_WIDTH; _bi++) {
      process_point0(_block, _nextBlock0, _bi, sum);
    }
#else	// not SIMD_NONE
    process_simd0_opt(_block, _nextBlock0, _si, sum);
#endif // ifdef SIMD_NONE else
  }

#ifdef SIMD_ALL
  // The left and right passes share one scope, so each tail mask needs its own
  // name; declaring `int valid_mask` twice here does not compile.
  // NOTE(review): process_simd gets neither sum nor a left/right selector —
  // confirm this matches its declaration.
  int valid_mask_left = get_valid_mask(_si, _block->size);
  if (valid_mask_left == 1) {
    process_point0(_block, _nextBlock0, _si, sum);
  } else if (valid_mask_left > 1) {
    process_simd(_block, _nextBlock0, _si, valid_mask_left);
  }
#else	// not SIMD_ALL
  //cleanup code for stripmined loop
  for (int _bi = _si; _bi < _block->size; _bi++) {
    process_point0(_block, _nextBlock0, _bi, sum);
  }
#endif

  //Add right
  _si = 0;
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
#ifdef SIMD_NONE
    for (int _bi = _si; _bi < _si + MY_SIMD_WIDTH; _bi++) {
      process_point1(_block, _nextBlock0, _bi, sum);
    }
#else	// not SIMD_NONE
    process_simd1_opt(_block, _nextBlock0, _si, sum);
#endif // ifdef SIMD_NONE else
  }

#ifdef SIMD_ALL
  int valid_mask_right = get_valid_mask(_si, _block->size);
  if (valid_mask_right == 1) {
    process_point1(_block, _nextBlock0, _si, sum);
  } else if (valid_mask_right > 1) {
    process_simd(_block, _nextBlock0, _si, valid_mask_right);
  }
#else	// not SIMD_ALL
  //cleanup code for stripmined loop
  for (int _bi = _si; _bi < _block->size; _bi++) {
    process_point1(_block, _nextBlock0, _bi, sum);
  }
#endif

  //Free old stack space
  // Skipped once fib0/fib1 have set g_is_partial. _block is not touched again
  // below, so releasing its storage here is safe.
  if (!g_is_partial){
    if (!*_depth){
      delete g_initial_block;
      g_initial_block = NULL; // do not leave the global pointing at freed memory
    } else
    {
      _stack->release(*_depth - 1);
    }
  }

  int _nextblock0_size = _nextBlock0 -> _Block::size;
  *_depth += 1;
  if (_nextblock0_size > 0 && _nextblock0_size <= _expandSize/2) {
    // Still too few tasks: expand one more level breadth first.
    _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
    fib_expand_bf(_stack, _depth, sum);
  } else { //Reach the buffer size, or finish all evaluation
    if (!dynamic_reexpand_count){// only print for the first time
      cout << "This is the max block buffer size for dfs: " << _nextblock0_size << endl;
    }
    if (_nextblock0_size){
      _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
      fib0(_stack, *_depth, sum);
      fib1(_stack, *_depth, sum);
    }
  }
}

/*
 * Benchmark entrance called by harness.
 *
 * argv[0] : n, the Fibonacci index to evaluate
 * argv[1] : optional exponent k; the block buffer size becomes 2^k
 *
 * Prints the resulting sum and returns 0. Exits with status 1 on bad usage
 * (wrong argument count or an exponent too large for int) and with status 0
 * when n is smaller than 2.
 */
int app_main(int argc, char **argv) {
  if (argc != 1 && argc != 2) {
    cout << "usage: fibonacci [n] or fibonacci [n] [buffer_size, pow(2, k)]" << endl;
    exit(1);
  }

  int n = atoi(argv[0]);
  if (argc == 2) {
    // 2^k is computed with an integer shift. Converting pow(2.0, k) to int is
    // undefined once the value no longer fits, so the exponent is range checked.
    // Assumes 8-bit bytes when deriving the largest usable exponent.
    const int max_exponent = static_cast<int>(sizeof(int) * 8) - 2;
    const int exponent = atoi(argv[1]);
    if (exponent > max_exponent) {
      cout << "buffer_size exponent k is too large, the maximum is " << max_exponent << endl;
      exit(1);
    }
    // A negative exponent yields 0, which is what truncating pow(2.0, k) gave.
    _expandSize = (exponent < 0) ? 0 : (1 << exponent);
  }
  int sum = 0;

  Harness::start_timing();
  //_expandDepth = Harness::get_splice_depth();

  //Initialize software block stack
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);
  class _BlockStack * _stack = new _BlockStack;
  class _Block * _block = new _Block;
  g_initial_block = _block;

  // Anything below 2 has no children to seed the first block with; this also
  // keeps negative input away from the 8-bit block storage.
  if (n < 2){
    cout << "Please input a larger number rather than 0 or 1 ..." << endl;
    exit(0);
  }
  // NOTE(review): the SIMD kernels compare entries as 8-bit values and sum is an
  // int, so large n looks unsupported — confirm the intended input range.

  _block->add(n - 1, n - 2);
  int _depth = 0;
  _stack->get (_depth) -> block = _block;

  //Start to execute blocked fib
  if (_expandSize >= 2) fib_expand_bf(_stack, &_depth, &sum);
  else{
    // Buffer too small for breadth-first expansion: run purely depth first.
    int df_block_size = _stack->get(_depth)->block->size;
    cout << "This is the max block buffer size for dfs: " << df_block_size << endl;

    if (df_block_size){
      fib0(_stack, _depth, &sum);
      fib1(_stack, _depth, &sum);
    }
  }

  delete _stack;
  // On the breadth-first path fib_expand_bf has already deleted the initial block.
  if (_expandSize < 2) delete _block;

  Harness::stop_timing();

#ifdef BLOCK_PROFILE
  profiler.output();
#endif

  cout << sum << endl;

#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is the total number of new operations for block: " << total_malloc << endl;
#endif
  return 0;
}


