/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: SSE4 block without reexpansion of parentheses    */
/**********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <algorithm>
#include <iostream>
#include <fstream>

#include "harness.h"
#include "block-sse.h"
#include "simd.h"

// Optional block profiling. In this file `profiler` is only used under
// BLOCK_PROFILE: record_single() in the scalar parentheses() and output() at
// the end of app_main().
#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
BlockProfiler profiler(16);  // NOTE(review): 16 presumably matches the SIMD width -- confirm in blockprofiler.h
#endif

// Traversal work counter; it is not referenced by any code visible in this file.
#ifdef TRACK_TRAVERSALS
uint64_t work = 0;
#endif

//int _expandDepth = 0;
// Maximum software block buffer size. Defaults to D_MAX_BLOCK_SIZE and is
// overwritten by app_main() when a second argument is supplied. It bounds the
// breadth-first expansion in parentheses_expand_bf().
long long _expandSize = D_MAX_BLOCK_SIZE;
// The first block allocated by app_main(). parentheses_expand_bf() deletes it
// after the block at depth 0 has been expanded.
_Block * g_initial_block = NULL;

using namespace std;

/* Scalar reference version of the traversal.
 *
 * Starting from (l, r), each step either increments l or increments r.
 * A path is counted (*num += 1) as soon as l reaches n, and is abandoned
 * as soon as l < r.
 *
 * l, r : current counters
 * n    : value of l at which a path is counted
 * num  : accumulator that receives the number of counted paths
 */
void parentheses(int l, int r, int n, int* num){
  // The "increment r" branch is a tail call, so it is written as a loop
  // that advances r; only the "increment l" branch recurses.
  for (;; ++r) {
#ifdef BLOCK_PROFILE
    profiler.record_single();
#endif
    if (l == n) {
      ++*num;
      return;
    }

    if (r > l) {
      return;
    }

    parentheses(l + 1, r, n, num);
  }
}


// Forward declaration: parentheses0() and parentheses1() call each other.
void parentheses1(int n, _BlockStack* _stack, int* num, int _depth);

/* Scalar processing of one left-child entry (l0/r0) of a block.
 *
 * n           : value of l at which the entry is counted
 * _block      : block that holds the entry
 * _nextBlock0 : block that receives the entry's two children
 * _bi         : index of the entry inside _block
 * num         : accumulator for counted entries
 */
inline void process_point0(int n, _Block *_block, _Block *_nextBlock0, int _bi, int *num) {
  const int left = _block->l0[_bi];
  const int right = _block->r0[_bi];

  // Finished entry: count it and produce no children.
  if (left == n) {
    ++*num;
    return;
  }

  // Entries with left < right are dropped; all others spawn both children.
  if (left >= right) {
    _nextBlock0->add(left + 1, right, left, right + 1);
  }
}

/* SIMD (SSE, 16 lanes of 8 bits) processing of 16 consecutive left-child
 * entries (l0/r0) of _block, starting at index _si.
 *
 * Per lane: if l == n the lane is counted in *num; if l < r (signed 8-bit
 * compare) the lane is dropped; otherwise its two children (l + 1, r) and
 * (l, r + 1) are appended to _nextBlock0.
 *
 * n is broadcast as an 8-bit value (_mm_set1_epi8), so it has to fit in a
 * char. The block is read with aligned loads, so &l0[_si] and &r0[_si] must
 * be 16-byte aligned.
 */
inline void process_simd0_opt(int n, _Block *_block, _Block *_nextBlock0, int _si, int *num) {
  __m128i vec_l = _mm_load_si128((__m128i*)&_block->l0[_si]);
  __m128i vec_r = _mm_load_si128((__m128i*)&_block->r0[_si]);
  __m128i vec_n = _mm_set1_epi8(n);

  __m128i vec_l_eq_n = _mm_cmpeq_epi8(vec_l, vec_n);
  __m128i vec_l_lt_r = _mm_cmplt_epi8(vec_l, vec_r);
  __m128i vec_cond = _mm_or_si128(vec_l_eq_n, vec_l_lt_r);
  // mask_inc: bit i is set when lane i reached l == n (lane must be counted).
  int mask_inc = _mm_movemask_epi8(vec_l_eq_n);
  // mask: bit i is set when lane i is finished (counted or dropped) and
  // therefore produces no children.
  int mask = _mm_movemask_epi8(vec_cond);

#ifdef NOSC
  //Sequential Processing
  // Scalar fallback: walk the 16 lanes one by one using the two bit masks.
  for (int i = 0; i < MY_SIMD_WIDTH; ++i){
    int mask_tmp = (mask >> i) & 1;
    int mask_inc_tmp = (mask_inc >> i) & 1;
    if (mask_tmp){
      if (mask_inc_tmp){
        *num += 1;
      }
      continue;
    }

    char l = _block->l0[_si + i];
    char r = _block->r0[_si + i];
    _nextBlock0->add(l + 1, r, l, r + 1);
  }
#else // Streaming Compaction
  // Build a 16-byte shuffle control in tmp that moves the surviving lanes to
  // the front of the vector.
  // NOTE(review): g_shuffletable and g_advanceNextPtrCounts are defined
  // elsewhere. Presumably, for an 8-bit mask m, g_shuffletable[m] packs the
  // byte indices of the lanes whose bit in m is clear, and
  // g_advanceNextPtrCounts[m] is the number of such lanes -- confirm against
  // their definitions.
  __attribute__((aligned(16))) unsigned char tmp[16];
  unsigned index = 0;
  //do first 8
  *((__int64*)tmp) = g_shuffletable[mask & 0x000000FF];
  index += g_advanceNextPtrCounts[mask & 0x000000FF];
  // now second 8
  // Adding 0x08 to every byte rebases the table's indices onto lanes 8..15.
  // The 8 bytes are written directly behind the `index` bytes kept from the
  // first half (index <= 8 here, so the write stays inside tmp).
  *((__int64*)&tmp[index]) = 0x0808080808080808 + g_shuffletable[(mask & 0x0000FF00) >> 8];
  index += g_advanceNextPtrCounts[(mask & 0x0000FF00) >> 8];
  // fill rest with 0xFF
  // A control byte with its high bit set makes _mm_shuffle_epi8 write 0 to
  // that lane.
  memset(&tmp[index], 0xFF, 16 - index);

  __m128i vec_shuffleTable =  _mm_load_si128((const __m128i *) tmp);
  vec_l = _mm_shuffle_epi8(vec_l, vec_shuffleTable);
  vec_r = _mm_shuffle_epi8(vec_r, vec_shuffleTable);

  // Each store writes a full 16 bytes at offset `size` (unaligned store), but
  // size only advances by `index`, so only the first `index` lanes become part
  // of the block. The destination arrays therefore need 16 writable bytes
  // starting at `size`.
  // Unchanged l goes to l1 and unchanged r goes to r0 ...
  _mm_storeu_si128((__m128i*)&_nextBlock0->l1[_nextBlock0->size], vec_l);
  _mm_storeu_si128((__m128i*)&_nextBlock0->r0[_nextBlock0->size], vec_r);

  // ... while l + 1 goes to l0 and r + 1 goes to r1, giving the children
  // (l0, r0) = (l + 1, r) and (l1, r1) = (l, r + 1).
  __m128i vec_1 = _mm_set1_epi8(1);
  vec_l = _mm_add_epi8(vec_l, vec_1);
  vec_r = _mm_add_epi8(vec_r, vec_1);
  _mm_storeu_si128((__m128i*)&_nextBlock0->l0[_nextBlock0->size], vec_l);
  _mm_storeu_si128((__m128i*)&_nextBlock0->r1[_nextBlock0->size], vec_r);

  _nextBlock0->size += index;

  // Under the table assumption noted above, this adds the number of set bits
  // in mask_inc, i.e. the number of lanes with l == n.
  *num += 16 - g_advanceNextPtrCounts[mask_inc & 0x000000FF] - g_advanceNextPtrCounts[(mask_inc & 0x0000FF00) >> 8];

#endif
}

/* Depth-first step over the left-child entries (l0/r0) of the block stored
 * at stack level _depth. Depth-first order keeps memory consumption bounded.
 *
 * The children of every surviving entry are collected in this level's
 * nextBlock0. If any were produced, that block becomes the block of level
 * _depth + 1, and first its left and then its right children are processed
 * recursively.
 *
 * n      : value of l at which an entry is counted
 * _stack : per-level block storage
 * num    : accumulator for counted entries
 * _depth : stack level whose block is processed
 */
void parentheses0(int n, _BlockStack* _stack, int* num, int _depth){
  class _BlockSet* level = _stack->get(_depth);
  class _Block* current = level->block;
  class _Block* children = &level->nextBlock0;
  children->recycle();

  // Full groups of MY_SIMD_WIDTH entries.
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
#ifdef SIMD_NONE
    for (int k = 0; k < MY_SIMD_WIDTH; ++k) {
      process_point0(n, current, children, pos + k, num);
    }
#else	// not SIMD_NONE
    process_simd0_opt(n, current, children, pos, num);
#endif // ifdef SIMD_NONE else
    pos += MY_SIMD_WIDTH;
  }

  // Leftover entries that do not fill a whole group.
  for (; pos < current->size; ++pos) {
    process_point0(n, current, children, pos, num);
  }

  // Nothing survived: this branch of the traversal is finished.
  if (children->size <= 0) {
    return;
  }

  _stack->get(_depth + 1)->block = children;
  parentheses0(n, _stack, num, _depth + 1);
  parentheses1(n, _stack, num, _depth + 1);
}

/* Scalar processing of one right-child entry (l1/r1) of a block.
 *
 * n           : value of l at which the entry is counted
 * _block      : block that holds the entry
 * _nextBlock0 : block that receives the entry's two children
 * _bi         : index of the entry inside _block
 * num         : accumulator for counted entries
 */
inline void process_point1(int n, _Block *_block, _Block *_nextBlock0, int _bi, int *num) {
  const int left = _block->l1[_bi];
  const int right = _block->r1[_bi];

  // Finished entry: count it and produce no children.
  if (left == n) {
    ++*num;
    return;
  }

  // Entries with left < right are dropped; all others spawn both children.
  if (left >= right) {
    _nextBlock0->add(left + 1, right, left, right + 1);
  }
}


/* SIMD (SSE, 16 lanes of 8 bits) processing of 16 consecutive right-child
 * entries (l1/r1) of _block, starting at index _si.
 *
 * Per lane: if l == n the lane is counted in *num; if l < r (signed 8-bit
 * compare) the lane is dropped; otherwise its two children (l + 1, r) and
 * (l, r + 1) are appended to _nextBlock0.
 *
 * n is broadcast as an 8-bit value (_mm_set1_epi8), so it has to fit in a
 * char. The block is read with aligned loads, so &l1[_si] and &r1[_si] must
 * be 16-byte aligned.
 */
inline void process_simd1_opt(int n, _Block *_block, _Block *_nextBlock0, int _si, int *num) {
  __m128i vec_l = _mm_load_si128((__m128i*)&_block->l1[_si]);
  __m128i vec_r = _mm_load_si128((__m128i*)&_block->r1[_si]);
  __m128i vec_n = _mm_set1_epi8(n);

  __m128i vec_l_eq_n = _mm_cmpeq_epi8(vec_l, vec_n);
  __m128i vec_l_lt_r = _mm_cmplt_epi8(vec_l, vec_r);
  __m128i vec_cond = _mm_or_si128(vec_l_eq_n, vec_l_lt_r);
  // mask_inc: bit i is set when lane i reached l == n (lane must be counted).
  int mask_inc = _mm_movemask_epi8(vec_l_eq_n);
  // mask: bit i is set when lane i is finished (counted or dropped) and
  // therefore produces no children.
  int mask = _mm_movemask_epi8(vec_cond);

#ifdef NOSC
  //Sequential Processing
  // Scalar fallback: walk the 16 lanes one by one using the two bit masks.
  for (int i = 0; i < MY_SIMD_WIDTH; ++i){
    int mask_tmp = (mask >> i) & 1;
    int mask_inc_tmp = (mask_inc >> i) & 1;
    if (mask_tmp){
      if (mask_inc_tmp){
        *num += 1;
      }
      continue;
    }

    char l = _block->l1[_si + i];
    char r = _block->r1[_si + i];
    _nextBlock0->add(l + 1, r, l, r + 1);
  }
#else // Streaming Compaction
  // Build a 16-byte shuffle control in tmp that moves the surviving lanes to
  // the front of the vector.
  // NOTE(review): g_shuffletable and g_advanceNextPtrCounts are defined
  // elsewhere. Presumably, for an 8-bit mask m, g_shuffletable[m] packs the
  // byte indices of the lanes whose bit in m is clear, and
  // g_advanceNextPtrCounts[m] is the number of such lanes -- confirm against
  // their definitions.
  __attribute__((aligned(16))) unsigned char tmp[16];
  unsigned index = 0;
  //do first 8
  *((__int64*)tmp) = g_shuffletable[mask & 0x000000FF];
  index += g_advanceNextPtrCounts[mask & 0x000000FF];
  // now second 8
  // Adding 0x08 to every byte rebases the table's indices onto lanes 8..15.
  // The 8 bytes are written directly behind the `index` bytes kept from the
  // first half (index <= 8 here, so the write stays inside tmp).
  *((__int64*)&tmp[index]) = 0x0808080808080808 + g_shuffletable[(mask & 0x0000FF00) >> 8];
  index += g_advanceNextPtrCounts[(mask & 0x0000FF00) >> 8];
  // fill rest with 0xFF
  // A control byte with its high bit set makes _mm_shuffle_epi8 write 0 to
  // that lane.
  memset(&tmp[index], 0xFF, 16 - index);

  __m128i vec_shuffleTable =  _mm_load_si128((const __m128i *) tmp);
  vec_l = _mm_shuffle_epi8(vec_l, vec_shuffleTable);
  vec_r = _mm_shuffle_epi8(vec_r, vec_shuffleTable);

  // Each store writes a full 16 bytes at offset `size` (unaligned store), but
  // size only advances by `index`, so only the first `index` lanes become part
  // of the block. The destination arrays therefore need 16 writable bytes
  // starting at `size`.
  // Unchanged l goes to l1 and unchanged r goes to r0 ...
  _mm_storeu_si128((__m128i*)&_nextBlock0->l1[_nextBlock0->size], vec_l);
  _mm_storeu_si128((__m128i*)&_nextBlock0->r0[_nextBlock0->size], vec_r);

  // ... while l + 1 goes to l0 and r + 1 goes to r1, giving the children
  // (l0, r0) = (l + 1, r) and (l1, r1) = (l, r + 1).
  __m128i vec_1 = _mm_set1_epi8(1);
  vec_l = _mm_add_epi8(vec_l, vec_1);
  vec_r = _mm_add_epi8(vec_r, vec_1);
  _mm_storeu_si128((__m128i*)&_nextBlock0->l0[_nextBlock0->size], vec_l);
  _mm_storeu_si128((__m128i*)&_nextBlock0->r1[_nextBlock0->size], vec_r);

  _nextBlock0->size += index;

  // Under the table assumption noted above, this adds the number of set bits
  // in mask_inc, i.e. the number of lanes with l == n.
  *num += 16 - g_advanceNextPtrCounts[mask_inc & 0x000000FF] - g_advanceNextPtrCounts[(mask_inc & 0x0000FF00) >> 8];

#endif

}

/* Depth-first step over the right-child entries (l1/r1) of the block stored
 * at stack level _depth. Depth-first order keeps memory consumption bounded.
 *
 * The children of every surviving entry are collected in this level's
 * nextBlock0. If any were produced, that block becomes the block of level
 * _depth + 1, and first its left and then its right children are processed
 * recursively.
 *
 * n      : value of l at which an entry is counted
 * _stack : per-level block storage
 * num    : accumulator for counted entries
 * _depth : stack level whose block is processed
 */
void parentheses1(int n, _BlockStack* _stack, int* num, int _depth){
  class _BlockSet* level = _stack->get(_depth);
  class _Block* current = level->block;
  class _Block* children = &level->nextBlock0;
  children->recycle();

  // Full groups of MY_SIMD_WIDTH entries.
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
#ifdef SIMD_NONE
    for (int k = 0; k < MY_SIMD_WIDTH; ++k) {
      process_point1(n, current, children, pos + k, num);
    }
#else	// not SIMD_NONE
    process_simd1_opt(n, current, children, pos, num);
#endif // ifdef SIMD_NONE else
    pos += MY_SIMD_WIDTH;
  }

  // Leftover entries that do not fill a whole group.
  for (; pos < current->size; ++pos) {
    process_point1(n, current, children, pos, num);
  }

  // Nothing survived: this branch of the traversal is finished.
  if (children->size <= 0) {
    return;
  }

  _stack->get(_depth + 1)->block = children;
  parentheses0(n, _stack, num, _depth + 1);
  parentheses1(n, _stack, num, _depth + 1);
}

/* Breadth-first expansion used to grow the number of entries in the software
 * block before the depth-first phase starts.
 *
 * Each round processes both the left (l0/r0) and the right (l1/r1) entries
 * of the current level's block into that level's nextBlock0, frees the
 * storage of the previous level, and installs nextBlock0 as the block of the
 * next level. Rounds repeat while the new block is non-empty and no larger
 * than _expandSize / 2.
 *
 * n      : value of l at which an entry is counted
 * _stack : per-level block storage
 * num    : accumulator for counted entries
 * _depth : in/out; on return it is the level whose block holds the entries
 *          left for the depth-first phase (possibly none)
 */
void parentheses_expand_bf(int n, _BlockStack* _stack, int* num, int* _depth){
  for (;;) {
    class _BlockSet* level = _stack->get(*_depth);
    class _Block* current = level->block;
    class _Block* children = &level->nextBlock0;
    children->recycle();

    // Left children: full groups first, then the leftover entries.
    int pos = 0;
    while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
#ifdef SIMD_NONE
      for (int k = 0; k < MY_SIMD_WIDTH; ++k) {
        process_point0(n, current, children, pos + k, num);
      }
#else	// not SIMD_NONE
      process_simd0_opt(n, current, children, pos, num);
#endif // ifdef SIMD_NONE else
      pos += MY_SIMD_WIDTH;
    }
    for (; pos < current->size; ++pos) {
      process_point0(n, current, children, pos, num);
    }

    // Right children: same scheme, appended to the same output block.
    pos = 0;
    while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
#ifdef SIMD_NONE
      for (int k = 0; k < MY_SIMD_WIDTH; ++k) {
        process_point1(n, current, children, pos + k, num);
      }
#else	// not SIMD_NONE
      process_simd1_opt(n, current, children, pos, num);
#endif // ifdef SIMD_NONE else
      pos += MY_SIMD_WIDTH;
    }
    for (; pos < current->size; ++pos) {
      process_point1(n, current, children, pos, num);
    }

    // The block just consumed is no longer needed: at depth 0 it is the
    // separately allocated initial block, otherwise it lives in the previous
    // level's storage.
    if (*_depth == 0) {
      delete g_initial_block;
    } else {
      _stack->release(*_depth - 1);
    }

    const int produced = children->size;
    *_depth += 1;
    _stack->get(*_depth)->block = children;

    // Stop when everything has been evaluated or the buffer limit is reached.
    if (!(produced > 0 && produced <= _expandSize / 2)) {
      return;
    }
  }
}

/* Benchmark entry point called by the harness.
 *
 * argv[0] : n, the problem size (the value of l at which a path is counted).
 *           Values below 2 are rejected.
 * argv[1] : optional k; the maximum block buffer size is set to 2^k.
 *
 * Expands the initial block breadth-first while it fits the block buffer,
 * finishes the remaining entries depth-first, and prints the resulting count.
 * Returns 0; calls exit() directly on invalid arguments.
 *
 * NOTE(review): the SIMD kernels broadcast n as an 8-bit value, so n above
 * CHAR_MAX presumably cannot produce a meaningful result; this is not
 * checked here.
 */
int app_main(int argc, char** argv){
  if (argc != 1 && argc != 2) {
    cout << "usage: parentheses [n] or parentheses [n] [buffer_size, pow(2, k)]" << endl;
    exit(1);
  }


  int n = atoi(argv[0]);
  if (argc == 2) {
    // Compute 2^k with integer arithmetic. A shift by more than 62 does not
    // fit in a signed 64-bit value, so such a k is rejected. A negative k
    // yields 0 (2^k truncated toward zero), which skips the breadth-first
    // expansion below.
    const int k = atoi(argv[1]);
    if (k > 62) {
      cout << "usage: parentheses [n] or parentheses [n] [buffer_size, pow(2, k)]" << endl;
      exit(1);
    }
    _expandSize = (k < 0) ? 0 : (1LL << k);
  }
  int num = 0;

  Harness::start_timing();
  //_expandDepth = Harness::get_splice_depth();

  //Initialize software block stack
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);
  class _BlockStack * _stack = new _BlockStack;
  class _Block * _block = new _Block;
  g_initial_block = _block;

  // Reject every n below 2, including negative values: entries start with
  // l >= 0 and l only increases, so a negative n would not be reached by
  // normal counting and the traversal would not stop where intended.
  if (n < 2){
    cout << "Please input a larger number rather than 0 or 1 ..." << endl;
    exit(0);
  }

  // Seed entry: left child (l, r) = (1, 0), right child (l, r) = (0, 1).
  _block->add(1, 0, 0, 1);
  int _depth = 0;
  _stack->get (_depth) -> block = _block;

  //Start to execute blocked parentheses 
  if (_expandSize >= 2) parentheses_expand_bf(n, _stack, &num, &_depth);
  int df_block_size = _stack->get(_depth)->block->size;
  cout << "This is the max block buffer size for dfs: " << df_block_size << endl;

  if (df_block_size){
    parentheses0(n, _stack, &num, _depth);
    parentheses1(n, _stack, &num, _depth);
  }

  delete _stack;
  // When the breadth-first expansion ran it already deleted the initial
  // block; otherwise it is still owned here.
  if (_expandSize < 2) delete _block;

  Harness::stop_timing();

#ifdef BLOCK_PROFILE
  profiler.output();
  //profiler.outputBlockInfo();
#endif

  cout << num << endl;

#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is total malloc counts: " << total_malloc << endl;
#endif
  return 0;
}
