/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: MIC block w reexpansion of binomial              */
/**********************************************************/

#include <math.h>
#include <iostream>
#include <fstream>

#include "harness.h"
#include "block-mic.h"
#include "simd.h"

#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
// Profiler instance used by the record()/record_single()/output() calls in
// this file when BLOCK_PROFILE is defined.
// NOTE(review): the constructor argument 16 presumably mirrors the SIMD width
// (MY_SIMD_WIDTH) -- confirm against blockprofiler.h.
BlockProfiler profiler(16);
#endif

// Incremented every time binomial_block/binomial_block1 hand a small block
// back to binomial_expand_bf; printed at the end of app_main.
int dynamic_reexpand_count = 0;
// Root block allocated by app_main. binomial_expand_bf deletes it when it runs
// at depth 0 with g_is_partial still 0.
_Block * g_initial_block = NULL;//For memory release
// Set to 1 by binomial_block/binomial_block1 right before a dynamic
// re-expansion; while set, binomial_expand_bf skips its "free old stack space"
// step.
int g_is_partial = 0;

using namespace std;

// Number of 32-bit lanes handled per SIMD step (one 512-bit vector).
const int MY_SIMD_WIDTH = 16;//Set SIMD width

/*
 * Pseudo tail recursive binomial matching our language spec.
 *
 * Walks the recursion tree of C(n, k) and adds 1 to *num for every leaf
 * (k == 0 or k == n). Out-of-range arguments (k < 0 or k > n) contribute
 * nothing.
 *
 *   n, k - arguments of the binomial coefficient
 *   num  - running leaf counter, updated in place
 */
void binomial(node_t n, node_t k, int *num) {
#ifdef BLOCK_PROFILE
  profiler.record_single();
#endif

  // Leaf: counts exactly once. This test must stay ahead of the range test,
  // exactly as in the if/else-if chain it replaces.
  if (k == 0 || k == n) {
    ++(*num);
    return;
  }

  // Outside the triangle: nothing to count and nothing to expand.
  if (k < 0 || k > n) {
    return;
  }

  // Interior node: descend into both children.
  binomial(n - 1, k - 1, num);
  binomial(n - 1, k, num);
}

//int _expandDepth = 0;
// Block buffer size used throughout this file: app_main copies it into
// _Block::max_block and Harness::set_block_size(), and a block whose size is
// <= _expandSize / 2 is (re)expanded breadth-first instead of depth-first.
// Defaults to D_MAX_BLOCK_SIZE; app_main overrides it with 2^i when a third
// argument is supplied.
long long _expandSize = D_MAX_BLOCK_SIZE;

/*
 * Sequential processing for left children: handles entry _bi of _block.
 *
 * The stored (n, k) pair is tested as-is:
 *   - k == 0 or k == n : leaf, *num is incremented;
 *   - k < 0  or k > n  : out of range, ignored;
 *   - otherwise        : the entry is forwarded with _nextBlock0->add().
 */
inline void process_point(_Block *_block, _Block *_nextBlock0, int _bi, int *num) {
  const node_t cur_n = _block->n[_bi];
  const node_t cur_k = _block->k[_bi];

  if (cur_k == 0 || cur_k == cur_n) {
    ++(*num);
    return;
  }

  if (cur_k < 0 || cur_k > cur_n) {
    return;
  }

  _nextBlock0->add(_block, _bi);
}

/*
 * simd processing for left children, we store n in int in the block.
 *
 * Vector counterpart of process_point(): classifies MY_SIMD_WIDTH consecutive
 * entries of _block starting at _si, adds the number of leaves
 * (k == 0 || k == n) to *num, and appends every non-leaf, in-range entry to
 * _nextBlock0 as (n - 1, k), advancing _nextBlock0->size accordingly.
 *
 *   _block      - block being read
 *   _nextBlock0 - block receiving the surviving entries
 *   _si         - index of the first lane in _block
 *   num         - running leaf counter, updated in place
 *   rem_work    - number of live lanes starting at _si (default: a full
 *                 vector). Lanes at or beyond rem_work are neither counted nor
 *                 scattered; previously this argument was ignored, so the
 *                 SIMD_ALL tail call also consumed whatever lay past the end
 *                 of the block.
 *
 * The two loads are always a full vector wide, so the n/k arrays must be
 * readable for MY_SIMD_WIDTH ints from _si even when rem_work is smaller.
 * NOTE(review): _mm512_load_epi32 is the aligned load -- presumably the block
 * arrays are 64-byte aligned and _si is a multiple of 16; confirm in
 * block-mic.h.
 */
inline void process_simd_opt(_Block *_block, _Block *_nextBlock0, int _si, int *num, int rem_work = MY_SIMD_WIDTH) {
  __m512i vec_n = _mm512_load_epi32((__m512i*)&_block->n[_si]);
  __m512i vec_k = _mm512_load_epi32((__m512i*)&_block->k[_si]);
  __m512i vec_0 = _mm512_setzero_epi32();
  __m512i vec_1 = _mm512_set1_epi32(1);

  // cond1: leaf lanes that contribute 1 to the result.
  __mmask16 mask_k_eq_0 = _mm512_cmp_epi32_mask(vec_k, vec_0, _MM_CMPINT_EQ);
  __mmask16 mask_k_eq_n = _mm512_cmp_epi32_mask(vec_k, vec_n, _MM_CMPINT_EQ);
  __mmask16 mask_cond1 = _mm512_kor(mask_k_eq_0, mask_k_eq_n);

  // cond2: out-of-range lanes that are dropped without counting.
  __mmask16 mask_k_lt_0 = _mm512_cmp_epi32_mask(vec_k, vec_0, _MM_CMPINT_LT);
  __mmask16 mask_k_gt_n = _mm512_cmp_epi32_mask(vec_k, vec_n, _MM_CMPINT_GT);
  __mmask16 mask_cond2 = _mm512_kor(mask_k_lt_0, mask_k_gt_n);
  __mmask16 mask_is_leaf = _mm512_kor(mask_cond1, mask_cond2);

  // Restrict all further work to the first rem_work lanes. With the default
  // rem_work this mask is all ones and the behaviour is unchanged.
  __mmask16 mask_valid = 0xFFFF;
  if (rem_work <= 0) {
    mask_valid = 0;
  } else if (rem_work < MY_SIMD_WIDTH) {
    mask_valid = static_cast<__mmask16>((1u << rem_work) - 1u);
  }
  const __mmask16 mask_count = static_cast<__mmask16>(mask_cond1 & mask_valid);
  const __mmask16 mask_non_leaf = static_cast<__mmask16>(~mask_is_leaf & mask_valid);

#ifdef SEQSC
  // Scalar exclusive prefix sum over the survivor flags gives each surviving
  // lane its slot offset inside _nextBlock0.
  __declspec(align(64)) int tmp[MY_SIMD_WIDTH] = {0};
  __declspec(align(64)) int tmp_non_leaf[MY_SIMD_WIDTH] = {0};
  _mm512_mask_store_epi32(tmp_non_leaf, mask_non_leaf, vec_1);

  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    tmp[i] = tmp[i-1] + tmp_non_leaf[i-1];
  }

  __m512i vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), _mm512_load_epi32(tmp));

  vec_n = _mm512_sub_epi32(vec_n, vec_1);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->n, mask_non_leaf, vec_index, vec_n, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->k, mask_non_leaf, vec_index, vec_k, 4);
  // Total survivors = last prefix value + last flag.
  int adv = tmp[MY_SIMD_WIDTH - 1] + tmp_non_leaf[MY_SIMD_WIDTH - 1];
  _nextBlock0->size += adv;

  *num += _mm512_mask_reduce_add_epi32(mask_count, vec_1);

#else //no SEQSC
  // Table-driven prefix sum: the survivor mask is split into two bytes and
  // each byte selects a precomputed row of g_scantable.
  // NOTE(review): g_scantable is defined elsewhere; from its use here, entry
  // [7] of a row looks like the number of set bits among bits 0..6 of the row
  // index, and the upper eight ints of a row appear to be all-ones so that the
  // AND below merges both halves -- confirm against the table definition.
  unsigned short low_8, high_8;
  low_8 = mask_non_leaf & 0x00FF;
  high_8 = (mask_non_leaf & 0xFF00) >> 8;

  __m512i vec_index = _mm512_load_epi32(g_scantable[low_8]);
  __m512i vec_index_1 = _mm512_load_epi32(g_scantable[high_8]);

  // Survivors in the low byte: they offset every slot of the high byte.
  __m512i vec_index_offset = _mm512_set1_epi32(g_scantable[low_8][7] + ((mask_non_leaf >> 7) & 0x1));

  vec_index_1 = _mm512_mask_add_epi32(vec_index_1, 0x00FF, vec_index_1, vec_index_offset);
  // Swap the 256-bit halves so the high-byte offsets sit in lanes 8..15.
  vec_index_1 = _mm512_permute4f128_epi32(vec_index_1, _MM_PERM_BADC);
  vec_index = _mm512_and_epi32(vec_index, vec_index_1);

  vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), vec_index);

  vec_n = _mm512_sub_epi32(vec_n, vec_1);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->n, mask_non_leaf, vec_index, vec_n, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->k, mask_non_leaf, vec_index, vec_k, 4);
  int adv = g_scantable[low_8][7] + g_scantable[high_8][7] + ((mask_non_leaf >> 7) & 0x1) + ((mask_non_leaf >> 15) & 0x1);
  _nextBlock0->size += adv;
  *num += _mm512_mask_reduce_add_epi32(mask_count, vec_1);
#endif // SEQSC

}

// Forward declarations: binomial_block, binomial_block1 and
// binomial_expand_bf call each other, so the latter two must be visible
// before binomial_block is defined.

// Breadth-first expansion; takes the depth by pointer and advances it.
void binomial_expand_bf(class _BlockStack *_stack, int* _depth, int *num);

// Depth-first step for right children; returns 1 when it re-expanded instead.
int binomial_block1(class _BlockStack *_stack,int _depth, int *num);

/*Depth First execution of left children to limit the memory consumption*/
int binomial_block(class _BlockStack *_stack,int _depth, int *num) {
  class _BlockSet *_set = _stack ->  get (_depth);
  class _Block *_block = _set -> block;
  class _Block *_nextBlock0 = &_set -> _BlockSet::nextBlock0;
  _nextBlock0 ->  recycle ();
#ifdef BLOCK_PROFILE
  profiler.record(_block->size, _depth);
#endif

  int _block_size = _block->size;
  if (_block_size <= _expandSize / 2){//Do dynamic reexpansion
    dynamic_reexpand_count++;
    g_is_partial = 1;
    binomial_expand_bf(_stack, &_depth, num);
    return 1;
  } else {
    int _si = 0;
    for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
#ifdef SIMD_NONE
      for (int _bi = _si; _bi < _si + MY_SIMD_WIDTH; _bi++) {
        process_point(_block, _nextBlock0, _bi, num);
      }
#else	// not SIMD_NONE
      process_simd_opt(_block, _nextBlock0, _si, num);
#endif // ifdef SIMD_NONE else
    }

#ifdef SIMD_ALL
    int rem_work = _block->size - _si;
    if (rem_work == 1) {
      process_point(_block, _nextBlock0, _si, num);
    } else if (rem_work > 1) {
      process_simd_opt(_block, _nextBlock0, _si, num, rem_work);
    }
#else	// not SIMD_ALL
    //cleanup code for stripmined loop
    for (int _bi = _si; _bi < _block->size; _bi++) {
      process_point(_block, _nextBlock0, _bi, num);
    }
#endif
    if (_nextBlock0 -> _Block::size > 0) {
      _stack ->  get (_depth + 1) -> _BlockSet::block = _nextBlock0;
      int skip = 0;
      skip = binomial_block1(_stack, _depth + 1, num);
      if (!skip)
        binomial_block(_stack, _depth + 1, num);

    }
  }

  return 0;
}

/*
 * Sequential processing for right children: handles entry _bi of _block.
 *
 * Same classification as process_point(), but applied to (n, k - 1):
 *   - k - 1 == 0 or k - 1 == n : leaf, *num is incremented;
 *   - k - 1 < 0  or k - 1 > n  : out of range, ignored;
 *   - otherwise                : the entry is forwarded with
 *                                _nextBlock0->add1().
 */
inline void process_point1(_Block *_block, _Block *_nextBlock0, int _bi, int *num) {
  const node_t cur_n = _block->n[_bi];
  const node_t right_k = _block->k[_bi] - 1;

  if (right_k == 0 || right_k == cur_n) {
    ++(*num);
    return;
  }

  if (right_k < 0 || right_k > cur_n) {
    return;
  }

  _nextBlock0->add1(_block, _bi);
}

/*
 * simd processing for right children, we store n in int in the block.
 *
 * Vector counterpart of process_point1(): classifies MY_SIMD_WIDTH consecutive
 * entries of _block starting at _si using (n, k - 1), adds the number of
 * leaves (k - 1 == 0 || k - 1 == n) to *num, and appends every non-leaf,
 * in-range entry to _nextBlock0 as (n - 1, k - 1), advancing
 * _nextBlock0->size accordingly.
 *
 *   _block      - block being read
 *   _nextBlock0 - block receiving the surviving entries
 *   _si         - index of the first lane in _block
 *   num         - running leaf counter, updated in place
 *   rem_work    - number of live lanes starting at _si (default: a full
 *                 vector). Lanes at or beyond rem_work are neither counted nor
 *                 scattered; previously this argument was ignored, so the
 *                 SIMD_ALL tail call also consumed whatever lay past the end
 *                 of the block.
 *
 * The two loads are always a full vector wide, so the n/k arrays must be
 * readable for MY_SIMD_WIDTH ints from _si even when rem_work is smaller.
 * NOTE(review): _mm512_load_epi32 is the aligned load -- presumably the block
 * arrays are 64-byte aligned and _si is a multiple of 16; confirm in
 * block-mic.h.
 */
inline void process_simd1_opt(_Block *_block, _Block *_nextBlock0, int _si, int *num, int rem_work = MY_SIMD_WIDTH) {
  __m512i vec_n = _mm512_load_epi32((__m512i*)&_block->n[_si]);
  __m512i vec_k = _mm512_load_epi32((__m512i*)&_block->k[_si]);
  __m512i vec_0 = _mm512_setzero_epi32();
  __m512i vec_1 = _mm512_set1_epi32(1);
  // Right child: evaluate with k - 1.
  vec_k = _mm512_sub_epi32(vec_k, vec_1);

  // cond1: leaf lanes that contribute 1 to the result.
  __mmask16 mask_k_eq_0 = _mm512_cmp_epi32_mask(vec_k, vec_0, _MM_CMPINT_EQ);
  __mmask16 mask_k_eq_n = _mm512_cmp_epi32_mask(vec_k, vec_n, _MM_CMPINT_EQ);
  __mmask16 mask_cond1 = _mm512_kor(mask_k_eq_0, mask_k_eq_n);

  // cond2: out-of-range lanes that are dropped without counting.
  __mmask16 mask_k_lt_0 = _mm512_cmp_epi32_mask(vec_k, vec_0, _MM_CMPINT_LT);
  __mmask16 mask_k_gt_n = _mm512_cmp_epi32_mask(vec_k, vec_n, _MM_CMPINT_GT);
  __mmask16 mask_cond2 = _mm512_kor(mask_k_lt_0, mask_k_gt_n);
  __mmask16 mask_is_leaf = _mm512_kor(mask_cond1, mask_cond2);

  // Restrict all further work to the first rem_work lanes. With the default
  // rem_work this mask is all ones and the behaviour is unchanged.
  __mmask16 mask_valid = 0xFFFF;
  if (rem_work <= 0) {
    mask_valid = 0;
  } else if (rem_work < MY_SIMD_WIDTH) {
    mask_valid = static_cast<__mmask16>((1u << rem_work) - 1u);
  }
  const __mmask16 mask_count = static_cast<__mmask16>(mask_cond1 & mask_valid);
  const __mmask16 mask_non_leaf = static_cast<__mmask16>(~mask_is_leaf & mask_valid);

#ifdef SEQSC
  // Scalar exclusive prefix sum over the survivor flags gives each surviving
  // lane its slot offset inside _nextBlock0.
  __declspec(align(64)) int tmp[MY_SIMD_WIDTH] = {0};
  __declspec(align(64)) int tmp_non_leaf[MY_SIMD_WIDTH] = {0};
  _mm512_mask_store_epi32(tmp_non_leaf, mask_non_leaf, vec_1);

  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    tmp[i] = tmp[i-1] + tmp_non_leaf[i-1];
  }

  __m512i vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), _mm512_load_epi32(tmp));

  vec_n = _mm512_sub_epi32(vec_n, vec_1);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->n, mask_non_leaf, vec_index, vec_n, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->k, mask_non_leaf, vec_index, vec_k, 4);
  // Total survivors = last prefix value + last flag.
  int adv = tmp[MY_SIMD_WIDTH - 1] + tmp_non_leaf[MY_SIMD_WIDTH - 1];
  _nextBlock0->size += adv;

  *num += _mm512_mask_reduce_add_epi32(mask_count, vec_1);

#else //no SEQSC
  // Table-driven prefix sum: the survivor mask is split into two bytes and
  // each byte selects a precomputed row of g_scantable.
  // NOTE(review): g_scantable is defined elsewhere; from its use here, entry
  // [7] of a row looks like the number of set bits among bits 0..6 of the row
  // index, and the upper eight ints of a row appear to be all-ones so that the
  // AND below merges both halves -- confirm against the table definition.
  unsigned short low_8, high_8;
  low_8 = mask_non_leaf & 0x00FF;
  high_8 = (mask_non_leaf & 0xFF00) >> 8;

  __m512i vec_index = _mm512_load_epi32(g_scantable[low_8]);
  __m512i vec_index_1 = _mm512_load_epi32(g_scantable[high_8]);

  // Survivors in the low byte: they offset every slot of the high byte.
  __m512i vec_index_offset = _mm512_set1_epi32(g_scantable[low_8][7] + ((mask_non_leaf >> 7) & 0x1));

  vec_index_1 = _mm512_mask_add_epi32(vec_index_1, 0x00FF, vec_index_1, vec_index_offset);
  // Swap the 256-bit halves so the high-byte offsets sit in lanes 8..15.
  vec_index_1 = _mm512_permute4f128_epi32(vec_index_1, _MM_PERM_BADC);
  vec_index = _mm512_and_epi32(vec_index, vec_index_1);

  vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), vec_index);

  vec_n = _mm512_sub_epi32(vec_n, vec_1);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->n, mask_non_leaf, vec_index, vec_n, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->k, mask_non_leaf, vec_index, vec_k, 4);
  int adv = g_scantable[low_8][7] + g_scantable[high_8][7] + ((mask_non_leaf >> 7) & 0x1) + ((mask_non_leaf >> 15) & 0x1);
  _nextBlock0->size += adv;
  *num += _mm512_mask_reduce_add_epi32(mask_count, vec_1);
#endif // SEQSC
}

/*Depth First execution of right children to limit the memory consumption*/
int binomial_block1(class _BlockStack *_stack,int _depth, int *num) {
  class _BlockSet *_set = _stack ->  get (_depth);
  class _Block *_block = _set -> block;
  class _Block *_nextBlock0 = &_set -> _BlockSet::nextBlock0;
  _nextBlock0 ->  recycle ();
#ifdef BLOCK_PROFILE
  profiler.record(_block->size, _depth);
#endif

  int _block_size = _block->size;
  if (_block_size <= _expandSize / 2){//Do dynamic reexpansion
    dynamic_reexpand_count++;
    g_is_partial = 1;
    binomial_expand_bf(_stack, &_depth, num);
    return 1;
  } else {
    int _si = 0;
    for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
#ifdef SIMD_NONE
      for (int _bi = _si; _bi < _si + MY_SIMD_WIDTH; _bi++) {
        process_point1(_block, _nextBlock0, _bi, num);
      }
#else	// not SIMD_NONE
      process_simd1_opt(_block, _nextBlock0, _si, num, MY_SIMD_WIDTH);
#endif // ifdef SIMD_NONE else
    }

#ifdef SIMD_ALL
    int rem_work = _block->size - _si;
    if (rem_work == 1) {
      process_point1(_block, _nextBlock0, _si, num);
    } else if (rem_work > 1) {
      process_simd1_opt(_block, _nextBlock0, _si, num, rem_work);
    }
#else	// not SIMD_ALL
    //cleanup code for stripmined loop
    for (int _bi = _si; _bi < _block->size; _bi++) {
      process_point1(_block, _nextBlock0, _bi, num);
    }
#endif

    if (_nextBlock0 -> _Block::size > 0) {
      _stack ->  get (_depth + 1) -> _BlockSet::block = _nextBlock0;
      int skip = 0;
      skip = binomial_block1(_stack, _depth + 1, num);
      if (!skip)
        binomial_block(_stack, _depth + 1, num);
    }
  }

  return 0;
}


/*
 * Breadth First execution to expand the number of tasks in software block.
 *
 * Runs BOTH the left-child and the right-child kernels over the block stored
 * at *_depth, so the level's nextBlock0 receives up to two entries per input
 * entry. *_depth is then advanced by one and:
 *   - if the new block is non-empty and still holds at most _expandSize / 2
 *     entries, the expansion repeats on it;
 *   - otherwise, if it is non-empty, execution switches to the depth-first
 *     routines binomial_block1 / binomial_block.
 *
 *   _stack - per-depth block sets
 *   _depth - in/out: level to expand; incremented once per expansion step
 *   num    - running leaf counter, updated in place
 */
void binomial_expand_bf(class _BlockStack *_stack, int* _depth, int *num) {
  class _BlockSet *_set = _stack ->  get (*_depth);
  class _Block *_block = _set -> block;
  class _Block *_nextBlock0 = &_set -> _BlockSet::nextBlock0;
  _nextBlock0 ->  recycle ();
#ifdef BLOCK_PROFILE
  // Record the depth value, not the pointer (the sibling routines pass an int).
  profiler.record(_block->size, *_depth);
#endif

  //Add Left
  int _si = 0;
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
#ifdef SIMD_NONE
    for (int _bi = _si; _bi < _si + MY_SIMD_WIDTH; _bi++) {
      process_point(_block, _nextBlock0, _bi, num);
    }
#else	// not SIMD_NONE
    process_simd_opt(_block, _nextBlock0, _si, num);
#endif // ifdef SIMD_NONE else
  }

#ifdef SIMD_ALL
  int rem_work = _block->size - _si;
  if (rem_work == 1) {
    process_point(_block, _nextBlock0, _si, num);
  } else if (rem_work > 1) {
    process_simd_opt(_block, _nextBlock0, _si, num, rem_work);
  }
#else	// not SIMD_ALL
  //cleanup code for stripmined loop
  for (int _bi = _si; _bi < _block->size; _bi++) {
    process_point(_block, _nextBlock0, _bi, num);
  }
#endif

  //Add Right (appended to the same _nextBlock0, after the left children)
  _si = 0;
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
#ifdef SIMD_NONE
    for (int _bi = _si; _bi < _si + MY_SIMD_WIDTH; _bi++) {
      process_point1(_block, _nextBlock0, _bi, num);
    }
#else	// not SIMD_NONE
    process_simd1_opt(_block, _nextBlock0, _si, num, MY_SIMD_WIDTH);
#endif // ifdef SIMD_NONE else
  }

#ifdef SIMD_ALL
  rem_work = _block->size - _si;
  if (rem_work == 1) {
    process_point1(_block, _nextBlock0, _si, num);
  } else if (rem_work > 1) {
    process_simd1_opt(_block, _nextBlock0, _si, num, rem_work);
  }
#else	// not SIMD_ALL
  //cleanup code for stripmined loop
  for (int _bi = _si; _bi < _block->size; _bi++) {
    process_point1(_block, _nextBlock0, _bi, num);
  }
#endif

  //Free old stack space
  // Only during the initial expansion (g_is_partial == 0): the input block of
  // this step is no longer needed once its children are in _nextBlock0. At
  // depth 0 that input is the root block allocated by app_main; deeper levels
  // are released through the stack. During a dynamic re-expansion the levels
  // above are still in use by the depth-first callers, so nothing is freed.
  if (!g_is_partial){
    if (!*_depth){
      delete g_initial_block;
    } else
    {
      _stack->release(*_depth - 1);
    }
  }

  int _nextblock0_size = _nextBlock0 -> _Block::size;
  *_depth += 1;
  if (_nextblock0_size > 0 && _nextblock0_size <= _expandSize / 2) {
    // Still room to double once more: keep expanding breadth-first.
    _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
    binomial_expand_bf(_stack, _depth, num);
  } else { //Reach the buffer size, or finish all evaluation
    if (!dynamic_reexpand_count){// only print for the first time
      cout << "This is the max block buffer size for dfs: " << _nextblock0_size << endl;
    }

    if (_nextblock0_size){
      // Switch to depth-first: right children first, then left children of
      // the same block.
      _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
      binomial_block1(_stack, *_depth, num);
      binomial_block(_stack, *_depth, num);
    }
  }
}

/*
 * Benchmark entrance called by harness.
 *
 * Arguments (argc must be 2 or 3; argv[0] is read as n, argv[1] as k):
 *   argv[0] - n
 *   argv[1] - k
 *   argv[2] - optional exponent i; the block buffer size becomes 2^i
 * NOTE(review): argv[0] being n implies the harness strips the program name
 * before calling -- confirm in harness.h.
 *
 * Prints the buffer size, the computed number of leaves of C(n, k) and the
 * re-expansion count (plus profiling data when the respective macros are
 * defined).
 *
 * Returns 0 on success, 1 when k is out of range (k < 0 or k > n, and k is
 * neither 0 nor n). A wrong argument count prints the usage line and calls
 * exit(0).
 */
int app_main(int argc, char **argv) {
  if (argc != 2 && argc != 3) {
    cout << "usage: binomial [n] [k] or binomial [n] [k] [buffer_size, pow(2, i)]" << endl;
    exit(0);
  }

  node_t n = atoi(argv[0]);
  node_t k = atoi(argv[1]);
  if (argc == 3) _expandSize = pow(2.0, atoi(argv[2]));
  int num = 0;

  Harness::start_timing();

  //_expandDepth = Harness::get_splice_depth();

  //Initialize software block stack
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);

  // Classify the root before allocating anything: the out-of-range case
  // returns immediately, and allocating first (as the code used to) leaked
  // both the stack and the root block on that path. The leaf test keeps
  // priority over the range test, as in the original if/else-if chain.
  const bool root_is_leaf = (k == 0 || k == n);
  if (!root_is_leaf && (k < 0 || k > n)) {
    // NOTE(review): start_timing() is not paired with stop_timing() on this
    // path; left as it was.
    return 1;
  }

  class _BlockStack * _stack = new _BlockStack;
  class _Block * _block = new _Block;
  g_initial_block = _block;

  if (root_is_leaf) {
    num += 1;
  } else {
    // The root block starts with the single entry (n - 1, k).
    _block->add(n - 1, k);
  }

  int _depth = 0;
  _stack->get (_depth) -> block = _block;

  //Start to execute blocked binomial 
  if (_expandSize >= 2) binomial_expand_bf(_stack, &_depth, &num);
  else{
    // Buffer too small to expand breadth-first: go depth-first right away.
    int df_block_size = _stack->get(_depth)->block->size;
    cout << "This is the max block buffer size for dfs: " << df_block_size << endl;

    if (df_block_size){
      binomial_block1(_stack, _depth, &num);
      binomial_block(_stack, _depth, &num);
    }
  }

  delete _stack;
  // With _expandSize >= 2 the root block has already been deleted inside
  // binomial_expand_bf (via g_initial_block), so it is only freed here on the
  // pure depth-first path.
  if (_expandSize < 2) delete _block;
  Harness::stop_timing();

  cout << num << endl;

#ifdef BLOCK_PROFILE
  profiler.output();
#endif
  cout << "This is the number of reexpansions: " << dynamic_reexpand_count << endl;
#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is the total number of new operations for block: " << total_malloc << endl;
#endif
  return 0;
}
