/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: Sequential block wo reexpansion of nqueens       */
/**********************************************************/

#include <iostream>
#include <fstream>
#include "harness.h"
#include "block.h"

#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
// Block profiler: fed through profiler.record()/record_single() by the kernels
// below and dumped by app_main through profiler.output().
BlockProfiler profiler(16);// the profiler's SIMD width is set to 16
#endif
// Parallelism profiler: not used in the paper, kept for further development.
// It is allocated by app_main when PARALLELISM_PROFILE is defined.
#ifdef PARALLELISM_PROFILE
#include "parallelismprofiler.h"
ParallelismProfiler *parallelismProfiler;
#endif

#ifdef TRACK_TRAVERSALS
// Incremented once per call of nqueens_block and nqueens_expand_bf; printed by
// app_main as "work".
uint64_t work = 0;
#endif

using namespace std;

//int _expandDepth = 0;
// Maximum block buffer size. Starts as D_MAX_BLOCK_SIZE (defined outside this
// file); app_main replaces it with 2^argv[1] when a second argument is given.
int _expandSize = D_MAX_BLOCK_SIZE;
// The first block created by app_main. nqueens_expand_bf deletes it after
// expanding depth 0, which is why it has to be reachable globally.
_Block * g_initial_block = NULL;//For memory release

/*
 * Conflict check for a partial placement.
 *
 *   n - how many leading entries of a to examine
 *   a - a[k] is the position chosen for queen k
 *
 * Returns 1 when no two of the first n queens share a position or a diagonal,
 * 0 as soon as one attacking pair is found.
 */
int ok(char n, char *a) {
  for (int first = 0; first < n; ++first) {
    const char firstPos = a[first];

    for (int second = first + 1; second < n; ++second) {
      const char secondPos = a[second];
      // Two queens `gap` entries apart are on a common diagonal when their
      // positions also differ by exactly `gap`.
      const int gap = second - first;
      const bool samePosition = (secondPos == firstPos);
      const bool sameDiagonal =
          (secondPos == firstPos - gap) || (secondPos == firstPos + gap);
      if (samePosition || sameDiagonal)
        return 0;
    }
  }
  return 1;
}

/*
 * Pseudo tail recursive nqueens matching our language spec (the plain,
 * non-blocked recursion).
 *
 *   n          - number of queens to place
 *   j          - number of queens placed once this call has stored its own
 *   a          - board buffer; a[k] is the position of queen k
 *   num        - counter incremented for every complete, conflict-free board
 *   _callIndex - position to store for queen j-1; the value -1 skips both the
 *                store and the conflict check
 */
void nqueens(char n, char j, char *a, int *num, int _callIndex) {
#ifdef BLOCK_PROFILE
  profiler.record_single();
#endif

  const bool placesQueen = (_callIndex != -1);
  if (placesQueen)
    a[j - 1] = _callIndex;

  // A branch stops here either because the queen just stored conflicts with an
  // earlier one, or because all n queens are in place (a solution).
  const bool conflict = placesQueen && !ok(j, a);
  const bool solved = !conflict && (n == j);
  if (solved)
    *num += 1;

  if (conflict || solved) {
#ifdef PARALLELISM_PROFILE
    parallelismProfiler->recordNonBlockedTruncate();
#endif
    return;
  }

#ifdef PARALLELISM_PROFILE
  parallelismProfiler->recordNonBlockedRecurse();
#endif

  /* recurse once per candidate position of the next queen */
  for (int pos = 0; pos < n; ++pos)
    nqueens(n, j + 1, a, num, pos);
}

/*
 * Depth First execution of i-th children to limit the memory consumption.
 *
 * Stores _callIndex at index _depth of every board in the block of level
 * _depth. Boards that fail ok() are dropped, boards that now hold g_nqueens
 * queens are counted in *num, and the remaining boards are added to the
 * level's nextBlock0. If any board was added, nextBlock0 becomes the block of
 * level _depth + 1 and the function recurses on it once per candidate
 * position 0..g_nqueens-1.
 *
 *   _stack     - block stack; level _depth must already have its block set
 *   _depth     - index of the queen being placed by this call
 *   num        - solution counter
 *   _callIndex - position stored for queen _depth on every board of the block
 */
void nqueens_block(_BlockStack *_stack, int _depth, int* num, int _callIndex) {
#ifdef TRACK_TRAVERSALS
  work++;
#endif
  class _BlockSet *levelSet = _stack->get(_depth);
  class _Block *current = levelSet->block;
  class _Block *children = &levelSet->nextBlock0;
  children->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(current->size, _depth);
#endif

  const int placed = _depth + 1;  // queens on a board once _callIndex is stored

  for (int idx = 0; idx < current->size; ++idx) {
    class _Point &point = current->get(idx);
    char *board = point.b;

    // No -1 check here: blocks only exist after expansion, so every call
    // carries a real position.
    board[_depth] = _callIndex;
    const bool conflictFree = ok(placed, board);

    if (conflictFree && g_nqueens != placed) {
      // Still incomplete: hand the board to the next level.
#ifdef PARALLELISM_PROFILE
      parallelismProfiler->recordRecurse();
#endif
      children->add(board, placed);
      continue;
    }

    // The branch ends here: either a conflict or a finished board.
#ifdef PARALLELISM_PROFILE
    parallelismProfiler->recordTruncate();
#endif
    if (conflictFree)
      *num += 1;
  }

  if (children->size > 0) {
    _stack->get(placed)->block = children;
    int branch = 0;
    while (branch < g_nqueens) {
      nqueens_block(_stack, placed, num, branch);
      ++branch;
    }
  }
#ifdef PARALLELISM_PROFILE
  parallelismProfiler->blockEnd();
#endif
}

/*
 * Breadth First execution to expand the number of tasks in software block.
 *
 * Expands the block of level *_depth by one queen: for every candidate
 * position and every board in the block, the position is stored at index
 * *_depth and boards that pass ok() are added to the level's nextBlock0.
 * If *_depth already equals g_nqueens, every board in the block is counted in
 * *num instead. The storage of the previous level is then freed, *_depth is
 * incremented and nextBlock0 is installed as the block of the new level.
 * The function recurses while the new block is non-empty and no larger than
 * _expandSize / g_nqueens.
 *
 *   _stack - block stack; level *_depth must already have its block set
 *   _depth - in/out: level to expand; on return, the level whose block should
 *            be processed next (it may be empty)
 *   num    - solution counter
 */
void nqueens_expand_bf(_BlockStack* _stack, int* _depth, int* num){
#ifdef TRACK_TRAVERSALS
  work++;
#endif
  class _BlockSet *_set = _stack->get(*_depth);
  class _Block *_block = _set->block;
  class _Block *_nextBlock0 = &_set->_BlockSet::nextBlock0;
  _nextBlock0->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(_block->size, *_depth);
#endif

  if (g_nqueens == *_depth) {
    // Every board in this block is a finished placement.
#ifdef PARALLELISM_PROFILE
    // One truncate record per board. The increment must use the loop's own
    // counter `pi`; any other name is undeclared in this scope.
    for (int pi = 0; pi < _block->size; pi++)
      parallelismProfiler->recordTruncate();
#endif
    *num += _block->size;
  } else {
    for (int i = 0; i < g_nqueens; ++i) {
      for (int _bi = 0; _bi < _block->size; _bi++) {
        class _Point &_point = _block->get(_bi);
        char *a = _point.b;

        a[*_depth] = i;
        if (!ok(*_depth + 1, a)) {
#ifdef PARALLELISM_PROFILE
          parallelismProfiler->recordTruncate();
#endif
          continue;
        }

#ifdef PARALLELISM_PROFILE
        parallelismProfiler->recordRecurse();
#endif
        _nextBlock0->add(a, *_depth + 1);
      }
    }
  }

  //Free old stack space
  if (!*_depth) {
    // Level 0 uses the block allocated by app_main rather than stack storage.
    delete g_initial_block;
    g_initial_block = NULL;  // do not leave the global dangling
  } else {
    _stack->release(*_depth - 1);
  }

  int _nextblock0_size = _nextBlock0->_Block::size;

#ifdef _DEBUG
  cout << "This is _nextblock0_size: " << _nextblock0_size << endl;
  for (int j = 0; j < _nextblock0_size; ++j){
    for (int k = 0; k < g_nqueens; ++k){
      printf("%d ", (int)_nextBlock0->points[j].b[k]);
    }
    cout << endl;
  }
  cout << endl;
#endif

  // The new block becomes the current block of the next level whether or not
  // expansion continues.
  *_depth += 1;
  _stack->get(*_depth)->_BlockSet::block = _nextBlock0;

  // Keep expanding only while one more level (up to g_nqueens children per
  // board) still fits in the block buffer; otherwise the buffer limit is
  // reached or nothing is left to evaluate.
  if (_nextblock0_size > 0 && _nextblock0_size <= _expandSize / g_nqueens) {
    nqueens_expand_bf(_stack, _depth, num);
  }
#ifdef PARALLELISM_PROFILE
  parallelismProfiler->blockEnd();
#endif
}

/*
 * Benchmark entrance called by harness.
 *
 *   argv[0] - number of queens (note: index 0, not the program name)
 *   argv[1] - optional exponent k; the block buffer size becomes 2^k
 *
 * Runs the breadth-first expansion until the block buffer is filled, then the
 * depth-first blocked traversal on the resulting block, and prints the number
 * of solutions.
 *
 * Returns 0 on success, 1 when the arguments are missing or out of range.
 */
int app_main(int argc, char **argv) {
  if (argc < 1) {
    printf("number of queens required\n");
    return 1;
  }
  // NOTE(review): with more than two arguments argv[1] is ignored as well,
  // because the block-size override below only fires for argc == 2.
  if (argc > 2)
    printf("extra arguments being ignored\n");

  g_nqueens = atoi(argv[0]);
  if (g_nqueens < 0) {
    // A negative count would be turned into a huge size for alloca() below.
    printf("number of queens must not be negative\n");
    return 1;
  }
  printf("running queens %d\n", g_nqueens);

  if (argc == 2) {
    const int exponent = atoi(argv[1]);
    // Largest k for which 2^k fits in an int (assumes 8-bit bytes).
    const int maxExponent = static_cast<int>(sizeof(int)) * 8 - 2;
    if (exponent > maxExponent) {
      printf("block size exponent %d is too large (max %d)\n", exponent, maxExponent);
      return 1;
    }
    // Exact integer power of two. A negative exponent yields 0, which is what
    // truncating the fractional floating-point result used to give.
    _expandSize = (exponent < 0) ? 0 : (1 << exponent);
  }

#ifdef PARALLELISM_PROFILE
  parallelismProfiler = new ParallelismProfiler;
#endif
  Harness::start_timing();
  //_expandDepth = Harness::get_splice_depth();

  // Seed board; zero-filled so the bytes handed to the first block are defined.
  char *a = static_cast<char *>(alloca(g_nqueens * sizeof(char)));
  for (int k = 0; k < g_nqueens; ++k)
    a[k] = 0;
  int num = 0;

  //Initialize software block stack
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);
  class _BlockStack * _stack = new _BlockStack;
  class _Block * _block = new _Block;
  g_initial_block = _block;

  _block->add(a);
  int _depth = 0;
  _stack->get (_depth) -> block = _block;

  //Start to execute blocked nqueens
  // Breadth-first expansion only when the buffer can hold at least one full
  // level of children; it advances _depth and adds finished boards to num.
  if (_expandSize >= g_nqueens)  nqueens_expand_bf(_stack, &_depth, &num);
  int df_block_size = _stack->get(_depth)->block->size;
  cout << "This is the max block buffer size for dfs: " << df_block_size << endl;
  cout << "This is the result now: " << num << endl;

  // Depth-first phase: one traversal per candidate position of queen _depth.
  if (df_block_size){
    for (int i = 0; i < g_nqueens; i++) {
      nqueens_block(_stack, _depth, &num, i);
    }
  }

  delete _stack;
  // nqueens_expand_bf deletes the initial block itself; it is still owned here
  // only when the expansion was skipped.
  if (_expandSize < g_nqueens) delete _block;
  Harness::stop_timing();

#ifdef BLOCK_PROFILE
  profiler.output();
#ifdef BLOCKINFO
  profiler.outputBlockInfo();//For output task distribution profile data
#endif
#endif

#ifdef TRACK_TRAVERSALS
  cout << "work: " << work << endl;
#endif

  printf("nqueens = %d\n", num);

#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is total malloc counts: " << total_malloc << endl;
#endif
  return 0;
}
