/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: MIC block w reexpansion of minmax                */
/**********************************************************/

#include <stdio.h>
#include <string.h>
#include <iostream>
#include <fstream>

#include "simd.h"
#include "block-mic.h"
#include "harness.h"
#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
BlockProfiler profiler;
#endif

// Multiplier applied to the re-expansion threshold
// (_expandSize / g_ncheck * expand_condition). app_main sets it to 1 when the
// block-size exponent argument is 6 and to 2 otherwise.
int expand_condition = 1;

using namespace std;

//int _expandDepth = 0;
// Number of times minimax_block has switched back to breadth-first expansion.
// It is never reset, so the "max block buffer size" line is printed only
// while it is still zero.
int dynamic_reexpand_count = 0;
// Block capacity used for _Block::max_block; overridden from the command line
// in app_main.
int _expandSize = D_MAX_BLOCK_SIZE;
// Root block allocated by app_main for the current move; deleted by
// minimax_expand_bf at depth 0 when g_is_partial is zero.
_Block * g_initial_block;
// Set to 1 by minimax_block when it triggers a re-expansion. While nonzero,
// minimax_expand_bf does not free the root block or earlier stack levels.
int g_is_partial = 0;

// Accumulated leaf scores, indexed by the first-move position ("startpos")
// that led to each evaluated leaf. Zero-initialized.
int pos_weights[POS_SIZE]={};

/* Mark every playable cell as blank ('_').
 * Cells are 1-based: indices 1..BOARD_SIZE*BOARD_SIZE are written, index 0 is
 * left untouched. */
void init_board( node_t pboard[] )
{
  const int last_cell = BOARD_SIZE * BOARD_SIZE;
  int cell = 1;
  while (cell <= last_cell)
  {
    pboard[cell] = '_';
    ++cell;
  }
}

/* Print the board to stdout, one row of BOARD_SIZE cells per line.
 * Each cell is printed as its character followed by a single space. */
void print_board(node_t pboard[] )
{
  const int last_cell = BOARD_SIZE * BOARD_SIZE;
  for (int cell = 1; cell <= last_cell; ++cell)
  {
    printf("%c ", pboard[cell]);

    /* A cell index that is a multiple of BOARD_SIZE closes its row. */
    const bool row_finished = (cell % BOARD_SIZE) == 0;
    if (row_finished)
    {
      printf("\n");
    }
  }
}

/* Dump pos_weights[1..POS_SIZE-1] to stdout, tab separated, under a banner. */
void print_weights( )
{
  printf("\n*******Weighted array*****\n");
  for (int idx = 1; idx < POS_SIZE; ++idx)
  {
    printf("%d\t", pos_weights[idx]);
  }
  printf("\n");
}

/*sequential checking functions*/
/* Scalar win test for a 4x4 board stored in cells 1..16.
 * Returns 1 when 'X' owns a complete row, column or diagonal, -1 when 'O'
 * does, and 0 otherwise. 'X' is tested first, so it wins ties. */
int check_win(node_t pboard[])
{
  /* The ten winning lines: four rows, four columns, two diagonals. */
  static const int kLines[10][4] = {
    { 1,  2,  3,  4}, { 5,  6,  7,  8}, { 9, 10, 11, 12}, {13, 14, 15, 16},
    { 1,  5,  9, 13}, { 2,  6, 10, 14}, { 3,  7, 11, 15}, { 4,  8, 12, 16},
    { 1,  6, 11, 16}, { 4,  7, 10, 13}
  };
  static const char kMarks[2]  = {'X', 'O'};
  static const int  kResult[2] = {1, -1};

  for (int m = 0; m < 2; ++m)
  {
    const char mark = kMarks[m];
    for (int l = 0; l < 10; ++l)
    {
      const int *line = kLines[l];
      if (pboard[line[0]] == mark && pboard[line[1]] == mark &&
          pboard[line[2]] == mark && pboard[line[3]] == mark)
      {
        return kResult[m];
      }
    }
  }

  return 0;
}

/* Returns 1 when nobody has won and none of the cells 1..16 is blank ('_');
 * returns 0 otherwise. */
int check_draw(node_t pboard[])
{
  if (check_win(pboard) != 0)
  {
    return 0;               /* a decided game is never a draw */
  }

  for (int cell = 1; cell <= 16; ++cell)
  {
    if (pboard[cell] == '_')
    {
      return 0;             /* still room to play */
    }
  }

  return 1;
}


/* Scalar leaf evaluation.
 *   X wins : *leaf_val = 20 - depth,    returns  1
 *   O wins : *leaf_val = -(20 - depth), returns -1
 *   draw   : *leaf_val = 0,             returns  1
 *   open   : *leaf_val untouched,       returns  0
 * `player` is accepted for interface symmetry but is not used. */
int evaluationFunction(node_t pboard[],int *leaf_val,int depth,int player)
{
  const int winner = check_win(pboard);

  switch (winner)
  {
    case 1:
      *leaf_val = 20 - depth;
      return winner;
    case -1:
      *leaf_val = -(20 - depth);
      return winner;
    default:
      break;
  }

  /* Nobody has won: the game is over only if the board is full. */
  const int drawn = check_draw(pboard);
  if (drawn)
  {
    *leaf_val = 0;
  }
  return drawn;
}

/* Find the first blank cell at or after *position.
 * On success stores its index in *position and returns 1; otherwise stores
 * NOMOVE in *position and returns 0. */
int chooseNextMove(int* position,node_t pboard[])
{
  const int last_cell = BOARD_SIZE * BOARD_SIZE;
  int cell = *position;
  *position = NOMOVE;

  while (cell <= last_cell)
  {
    if (pboard[cell] == '_')
    {
      *position = cell;
      return 1;
    }
    ++cell;
  }

  return 0;
}

/* Pick the move for `player` from the accumulated pos_weights.
 * Starts from the first blank cell, then scans all cells: an odd `player`
 * takes the blank cell with the strictly largest weight, an even `player`
 * the one with the strictly smallest weight (the earliest cell wins ties).
 * Returns the chosen cell, or -1 (after printing a message) when the board
 * has no blank cell. */
int update_pos_weigh(int player,node_t board[])
{
  int best = 1;
  if (!chooseNextMove(&best, board))
  {
    printf("\nPanic - cant choose next position*");
    return -1;
  }

  const int last_cell = BOARD_SIZE * BOARD_SIZE;
  const bool maximizing = (player % 2) != 0;

  for (int cell = 1; cell <= last_cell; ++cell)
  {
    if (board[cell] != '_')
    {
      continue;
    }

    const bool improves = maximizing
        ? (pos_weights[cell] > pos_weights[best])
        : (pos_weights[cell] < pos_weights[best]);
    if (improves)
    {
      best = cell;
    }
  }

  return best;
}


/*simd checking functions*/
//#define ShortCut
inline void check_win_simd(__m512i* vec_eval_ended, node_t pboard_vec[]){
  //For Player X
  __m512i vec_X = _mm512_set1_epi32('X');
  __mmask16 mask_cond_X = 0;

  //Case 1 - 4
  for (int i = 0; i < 4; ++i){
    __mmask16 mask_cond1 = 0xffff;
    for (int j = 1; j <= 4; ++j){
      __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + i * 4 + j);
      __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_X, _MM_CMPINT_EQ);
      mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
    }
    mask_cond_X = _mm512_kor(mask_cond_X, mask_cond1);
#ifdef ShortCut 
    if (mask_cond_X == 0xffff){
      *vec_eval_ended = _mm512_set1_epi32(1);
      return;
    } 
#endif
  }

  //Case 5 - 8
  for (int i = 1; i <= 4; ++i){
    __mmask16 mask_cond1 = 0xffff;
    for (int j = 0; j < 4; ++j){
      __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + i + j * 4);
      __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_X, _MM_CMPINT_EQ);
      mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
    }
    mask_cond_X = _mm512_kor(mask_cond_X, mask_cond1);
#ifdef ShortCut 
    if (mask_cond_X == 0xffff){
      *vec_eval_ended = _mm512_set1_epi32(1);
      return;
    } 
#endif
  }

  //Case 9
  __mmask16 mask_cond1 = 0xffff;
  for (int j = 1; j < POS_SIZE; j += 5){
    __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + j);
    __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_X, _MM_CMPINT_EQ);
    mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
  }
  mask_cond_X = _mm512_kor(mask_cond_X, mask_cond1);
#ifdef ShortCut 
  if (mask_cond_X == 0xffff){
    *vec_eval_ended = _mm512_set1_epi32(1);
    return;
  } 
#endif

  //Case 10 
  mask_cond1 = 0xffff;
  for (int j = 4; j < POS_SIZE - 1; j += 3){
    __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + j);
    __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_X, _MM_CMPINT_EQ);
    mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
  }
  mask_cond_X = _mm512_kor(mask_cond_X, mask_cond1);
#ifdef ShortCut 
  if (mask_cond_X == 0xffff){
    *vec_eval_ended = _mm512_set1_epi32(1);
    return;
  } 
#endif

  //For Player X
  __m512i vec_O = _mm512_set1_epi32('O');
  __mmask16 mask_cond_O = 0;

  //Case 1 - 4
  for (int i = 0; i < 4; ++i){
    __mmask16 mask_cond1 = 0xffff;
    for (int j = 1; j <= 4; ++j){
      __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + i * 4 + j);
      __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_O, _MM_CMPINT_EQ);
      mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
    }
    mask_cond_O = _mm512_kor(mask_cond_O, mask_cond1);
#ifdef ShortCut 
    if (mask_cond_O == 0xffff){
      *vec_eval_ended = _mm512_set1_epi32(1);
      return;
    } 
#endif
  }

  //Case 5 - 8
  for (int i = 1; i <= 4; ++i){
    __mmask16 mask_cond1 = 0xffff;
    for (int j = 0; j < 4; ++j){
      __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + i + j * 4);
      __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_O, _MM_CMPINT_EQ);
      mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
    }
    mask_cond_O = _mm512_kor(mask_cond_O, mask_cond1);
#ifdef ShortCut 
    if (mask_cond_O == 0xffff){
      *vec_eval_ended = _mm512_set1_epi32(1);
      return;
    } 
#endif
  }

  //Case 9
  mask_cond1 = 0xffff;
  for (int j = 1; j < POS_SIZE; j += 5){
    __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + j);
    __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_O, _MM_CMPINT_EQ);
    mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
  }
  mask_cond_O = _mm512_kor(mask_cond_O, mask_cond1);
#ifdef ShortCut 
  if (mask_cond_O == 0xffff){
    *vec_eval_ended = _mm512_set1_epi32(1);
    return;
  } 
#endif

  //Case 10 
  mask_cond1 = 0xffff;
  for (int j = 4; j < POS_SIZE - 1; j += 3){
    __m512i vec_pboard = _mm512_load_epi32(((__m512i*)pboard_vec) + j);
    __mmask16 mask_eq = _mm512_cmp_epi32_mask(vec_pboard, vec_O, _MM_CMPINT_EQ);
    mask_cond1 = _mm512_kand(mask_cond1, mask_eq);
  }
  mask_cond_O = _mm512_kor(mask_cond_O, mask_cond1);
#ifdef ShortCut 
  if (mask_cond_O == 0xffff){
    *vec_eval_ended = _mm512_set1_epi32(1);
    return;
  } 
#endif

  *vec_eval_ended = _mm512_mask_add_epi32(*vec_eval_ended, mask_cond_X,
                                          *vec_eval_ended, _mm512_set1_epi32(1)); 

  *vec_eval_ended = _mm512_mask_add_epi32(*vec_eval_ended, mask_cond_O,
                                          *vec_eval_ended, _mm512_set1_epi32(-1)); 
}


/*
 * Vectorized draw test. Adds 1 to every lane of *vec_eval_ended whose board
 * has no winner and no blank ('_') cell in positions 1..POS_SIZE-1; other
 * lanes are left unchanged.
 */
inline void check_draw_simd(__m512i* vec_eval_ended, node_t pboard_vec[]){
  // Run the win test on a private zeroed vector so the caller's vector is
  // not modified by it.
  __m512i vec_winner = _mm512_setzero_epi32();
  check_win_simd(&vec_winner, pboard_vec);

  const __m512i vec_one = _mm512_set1_epi32(1);
  const __m512i vec_minus_one = _mm512_set1_epi32(-1);
  const __m512i vec_blank = _mm512_set1_epi32('_');

  // Lanes that cannot be a draw: somebody has already won ...
  __mmask16 mask_not_draw = _mm512_kor(
      _mm512_cmp_epi32_mask(vec_winner, vec_one, _MM_CMPINT_EQ),
      _mm512_cmp_epi32_mask(vec_winner, vec_minus_one, _MM_CMPINT_EQ));

  // ... or at least one cell is still blank.
  for (int cell = 1; cell < POS_SIZE; ++cell){
    __m512i vec_cell = _mm512_load_epi32(((__m512i*)pboard_vec) + cell);
    __mmask16 mask_blank = _mm512_cmp_epi32_mask(vec_cell, vec_blank, _MM_CMPINT_EQ);
    mask_not_draw = _mm512_kor(mask_not_draw, mask_blank);
  }

  __mmask16 mask_draw = ~mask_not_draw;
  *vec_eval_ended = _mm512_mask_add_epi32(*vec_eval_ended, mask_draw, *vec_eval_ended, vec_one);
}


/*
 * Vectorized counterpart of evaluationFunction.
 *
 * First applies check_win_simd to *vec_eval_ended, then adds (20 - depth) to
 * *vec_leaf_val in lanes where *vec_eval_ended == 1 and (depth - 20) in lanes
 * where it == -1, and finally applies check_draw_simd, which adds 1 to
 * *vec_eval_ended in drawn lanes. `player` is not used.
 */
inline void evaluationFunction_simd(__m512i* vec_eval_ended, node_t pboard_vec[], __m512i* vec_leaf_val, node_t depth, node_t player){

  check_win_simd(vec_eval_ended, pboard_vec);

  const __m512i vec_twenty = _mm512_set1_epi32(20);
  const __m512i vec_level = _mm512_set1_epi32(depth);
  const __m512i vec_x_score = _mm512_sub_epi32(vec_twenty, vec_level);
  const __m512i vec_o_score = _mm512_sub_epi32(vec_level, vec_twenty);

  // Masked adds stand in for a per-lane multiply of the score by the +1/-1 outcome.
  const __mmask16 mask_x_won =
      _mm512_cmp_epi32_mask(*vec_eval_ended, _mm512_set1_epi32(1), _MM_CMPINT_EQ);
  const __mmask16 mask_o_won =
      _mm512_cmp_epi32_mask(*vec_eval_ended, _mm512_set1_epi32(-1), _MM_CMPINT_EQ);

  *vec_leaf_val = _mm512_mask_add_epi32(*vec_leaf_val, mask_x_won, *vec_leaf_val, vec_x_score);
  *vec_leaf_val = _mm512_mask_add_epi32(*vec_leaf_val, mask_o_won, *vec_leaf_val, vec_o_score);

  check_draw_simd(vec_eval_ended, pboard_vec);
}

// File-scope scratch buffers reused by every process_simd call. They are
// 64-byte aligned because process_simd accesses them with 512-bit
// load/store intrinsics.
// pboard_vec: POS_SIZE rows of MY_SIMD_WIDTH lanes; row p holds cell p of
//             each board in the current SIMD group.
__declspec(align(64)) node_t pboard_vec[POS_SIZE * MY_SIMD_WIDTH] = {0};
// Per-lane copies of the evaluation result vectors, so individual lanes can
// be read with scalar code.
__declspec(align(64)) node_t eval_ended[MY_SIMD_WIDTH] = {0};
__declspec(align(64)) node_t leaf_val[MY_SIMD_WIDTH] = {0};
// Per-lane first-move position used to index pos_weights.
__declspec(align(64)) node_t startpos[MY_SIMD_WIDTH] = {0};

/*simd processing, we store n in int in the block*/
//#define NOSC
/*
 * Vectorized expansion of MY_SIMD_WIDTH boards of _block, starting at column
 * _si, for the move "player plays at cell pos".
 *
 * Steps:
 *   1. Build a lane mask of boards whose cell `pos` is blank; return early if
 *      there is none.
 *   2. Copy the boards into the pboard_vec scratch buffer and write the
 *      player's mark into cell `pos` of every lane (masked-out lanes are
 *      written too, but are filtered out below).
 *   3. Evaluate all lanes; finished lanes with a nonzero leaf value add that
 *      value to pos_weights[startpos].
 *   4. Append the unfinished, valid lanes to _nextBlock0. Three variants are
 *      selected at compile time: NOSC (scalar addcol per lane), SEQSC, and
 *      the default g_scantable-based scatter.
 *
 * At depth 0 the start position is `pos` itself; deeper levels inherit it
 * from _block->startpos.
 */
inline void process_simd(_Block* _block, _Block* _nextBlock0, int _si, int _depth, int player, int pos){
  // Lanes whose cell `pos` is still blank are the only ones that can take this move.
  __m512i vec_board_pos = _mm512_load_epi32((__m512i*)_block->getptr(_si, pos));
  __m512i vec_ul = _mm512_set1_epi32('_');
  __mmask16 non_leaf_mask1 = _mm512_cmp_epi32_mask(vec_board_pos, vec_ul, _MM_CMPINT_EQ);

  if (non_leaf_mask1 == 0) return;

  // Copy the group of boards, one cell row at a time, into the scratch buffer.
  for (int i = 0; i < POS_SIZE; ++i){
    memcpy(pboard_vec + i * MY_SIMD_WIDTH, _block->getptr(_si, i), sizeof(node_t) * MY_SIMD_WIDTH);
  }

  // NOTE(review): vec_O_X stays uninitialized if player is neither 1 nor 2;
  // the callers visible in this file only pass 1 or 2.
  __m512i vec_O_X;
  if (player == 2){
    vec_O_X = _mm512_set1_epi32('O');
  } else if (player == 1){
    vec_O_X = _mm512_set1_epi32('X');
  }
  // Apply the move in every lane of the scratch copy.
  _mm512_store_epi32(((__m512i*)pboard_vec) + pos, vec_O_X);


  __m512i vec_startpos;
  if (_depth){
    vec_startpos = _mm512_load_epi32((__m512i*)&_block->startpos[_si]);
  } else {
    vec_startpos = _mm512_set1_epi32(pos);
  }
  _mm512_store_epi32((__m512i*)startpos, vec_startpos);

  __m512i vec_eval_ended;
  __m512i vec_leaf_val;
  vec_eval_ended = _mm512_setzero_epi32();
  vec_leaf_val = _mm512_setzero_epi32();
  evaluationFunction_simd(&vec_eval_ended, pboard_vec, &vec_leaf_val, _depth, player);
  // Spill the result vectors so individual lanes can be inspected below.
  _mm512_store_epi32((__m512i*)eval_ended, vec_eval_ended);
  _mm512_store_epi32((__m512i*)leaf_val, vec_leaf_val);

#ifdef NOSC
  //To be optimized by SC
  for (int i = 0; i < MY_SIMD_WIDTH; ++i){
    unsigned short f = 1 << i;
    if (f & non_leaf_mask1){
      if(eval_ended[i])
      {
        if (leaf_val[i]) pos_weights[startpos[i]] += leaf_val[i];
        continue;
      }
      _nextBlock0->addcol(pboard_vec, i, POS_SIZE, startpos[i]);
    }

  }
#else //Using SC
  // Finished lanes: fold nonzero leaf values into the weight of their start position.
  for (int i = 0; i < MY_SIMD_WIDTH; ++i){
    unsigned short f = 1 << i;
    if ((f & non_leaf_mask1) && eval_ended[i] && leaf_val[i]){
      pos_weights[startpos[i]] += leaf_val[i];
    }
  }

  // ok_mask = lanes that took the move AND are not finished; these go to _nextBlock0.
  __m512i vec_0 = _mm512_setzero_epi32();
  __mmask16 mask_cond_eval_ended = _mm512_cmp_epi32_mask(vec_eval_ended, vec_0, _MM_CMPINT_EQ);
  unsigned short ret_mask = (~non_leaf_mask1) | (~mask_cond_eval_ended);
  unsigned short ok_mask = ~ret_mask;

#ifdef SEQSC
  // NOTE(review): vec_1 and vec_index are not declared in this function on
  // the SEQSC path, and no assert header is included by this file directly --
  // presumably they come from a project header; confirm this path builds.
  __declspec(align(64)) int tmp[MY_SIMD_WIDTH] = {0};
  __declspec(align(64)) int tmp_non_leaf[MY_SIMD_WIDTH] = {0};
  _mm512_mask_store_epi32(tmp_non_leaf, ok_mask, vec_1);

  // tmp[i] = number of surviving lanes before lane i.
  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    tmp[i] = tmp[i-1] + tmp_non_leaf[i-1];
  }

  assert(_nextBlock0->size < _Block::max_block);
  for (int i = 0; i < POS_SIZE; i++){
    __m512i vec_n = _mm512_load_epi32(((__m512i*)pboard_vec) + i);
    int start_index = i * _Block::max_block + _nextBlock0->size;
    __m512i vec_index_local = _mm512_add_epi32(_mm512_set1_epi32(start_index), vec_index);
    _mm512_mask_i32scatter_epi32((int*)_nextBlock0->a, (__mmask16)ok_mask, vec_index_local, vec_n, 4);
  }

  __m512i vec_index_local = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), vec_index);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->startpos, (__mmask16)ok_mask, vec_index_local, vec_startpos, 4);

  // Total number of surviving lanes.
  int adv = tmp[MY_SIMD_WIDTH - 1] + tmp_non_leaf[MY_SIMD_WIDTH - 1];
  _nextBlock0->size += adv;

#else // no SEQSC
  // Split the 16-bit mask into two bytes and look each up in g_scantable.
  // NOTE(review): g_scantable[b] presumably holds per-lane destination
  // offsets for the 8-bit pattern b, with entry [7] the offset of the last
  // lane -- confirm against its definition.
  unsigned short low_8, high_8;
  low_8 = (ok_mask) & 0x00FF;
  high_8 = ((ok_mask) & 0xFF00) >> 8;

  __m512i vec_index = _mm512_load_epi32(g_scantable[low_8]);
  __m512i vec_index_1 = _mm512_load_epi32(g_scantable[high_8]);

  // The high byte's offsets are shifted by the number of surviving lanes in the low byte.
  __m512i vec_index_offset = _mm512_set1_epi32(g_scantable[low_8][7] + (((ok_mask) >> 7) & 0x1));

  vec_index_1 = _mm512_mask_add_epi32(vec_index_1, 0x00FF, vec_index_1, vec_index_offset);
  vec_index_1 = _mm512_permute4f128_epi32(vec_index_1, _MM_PERM_BADC);
  // NOTE(review): combining the two halves with AND only works if the unused
  // half of each table row is all ones -- verify the table layout.
  vec_index = _mm512_and_epi32(vec_index, vec_index_1);

  // Scatter each cell row of the surviving lanes to the tail of _nextBlock0.
  for (int i = 0; i < POS_SIZE; i++){
    __m512i vec_n = _mm512_load_epi32(((__m512i*)pboard_vec) + i);
    int start_index = i * _Block::max_block + _nextBlock0->size;
    __m512i vec_index_local = _mm512_add_epi32(_mm512_set1_epi32(start_index), vec_index);
    _mm512_mask_i32scatter_epi32((int*)_nextBlock0->a, (__mmask16)ok_mask, vec_index_local, vec_n, 4);
  }

  __m512i vec_index_local = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), vec_index);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->startpos, (__mmask16)ok_mask, vec_index_local, vec_startpos, 4);

  // Advance the block size by the number of lanes just appended.
  int adv = g_scantable[low_8][7] + g_scantable[high_8][7] + (((ok_mask) >> 7) & 0x1) + (((ok_mask) >> 15) & 0x1);
  _nextBlock0->size += adv;

#endif // SEQSC

#endif//NOSC

}

/*sequential processing*/
/*
 * Scalar expansion of board _bi of _block for the move "player plays at cell
 * pos". Used for the tail of a block that does not fill a whole SIMD group.
 *
 * Boards whose cell `pos` is already taken are skipped. Otherwise the move is
 * applied to a copy of the board and the result is evaluated: a finished game
 * adds its nonzero leaf value to pos_weights[startpos], an unfinished one is
 * appended to _nextBlock0. At depth 0 the start position is `pos`; deeper
 * levels inherit it from _block->startpos.
 */
inline void process_point(_Block* _block, _Block* _nextBlock0, int _bi, int _depth, int player, int pos){
  const bool cell_taken = _block->get(_bi, pos) != '_';
  if (cell_taken){
    return;
  }

  // getcol hands back a private copy, which is released with _mm_free below.
  node_t *board = _block->getcol(_bi);

  switch (player)
  {
    case 2:
      board[pos] = 'O';
      break;
    case 1:
      board[pos] = 'X';
      break;
    default:
      break;
  }

  node_t origin;
  if (_depth != 0) {
    origin = _block->startpos[_bi];
  } else {
    origin = pos;
  }

  int score = 0;
  const int finished = evaluationFunction(board, &score, _depth, player);

  if (!finished)
  {
    // Game still open: hand the position on to the next level.
    _nextBlock0->add(board, POS_SIZE, origin);
  }
  else if (score)
  {
    // Decided game: credit the first move that led here.
    pos_weights[origin] += score;
  }

  _mm_free(board);
}

int minimax_block(_BlockStack* _stack, int _depth, int pos, int player);
/*
 * Breadth-first expansion, used to grow the number of boards in the software
 * block.
 *
 * Expands every board of the block at level *_depth with moves at positions
 * 1..g_ncheck into that level's nextBlock0, then advances *_depth and flips
 * *player (1 <-> 2). While the new block is non-empty and smaller than
 * _expandSize / g_ncheck * expand_condition it recurses breadth-first;
 * otherwise it switches to depth-first minimax_block for each position.
 * Stops at depth 12. When g_is_partial is zero, the block consumed by this
 * step is freed (the root block at depth 0, the previous stack level
 * otherwise).
 */
void minimax_expand_bf(_BlockStack* _stack, int * _depth, int * player)
{
  if (*_depth == 12) return;

  _BlockSet *level = _stack->get(*_depth);
  _Block *cur = level->block;
  _Block *next = &level->nextBlock0;
  next->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(cur->size, *_depth);
#endif

  for (int pos = 1; pos <= g_ncheck; pos++) {
    // Full SIMD groups first ...
    int idx = 0;
    while (idx < (cur->size - MY_SIMD_WIDTH + 1)) {
      process_simd(cur, next, idx, *_depth, *player, pos);
      idx += MY_SIMD_WIDTH;
    }

    // ... then the leftover boards one at a time.
    for (; idx < cur->size; idx++) {
      process_point(cur, next, idx, *_depth, *player, pos);
    }
  }

  // Free old stack space
  if (!g_is_partial) {
    if (*_depth == 0) {
      delete g_initial_block;
    } else {
      _stack->release(*_depth - 1);
    }
  }

  int produced = next->size;

  *_depth += 1;
  *player = 3 - *player;

  const bool keep_expanding =
      produced > 0 && produced < _expandSize / g_ncheck * expand_condition;
  if (keep_expanding) {
    _stack->get(*_depth)->block = next;
    minimax_expand_bf(_stack, _depth, player);
    return;
  }

  // Reached the buffer size, or finished all evaluation.
  if (!dynamic_reexpand_count) { // only print for the first time
    cout << "This is the max block buffer size for dfs: " << produced << endl;
  }

  if (produced) {
    _stack->get(*_depth)->block = next;
    for (int move = 1; move <= g_ncheck; ++move) {
      minimax_block(_stack, *_depth, move, *player);
    }
  }
}

/*
 * Depth-first step, used to bound memory consumption.
 *
 * Expands the block at level _depth with the single move "player plays at
 * pos" and recurses into the resulting block for positions 1..g_ncheck.
 *
 * Returns 1 when the caller should stop iterating over its remaining
 * positions: either depth 12 was reached, or the block was small enough that
 * it was handed to minimax_expand_bf, which covers all positions itself.
 * Returns 0 otherwise.
 */
int minimax_block(_BlockStack* _stack, int _depth, int pos, int player){
  if (_depth == 12) return 1;

  _BlockSet *level = _stack->get(_depth);
  _Block *cur = level->block;
  _Block *next = &level->nextBlock0;
  next->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(cur->size, _depth);
#endif

  int cur_size = cur->size;
  const bool too_small = cur_size < _expandSize / g_ncheck * expand_condition;
  if (too_small) { // Do dynamic reexpansion
    dynamic_reexpand_count++;
    g_is_partial = 1;
    minimax_expand_bf(_stack, &_depth, &player);
    return 1;
  }

  // Full SIMD groups first ...
  int idx = 0;
  while (idx < (cur->size - MY_SIMD_WIDTH + 1)) {
    process_simd(cur, next, idx, _depth, player, pos);
    idx += MY_SIMD_WIDTH;
  }

  // ... then the leftover boards one at a time.
  for (; idx < cur->size; idx++) {
    process_point(cur, next, idx, _depth, player, pos);
  }

  if (next->size > 0) {
    _stack->get(_depth + 1)->block = next;
    for (int move = 1; move <= g_ncheck; move++) {
      const int stop = minimax_block(_stack, _depth + 1, move, 3 - player);
      if (stop) break;
    }
  }

  return 0;
}

/*Benchmark entrance called by harness*/
int app_main(int argc, char** argv)
{
  int pos, input, player=1,chk=0,ret=0;
  node_t board[BOARD_SIZE*BOARD_SIZE+1];
  init_board(board);
  print_board(board);

  if (argc < 1) {
    g_ncheck = 12;
    printf("Checking for 12-way, and block size is using default %d\n", D_MAX_BLOCK_SIZE);
  }

  if (argc >= 1 ) g_ncheck = atoi(argv[0]);
  if (argc == 2) {
    _expandSize = pow(2.0, atoi(argv[1]));
    if (atoi(argv[1]) == 6) expand_condition = 1;
    else expand_condition = 2;
  }

  Harness::start_timing();
  for(pos = 1; pos <= (BOARD_SIZE*BOARD_SIZE); pos++)
  {
    if(pos == 1) input = 5;
    else input = ret;

    if (pos % 2 != 0) board[input] = 'X';
    else board[input] = 'O';

    /*Print the board after each turn*/
    printf("\n****Board:********\n");
    print_board(board);
    /*Check if game is over*/
    chk = check_win(board);

    if (chk == 1)
    {
      printf("Player X wins!\n");
      break;
    }
    else if (chk == -1)
    {
      printf("Player O wins!\n");
      break;
    }
    /*Compute next best move*/
    else if ((chk == 0) && (pos != BOARD_SIZE * BOARD_SIZE))
    {
      memset(pos_weights,0,sizeof(pos_weights));

      player = 1 + pos % 2;
      int l_player = player;

      //Initialize software block stack
      cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
      _Block::max_block = _expandSize;
      _Block::n = POS_SIZE;
      Harness::set_block_size(_expandSize);
      class _BlockStack * _stack = new _BlockStack;
      class _Block * _block = new _Block;
      g_initial_block = _block;

      _block->add(board, POS_SIZE, 0);
      int _depth = 0;
      _stack->get(_depth)->block = _block;

      //Start to execute blocked minmax 
      if (_expandSize >= g_ncheck)
        minimax_expand_bf(_stack, &_depth, &l_player);
      else{
        int df_block_size = _stack->get(_depth)->block->size;
        cout << "This is the max block buffer size for dfs: " << df_block_size << endl;
        if (df_block_size){
          for (int i = 1; i <= g_ncheck; ++i){
            minimax_block(_stack, _depth, i, l_player);
          }
        }
      }
      print_weights();

      delete _stack;
      if (_expandSize < g_ncheck) delete _block;

    }
    else
    {
      printf("The game is tied!\n");
    }
    ret = update_pos_weigh(player,board);
    if(-1 == ret)
    {
      return -1;
    }
    printf("\nOptimal move for player %d is %d",player,ret);
  }
  Harness::stop_timing();

#ifdef BLOCK_PROFILE
  profiler.output();
#endif

  return 0;
}
