/**********************************************************************************************/
/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
/**********************************************************************************************/
/*
 * Copyright (c) 2007 The Unbalanced Tree Search (UTS) Project Team:
 * -----------------------------------------------------------------
 *
 *  This file is part of the unbalanced tree search benchmark.  This
 *  project is licensed under the MIT Open Source license.  See the LICENSE
 *  file for copyright and licensing information.
 *
 *  UTS is a collaborative project between researchers at the University of
 *  Maryland, the University of North Carolina at Chapel Hill, and the Ohio
 *  State University.
 *
 * University of Maryland:
 *   Chau-Wen Tseng(1)  <tseng at cs.umd.edu>
 *
 * University of North Carolina, Chapel Hill:
 *   Jun Huan         <huan,
 *   Jinze Liu         liu,
 *   Stephen Olivier   olivier,
 *   Jan Prins*        prins at cs.umd.edu>
 *
 * The Ohio State University:
 *   James Dinan      <dinan,
 *   Gerald Sabin      sabin,
 *   P. Sadayappan*    saday at cse.ohio-state.edu>
 *
 * Supercomputing Research Center
 *   D. Pryor
 *
 * (1) - indicates project PI
 *
 * UTS Recursive Depth-First Search (DFS) version developed by James Dinan
 *
 * Adapted for OpenMP 3.0 Task-based version by Stephen Olivier
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */
/*******************************************************************/
/* This code is for PLDI-15 Artifact Evaluation only               */ 
/* and will be released with further copyright information         */ 
/* File: Seq block w reexpand of uts derived from above benchmark  */
/*******************************************************************/

#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <algorithm>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <iostream>
#include <utility>
#include <vector>

#include "harness.h"
#include "block.h"
#include "uts.h"

// Optional instrumentation; each facility is compiled in only when its macro is defined.
#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
BlockProfiler profiler(4);// simd utilization profiler, simd width is set as 4
#endif
#ifdef PARALLELISM_PROFILE
#include "parallelismprofiler.h"
// Allocated by app_main before the traversal and deleted there after output().
ParallelismProfiler *parallelismProfiler;
#endif

#ifdef TRACK_TRAVERSALS
// Incremented once per call of serTreeSearch_expand_bf / serTreeSearch_block;
// printed by app_main after the run.
uint64_t work = 0;
#endif

//Add for blocking
//int _expandDepth = 0;
// Block buffer size. app_main overwrites it with its second argument when one
// is given; serial_uts installs it as _Block::max_block and as the Harness
// block size, and starts with breadth-first expansion when it is >= 2000.
int _expandSize = D_MAX_BLOCK_SIZE;
// Number of times serTreeSearch_block handed a small block back to
// serTreeSearch_expand_bf; printed by app_main.
int dynamic_reexpand_count = 0;
// Block that holds the root node, created by serial_uts. It is deleted either
// by serTreeSearch_expand_bf (at depth 0 while g_is_partial is 0) or by
// serial_uts itself when _expandSize < 2000.
_Block * g_initial_block = NULL;
// Set to 1 by serTreeSearch_block on a dynamic re-expansion and never reset in
// this file; while it is set, serTreeSearch_expand_bf does not free the
// initial block or release the previous stack level.
int g_is_partial = 0;


/***********************************************************
 *  Global state                                           *
 ***********************************************************/
// Total number of visited nodes; app_main stores the result of serial_uts here.
unsigned long long bots_number_of_tasks = 0; /* forcing 8 bytes size in -m32 and -m64 */

// Both values are printed by uts_show_stats; nothing in this file assigns
// them after their initialisation here.
unsigned long long nLeaves = 0;
int maxTreeDepth = 0;
/***********************************************************
 * Tree generation strategy is controlled via various      *
 * parameters read from the input file.  The parameters    *
 * and their default values are given below.               *
 * Trees are generated using a Galton-Watson process, in   *
 * which the branching factor of each node is a random     *
 * variable.                                               *
 *                                                         *
 * The random variable follows a binomial distribution.    *
 ***********************************************************/
double b_0   = 4.0; // default branching factor at the root
int   rootId = 0;   // default seed for RNG state at root
/***********************************************************
 *  The branching factor at the root is specified by b_0.
 *  The branching factor below the root follows an
 *     identical binomial distribution at all nodes.
 *  A node has m children with prob q, or no children with
 *     prob (1-q).  The expected branching factor is q * m.
 *
 *  Default parameter values
 ***********************************************************/
int    nonLeafBF   = 4;            // m
double nonLeafProb = 15.0 / 64.0;  // q
/***********************************************************
 * compute granularity - number of rng evaluations per
 * tree node (uts_read_file clamps it to at least 1)
 ***********************************************************/
int computeGranularity = 1;
/***********************************************************
 * expected results for execution, read from the input file
 * by uts_read_file.  Only exp_tree_size is checked in this
 * file (see uts_check_result).
 ***********************************************************/
unsigned long long  exp_tree_size = 0;
int        exp_tree_depth = 0;
unsigned long long  exp_num_leaves = 0;
/***********************************************************
 *  FUNCTIONS                                              *
 ***********************************************************/

// Forward declarations: the two traversal routines call each other
// (expand_bf hands a sufficiently large block to the depth-first routine,
// which hands small blocks back for re-expansion).
void serTreeSearch_expand_bf(int* _depth, unsigned long long* subtreesize, _BlockStack * _stack, Node** release_parents, int release_size);
int serTreeSearch_block(int _depth, unsigned long long* subtreesize, _BlockStack * _stack);


/*
 * Initialise the root of the tree: seed its RNG state from the global
 * rootId, place it at height 0 and flag its child count as unknown.
 */
void uts_initRoot(Node * root)
{
  Node & top = *root;

  rng_init(top.state.state, rootId);
  top.numChildren = -1;   // sentinel: child count has not been computed yet
  top.height = 0;
}

/*
 * Compute how many children `node` has, store that count in
 * node->numChildren and return it.
 *
 * The root (height 0) gets floor(b_0) children and is exempt from the
 * MAXNUMCHILDREN cap.  Any other node draws one value from its RNG state and
 * gets nonLeafBF children when the derived probability is below nonLeafProb,
 * otherwise none; that count is capped at MAXNUMCHILDREN.
 */
int uts_numChildren(Node *node)
{
  int count;

  if (node->height != 0)
  {
    // same distribution for every node below the root
    int    draw = rng_rand(node->state.state);
    double prob = rng_toProb(draw);

    count = 0;
    if (prob < nonLeafProb) count = nonLeafBF;

    // cap applies to non-root nodes only
    if (count > MAXNUMCHILDREN) {
      bots_debug("*** Number of children truncated from %d to %d\n", count, MAXNUMCHILDREN);
      count = MAXNUMCHILDREN;
    }
  }
  else
  {
    count = (int) floor(b_0);
  }

  /* remember the result inside the node */
  node->numChildren = count;

  return count;
}

/*
 * Run the blocked traversal starting from `root` and return the number of
 * visited nodes.
 *
 * A block stack and an initial block holding only the root are created.
 * With a buffer size of at least 2000 the traversal starts with breadth-first
 * expansion; otherwise it goes straight to the depth-first routine.
 */
unsigned long long serial_uts (Node *root){
  unsigned long long visited = 0;

  // Configure the software block buffers
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);

  _BlockStack * blockStack = new _BlockStack;
  _Block * rootBlock = new _Block;
  g_initial_block = rootBlock;

  rootBlock->add(root);
  int level = 0;
  blockStack->get (level) -> block = rootBlock;

  // Pick the traversal entry point from the configured buffer size
  const bool expandFirst = (_expandSize >= 2000);
  if (!expandFirst){
    int df_block_size = blockStack->get(level)->block->size;
    cout << "This is the max block buffer size for dfs: " << df_block_size << endl;

    if (df_block_size != 0){
      serTreeSearch_block(level, &visited, blockStack);
    }
  }
  else{
#ifdef BLOCK_PROFILE
    profiler.record_bef_exp_size(0, 1);
    profiler.record_aft_exp_size(0, 1);
#ifdef INCLUSIVE
    profiler.record_w_wo_exp_ratio(0, 1);
#endif
#endif
    serTreeSearch_expand_bf(&level, &visited, blockStack, NULL, 0);
  }

  delete blockStack;
  // On the expansion path the initial block is not freed here (see g_initial_block)
  if (!expandFirst) delete rootBlock;

  bots_message(" completed!\n");
  return visited;
}

/*
 * Divisor applied to _expandSize to obtain the block-size threshold at or
 * below which a block keeps being (re-)expanded: 2 for buffers larger than
 * 128 entries, 8 otherwise.
 */
inline int get_expand_condition(){
  if (_expandSize > 128) {
    return 2;
  }
  return 8;
}

/*Breadth First execution to expand the number of tasks in software block*/
void serTreeSearch_expand_bf(int* _depth, unsigned long long* subtreesize, _BlockStack * _stack, Node** release_parents, int release_size){
#ifdef TRACK_TRAVERSALS
  work++;
#endif

  class _BlockSet *_set = _stack ->  get (*_depth);
  class _Block *_block = _set -> block;
  class _Block *_nextBlock0 = &_set -> _BlockSet::nextBlock0;

#ifdef BLOCK_PROFILE
  profiler.record(_block->size, *_depth);

  if (dynamic_reexpand_count) 
  {
    profiler.record_reexpansion(*_depth);
  }
#endif

  Node** nodes = new Node*[_block->size]();
  std::pair<int, Node*>* numChildren_Node_Pair = new std::pair<int, Node*>[_block->size]();
  int max_numChildren = 0;

  //Start: Reorder _block according to number of children
  for (int _bi = 0; _bi < _block->size; _bi++){
    class _Point & _point = _block->get(_bi);
    Node* parent = _point.node;

    int numChildren = uts_numChildren(parent);
    numChildren_Node_Pair[_bi].first = numChildren;
    numChildren_Node_Pair[_bi].second = parent;
    max_numChildren = max(max_numChildren, numChildren);
  }

  sort(numChildren_Node_Pair, numChildren_Node_Pair + _block->size, std::greater<std::pair<int, Node*> >());
  //End: Reorder

  for (int _bi = 0; _bi < _block->size; _bi++){
    *subtreesize += 1;
    int numChildren = numChildren_Node_Pair[_bi].first;
    Node* parent = numChildren_Node_Pair[_bi].second;
    if (!numChildren){
#ifdef PARALLELISM_PROFILE
      parallelismProfiler->recordTruncate();
#endif
      continue;
    }

#ifdef PARALLELISM_PROFILE
    parallelismProfiler->recordRecurse();
#endif

    nodes[_bi] = new Node[numChildren];
    for (int i = 0; i < numChildren; i++){
      nodes[_bi][i].height = parent->height + 1;
      for (int j = 0; j < computeGranularity; j++) {
        rng_spawn(parent->state.state, nodes[_bi][i].state.state, i);
      }
    }
  }


#ifdef BLOCK_PROFILE
  int o_num_child = max_numChildren;
  int c_bef_exp_size[o_num_child];
  for (int i = 0; i < o_num_child; ++i){
    c_bef_exp_size[i] = 0;
  }
  int max_c_bef_exp_size = 0;
  int n_num_child = o_num_child;
#endif  

  _nextBlock0 ->  recycle ();
  for (int n_pos = 0; n_pos < max_numChildren; ++n_pos){
    for (int _bi = 0; _bi < _block->size; _bi++){
      int numChildren = numChildren_Node_Pair[_bi].first;
      if (n_pos < numChildren){
        _nextBlock0->add(&nodes[_bi][n_pos]);
      }
    }
#ifdef BLOCK_PROFILE
    c_bef_exp_size[n_pos] = _nextBlock0->size;
    for (int j = n_pos-1; j >= 0; --j){
      c_bef_exp_size[n_pos] -= c_bef_exp_size[j];
    }
    max_c_bef_exp_size += c_bef_exp_size[n_pos];
#endif

  }

  //Delete Parents nodes
  for (int i = 0; i < release_size; ++i){
    if (release_parents[i]) delete release_parents[i];
  }
  if (release_parents) delete release_parents;

  //Delete tmp structure
  delete numChildren_Node_Pair;

  //Free old stack space
  int old_block_size = _block->size;
  if (!g_is_partial){
    if (!*_depth){
      delete g_initial_block;
    } else
    {
      _stack->release(*_depth - 1);
    }
  }

  int _nextblock0_size = _nextBlock0 -> _Block::size;
  *_depth += 1;

#ifdef BLOCK_PROFILE
  if (dynamic_reexpand_count == 0){
    if(_nextblock0_size) {
      profiler.record_bef_exp_size(*_depth, _nextblock0_size);
#ifdef INCLUSIVE
      profiler.record_w_wo_exp_ratio(*_depth, 1);	
#endif
    }
  } else{
    for (int i = 0; i < o_num_child; ++i){
      if (c_bef_exp_size[i]) profiler.record_bef_exp_size(*_depth, c_bef_exp_size[i]);
    }
    if(_nextblock0_size) profiler.record_w_wo_exp_ratio(*_depth, ((double)_nextblock0_size) / ((double)max_c_bef_exp_size / max_numChildren));	
  }
  if(_nextblock0_size) {
    profiler.record_aft_exp_size(*_depth, _nextblock0_size);
  }
#endif

  if (_nextblock0_size > 0 &&  _nextblock0_size <= _expandSize / get_expand_condition()) { //refine the stop-criteria
    _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
    serTreeSearch_expand_bf(_depth, subtreesize, _stack, nodes, old_block_size);
  } else{//Reach the buffer size, or finish all evaluation
    if (!dynamic_reexpand_count){
      cout << "This is the max block buffer size for dfs: " << _nextblock0_size << endl;
    }

    if (_nextblock0_size){
      _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
      serTreeSearch_block(*_depth, subtreesize, _stack);
    }
  }
#ifdef PARALLELISM_PROFILE
  parallelismProfiler->blockEnd();
#endif

}

/*Depth First execution of children to limit the memory consumption*/
int serTreeSearch_block(int _depth, unsigned long long* subtreesize, _BlockStack * _stack){
#ifdef TRACK_TRAVERSALS
  work++;
#endif

  class _BlockSet *_set = _stack ->  get (_depth);
  class _Block *_block = _set -> block;
  class _Block *_nextBlock0 = &_set -> _BlockSet::nextBlock0;


  int _block_size = _block->size;
  if (_depth > 0 && _block_size <= _expandSize / get_expand_condition ()){//Do dynamic reexpansion
    dynamic_reexpand_count++;
    g_is_partial = 1;
    serTreeSearch_expand_bf(&_depth, subtreesize, _stack, NULL, 0);
    return 1;
  } else {
#ifdef BLOCK_PROFILE
    profiler.record(_block->size, _depth);
#endif
    Node** nodes = new Node*[_block->size];
    std::pair<int, Node*> numChildren_Node_Pair[_block->size];
    int max_numChildren = 0;

    //Start: Reorder _block according to number of children
    for (int _bi = 0; _bi < _block->size; _bi++){
      class _Point & _point = _block->get(_bi);
      Node* parent = _point.node;

      int numChildren = uts_numChildren(parent);
      numChildren_Node_Pair[_bi].first = numChildren;
      numChildren_Node_Pair[_bi].second = parent;
      max_numChildren = max(max_numChildren, numChildren);
    }

    sort(numChildren_Node_Pair, numChildren_Node_Pair + _block->size, std::greater<std::pair<int, Node*> >());
    //End: Reorder

    for (int _bi = 0; _bi < _block->size; _bi++){
      *subtreesize += 1;
      int numChildren = numChildren_Node_Pair[_bi].first;
      Node* parent = numChildren_Node_Pair[_bi].second;
      if (!numChildren) {
#ifdef PARALLELISM_PROFILE
        parallelismProfiler->recordTruncate();
#endif
        continue;
      }

#ifdef PARALLELISM_PROFILE
      parallelismProfiler->recordRecurse();
#endif

      nodes[_bi] = new Node[numChildren];
      for (int i = 0; i < numChildren; i++){
        nodes[_bi][i].height = parent->height + 1;
        for (int j = 0; j < computeGranularity; j++) {
          rng_spawn(parent->state.state, nodes[_bi][i].state.state, i);
        }
      }
    }

    for (int n_pos = 0; n_pos < max_numChildren; ++n_pos){
      _nextBlock0 ->  recycle ();
      for (int _bi = 0; _bi < _block->size; _bi++){
        int numChildren = numChildren_Node_Pair[_bi].first;
        if (n_pos < numChildren){
          _nextBlock0->add(&nodes[_bi][n_pos]);
        }
      }

      int skip = 0;
      if (_nextBlock0 -> _Block::size > 0) {
        _stack ->  get (_depth + 1) -> _BlockSet::block = _nextBlock0;
#ifdef BLOCK_PROFILE
        profiler.record_bef_exp_size(_depth + 1, _nextBlock0->size);
        profiler.record_aft_exp_size(_depth + 1, _nextBlock0->size);
#ifdef INCLUSIVE
        profiler.record_w_wo_exp_ratio(_depth + 1, 1);	
#endif
#endif
        skip = serTreeSearch_block(_depth + 1, subtreesize, _stack);
      }
      //if (skip) break;
    }

    for (int _bi = 0; _bi < _block->size; _bi++){
      if (numChildren_Node_Pair[_bi].first) delete [] nodes[_bi];
    }
    delete [] nodes;

  }

#ifdef PARALLELISM_PROFILE
  parallelismProfiler->blockEnd();
#endif

  return 0;
}

/*
 * Load the tree parameters and the expected results from `filename` into the
 * corresponding globals and print the resulting configuration.
 *
 * The file is expected to hold eight whitespace-separated values, in order:
 * b_0, nonLeafProb, nonLeafBF, rootId, computeGranularity, exp_tree_size,
 * exp_tree_depth, exp_num_leaves.  When fewer values can be parsed a warning
 * is printed and the remaining globals keep their current values.
 *
 * Terminates the program with exit(-1) when the file cannot be opened.
 */
void uts_read_file ( char *filename )
{
  FILE *fin;

  if ((fin = fopen(filename, "r")) == NULL) {
    bots_message( "Could not open input file (%s)\n", filename);
    exit (-1);
  }

  const int expected_fields = 8;
  int parsed_fields = fscanf(fin,"%lf %lf %d %d %d %llu %d %llu",
         &b_0,
         &nonLeafProb,
         &nonLeafBF,
         &rootId,
         &computeGranularity,
         &exp_tree_size,
         &exp_tree_depth,
         &exp_num_leaves
        );
  fclose(fin);

  // fscanf reports EOF (a negative value) when nothing at all could be read
  if (parsed_fields < 0) parsed_fields = 0;
  if (parsed_fields != expected_fields) {
    bots_message("Warning: only %d of %d values could be read from input file (%s); remaining parameters keep their current values\n", parsed_fields, expected_fields, filename);
  }

  // at least one rng evaluation per node
  computeGranularity = max(1,computeGranularity);

  // Printing input data
  bots_message("\n");
  bots_message("Root branching factor                = %f\n", b_0);
  bots_message("Root seed (0 <= 2^31)                = %d\n", rootId);
  bots_message("Probability of non-leaf node         = %f\n", nonLeafProb);
  bots_message("Number of children for non-leaf node = %d\n", nonLeafBF);
  bots_message("E(n)                                 = %f\n", (double) ( nonLeafProb * nonLeafBF ) );
  bots_message("E(s)                                 = %f\n", (double) ( 1.0 / (1.0 - nonLeafProb * nonLeafBF) ) );
  bots_message("Compute granularity                  = %d\n", computeGranularity);
  bots_message("Random number generator              = "); rng_showtype();
}

/*
 * Print the run summary: tree size, maximum depth, chunk size (always
 * reported as 0 here) and the leaf count with its share of the tree size.
 */
void uts_show_stats( void )
{
  const int reportedChunk = 0;

  bots_message("\n");
  bots_message("Tree size                            = %llu\n", static_cast<unsigned long long>(bots_number_of_tasks) );
  bots_message("Maximum tree depth                   = %d\n", maxTreeDepth );
  bots_message("Chunk size                           = %d\n", reportedChunk );
  // leaf share is computed in single precision, as a percentage
  bots_message("Number of leaves                     = %llu (%.2f%%)\n", nLeaves, nLeaves / static_cast<float>(bots_number_of_tasks) * 100.0 );
}

/*
 * Compare the measured tree size against the expected one from the input
 * file.  Returns 1 on a match; otherwise reports the mismatch and returns 0.
 */
int uts_check_result ( void )
{
  if ( bots_number_of_tasks == exp_tree_size ) {
    return 1;
  }

  bots_message("Incorrect tree size result (%llu instead of %llu).\n", bots_number_of_tasks, exp_tree_size);
  return 0;
}

// Path of the input file, copied from the first argument of app_main.
char bots_arg_file[255]="";

/*
 * Benchmark entrance called by harness.
 *
 * argv[0] is the input file and the optional argv[1] is the block buffer
 * size (_expandSize), so argc is 1 or 2 -- presumably the harness has already
 * removed the program name; confirm against the harness.
 *
 * Reads the input file, runs the timed traversal from a fresh root, prints
 * the enabled profiling output and the statistics, and returns 0.  Exits
 * early on a wrong argument count or on an input path that does not fit into
 * bots_arg_file.
 */
int app_main(int argc, char** argv){
  if (argc != 1 && argc != 2){
    printf("Usage: uts [inputfile] or uts [inputfile] [buffer_size]\n");
    exit(0);
  }

  // The path comes from the command line: refuse anything that would
  // overflow the fixed-size buffer instead of copying it blindly.
  if (strlen(argv[0]) >= sizeof(bots_arg_file)){
    printf("Input file name is too long (at most %d characters are supported)\n", static_cast<int>(sizeof(bots_arg_file) - 1));
    exit(-1);
  }
  strcpy(bots_arg_file, argv[0]);
  if (argc == 2) _expandSize = atoi(argv[1]);

#ifdef PARALLELISM_PROFILE
  parallelismProfiler = new ParallelismProfiler;
#endif
  //_expandDepth = Harness::get_splice_depth();

  Node root;
  uts_read_file(bots_arg_file);

  // Only root initialisation and the traversal itself are timed
  Harness::start_timing();
  uts_initRoot(&root);
  bots_number_of_tasks = serial_uts(&root);
  Harness::stop_timing();

#ifdef BLOCK_PROFILE
  profiler.output();
#ifdef EXPAND_PROFILE
  profiler.outputReexpandInfo();
#endif
#endif

#ifdef PARALLELISM_PROFILE
  parallelismProfiler->output();
  delete parallelismProfiler;
#endif

#ifdef TRACK_TRAVERSALS
  cout << "work: " << work << endl;
#endif

  uts_show_stats();
  if (uts_check_result()){
    printf("Successfully exit!\n");
  }

  printf("This is the dynamic reexpand counter: %d\n", dynamic_reexpand_count);
#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is the total number of new operations for block: " << total_malloc << endl;
#endif


  return 0;

}
