/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: MIC block wo reexpansion of knapsack             */
/**********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "getoptions.h"
#include <string.h>
#include <algorithm>
#include <iostream>
#include <fstream>

#include "harness.h"
#include "block-mic.h"
#include "simd.h"

#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
BlockProfiler profiler(16);
#endif

/* Number of calls made to the recursive knapsack() reference routine;
 * printed by app_main when TRACK_TRAVERSALS is defined. */
#ifdef TRACK_TRAVERSALS
uint64_t work = 0;
#endif

//int _expandDepth = 0;
/* Maximum block buffer size. Defaults to D_MAX_BLOCK_SIZE; app_main replaces
 * it with 2^b when the -b option is given. Breadth-first expansion keeps
 * recursing only while the next block holds at most _expandSize / 2 entries. */
long long _expandSize = D_MAX_BLOCK_SIZE;
/* The first block, allocated by app_main. knapsack_expand_bf deletes it once
 * depth 0 has been expanded; otherwise app_main deletes it itself. */
_Block * g_initial_block = NULL;

using namespace std;

/* capacity of the fixed-size item array that app_main declares on its stack */
#define MAX_ITEMS 256

/* every item in the knapsack has a weight and a value */
struct item {
  int value;   /* read first from each "value weight" pair of the input file */
  int weight;  /* subtracted from the remaining capacity when the item is taken */
};

/* NOTE(review): initialised to MY_MIN but not referenced anywhere else in the
 * visible part of this file -- possibly a leftover; confirm before removing. */
int best_so_far = MY_MIN;

/*
 * Ordering predicate used to sort items by decreasing value/weight ratio.
 * Returns -1 when `a` has the larger ratio, 1 when `b` has the larger ratio
 * and 0 otherwise.
 */
int compare(struct item *a, struct item *b)
{
  const double ratio_a = (double) a->value / a->weight;
  const double ratio_b = (double) b->value / b->weight;
  const double gap = ratio_a - ratio_b;

  if (gap > 0)
    return -1;              /* a is denser: it sorts first */
  return (gap < 0) ? 1 : 0;
}

int read_input(const char *filename, struct item *items, int *capacity, int *n)
{
  int i;
  FILE *f;

  if (filename == NULL)
    filename = "\0";
  f = fopen(filename, "r");
  if (f == NULL) {
    fprintf(stderr, "open_input(\"%s\") failed\n", filename);
    return -1;
  }
  /* format of the input: #items capacity value1 weight1 ... */
  fscanf(f, "%d", n);
  fscanf(f, "%d", capacity);

  for (i = 0; i < *n; ++i)
    fscanf(f, "%d %d", &items[i].value, &items[i].weight);

  fclose(f);

  /* sort the items on decreasing order of value/weight */
  /* cilk2c is fascist in dealing with pointers, whence the ugly cast */
  qsort(items, *n, sizeof(struct item),
        (int (*)(const void *, const void *)) compare);

  return 0;
}

/*
 * Plain recursive search, written in the pseudo tail recursive form that
 * matches our language spec.
 *
 *   e       - first of the n items that are still undecided
 *   c       - remaining capacity
 *   n       - number of undecided items
 *   v       - value collected so far
 *   max_ret - running maximum, raised in place whenever a leaf beats it
 */
void knapsack(struct item *e, int c, int n, int v, int* max_ret)
{
#ifdef TRACK_TRAVERSALS
  work++;
#endif
#ifdef BLOCK_PROFILE
  profiler.record_single();
#endif

  /* leaf: overfull knapsack (c < 0), exactly full knapsack, or no items left */
  if (c < 0 || n == 0 || c == 0) {
    /* an overfull knapsack only offers MY_MIN; a feasible one offers v */
    const int candidate = (c < 0) ? MY_MIN : v;
    *max_ret = max(*max_ret, candidate);
    return;
  }

  struct item *rest = e + 1;
  const int remaining = n - 1;

  /* first explore the subtree that leaves the current item out ... */
  knapsack(rest, c, remaining, v, max_ret);

  /* ... then the subtree that puts the current item into the knapsack */
  knapsack(rest, c - e->weight, remaining, v + e->value, max_ret);
}

/* Forward declaration: knapsack0 calls knapsack1, which is defined after it. */
void knapsack1(struct item * e, int n, _BlockStack* _stack, int* max_ret, int _depth);

/*
 * Sequential processing of a single left-child entry (c0/v0) of `_block`.
 * Callers use it for the entries left over after the SIMD loop.
 *
 *   e           - item decided at this level
 *   n           - number of items still undecided
 *   _block      - block being consumed
 *   _nextBlock0 - block that receives the two children of a non-leaf entry
 *   _bi         - index of the entry inside `_block`
 *   max_ret     - running maximum, raised in place by leaf entries
 */
inline void process_point0(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _bi, int* max_ret){
  int cap = _block->c0[_bi];
  int val = _block->v0[_bi];

  /* leaf: overfull knapsack, exactly full knapsack, or no items left */
  if (cap < 0 || n == 0 || cap == 0){
    const int candidate = (cap < 0) ? MY_MIN : val;
    *max_ret = max(*max_ret, candidate);
    return;
  }

  /* non-leaf: queue "skip e" as (cap, val) and "take e" as the adjusted pair */
  _nextBlock0->add(cap, val, cap - e->weight, val + e->value);
}

/*
 * Sequential processing of a single right-child entry (c1/v1) of `_block`.
 * Callers use it for the entries left over after the SIMD loop.
 *
 *   e           - item decided at this level
 *   n           - number of items still undecided
 *   _block      - block being consumed
 *   _nextBlock0 - block that receives the two children of a non-leaf entry
 *   _bi         - index of the entry inside `_block`
 *   max_ret     - running maximum, raised in place by leaf entries
 */
inline void process_point1(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _bi, int* max_ret){
  int cap = _block->c1[_bi];
  int val = _block->v1[_bi];

  /* leaf: overfull knapsack, exactly full knapsack, or no items left */
  if (cap < 0 || n == 0 || cap == 0){
    const int candidate = (cap < 0) ? MY_MIN : val;
    *max_ret = max(*max_ret, candidate);
    return;
  }

  /* non-leaf: queue "skip e" as (cap, val) and "take e" as the adjusted pair */
  _nextBlock0->add(cap, val, cap - e->weight, val + e->value);
}

/*
 * SIMD processing of one vector of left-child entries (c0/v0) of `_block`,
 * starting at index `_si`.
 *
 * Lane by lane this does the same work as process_point0(): a lane that is a
 * leaf (c < 0, c == 0, or n == 0) only contributes to the running maximum;
 * every other lane is appended to `_nextBlock0` as two children -- (c, v) into
 * c0/v0 and (c - e->weight, v + e->value) into c1/v1. Surviving lanes are
 * packed into consecutive slots starting at `_nextBlock0->size`, and that size
 * is then advanced by the number of survivors. `*max_ret` is overwritten with
 * the maximum over all lanes, each of which starts from the incoming value.
 *
 * `n` is a scalar shared by the whole block, so it is broadcast, not loaded.
 *
 * NOTE(review): the loads are aligned 512-bit loads; presumably c0/v0 are
 * 64-byte aligned and `_si` is a multiple of the vector width -- confirm
 * against the _Block definition.
 * NOTE(review): no capacity check on `_nextBlock0` is visible here -- confirm
 * that callers guarantee enough room.
 */
inline void process_simd0_opt(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _si, int* max_ret){
  __m512i vec_c = _mm512_load_epi32((__m512i*)&_block->c0[_si]);
  __m512i vec_v = _mm512_load_epi32((__m512i*)&_block->v0[_si]);
  __m512i vec_n = _mm512_set1_epi32(n);
  __m512i vec_0 = _mm512_setzero_epi32();
  // per-lane running maximum, seeded with the caller's current best
  __m512i vec_max_ret = _mm512_set1_epi32(*max_ret);
  __m512i vec_my_min = _mm512_set1_epi32(MY_MIN);

  // base case I: overfull knapsack (c < 0) -> max_ret = max(max_ret, MY_MIN)
  __mmask16 mask_c_lt_0 = _mm512_cmp_epi32_mask(vec_c, vec_0, _MM_CMPINT_LT);
  __mmask16 mask_mymin_gt_maxret = _mm512_cmp_epi32_mask(vec_my_min, vec_max_ret, _MM_CMPINT_GT);
  __mmask16 mask_assign_mymin = _mm512_kand(mask_c_lt_0, mask_mymin_gt_maxret);
  vec_max_ret = _mm512_mask_or_epi32(vec_max_ret, mask_assign_mymin, vec_my_min, vec_0);//equivalent to a blend operation

  // base case II: full knapsack (c == 0) or no items left (n == 0 with c > 0)
  // -> max_ret = max(max_ret, v)
  __mmask16 mask_c_eq_0 = _mm512_cmp_epi32_mask(vec_c, vec_0, _MM_CMPINT_EQ);
  __mmask16 mask_n_eq_0 = _mm512_cmp_epi32_mask(vec_n, vec_0, _MM_CMPINT_EQ);
  __mmask16 mask_c_gt_0 = _mm512_cmp_epi32_mask(vec_c, vec_0, _MM_CMPINT_GT);
  // restrict the n == 0 case to c > 0 so that lanes with c < 0 stay in base case I
  mask_n_eq_0 = _mm512_kand(mask_n_eq_0, mask_c_gt_0);
  __mmask16 mask_c_or_n_eq_0 = _mm512_kor(mask_c_eq_0, mask_n_eq_0);
  __mmask16 mask_v_gt_maxret = _mm512_cmp_epi32_mask(vec_v, vec_max_ret, _MM_CMPINT_GT);
  __mmask16 mask_assign_v =  _mm512_kand(mask_c_or_n_eq_0, mask_v_gt_maxret);
  vec_max_ret = _mm512_mask_or_epi32(vec_max_ret, mask_assign_v, vec_v, vec_0);//equivalent to a blend operation

  // lanes that hit either base case; the complement is the set of survivors
  __mmask16 mask_is_leaf = _mm512_kor(mask_c_lt_0, mask_c_or_n_eq_0);

#ifdef SEQSC
  // Sequential stream compaction: tmp_non_leaf[i] is 1 for a surviving lane,
  // tmp[i] becomes the exclusive prefix sum, i.e. the lane's output slot.
  __declspec(align(64)) int tmp[MY_SIMD_WIDTH] = {0};
  __declspec(align(64)) int tmp_non_leaf[MY_SIMD_WIDTH] = {0};
  __m512i vec_1 = _mm512_set1_epi32(1);
  _mm512_mask_store_epi32(tmp_non_leaf, ~mask_is_leaf, vec_1);

  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    tmp[i] = tmp[i-1] + tmp_non_leaf[i-1];
  }

  // absolute destination index = current size of the next block + slot
  __m512i vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), _mm512_load_epi32(tmp));

  // child that skips item e: (c, v)
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c0, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v0, ~mask_is_leaf, vec_index, vec_v, 4);

  // child that takes item e: (c - weight, v + value)
  vec_c = _mm512_sub_epi32(vec_c, _mm512_set1_epi32(e->weight));
  vec_v = _mm512_add_epi32(vec_v, _mm512_set1_epi32(e->value));
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c1, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v1, ~mask_is_leaf, vec_index, vec_v, 4);
  // total survivors = exclusive sum at the last lane + the last lane's own flag
  int adv = tmp[MY_SIMD_WIDTH - 1] + tmp_non_leaf[MY_SIMD_WIDTH - 1];
  _nextBlock0->size += adv;

  // fold the per-lane maxima back into the caller's scalar
  *max_ret = _mm512_reduce_max_epi32(vec_max_ret);
#else // no SEQSC
  // Table-driven stream compaction: the 16-bit survivor mask is split into
  // two bytes and each byte selects a row of g_scantable.
  // NOTE(review): presumably entries 0..7 of row m hold the exclusive prefix
  // count of the set bits of m and entries 8..15 are all-ones (the AND below
  // relies on that) -- confirm against the g_scantable definition.
  unsigned short low_8, high_8;
  low_8 = (~mask_is_leaf) & 0x00FF;
  high_8 = ((~mask_is_leaf) & 0xFF00) >> 8;

  __m512i vec_index = _mm512_load_epi32(g_scantable[low_8]);
  __m512i vec_index_1 = _mm512_load_epi32(g_scantable[high_8]);

  // offset for the upper eight lanes: row entry [7] plus the survivor bit of
  // lane 7 (presumably the survivor count of the lower byte)
  __m512i vec_index_offset = _mm512_set1_epi32(g_scantable[low_8][7] + (((~mask_is_leaf) >> 7) & 0x1));

  // add the offset to entries 0..7 of the high-byte row, swap the 256-bit
  // halves so those entries line up with lanes 8..15, then merge both rows
  vec_index_1 = _mm512_mask_add_epi32(vec_index_1, 0x00FF, vec_index_1, vec_index_offset);
  vec_index_1 = _mm512_permute4f128_epi32(vec_index_1, _MM_PERM_BADC);
  vec_index = _mm512_and_epi32(vec_index, vec_index_1);

  // absolute destination index = current size of the next block + slot
  vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), vec_index);

  // child that skips item e: (c, v)
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c0, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v0, ~mask_is_leaf, vec_index, vec_v, 4);

  // child that takes item e: (c - weight, v + value)
  vec_c = _mm512_sub_epi32(vec_c, _mm512_set1_epi32(e->weight));
  vec_v = _mm512_add_epi32(vec_v, _mm512_set1_epi32(e->value));
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c1, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v1, ~mask_is_leaf, vec_index, vec_v, 4);

  // entry [7] of each row plus the survivor bits of lanes 7 and 15
  // (presumably the total number of survivors)
  int adv = g_scantable[low_8][7] + g_scantable[high_8][7] + (((~mask_is_leaf) >> 7) & 0x1) + (((~mask_is_leaf) >> 15) & 0x1);
  _nextBlock0->size += adv;

  // fold the per-lane maxima back into the caller's scalar
  *max_ret = _mm512_reduce_max_epi32(vec_max_ret);

#endif // SEQSC
}

/*
 * SIMD processing of one vector of right-child entries (c1/v1) of `_block`,
 * starting at index `_si`.
 *
 * Lane by lane this does the same work as process_point1(): a lane that is a
 * leaf (c < 0, c == 0, or n == 0) only contributes to the running maximum;
 * every other lane is appended to `_nextBlock0` as two children -- (c, v) into
 * c0/v0 and (c - e->weight, v + e->value) into c1/v1. Surviving lanes are
 * packed into consecutive slots starting at `_nextBlock0->size`, and that size
 * is then advanced by the number of survivors. `*max_ret` is overwritten with
 * the maximum over all lanes, each of which starts from the incoming value.
 *
 * `n` is a scalar shared by the whole block, so it is broadcast, not loaded.
 *
 * NOTE(review): the loads are aligned 512-bit loads; presumably c1/v1 are
 * 64-byte aligned and `_si` is a multiple of the vector width -- confirm
 * against the _Block definition.
 * NOTE(review): no capacity check on `_nextBlock0` is visible here -- confirm
 * that callers guarantee enough room.
 */
inline void process_simd1_opt(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _si, int* max_ret){
  __m512i vec_c = _mm512_load_epi32((__m512i*)&_block->c1[_si]);
  __m512i vec_v = _mm512_load_epi32((__m512i*)&_block->v1[_si]);
  __m512i vec_n = _mm512_set1_epi32(n);
  __m512i vec_0 = _mm512_setzero_epi32();
  // per-lane running maximum, seeded with the caller's current best
  __m512i vec_max_ret = _mm512_set1_epi32(*max_ret);
  __m512i vec_my_min = _mm512_set1_epi32(MY_MIN);

  // base case I: overfull knapsack (c < 0) -> max_ret = max(max_ret, MY_MIN)
  __mmask16 mask_c_lt_0 = _mm512_cmp_epi32_mask(vec_c, vec_0, _MM_CMPINT_LT);
  __mmask16 mask_mymin_gt_maxret = _mm512_cmp_epi32_mask(vec_my_min, vec_max_ret, _MM_CMPINT_GT);
  __mmask16 mask_assign_mymin = _mm512_kand(mask_c_lt_0, mask_mymin_gt_maxret);
  vec_max_ret = _mm512_mask_or_epi32(vec_max_ret, mask_assign_mymin, vec_my_min, vec_0);//equivalent to a blend operation

  // base case II: full knapsack (c == 0) or no items left (n == 0 with c > 0)
  // -> max_ret = max(max_ret, v)
  __mmask16 mask_c_eq_0 = _mm512_cmp_epi32_mask(vec_c, vec_0, _MM_CMPINT_EQ);
  __mmask16 mask_n_eq_0 = _mm512_cmp_epi32_mask(vec_n, vec_0, _MM_CMPINT_EQ);
  __mmask16 mask_c_gt_0 = _mm512_cmp_epi32_mask(vec_c, vec_0, _MM_CMPINT_GT);
  // restrict the n == 0 case to c > 0 so that lanes with c < 0 stay in base case I
  mask_n_eq_0 = _mm512_kand(mask_n_eq_0, mask_c_gt_0);
  __mmask16 mask_c_or_n_eq_0 = _mm512_kor(mask_c_eq_0, mask_n_eq_0);
  __mmask16 mask_v_gt_maxret = _mm512_cmp_epi32_mask(vec_v, vec_max_ret, _MM_CMPINT_GT);
  __mmask16 mask_assign_v =  _mm512_kand(mask_c_or_n_eq_0, mask_v_gt_maxret);
  vec_max_ret = _mm512_mask_or_epi32(vec_max_ret, mask_assign_v, vec_v, vec_0);//equivalent to a blend operation

  // lanes that hit either base case; the complement is the set of survivors
  __mmask16 mask_is_leaf = _mm512_kor(mask_c_lt_0, mask_c_or_n_eq_0);

#ifdef SEQSC
  // Sequential stream compaction: tmp_non_leaf[i] is 1 for a surviving lane,
  // tmp[i] becomes the exclusive prefix sum, i.e. the lane's output slot.
  __declspec(align(64)) int tmp[MY_SIMD_WIDTH] = {0};
  __declspec(align(64)) int tmp_non_leaf[MY_SIMD_WIDTH] = {0};
  __m512i vec_1 = _mm512_set1_epi32(1);
  _mm512_mask_store_epi32(tmp_non_leaf, ~mask_is_leaf, vec_1);

  //TODO: Parallel exclusive-scan?
  for (int i = 1; i < MY_SIMD_WIDTH; ++i){
    tmp[i] = tmp[i-1] + tmp_non_leaf[i-1];
  }

  // absolute destination index = current size of the next block + slot
  __m512i vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), _mm512_load_epi32(tmp));

  // child that skips item e: (c, v)
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c0, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v0, ~mask_is_leaf, vec_index, vec_v, 4);

  // child that takes item e: (c - weight, v + value)
  vec_c = _mm512_sub_epi32(vec_c, _mm512_set1_epi32(e->weight));
  vec_v = _mm512_add_epi32(vec_v, _mm512_set1_epi32(e->value));
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c1, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v1, ~mask_is_leaf, vec_index, vec_v, 4);
  // total survivors = exclusive sum at the last lane + the last lane's own flag
  int adv = tmp[MY_SIMD_WIDTH - 1] + tmp_non_leaf[MY_SIMD_WIDTH - 1];
  _nextBlock0->size += adv;

  // fold the per-lane maxima back into the caller's scalar
  *max_ret = _mm512_reduce_max_epi32(vec_max_ret);
#else // no SEQSC
  // Table-driven stream compaction: the 16-bit survivor mask is split into
  // two bytes and each byte selects a row of g_scantable.
  // NOTE(review): presumably entries 0..7 of row m hold the exclusive prefix
  // count of the set bits of m and entries 8..15 are all-ones (the AND below
  // relies on that) -- confirm against the g_scantable definition.
  unsigned short low_8, high_8;
  low_8 = (~mask_is_leaf) & 0x00FF;
  high_8 = ((~mask_is_leaf) & 0xFF00) >> 8;

  __m512i vec_index = _mm512_load_epi32(g_scantable[low_8]);
  __m512i vec_index_1 = _mm512_load_epi32(g_scantable[high_8]);

  // offset for the upper eight lanes: row entry [7] plus the survivor bit of
  // lane 7 (presumably the survivor count of the lower byte)
  __m512i vec_index_offset = _mm512_set1_epi32(g_scantable[low_8][7] + (((~mask_is_leaf) >> 7) & 0x1));

  // add the offset to entries 0..7 of the high-byte row, swap the 256-bit
  // halves so those entries line up with lanes 8..15, then merge both rows
  vec_index_1 = _mm512_mask_add_epi32(vec_index_1, 0x00FF, vec_index_1, vec_index_offset);
  vec_index_1 = _mm512_permute4f128_epi32(vec_index_1, _MM_PERM_BADC);
  vec_index = _mm512_and_epi32(vec_index, vec_index_1);

  // absolute destination index = current size of the next block + slot
  vec_index = _mm512_add_epi32(_mm512_set1_epi32(_nextBlock0->size), vec_index);

  // child that skips item e: (c, v)
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c0, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v0, ~mask_is_leaf, vec_index, vec_v, 4);

  // child that takes item e: (c - weight, v + value)
  vec_c = _mm512_sub_epi32(vec_c, _mm512_set1_epi32(e->weight));
  vec_v = _mm512_add_epi32(vec_v, _mm512_set1_epi32(e->value));
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->c1, ~mask_is_leaf, vec_index, vec_c, 4);
  _mm512_mask_i32scatter_epi32((int*)_nextBlock0->v1, ~mask_is_leaf, vec_index, vec_v, 4);

  // entry [7] of each row plus the survivor bits of lanes 7 and 15
  // (presumably the total number of survivors)
  int adv = g_scantable[low_8][7] + g_scantable[high_8][7] + (((~mask_is_leaf) >> 7) & 0x1) + (((~mask_is_leaf) >> 15) & 0x1);
  _nextBlock0->size += adv;

  // fold the per-lane maxima back into the caller's scalar
  *max_ret = _mm512_reduce_max_epi32(vec_max_ret);

#endif // SEQSC
}


/*
 * Depth-first step over the left-child entries (c0/v0) of the block stored at
 * stack level `_depth`. Children of non-leaf entries are collected in that
 * level's nextBlock0; if any were produced, they become the block of level
 * `_depth + 1` and are processed by knapsack0 followed by knapsack1. Going
 * depth first limits the memory consumption.
 *
 *   e       - item decided at this level
 *   n       - number of items still undecided
 *   _stack  - per-depth block storage
 *   max_ret - running maximum, updated in place
 *   _depth  - stack level holding the block to consume
 */
void knapsack0(struct item * e, int n, _BlockStack* _stack, int* max_ret, int _depth){
  class _BlockSet* level = _stack->get(_depth);
  class _Block* current = level->block;
  class _Block* children = &level->nextBlock0;
  children->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(current->size, _depth);
#endif

  // whole SIMD vectors first
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
    process_simd0_opt(e, n, current, children, pos, max_ret);
    pos += MY_SIMD_WIDTH;
  }

  // then the remaining entries one at a time
  while (pos < current->size) {
    process_point0(e, n, current, children, pos, max_ret);
    ++pos;
  }

  // nothing survived: this subtree is finished
  if (!(children->size > 0))
    return;

  const int child_depth = _depth + 1;
  _stack->get(child_depth)->block = children;
  knapsack0(e + 1, n - 1, _stack, max_ret, child_depth);
  knapsack1(e + 1, n - 1, _stack, max_ret, child_depth);
}

/*
 * Depth-first step over the right-child entries (c1/v1) of the block stored at
 * stack level `_depth`. Children of non-leaf entries are collected in that
 * level's nextBlock0; if any were produced, they become the block of level
 * `_depth + 1` and are processed by knapsack0 followed by knapsack1. Going
 * depth first limits the memory consumption.
 *
 *   e       - item decided at this level
 *   n       - number of items still undecided
 *   _stack  - per-depth block storage
 *   max_ret - running maximum, updated in place
 *   _depth  - stack level holding the block to consume
 */
void knapsack1(struct item * e, int n, _BlockStack* _stack, int* max_ret, int _depth){
  class _BlockSet* level = _stack->get(_depth);
  class _Block* current = level->block;
  class _Block* children = &level->nextBlock0;
  children->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(current->size, _depth);
#endif

  // whole SIMD vectors first
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
    process_simd1_opt(e, n, current, children, pos, max_ret);
    pos += MY_SIMD_WIDTH;
  }

  // then the remaining entries one at a time
  while (pos < current->size) {
    process_point1(e, n, current, children, pos, max_ret);
    ++pos;
  }

  // nothing survived: this subtree is finished
  if (!(children->size > 0))
    return;

  const int child_depth = _depth + 1;
  _stack->get(child_depth)->block = children;
  knapsack0(e + 1, n - 1, _stack, max_ret, child_depth);
  knapsack1(e + 1, n - 1, _stack, max_ret, child_depth);
}

/*
 * Breadth-first expansion used to grow the number of tasks in the software
 * block. Both the left and the right entries of the block at level `*_depth`
 * are expanded into that level's nextBlock0, the consumed block is released,
 * `*_depth` is incremented and the new block is installed at the new level.
 * The expansion recurses while the new block is non-empty and holds at most
 * _expandSize / 2 entries; otherwise it stops and leaves `*_depth` pointing
 * at the level whose block the caller should continue with.
 *
 *   e       - item decided at this level
 *   n       - number of items still undecided
 *   _stack  - per-depth block storage
 *   max_ret - running maximum, updated in place
 *   _depth  - in/out: current stack level
 */
void knapsack_expand_bf(struct item * e, int n, _BlockStack* _stack, int* max_ret, int* _depth){
  class _BlockSet* level = _stack->get(*_depth);
  class _Block* current = level->block;
  class _Block* children = &level->nextBlock0;
  children->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(current->size,*_depth);
#endif

  // left entries: whole SIMD vectors, then the remainder one by one
  int pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
    process_simd0_opt(e, n, current, children, pos, max_ret);
    pos += MY_SIMD_WIDTH;
  }
  while (pos < current->size) {
    process_point0(e, n, current, children, pos, max_ret);
    ++pos;
  }

  // right entries: same two phases, appended to the same output block
  pos = 0;
  while (pos < (current->size - MY_SIMD_WIDTH + 1)) {
    process_simd1_opt(e, n, current, children, pos, max_ret);
    pos += MY_SIMD_WIDTH;
  }
  while (pos < current->size) {
    process_point1(e, n, current, children, pos, max_ret);
    ++pos;
  }

  // the consumed block is no longer needed: level 0 used the initial block,
  // deeper levels free the stack space of the level below
  if (*_depth == 0) {
    delete g_initial_block;
  } else {
    _stack->release(*_depth - 1);
  }

  const int produced = children->size;
  *_depth += 1;

  // the new block is installed whether or not the expansion continues
  _stack->get(*_depth)->block = children;

  // keep expanding only while there is work and the buffer limit is not reached
  if (produced > 0 && produced <= _expandSize / 2) {
    knapsack_expand_bf(e + 1, n - 1, _stack, max_ret, _depth);
  }
}

/*
 * Print the command-line synopsis and a short description of the 0-1 knapsack
 * problem to stderr. Always returns -1 so callers can `return usage();`.
 */
int usage(void)
{
  static const char help_text[] =
      "\nUsage: knapsack [<cilk-options>] [-f filename] [-benchmark] [-h]\n\n"
      "The 0-1-Knapsack is a standard combinatorial optimization problem: ``A\n"
      "thief robbing a store finds n items; the ith item is worth v_i dollars\n"
      "and weighs w_i pounds, where v_i and w_i are integers. He wants to take\n"
      "as valuable a load as possible, but he can carry at most W pounds in\n"
      "his knapsack for some integer W. What items should he take?''\n\n";

  fputs(help_text, stderr);
  return -1;
}

/* Parallel, 0-terminated option tables handed to get_options() by app_main:
 * specifiers[i] is the flag text and opt_types[i] the kind of argument it
 * takes. app_main passes the output locations in the same order:
 * filename (-f), benchmark (-benchmark), help (-h), buf (-b). */
char *specifiers[] = {"-f", "-benchmark", "-h", "-b", 0};
int opt_types[] = {STRINGARG, BENCHMARK, BOOLARG, INTARG ,0};

/*
 * Benchmark entrance called by harness.
 *
 * Options (parsed by get_options):
 *   -f filename   input file (default ../inputs/knapsack-example2.input)
 *   -benchmark    1, 2 or 3 selects one of the bundled example inputs
 *   -b N          sets the block buffer size _expandSize to 2^N
 *   -h            print usage and exit
 *
 * The instance is read with read_input(), expanded breadth first until the
 * block buffer fills up, and then finished depth first. The best value and,
 * depending on the build flags, profiling data are printed.
 *
 * A trivial instance (negative or zero capacity, or no items) has best value
 * 0. It skips the traversal but still goes through the common teardown, so
 * the timer is stopped, all blocks are freed and the result is reported.
 *
 * Returns 0 on success, 1 when the input cannot be read, and the value of
 * usage() when -h is given.
 */
int app_main(int argc, char *argv[])
{
  struct item items[MAX_ITEMS];	/* array of items */
  int n, capacity, sol, benchmark, help, buf;
  char filename[100];

  /* give every option output a defined value in case get_options leaves the
   * slot of an absent option untouched */
  benchmark = 0;
  help = 0;
  buf = 0;
  sol = 0;

  /* standard benchmark options */
  strcpy(filename, "../inputs/knapsack-example2.input");

  get_options(argc, argv, specifiers, opt_types, filename, &benchmark, &help, &buf);
  if (buf) _expandSize = pow(2.0, buf);

  if (help)
    return usage();

  if (benchmark) {
    switch (benchmark) {
      case 1:		/* short benchmark options -- a little work */
        strcpy(filename, "../inputs/knapsack-example1.input");
        break;
      case 2:		/* standard benchmark options */
        strcpy(filename, "../inputs/knapsack-example2.input");
        break;
      case 3:		/* long benchmark options -- a lot of work */
        strcpy(filename, "../inputs/knapsack-example3.input");
        break;
    }
  }
  if (read_input(filename, items, &capacity, &n))
    return 1;

#ifdef PARALLELISM_PROFILE
  parallelismProfiler = new ParallelismProfiler;
#endif

  Harness::start_timing();
  //_expandDepth = Harness::get_splice_depth();

  //Initialize software block stack
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);
  class _BlockStack * _stack = new _BlockStack;
  class _Block * _block = new _Block;
  g_initial_block = _block;

  /* base case: full knapsack or no items. Nothing can be packed, so sol keeps
   * its initial value 0 and the traversal is skipped. */
  const bool trivial = (capacity < 0) || (n <= 0) || (capacity == 0);

  if (!trivial) {
    /* root decision on item 0: skip it (capacity, 0) or take it */
    _block->add(capacity, 0, capacity - items[0].weight, items[0].value);

    int _depth = 0;
    _stack->get (_depth) -> block = _block;

    //Start to execute blocked knapsack
    if (_expandSize >= 2) knapsack_expand_bf(items + 1, n - 1, _stack, &sol, &_depth);
    int df_block_size = _stack->get(_depth)->block->size;
    cout << "This is the max block buffer size for dfs: " << df_block_size << endl;

    /* after _depth breadth-first levels the entries refer to item _depth + 1 */
    if (df_block_size){
      knapsack0(items + _depth + 1, n - _depth - 1, _stack, &sol, _depth);
      knapsack1(items + _depth + 1, n - _depth - 1, _stack, &sol, _depth);
    }
  }

  delete _stack;
  /* knapsack_expand_bf frees the initial block itself; it only runs for a
   * non-trivial instance with _expandSize >= 2 */
  if (trivial || _expandSize < 2) delete _block;
  g_initial_block = NULL;	/* the initial block is gone on every path */

  Harness::stop_timing();
#ifdef BLOCK_PROFILE
  profiler.output();
#endif
#ifdef TRACK_TRAVERSALS
  cout << "work: " << work << endl;
#endif

  printf("\nExample: knapsack\n");
  printf("options: problem-file = %s\n\n", filename);
  printf("Best value is %d\n\n", sol);

#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is total malloc counts: " << total_malloc << endl;
#endif

  return 0;
}
