/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: SSE4 block without reexpansion of knapsack       */
/**********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "getoptions.h"
#include <string.h>
#include <algorithm>
#include <iostream>
#include <fstream>

#include "harness.h"
#include "block-sse.h"
#include "simd.h"

/* Optional block profiler, compiled in only with BLOCK_PROFILE. It is fed by
 * profiler.record()/record_single() in the traversal routines and dumped by
 * profiler.output() in app_main.
 * NOTE(review): the meaning of the constructor argument 8 is defined in
 * blockprofiler.h -- presumably the SIMD width; confirm. */
#ifdef BLOCK_PROFILE
#include "blockprofiler.h"
BlockProfiler profiler(8);
#endif

/* Optional counter, compiled in only with TRACK_TRAVERSALS. It is incremented
 * once per call of the scalar knapsack() routine and printed by app_main. */
#ifdef TRACK_TRAVERSALS
uint64_t work = 0;
#endif

//int _expandDepth = 0;
/* Maximum software block size. Defaults to D_MAX_BLOCK_SIZE and is replaced
 * by 2^b in app_main when the -b option is given. knapsack_expand_bf keeps
 * expanding breadth-first while the next block holds at most _expandSize / 2
 * entries. */
long long _expandSize = D_MAX_BLOCK_SIZE;
/* Root block allocated by app_main; knapsack_expand_bf deletes it when it
 * runs at depth 0. */
_Block * g_initial_block = NULL;

using namespace std;

/* capacity of the item array that app_main allocates on its stack */
#define MAX_ITEMS 256

/* every item in the knapsack has a weight and a value */
struct item {
  int value;   /* gain obtained by packing the item */
  int weight;  /* capacity consumed by packing the item */
};

/* Initialised to MY_MIN but not referenced anywhere in the visible part of
 * this file.
 * NOTE(review): looks like a leftover from the original Cilk knapsack
 * (branch-and-bound pruning) -- confirm no other file uses it before
 * removing. */
int best_so_far = MY_MIN;

/*
 * qsort comparator ordering items by decreasing value/weight ratio.
 * Returns -1 when a has the larger ratio, 1 when b has the larger ratio
 * and 0 otherwise.
 */
int compare(struct item *a, struct item *b)
{
  const double density_a = (double) a->value / a->weight;
  const double density_b = (double) b->value / b->weight;
  const double diff = density_a - density_b;

  /* denser items sort first, hence the inverted sign */
  return (diff > 0) ? -1 : ((diff < 0) ? 1 : 0);
}

/*
 * Read a knapsack instance from `filename` and sort its items.
 *
 * File format: #items capacity value1 weight1 value2 weight2 ...
 *
 *   filename  path of the input file (NULL is treated as an empty path)
 *   items     caller-provided array that receives the items
 *   capacity  receives the knapsack capacity
 *   n         receives the number of items
 *
 * On success the first *n entries of `items` are sorted by decreasing
 * value/weight ratio and 0 is returned. Returns -1 (after printing a
 * message to stderr) if the file cannot be opened, is truncated or
 * malformed, or declares an item count that is negative or larger than
 * the item array used by this file.
 */
int read_input(const char *filename, struct item *items, int *capacity, int *n)
{
  int i;
  FILE *f;

  if (filename == NULL)
    filename = "\0";
  f = fopen(filename, "r");
  if (f == NULL) {
    fprintf(stderr, "open_input(\"%s\") failed\n", filename);
    return -1;
  }

  /* format of the input: #items capacity value1 weight1 ... */
  if (fscanf(f, "%d", n) != 1 || fscanf(f, "%d", capacity) != 1) {
    fprintf(stderr, "read_input(\"%s\"): missing item count or capacity\n", filename);
    fclose(f);
    return -1;
  }

  /* a negative count would be converted to a huge size_t by qsort below */
  if (*n < 0) {
    fprintf(stderr, "read_input(\"%s\"): invalid item count %d\n", filename, *n);
    fclose(f);
    return -1;
  }
#ifdef MAX_ITEMS
  /* the caller's array holds MAX_ITEMS entries; refuse to write past it */
  if (*n > MAX_ITEMS) {
    fprintf(stderr, "read_input(\"%s\"): %d items exceed the limit of %d\n",
            filename, *n, MAX_ITEMS);
    fclose(f);
    return -1;
  }
#endif

  for (i = 0; i < *n; ++i) {
    if (fscanf(f, "%d %d", &items[i].value, &items[i].weight) != 2) {
      fprintf(stderr, "read_input(\"%s\"): cannot read item %d\n", filename, i);
      fclose(f);
      return -1;
    }
  }

  fclose(f);

  /* sort the items on decreasing order of value/weight */
  /* compare() takes struct item pointers, whence the cast to qsort's comparator type */
  qsort(items, *n, sizeof(struct item),
        (int (*)(const void *, const void *)) compare);

  return 0;
}

/*
 * Scalar reference traversal of the knapsack decision tree.
 *
 *   e        first of the n items still to be decided
 *   c        remaining capacity
 *   n        number of items still to be decided
 *   v        value accumulated so far
 *   max_ret  running maximum, updated in place
 *
 * Written in the pseudo tail recursive form matching our language spec:
 * the result is accumulated through max_ret instead of being returned.
 */
void knapsack(struct item *e, int c, int n, int v, int* max_ret)
{
#ifdef TRACK_TRAVERSALS
  ++work;
#endif
#ifdef BLOCK_PROFILE
  profiler.record_single();
#endif

  /* over-full knapsack: only MY_MIN is offered to the running maximum */
  const bool over_capacity = (c < 0);
  if (over_capacity) {
    *max_ret = max(*max_ret, MY_MIN);
    return;
  }

  /* nothing left to decide or no room left: v is a feasible value */
  const bool is_leaf = (c == 0) || (n == 0);
  if (is_leaf) {
    *max_ret = max(*max_ret, v);
    return;
  }

  struct item *rest = e + 1;
  const int remaining = n - 1;

  /* branch 1: leave the current item out */
  knapsack(rest, c, remaining, v, max_ret);

  /* branch 2: put the current item in */
  knapsack(rest, c - e->weight, remaining, v + e->value, max_ret);
}

/* Forward declaration: knapsack0 calls knapsack1, which is defined after it. */
void knapsack1(struct item * e, int n, _BlockStack* _stack, int* max_ret, int _depth);

/*
 * Scalar processing of one left child: entry _bi of the c0/v0 arrays of
 * _block.
 *
 *   e            item to be decided at this level
 *   n            number of items still to be decided
 *   _block       block being consumed
 *   _nextBlock0  block receiving the two children of a non-leaf entry
 *   _bi          index of the entry inside _block
 *   max_ret      running maximum, updated in place for leaf entries
 */
inline void process_point0(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _bi, int* max_ret){
  const int cap = _block->c0[_bi];
  const int val = _block->v0[_bi];

  /* over-full knapsack: only MY_MIN is offered to the running maximum */
  if (cap < 0) {
    *max_ret = max(*max_ret, MY_MIN);
    return;
  }

  /* no room or no items left: val is a feasible value */
  const bool is_leaf = (cap == 0) || (n == 0);
  if (is_leaf) {
    *max_ret = max(*max_ret, val);
    return;
  }

  /* queue both children: item e left out, then item e taken */
  _nextBlock0->add(cap, val, cap - e->weight, val + e->value);
}

/*
 * Scalar processing of one right child: entry _bi of the c1/v1 arrays of
 * _block.
 *
 *   e            item to be decided at this level
 *   n            number of items still to be decided
 *   _block       block being consumed
 *   _nextBlock0  block receiving the two children of a non-leaf entry
 *   _bi          index of the entry inside _block
 *   max_ret      running maximum, updated in place for leaf entries
 */
inline void process_point1(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _bi, int* max_ret){
  const int cap = _block->c1[_bi];
  const int val = _block->v1[_bi];

  /* over-full knapsack: only MY_MIN is offered to the running maximum */
  if (cap < 0) {
    *max_ret = max(*max_ret, MY_MIN);
    return;
  }

  /* no room or no items left: val is a feasible value */
  const bool is_leaf = (cap == 0) || (n == 0);
  if (is_leaf) {
    *max_ret = max(*max_ret, val);
    return;
  }

  /* queue both children: item e left out, then item e taken */
  _nextBlock0->add(cap, val, cap - e->weight, val + e->value);
}

/*
 * Reduce a vector of node_t lanes to a single value: the largest lane,
 * or MY_MIN if no lane exceeds it.
 */
node_t horizontal_max_Vec(__m128i x) {
  /* spill the register to an aligned scratch array so lanes can be indexed */
  node_t lanes[MY_SIMD_WIDTH] __attribute__((aligned(16))) = {0};
  _mm_store_si128((__m128i *) lanes, x);

  node_t best = MY_MIN;
  for (const node_t *lane = lanes; lane != lanes + MY_SIMD_WIDTH; ++lane) {
    if (*lane > best) best = *lane;
  }

  return best;
}

/*simd processing for left children, we store n in short in the block*/
inline void process_simd0_opt(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _si, __m128i & vec_max_ret){
  __m128i vec_c = _mm_load_si128((__m128i*)&_block->c0[_si]);
  __m128i vec_v = _mm_load_si128((__m128i*)&_block->v0[_si]);
  __m128i vec_n = _mm_set1_epi16(n);
  __m128i vec_0 = _mm_setzero_si128();

  /* base case I: full knapsack or no items */
  __m128i cond_c_lt_0 = _mm_cmplt_epi16(vec_c, vec_0);
  __m128i vec_max_max_ret_MY_MIN = _mm_max_epi16(vec_max_ret, _mm_set1_epi16(MY_MIN));
  vec_max_ret = _mm_blendv_epi8(vec_max_ret, vec_max_max_ret_MY_MIN, cond_c_lt_0);

  /* base case II: full knapsack or no items */
  __m128i cond_c_eq_0 = _mm_cmpeq_epi16(vec_c, vec_0);
  __m128i cond_n_eq_0 = _mm_cmpeq_epi16(vec_n, vec_0);
  __m128i cond_c_gt_0 = _mm_cmpgt_epi16(vec_c, vec_0);
  cond_n_eq_0 = _mm_and_si128(cond_n_eq_0, cond_c_gt_0);
  __m128i cond_c_or_n_eq_0 = _mm_or_si128(cond_c_eq_0, cond_n_eq_0);
  __m128i vec_max_max_ret_v = _mm_max_epi16(vec_max_ret, vec_v);
  vec_max_ret = _mm_blendv_epi8(vec_max_ret, vec_max_max_ret_v, cond_c_or_n_eq_0);

  __m128i cond_is_leaf = _mm_or_si128(cond_c_lt_0, cond_c_or_n_eq_0);
  unsigned short mask_16 = _mm_movemask_epi8(cond_is_leaf);

  __attribute__((aligned(16))) unsigned char tmp[16];
  unsigned index = 0;
  //do first 8
  *((__int64*)tmp) = g_shuffletable[mask_16 & 0x000000FF];
  index += g_advanceNextPtrCounts[mask_16 & 0x000000FF];
  // now second 8
  *((__int64*)&tmp[index]) = 0x0808080808080808 + g_shuffletable[(mask_16 & 0x0000FF00) >> 8];
  index += g_advanceNextPtrCounts[(mask_16 & 0x0000FF00) >> 8];
  // fill rest with 0xFF
  memset(&tmp[index], 0xFF, 16 - index);

  __m128i vec_shuffleTable =  _mm_load_si128((const __m128i *) tmp);
  vec_c = _mm_shuffle_epi8(vec_c, vec_shuffleTable);
  vec_v = _mm_shuffle_epi8(vec_v, vec_shuffleTable);


  _mm_storeu_si128((__m128i*)&_nextBlock0->c0[_nextBlock0->size], vec_c);
  _mm_storeu_si128((__m128i*)&_nextBlock0->v0[_nextBlock0->size], vec_v);

  vec_c = _mm_sub_epi16(vec_c, _mm_set1_epi16(e->weight));
  vec_v = _mm_add_epi16(vec_v, _mm_set1_epi16(e->value));
  _mm_storeu_si128((__m128i*)&_nextBlock0->c1[_nextBlock0->size], vec_c);
  _mm_storeu_si128((__m128i*)&_nextBlock0->v1[_nextBlock0->size], vec_v);

  assert(index % 2 == 0);
  _nextBlock0->size += index / 2;
}

/*simd processing for right children, we store n in short in the block*/
inline void process_simd1_opt(struct item * e, int n, _Block* _block, _Block* _nextBlock0, int _si, __m128i & vec_max_ret){
  __m128i vec_c = _mm_load_si128((__m128i*)&_block->c1[_si]);
  __m128i vec_v = _mm_load_si128((__m128i*)&_block->v1[_si]);
  __m128i vec_n = _mm_set1_epi16(n);
  __m128i vec_0 = _mm_setzero_si128();

  /* base case I: full knapsack or no items */
  __m128i cond_c_lt_0 = _mm_cmplt_epi16(vec_c, vec_0);
  __m128i vec_max_max_ret_MY_MIN = _mm_max_epi16(vec_max_ret, _mm_set1_epi16(MY_MIN));
  vec_max_ret = _mm_blendv_epi8(vec_max_ret, vec_max_max_ret_MY_MIN, cond_c_lt_0);

  /* base case II: full knapsack or no items */
  __m128i cond_c_eq_0 = _mm_cmpeq_epi16(vec_c, vec_0);
  __m128i cond_n_eq_0 = _mm_cmpeq_epi16(vec_n, vec_0);
  __m128i cond_c_gt_0 = _mm_cmpgt_epi16(vec_c, vec_0);
  cond_n_eq_0 = _mm_and_si128(cond_n_eq_0, cond_c_gt_0);
  __m128i cond_c_or_n_eq_0 = _mm_or_si128(cond_c_eq_0, cond_n_eq_0);
  __m128i vec_max_max_ret_v = _mm_max_epi16(vec_max_ret, vec_v);
  vec_max_ret = _mm_blendv_epi8(vec_max_ret, vec_max_max_ret_v, cond_c_or_n_eq_0);

  __m128i cond_is_leaf = _mm_or_si128(cond_c_lt_0, cond_c_or_n_eq_0);
  unsigned short mask_16 = _mm_movemask_epi8(cond_is_leaf);

  __attribute__((aligned(16))) unsigned char tmp[16];
  unsigned index = 0;
  //do first 8
  *((__int64*)tmp) = g_shuffletable[mask_16 & 0x000000FF];
  index += g_advanceNextPtrCounts[mask_16 & 0x000000FF];
  // now second 8
  *((__int64*)&tmp[index]) = 0x0808080808080808 + g_shuffletable[(mask_16 & 0x0000FF00) >> 8];
  index += g_advanceNextPtrCounts[(mask_16 & 0x0000FF00) >> 8];
  // fill rest with 0xFF
  memset(&tmp[index], 0xFF, 16 - index);

  __m128i vec_shuffleTable =  _mm_load_si128((const __m128i *) tmp);
  vec_c = _mm_shuffle_epi8(vec_c, vec_shuffleTable);
  vec_v = _mm_shuffle_epi8(vec_v, vec_shuffleTable);


  _mm_storeu_si128((__m128i*)&_nextBlock0->c0[_nextBlock0->size], vec_c);
  _mm_storeu_si128((__m128i*)&_nextBlock0->v0[_nextBlock0->size], vec_v);

  vec_c = _mm_sub_epi16(vec_c, _mm_set1_epi16(e->weight));
  vec_v = _mm_add_epi16(vec_v, _mm_set1_epi16(e->value));
  _mm_storeu_si128((__m128i*)&_nextBlock0->c1[_nextBlock0->size], vec_c);
  _mm_storeu_si128((__m128i*)&_nextBlock0->v1[_nextBlock0->size], vec_v);

  assert(index % 2 == 0);
  _nextBlock0->size += index / 2;
}


/*Depth First execution of left children to limit the memory consumption*/
void knapsack0(struct item * e, int n, _BlockStack* _stack, int* max_ret, int _depth){
  class _BlockSet* _set = _stack->get(_depth);
  class _Block* _block = _set->block;
  class _Block* _nextBlock0 = &_set-> nextBlock0;
  _nextBlock0->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(_block->size, _depth);
#endif

  int _si = 0;
  __m128i vec_max_ret = _mm_set1_epi16(*max_ret);
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
    process_simd0_opt(e, n, _block, _nextBlock0, _si, vec_max_ret);
  }
  *max_ret = horizontal_max_Vec(vec_max_ret);

  //For the rest;
  for (int _bi = _si; _bi < _block -> _Block::size; ++_bi) {
    process_point0(e, n, _block, _nextBlock0, _bi, max_ret);
  }

  if (_nextBlock0 -> _Block::size > 0) {
    _stack ->  get (_depth + 1) -> _BlockSet::block = _nextBlock0;
    knapsack0(e + 1, n - 1, _stack, max_ret, _depth + 1);
    knapsack1(e + 1, n - 1, _stack, max_ret, _depth + 1);
  }
}

/*Depth First execution of right children to limit the memory consumption*/
void knapsack1(struct item * e, int n, _BlockStack* _stack, int* max_ret, int _depth){
  class _BlockSet* _set = _stack->get(_depth);
  class _Block* _block = _set->block;
  class _Block* _nextBlock0 = &_set-> nextBlock0;
  _nextBlock0->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(_block->size, _depth);
#endif

  int _si = 0;
  __m128i vec_max_ret = _mm_set1_epi16(*max_ret);
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
    process_simd1_opt(e, n, _block, _nextBlock0, _si, vec_max_ret);
  }
  *max_ret = horizontal_max_Vec(vec_max_ret);

  //For the rest;
  for (int _bi = _si; _bi < _block -> _Block::size; ++_bi) {
    process_point1(e, n, _block, _nextBlock0, _bi, max_ret);
  }

  if (_nextBlock0 -> _Block::size > 0) {
    _stack ->  get (_depth + 1) -> _BlockSet::block = _nextBlock0;
    knapsack0(e + 1, n - 1, _stack, max_ret, _depth + 1);
    knapsack1(e + 1, n - 1, _stack, max_ret, _depth + 1);
  }
}

/*Breadth First execution to expand the number of tasks in software block*/
void knapsack_expand_bf(struct item * e, int n, _BlockStack* _stack, int* max_ret, int* _depth){
  class _BlockSet* _set = _stack->get(*_depth);
  class _Block* _block = _set->block;
  class _Block* _nextBlock0 = &_set-> nextBlock0;
  _nextBlock0->recycle();
#ifdef BLOCK_PROFILE
  profiler.record(_block->size,*_depth);
#endif

  //Add Left
  int _si = 0;
  __m128i vec_max_ret = _mm_set1_epi16(*max_ret);
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
    process_simd0_opt(e, n, _block, _nextBlock0, _si, vec_max_ret);
  }
  *max_ret = horizontal_max_Vec(vec_max_ret);

  //For the rest;
  for (int _bi = _si; _bi < _block -> _Block::size; ++_bi) {
    process_point0(e, n, _block, _nextBlock0, _bi, max_ret);
  }

  //Add Right
  _si = 0;
  vec_max_ret = _mm_set1_epi16(*max_ret);
  for (; _si < (_block->size - MY_SIMD_WIDTH + 1); _si += MY_SIMD_WIDTH) {
    process_simd1_opt(e, n, _block, _nextBlock0, _si, vec_max_ret);
  }
  *max_ret = horizontal_max_Vec(vec_max_ret);

  //For the rest;
  for (int _bi = _si; _bi < _block -> _Block::size; ++_bi) {
    process_point1(e, n, _block, _nextBlock0, _bi, max_ret);
  }


  //Free old stack space
  if (!*_depth){
    delete g_initial_block;
  } else
  {
    _stack->release(*_depth - 1);
  }

  int _nextblock0_size = _nextBlock0 -> _Block::size;
  *_depth += 1;
  if (_nextblock0_size > 0 && _nextblock0_size <= _expandSize / 2) {
    _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
    knapsack_expand_bf(e + 1, n - 1, _stack, max_ret, _depth);
  } else { //Reach the buffer size, or finish all evaluation
    _stack ->  get (*_depth) -> _BlockSet::block = _nextBlock0;
  }
}

/*
 * Print the command-line help to stderr, including the -b option that
 * app_main accepts to set the maximum block size to 2^k.
 * Always returns -1 so callers can write `return usage();`.
 */
int usage(void)
{
  fprintf(stderr, "\nUsage: knapsack [<cilk-options>] [-f filename] [-b k] [-benchmark] [-h]\n\n");
  fprintf(stderr, "The 0-1-Knapsack is a standard combinatorial optimization problem: ``A\n");
  fprintf(stderr, "thief robbing a store finds n items; the ith item is worth v_i dollars\n");
  fprintf(stderr, "and weighs w_i pounds, where v_i and w_i are integers. He wants to take\n");
  fprintf(stderr, "as valuable a load as possible, but he can carry at most W pounds in\n");
  fprintf(stderr, "his knapsack for some integer W. What items should he take?''\n\n");
  fprintf(stderr, "  -b k   set the maximum block buffer size to 2^k\n\n");
  return -1;
}

/* Command-line option names and their argument kinds, as two parallel,
 * 0-terminated arrays handed to get_options() in app_main. In that call the
 * options are bound, in order, to: filename (-f), benchmark (-benchmark),
 * help (-h) and buf (-b). */
char *specifiers[] = {"-f", "-benchmark", "-h", "-b", 0};
int opt_types[] = {STRINGARG, BENCHMARK, BOOLARG, INTARG ,0};

/*
 * Benchmark entrance called by harness.
 *
 * Parses the options (-f input file, -benchmark preset input, -h help,
 * -b k to use a maximum block size of 2^k), reads the instance, runs the
 * blocked knapsack search (breadth-first expansion followed by depth-first
 * traversal of the left and right children) and prints the best value.
 *
 * Returns usage()'s value when help is requested, 1 when the input cannot
 * be read and 0 otherwise. Trivial instances (no items, or a capacity that
 * is zero or negative) skip the search but still release the block
 * storage, stop the timer and report a best value of 0.
 */
int app_main(int argc, char *argv[])
{
  struct item items[MAX_ITEMS];	/* array of items */
  int n, capacity, sol, benchmark, help, buf;
  char filename[100];
  buf = 0;
  sol = 0;
  /* defined defaults in case get_options leaves absent options untouched */
  benchmark = 0;
  help = 0;

  /* standard benchmark options */
  strcpy(filename, "../inputs/knapsack-example2.input");

  get_options(argc, argv, specifiers, opt_types, filename, &benchmark, &help, &buf);
  if (buf != 0) {
    /* _expandSize = 2^buf, computed exactly in integers. A negative exponent
       truncates to 0 as before; the exponent is capped at 62 so the result
       always fits in a long long. */
    if (buf < 0)
      _expandSize = 0;
    else
      _expandSize = 1LL << (buf < 62 ? buf : 62);
  }

  if (help)
    return usage();


  if (benchmark) {
    switch (benchmark) {
      case 1:		/* short benchmark options -- a little work */
        strcpy(filename, "../inputs/knapsack-example1.input");
        break;
      case 2:		/* standard benchmark options */
        strcpy(filename, "../inputs/knapsack-example2.input");
        break;
      case 3:		/* long benchmark options -- a lot of work */
        strcpy(filename, "../inputs/knapsack-example3.input");
        break;
    }
  }
  if (read_input(filename, items, &capacity, &n))
    return 1;

#ifdef PARALLELISM_PROFILE
  parallelismProfiler = new ParallelismProfiler;
#endif

  Harness::start_timing();
  //_expandDepth = Harness::get_splice_depth();

  //Initialize software block stack
  cout << "Set fixed max block buffer size, _expandSize: " << _expandSize << endl;
  _Block::max_block = _expandSize;
  Harness::set_block_size(_expandSize);
  class _BlockStack * _stack = new _BlockStack;
  class _Block * _block = new _Block;
  g_initial_block = _block;

  /* base case: full knapsack or no items -- nothing to search, sol stays 0 */
  const bool trivial = (capacity <= 0) || (n <= 0);

  /* knapsack_expand_bf deletes g_initial_block on its first level; remember
     whether that happened so the block is freed exactly once */
  bool initial_block_freed = false;

  if (!trivial) {
    /* root's children: item 0 left out (c0/v0) and item 0 taken (c1/v1) */
    _block->add(capacity, 0, capacity - items[0].weight, items[0].value);

    int _depth = 0;
    _stack->get (_depth) -> block = _block;

    //Start to execute blocked knapsack
    if (_expandSize >= 2) {
      knapsack_expand_bf(items + 1, n - 1, _stack, &sol, &_depth);
      initial_block_freed = true;
    }
    int df_block_size = _stack->get(_depth)->block->size;
    cout << "This is the max block buffer size for dfs: " << df_block_size << endl;

    /* _depth levels were consumed breadth first; continue depth first from there */
    if (df_block_size){
      knapsack0(items + _depth + 1, n - _depth - 1, _stack, &sol, _depth);
      knapsack1(items + _depth + 1, n - _depth - 1, _stack, &sol, _depth);
    }
  }

  delete _stack;
  if (!initial_block_freed) delete _block;
  g_initial_block = NULL;	/* the root block is gone either way */

  Harness::stop_timing();
#ifdef BLOCK_PROFILE
  profiler.output();
#endif
#ifdef TRACK_TRAVERSALS
  cout << "work: " << work << endl;
#endif

  printf("\nExample: knapsack\n");
  printf("options: problem-file = %s\n\n", filename);
  printf("Best value is %d\n\n", sol);

#ifdef PROFILE_SPACE_USE
  cout << "This is max space use (Bytes): " << m_space << endl;
  cout << "This is total malloc counts: " << total_malloc << endl;
#endif

  return 0;
}
