/**********************************************************************************************/
/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
/**********************************************************************************************/
/*
   ---------------------------------------------------------------------------
   Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.

   LICENSE TERMS

   The free distribution and use of this software in both source and binary
   form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
   notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
   notice, this list of conditions and the following disclaimer
   in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
   built using this software without specific written permission.

   ALTERNATIVELY, provided that this notice is retained in full, this product
   may be distributed under the terms of the GNU General Public License (GPL),
   in which case the provisions of the GPL apply INSTEAD OF those given above.

   DISCLAIMER

   This software is provided 'as is' with no explicit or implied warranties
   in respect of its properties, including, but not limited to, correctness
   and/or fitness for purpose.
   ---------------------------------------------------------------------------
   Issue Date: 01/08/2005

   This is a byte oriented version of SHA1 that operates on arrays of bytes
   stored in memory.
   */
/**********************************************************/
/* This code is for PLDI-15 Artifact Evaluation only      */ 
/* and will be released with further copyright information*/ 
/* File: SSE version of sha1 derived from above benchmark */
/**********************************************************/

#include <string.h>     /* for memcpy() etc.        */
#include <stdio.h>

#include "brg_sha1-sse.h"
#include "brg_endian.h"
#include "simd.h"

#if defined(__cplusplus)
extern "C"
{
#endif

  /** BEGIN: UTS RNG Harness **/

  /* Initialise an RNG state from an integer seed.                          */
  /* A 20-byte buffer is built with bytes 0..15 zero and bytes 16..19       */
  /* holding `seed` most-significant byte first; the SHA-1 digest of that   */
  /* buffer is written to `newstate` by sha1_end().                         */
  void rng_init(RNG_state *newstate, int seed)
  {
    struct sha1_context ctx;
    struct state_t gen;
    int pos;

    for (pos = 0; pos < 20; ++pos) {
      if (pos < 16)
        gen.state[pos] = 0;                 /* zero padding */
      else                                  /* pos 16 -> shift 24 ... pos 19 -> shift 0 */
        gen.state[pos] = (u_int8_t) (0xFF & (seed >> (8 * (19 - pos))));
    }

    sha1_begin(&ctx);
    sha1_hash(gen.state, 20, &ctx);
    sha1_end(newstate, &ctx);
  }

  /* Derive a child RNG state: newstate receives the SHA-1 digest of the    */
  /* 20 bytes at `mystate` followed by `spawnnumber` encoded as 4 bytes,    */
  /* most-significant byte first.                                           */
  void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber)
  {
    struct sha1_context ctx;
    u_int8_t  bytes[4];
    int k;

    /* k = 0 takes bits 31..24, k = 3 takes bits 7..0 */
    for (k = 0; k < 4; ++k)
      bytes[k] = (u_int8_t) (0xFF & (spawnnumber >> (8 * (3 - k))));

    sha1_begin(&ctx);
    sha1_hash(mystate, 20, &ctx);
    sha1_hash(bytes, 4, &ctx);
    sha1_end(newstate, &ctx);
  }

  /* Return the random value carried by a state without advancing it.       */
  /* Bytes 16..19 of `mystate` are combined most-significant byte first     */
  /* into a 32-bit word, which is then masked with POS_MASK before being    */
  /* returned as an int.                                                    */
  /* Each byte is widened to uint32 BEFORE shifting: shifting the           */
  /* int-promoted byte left by 24 would move a set top bit into the sign    */
  /* bit, which is undefined behaviour for signed operands.                 */
  /* NOTE(review): assumes RNG_state is an unsigned 8-bit type - confirm    */
  /* against its typedef in the header.                                     */
  int rng_rand(RNG_state *mystate)
  {
    uint32 b = ((uint32) mystate[16] << 24) | ((uint32) mystate[17] << 16)
             | ((uint32) mystate[18] <<  8) |  (uint32) mystate[19];

    return (int) (b & POS_MASK);
  }

  // Map a non-negative int onto [0,1) by dividing it by 2^31.
  // A negative n is reported on stdout and mapped to 0.0.
  double rng_toProb(int n)
  {
    if (n >= 0)
      return ((double) n)/2147483648.0;

    printf("*** toProb: rand n = %d out of range\n",n);
    return 0.0;
  }

  /* Advance the state in place and return the next random value.           */
  /* The 20 bytes at `mystate` are hashed with SHA-1 and the digest is      */
  /* written back over `mystate`; bytes 16..19 of the new state are then    */
  /* combined most-significant byte first and masked with POS_MASK.         */
  /* Each byte is widened to uint32 BEFORE shifting so that a set top bit   */
  /* is never shifted into the sign bit of a promoted int (undefined        */
  /* behaviour for signed operands).                                        */
  /* NOTE(review): assumes RNG_state is an unsigned 8-bit type - confirm    */
  /* against its typedef in the header.                                     */
  int rng_nextrand(RNG_state *mystate){
    struct sha1_context ctx;
    uint32 b;

    sha1_begin(&ctx);
    sha1_hash(mystate, 20, &ctx);
    sha1_end(mystate, &ctx);

    b = ((uint32) mystate[16] << 24) | ((uint32) mystate[17] << 16)
      | ((uint32) mystate[18] <<  8) |  (uint32) mystate[19];

    return (int) (b & POS_MASK);
  }

  /* Debug helper: format the first two state bytes as four upper-case hex  */
  /* digits followed by "..." into the caller's buffer and return it.       */
  /* The output needs 8 bytes including the terminating NUL; the caller     */
  /* must supply at least that much room.                                   */
  char * rng_showstate(RNG_state *state, char *s){
    char *out = s;

    sprintf(out, "%.2X%.2X...", state[0], state[1]);
    return out;
  }

  /* Report the random number generator type and its state size through     */
  /* bots_message().                                                        */
  /* sizeof yields a size_t, which is not guaranteed to be unsigned long;   */
  /* the explicit cast makes the argument match the %lu conversion.         */
  void rng_showtype( void ) {
    bots_message("SHA-1 (state size = %luB)\n",
                 (unsigned long) sizeof(struct state_t));
  }

  /** END: UTS RNG Harness **/

#if defined( _MSC_VER ) && ( _MSC_VER > 800 )
#pragma intrinsic(memcpy)
#endif

#if 0 && defined(_MSC_VER)
#define rotl32  _lrotl
#define rotr32  _lrotr
#else
/* Rotate the 32-bit unsigned value x left / right by n bits (0 < n < 32). */
/* Both x and n are fully parenthesised so that expression arguments such  */
/* as rotl32(v, a + b) expand with the intended precedence.                */
#define rotl32(x,n)   (((x) << (n)) | ((x) >> (32 - (n))))
#define rotr32(x,n)   (((x) >> (n)) | ((x) << (32 - (n))))
#endif

/* Reverse the byte order of a 32-bit word using two rotations and masks;  */
/* only defined here when no bswap_32 is already available.                */
#if !defined(bswap_32)
#define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00))
#endif

/* Decide whether 32-bit words need byte swapping: SWAP_BYTES is defined   */
/* exactly when PLATFORM_BYTE_ORDER equals IS_LITTLE_ENDIAN (both are      */
/* presumably provided by brg_endian.h - confirm).                         */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define SWAP_BYTES
#else
#undef  SWAP_BYTES
#endif

/* bsw_32(p,n): byte-swap in place the first n 32-bit words at p, walking  */
/* from index n-1 down to 0. Expands to nothing when SWAP_BYTES is not     */
/* defined. The expansion is a brace block, so it must be used as a        */
/* statement.                                                              */
#if defined(SWAP_BYTES)
#define bsw_32(p,n) \
  { int _i = (n); while(_i--) ((uint_32t*)p)[_i] = bswap_32(((uint_32t*)p)[_i]); }
#else
#define bsw_32(p,n)
#endif

/* Mask used to reduce a byte count to an offset inside one block          */
/* (assumes SHA1_BLOCK_SIZE is a power of two - TODO confirm in header).   */
#define SHA1_MASK   (SHA1_BLOCK_SIZE - 1)

/* Bitwise round functions used by one_cycle(). The first branch is        */
/* disabled with #if 0; the second branch is the one that is compiled and  */
/* uses forms with fewer operations for ch and maj.                        */
#if 0

#define ch(x,y,z)       (((x) & (y)) ^ (~(x) & (z)))
#define parity(x,y,z)   ((x) ^ (y) ^ (z))
#define maj(x,y,z)      (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

#else   /* Discovered by Rich Schroeppel and Colin Plumb   */

/* ch:     per bit, selects y where x is 1 and z where x is 0              */
/* parity: per bit, XOR of the three inputs                                */
/* maj:    per bit, the value held by at least two of the three inputs     */
#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
#define parity(x,y,z)   ((x) ^ (y) ^ (z))
#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))

#endif

  /* Compile 64 bytes of hash data into SHA1 context. Note    */
  /* that this routine assumes that the byte order in the     */
  /* ctx->wbuf[] at this point is in such an order that low   */
  /* address bytes in the ORIGINAL byte stream will go in     */
  /* this buffer to the high end of 32-bit words on BOTH big  */
  /* and little endian systems                                */

/* q(v,n) names the n-th working variable: element v[n] when ARRAY is      */
/* defined, otherwise the token-pasted scalar vn (v0, v1, ...).            */
#ifdef ARRAY
#define q(v,n)  v[n]
#else
#define q(v,n)  v##n
#endif

/* one_cycle: a single round. Adds rotr32(a,27) (i.e. a rotated left by 5), */
/* f(b,c,d), the constant k and the schedule word h into e, then rotates b  */
/* right by 2 (i.e. left by 30). a..e are indices passed to q(), so the     */
/* roles of the working variables are permuted by the caller instead of     */
/* moving values between them.                                              */
/* Expands to two statements with no trailing semicolon; use only at        */
/* statement level, never as the unbraced body of an if/else/loop.          */
#define one_cycle(v,a,b,c,d,e,f,k,h)            \
  q(v,e) += rotr32(q(v,a),27) +               \
  f(q(v,b),q(v,c),q(v,d)) + k + h;  \
  q(v,b)  = rotr32(q(v,b), 2)

/* five_cycle: five consecutive rounds i..i+4 using the hf() macro that is  */
/* in effect at the point of expansion; the index arguments rotate so that  */
/* after five rounds every variable is back in its original role.           */
#define five_cycle(v,f,k,i)                 \
  one_cycle(v, 0,1,2,3,4, f,k,hf(i  ));   \
  one_cycle(v, 4,0,1,2,3, f,k,hf(i+1));   \
  one_cycle(v, 3,4,0,1,2, f,k,hf(i+2));   \
  one_cycle(v, 2,3,4,0,1, f,k,hf(i+3));   \
  one_cycle(v, 1,2,3,4,0, f,k,hf(i+4))

  /* Process the block currently held in ctx->wbuf (16 words) and add the   */
  /* result into ctx->hash[0..4]. The 80 rounds are fully unrolled through  */
  /* the one_cycle/five_cycle macros. ctx->wbuf is overwritten in place     */
  /* from round 16 onwards, so its contents are not preserved.              */
  VOID_RETURN sha1_compile(sha1_ctx ctx[1])
  {   uint_32t    *w = ctx->wbuf;

    /* copy the current hash into the five working variables */
#ifdef ARRAY
    uint_32t    v[5];
    memcpy(v, ctx->hash, 5 * sizeof(uint_32t));
#else
    uint_32t    v0, v1, v2, v3, v4;
    v0 = ctx->hash[0]; v1 = ctx->hash[1];
    v2 = ctx->hash[2]; v3 = ctx->hash[3];
    v4 = ctx->hash[4];
#endif

    /* rounds 0..15 read the buffered words directly */
#define hf(i)   w[i]

    five_cycle(v, ch, 0x5a827999,  0);
    five_cycle(v, ch, 0x5a827999,  5);
    five_cycle(v, ch, 0x5a827999, 10);
    /* NOTE(review): the trailing backslash on the next line only splices   */
    /* it with the following blank line; it has no effect on the code.      */
    one_cycle(v,0,1,2,3,4, ch, 0x5a827999, hf(15)); \

    /* rounds 16..79: hf(i) computes the next schedule word from four       */
    /* earlier ones, rotated left by 1, and stores it back into the 16-word */
    /* buffer at index (i & 15), which is used as a circular window         */
#undef  hf
#define hf(i) (w[(i) & 15] = rotl32(                    \
                                    w[((i) + 13) & 15] ^ w[((i) + 8) & 15] \
                                    ^ w[((i) +  2) & 15] ^ w[(i) & 15], 1))

        one_cycle(v,4,0,1,2,3, ch, 0x5a827999, hf(16));
    one_cycle(v,3,4,0,1,2, ch, 0x5a827999, hf(17));
    one_cycle(v,2,3,4,0,1, ch, 0x5a827999, hf(18));
    one_cycle(v,1,2,3,4,0, ch, 0x5a827999, hf(19));

    /* rounds 20..39 */
    five_cycle(v, parity, 0x6ed9eba1,  20);
    five_cycle(v, parity, 0x6ed9eba1,  25);
    five_cycle(v, parity, 0x6ed9eba1,  30);
    five_cycle(v, parity, 0x6ed9eba1,  35);

    /* rounds 40..59 */
    five_cycle(v, maj, 0x8f1bbcdc,  40);
    five_cycle(v, maj, 0x8f1bbcdc,  45);
    five_cycle(v, maj, 0x8f1bbcdc,  50);
    five_cycle(v, maj, 0x8f1bbcdc,  55);

    /* rounds 60..79 */
    five_cycle(v, parity, 0xca62c1d6,  60);
    five_cycle(v, parity, 0xca62c1d6,  65);
    five_cycle(v, parity, 0xca62c1d6,  70);
    five_cycle(v, parity, 0xca62c1d6,  75);

    /* fold the working variables back into the running hash */
#ifdef ARRAY
    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
    ctx->hash[4] += v[4];
#else
    ctx->hash[0] += v0; ctx->hash[1] += v1;
    ctx->hash[2] += v2; ctx->hash[3] += v3;
    ctx->hash[4] += v4;
#endif
  }

  /* Reset a context for a new message: both byte-count words are cleared   */
  /* and the five hash words are set to their initial constants.            */
  VOID_RETURN sha1_begin(sha1_ctx ctx[1])
  {
    static const uint_32t initial_hash[5] = {
      0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
    };
    int k;

    for(k = 0; k < 5; ++k)
      ctx->hash[k] = initial_hash[k];

    ctx->count[1] = 0;
    ctx->count[0] = 0;
  }

  /* Feed `len` bytes from `data` into the context. Bytes are appended to   */
  /* the block buffer ctx->wbuf; each time the buffer fills it is byte      */
  /* swapped (where bsw_32 is active) and passed to sha1_compile(). Any     */
  /* remainder shorter than a block stays buffered for a later call.        */
  VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1])
  {
    unsigned char *const block = (unsigned char*)ctx->wbuf;
    const unsigned char *src = data;
    uint_32t fill = (uint_32t)(ctx->count[0] & SHA1_MASK);  /* bytes already buffered */
    uint_32t room = SHA1_BLOCK_SIZE - fill;                 /* bytes until block is full */

    /* update the byte count, carrying into the high word on wrap-around */
    ctx->count[0] += len;
    if(ctx->count[0] < len)
      ++(ctx->count[1]);

    /* transfer whole blocks if possible; after the first one the buffer */
    /* is empty again, so every later block starts at offset 0           */
    for( ; len >= room; fill = 0, room = SHA1_BLOCK_SIZE)
    {
      memcpy(block + fill, src, room);
      src += room;
      len -= room;
      bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2);
      sha1_compile(ctx);
    }

    /* keep the tail for the next call or for sha1_end() */
    memcpy(block + fill, src, len);
  }

  /* SHA1 final padding and digest calculation.                             */
  /* hval receives SHA1_DIGEST_SIZE bytes of digest; ctx is the context     */
  /* that has been fed through sha1_hash(). The context's buffer and hash   */
  /* are modified, so it must be re-initialised with sha1_begin() before    */
  /* being used for another message.                                        */
  VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1])
  {
    /* number of message bytes currently held in the block buffer */
    uint_32t    i = (uint_32t)(ctx->count[0] & SHA1_MASK);

    /* put bytes in the buffer in an order in which references to   */
    /* 32-bit words will put bytes with lower addresses into the    */
    /* top of 32 bit words on BOTH big and little endian machines   */
    bsw_32(ctx->wbuf, (i + 3) >> 2);

    /* we now need to mask valid bytes and add the padding which is */
    /* a single 1 bit and as many zero bits as necessary. Note that */
    /* we can always add the first padding byte here because the    */
    /* buffer always has at least one empty slot                    */
    /* (~i & 3) is 3 - (i mod 4): the byte position, counted from   */
    /* the low end of the word, where the 0x80 padding byte goes    */
    ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
    ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);

    /* we need 9 or more empty positions, one for the padding byte  */
    /* (above) and eight for the length count. If there is not      */
    /* enough space, pad and empty the buffer                       */
    if(i > SHA1_BLOCK_SIZE - 9)
    {
      /* padding landed in word 14: word 15 must be cleared by hand */
      if(i < 60) ctx->wbuf[15] = 0;
      sha1_compile(ctx);
      i = 0;
    }
    else    /* compute a word index for the empty buffer positions  */
      i = (i >> 2) + 1;

    while(i < 14) /* and zero pad all but last two positions        */
      ctx->wbuf[i++] = 0;

    /* the following 32-bit length fields are assembled in the      */
    /* wrong byte order on little endian machines but this is       */
    /* corrected later since they are only ever used as 32-bit      */
    /* word values.                                                 */
    /* the byte count is converted to a bit count (shift left by 3) */
    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
    ctx->wbuf[15] = ctx->count[0] << 3;
    sha1_compile(ctx);

    /* extract the hash value as bytes in case the hash buffer is   */
    /* misaligned for 32-bit words                                  */
    /* (most significant byte of each hash word is emitted first)   */
    for(i = 0; i < SHA1_DIGEST_SIZE; ++i)
      hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
  }

  /* One-shot convenience wrapper: hash `len` bytes of `data` with a        */
  /* temporary context and write the digest to `hval`.                      */
  VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len)
  {
    sha1_ctx state[1];

    sha1_begin(state);
    sha1_hash(data, len, state);
    sha1_end(hval, state);
  }

  /*************************Start Unrolled Functions*************************************/

  // Multi-lane counterpart of rng_spawn(): spawns MY_SIMD_WIDTH child
  // states at once. All arrays are laid out as [MY_SIMD_WIDTH][...], one
  // row per lane; lane s hashes mystate[s] followed by spawnnumber[s]
  // (4 bytes, most-significant first) and writes its digest to newstate[s].
  void rng_spawn_unroll(RNG_state mystate[MY_SIMD_WIDTH][20],
                        RNG_state newstate[MY_SIMD_WIDTH][20], int spawnnumber[MY_SIMD_WIDTH]){

    struct sha1_context ctx[MY_SIMD_WIDTH];
    u_int8_t bytes[MY_SIMD_WIDTH][4];

    for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane){
      const int child = spawnnumber[lane];

      // k = 0 takes bits 31..24, k = 3 takes bits 7..0
      for (int k = 0; k < 4; ++k)
        bytes[lane][k] = (u_int8_t) (0xFF & (child >> (8 * (3 - k))));

      sha1_begin(&ctx[lane]);
    }

    sha1_hash_unroll1(mystate, 20, ctx);
    sha1_hash_unroll2(bytes, 4, ctx);
    sha1_end_unroll(newstate, ctx);
  }


  /* Transpose MY_SIMD_WIDTH per-lane contexts into one vector context:     */
  /* every field element [j] of lane i is stored at ctx_vec->field[j][i],   */
  /* so that the values of all lanes for one element are adjacent.          */
  VOID_RETURN AoS_to_SoA(const sha1_ctx ctx[MY_SIMD_WIDTH], sha1_ctx_vec* ctx_vec){
    /* byte counters (2 words per lane) */
    for (int w = 0; w < 2; ++w)
      for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane)
        ctx_vec->count[w][lane] = ctx[lane].count[w];

    /* running hash (5 words per lane) */
    for (int w = 0; w < 5; ++w)
      for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane)
        ctx_vec->hash[w][lane] = ctx[lane].hash[w];

    /* block buffer (16 words per lane) */
    for (int w = 0; w < 16; ++w)
      for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane)
        ctx_vec->wbuf[w][lane] = ctx[lane].wbuf[w];
  }

  /* Inverse of AoS_to_SoA(): scatter a vector context back into            */
  /* MY_SIMD_WIDTH per-lane contexts, copying ctx_vec->field[j][i] to       */
  /* field element [j] of lane i.                                           */
  VOID_RETURN SoA_to_AoS(const sha1_ctx_vec* ctx_vec, sha1_ctx ctx[MY_SIMD_WIDTH]){
    /* byte counters (2 words per lane) */
    for (int w = 0; w < 2; ++w)
      for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane)
        ctx[lane].count[w] = ctx_vec->count[w][lane];

    /* running hash (5 words per lane) */
    for (int w = 0; w < 5; ++w)
      for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane)
        ctx[lane].hash[w] = ctx_vec->hash[w][lane];

    /* block buffer (16 words per lane) */
    for (int w = 0; w < 16; ++w)
      for (int lane = 0; lane < MY_SIMD_WIDTH; ++lane)
        ctx[lane].wbuf[w] = ctx_vec->wbuf[w][lane];
  }

/* Vector counterparts of rotl32/rotr32: rotate every 32-bit element of    */
/* x_vec by n bits. NOTE(review): n is not parenthesised in the expansion, */
/* so pass a plain constant (all uses in this file do).                    */
#define rotl32_vec(x_vec, n) (_mm_or_si128(_mm_slli_epi32((x_vec), n), _mm_srli_epi32((x_vec), (32-n))))
#define rotr32_vec(x_vec, n) (_mm_or_si128(_mm_srli_epi32((x_vec), n), _mm_slli_epi32((x_vec), (32-n))))

/* Vector counterparts of ch/parity/maj built from 128-bit and/or/xor.     */
/* maj_vec combines with OR where the scalar maj uses XOR on (x, y); both  */
/* forms yield the bitwise majority of the three inputs.                   */
#define ch_vec(x_vec, y_vec, z_vec) (_mm_xor_si128((z_vec), _mm_and_si128((x_vec), _mm_xor_si128((y_vec), (z_vec)))))
#define parity_vec(x_vec, y_vec, z_vec) (_mm_xor_si128(_mm_xor_si128((x_vec), (y_vec)), (z_vec)))
#define maj_vec(x_vec, y_vec, z_vec) (_mm_or_si128(_mm_and_si128((x_vec), (y_vec)), _mm_and_si128((z_vec),_mm_or_si128((x_vec), (y_vec)))))

/* one_cycle_vec: one round on all lanes at once. Mirrors one_cycle():     */
/* e += rotr(a,27) + f(b,c,d) + k + h using 32-bit lane-wise adds, then b  */
/* is rotated right by 2. Expands to two statements without a trailing     */
/* semicolon; use only at statement level.                                 */
#define one_cycle_vec(v_vec,a,b,c,d,e,f_vec,k_vec,h_vec)            \
  q(v_vec,e) = _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(_mm_add_epi32( \
                                                                       q(v_vec,e), rotr32_vec(q(v_vec,a), 27)), \
                                                         f_vec(q(v_vec,b),q(v_vec,c),q(v_vec,d))), k_vec), h_vec); \
  q(v_vec, b) = rotr32_vec(q(v_vec, b), 2)

/* five_cycle_vec: five consecutive rounds i..i+4 using the hf_vec() macro */
/* in effect at the point of expansion, with the same index rotation as    */
/* five_cycle().                                                           */
#define five_cycle_vec(v_vec,f_vec,k_vec,i)                 \
  one_cycle_vec(v_vec, 0,1,2,3,4, f_vec,k_vec,hf_vec(i  ));   \
  one_cycle_vec(v_vec, 4,0,1,2,3, f_vec,k_vec,hf_vec(i+1));   \
  one_cycle_vec(v_vec, 3,4,0,1,2, f_vec,k_vec,hf_vec(i+2));   \
  one_cycle_vec(v_vec, 2,3,4,0,1, f_vec,k_vec,hf_vec(i+3));   \
  one_cycle_vec(v_vec, 1,2,3,4,0, f_vec,k_vec,hf_vec(i+4))

  //for SSE (128) now.
  // Vector version of sha1_compile(): processes one block for every lane
  // of ctx_vec at the same time and adds the result into ctx_vec->hash.
  // ctx_vec->wbuf is overwritten in place from round 16 onwards.
  // NOTE(review): the casts to __m128i* and the _mm_load_si128 calls assume
  // that each row of ctx_vec->wbuf / ctx_vec->hash is exactly one __m128i
  // wide (four 32-bit lanes) and 16-byte aligned - confirm against the
  // declaration of sha1_ctx_vec.
  VOID_RETURN sha1_compile_vec(sha1_ctx_vec* ctx_vec){
    __m128i *w_vec = (__m128i*)(ctx_vec->wbuf);

    // load the current hash of all lanes into the five working vectors
#ifdef ARRAY
    __m128i v_vec[5];
    memcpy(v_vec, ctx_vec->hash, 5 * sizeof(__m128i));
#else
    __m128i v_vec0, v_vec1, v_vec2, v_vec3, v_vec4;
    v_vec0 = _mm_load_si128((__m128i*)(ctx_vec->hash[0]));
    v_vec1 = _mm_load_si128((__m128i*)(ctx_vec->hash[1]));
    v_vec2 = _mm_load_si128((__m128i*)(ctx_vec->hash[2]));
    v_vec3 = _mm_load_si128((__m128i*)(ctx_vec->hash[3]));
    v_vec4 = _mm_load_si128((__m128i*)(ctx_vec->hash[4]));
#endif

    // rounds 0..15 read the buffered words directly
#define hf_vec(i) w_vec[i]
    five_cycle_vec(v_vec, ch_vec, _mm_set1_epi32(0x5a827999), 0);
    five_cycle_vec(v_vec, ch_vec, _mm_set1_epi32(0x5a827999), 5);
    five_cycle_vec(v_vec, ch_vec, _mm_set1_epi32(0x5a827999), 10);
    one_cycle_vec(v_vec, 0, 1, 2, 3, 4, ch_vec, _mm_set1_epi32(0x5a827999), hf_vec(15));

    // rounds 16..79: hf_vec(i) computes the next schedule vector from four
    // earlier ones, rotated left by 1, and stores it back at index (i & 15)
#undef hf_vec
#define hf_vec(i) (w_vec[(i) & 15] = rotl32_vec(                    \
                                                _mm_xor_si128(_mm_xor_si128(  \
                                                                            _mm_xor_si128(w_vec[((i) + 13) & 15], w_vec[((i) + 8) & 15]), \
                                                                            w_vec[((i) +  2) & 15]), w_vec[(i) & 15]), 1))


    one_cycle_vec(v_vec,4,0,1,2,3, ch_vec, _mm_set1_epi32(0x5a827999), hf_vec(16));
    one_cycle_vec(v_vec,3,4,0,1,2, ch_vec, _mm_set1_epi32(0x5a827999), hf_vec(17));
    one_cycle_vec(v_vec,2,3,4,0,1, ch_vec, _mm_set1_epi32(0x5a827999), hf_vec(18));
    one_cycle_vec(v_vec,1,2,3,4,0, ch_vec, _mm_set1_epi32(0x5a827999), hf_vec(19));

    // rounds 20..39
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0x6ed9eba1),  20);
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0x6ed9eba1),  25);
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0x6ed9eba1),  30);
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0x6ed9eba1),  35);

    // rounds 40..59
    five_cycle_vec(v_vec, maj_vec, _mm_set1_epi32(0x8f1bbcdc),  40);
    five_cycle_vec(v_vec, maj_vec, _mm_set1_epi32(0x8f1bbcdc),  45);
    five_cycle_vec(v_vec, maj_vec, _mm_set1_epi32(0x8f1bbcdc),  50);
    five_cycle_vec(v_vec, maj_vec, _mm_set1_epi32(0x8f1bbcdc),  55);

    // rounds 60..79
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0xca62c1d6),  60);
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0xca62c1d6),  65);
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0xca62c1d6),  70);
    five_cycle_vec(v_vec, parity_vec, _mm_set1_epi32(0xca62c1d6),  75);


    // fold the working vectors back into the running hash: the hash is
    // copied into a local vector array, added lane-wise, and copied back
    __m128i ctx_vec_hash_vec[5];
    memcpy(ctx_vec_hash_vec, ctx_vec->hash, 5 * sizeof(__m128i));
#ifdef ARRAY
    ctx_vec_hash_vec[0] = _mm_add_epi32(ctx_vec_hash_vec[0], v_vec[0]);
    ctx_vec_hash_vec[1] = _mm_add_epi32(ctx_vec_hash_vec[1], v_vec[1]);
    ctx_vec_hash_vec[2] = _mm_add_epi32(ctx_vec_hash_vec[2], v_vec[2]);
    ctx_vec_hash_vec[3] = _mm_add_epi32(ctx_vec_hash_vec[3], v_vec[3]);
    ctx_vec_hash_vec[4] = _mm_add_epi32(ctx_vec_hash_vec[4], v_vec[4]);
#else
    ctx_vec_hash_vec[0] = _mm_add_epi32(ctx_vec_hash_vec[0], v_vec0);
    ctx_vec_hash_vec[1] = _mm_add_epi32(ctx_vec_hash_vec[1], v_vec1);
    ctx_vec_hash_vec[2] = _mm_add_epi32(ctx_vec_hash_vec[2], v_vec2);
    ctx_vec_hash_vec[3] = _mm_add_epi32(ctx_vec_hash_vec[3], v_vec3);
    ctx_vec_hash_vec[4] = _mm_add_epi32(ctx_vec_hash_vec[4], v_vec4);
#endif
    memcpy(ctx_vec->hash, ctx_vec_hash_vec, 5 * sizeof(__m128i));
  }


  /* Multi-lane sha1_hash() for 20-byte-wide input rows: feeds `len` bytes  */
  /* from data[s] into ctx[s] for every lane s.                             */
  /* All lanes are assumed to hold the same byte count, so the buffer       */
  /* offset, the remaining length and the free space are shared; they are   */
  /* taken from lane 0.                                                     */
  /* The shared values are updated once per block, AFTER every lane has     */
  /* copied its data. Updating them inside the per-lane loop would make     */
  /* lanes 1..N-1 copy with the wrong offset and size and would subtract    */
  /* from len once per lane.                                                */
  VOID_RETURN sha1_hash_unroll1(const unsigned char data[MY_SIMD_WIDTH][20], unsigned long len, sha1_ctx ctx[MY_SIMD_WIDTH]){
    //All SIMD lanes share the same pos, len, space
    uint_32t pos = (uint_32t)(ctx[0].count[0] & SHA1_MASK), space = SHA1_BLOCK_SIZE - pos;

    const unsigned char * sp[MY_SIMD_WIDTH];
    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      sp[s] = data[s];
      /* update the byte count, carrying into the high word on wrap-around */
      if((ctx[s].count[0] += len) < len) ++(ctx[s].count[1]);
    }

    while(len >= space)     /* transfer whole blocks if possible  */
    {
      for (int s = 0; s < MY_SIMD_WIDTH; ++s){
        memcpy(((unsigned char*)ctx[s].wbuf) + pos, sp[s], space);
        sp[s] += space;
        bsw_32(ctx[s].wbuf, SHA1_BLOCK_SIZE >> 2);
      }
      /* shared bookkeeping: one block consumed, buffer is empty again */
      len -= space; space = SHA1_BLOCK_SIZE; pos = 0;

      //Transpose ctx[MY_SIMD_WIDTH] to ctx_vec with arrays [n][MY_SIMD_WIDTH]
      sha1_ctx_vec ctx_vec;
      AoS_to_SoA(ctx, &ctx_vec);
      //Vectorized Kernel
      sha1_compile_vec(&ctx_vec);
      //Transpose ctx_vec with arrays [n][MY_SIMD_WIDTH] to ctx[MY_SIMD_WIDTH]
      SoA_to_AoS(&ctx_vec, ctx);
    }

    /* keep the tail of every lane for a later call or for sha1_end_unroll() */
    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      memcpy(((unsigned char*)ctx[s].wbuf) + pos, sp[s], len);
    }
  }

  /* Multi-lane sha1_hash() for 4-byte-wide input rows: feeds `len` bytes   */
  /* from data[s] into ctx[s] for every lane s.                             */
  /* All lanes are assumed to hold the same byte count, so the buffer       */
  /* offset, the remaining length and the free space are shared; they are   */
  /* taken from lane 0.                                                     */
  /* The shared values are updated once per block, AFTER every lane has     */
  /* copied its data. Updating them inside the per-lane loop would make     */
  /* lanes 1..N-1 copy with the wrong offset and size and would subtract    */
  /* from len once per lane.                                                */
  VOID_RETURN sha1_hash_unroll2(const unsigned char data[MY_SIMD_WIDTH][4], unsigned long len, sha1_ctx ctx[MY_SIMD_WIDTH]){
    //All SIMD lanes share the same pos, len, space
    uint_32t pos = (uint_32t)(ctx[0].count[0] & SHA1_MASK), space = SHA1_BLOCK_SIZE - pos;

    const unsigned char * sp[MY_SIMD_WIDTH];
    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      sp[s] = data[s];
      /* update the byte count, carrying into the high word on wrap-around */
      if((ctx[s].count[0] += len) < len) ++(ctx[s].count[1]);
    }

    while(len >= space)     /* transfer whole blocks if possible  */
    {
      for (int s = 0; s < MY_SIMD_WIDTH; ++s){
        memcpy(((unsigned char*)ctx[s].wbuf) + pos, sp[s], space);
        sp[s] += space;
        bsw_32(ctx[s].wbuf, SHA1_BLOCK_SIZE >> 2);
      }
      /* shared bookkeeping: one block consumed, buffer is empty again */
      len -= space; space = SHA1_BLOCK_SIZE; pos = 0;

      //Transpose ctx[MY_SIMD_WIDTH] to ctx_vec with arrays [n][MY_SIMD_WIDTH]
      sha1_ctx_vec ctx_vec;
      AoS_to_SoA(ctx, &ctx_vec);
      //Vectorized Kernel
      sha1_compile_vec(&ctx_vec);
      //Transpose ctx_vec with arrays [n][MY_SIMD_WIDTH] to ctx[MY_SIMD_WIDTH]
      SoA_to_AoS(&ctx_vec, ctx);
    }

    /* keep the tail of every lane for a later call or for sha1_end_unroll() */
    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      memcpy(((unsigned char*)ctx[s].wbuf) + pos, sp[s], len);
    }
  }


  /* Multi-lane sha1_end(): pads every lane's buffered data, appends the    */
  /* bit length, runs the vectorized compression and writes lane s's        */
  /* digest to hval[s]. Per-lane bookkeeping is done with scalar code; the  */
  /* compression itself goes through AoS_to_SoA / sha1_compile_vec /        */
  /* SoA_to_AoS.                                                            */
  /* NOTE(review): the decision whether an extra block is needed is taken   */
  /* from lane 0 only (i[0]), so all lanes are assumed to hold the same     */
  /* byte count.                                                            */
  VOID_RETURN sha1_end_unroll(unsigned char hval[MY_SIMD_WIDTH][20], sha1_ctx ctx[MY_SIMD_WIDTH])
  {
    /* i[s]: number of message bytes buffered in lane s; reused later as   */
    /* a word index and as the digest byte index                           */
    uint_32t i[MY_SIMD_WIDTH];
    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      i[s] = (uint_32t)(ctx[s].count[0] & SHA1_MASK);

      /* put bytes in the buffer in an order in which references to   */
      /* 32-bit words will put bytes with lower addresses into the    */
      /* top of 32 bit words on BOTH big and little endian machines   */
      bsw_32(ctx[s].wbuf, (i[s] + 3) >> 2);

      /* we now need to mask valid bytes and add the padding which is */
      /* a single 1 bit and as many zero bits as necessary. Note that */
      /* we can always add the first padding byte here because the    */
      /* buffer always has at least one empty slot                    */
      ctx[s].wbuf[i[s] >> 2] &= 0xffffff80 << 8 * (~i[s] & 3);
      ctx[s].wbuf[i[s] >> 2] |= 0x00000080 << 8 * (~i[s] & 3);

      /* we need 9 or more empty positions, one for the padding byte  */
      /* (above) and eight for the length count. If there is not      */
      /* enough space, pad and empty the buffer                       */
    }
    if(i[0] > SHA1_BLOCK_SIZE - 9)
    {
      /* no room for the length: finish this block and start a new one */
      for (int s = 0; s < MY_SIMD_WIDTH; ++s){
        if(i[s] < 60) ctx[s].wbuf[15] = 0;
      }
      //Transpose ctx[MY_SIMD_WIDTH] to ctx_vec with arrays [n][MY_SIMD_WIDTH]
      sha1_ctx_vec ctx_vec;
      AoS_to_SoA(ctx, &ctx_vec);
      //Vectorized Kernel
      sha1_compile_vec(&ctx_vec);
      //Transpose ctx_vec with arrays [n][MY_SIMD_WIDTH] to ctx[MY_SIMD_WIDTH]
      SoA_to_AoS(&ctx_vec, ctx);
      for (int s = 0; s < MY_SIMD_WIDTH; ++s){
        i[s] = 0;
      }
    }
    else    /* compute a word index for the empty buffer positions  */
      for (int s = 0; s < MY_SIMD_WIDTH; ++s){
        i[s] = (i[s] >> 2) + 1;
      }

    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      while(i[s] < 14) /* and zero pad all but last two positions        */
        ctx[s].wbuf[i[s]++] = 0;

      /* the following 32-bit length fields are assembled in the      */
      /* wrong byte order on little endian machines but this is       */
      /* corrected later since they are only ever used as 32-bit      */
      /* word values.                                                 */
      /* the byte count is converted to a bit count (shift left by 3) */
      ctx[s].wbuf[14] = (ctx[s].count[1] << 3) | (ctx[s].count[0] >> 29);
      ctx[s].wbuf[15] = ctx[s].count[0] << 3;
    }
    //Transpose ctx[MY_SIMD_WIDTH] to ctx_vec with arrays [n][MY_SIMD_WIDTH]
    sha1_ctx_vec ctx_vec;
    AoS_to_SoA(ctx, &ctx_vec);
    //Vectorized Kernel
    sha1_compile_vec(&ctx_vec);
    //Transpose ctx_vec with arrays [n][MY_SIMD_WIDTH] to ctx[MY_SIMD_WIDTH]
    SoA_to_AoS(&ctx_vec, ctx);

    for (int s = 0; s < MY_SIMD_WIDTH; ++s){
      /* extract the hash value as bytes in case the hash buffer is   */
      /* misaligned for 32-bit words                                  */
      /* (most significant byte of each hash word is emitted first)   */
      for(i[s] = 0; i[s] < SHA1_DIGEST_SIZE; ++i[s])
        hval[s][i[s]] = (unsigned char)(ctx[s].hash[i[s] >> 2] >> (8 * (~i[s] & 3)));
    }
  }

  /*************************End Unrolled Functions***************************************/

#if defined(__cplusplus)
}
#endif
