/home/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/10.1/release/transpose Starting...

> Device 0: "GeForce GTX TITAN"
> SM Capability 3.5 detected:
> [GeForce GTX TITAN] has 14 MP(s) x -1 (Cores/MP) = -14 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 512
> MatrixSize Y = 512

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 0.8717 GB/s, Time = 2.24067 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 1.2386 GB/s, Time = 1.57683 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 1.2426 GB/s, Time = 1.57174 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 1.2283 GB/s, Time = 1.59005 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 1.2511 GB/s, Time = 1.56112 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 1.2562 GB/s, Time = 1.55478 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 1.2047 GB/s, Time = 1.62128 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 1.2595 GB/s, Time = 1.55072 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 1.2368 GB/s, Time = 1.57923 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 1.2693 GB/s, Time = 1.53872 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 1.2754 GB/s, Time = 1.53142 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 1.2726 GB/s, Time = 1.53472 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 1.2790 GB/s, Time = 1.52704 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 1.1752 GB/s, Time = 1.66195 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 1.2865 GB/s, Time = 1.51814 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 1.2375 GB/s, Time = 1.57827 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
