/home/tgrogers/github-purdue/aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/10.0/release/transpose Starting...

> Device 0: "TITAN V"
> SM Capability 7.0 detected:
> [TITAN V] has 80 MP(s) x -1 (Cores/MP) = -80 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 128
> MatrixSize Y = 128

Matrix size: 128x128 (8x8 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 6.7279 GB/s, Time = 0.01814 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 5.3056 GB/s, Time = 0.02301 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 7.4798 GB/s, Time = 0.01632 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 5.7537 GB/s, Time = 0.02122 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 7.5539 GB/s, Time = 0.01616 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 6.4005 GB/s, Time = 0.01907 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 7.4798 GB/s, Time = 0.01632 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 6.2742 GB/s, Time = 0.01946 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 8.1511 GB/s, Time = 0.01498 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 6.1827 GB/s, Time = 0.01974 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 8.2213 GB/s, Time = 0.01485 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 6.4876 GB/s, Time = 0.01882 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 7.6142 GB/s, Time = 0.01603 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 6.4766 GB/s, Time = 0.01885 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 8.2037 GB/s, Time = 0.01488 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 6.1428 GB/s, Time = 0.01987 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
