/home/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/10.1/release/transpose Starting...

> Device 0: "TITAN V"
> SM Capability 7.0 detected:
> [TITAN V] has 80 MP(s) x -1 (Cores/MP) = -80 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 128
> MatrixSize Y = 128

Matrix size: 128x128 (8x8 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 0.0167 GB/s, Time = 7.30378 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 0.0162 GB/s, Time = 7.53005 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 0.0179 GB/s, Time = 6.80966 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 0.0187 GB/s, Time = 6.53043 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 0.0197 GB/s, Time = 6.19469 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 0.0207 GB/s, Time = 5.90051 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 0.0218 GB/s, Time = 5.60170 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 0.0220 GB/s, Time = 5.53738 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 0.0232 GB/s, Time = 5.25107 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 0.0235 GB/s, Time = 5.18579 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 0.0242 GB/s, Time = 5.04611 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 0.0252 GB/s, Time = 4.85094 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 0.0260 GB/s, Time = 4.69978 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 0.0247 GB/s, Time = 4.94490 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 0.0266 GB/s, Time = 4.59360 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 0.0189 GB/s, Time = 6.44899 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
