/home/tgrogers/github-purdue/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/9.1/release/transpose Starting...

> Device 0: "Tesla P100-SXM2-16GB"
> SM Capability 6.0 detected:
> [Tesla P100-SXM2-16GB] has 56 MP(s) x -1 (Cores/MP) = -56 (Cores)
> Compute performance scaling factor = 1.00

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 74.5240 GB/s, Time = 0.02621 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 78.0501 GB/s, Time = 0.02502 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 43.8471 GB/s, Time = 0.04454 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 79.3695 GB/s, Time = 0.02461 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 44.7472 GB/s, Time = 0.04365 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 50.4423 GB/s, Time = 0.03872 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 44.4215 GB/s, Time = 0.04397 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 67.8168 GB/s, Time = 0.02880 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 47.5352 GB/s, Time = 0.04109 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 69.5954 GB/s, Time = 0.02806 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 41.3237 GB/s, Time = 0.04726 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 65.9127 GB/s, Time = 0.02963 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 48.0214 GB/s, Time = 0.04067 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 69.4370 GB/s, Time = 0.02813 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 45.4469 GB/s, Time = 0.04298 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 68.2720 GB/s, Time = 0.02861 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
