/home/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/9.1/release/transpose Starting...

> Device 0: "Quadro GV100"
> SM Capability 7.0 detected:
> [Quadro GV100] has 80 MP(s) x -1 (Cores/MP) = -80 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 512
> MatrixSize Y = 512

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 70.3981 GB/s, Time = 0.02774 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 61.4036 GB/s, Time = 0.03181 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 71.6375 GB/s, Time = 0.02726 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 68.3484 GB/s, Time = 0.02858 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 85.0072 GB/s, Time = 0.02298 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 46.4853 GB/s, Time = 0.04202 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 83.6098 GB/s, Time = 0.02336 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 66.2705 GB/s, Time = 0.02947 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 86.9447 GB/s, Time = 0.02246 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 68.5019 GB/s, Time = 0.02851 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 82.5916 GB/s, Time = 0.02365 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 67.3677 GB/s, Time = 0.02899 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 85.4834 GB/s, Time = 0.02285 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 69.3581 GB/s, Time = 0.02816 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 71.1365 GB/s, Time = 0.02746 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 70.1553 GB/s, Time = 0.02784 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
