/home/tgrogers-raid/a/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/10.1/release/transpose Starting...

> Device 0: "Tesla V100-SXM2-32GB"
> SM Capability 7.0 detected:
> [Tesla V100-SXM2-32GB] has 80 MP(s) x -1 (Cores/MP) = -80 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 512
> MatrixSize Y = 512

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 0.3964 GB/s, Time = 4.92774 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 0.4410 GB/s, Time = 4.42886 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 0.4487 GB/s, Time = 4.35261 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 0.4401 GB/s, Time = 4.43824 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 0.4540 GB/s, Time = 4.30189 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 0.4483 GB/s, Time = 4.35712 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 0.4552 GB/s, Time = 4.29050 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 0.4443 GB/s, Time = 4.39619 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 0.4553 GB/s, Time = 4.28957 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 0.4508 GB/s, Time = 4.33222 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 0.4537 GB/s, Time = 4.30525 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 0.4452 GB/s, Time = 4.38752 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 0.4558 GB/s, Time = 4.28538 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 0.4492 GB/s, Time = 4.34832 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 0.4532 GB/s, Time = 4.31002 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 0.4481 GB/s, Time = 4.35824 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
