/home/tgrogers/github-purdue/aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/9.1/release/transpose Starting...

> Device 0: "TITAN X (Pascal)"
> SM Capability 6.1 detected:
> [TITAN X (Pascal)] has 28 MP(s) x -1 (Cores/MP) = -28 (Cores)
> Compute performance scaling factor = 1.00

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 64.8620 GB/s, Time = 0.03011 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 57.9081 GB/s, Time = 0.03373 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 69.5954 GB/s, Time = 0.02806 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 60.2519 GB/s, Time = 0.03242 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 72.1456 GB/s, Time = 0.02707 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 40.6901 GB/s, Time = 0.04800 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 75.2591 GB/s, Time = 0.02595 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 62.2808 GB/s, Time = 0.03136 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 76.1987 GB/s, Time = 0.02563 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 55.1356 GB/s, Time = 0.03542 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 70.6425 GB/s, Time = 0.02765 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 60.4308 GB/s, Time = 0.03232 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 74.3425 GB/s, Time = 0.02627 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 55.7908 GB/s, Time = 0.03501 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 71.7217 GB/s, Time = 0.02723 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 60.3711 GB/s, Time = 0.03235 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
