/home/tgrogers/github-purdue/aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/9.1/release/transpose Starting...

> Device 0: "TITAN X (Pascal)"
> SM Capability 6.1 detected:
> [TITAN X (Pascal)] has 28 MP(s) x -1 (Cores/MP) = -28 (Cores)
> Compute performance scaling factor = 1.00

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 0.0020 GB/s, Time = 1000.78491 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 0.0021 GB/s, Time = 948.81720 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 0.0020 GB/s, Time = 960.67889 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 0.0020 GB/s, Time = 980.82123 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 0.0018 GB/s, Time = 1058.72974 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 0.0020 GB/s, Time = 970.54944 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 0.0019 GB/s, Time = 1029.66199 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 0.0019 GB/s, Time = 1055.04688 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 0.0020 GB/s, Time = 985.15479 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 0.0020 GB/s, Time = 977.23853 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 0.0019 GB/s, Time = 1004.73907 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 0.0019 GB/s, Time = 1015.67847 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 0.0017 GB/s, Time = 1152.88928 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 0.0018 GB/s, Time = 1074.84143 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 0.0020 GB/s, Time = 967.73090 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 0.0017 GB/s, Time = 1138.00610 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
