port from perforce
This commit is contained in:
28
ruins64k/tools/NvPerfUtility/build/NvPerfSDK.props
Normal file
28
ruins64k/tools/NvPerfUtility/build/NvPerfSDK.props
Normal file
@@ -0,0 +1,28 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
<!--
  Property sheet that locates the NvPerf SDK and publishes two macros:

    NvPerfSdkPath     : normalized root of the SDK that was found; empty when
                        no candidate location contains the SDK.
    NvPerfUtilityPath : normalized path of ../../NvPerfUtility/ relative to
                        this file.

  Candidate SDK locations are probed in this order, and the first one that
  contains bin/x64/nvperf_host.dll wins:
    1. ../../NvPerf/     relative to this file
    2. ../../../NvPerf/  relative to this file
    3. the NVPERF_SDK_PATH environment variable / property
-->
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ImportGroup Label="PropertySheets" />
  <PropertyGroup Label="UserMacros">
    <!-- File whose presence marks a directory as an NvPerf SDK root. -->
    <_Relative_NvPerf_host_dll>bin/x64/nvperf_host.dll</_Relative_NvPerf_host_dll>
    <_Possible_NvPerf_Dir_0>$([MSBuild]::NormalizePath('$(MSBuildThisFileDirectory)../../NvPerf/'))</_Possible_NvPerf_Dir_0>
    <_Possible_NvPerf_Dir_1>$([MSBuild]::NormalizePath('$(MSBuildThisFileDirectory)../../../NvPerf/'))</_Possible_NvPerf_Dir_1>

    <!-- Start from an empty value so that only a location validated below is ever used. -->
    <NvPerfSdkPath></NvPerfSdkPath>
    <NvPerfSdkPath Condition="'$(NvPerfSdkPath)'=='' And Exists('$(_Possible_NvPerf_Dir_0)/$(_Relative_NvPerf_host_dll)')">$(_Possible_NvPerf_Dir_0)</NvPerfSdkPath>
    <NvPerfSdkPath Condition="'$(NvPerfSdkPath)'=='' And Exists('$(_Possible_NvPerf_Dir_1)/$(_Relative_NvPerf_host_dll)')">$(_Possible_NvPerf_Dir_1)</NvPerfSdkPath>
    <NvPerfSdkPath Condition="'$(NvPerfSdkPath)'=='' And '$(NVPERF_SDK_PATH)' != '' And Exists('$(NVPERF_SDK_PATH)/$(_Relative_NvPerf_host_dll)')">$(NVPERF_SDK_PATH)</NvPerfSdkPath>

    <!--
      The argument is quoted, like every other NormalizePath call in this file,
      so that a path containing commas or other characters that are special to
      the property-function parser is still passed as one string.
    -->
    <NvPerfSdkPath Condition="'$(NvPerfSdkPath)'!=''">$([MSBuild]::NormalizePath('$(NvPerfSdkPath)'))</NvPerfSdkPath>

    <NvPerfUtilityPath>$([MSBuild]::NormalizePath('$(MSBuildThisFileDirectory)../../NvPerfUtility/'))</NvPerfUtilityPath>
  </PropertyGroup>
  <ItemDefinitionGroup />
  <ItemGroup />

  <!-- Reports the outcome of the search before compilation; fails the build when no SDK was found. -->
  <Target Name="PrintNvPerfLocation" BeforeTargets="ClCompile">
    <Message
      Condition="'$(NvPerfSdkPath)'!=''"
      Text="NvPerf SDK found: NvPerfSdkPath = $(NvPerfSdkPath)" />
    <Error
      Condition="'$(NvPerfSdkPath)'==''"
      Text="NvPerf SDK could not be found; please unzip the SDK into one of the following locations:
        $(_Possible_NvPerf_Dir_0)
        $(_Possible_NvPerf_Dir_1)
        or set environment variable NVPERF_SDK_PATH" />
  </Target>
</Project>
|
||||
118
ruins64k/tools/NvPerfUtility/build/NvPerfUtility.vcxproj
Normal file
118
ruins64k/tools/NvPerfUtility/build/NvPerfUtility.vcxproj
Normal file
@@ -0,0 +1,118 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
<!--
  Visual C++ project of type "Utility" that lists the NvPerfUtility headers
  and the NvPerf SDK headers for the Debug|x64 and Release|x64 configurations.
  The SDK location comes from NvPerfSDK.props, imported below.
-->
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">

  <!-- Configurations offered by this project. -->
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>

  <PropertyGroup Label="Globals">
    <VCProjectVersion>16.0</VCProjectVersion>
    <Keyword>Win32Proj</Keyword>
    <ProjectGuid>{ea22d2ac-ebf7-43e4-adb7-0f320c46692e}</ProjectGuid>
    <RootNamespace>NvPerfUtility</RootNamespace>
    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
  </PropertyGroup>

  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />

  <!-- Per-configuration toolchain settings. -->
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Utility</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v142</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Utility</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v142</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>

  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>

  <!-- Optional per-user property sheets, imported only when they exist. -->
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>

  <!-- Defines NvPerfSdkPath and NvPerfUtilityPath, which the include paths below use. -->
  <Import Project="NvPerfSDK.props" />
  <PropertyGroup Label="UserMacros" />

  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <IncludePath>$(NvPerfUtilityPath)/include;$(NvPerfSdkPath)/include;$(IncludePath)</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>$(NvPerfUtilityPath)/include;$(NvPerfSdkPath)/include;$(IncludePath)</IncludePath>
  </PropertyGroup>

  <!-- Compiler and linker defaults per configuration. -->
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <SDLCheck>true</SDLCheck>
      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>

  <ItemGroup>
    <!-- NvPerfUtility headers (relative to this project file). -->
    <ClInclude Include="../include/NvPerfCounterConfiguration.h" />
    <ClInclude Include="../include/NvPerfCounterData.h" />
    <ClInclude Include="../include/NvPerfD3D.h" />
    <ClInclude Include="../include/NvPerfD3D12.h" />
    <ClInclude Include="../include/NvPerfDeviceProperties.h" />
    <ClInclude Include="../include/NvPerfInit.h" />
    <ClInclude Include="../include/NvPerfMetricsConfigBuilder.h" />
    <ClInclude Include="../include/NvPerfMetricsEvaluator.h" />
    <ClInclude Include="../include/NvPerfRangeProfiler.h" />
    <ClInclude Include="../include/NvPerfRangeProfilerD3D12.h" />
    <ClInclude Include="../include/NvPerfRangeProfilerVulkan.h" />
    <ClInclude Include="../include/NvPerfReportDefinition.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionGA10X.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionGV100.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionHAL.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionTU10X.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionTU11X.h" />
    <ClInclude Include="../include/NvPerfReportGenerator.h" />
    <ClInclude Include="../include/NvPerfReportGeneratorD3D12.h" />
    <ClInclude Include="../include/NvPerfReportGeneratorVulkan.h" />
    <ClInclude Include="../include/NvPerfVulkan.h" />
    <!-- NvPerf SDK headers (under the SDK root found by NvPerfSDK.props). -->
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_versions_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_target.h" />
  </ItemGroup>

  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
|
||||
@@ -0,0 +1,35 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
<!-- Lists the NvPerf SDK headers followed by the NvPerfUtility headers as ClInclude items. -->
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <!-- NvPerf SDK headers, resolved through the NvPerfSdkPath property. -->
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_versions_target.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_host.h" />
    <ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_target.h" />
    <!-- NvPerfUtility headers, relative to this file. -->
    <ClInclude Include="../include/NvPerfCounterConfiguration.h" />
    <ClInclude Include="../include/NvPerfCounterData.h" />
    <ClInclude Include="../include/NvPerfD3D.h" />
    <ClInclude Include="../include/NvPerfD3D12.h" />
    <ClInclude Include="../include/NvPerfDeviceProperties.h" />
    <ClInclude Include="../include/NvPerfInit.h" />
    <ClInclude Include="../include/NvPerfMetricsConfigBuilder.h" />
    <ClInclude Include="../include/NvPerfMetricsEvaluator.h" />
    <ClInclude Include="../include/NvPerfRangeProfiler.h" />
    <ClInclude Include="../include/NvPerfRangeProfilerD3D12.h" />
    <ClInclude Include="../include/NvPerfRangeProfilerVulkan.h" />
    <ClInclude Include="../include/NvPerfReportDefinition.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionGA10X.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionGV100.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionHAL.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionTU10X.h" />
    <ClInclude Include="../include/NvPerfReportDefinitionTU11X.h" />
    <ClInclude Include="../include/NvPerfReportGenerator.h" />
    <ClInclude Include="../include/NvPerfReportGeneratorD3D12.h" />
    <ClInclude Include="../include/NvPerfReportGeneratorVulkan.h" />
    <ClInclude Include="../include/NvPerfVulkan.h" />
  </ItemGroup>
</Project>
|
||||
@@ -0,0 +1,15 @@
|
||||
/*
|
||||
* Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
@@ -0,0 +1,232 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
|
||||
#===============================================================================
|
||||
# Dep File generation, compatible with Make or Ninja build systems
|
||||
#===============================================================================
|
||||
|
||||
def get_loaded_module_file_names():
    """Return the files backing the currently loaded Python modules.

    Every entry of ``sys.modules`` that has a non-empty ``__file__`` is
    resolved with ``os.path.realpath``; entries ending in ``<frozen>`` and
    paths that are not regular files are dropped.

    Returns:
        A sorted list of unique absolute file paths.
    """
    resolved_files = set()
    for module in sys.modules.values():
        source_file = getattr(module, "__file__", None)
        if source_file:
            source_file = os.path.realpath(source_file)
            if not source_file.endswith("<frozen>"):
                if not os.path.isabs(source_file):
                    source_file = os.path.abspath(source_file)
                # isfile() also filters out directories (e.g. package paths)
                if os.path.isfile(source_file):
                    resolved_files.add(source_file)
    return sorted(resolved_files)
|
||||
|
||||
def gen_depfile(target_file_path, buildroot):
    """Build the text of a Make/Ninja dependency file for one generated target.

    The target depends on every Python module file that is loaded at the time
    of the call (see ``get_loaded_module_file_names``).

    Args:
        target_file_path: path of the file being generated.
        buildroot: root directory of the build system. When the target lies
            below this directory the prefix is removed, so the depfile names
            the target relative to the build root. An empty string disables
            prefix removal.

    Returns:
        The depfile contents: ``<target>:\\`` followed by one tab-indented,
        backslash-continued line per dependency.
    """
    target_path_final = target_file_path
    # An empty buildroot must not be canonicalized: normpath('') is '.', which
    # would match (and truncate) any target path that starts with a dot.
    if buildroot:
        target_normalized = os.path.normpath(target_file_path)
        target_canonicalized = os.path.normcase(target_normalized)
        root_prefix = os.path.normcase(os.path.normpath(buildroot))
        # Match whole path components only, so '/build' is not treated as a
        # prefix of '/build2/...'.
        if not root_prefix.endswith(os.sep):
            root_prefix += os.sep
        if target_canonicalized.startswith(root_prefix):
            # Slice the normalized (not the raw) path: its layout lines up with
            # the canonicalized prefix that was just matched.
            target_path_final = target_normalized[len(root_prefix):]

    def escape(path):
        # An unescaped space would split one path into two in a depfile.
        return path.replace(' ', '\\ ')

    depfile_contents = [escape(target_path_final) + ':\\']
    for module_file_name in get_loaded_module_file_names():
        depfile_contents.append('\t' + escape(module_file_name) + ' \\')

    return '\n'.join(depfile_contents)
|
||||
|
||||
# target_file_path : the file being generated
|
||||
# buildroot : root directory of the build system; this prefix is removed from target_file_path to pacify ninja
|
||||
# depfile_path : the depfile to be written
|
||||
def write_depfile(target_file_path, buildroot, depfile_path):
    """Generate the depfile for `target_file_path` and store it at `depfile_path`.

    Args:
        target_file_path: the file being generated.
        buildroot: build-system root directory, forwarded to ``gen_depfile``.
        depfile_path: where the depfile is written (UTF-8 text, overwritten).
    """
    # The output file is opened before the contents are generated.
    with open(depfile_path, 'w', encoding='utf-8') as depfile:
        depfile.write(gen_depfile(target_file_path, buildroot))
|
||||
|
||||
#===============================================================================
|
||||
# C++ header generation
|
||||
#===============================================================================
|
||||
|
||||
def _write_string_array(out_fd, array_name, values):
    """Emit `values` as a C array of string literals named `array_name`.

    Nothing is written when `values` is empty; the struct initializer then
    refers to ``nullptr`` instead (see ``_write_array_reference``).
    """
    if len(values) == 0:
        return
    out_fd.write('\n        static const char* const {}[] = {{\n'.format(array_name))
    for value in values:
        # Values are inserted verbatim; no C string escaping is applied.
        out_fd.write('            "{}",\n'.format(value))
    out_fd.write('        };\n')


def _write_array_reference(out_fd, array_name, values):
    """Emit the ``pointer, element count,`` initializer pair for one array."""
    if len(values):
        out_fd.write('\n            {0},\n            sizeof({0}) / sizeof({0}[0]),'.format(array_name))
    else:
        out_fd.write('\n            nullptr,\n            0,')


def write_cpp_file(out_fd, report_definition):
    """Write one report definition as a C++ namespace to `out_fd`.

    The emitted namespace is named after ``report_definition.name`` and holds
    an inline ``GetReportDefinition()`` function that returns a
    ``ReportDefinition`` built from the required counter, ratio and throughput
    names and from the UTF-8 bytes of the HTML, followed by a terminating 0.

    Args:
        out_fd: writable text stream.
        report_definition: object with the attributes ``name``, ``html``,
            ``required_counters``, ``required_ratios`` and
            ``required_throughputs``.

    Raises:
        AssertionError: if ``report_definition.html`` is empty.
    """
    namespace_name = report_definition.name
    arrays = (
        ('RequiredCounters', report_definition.required_counters),
        ('RequiredRatios', report_definition.required_ratios),
        ('RequiredThroughputs', report_definition.required_throughputs),
    )

    out_fd.write('\nnamespace {} {{\n'.format(namespace_name))
    out_fd.write('\n    inline ReportDefinition GetReportDefinition()\n    {')

    # counters, ratios, throughputs
    for array_name, values in arrays:
        _write_string_array(out_fd, array_name, values)

    # html template, embedded as a byte array, 20 bytes per line
    assert len(report_definition.html)
    out_fd.write('\n        static const unsigned char ReportContents[] = {')
    html_bytes = report_definition.html.encode('utf-8')
    formatted_bytes = []
    for index, byte in enumerate(html_bytes):
        if index % 20 == 0:
            formatted_bytes.append('\n            ')
        formatted_bytes.append('0x{:02x}, '.format(byte))
    out_fd.write(''.join(formatted_bytes))
    # The trailing 0 terminates the contents so they can be used as a C string.
    out_fd.write('0x0\n        };\n')

    out_fd.write('\n        ReportDefinition reportDefinition = {')
    for array_name, values in arrays:
        _write_array_reference(out_fd, array_name, values)
    out_fd.write(
        '\n            (const char*)ReportContents'
        '\n        };'
        '\n        return reportDefinition;'
        '\n    }\n'
    )

    out_fd.write('\n}} // namespace {}\n\n\n'.format(namespace_name))
|
||||
|
||||
#===============================================================================
|
||||
# Main
|
||||
#===============================================================================
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: imports the report module of one chip, then
    # writes two debug HTML files, the C++ header and a depfile into --outDir.
    parser = argparse.ArgumentParser(description='Generate HTML report definition')
    parser.add_argument('--chip', type=str, required=True, help='chip name, e.g. tu10x')
    parser.add_argument('--outDir', type=str, required=True, help='output directory')
    parser.add_argument('--pypath', default=[], action='append', required=False, help="Python module paths.")
    parser.add_argument('--buildroot', default='', required=False, help="build root dir for depfile")
    parser.add_argument('--copyright', type=str, help="Copyright header.")

    args = parser.parse_args()
    sys.path.extend(args.pypath)
    # append(), not extend(): extend() iterates its argument and would add one
    # entry per character of the string.
    sys.path.append(".")
    chip = args.chip
    report_module_name = "report_" + args.chip
    try:
        report_module = __import__(report_module_name)
    except ImportError:
        raise ImportError('Module "{}" is not found, this could happen due to invalid chip name or insufficient --pypath.'.format(report_module_name))
    per_range_report_definition = report_module.get_per_range_report_definition()
    summary_report_definition = report_module.get_summary_report_definition()

    if not os.path.isdir(args.outDir):
        raise Exception('Invalid argument for --outDir: {}'.format(args.outDir))

    # Per-range report: debug HTML. It can be used for inspection; the debug
    # mode also lists the metrics that are used by each table.
    range_debug_html_file_name = 'NvPerfReportDefinition{}_range_debug.html'.format(chip.upper())
    range_debug_html_file_path = os.path.join(args.outDir, range_debug_html_file_name)
    with open(range_debug_html_file_path, 'w', encoding='utf-8') as out_fd:
        out_fd.write(per_range_report_definition.html)

    # Summary report: debug HTML
    summary_debug_html_file_name = 'NvPerfReportDefinition{}_summary_debug.html'.format(chip.upper())
    summary_debug_html_file_path = os.path.join(args.outDir, summary_debug_html_file_name)
    with open(summary_debug_html_file_path, 'w', encoding='utf-8') as out_fd:
        out_fd.write(summary_report_definition.html)

    # CPP file. Opened as UTF-8 like the other outputs, so the result does not
    # depend on the platform's default encoding.
    cpp_file_name = 'NvPerfReportDefinition{}.h'.format(chip.upper())
    cpp_file_path = os.path.join(args.outDir, cpp_file_name)
    with open(cpp_file_path, 'w', encoding='utf-8') as out_fd:
        if args.copyright:
            with open(args.copyright, 'r', encoding='utf-8') as copyright_fd:
                out_fd.write(copyright_fd.read())

        out_fd.write(r'''
#pragma once

#include "NvPerfReportDefinition.h"

namespace nv {{ namespace perf {{ namespace {} {{
'''.format(args.chip))
        # per-range report
        write_cpp_file(out_fd, per_range_report_definition)
        # summary report
        write_cpp_file(out_fd, summary_report_definition)
        # closes the three namespaces opened above
        out_fd.write(r'''
} } }''')

    # Emit a single depfile using the C++ file as the representative output.
    write_depfile(cpp_file_path, args.buildroot, cpp_file_path + '.d')
|
||||
@@ -0,0 +1,69 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
class DataTable:
    """Plain record describing one table of a report.

    The constructor stores each argument unchanged in the attribute of the
    same name. The three ``required_*`` attributes are iterated by the
    ``get_required_*`` helpers of this module.
    """

    def __init__(self, name, html, jsfunc, jscall, required_counters, required_ratios, required_throughputs, workflow):
        self.name = name
        self.html = html
        self.jsfunc = jsfunc
        self.jscall = jscall
        self.required_counters = required_counters
        self.required_ratios = required_ratios
        self.required_throughputs = required_throughputs
        self.workflow = workflow
|
||||
|
||||
class DataSection:
    """Group of data tables with an optional title.

    Args:
        dtables: the tables belonging to this section.
        inter_table_spacing: stored as given; defaults to True.
        title: stored as given; defaults to None.
    """

    def __init__(self, dtables, inter_table_spacing=True, title=None):
        self.dtables = dtables
        self.inter_table_spacing = inter_table_spacing
        self.title = title
|
||||
|
||||
class ReportDefinition:
    """Plain record for a complete report.

    Stores the report name, its HTML and the lists of required counters,
    ratios and throughputs in attributes named after the constructor
    arguments.
    """

    def __init__(self, name, html, required_counters, required_ratios, required_throughputs):
        self.name = name
        self.html = html
        self.required_counters = required_counters
        self.required_ratios = required_ratios
        self.required_throughputs = required_throughputs
|
||||
|
||||
def get_data_tables(sections):
    """Return the tables of all `sections` as one flat list, in section order."""
    flattened = []
    for section in sections:
        for dtable in section.dtables:
            flattened.append(dtable)
    return flattened
|
||||
|
||||
def get_required_counters(sections):
    """Return the sorted, de-duplicated counters required by all tables in `sections`."""
    unique_counters = {
        counter
        for section in sections
        for dtable in section.dtables
        for counter in dtable.required_counters
    }
    return sorted(unique_counters)
|
||||
|
||||
def get_required_ratios(sections):
    """Return the sorted, de-duplicated ratios required by all tables in `sections`."""
    unique_ratios = {
        ratio
        for section in sections
        for dtable in section.dtables
        for ratio in dtable.required_ratios
    }
    return sorted(unique_ratios)
|
||||
|
||||
def get_required_throughputs(sections):
    """Return the sorted, de-duplicated throughputs required by all tables in `sections`."""
    unique_throughputs = {
        throughput
        for section in sections
        for dtable in section.dtables
        for throughput in dtable.required_throughputs
    }
    return sorted(unique_throughputs)
|
||||
@@ -0,0 +1,80 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
import pub.ampere.tables_ga10x as tables_ga10x
|
||||
|
||||
def get_per_range_report_definition():
    """Assemble the GA10x per-range report.

    Builds the report sections from the common and GA10x table generators,
    renders them with ``tables_common.generate_range_html_common`` and
    collects the metrics the tables require.

    Returns:
        A ``ReportDefinition`` named ``'PerRangeReport'``.
    """
    device_section = DataSection([
        tables_ga10x.DevicePropertiesGenerator().make_data_table(),
        tables_common.ClocksGenerator().make_data_table(),
    ], inter_table_spacing=False)

    overview_section = DataSection([
        tables_common.TopLevelStatsGenerator().make_data_table(),
        tables_ga10x.TopThroughputsGenerator().make_data_table(),
        tables_common.CacheHitRates().make_data_table(),
    ], title='Overview Section')

    memory_section = DataSection([
        tables_common.MainMemoryGenerator().make_data_table(),
        tables_ga10x.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
        tables_ga10x.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_common.L1TexThroughputsGenerator().make_data_table(),
        tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
    ], title='Memory Performance Section')

    shader_section = DataSection([
        tables_ga10x.SmThroughputsGenerator().make_data_table(),
        tables_ga10x.SmInstExecutedGenerator().make_data_table(),
        tables_common.SmShaderExecutionGenerator().make_data_table(),
        tables_ga10x.SmResourceUsageGenerator().make_data_table(),
        tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
        tables_common.WarpIssueStallsGenerator().make_data_table(),
    ], title='Shader Performance Section')

    pipeline_section = DataSection([
        tables_common.PrimitiveDataflowGenerator().make_data_table(),
        tables_ga10x.RasterDataflowGenerator().make_data_table(),
    ], title='3D Pipeline Section')

    l2_breakdown_section = DataSection([
        tables_ga10x.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_ga10x.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
    ], title='Additional L2 Traffic Breakdowns Section')

    listings_section = DataSection([
        tables_common.AdditionalMetricsGenerator().make_data_table(),
        tables_common.AllCountersGenerator().make_data_table(),
        tables_common.AllRatiosGenerator().make_data_table(),
        tables_common.AllThroughputsGenerator().make_data_table(),
    ], title='Exhaustive Listings Section')

    # The list order is the order in which the sections are passed to the renderer.
    sections = [
        device_section,
        overview_section,
        memory_section,
        shader_section,
        pipeline_section,
        l2_breakdown_section,
        listings_section,
    ]
    html = tables_common.generate_range_html_common(sections)
    return ReportDefinition(
        'PerRangeReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
|
||||
def get_summary_report_definition():
    """Assemble the GA10x summary report.

    The report has a collection-info section and a 'Summary of Measured
    Ranges' section, rendered with
    ``tables_common.generate_summary_html_common``.

    Returns:
        A ``ReportDefinition`` named ``'SummaryReport'``.
    """
    collection_info_section = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_summary_section = DataSection([
        tables_ga10x.RangesSummaryGenerator().make_data_table(),
    ], title='Summary of Measured Ranges')

    sections = [collection_info_section, ranges_summary_section]
    html = tables_common.generate_summary_html_common(sections)
    return ReportDefinition(
        'SummaryReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
|
||||
@@ -0,0 +1,646 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
|
||||
class DevicePropertiesGenerator(tables_common.DevicePropertiesGenerator):
    """Device-properties table generator with the GA10x ``l2cacheSizePerLts`` value."""

    def __init__(self):
        super().__init__()
        # NOTE(review): unit is presumably KiB per LTS; confirm against tables_common.
        self.l2cacheSizePerLts = 128
|
||||
|
||||
class TopThroughputsGenerator(tables_common.TopThroughputsGenerator):
    """Top-throughputs table generator extended with the GA10x rows.

    After the base-class initialisation, one row per GPU unit is appended to
    ``rows`` and the throughput metric each row reads is appended to
    ``required_throughputs``.
    """

    def __init__(self):
        super().__init__()
        Row = self.Row
        ga10x_rows = [
            Row('Shader', '<a href="#SM-Instruction-Throughput">SM (Shader Cores)</a>', "getThroughputPct('sm__throughput')"),
            Row('Memory', '<a href="#L1TEX-Throughput">L1TEX Cache</a>', "getThroughputPct('l1tex__throughput')"),
            Row('Memory', '<a href="#L2-Sector-Traffic">L2 Cache</a>', "getThroughputPct('lts__throughput')"),
            Row('Memory', '<a href="#Main-Memory-Throughput">DRAM</a>', "getThroughputPct('dram__throughput')"),
            Row('Memory', '<a href="#Main-Memory-Throughput">PCIe</a>', "getThroughputPct('pcie__throughput')"),
            Row('World Pipe', '<a href="#Primitive-Data-Flow">PDA Index Fetch</a>', "getThroughputPct('pda__throughput')"),
            Row('World Pipe', '<a href="#Primitive-Data-Flow">Vertex Attr. Fetch</a>', "getThroughputPct('vaf__throughput')"),
            Row('World Pipe', '<a href="#Primitive-Data-Flow">Primitive Engine</a>', "getThroughputPct('pes__throughput')"),
            Row('Screen Pipe', '<a href="#Raster-Data-Flow">RASTER</a>', "getThroughputPct('raster__throughput')"),
            Row('Screen Pipe', '<a href="#Raster-Data-Flow">PROP (Pre-ROP)</a>', "getThroughputPct('prop__throughput')"),
            Row('Screen Pipe', '<a href="#Raster-Data-Flow">ZROP (Depth-Test)</a>', "getThroughputPct('zrop__throughput')"),
            Row('Screen Pipe', '<a href="#Raster-Data-Flow">CROP (Color Blend)</a>', "getThroughputPct('crop__throughput')"),
        ]
        self.rows += ga10x_rows

        # One entry per metric read by the rows above.
        ga10x_throughputs = [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'prop__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'zrop__throughput',
        ]
        self.required_throughputs += ga10x_throughputs
|
||||
|
||||
class SmThroughputsGenerator(tables_common.SmThroughputsGenerator):
    """SM-throughputs table generator configured with the GA10x pipe list.

    Replaces ``pipes`` with the pipes below and adds the counters of every
    pipe to ``required_counters``.
    """

    def __init__(self):
        super().__init__()
        # (pipe name, description)
        pipe_descriptions = (
            ('adu', 'Computed branches and indexed constants'),
            ('alu', 'INT32 except multiply; FP32 comparison'),
            ('cbu', 'Divergent branches and control flow'),
            ('fma', 'FP32 mul/add, FP16 mul/add'),
            ('fmaheavy', 'FP32 mul/add and INT32 multiply'),
            ('fp64', 'FP64 mul/add'),
            ('ipa', 'Pixel shader attribute interpolation'),
            ('lsu', 'Global, local, shared memory, and misc'),
            ('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            ('tex', 'Texture and surface memory'),
            ('uniform', 'Warp-level scalar operations'),
            ('xu', 'Transcendentals and float/int conversion'),
        )
        self.pipes = [self.Pipe(pipe_name, description) for pipe_name, description in pipe_descriptions]
        for pipe in self.pipes:
            self.required_counters.extend(pipe.get_counter_names())
|
||||
|
||||
|
||||
class SmInstExecutedGenerator(tables_common.SmInstExecutedGenerator):
    """SM instructions-executed table configured with the pipe list below.

    ``Pipe`` and ``required_counters`` are inherited. Some pipes pass an extra
    positional ``True``; its meaning is defined by the inherited ``Pipe``
    (not visible in this file section).
    """

    def __init__(gen):
        super().__init__()

        # Positional arguments for gen.Pipe: (name, description[, True]).
        pipe_args = (
            ('total'    , 'All instructions', True),
            ('adu'      , 'Computed branches and indexed constants'),
            ('alu'      , 'INT32 except multiply; FP32 comparison', True),
            ('cbu'      , 'Divergent branches and control flow'),
            ('fma'      , 'FP32 mul/add, FP16 mul/add', True),
            ('fmaheavy' , 'FP32 mul/add and INT32 multiply', True),
            ('fp64'     , 'FP64 mul/add', True),
            ('ipa'      , 'Pixel shader attribute interpolation', True),
            ('lsu'      , 'Global, local, shared memory, and misc', True),
            ('tensor'   , 'Tensor matrix multiply (FP16, INT8/4/1)'),
            ('tex'      , 'Texture and surface memory'),
            ('uniform'  , 'Warp-level scalar operations'),
            ('xu'       , 'Transcendentals and float/int conversion', True),
        )
        gen.pipes = [gen.Pipe(*args) for args in pipe_args]

        # Each pipe reports the counters it needs; collect them in pipe order.
        for entry in gen.pipes:
            gen.required_counters.extend(entry.get_counter_names())
|
||||
|
||||
class L2TrafficByMemoryApertureShortBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Short L2 sector-traffic breakdown: memory aperture, then operation."""

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryApertureShort'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture-Short'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += (
            ' This table decomposes L2 bandwidth to each destination, per operation.'
            ' A <a href="#L2-Sector-Traffic-By-Memory-Aperture">more detailed version of this table</a>'
            ' can be found below.'
        )

        def leaf(label, ratio):
            # A node with a single ratio and no children.
            return gen.Node(label, [ratio], [])

        # One subtree per memory aperture, each split by operation.
        dram = gen.Node('DRAM', ['lts__average_t_sector_aperture_device'], [
            leaf('Reads', 'lts__average_t_sector_aperture_device_op_read'),
            leaf('Writes', 'lts__average_t_sector_aperture_device_op_write'),
            leaf('Atomics', 'lts__average_t_sector_aperture_device_op_atom'),
            leaf('Reductions', 'lts__average_t_sector_aperture_device_op_red'),
        ])
        sysmem = gen.Node('System Memory', ['lts__average_t_sector_aperture_sysmem'], [
            leaf('Reads', 'lts__average_t_sector_aperture_sysmem_op_read'),
            leaf('Writes', 'lts__average_t_sector_aperture_sysmem_op_write'),
            leaf('Atomics', 'lts__average_t_sector_aperture_sysmem_op_atom'),
            leaf('Reductions', 'lts__average_t_sector_aperture_sysmem_op_red'),
        ])
        peer = gen.Node('Peer Memory', ['lts__average_t_sector_aperture_peer'], [
            leaf('Reads', 'lts__average_t_sector_aperture_peer_op_read'),
            leaf('Writes', 'lts__average_t_sector_aperture_peer_op_write'),
            leaf('Atomics', 'lts__average_t_sector_aperture_peer_op_atom'),
            leaf('Reductions', 'lts__average_t_sector_aperture_peer_op_red'),
        ])
        gen.nodes = [dram, sysmem, peer]

        # Must run after gen.nodes is assigned, as in the original statement order.
        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficBySrcBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic breakdown: source, unit, memory aperture, then operation."""

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownBySource'
        gen.table_id = 'L2-Sector-Traffic-By-Source'
        gen.column_names = [
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += (
            ' This table decomposes L2 bandwidth from each source unit, to each destination, per operation.'
            ' See also: these tables that prioritize'
            ' <a href="#L2-Sector-Traffic-By-Memory-Aperture">destination Memory Aperture</a>'
            ' and <a href="#L2-Sector-Traffic-By-Operation">Operation</a>.'
        )

        def leaf(label, ratio):
            # A node with a single ratio and no children.
            return gen.Node(label, [ratio], [])

        # ---- GPC source units (subtrees are built in display order) ----
        l1tex = gen.Node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', ['lts__average_t_sector_srcunit_tex'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcunit_tex_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_tex_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_tex_aperture_device_op_write'),
                leaf('Atomics', 'lts__average_t_sector_srcunit_tex_aperture_device_op_atom'),
                leaf('Reductions', 'lts__average_t_sector_srcunit_tex_aperture_device_op_red'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcunit_tex_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_tex_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_tex_aperture_peer_op_write'),
                leaf('Atomics', 'lts__average_t_sector_srcunit_tex_aperture_peer_op_atom'),
                leaf('Reductions', 'lts__average_t_sector_srcunit_tex_aperture_peer_op_red'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcunit_tex_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_write'),
                leaf('Atomics', 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_atom'),
                leaf('Reductions', 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_red'),
            ]),
        ])
        # The constant cache's 'Reads' leaf reuses the aperture ratio of its parent.
        const_cache = gen.Node('L1.5 Constant Cache', ['lts__average_t_sector_srcunit_gcc'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcunit_gcc_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_gcc_aperture_device'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcunit_gcc_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_gcc_aperture_peer'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcunit_gcc_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_gcc_aperture_sysmem'),
            ]),
        ])
        prim_engine = gen.Node('Primitive Engine', ['lts__average_t_sector_srcunit_pe'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcunit_pe_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_pe_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_pe_aperture_device_op_write'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcunit_pe_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_pe_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_pe_aperture_peer_op_write'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcunit_pe_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_pe_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_pe_aperture_sysmem_op_write'),
            ]),
        ])
        raster = gen.Node('Raster', ['lts__average_t_sector_srcunit_raster'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcunit_raster_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_raster_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_raster_aperture_device_op_write'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcunit_raster_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_raster_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_raster_aperture_peer_op_write'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcunit_raster_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_raster_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_raster_aperture_sysmem_op_write'),
            ]),
        ])
        zrop = gen.Node('ZROP', ['lts__average_t_sector_srcunit_zrop'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcunit_zrop_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_zrop_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_zrop_aperture_device_op_write'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcunit_zrop_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_zrop_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_zrop_aperture_peer_op_write'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcunit_zrop_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_zrop_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_zrop_aperture_sysmem_op_write'),
            ]),
        ])
        crop = gen.Node('CROP', ['lts__average_t_sector_srcunit_crop'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcunit_crop_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_crop_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_crop_aperture_device_op_write'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcunit_crop_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_crop_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_crop_aperture_peer_op_write'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcunit_crop_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcunit_crop_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcunit_crop_aperture_sysmem_op_write'),
            ]),
        ])
        gpc_units = gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc'],
                             [l1tex, const_cache, prim_engine, raster, zrop, crop])

        # ---- FBP source: a single aggregate unit row ----
        all_fbp = gen.Node('all FBP Units', ['lts__average_t_sector_srcnode_fbp'], [
            gen.Node('DRAM', ['lts__average_t_sector_srcnode_fbp_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcnode_fbp_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcnode_fbp_aperture_device_op_write'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcnode_fbp_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcnode_fbp_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcnode_fbp_aperture_peer_op_write'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_write'),
            ]),
        ])
        fbp_units = gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp'], [all_fbp])

        # ---- HUB source: the two upper levels carry no ratio (empty lists) ----
        all_hub = gen.Node('all HUB Units', [], [
            gen.Node('DRAM', ['lts__average_t_sector_srcnode_hub_aperture_device'], [
                leaf('Reads', 'lts__average_t_sector_srcnode_hub_aperture_device_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcnode_hub_aperture_device_op_write'),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_srcnode_hub_aperture_peer'], [
                leaf('Reads', 'lts__average_t_sector_srcnode_hub_aperture_peer_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcnode_hub_aperture_peer_op_write'),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_srcnode_hub_aperture_sysmem'], [
                leaf('Reads', 'lts__average_t_sector_srcnode_hub_aperture_sysmem_op_read'),
                leaf('Writes', 'lts__average_t_sector_srcnode_hub_aperture_sysmem_op_write'),
            ]),
        ])
        hub_units = gen.Node('HUB Units', [], [all_hub])

        gen.nodes = [gpc_units, fbp_units, hub_units]

        # Must run after gen.nodes is assigned, as in the original statement order.
        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficByMemoryApertureBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Extended L2 sector-traffic breakdown: memory aperture, source, unit, then operation.

    Every node below an aperture uses a ratio qualified with that aperture
    (``_aperture_device`` / ``_aperture_peer`` / ``_aperture_sysmem``), so the
    rows under DRAM, Peer Memory and System Memory do not repeat one total.

    Args:
        show_generic_workflow: forwarded unchanged to the base-class constructor.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryAperture'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This is an extended breakdown of <a href="#L2-Sector-Traffic-By-Memory-Aperture-Short">L2 Traffic by destination</a>. It decomposes L2 bandwidth to each destination, from each source unit, per operation.'

        # NOTE: 'L1.5 Constant Cache' and 'Raster' previously used the
        # aperture-agnostic ratios (lts__average_t_sector_srcunit_gcc and
        # lts__average_t_sector_srcunit_raster), so the same totals were repeated
        # under all three apertures. They now use the aperture-qualified ratios,
        # matching their sibling units and the by-source / by-operation tables.
        # The constant cache 'Reads' leaf reuses its parent's aperture ratio,
        # the same convention the by-source table follows.
        gen.nodes = [
            gen.Node('DRAM', ['lts__average_t_sector_aperture_device'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_device'], [
                    gen.Node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', ['lts__average_t_sector_srcunit_tex_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_tex_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_tex_aperture_device_op_write'], []),
                        gen.Node('Atomics', ['lts__average_t_sector_srcunit_tex_aperture_device_op_atom'], []),
                        gen.Node('Reductions', ['lts__average_t_sector_srcunit_tex_aperture_device_op_red'], []),
                    ]),
                    gen.Node('L1.5 Constant Cache', ['lts__average_t_sector_srcunit_gcc_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_gcc_aperture_device'], []),
                    ]),
                    gen.Node('Primitive Engine', ['lts__average_t_sector_srcunit_pe_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_pe_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_pe_aperture_device_op_write'], []),
                    ]),
                    gen.Node('Raster', ['lts__average_t_sector_srcunit_raster_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_raster_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_raster_aperture_device_op_write'], []),
                    ]),
                    gen.Node('ZROP', ['lts__average_t_sector_srcunit_zrop_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_zrop_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_zrop_aperture_device_op_write'], []),
                    ]),
                    gen.Node('CROP', ['lts__average_t_sector_srcunit_crop_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_crop_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_crop_aperture_device_op_write'], []),
                    ]),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_device'], [
                    gen.Node('all FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcnode_fbp_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcnode_fbp_aperture_device_op_write'], []),
                    ]),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_device'], [
                    gen.Node('all HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_device'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcnode_hub_aperture_device_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcnode_hub_aperture_device_op_write'], []),
                    ]),
                ]),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_aperture_peer'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_peer'], [
                    gen.Node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', ['lts__average_t_sector_srcunit_tex_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_tex_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_tex_aperture_peer_op_write'], []),
                        gen.Node('Atomics', ['lts__average_t_sector_srcunit_tex_aperture_peer_op_atom'], []),
                        gen.Node('Reductions', ['lts__average_t_sector_srcunit_tex_aperture_peer_op_red'], []),
                    ]),
                    gen.Node('L1.5 Constant Cache', ['lts__average_t_sector_srcunit_gcc_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_gcc_aperture_peer'], []),
                    ]),
                    gen.Node('Primitive Engine', ['lts__average_t_sector_srcunit_pe_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_pe_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_pe_aperture_peer_op_write'], []),
                    ]),
                    gen.Node('Raster', ['lts__average_t_sector_srcunit_raster_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_raster_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_raster_aperture_peer_op_write'], []),
                    ]),
                    gen.Node('ZROP', ['lts__average_t_sector_srcunit_zrop_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_zrop_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_zrop_aperture_peer_op_write'], []),
                    ]),
                    gen.Node('CROP', ['lts__average_t_sector_srcunit_crop_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_crop_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_crop_aperture_peer_op_write'], []),
                    ]),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_peer'], [
                    gen.Node('all FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcnode_fbp_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcnode_fbp_aperture_peer_op_write'], []),
                    ]),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_peer'], [
                    gen.Node('all HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_peer'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcnode_hub_aperture_peer_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcnode_hub_aperture_peer_op_write'], []),
                    ]),
                ]),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_aperture_sysmem'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_sysmem'], [
                    gen.Node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', ['lts__average_t_sector_srcunit_tex_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_tex_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_tex_aperture_sysmem_op_write'], []),
                        gen.Node('Atomics', ['lts__average_t_sector_srcunit_tex_aperture_sysmem_op_atom'], []),
                        gen.Node('Reductions', ['lts__average_t_sector_srcunit_tex_aperture_sysmem_op_red'], []),
                    ]),
                    gen.Node('L1.5 Constant Cache', ['lts__average_t_sector_srcunit_gcc_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_gcc_aperture_sysmem'], []),
                    ]),
                    gen.Node('Primitive Engine', ['lts__average_t_sector_srcunit_pe_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_pe_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_pe_aperture_sysmem_op_write'], []),
                    ]),
                    gen.Node('Raster', ['lts__average_t_sector_srcunit_raster_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_raster_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_raster_aperture_sysmem_op_write'], []),
                    ]),
                    gen.Node('ZROP', ['lts__average_t_sector_srcunit_zrop_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_zrop_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_zrop_aperture_sysmem_op_write'], []),
                    ]),
                    gen.Node('CROP', ['lts__average_t_sector_srcunit_crop_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcunit_crop_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcunit_crop_aperture_sysmem_op_write'], []),
                    ]),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem'], [
                    gen.Node('all FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_write'], []),
                    ]),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_sysmem'], [
                    gen.Node('all HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_sysmem'], [
                        gen.Node('Reads', ['lts__average_t_sector_srcnode_hub_aperture_sysmem_op_read'], []),
                        gen.Node('Writes', ['lts__average_t_sector_srcnode_hub_aperture_sysmem_op_write'], []),
                    ]),
                ]),
            ]),
        ]

        # Must run after gen.nodes is assigned so the fixed ratios are the ones requested.
        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficByOperationBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic breakdown: operation, memory aperture, source, then unit."""

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByOperation'
        gen.table_id = 'L2-Sector-Traffic-By-Operation'
        gen.column_names = [
            ('Op', 'Op'),
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth per operation, to each destination, from each source unit.'

        # Row label that links to the L1TEX sector-traffic table.
        l1tex_label = '<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>'

        def leaf(label, ratio):
            # A node with a single ratio and no children.
            return gen.Node(label, [ratio], [])

        # ---- Reads: every source unit, including the constant cache ----
        reads = gen.Node('Reads', ['lts__average_t_sector_op_read'], [
            gen.Node('DRAM', ['lts__average_t_sector_aperture_device_op_read'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_device_op_read'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_device_op_read'),
                    leaf('L1.5 Constant Cache', 'lts__average_t_sector_srcunit_gcc_aperture_device'),
                    leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe_aperture_device_op_read'),
                    leaf('Raster', 'lts__average_t_sector_srcunit_raster_aperture_device_op_read'),
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop_aperture_device_op_read'),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop_aperture_device_op_read'),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_device_op_read'], [
                    leaf('all FBP Units', 'lts__average_t_sector_srcnode_fbp_aperture_device_op_read'),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_device_op_read'], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub_aperture_device_op_read'),
                ]),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_aperture_peer_op_read'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_peer_op_read'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_peer_op_read'),
                    leaf('L1.5 Constant Cache', 'lts__average_t_sector_srcunit_gcc_aperture_peer'),
                    leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe_aperture_peer_op_read'),
                    leaf('Raster', 'lts__average_t_sector_srcunit_raster_aperture_peer_op_read'),
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop_aperture_peer_op_read'),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop_aperture_peer_op_read'),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_peer_op_read'], [
                    leaf('all FBP Units', 'lts__average_t_sector_srcnode_fbp_aperture_peer_op_read'),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_peer_op_read'], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub_aperture_peer_op_read'),
                ]),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_aperture_sysmem_op_read'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_sysmem_op_read'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_read'),
                    leaf('L1.5 Constant Cache', 'lts__average_t_sector_srcunit_gcc_aperture_sysmem'),
                    leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe_aperture_sysmem_op_read'),
                    leaf('Raster', 'lts__average_t_sector_srcunit_raster_aperture_sysmem_op_read'),
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop_aperture_sysmem_op_read'),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop_aperture_sysmem_op_read'),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_read'], [
                    leaf('all FBP Units', 'lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_read'),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_sysmem_op_read'], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub_aperture_sysmem_op_read'),
                ]),
            ]),
        ])

        # ---- Writes: same shape as Reads, without a constant-cache row ----
        writes = gen.Node('Writes', ['lts__average_t_sector_op_write'], [
            gen.Node('DRAM', ['lts__average_t_sector_aperture_device_op_write'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_device_op_write'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_device_op_write'),
                    leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe_aperture_device_op_write'),
                    leaf('Raster', 'lts__average_t_sector_srcunit_raster_aperture_device_op_write'),
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop_aperture_device_op_write'),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop_aperture_device_op_write'),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_device_op_write'], [
                    leaf('all FBP Units', 'lts__average_t_sector_srcnode_fbp_aperture_device_op_write'),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_device_op_write'], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub_aperture_device_op_write'),
                ]),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_aperture_peer_op_write'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_peer_op_write'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_peer_op_write'),
                    leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe_aperture_peer_op_write'),
                    leaf('Raster', 'lts__average_t_sector_srcunit_raster_aperture_peer_op_write'),
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop_aperture_peer_op_write'),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop_aperture_peer_op_write'),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_peer_op_write'], [
                    leaf('all FBP Units', 'lts__average_t_sector_srcnode_fbp_aperture_peer_op_write'),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_peer_op_write'], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub_aperture_peer_op_write'),
                ]),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_aperture_sysmem_op_write'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_sysmem_op_write'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_write'),
                    leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe_aperture_sysmem_op_write'),
                    leaf('Raster', 'lts__average_t_sector_srcunit_raster_aperture_sysmem_op_write'),
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop_aperture_sysmem_op_write'),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop_aperture_sysmem_op_write'),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_write'], [
                    leaf('all FBP Units', 'lts__average_t_sector_srcnode_fbp_aperture_sysmem_op_write'),
                ]),
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub_aperture_sysmem_op_write'], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub_aperture_sysmem_op_write'),
                ]),
            ]),
        ])

        # ---- Atomics: only the GPC / L1TEX path is listed ----
        atomics = gen.Node('Atomics', ['lts__average_t_sector_op_atom'], [
            gen.Node('DRAM', ['lts__average_t_sector_aperture_device_op_atom'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_device_op_atom'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_device_op_atom'),
                ]),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_aperture_peer_op_atom'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_peer_op_atom'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_peer_op_atom'),
                ]),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_aperture_sysmem_op_atom'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_sysmem_op_atom'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_atom'),
                ]),
            ]),
        ])

        # ---- Reductions: only the GPC / L1TEX path is listed ----
        reductions = gen.Node('Reductions', ['lts__average_t_sector_op_red'], [
            gen.Node('DRAM', ['lts__average_t_sector_aperture_device_op_red'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_device_op_red'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_device_op_red'),
                ]),
            ]),
            gen.Node('Peer Memory', ['lts__average_t_sector_aperture_peer_op_red'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_peer_op_red'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_peer_op_red'),
                ]),
            ]),
            gen.Node('System Memory', ['lts__average_t_sector_aperture_sysmem_op_red'], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc_aperture_sysmem_op_red'], [
                    leaf(l1tex_label, 'lts__average_t_sector_srcunit_tex_aperture_sysmem_op_red'),
                ]),
            ]),
        ])

        gen.nodes = [reads, writes, atomics, reductions]

        # Must run after gen.nodes is assigned, as in the original statement order.
        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class RasterDataflowGenerator(tables_common.RasterDataflowGenerator):
    """Raster data-flow table with the ZROP/CROP pixel-input expressions set below."""

    def __init__(gen):
        super().__init__()

        zrop_counter = 'prop__prop2zrop_pixels_realtime'
        crop_counter = 'prop__prop2crop_pixels_realtime'

        # Expression strings stored on the generator; each sums one PROP counter.
        gen.zrop_pixels_input = "getCounterValue('" + zrop_counter + "', 'sum')"
        gen.crop_pixels_input = "getCounterValue('" + crop_counter + "', 'sum')"

        # Register the two counters the expressions above read.
        gen.required_counters.extend([zrop_counter, crop_counter])
|
||||
|
||||
class SmResourceUsageGenerator(tables_common.SmResourceUsageGenerator):
    """SM resource-usage table: one row per resource, one counter per column.

    ``required_counters`` is replaced (not extended) with every counter named
    in the rows, in row-major order.
    """

    def __init__(gen):
        super().__init__()

        # Placeholder for cells that have no counter.
        not_applicable = 'NotApplicable'

        # TODO: this is using single-pass counters; use ordinary counters when available
        # NOTE: the gfx column is a hack, for rows that cannot separately measure VTG vs. PS
        # Columns: resource, total, gfx, vtg, ps, cs
        row_table = (
            ('Warps',
             'sm__warps_active',
             not_applicable,
             'tpc__warps_active_shader_vtg',
             'tpc__warps_active_shader_ps',
             'tpc__warps_active_shader_cs'),
            ('Registers',
             'tpc__sm_rf_registers_allocated',
             not_applicable,
             'tpc__sm_rf_registers_allocated_shader_vtg',
             'tpc__sm_rf_registers_allocated_shader_ps',
             'tpc__sm_rf_registers_allocated_shader_cs'),
            ('Attr/ShMem',
             not_applicable,
             not_applicable,
             'tpc__l1tex_mem_shared_data_isbe_bytes_allocated',
             'tpc__l1tex_mem_shared_data_tram_bytes_allocated',
             'tpc__l1tex_mem_shared_data_compute_bytes_allocated'),
            ('CTAs',
             not_applicable,
             not_applicable,
             not_applicable,
             not_applicable,
             'sm__ctas_active'),
        )
        gen.rows = [gen.Row(*cells) for cells in row_table]

        # Every real counter from the table, skipping the resource label
        # (cells[0]) and the placeholder cells.
        gen.required_counters = [
            counter
            for cells in row_table
            for counter in cells[1:]
            if counter != not_applicable
        ]
|
||||
|
||||
class RangesSummaryGenerator(tables_common.RangesSummaryGenerator):
    """Summary table with one column per headline statistic of a measured range.

    Each column pairs a title with an expression string, a formatter name and
    the ``'ra'`` tag; the counters and throughputs those expressions name are
    listed in ``required_counters`` / ``required_throughputs``.
    """

    def __init__(gen):
        super().__init__()

        def counter_value(counter, rollup):
            # e.g. getCounterValue('pda__input_prims', 'sum')
            return "getCounterValue('" + counter + "', '" + rollup + "')"

        def check_mark(counter):
            # Shows a tick when the summed counter is non-zero, else nothing.
            return counter_value(counter, 'sum') + " ? '✓' : ''"

        def throughput_pct(unit):
            # e.g. getThroughputPct('sm__throughput')
            return "getThroughputPct('" + unit + "__throughput')"

        columns = [
            gen.Col('Duration μs', counter_value('gpu__time_duration', 'avg'), 'format_avg', 'ra'),
            gen.Col('GR Active%', "getCounterPct('gr__cycles_active', 'avg')", 'format_pct', 'ra'),
            gen.Col('3D?', check_mark('fe__draw_count'), '', 'ra'),
            gen.Col('Comp?', check_mark('gr__dispatch_count'), '', 'ra'),
            gen.Col('#WFI', counter_value('fe__output_ops_cmd_go_idle', 'sum'), 'format_sum', 'ra'),
            gen.Col('#Prims', counter_value('pda__input_prims', 'sum'), 'format_sum', 'ra'),
            gen.Col('#Pixels-Z', counter_value('prop__prop2zrop_pixels_realtime', 'sum'), 'format_sum', 'ra'),
            gen.Col('#Pixels-C', counter_value('prop__prop2crop_pixels_realtime', 'sum'), 'format_sum', 'ra'),
        ]
        columns += [
            gen.Col(title, throughput_pct(unit), 'format_pct', 'ra')
            for title, unit in (
                ('SM%', 'sm'),
                ('L1TEX%', 'l1tex'),
                ('L2%', 'lts'),
                ('DRAM%', 'dram'),
                ('PCIe%', 'pcie'),
                ('PD%', 'pda'),
            )
        ]
        # PE% is the maximum of three unit throughputs rather than a single one.
        pe_expression = 'Math.max(' + ', '.join(
            throughput_pct(unit) for unit in ('vaf', 'vpc', 'pes')
        ) + ')'
        columns.append(gen.Col('PE%', pe_expression, 'format_pct', 'ra'))
        columns += [
            gen.Col(title, throughput_pct(unit), 'format_pct', 'ra')
            for title, unit in (
                ('RSTR%', 'raster'),
                ('PROP%', 'prop'),
                ('ZROP%', 'zrop'),
                ('CROP%', 'crop'),
            )
        ]
        gen.cols = columns

        gen.required_counters = [
            'fe__draw_count',
            'fe__output_ops_cmd_go_idle',
            'gpu__time_duration',
            'gr__cycles_active',
            'gr__dispatch_count',
            'pda__input_prims',
            'prop__prop2crop_pixels_realtime',
            'prop__prop2zrop_pixels_realtime',
        ]
        gen.required_ratios = []
        gen.required_throughputs = [
            unit + '__throughput'
            for unit in (
                'crop', 'dram', 'l1tex', 'lts', 'pcie', 'pda', 'pes',
                'prop', 'raster', 'sm', 'vaf', 'vpc', 'zrop',
            )
        ]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,79 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
import pub.turing.tables_turing as tables_turing
|
||||
|
||||
def get_per_range_report_definition():
    """Assemble the 'PerRangeReport' definition.

    Builds the report's data sections in display order, renders them with
    ``tables_common.generate_range_html_common`` and returns a
    ``ReportDefinition`` carrying that HTML together with the counters, ratios
    and throughputs the sections require.
    """
    device_section = DataSection([
        tables_turing.DevicePropertiesGenerator().make_data_table(),
        tables_common.ClocksGenerator().make_data_table(),
    ], inter_table_spacing=False)

    overview_section = DataSection([
        tables_common.TopLevelStatsGenerator().make_data_table(),
        tables_turing.TopThroughputsGenerator().make_data_table(),
        tables_common.CacheHitRates().make_data_table(),
    ], title='Overview Section')

    memory_section = DataSection([
        tables_common.MainMemoryGenerator().make_data_table(),
        tables_turing.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
        tables_turing.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_common.L1TexThroughputsGenerator().make_data_table(),
        tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
    ], title='Memory Performance Section')

    # This report uses the TU10x flavour of the SM throughput table.
    shader_section = DataSection([
        tables_turing.SmThroughputsGenerator_tu10x().make_data_table(),
        tables_turing.SmInstExecutedGenerator().make_data_table(),
        tables_common.SmShaderExecutionGenerator().make_data_table(),
        tables_turing.SmResourceUsageGenerator().make_data_table(),
        tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
        tables_common.WarpIssueStallsGenerator().make_data_table(),
    ], title='Shader Performance Section')

    pipeline_section = DataSection([
        tables_common.PrimitiveDataflowGenerator().make_data_table(),
        tables_turing.RasterDataflowGenerator().make_data_table(),
    ], title='3D Pipeline Section')

    l2_breakdown_section = DataSection([
        tables_turing.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_turing.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
    ], title='Additional L2 Traffic Breakdowns Section')

    listings_section = DataSection([
        tables_common.AdditionalMetricsGenerator().make_data_table(),
        tables_common.AllCountersGenerator().make_data_table(),
        tables_common.AllRatiosGenerator().make_data_table(),
        tables_common.AllThroughputsGenerator().make_data_table(),
    ], title='Exhaustive Listings Section')

    sections = [
        device_section,
        overview_section,
        memory_section,
        shader_section,
        pipeline_section,
        l2_breakdown_section,
        listings_section,
    ]

    html = tables_common.generate_range_html_common(sections)
    return ReportDefinition(
        'PerRangeReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
|
||||
def get_summary_report_definition():
    """Assemble the 'SummaryReport' definition.

    The report has an untitled collection-info section followed by the
    'Summary of Measured Ranges' table. Returns a ``ReportDefinition`` with
    the rendered HTML and the counters, ratios and throughputs those sections
    require.
    """
    collection_info = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_summary = DataSection([
        tables_turing.RangesSummaryGenerator().make_data_table(),
    ], title='Summary of Measured Ranges')

    sections = [collection_info, ranges_summary]

    html = tables_common.generate_summary_html_common(sections)
    return ReportDefinition(
        'SummaryReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
@@ -0,0 +1,79 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
import pub.turing.tables_turing as tables_turing
|
||||
|
||||
def get_per_range_report_definition():
    """Assemble the 'PerRangeReport' definition (TU11x SM throughput table).

    Builds the report's data sections in display order, renders them with
    ``tables_common.generate_range_html_common`` and returns a
    ``ReportDefinition`` carrying that HTML together with the counters, ratios
    and throughputs the sections require.
    """
    sections = [
        DataSection([
            tables_turing.DevicePropertiesGenerator().make_data_table(),
            tables_common.ClocksGenerator().make_data_table(),
        ], inter_table_spacing=False),
        DataSection([
            tables_common.TopLevelStatsGenerator().make_data_table(),
            tables_turing.TopThroughputsGenerator().make_data_table(),
            tables_common.CacheHitRates().make_data_table(),
        ], title='Overview Section'),
        DataSection([
            tables_common.MainMemoryGenerator().make_data_table(),
            tables_turing.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
            tables_turing.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
            tables_common.L1TexThroughputsGenerator().make_data_table(),
            tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
        ], title='Memory Performance Section'),
        DataSection([
            # The only table that differs from the TU10x report definition.
            tables_turing.SmThroughputsGenerator_tu11x().make_data_table(),
            tables_turing.SmInstExecutedGenerator().make_data_table(),
            tables_common.SmShaderExecutionGenerator().make_data_table(),
            tables_turing.SmResourceUsageGenerator().make_data_table(),
            tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
            tables_common.WarpIssueStallsGenerator().make_data_table(),
        ], title='Shader Performance Section'),
        DataSection([
            tables_common.PrimitiveDataflowGenerator().make_data_table(),
            tables_turing.RasterDataflowGenerator().make_data_table(),
        ], title='3D Pipeline Section'),
        DataSection([
            tables_turing.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
            tables_turing.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        # Title aligned with the TU10x report and with the "... Section"
        # naming used by every other titled section in this function.
        ], title='Additional L2 Traffic Breakdowns Section'),
        DataSection([
            tables_common.AdditionalMetricsGenerator().make_data_table(),
            tables_common.AllCountersGenerator().make_data_table(),
            tables_common.AllRatiosGenerator().make_data_table(),
            tables_common.AllThroughputsGenerator().make_data_table(),
        ], title='Exhaustive Listings Section'),
    ]

    html = tables_common.generate_range_html_common(sections)
    required_counters = get_required_counters(sections)
    required_ratios = get_required_ratios(sections)
    required_throughputs = get_required_throughputs(sections)
    return ReportDefinition('PerRangeReport', html, required_counters, required_ratios, required_throughputs)
|
||||
|
||||
def get_summary_report_definition():
    """Assemble the 'SummaryReport' definition.

    The report has an untitled collection-info section followed by the
    'Summary of Measured Ranges' table. Returns a ``ReportDefinition`` with
    the rendered HTML and the counters, ratios and throughputs those sections
    require.
    """
    collection_info = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_summary = DataSection([
        tables_turing.RangesSummaryGenerator().make_data_table(),
    ], title='Summary of Measured Ranges')

    sections = [collection_info, ranges_summary]

    html = tables_common.generate_summary_html_common(sections)
    return ReportDefinition(
        'SummaryReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
@@ -0,0 +1,610 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
|
||||
class DevicePropertiesGenerator(tables_common.DevicePropertiesGenerator):
    """Device-properties table with the per-LTS L2 cache size set to 128."""

    def __init__(gen):
        super().__init__()

        # NOTE(review): the unit of this value is not visible here
        # (presumably KiB per L2 slice) - confirm in tables_common.
        gen.l2cacheSizePerLts = 128
|
||||
|
||||
class TopThroughputsGenerator(tables_common.TopThroughputsGenerator):
    """Top-level throughput table: one row per hardware unit, grouped by
    pipeline area and linked to the detailed table for that unit."""

    def __init__(gen):
        super().__init__()

        # (group, anchor of the detailed table, display label, unit prefix)
        unit_table = (
            ('Shader', 'SM-Instruction-Throughput', 'SM (Shader Cores)', 'sm'),
            ('Memory', 'L1TEX-Throughput', 'L1TEX Cache', 'l1tex'),
            ('Memory', 'L2-Sector-Traffic', 'L2 Cache', 'lts'),
            ('Memory', 'Main-Memory-Throughput', 'DRAM', 'dram'),
            ('Memory', 'Main-Memory-Throughput', 'PCIe', 'pcie'),
            ('World Pipe', 'Primitive-Data-Flow', 'PDA Index Fetch', 'pda'),
            ('World Pipe', 'Primitive-Data-Flow', 'Vertex Attr. Fetch', 'vaf'),
            ('World Pipe', 'Primitive-Data-Flow', 'Primitive Engine', 'pes'),
            ('Screen Pipe', 'Raster-Data-Flow', 'RASTER', 'raster'),
            ('Screen Pipe', 'Raster-Data-Flow', 'PROP (Pre-ROP)', 'prop'),
            ('Screen Pipe', 'Raster-Data-Flow', 'ZROP (Depth-Test)', 'zrop'),
            ('Screen Pipe', 'Raster-Data-Flow', 'CROP (Color Blend)', 'crop'),
        )

        # Rows are added to whatever the base class already defined.
        gen.rows += [
            gen.Row(
                group,
                '<a href="#' + anchor + '">' + label + '</a>',
                "getThroughputPct('" + unit + "__throughput')",
            )
            for group, anchor, label, unit in unit_table
        ]

        # The throughput metrics referenced above, in alphabetical order.
        gen.required_throughputs += sorted(
            unit + '__throughput' for _, _, _, unit in unit_table
        )
|
||||
|
||||
class SmThroughputsGenerator_tu10x(tables_common.SmThroughputsGenerator):
    """SM pipe-throughput table for the ``tu10x`` variant.

    Appends this variant's pipes to the list inherited from the base class,
    including the 'shared' dispatch pipe that the ``tu11x`` variant omits.
    """

    def __init__(gen):
        super().__init__()

        Pipe = gen.Pipe
        variant_pipes = [
            Pipe('adu', 'Computed branches and indexed constants'),
            Pipe('alu', 'INT32 except multiply; FP32 comparison'),
            Pipe('cbu', 'Divergent branches and control flow'),
            Pipe('fma', 'FP32 mul/add and INT32 multiply'),
            Pipe('fp16', 'FP16 mul/add'),
            Pipe('fp64', 'FP64 mul/add'),
            Pipe('ipa', 'Pixel shader attribute interpolation'),
            Pipe('lsu', 'Global, local, shared memory, and misc'),
            # Passes an explicit counter name and disables the inst-executed flag.
            Pipe('shared', 'Shared Pipe Dispatch (FP16,Tensor)', 'sm__pipe_shared_cycles_active', hasInstExecuted=False),
            Pipe('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            Pipe('tex', 'Texture and surface memory'),
            Pipe('uniform', 'Warp-level scalar operations'),
            Pipe('xu', 'Transcendentals and float/int conversion'),
        ]
        gen.pipes += variant_pipes

        # Walks the full pipe list (inherited entries included), not only the
        # pipes added above.
        for pipe in gen.pipes:
            counter_names = pipe.get_counter_names()
            gen.required_counters.extend(counter_names)
|
||||
|
||||
class SmThroughputsGenerator_tu11x(tables_common.SmThroughputsGenerator):
    """SM pipe-throughput table for the ``tu11x`` variant.

    Unlike the ``tu10x`` variant, this one discards the pipe list inherited
    from the base class before adding its own, and has no 'shared' pipe.
    """

    def __init__(gen):
        super().__init__()

        pipe_table = (
            ('adu', 'Computed branches and indexed constants'),
            ('alu', 'INT32 except multiply; FP32 comparison'),
            ('cbu', 'Divergent branches and control flow'),
            ('fma', 'FP32 mul/add and INT32 multiply'),
            ('fp16', 'FP16 mul/add'),
            ('fp64', 'FP64 mul/add'),
            ('ipa', 'Pixel shader attribute interpolation'),
            ('lsu', 'Global, local, shared memory, and misc'),
            ('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            ('tex', 'Texture and surface memory'),
            ('uniform', 'Warp-level scalar operations'),
            ('xu', 'Transcendentals and float/int conversion'),
        )

        # Start from an empty list so only the pipes below are present.
        gen.pipes = []
        gen.pipes += [gen.Pipe(pipe_name, summary) for pipe_name, summary in pipe_table]

        for pipe in gen.pipes:
            counter_names = pipe.get_counter_names()
            gen.required_counters.extend(counter_names)
|
||||
|
||||
class SmInstExecutedGenerator(tables_common.SmInstExecutedGenerator):
    """Per-pipe instruction-executed table.

    Replaces the pipe list with the entries below. Some entries pass ``True``
    as a third positional argument to ``gen.Pipe``.
    NOTE(review): the meaning of that flag is defined in tables_common and is
    not visible here.
    """

    def __init__(gen):
        super().__init__()

        pipe_table = (
            ('total', 'All instructions', True),
            ('adu', 'Computed branches and indexed constants'),
            ('alu', 'INT32 except multiply; FP32 comparison', True),
            ('cbu', 'Divergent branches and control flow'),
            ('fma', 'FP32 mul/add and INT32 multiply', True),
            ('fp16', 'FP16 mul/add', True),
            ('fp64', 'FP64 mul/add', True),
            ('ipa', 'Pixel shader attribute interpolation', True),
            ('lsu', 'Global, local, shared memory, and misc', True),
            ('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            ('tex', 'Texture and surface memory'),
            ('uniform', 'Warp-level scalar operations'),
            ('xu', 'Transcendentals and float/int conversion', True),
        )
        gen.pipes = [gen.Pipe(*pipe_args) for pipe_args in pipe_table]

        for pipe in gen.pipes:
            counter_names = pipe.get_counter_names()
            gen.required_counters.extend(counter_names)
|
||||
|
||||
class L2TrafficByMemoryApertureShortBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Two-level L2 sector-traffic breakdown: memory aperture, then operation.

    Each aperture node (DRAM, System Memory, Peer Memory) has one child per
    operation (reads, writes, atomics, reductions).
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)

        gen.name = 'L2SectorTrafficBreakdownByMemoryApertureShort'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture-Short'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth to each destination, per operation. A <a href="#L2-Sector-Traffic-By-Memory-Aperture">more detailed version of this table</a> can be found below.'

        # Metric names follow lts__average_t_sector_aperture_<aperture>[_op_<op>].
        metric_stem = 'lts__average_t_sector_aperture_'
        apertures = (
            ('DRAM', 'device'),
            ('System Memory', 'sysmem'),
            ('Peer Memory', 'peer'),
        )
        operations = (
            ('Reads', 'read'),
            ('Writes', 'write'),
            ('Atomics', 'atom'),
            ('Reductions', 'red'),
        )
        gen.nodes = [
            gen.Node(aperture_label, [metric_stem + aperture], [
                gen.Node(op_label, [metric_stem + aperture + '_op_' + op], [])
                for op_label, op in operations
            ])
            for aperture_label, aperture in apertures
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficBySrcBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Four-level L2 sector-traffic breakdown: source node, source unit,
    memory aperture, operation.

    Only the L1TEX unit lists atomics and reductions; every other source lists
    reads and writes only.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)

        gen.name = 'L2SectorTrafficBreakdownBySource'
        gen.table_id = 'L2-Sector-Traffic-By-Source'
        gen.column_names = [
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth from each source unit, to each destination, per operation. See also: these tables that prioritize <a href="#L2-Sector-Traffic-By-Memory-Aperture">destination Memory Aperture</a> and <a href="#L2-Sector-Traffic-By-Operation">Operation</a>.'

        metric_stem = 'lts__average_t_sector_'
        apertures = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        all_operations = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def aperture_nodes(source_metric, operations):
            # One node per aperture under `source_metric`, each with one leaf
            # per operation: <source>_aperture_<aperture>[_op_<op>].
            return [
                gen.Node(aperture_label, [source_metric + '_aperture_' + aperture], [
                    gen.Node(op_label, [source_metric + '_aperture_' + aperture + '_op_' + op], [])
                    for op_label, op in operations
                ])
                for aperture_label, aperture in apertures
            ]

        def unit_node(label, unit, operations):
            # Node for one source unit, measured by <stem>srcunit_<unit>.
            source_metric = metric_stem + 'srcunit_' + unit
            return gen.Node(label, [source_metric], aperture_nodes(source_metric, operations))

        gen.nodes = [
            gen.Node('GPC Units', [metric_stem + 'srcnode_gpc'], [
                unit_node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'tex', all_operations),
                unit_node('Primitive Engine', 'pe', read_write),
                unit_node('other GPC units', 'gpcother', read_write),
            ]),
            gen.Node('FBP Units', [metric_stem + 'srcnode_fbp'], [
                unit_node('ZROP', 'zrop', read_write),
                unit_node('CROP', 'crop', read_write),
            ]),
            # The two HUB nodes carry no metric of their own, so the sum of the
            # children is used instead. The original comment gave the reason as
            # "lts__average_t_sector_srcnode_fbp is buggy".
            # NOTE(review): that presumably meant ..._srcnode_hub - confirm.
            gen.Node('HUB Units', [], [
                gen.Node('all HUB Units', [], aperture_nodes(metric_stem + 'srcnode_hub', read_write)),
            ]),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficByMemoryApertureBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Extended L2 sector-traffic table: memory aperture -> source node -> source unit -> operation.

    The node tree is identical for every aperture (DRAM, peer memory, system
    memory); only the ``_aperture_<name>`` part of each ratio name changes, so
    the tree is produced by a local helper instead of being spelled out three
    times.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryAperture'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This is an extended breakdown of <a href="#L2-Sector-Traffic-By-Memory-Aperture-Short">L2 Traffic by destination</a>. It decomposes L2 bandwidth to each destination, from each source unit, per operation.'

        root = 'lts__average_t_sector'
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def with_ops(title, metric, ops=read_write):
            # One node for `metric` whose children are its per-operation ratios.
            # Leaves are created before their parent, as in a nested literal.
            leaves = [gen.Node(op_title, [metric + '_op_' + op], []) for op_title, op in ops]
            return gen.Node(title, [metric], leaves)

        def aperture_tree(title, aperture):
            # Full source-node / source-unit / operation subtree for one aperture.
            tail = '_aperture_' + aperture

            def src(kind):
                return root + '_' + kind + tail

            gpc_units = gen.Node('GPC Units', [src('srcnode_gpc')], [
                with_ops('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', src('srcunit_tex'), all_ops),
                with_ops('Primitive Engine', src('srcunit_pe')),
                with_ops('other GPC units', src('srcunit_gpcother')),
            ])
            fbp_units = gen.Node('FBP Units', [src('srcnode_fbp')], [
                with_ops('ZROP', src('srcunit_zrop')),
                with_ops('CROP', src('srcunit_crop')),
            ])
            # The HUB node and its single child are both keyed on the same srcnode_hub ratio.
            hub_units = gen.Node('HUB Units', [src('srcnode_hub')], [
                with_ops('all HUB Units', src('srcnode_hub')),
            ])
            return gen.Node(title, [root + tail], [gpc_units, fbp_units, hub_units])

        gen.nodes = [
            aperture_tree('DRAM', 'device'),
            aperture_tree('Peer Memory', 'peer'),
            aperture_tree('System Memory', 'sysmem'),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficByOperationBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic table: operation -> memory aperture -> source node -> source unit.

    Reads and writes are broken down over GPC, FBP and HUB sources; atomics and
    reductions only list the L1TEX cache under GPC units.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByOperation'
        gen.table_id = 'L2-Sector-Traffic-By-Operation'
        gen.column_names = [
            ('Op', 'Op'),
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth per operation, to each destination, from each source unit.'

        root = 'lts__average_t_sector'
        apertures = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )
        l1tex = ('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'srcunit_tex')
        # (group title, group ratio key, ((unit title, unit ratio key), ...))
        all_sources = (
            ('GPC Units', 'srcnode_gpc', (
                l1tex,
                ('Primitive Engine', 'srcunit_pe'),
                ('other GPC units', 'srcunit_gpcother'),
            )),
            ('FBP Units', 'srcnode_fbp', (
                ('ZROP', 'srcunit_zrop'),
                ('CROP', 'srcunit_crop'),
            )),
            # The HUB group and its only child share the srcnode_hub ratio.
            ('HUB Units', 'srcnode_hub', (
                ('all HUB Units', 'srcnode_hub'),
            )),
        )
        l1tex_only = (
            ('GPC Units', 'srcnode_gpc', (l1tex,)),
        )

        def operation_tree(op_title, op, source_groups):
            # Build bottom-up (units, group, aperture, operation) so nodes are
            # created in the same order as in a nested literal.
            op_tail = '_op_' + op
            aperture_nodes = []
            for aperture_title, aperture in apertures:
                tail = '_aperture_' + aperture + op_tail
                group_nodes = []
                for group_title, group_key, units in source_groups:
                    unit_nodes = [
                        gen.Node(unit_title, [root + '_' + unit_key + tail], [])
                        for unit_title, unit_key in units
                    ]
                    group_nodes.append(gen.Node(group_title, [root + '_' + group_key + tail], unit_nodes))
                aperture_nodes.append(gen.Node(aperture_title, [root + tail], group_nodes))
            return gen.Node(op_title, [root + op_tail], aperture_nodes)

        gen.nodes = [
            operation_tree('Reads', 'read', all_sources),
            operation_tree('Writes', 'write', all_sources),
            operation_tree('Atomics', 'atom', l1tex_only),
            operation_tree('Reductions', 'red', l1tex_only),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class RasterDataflowGenerator(tables_common.RasterDataflowGenerator):
    """Raster data-flow table whose ZROP/CROP pixel inputs come from the PROP-to-XBAR realtime counters."""

    def __init__(gen):
        super().__init__()
        zrop_counter = 'prop__prop2xbar_zrop_pixels_realtime'
        crop_counter = 'prop__prop2xbar_crop_pixels_realtime'
        # Expressions evaluated by the report; each sums the matching counter.
        gen.zrop_pixels_input = "getCounterValue('%s', 'sum')" % zrop_counter
        gen.crop_pixels_input = "getCounterValue('%s', 'sum')" % crop_counter
        # The two counters are appended to whatever the base class already requires.
        gen.required_counters.extend([zrop_counter, crop_counter])
|
||||
|
||||
class SmResourceUsageGenerator(tables_common.SmResourceUsageGenerator):
    """SM resource-usage table: one row per resource, one counter name per column."""

    def __init__(gen):
        super().__init__()
        na = 'NotApplicable'  # marker for cells that have no counter
        # Row arguments, in order: resource, total, gfx, vtg, ps, cs
        gen.rows = [
            gen.Row(
                'Warps',
                'sm__warps_active',
                na,
                'tpc__warps_active_shader_vtg_realtime',
                'tpc__warps_active_shader_ps_realtime',
                'tpc__warps_active_shader_cs_realtime',
            ),
            gen.Row(
                'Registers',
                'tpc__sm_rf_registers_allocated',
                na,
                'tpc__sm_rf_registers_allocated_shader_vtg',
                'tpc__sm_rf_registers_allocated_shader_ps',
                'tpc__sm_rf_registers_allocated_shader_cs',
            ),
            gen.Row(
                'Attr/ShMem',
                na,
                na,
                'tpc__l1tex_mem_shared_data_isbe_bytes_allocated',
                'tpc__l1tex_mem_shared_data_tram_bytes_allocated',
                'tpc__l1tex_mem_shared_data_compute_bytes_allocated',
            ),
            gen.Row('CTAs', na, na, na, na, 'sm__ctas_active'),
        ]
        # Replaces (does not extend) the inherited list; order kept as originally written.
        gen.required_counters = [
            'sm__ctas_active',
            'sm__warps_active',
            'tpc__warps_active_shader_cs_realtime',
            'tpc__warps_active_shader_ps_realtime',
            'tpc__warps_active_shader_vtg_realtime',
            'tpc__sm_rf_registers_allocated',
            'tpc__sm_rf_registers_allocated_shader_vtg',
            'tpc__sm_rf_registers_allocated_shader_ps',
            'tpc__sm_rf_registers_allocated_shader_cs',
            'tpc__l1tex_mem_shared_data_isbe_bytes_allocated',
            'tpc__l1tex_mem_shared_data_tram_bytes_allocated',
            'tpc__l1tex_mem_shared_data_compute_bytes_allocated',
        ]
|
||||
|
||||
class RangesSummaryGenerator(tables_common.RangesSummaryGenerator):
    """Per-range summary table: duration, activity flags, event totals and unit throughput percentages."""

    def __init__(gen):
        super().__init__()

        def summed(title, counter):
            # Column showing the 'sum' value of a raw counter.
            return gen.Col(title, "getCounterValue('%s', 'sum')" % counter, 'format_sum', 'ra')

        def throughput(title, metric):
            # Column showing a throughput metric as a percentage.
            return gen.Col(title, "getThroughputPct('%s')" % metric, 'format_pct', 'ra')

        gen.cols = [
            gen.Col('Duration μs', "getCounterValue('gpu__time_duration', 'avg')", 'format_avg', 'ra'),
            gen.Col('GR Active%', "getCounterPct('gr__cycles_active', 'avg')", 'format_pct', 'ra'),
            # Flag columns: a check mark when the counter is non-zero, otherwise empty.
            gen.Col('3D?', "getCounterValue('fe__draw_count', 'sum') ? '✓' : ''", '', 'ra'),
            gen.Col('Comp?', "getCounterValue('gr__dispatch_count', 'sum') ? '✓' : ''", '', 'ra'),
            summed('#WFI', 'fe__output_ops_type_bundle_cmd_go_idle'),
            summed('#Prims', 'pda__input_prims'),
            summed('#Pixels-Z', 'prop__prop2xbar_zrop_pixels_realtime'),
            summed('#Pixels-C', 'prop__prop2xbar_crop_pixels_realtime'),
            throughput('SM%', 'sm__throughput'),
            throughput('L1TEX%', 'l1tex__throughput'),
            throughput('L2%', 'lts__throughput'),
            throughput('DRAM%', 'dram__throughput'),
            throughput('PCIe%', 'pcie__throughput'),
            throughput('PD%', 'pda__throughput'),
            # PE% is the maximum over three throughputs rather than a single metric.
            gen.Col('PE%', "Math.max(getThroughputPct('vaf__throughput'), getThroughputPct('vpc__throughput'), getThroughputPct('pes__throughput'))", 'format_pct', 'ra'),
            throughput('RSTR%', 'raster__throughput'),
            throughput('PROP%', 'prop__throughput'),
            throughput('ZROP%', 'zrop__throughput'),
            throughput('CROP%', 'crop__throughput'),
        ]

        # The three requirement lists replace the inherited values.
        gen.required_counters = [
            'fe__draw_count',
            'fe__output_ops_type_bundle_cmd_go_idle',
            'gpu__time_duration',
            'gr__cycles_active',
            'gr__dispatch_count',
            'pda__input_prims',
            'prop__prop2xbar_crop_pixels_realtime',
            'prop__prop2xbar_zrop_pixels_realtime',
        ]
        gen.required_ratios = []
        gen.required_throughputs = [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'prop__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'vpc__throughput',
            'zrop__throughput',
        ]
|
||||
@@ -0,0 +1,75 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
import pub.volta.tables_gv100 as tables_gv100
|
||||
|
||||
def get_per_range_report_definition():
    """Assemble the per-range report.

    Builds the report sections in display order, renders them with
    ``tables_common.generate_range_html_common`` and returns a
    ``ReportDefinition`` named 'PerRangeReport' carrying the HTML together with
    the counters, ratios and throughputs the sections require.
    """
    device_and_clocks = DataSection([
        tables_gv100.DevicePropertiesGenerator().make_data_table(),
        tables_common.ClocksGenerator().make_data_table(),
    ], inter_table_spacing=False)

    overview = DataSection([
        tables_common.TopLevelStatsGenerator().make_data_table(),
        tables_gv100.TopThroughputsGenerator().make_data_table(),
        tables_common.CacheHitRates().make_data_table(),
    ], title='Overview Section')

    memory = DataSection([
        tables_common.MainMemoryGenerator().make_data_table(),
        tables_gv100.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
        tables_gv100.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_common.L1TexThroughputsGenerator().make_data_table(),
        tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
    ], title='Memory Performance Section')

    shader = DataSection([
        tables_gv100.SmThroughputsGenerator().make_data_table(),
        tables_gv100.SmInstExecutedGenerator().make_data_table(),
        tables_common.SmShaderExecutionGenerator().make_data_table(),
        tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
        tables_common.WarpIssueStallsGenerator().make_data_table(),
    ], title='Shader Performance Section')

    extra_l2 = DataSection([
        tables_gv100.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_gv100.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
    ], title='Additional L2 Traffic Breakdowns Section')

    listings = DataSection([
        tables_common.AdditionalMetricsGenerator().make_data_table(),
        tables_common.AllCountersGenerator().make_data_table(),
        tables_common.AllRatiosGenerator().make_data_table(),
        tables_common.AllThroughputsGenerator().make_data_table(),
    ], title='Exhaustive Listings Section')

    sections = [device_and_clocks, overview, memory, shader, extra_l2, listings]

    # Arguments are evaluated left to right: HTML first, then counters, ratios, throughputs.
    return ReportDefinition(
        'PerRangeReport',
        tables_common.generate_range_html_common(sections),
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
|
||||
def get_summary_report_definition():
    """Assemble the summary report.

    The report has a collection-info section followed by the summary of
    measured ranges. It is rendered with
    ``tables_common.generate_summary_html_common`` and returned as a
    ``ReportDefinition`` named 'SummaryReport'.
    """
    collection_info = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_summary = DataSection([
        tables_gv100.RangesSummaryGenerator().make_data_table(),
    ], title='Summary of Measured Ranges')

    sections = [collection_info, ranges_summary]

    # Arguments are evaluated left to right: HTML first, then counters, ratios, throughputs.
    return ReportDefinition(
        'SummaryReport',
        tables_common.generate_summary_html_common(sections),
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
|
||||
|
||||
@@ -0,0 +1,555 @@
|
||||
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from profiler_report_types import *
|
||||
import pub.tables_common as tables_common
|
||||
|
||||
class DevicePropertiesGenerator(tables_common.DevicePropertiesGenerator):
    """Device-properties table with the per-LTS L2 cache size fixed for this chip."""

    def __init__(gen):
        super().__init__()
        # NOTE(review): the unit is not visible here (presumably KiB per LTS) - confirm in tables_common.
        l2_size_per_lts = 96
        gen.l2cacheSizePerLts = l2_size_per_lts
|
||||
|
||||
class TopThroughputsGenerator(tables_common.TopThroughputsGenerator):
    """Top-throughputs table: one row per hardware unit, linking to that unit's detail table."""

    def __init__(gen):
        super().__init__()

        # (category, anchor of the linked table, link text, throughput metric)
        units = (
            ('Shader', 'SM-Instruction-Throughput', 'SM (Shader Cores)', 'sm__throughput'),
            ('Memory', 'L1TEX-Throughput', 'L1TEX Cache', 'l1tex__throughput'),
            ('Memory', 'L2-Sector-Traffic', 'L2 Cache', 'lts__throughput'),
            ('Memory', 'Main-Memory-Throughput', 'DRAM', 'dram__throughput'),
            ('Memory', 'Main-Memory-Throughput', 'PCIe', 'pcie__throughput'),
            ('World Pipe', 'Primitive-Data-Flow', 'PDA Index Fetch', 'pda__throughput'),
            ('World Pipe', 'Primitive-Data-Flow', 'Vertex Attr. Fetch', 'vaf__throughput'),
            ('World Pipe', 'Primitive-Data-Flow', 'Primitive Engine', 'pes__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow', 'RASTER', 'raster__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow', 'ZROP (Depth-Test)', 'zrop__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow', 'CROP (Color Blend)', 'crop__throughput'),
        )

        # Rows are appended to those already provided by the base class.
        gen.rows += [
            gen.Row(
                category,
                '<a href="#%s">%s</a>' % (anchor, text),
                "getThroughputPct('%s')" % metric,
            )
            for category, anchor, text, metric in units
        ]
        # Every metric referenced by a row above, appended to the inherited list.
        gen.required_throughputs += [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'zrop__throughput',
        ]
|
||||
|
||||
class SmThroughputsGenerator(tables_common.SmThroughputsGenerator):
    """SM throughput table extended with this chip's execution pipes."""

    def __init__(gen):
        super().__init__()
        # Pipe(name, description[, counter name][, hasInstExecuted=...]).
        # NOTE(review): the meaning of the optional arguments is defined by tables_common.Pipe - confirm there.
        chip_pipes = [
            gen.Pipe('adu', 'Computed branches and indexed constants'),
            gen.Pipe('alu', 'INT32 except multiply; FP32 comparison', 'sm__pipe_alu_cycles_active'),
            gen.Pipe('cbu', 'Divergent branches and control flow'),
            gen.Pipe('fma', 'FP32 mul/add and INT32 multiply', 'sm__pipe_fma_cycles_active'),
            gen.Pipe('fp16', 'FP16 mul/add'),
            gen.Pipe('fp64', 'FP64 mul/add'),
            gen.Pipe('ipa', 'Pixel shader attribute interpolation'),
            gen.Pipe('lsu', 'Global, local, shared memory, and misc'),
            gen.Pipe('shared', 'Shared Pipe Dispatch (FP64,FP16,Tensor)', 'sm__pipe_shared_cycles_active', hasInstExecuted=False),
            gen.Pipe('tensor', 'Tensor matrix multiply (FP16)', 'sm__pipe_tensor_cycles_active'),
            gen.Pipe('tex', 'Texture and surface memory'),
            gen.Pipe('xu', 'Transcendentals and float/int conversion'),
        ]
        gen.pipes += chip_pipes

        # Walks the full pipe list (inherited entries included), not just the ones added above.
        for sm_pipe in gen.pipes:
            gen.required_counters.extend(sm_pipe.get_counter_names())
|
||||
|
||||
|
||||
class SmInstExecutedGenerator(tables_common.SmInstExecutedGenerator):
    """SM instructions-executed table extended with this chip's execution pipes."""

    def __init__(gen):
        super().__init__()
        # Pipe(name, description[, flag]).
        # NOTE(review): the third positional flag is interpreted by tables_common - confirm its meaning there.
        chip_pipes = [
            gen.Pipe('total', 'All instructions', True),
            gen.Pipe('adu', 'Computed branches and indexed constants'),
            gen.Pipe('alu', 'INT32 except multiply; FP32 comparison', True),
            gen.Pipe('cbu', 'Divergent branches and control flow'),
            gen.Pipe('fma', 'FP32 mul/add and INT32 multiply', True),
            gen.Pipe('fp16', 'FP16 mul/add', True),
            gen.Pipe('fp64', 'FP64 mul/add', True),
            gen.Pipe('ipa', 'Pixel shader attribute interpolation', True),
            gen.Pipe('lsu', 'Global, local, shared memory, and misc', True),
            gen.Pipe('tensor', 'Tensor matrix multiply (FP16)'),
            gen.Pipe('tex', 'Texture and surface memory'),
            gen.Pipe('xu', 'Transcendentals and float/int conversion', True),
        ]
        gen.pipes += chip_pipes

        # Walks the full pipe list (inherited entries included), not just the ones added above.
        for sm_pipe in gen.pipes:
            gen.required_counters.extend(sm_pipe.get_counter_names())
|
||||
|
||||
class L2TrafficByMemoryApertureShortBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Short L2 sector-traffic table: memory aperture -> operation."""

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryApertureShort'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture-Short'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth to each destination, per operation. A <a href="#L2-Sector-Traffic-By-Memory-Aperture">more detailed version of this table</a> can be found below.'

        operations = (
            ('Reads', 'read'),
            ('Writes', 'write'),
            ('Atomics', 'atom'),
            ('Reductions', 'red'),
        )

        def aperture_node(title, aperture):
            # One aperture row with a leaf per operation; leaves are created before the parent.
            metric = 'lts__average_t_sector_aperture_' + aperture
            leaves = [gen.Node(op_title, [metric + '_op_' + op], []) for op_title, op in operations]
            return gen.Node(title, [metric], leaves)

        # Display order here is DRAM, system memory, then peer memory.
        gen.nodes = [
            aperture_node('DRAM', 'device'),
            aperture_node('System Memory', 'sysmem'),
            aperture_node('Peer Memory', 'peer'),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficBySrcBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic table: source node, source unit, memory aperture, operation.

    Four levels deep. Every source unit is split into the three memory
    apertures (DRAM, Peer Memory, System Memory), and every aperture into its
    operations. Only the L1TEX unit carries atomics and reductions rows; all
    other units have reads and writes only.
    """

    def __init__(gen, show_generic_workflow):
        """Populate the table metadata and the node tree.

        show_generic_workflow is forwarded unchanged to the base-class
        constructor.
        """
        super().__init__(show_generic_workflow=show_generic_workflow)

        gen.name = 'L2SectorTrafficBreakdownBySource'
        gen.table_id = 'L2-Sector-Traffic-By-Source'
        gen.column_names = [
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth from each source unit, to each destination, per operation. See also: these tables that prioritize <a href="#L2-Sector-Traffic-By-Memory-Aperture">destination Memory Aperture</a> and <a href="#L2-Sector-Traffic-By-Operation">Operation</a>.'

        # (row label, metric-name suffix) pairs, in display order.
        apertures = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def aperture_rows(source, ops):
            # Aperture rows for one source metric stem, each with its operation leaves.
            return [
                gen.Node(ap_label, [source + '_aperture_' + ap], [
                    gen.Node(op_label, [source + '_aperture_' + ap + '_op_' + op], [])
                    for op_label, op in ops
                ])
                for ap_label, ap in apertures
            ]

        def unit(label, source, ops=read_write):
            # A source-unit row carrying its own metric plus the aperture breakdown.
            return gen.Node(label, [source], aperture_rows(source, ops))

        gen.nodes = [
            gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc'], [
                unit('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'lts__average_t_sector_srcunit_tex', all_ops),
                unit('Primitive Engine', 'lts__average_t_sector_srcunit_pe'),
                unit('other GPC units', 'lts__average_t_sector_srcunit_gpcother'),
            ]),
            gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp'], [
                unit('ZROP', 'lts__average_t_sector_srcunit_zrop'),
                unit('CROP', 'lts__average_t_sector_srcunit_crop'),
            ]),
            # Both HUB rows deliberately carry no metric of their own: per the
            # original note the node-level counter is buggy and the sum of the
            # children is used instead.
            # NOTE(review): the original note names lts__average_t_sector_srcnode_fbp;
            # by analogy with the sibling rows it presumably meant
            # lts__average_t_sector_srcnode_hub -- confirm.
            gen.Node('HUB Units', [], [
                gen.Node('all HUB Units', [], aperture_rows('lts__average_t_sector_srcnode_hub', read_write)),
            ]),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficByMemoryApertureBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Extended L2 sector-traffic table: memory aperture, source node, source unit, operation.

    The same three-aperture breakdown as the short table, but each aperture is
    further split by source node (GPC / FBP / HUB) and source unit before the
    per-operation leaves. Only the L1TEX unit has atomics and reductions rows.
    """

    def __init__(gen, show_generic_workflow):
        """Populate the table metadata and the node tree.

        show_generic_workflow is forwarded unchanged to the base-class
        constructor.
        """
        super().__init__(show_generic_workflow=show_generic_workflow)

        gen.name = 'L2SectorTrafficBreakdownByMemoryAperture'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This is an extended breakdown of <a href="#L2-Sector-Traffic-By-Memory-Aperture-Short">L2 Traffic by destination</a>. It decomposes L2 bandwidth to each destination, from each source unit, per operation.'

        # (row label, metric-name suffix) pairs, in display order.
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def unit(label, stem, aperture, ops=read_write):
            # A source-unit row for one aperture, with one leaf per operation.
            metric = stem + '_aperture_' + aperture
            return gen.Node(label, [metric], [
                gen.Node(op_label, [metric + '_op_' + op], [])
                for op_label, op in ops
            ])

        def aperture_node(label, aperture):
            # Full subtree for one destination aperture.
            suffix = '_aperture_' + aperture
            return gen.Node(label, ['lts__average_t_sector' + suffix], [
                gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc' + suffix], [
                    unit('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'lts__average_t_sector_srcunit_tex', aperture, all_ops),
                    unit('Primitive Engine', 'lts__average_t_sector_srcunit_pe', aperture),
                    unit('other GPC units', 'lts__average_t_sector_srcunit_gpcother', aperture),
                ]),
                gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp' + suffix], [
                    unit('ZROP', 'lts__average_t_sector_srcunit_zrop', aperture),
                    unit('CROP', 'lts__average_t_sector_srcunit_crop', aperture),
                ]),
                # The HUB node and its single child row use the same metric.
                gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub' + suffix], [
                    unit('all HUB Units', 'lts__average_t_sector_srcnode_hub', aperture),
                ]),
            ])

        gen.nodes = [
            aperture_node('DRAM', 'device'),
            aperture_node('Peer Memory', 'peer'),
            aperture_node('System Memory', 'sysmem'),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class L2TrafficByOperationBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic table: operation, memory aperture, source node, source unit.

    Reads and writes are broken down across all source nodes (GPC / FBP / HUB).
    Atomics and reductions list only the GPC node with its L1TEX unit.
    """

    def __init__(gen, show_generic_workflow):
        """Populate the table metadata and the node tree.

        show_generic_workflow is forwarded unchanged to the base-class
        constructor.
        """
        super().__init__(show_generic_workflow=show_generic_workflow)

        gen.name = 'L2SectorTrafficBreakdownByOperation'
        gen.table_id = 'L2-Sector-Traffic-By-Operation'
        gen.column_names = [
            ('Op', 'Op'),
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth per operation, to each destination, from each source unit.'

        # (row label, metric-name suffix) pairs, in display order.
        apertures = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )

        def leaf(label, stem, tail):
            # Childless row for the metric "<stem><tail>".
            return gen.Node(label, [stem + tail], [])

        def source_groups(tail, l1tex_only):
            # Source-node rows for one (aperture, operation) pair.
            # With l1tex_only set, only GPC Units -> L1TEX Cache is emitted.
            gpc_units = [leaf('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'lts__average_t_sector_srcunit_tex', tail)]
            if not l1tex_only:
                gpc_units.append(leaf('Primitive Engine', 'lts__average_t_sector_srcunit_pe', tail))
                gpc_units.append(leaf('other GPC units', 'lts__average_t_sector_srcunit_gpcother', tail))
            groups = [gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc' + tail], gpc_units)]
            if not l1tex_only:
                groups.append(gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp' + tail], [
                    leaf('ZROP', 'lts__average_t_sector_srcunit_zrop', tail),
                    leaf('CROP', 'lts__average_t_sector_srcunit_crop', tail),
                ]))
                # The HUB node and its single child row use the same metric.
                groups.append(gen.Node('HUB Units', ['lts__average_t_sector_srcnode_hub' + tail], [
                    leaf('all HUB Units', 'lts__average_t_sector_srcnode_hub', tail),
                ]))
            return groups

        def operation_node(label, op, l1tex_only=False):
            # Top-level row for one operation with its per-aperture subtrees.
            return gen.Node(label, ['lts__average_t_sector_op_' + op], [
                gen.Node(
                    ap_label,
                    ['lts__average_t_sector_aperture_' + ap + '_op_' + op],
                    source_groups('_aperture_' + ap + '_op_' + op, l1tex_only),
                )
                for ap_label, ap in apertures
            ])

        gen.nodes = [
            operation_node('Reads', 'read'),
            operation_node('Writes', 'write'),
            operation_node('Atomics', 'atom', l1tex_only=True),
            operation_node('Reductions', 'red', l1tex_only=True),
        ]

        gen.required_ratios = gen.get_required_ratios()
|
||||
|
||||
class RasterDataflowGenerator(tables_common.RasterDataflowGenerator):
    """Raster dataflow table specialised with the PROP-to-ZROP/CROP pixel counters."""

    def __init__(gen):
        """Set the ZROP/CROP pixel input expressions and register their counters."""
        super().__init__()

        zrop_counter = 'prop__prop2zrop_pixels_realtime'
        crop_counter = 'prop__prop2crop_pixels_realtime'

        # Expression strings that read the summed value of each counter.
        expression = "getCounterValue('{}', 'sum')"
        gen.zrop_pixels_input = expression.format(zrop_counter)
        gen.crop_pixels_input = expression.format(crop_counter)

        # Added on top of whatever the base class already put in required_counters.
        gen.required_counters.extend([zrop_counter, crop_counter])
|
||||
|
||||
class RangesSummaryGenerator(tables_common.RangesSummaryGenerator):
    """Per-range summary table: duration, activity flags and unit throughput percentages.

    Every counter and throughput referenced by a column expression below is
    also listed in required_counters / required_throughputs.
    """

    def __init__(gen):
        """Define the summary columns and the metrics they depend on."""
        super().__init__()

        # Each column is: header, value expression, formatter name, and a
        # fourth field that is 'ra' for every column.
        # NOTE(review): 'ra' presumably means right-aligned -- confirm in tables_common.
        #
        # Non-ASCII glyphs are written as \u escapes so the source stays pure
        # ASCII. The previous literals had been mis-decoded ('\u03bc' showed up
        # as two Latin-1 characters and the check mark as three), which
        # corrupted the header and the cell text.
        #   \u03bc = Greek small letter mu, \u2713 = check mark
        gen.cols = [
            gen.Col('Duration \u03bcs'  , "getCounterValue('gpu__time_duration', 'avg')"                                                                                       , 'format_avg'  , 'ra'),
            gen.Col('GR Active%'   , "getCounterPct('gr__cycles_active', 'avg')"                                                                                          , 'format_pct'  , 'ra'),
            gen.Col('3D?'          , "getCounterValue('fe__draw_count', 'sum') ? '\u2713' : ''"                                                                                , ''            , 'ra'),
            gen.Col('Comp?'        , "getCounterValue('gr__dispatch_count', 'sum') ? '\u2713' : ''"                                                                            , ''            , 'ra'),
            gen.Col('#WFI'         , "getCounterValue('fe__output_ops_type_bundle_cmd_go_idle', 'sum')"                                                                   , 'format_sum'  , 'ra'),
            gen.Col('#Prims'       , "getCounterValue('pda__input_prims', 'sum')"                                                                                         , 'format_sum'  , 'ra'),
            gen.Col('SM%'          , "getThroughputPct('sm__throughput')"                                                                                                 , 'format_pct'  , 'ra'),
            gen.Col('L1TEX%'       , "getThroughputPct('l1tex__throughput')"                                                                                              , 'format_pct'  , 'ra'),
            gen.Col('L2%'          , "getThroughputPct('lts__throughput')"                                                                                                , 'format_pct'  , 'ra'),
            gen.Col('DRAM%'        , "getThroughputPct('dram__throughput')"                                                                                               , 'format_pct'  , 'ra'),
            gen.Col('PCIe%'        , "getThroughputPct('pcie__throughput')"                                                                                               , 'format_pct'  , 'ra'),
            gen.Col('PD%'          , "getThroughputPct('pda__throughput')"                                                                                                , 'format_pct'  , 'ra'),
            # PE% is the maximum of the three throughputs named in the expression.
            gen.Col('PE%'          , "Math.max(getThroughputPct('vaf__throughput'), getThroughputPct('vpc__throughput'), getThroughputPct('pes__throughput'))"            , 'format_pct'  , 'ra'),
            gen.Col('RSTR%'        , "getThroughputPct('raster__throughput')"                                                                                             , 'format_pct'  , 'ra'),
            gen.Col('ZROP%'        , "getThroughputPct('zrop__throughput')"                                                                                               , 'format_pct'  , 'ra'),
            gen.Col('CROP%'        , "getThroughputPct('crop__throughput')"                                                                                               , 'format_pct'  , 'ra'),
        ]

        # Counters read through getCounterValue / getCounterPct above.
        gen.required_counters = [
            'fe__draw_count',
            'fe__output_ops_type_bundle_cmd_go_idle',
            'gpu__time_duration',
            'gr__cycles_active',
            'gr__dispatch_count',
            'pda__input_prims',
        ]

        # No column uses a ratio metric.
        gen.required_ratios = []

        # Throughputs read through getThroughputPct above.
        gen.required_throughputs = [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'vpc__throughput',
            'zrop__throughput',
        ]
|
||||
@@ -0,0 +1,11 @@
|
||||
This is an offline tool that generates the C++ "report definition" header.
|
||||
|
||||
Example command:
|
||||
```
|
||||
python3 profiler_report_generator.py --chip ga10x --outDir=PATH/TO/YOUR/OUTPUT_DIR --pypath pub/ampere
|
||||
```
|
||||
|
||||
* This has been tested with Python 3.5.2
|
||||
* Please use "profiler_report_generator.py --help" for more details
|
||||
|
||||
A pre-generated version of the header files has been deployed to both the "/gen" sub-directory and the "NvPerfUtility" directory.
|
||||
Reference in New Issue
Block a user