port from perforce

This commit is contained in:
2026-04-18 22:31:51 +02:00
commit 8d0ab5b7cc
8409 changed files with 3972376 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros">
<!-- Path of the host DLL relative to an SDK root; a candidate directory is accepted only if this file exists inside it. -->
<_Relative_NvPerf_host_dll>bin/x64/nvperf_host.dll</_Relative_NvPerf_host_dll>
<!-- Candidate SDK roots: an NvPerf directory two, respectively three, levels above this file. -->
<_Possible_NvPerf_Dir_0>$([MSBuild]::NormalizePath('$(MSBuildThisFileDirectory)../../NvPerf/'))</_Possible_NvPerf_Dir_0>
<_Possible_NvPerf_Dir_1>$([MSBuild]::NormalizePath('$(MSBuildThisFileDirectory)../../../NvPerf/'))</_Possible_NvPerf_Dir_1>
<!-- Reset to empty, then take the first match in this order: candidate 0, candidate 1, $(NVPERF_SDK_PATH). -->
<!-- Each assignment is skipped once NvPerfSdkPath is already non-empty. -->
<NvPerfSdkPath></NvPerfSdkPath>
<NvPerfSdkPath Condition="'$(NvPerfSdkPath)'=='' And Exists('$(_Possible_NvPerf_Dir_0)/$(_Relative_NvPerf_host_dll)')">$(_Possible_NvPerf_Dir_0)</NvPerfSdkPath>
<NvPerfSdkPath Condition="'$(NvPerfSdkPath)'=='' And Exists('$(_Possible_NvPerf_Dir_1)/$(_Relative_NvPerf_host_dll)')">$(_Possible_NvPerf_Dir_1)</NvPerfSdkPath>
<NvPerfSdkPath Condition="'$(NvPerfSdkPath)'=='' And '$(NVPERF_SDK_PATH)' != '' And Exists('$(NVPERF_SDK_PATH)/$(_Relative_NvPerf_host_dll)')">$(NVPERF_SDK_PATH)</NvPerfSdkPath>
<!-- Normalize whichever path was selected; the property stays empty when no candidate matched. -->
<NvPerfSdkPath Condition="'$(NvPerfSdkPath)'!=''">$([MSBuild]::NormalizePath($(NvPerfSdkPath)))</NvPerfSdkPath>
<!-- NvPerfUtility directory, resolved two levels above this file. -->
<NvPerfUtilityPath>$([MSBuild]::NormalizePath('$(MSBuildThisFileDirectory)../../NvPerfUtility/'))</NvPerfUtilityPath>
</PropertyGroup>
<ItemDefinitionGroup />
<ItemGroup />
<!-- Hooked before the ClCompile target: reports the resolved SDK location, or fails the build when NvPerfSdkPath is empty. -->
<Target Name="PrintNvPerfLocation" BeforeTargets="ClCompile">
<!-- Informational message, emitted only when an SDK path was resolved. -->
<Message
Condition="'$(NvPerfSdkPath)'!=''"
Text="NvPerf SDK found: NvPerfSdkPath = $(NvPerfSdkPath)" />
<!-- Build error listing the two probed directories and the NVPERF_SDK_PATH fallback. -->
<Error
Condition="'$(NvPerfSdkPath)'==''"
Text="NvPerf SDK could not be found; please unzip the SDK into one of the following locations:
$(_Possible_NvPerf_Dir_0)
$(_Possible_NvPerf_Dir_1)
or set environment variable NVPERF_SDK_PATH" />
</Target>
</Project>

View File

@@ -0,0 +1,118 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Project that lists the NvPerfUtility and NvPerf SDK headers; it contains ClInclude items only (no ClCompile items). -->
<!-- Supported build configurations: Debug and Release, both x64. -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>16.0</VCProjectVersion>
<Keyword>Win32Proj</Keyword>
<ProjectGuid>{ea22d2ac-ebf7-43e4-adb7-0f320c46692e}</ProjectGuid>
<RootNamespace>NvPerfUtility</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<!-- Both configurations use ConfigurationType "Utility" and the v142 platform toolset. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Utility</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v142</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Utility</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v142</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<!-- Optional per-user property sheets; imported only when the file exists. -->
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<!-- NOTE(review): NvPerfSDK.props is expected to define NvPerfSdkPath and NvPerfUtilityPath, which are referenced below; confirm against that file. -->
<Import Project="NvPerfSDK.props" />
<PropertyGroup Label="UserMacros" />
<!-- Prepend the NvPerfUtility and NvPerf SDK include directories to the include search path. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>$(NvPerfUtilityPath)/include;$(NvPerfSdkPath)/include;$(IncludePath)</IncludePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>$(NvPerfUtilityPath)/include;$(NvPerfSdkPath)/include;$(IncludePath)</IncludePath>
</PropertyGroup>
<!-- Per-configuration compiler and linker defaults. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<!-- Headers shown in the project: utility headers from ../include, then SDK headers from $(NvPerfSdkPath)/include. -->
<ItemGroup>
<ClInclude Include="../include/NvPerfCounterConfiguration.h" />
<ClInclude Include="../include/NvPerfCounterData.h" />
<ClInclude Include="../include/NvPerfD3D.h" />
<ClInclude Include="../include/NvPerfD3D12.h" />
<ClInclude Include="../include/NvPerfDeviceProperties.h" />
<ClInclude Include="../include/NvPerfInit.h" />
<ClInclude Include="../include/NvPerfMetricsConfigBuilder.h" />
<ClInclude Include="../include/NvPerfMetricsEvaluator.h" />
<ClInclude Include="../include/NvPerfRangeProfiler.h" />
<ClInclude Include="../include/NvPerfRangeProfilerD3D12.h" />
<ClInclude Include="../include/NvPerfRangeProfilerVulkan.h" />
<ClInclude Include="../include/NvPerfReportDefinition.h" />
<ClInclude Include="../include/NvPerfReportDefinitionGA10X.h" />
<ClInclude Include="../include/NvPerfReportDefinitionGV100.h" />
<ClInclude Include="../include/NvPerfReportDefinitionHAL.h" />
<ClInclude Include="../include/NvPerfReportDefinitionTU10X.h" />
<ClInclude Include="../include/NvPerfReportDefinitionTU11X.h" />
<ClInclude Include="../include/NvPerfReportGenerator.h" />
<ClInclude Include="../include/NvPerfReportGeneratorD3D12.h" />
<ClInclude Include="../include/NvPerfReportGeneratorVulkan.h" />
<ClInclude Include="../include/NvPerfVulkan.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_versions_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_target.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Header items without any Filter assignment: SDK headers from $(NvPerfSdkPath)/include first, then the utility headers from ../include. -->
<ItemGroup>
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_d3d12_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_device_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_versions_target.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_host.h" />
<ClInclude Include="$(NvPerfSdkPath)/include/nvperf_vulkan_target.h" />
<ClInclude Include="../include/NvPerfCounterConfiguration.h" />
<ClInclude Include="../include/NvPerfCounterData.h" />
<ClInclude Include="../include/NvPerfD3D.h" />
<ClInclude Include="../include/NvPerfD3D12.h" />
<ClInclude Include="../include/NvPerfDeviceProperties.h" />
<ClInclude Include="../include/NvPerfInit.h" />
<ClInclude Include="../include/NvPerfMetricsConfigBuilder.h" />
<ClInclude Include="../include/NvPerfMetricsEvaluator.h" />
<ClInclude Include="../include/NvPerfRangeProfiler.h" />
<ClInclude Include="../include/NvPerfRangeProfilerD3D12.h" />
<ClInclude Include="../include/NvPerfRangeProfilerVulkan.h" />
<ClInclude Include="../include/NvPerfReportDefinition.h" />
<ClInclude Include="../include/NvPerfReportDefinitionGA10X.h" />
<ClInclude Include="../include/NvPerfReportDefinitionGV100.h" />
<ClInclude Include="../include/NvPerfReportDefinitionHAL.h" />
<ClInclude Include="../include/NvPerfReportDefinitionTU10X.h" />
<ClInclude Include="../include/NvPerfReportDefinitionTU11X.h" />
<ClInclude Include="../include/NvPerfReportGenerator.h" />
<ClInclude Include="../include/NvPerfReportGeneratorD3D12.h" />
<ClInclude Include="../include/NvPerfReportGeneratorVulkan.h" />
<ClInclude Include="../include/NvPerfVulkan.h" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,15 @@
/*
* Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

View File

@@ -0,0 +1,232 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import argparse
import sys
import os
#===============================================================================
# Dep File generation, compatible with Make or Ninja build systems
#===============================================================================
def get_loaded_module_file_names():
    """Return the sorted real paths of the files backing all loaded modules.

    Every module in ``sys.modules`` whose ``__file__`` resolves (via
    ``os.path.realpath``) to an existing regular file contributes one entry.
    Modules without ``__file__``, frozen modules and directories are skipped.

    Returns:
        list[str]: de-duplicated absolute paths in sorted order.
    """
    module_file_names = set()
    # Iterate over a snapshot: sys.modules may change size while we inspect it
    # (an attribute access on a lazy module can trigger an import), and
    # iterating the live dict would then raise RuntimeError.
    for module in list(sys.modules.values()):
        path = getattr(module, "__file__", None)
        if not path:
            continue
        # realpath already yields an absolute path, so no abspath fallback is needed.
        path = os.path.realpath(path)
        if path.endswith("<frozen>"):
            continue
        if not os.path.isfile(path):
            continue  # filter out directories
        module_file_names.add(path)
    return sorted(module_file_names)


def gen_depfile(target_file_path, buildroot):
    """Build the text of a Make/Ninja style depfile for ``target_file_path``.

    The target is followed by one continuation line per loaded Python module
    file, so the build system regenerates the target when any of them changes.

    Args:
        target_file_path: path of the file being generated.
        buildroot: build root directory. When the target lies strictly inside
            it, the root prefix is removed from the target name. An empty
            string disables prefix removal.

    Returns:
        str: the depfile contents (lines joined with ``\\n``; every dependency
        line, including the last, ends with a `` \\`` continuation).
    """
    target_path_final = target_file_path
    if buildroot:
        # Compare case/separator-normalized forms, but slice the normalized
        # (not case-folded) target so the emitted name keeps its spelling and
        # the offsets line up with what was compared.
        target_normalized = os.path.normpath(target_file_path)
        target_canonicalized = os.path.normcase(target_normalized)
        buildroot_canonicalized = os.path.normcase(os.path.normpath(buildroot))
        # Match whole path components only: "build" must not match "build2/x".
        if buildroot_canonicalized.endswith(os.sep):
            buildroot_prefix = buildroot_canonicalized
        else:
            buildroot_prefix = buildroot_canonicalized + os.sep
        inside_buildroot = target_canonicalized.startswith(buildroot_prefix)
        if inside_buildroot and len(target_canonicalized) > len(buildroot_prefix):
            target_path_final = target_normalized[len(buildroot_prefix):]

    depfile_contents = [target_path_final + ':\\']
    for module_file_name in get_loaded_module_file_names():
        depfile_contents.append('\t' + module_file_name + ' \\')
    return '\n'.join(depfile_contents)
# Arguments:
#   target_file_path : the file being generated
#   buildroot        : root directory of the build system; this prefix is removed
#                      from target_file_path to pacify ninja
#   depfile_path     : the depfile to be written
def write_depfile(target_file_path, buildroot, depfile_path):
    """Generate the depfile text for ``target_file_path`` and store it at ``depfile_path`` (UTF-8)."""
    depfile_fd = open(depfile_path, 'w', encoding='utf-8')
    try:
        depfile_fd.write(gen_depfile(target_file_path, buildroot))
    finally:
        depfile_fd.close()
#===============================================================================
# C++ header generation
#===============================================================================
def write_cpp_file(out_fd, report_definition):
    """Emit the C++ namespace holding ``GetReportDefinition()`` for one report.

    Args:
        out_fd: writable text stream receiving the generated C++.
        report_definition: object providing ``name``, ``html`` (non-empty str),
            ``required_counters``, ``required_ratios`` and
            ``required_throughputs`` (sized iterables of metric names).

    The output contains, in order: the namespace opening, one string array per
    non-empty metric list, the UTF-8 bytes of the HTML as a zero-terminated
    byte array, the ``ReportDefinition`` initializer, and the namespace closing.
    """
    emit = out_fd.write

    # One entry per metric list: (names, array opening, initializer fragment).
    metric_arrays = (
        (report_definition.required_counters,
         r'''
static const char* const RequiredCounters[] = {
''',
         r'''
RequiredCounters,
sizeof(RequiredCounters) / sizeof(RequiredCounters[0]),'''),
        (report_definition.required_ratios,
         r'''
static const char* const RequiredRatios[] = {
''',
         r'''
RequiredRatios,
sizeof(RequiredRatios) / sizeof(RequiredRatios[0]),'''),
        (report_definition.required_throughputs,
         r'''
static const char* const RequiredThroughputs[] = {
''',
         r'''
RequiredThroughputs,
sizeof(RequiredThroughputs) / sizeof(RequiredThroughputs[0]),'''),
    )

    emit(r'''
namespace {} {{
'''.format(report_definition.name))
    emit(r'''
inline ReportDefinition GetReportDefinition()
{''')

    # String arrays are only emitted for metric lists that have entries.
    for metric_names, array_opening, _ in metric_arrays:
        if len(metric_names):
            emit(array_opening)
            for metric_name in metric_names:
                emit(r''' "{}",
'''.format(metric_name))
            emit(r''' };
''')

    # html template, embedded as hex bytes, 20 per line, with a trailing 0x0
    assert len(report_definition.html)
    emit(r'''
static const unsigned char ReportContents[] = {''')
    hex_pieces = []
    for offset, byte_value in enumerate(bytearray(report_definition.html, 'utf-8')):
        if offset % 20 == 0:
            hex_pieces.append('\n ')
        hex_pieces.append('0x{:02x}, '.format(byte_value))
    emit("".join(hex_pieces))
    emit(r'''0x0
};
''')

    # Aggregate initializer: array pointer + element count, or nullptr/0 when empty.
    emit(r'''
ReportDefinition reportDefinition = {''')
    for metric_names, _, initializer in metric_arrays:
        if len(metric_names):
            emit(initializer)
        else:
            emit(r'''
nullptr,
0,''')
    emit(r'''
(const char*)ReportContents
};
return reportDefinition;
}
''')
    emit(r'''
}} // namespace {}
'''.format(report_definition.name))
#===============================================================================
# Main
#===============================================================================
if __name__ == '__main__':
    # Command line: pick the chip, the output directory and optional module
    # search paths / build root / copyright header file.
    parser = argparse.ArgumentParser(description='Generate HTML report definition')
    parser.add_argument('--chip', type=str, required=True, help='chip name, e.g. tu10x')
    parser.add_argument('--outDir', type=str, required=True, help='output directory')
    parser.add_argument('--pypath', default=[], action='append', required=False, help="Python module paths.")
    parser.add_argument('--buildroot', default='', required=False, help="build root dir for depfile")
    parser.add_argument('--copyright', type=str, help="Copyright header.")
    args = parser.parse_args()

    sys.path.extend(args.pypath)
    # append, not extend: extend() iterates the string character by character
    # and only worked because "." is a single character.
    sys.path.append(".")

    chip = args.chip
    report_module_name = "report_" + chip
    try:
        report_module = __import__(report_module_name)
    except ImportError as import_error:
        # Keep the underlying failure attached so a broken import *inside* the
        # report module is still visible in the traceback.
        raise ImportError('Module "{}" is not found, this could happen due to invalid chip name or insufficient --pypath.'.format(report_module_name)) from import_error
    per_range_report_definition = report_module.get_per_range_report_definition()
    summary_report_definition = report_module.get_summary_report_definition()

    if not os.path.isdir(args.outDir):
        raise Exception('Invalid argument for --outDir: {}'.format(args.outDir))

    # Per-range report: debug html (this can be used for inspection; the debug
    # mode also lists the metrics that are used by each table)
    range_debug_html_file_name = 'NvPerfReportDefinition{}_range_debug.html'.format(chip.upper())
    range_debug_html_file_path = os.path.join(args.outDir, range_debug_html_file_name)
    with open(range_debug_html_file_path, 'w', encoding='utf-8') as out_fd:
        out_fd.write(per_range_report_definition.html)

    # Summary report: debug html
    summary_debug_html_file_name = 'NvPerfReportDefinition{}_summary_debug.html'.format(chip.upper())
    summary_debug_html_file_path = os.path.join(args.outDir, summary_debug_html_file_name)
    with open(summary_debug_html_file_path, 'w', encoding='utf-8') as out_fd:
        out_fd.write(summary_report_definition.html)

    # CPP file; written as UTF-8 like the other generated outputs instead of
    # relying on the platform default encoding.
    cpp_file_name = 'NvPerfReportDefinition{}.h'.format(chip.upper())
    cpp_file_path = os.path.join(args.outDir, cpp_file_name)
    with open(cpp_file_path, 'w', encoding='utf-8') as out_fd:
        if args.copyright:
            with open(args.copyright, 'r', encoding='utf-8') as copyright_fd:
                out_fd.write(copyright_fd.read())
        out_fd.write(r'''
#pragma once
#include "NvPerfReportDefinition.h"
namespace nv {{ namespace perf {{ namespace {} {{
'''.format(chip))
        # per-range report
        write_cpp_file(out_fd, per_range_report_definition)
        # summary report
        write_cpp_file(out_fd, summary_report_definition)
        out_fd.write(r'''
} } }''')

    # Emit a single depfile using the C++ file as the representative output.
    write_depfile(cpp_file_path, args.buildroot, cpp_file_path + '.d')

View File

@@ -0,0 +1,69 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class DataTable:
    """Plain record describing one report table.

    Stores the constructor arguments unchanged as attributes of the same name:
    ``name``, ``html``, ``jsfunc``, ``jscall``, ``required_counters``,
    ``required_ratios``, ``required_throughputs`` and ``workflow``.
    """

    def __init__(self, name, html, jsfunc, jscall, required_counters, required_ratios, required_throughputs, workflow):
        # identification and rendering pieces
        self.name, self.html = name, html
        self.jsfunc, self.jscall = jsfunc, jscall
        # metric names this table depends on
        self.required_counters = required_counters
        self.required_ratios = required_ratios
        self.required_throughputs = required_throughputs
        self.workflow = workflow
class DataSection:
    """Group of data tables with an optional title.

    Attributes mirror the constructor arguments: ``dtables``,
    ``inter_table_spacing`` (defaults to True) and ``title`` (defaults to None).
    """

    def __init__(self, dtables, inter_table_spacing=True, title=None):
        self.title = title
        self.inter_table_spacing = inter_table_spacing
        self.dtables = dtables
class ReportDefinition:
    """Plain record for a complete report.

    Keeps the report ``name``, its ``html`` text and the three lists of
    required metric names exactly as passed in.
    """

    def __init__(self, name, html, required_counters, required_ratios, required_throughputs):
        self.name, self.html = name, html
        self.required_counters = required_counters
        self.required_ratios = required_ratios
        self.required_throughputs = required_throughputs
def get_data_tables(sections):
    """Return a flat list of every entry in ``section.dtables`` across ``sections``, in order."""
    flattened = []
    for section in sections:
        for dtable in section.dtables:
            flattened.append(dtable)
    return flattened
def get_required_counters(sections):
    """Return the sorted, de-duplicated counter names required by all tables in ``sections``."""
    unique_counters = {
        counter
        for section in sections
        for dtable in section.dtables
        for counter in dtable.required_counters
    }
    return sorted(unique_counters)
def get_required_ratios(sections):
    """Return the sorted, de-duplicated ratio names required by all tables in ``sections``."""
    unique_ratios = {
        ratio
        for section in sections
        for dtable in section.dtables
        for ratio in dtable.required_ratios
    }
    return sorted(unique_ratios)
def get_required_throughputs(sections):
    """Return the sorted, de-duplicated throughput names required by all tables in ``sections``."""
    unique_throughputs = {
        throughput
        for section in sections
        for dtable in section.dtables
        for throughput in dtable.required_throughputs
    }
    return sorted(unique_throughputs)

View File

@@ -0,0 +1,80 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
import pub.ampere.tables_ga10x as tables_ga10x
def get_per_range_report_definition():
    """Assemble the per-range report for this chip.

    Builds the sections in display order, renders them with
    ``tables_common.generate_range_html_common`` and returns a
    ``ReportDefinition`` named 'PerRangeReport' carrying the HTML plus the
    counters, ratios and throughputs required by the tables.
    """
    # Device/clock header tables, rendered without spacing between them.
    device_section = DataSection([
        tables_ga10x.DevicePropertiesGenerator().make_data_table(),
        tables_common.ClocksGenerator().make_data_table(),
    ], inter_table_spacing=False)

    overview_section = DataSection([
        tables_common.TopLevelStatsGenerator().make_data_table(),
        tables_ga10x.TopThroughputsGenerator().make_data_table(),
        tables_common.CacheHitRates().make_data_table(),
    ], title='Overview Section')

    memory_section = DataSection([
        tables_common.MainMemoryGenerator().make_data_table(),
        tables_ga10x.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
        tables_ga10x.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_common.L1TexThroughputsGenerator().make_data_table(),
        tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
    ], title='Memory Performance Section')

    shader_section = DataSection([
        tables_ga10x.SmThroughputsGenerator().make_data_table(),
        tables_ga10x.SmInstExecutedGenerator().make_data_table(),
        tables_common.SmShaderExecutionGenerator().make_data_table(),
        tables_ga10x.SmResourceUsageGenerator().make_data_table(),
        tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
        tables_common.WarpIssueStallsGenerator().make_data_table(),
    ], title='Shader Performance Section')

    pipeline_section = DataSection([
        tables_common.PrimitiveDataflowGenerator().make_data_table(),
        tables_ga10x.RasterDataflowGenerator().make_data_table(),
    ], title='3D Pipeline Section')

    l2_breakdown_section = DataSection([
        tables_ga10x.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_ga10x.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
    ], title='Additional L2 Traffic Breakdowns Section')

    listings_section = DataSection([
        tables_common.AdditionalMetricsGenerator().make_data_table(),
        tables_common.AllCountersGenerator().make_data_table(),
        tables_common.AllRatiosGenerator().make_data_table(),
        tables_common.AllThroughputsGenerator().make_data_table(),
    ], title='Exhaustive Listings Section')

    sections = [
        device_section,
        overview_section,
        memory_section,
        shader_section,
        pipeline_section,
        l2_breakdown_section,
        listings_section,
    ]
    html = tables_common.generate_range_html_common(sections)
    return ReportDefinition(
        'PerRangeReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
def get_summary_report_definition():
    """Assemble the summary report for this chip.

    Two sections (collection info, then the per-range summary table) are
    rendered with ``tables_common.generate_summary_html_common`` and returned
    as a ``ReportDefinition`` named 'SummaryReport'.
    """
    collection_info_section = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_summary_section = DataSection([
        tables_ga10x.RangesSummaryGenerator().make_data_table(),
    ], title='Summary of Measured Ranges')

    sections = [collection_info_section, ranges_summary_section]
    html = tables_common.generate_summary_html_common(sections)
    return ReportDefinition(
        'SummaryReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )

View File

@@ -0,0 +1,646 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
class DevicePropertiesGenerator(tables_common.DevicePropertiesGenerator):
    """Chip-specific device properties table: sets the per-LTS L2 cache size on top of the common generator."""

    def __init__(self):
        super().__init__()
        # NOTE(review): unit of this value is defined by the base generator - confirm there.
        self.l2cacheSizePerLts = 128
class TopThroughputsGenerator(tables_common.TopThroughputsGenerator):
    """Chip-specific top-throughputs table.

    Adds one row per hardware unit to the rows provided by the common
    generator and registers the throughput metrics those rows read.
    """

    def __init__(self):
        super().__init__()
        # (category, linked unit label, JavaScript expression producing the value)
        row_specs = (
            ('Shader', '<a href="#SM-Instruction-Throughput">SM (Shader Cores)</a>', "getThroughputPct('sm__throughput')"),
            ('Memory', '<a href="#L1TEX-Throughput">L1TEX Cache</a>', "getThroughputPct('l1tex__throughput')"),
            ('Memory', '<a href="#L2-Sector-Traffic">L2 Cache</a>', "getThroughputPct('lts__throughput')"),
            ('Memory', '<a href="#Main-Memory-Throughput">DRAM</a>', "getThroughputPct('dram__throughput')"),
            ('Memory', '<a href="#Main-Memory-Throughput">PCIe</a>', "getThroughputPct('pcie__throughput')"),
            ('World Pipe', '<a href="#Primitive-Data-Flow">PDA Index Fetch</a>', "getThroughputPct('pda__throughput')"),
            ('World Pipe', '<a href="#Primitive-Data-Flow">Vertex Attr. Fetch</a>', "getThroughputPct('vaf__throughput')"),
            ('World Pipe', '<a href="#Primitive-Data-Flow">Primitive Engine</a>', "getThroughputPct('pes__throughput')"),
            ('Screen Pipe', '<a href="#Raster-Data-Flow">RASTER</a>', "getThroughputPct('raster__throughput')"),
            ('Screen Pipe', '<a href="#Raster-Data-Flow">PROP (Pre-ROP)</a>', "getThroughputPct('prop__throughput')"),
            ('Screen Pipe', '<a href="#Raster-Data-Flow">ZROP (Depth-Test)</a>', "getThroughputPct('zrop__throughput')"),
            ('Screen Pipe', '<a href="#Raster-Data-Flow">CROP (Color Blend)</a>', "getThroughputPct('crop__throughput')"),
        )
        self.rows += [self.Row(*row_spec) for row_spec in row_specs]

        # Metrics referenced by the rows above.
        self.required_throughputs += [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'prop__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'zrop__throughput',
        ]
class SmThroughputsGenerator(tables_common.SmThroughputsGenerator):
    """Chip-specific SM throughput table: defines the SM pipes and registers their counters."""

    def __init__(self):
        super().__init__()
        # (pipe name, description)
        pipe_specs = (
            ('adu', 'Computed branches and indexed constants'),
            ('alu', 'INT32 except multiply; FP32 comparison'),
            ('cbu', 'Divergent branches and control flow'),
            ('fma', 'FP32 mul/add, FP16 mul/add'),
            ('fmaheavy', 'FP32 mul/add and INT32 multiply'),
            ('fp64', 'FP64 mul/add'),
            ('ipa', 'Pixel shader attribute interpolation'),
            ('lsu', 'Global, local, shared memory, and misc'),
            ('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            ('tex', 'Texture and surface memory'),
            ('uniform', 'Warp-level scalar operations'),
            ('xu', 'Transcendentals and float/int conversion'),
        )
        self.pipes = [self.Pipe(*pipe_spec) for pipe_spec in pipe_specs]
        # Every pipe contributes the counters it needs.
        for sm_pipe in self.pipes:
            self.required_counters.extend(sm_pipe.get_counter_names())
class SmInstExecutedGenerator(tables_common.SmInstExecutedGenerator):
    """Chip-specific executed-instructions table: defines the pipes and registers their counters."""

    def __init__(self):
        super().__init__()
        # (pipe name, description[, optional third positional flag passed as True])
        # NOTE(review): the meaning of the third Pipe argument is defined by the base class - confirm there.
        pipe_specs = (
            ('total', 'All instructions', True),
            ('adu', 'Computed branches and indexed constants'),
            ('alu', 'INT32 except multiply; FP32 comparison', True),
            ('cbu', 'Divergent branches and control flow'),
            ('fma', 'FP32 mul/add, FP16 mul/add', True),
            ('fmaheavy', 'FP32 mul/add and INT32 multiply', True),
            ('fp64', 'FP64 mul/add', True),
            ('ipa', 'Pixel shader attribute interpolation', True),
            ('lsu', 'Global, local, shared memory, and misc', True),
            ('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            ('tex', 'Texture and surface memory'),
            ('uniform', 'Warp-level scalar operations'),
            ('xu', 'Transcendentals and float/int conversion', True),
        )
        self.pipes = [self.Pipe(*pipe_spec) for pipe_spec in pipe_specs]
        # Every pipe contributes the counters it needs.
        for sm_pipe in self.pipes:
            self.required_counters.extend(sm_pipe.get_counter_names())
class L2TrafficByMemoryApertureShortBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Short L2 sector traffic breakdown: memory aperture first, then operation.

    Configures name, table id, column names, workflow text and the two-level
    node tree, then derives ``required_ratios`` from the base class helper.
    """

    def __init__(self, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        self.name = 'L2SectorTrafficBreakdownByMemoryApertureShort'
        self.table_id = 'L2-Sector-Traffic-By-Memory-Aperture-Short'
        self.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        self.workflow += ' This table decomposes L2 bandwidth to each destination, per operation. A <a href="#L2-Sector-Traffic-By-Memory-Aperture">more detailed version of this table</a> can be found below.'

        # One top-level node per aperture, each with four per-operation leaves.
        dram_node = self.Node('DRAM', ['lts__average_t_sector_aperture_device'], [
            self.Node('Reads', ['lts__average_t_sector_aperture_device_op_read'], []),
            self.Node('Writes', ['lts__average_t_sector_aperture_device_op_write'], []),
            self.Node('Atomics', ['lts__average_t_sector_aperture_device_op_atom'], []),
            self.Node('Reductions', ['lts__average_t_sector_aperture_device_op_red'], []),
        ])
        sysmem_node = self.Node('System Memory', ['lts__average_t_sector_aperture_sysmem'], [
            self.Node('Reads', ['lts__average_t_sector_aperture_sysmem_op_read'], []),
            self.Node('Writes', ['lts__average_t_sector_aperture_sysmem_op_write'], []),
            self.Node('Atomics', ['lts__average_t_sector_aperture_sysmem_op_atom'], []),
            self.Node('Reductions', ['lts__average_t_sector_aperture_sysmem_op_red'], []),
        ])
        peer_node = self.Node('Peer Memory', ['lts__average_t_sector_aperture_peer'], [
            self.Node('Reads', ['lts__average_t_sector_aperture_peer_op_read'], []),
            self.Node('Writes', ['lts__average_t_sector_aperture_peer_op_write'], []),
            self.Node('Atomics', ['lts__average_t_sector_aperture_peer_op_atom'], []),
            self.Node('Reductions', ['lts__average_t_sector_aperture_peer_op_red'], []),
        ])
        self.nodes = [dram_node, sysmem_node, peer_node]

        self.required_ratios = self.get_required_ratios()
class L2TrafficBySrcBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic table ordered as source -> unit -> memory aperture -> operation."""

    def __init__(gen, show_generic_workflow):
        """Fill in the table identity, column headers, workflow text and node tree.

        show_generic_workflow is forwarded unchanged to the base-class constructor.
        """
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownBySource'
        gen.table_id = 'L2-Sector-Traffic-By-Source'
        gen.column_names = [
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth from each source unit, to each destination, per operation. See also: these tables that prioritize <a href="#L2-Sector-Traffic-By-Memory-Aperture">destination Memory Aperture</a> and <a href="#L2-Sector-Traffic-By-Operation">Operation</a>.'

        # (row label, metric-name fragment); tuple order is row order.
        destinations = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        every_op = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        # (row label, srcunit fragment, operations listed under each aperture).
        # None means a single 'Reads' leaf that reuses the aperture-level metric,
        # which is how the constant-cache rows are expressed in this table.
        gpc_units = (
            ('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'tex', every_op),
            ('L1.5 Constant Cache', 'gcc', None),
            ('Primitive Engine', 'pe', read_write),
            ('Raster', 'raster', read_write),
            ('ZROP', 'zrop', read_write),
            ('CROP', 'crop', read_write),
        )

        def per_aperture(stem, ops):
            """Return one node per aperture for metric stem `stem`, each with its op leaves."""
            rows = []
            for dest_label, dest in destinations:
                dest_metric = stem + '_aperture_' + dest
                if ops is None:
                    leaves = [gen.Node('Reads', [dest_metric], [])]
                else:
                    leaves = [
                        gen.Node(op_label, [dest_metric + '_op_' + op], [])
                        for op_label, op in ops
                    ]
                rows.append(gen.Node(dest_label, [dest_metric], leaves))
            return rows

        gpc_children = []
        for unit_label, unit, ops in gpc_units:
            unit_metric = 'lts__average_t_sector_srcunit_' + unit
            gpc_children.append(gen.Node(unit_label, [unit_metric], per_aperture(unit_metric, ops)))
        gpc = gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc'], gpc_children)

        fbp_metric = 'lts__average_t_sector_srcnode_fbp'
        fbp = gen.Node('FBP Units', [fbp_metric], [
            gen.Node('all FBP Units', [fbp_metric], per_aperture(fbp_metric, read_write)),
        ])

        # The HUB rows carry no metric of their own at the source and unit levels;
        # only their per-aperture children do.
        hub = gen.Node('HUB Units', [], [
            gen.Node('all HUB Units', [], per_aperture('lts__average_t_sector_srcnode_hub', read_write)),
        ])

        gen.nodes = [gpc, fbp, hub]

        # Must run after gen.nodes is assigned (same ordering as before).
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficByMemoryApertureBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Extended L2 sector-traffic table ordered as memory aperture -> source -> unit -> operation."""

    def __init__(gen, show_generic_workflow):
        """Fill in the table identity, column headers, workflow text and node tree.

        show_generic_workflow is forwarded unchanged to the base-class constructor.

        Every row below an aperture uses the aperture-specific ratio
        ('..._aperture_<device|peer|sysmem>').  The 'L1.5 Constant Cache' and
        'Raster' rows previously referenced the all-aperture ratios
        ('lts__average_t_sector_srcunit_gcc' / '..._srcunit_raster'), so the same
        total was shown under DRAM, Peer Memory and System Memory; they now use the
        per-aperture ratios, matching the sibling units and the other L2 tables.
        """
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryAperture'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This is an extended breakdown of <a href="#L2-Sector-Traffic-By-Memory-Aperture-Short">L2 Traffic by destination</a>. It decomposes L2 bandwidth to each destination, from each source unit, per operation.'

        prefix = 'lts__average_t_sector'
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        every_op = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        # (row label, srcunit fragment, operations listed for the unit).
        # None means a single 'Reads' leaf that reuses the unit's aperture metric
        # (the constant cache has no per-op ratios in these tables).
        gpc_units = (
            ('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'tex', every_op),
            ('L1.5 Constant Cache', 'gcc', None),
            ('Primitive Engine', 'pe', read_write),
            ('Raster', 'raster', read_write),
            ('ZROP', 'zrop', read_write),
            ('CROP', 'crop', read_write),
        )

        def op_leaves(metric, ops):
            """Return one leaf per operation, backed by '<metric>_op_<op>'."""
            return [gen.Node(op_label, [metric + '_op_' + op], []) for op_label, op in ops]

        def gpc_unit_node(unit_label, unit, ops, aperture):
            """Return the row for one GPC unit restricted to `aperture`."""
            metric = prefix + '_srcunit_' + unit + '_aperture_' + aperture
            if ops is None:
                leaves = [gen.Node('Reads', [metric], [])]
            else:
                leaves = op_leaves(metric, ops)
            return gen.Node(unit_label, [metric], leaves)

        def whole_source_node(label, node, aperture):
            """Return the '<label> Units' row (FBP/HUB), which has a single 'all ...' child."""
            metric = prefix + '_srcnode_' + node + '_aperture_' + aperture
            return gen.Node(label + ' Units', [metric], [
                gen.Node('all ' + label + ' Units', [metric], op_leaves(metric, read_write)),
            ])

        def aperture_node(aperture_label, aperture):
            """Return the complete subtree for one destination aperture."""
            return gen.Node(aperture_label, [prefix + '_aperture_' + aperture], [
                gen.Node('GPC Units', [prefix + '_srcnode_gpc_aperture_' + aperture], [
                    gpc_unit_node(unit_label, unit, ops, aperture)
                    for unit_label, unit, ops in gpc_units
                ]),
                whole_source_node('FBP', 'fbp', aperture),
                whole_source_node('HUB', 'hub', aperture),
            ])

        gen.nodes = [
            aperture_node('DRAM', 'device'),
            aperture_node('Peer Memory', 'peer'),
            aperture_node('System Memory', 'sysmem'),
        ]

        # Must run after gen.nodes is assigned (same ordering as before).
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficByOperationBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic table ordered as operation -> memory aperture -> source -> unit."""

    def __init__(gen, show_generic_workflow):
        """Fill in the table identity, column headers, workflow text and node tree.

        show_generic_workflow is forwarded unchanged to the base-class constructor.
        """
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByOperation'
        gen.table_id = 'L2-Sector-Traffic-By-Operation'
        gen.column_names = [
            ('Op', 'Op'),
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth per operation, to each destination, from each source unit.'

        prefix = 'lts__average_t_sector'
        destinations = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )

        l1tex = ('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'tex')
        const_cache = ('L1.5 Constant Cache', 'gcc')
        graphics_units = (
            ('Primitive Engine', 'pe'),
            ('Raster', 'raster'),
            ('ZROP', 'zrop'),
            ('CROP', 'crop'),
        )

        # (row label, op fragment, GPC units listed, whether FBP/HUB rows are listed)
        operations = (
            ('Reads', 'read', (l1tex, const_cache) + graphics_units, True),
            ('Writes', 'write', (l1tex,) + graphics_units, True),
            ('Atomics', 'atom', (l1tex,), False),
            ('Reductions', 'red', (l1tex,), False),
        )

        def unit_metric(unit, aperture, op):
            """Name of the per-unit ratio; the constant cache uses its aperture ratio without an op suffix."""
            stem = prefix + '_srcunit_' + unit + '_aperture_' + aperture
            if unit == 'gcc':
                return stem
            return stem + '_op_' + op

        def sources_for(aperture, op, units, with_fbp_hub):
            """Return the source-level rows shown under one (operation, aperture) pair."""
            tail = '_aperture_' + aperture + '_op_' + op
            unit_rows = [
                gen.Node(unit_label, [unit_metric(unit, aperture, op)], [])
                for unit_label, unit in units
            ]
            rows = [gen.Node('GPC Units', [prefix + '_srcnode_gpc' + tail], unit_rows)]
            if with_fbp_hub:
                for label, node in (('FBP', 'fbp'), ('HUB', 'hub')):
                    metric = prefix + '_srcnode_' + node + tail
                    rows.append(gen.Node(label + ' Units', [metric], [
                        gen.Node('all ' + label + ' Units', [metric], []),
                    ]))
            return rows

        tree = []
        for op_label, op, units, with_fbp_hub in operations:
            per_destination = []
            for dest_label, dest in destinations:
                dest_metric = prefix + '_aperture_' + dest + '_op_' + op
                per_destination.append(
                    gen.Node(dest_label, [dest_metric], sources_for(dest, op, units, with_fbp_hub))
                )
            tree.append(gen.Node(op_label, [prefix + '_op_' + op], per_destination))
        gen.nodes = tree

        # Must run after gen.nodes is assigned (same ordering as before).
        gen.required_ratios = gen.get_required_ratios()
class RasterDataflowGenerator(tables_common.RasterDataflowGenerator):
    """Raster dataflow table using the PROP->ZROP / PROP->CROP pixel counters as inputs."""

    def __init__(gen):
        """Set the ZROP/CROP pixel input expressions and register the counters they read."""
        super().__init__()
        zrop_counter = 'prop__prop2zrop_pixels_realtime'
        crop_counter = 'prop__prop2crop_pixels_realtime'

        # Expression strings stored on the generator; each reads the 'sum' of one counter.
        sum_of = "getCounterValue('{}', 'sum')"
        gen.zrop_pixels_input = sum_of.format(zrop_counter)
        gen.crop_pixels_input = sum_of.format(crop_counter)

        # Appended to whatever the base class already listed.
        gen.required_counters.extend((zrop_counter, crop_counter))
class SmResourceUsageGenerator(tables_common.SmResourceUsageGenerator):
    """SM resource-usage table: one row per resource, one counter per shader category."""

    def __init__(gen):
        """Define the rows and the list of counters those rows reference."""
        super().__init__()
        not_applicable = 'NotApplicable'

        # TODO: this is using single-pass counters; use ordinary counters when available
        # NOTE: the gfx column is a hack, for rows that cannot separately measure VTG vs. PS
        # Columns: resource, total, gfx, vtg, ps, cs
        layout = (
            ('Warps',
             'sm__warps_active',
             not_applicable,
             'tpc__warps_active_shader_vtg',
             'tpc__warps_active_shader_ps',
             'tpc__warps_active_shader_cs'),
            ('Registers',
             'tpc__sm_rf_registers_allocated',
             not_applicable,
             'tpc__sm_rf_registers_allocated_shader_vtg',
             'tpc__sm_rf_registers_allocated_shader_ps',
             'tpc__sm_rf_registers_allocated_shader_cs'),
            ('Attr/ShMem',
             not_applicable,
             not_applicable,
             'tpc__l1tex_mem_shared_data_isbe_bytes_allocated',
             'tpc__l1tex_mem_shared_data_tram_bytes_allocated',
             'tpc__l1tex_mem_shared_data_compute_bytes_allocated'),
            ('CTAs',
             not_applicable,
             not_applicable,
             not_applicable,
             not_applicable,
             'sm__ctas_active'),
        )
        gen.rows = [gen.Row(*cells) for cells in layout]

        # Every real counter named in the table, in row-major order.
        # This replaces (does not extend) any list set by the base class.
        gen.required_counters = [
            cell
            for cells in layout
            for cell in cells[1:]
            if cell != not_applicable
        ]
class RangesSummaryGenerator(tables_common.RangesSummaryGenerator):
    """Summary table with one column per headline counter or unit throughput."""

    def __init__(gen):
        """Define the summary columns and the counters/throughputs their expressions read."""
        super().__init__()

        def counter(name, stat):
            """Expression text reading statistic `stat` of counter `name`."""
            return "getCounterValue('" + name + "', '" + stat + "')"

        def check_if_nonzero(name):
            """Expression text yielding a check-mark entity when the counter's sum is truthy."""
            return counter(name, 'sum') + " ? '&#x2713;' : ''"

        def throughput(unit):
            """Expression text reading the '<unit>__throughput' percentage."""
            return "getThroughputPct('" + unit + "__throughput')"

        # PE% is the maximum over three throughputs.
        primitive_engine = 'Math.max(' + ', '.join(throughput(u) for u in ('vaf', 'vpc', 'pes')) + ')'

        # (header, expression, formatter name); the fourth Col argument is 'ra' for every column.
        layout = (
            ('Duration μs', counter('gpu__time_duration', 'avg'), 'format_avg'),
            ('GR Active%', "getCounterPct('gr__cycles_active', 'avg')", 'format_pct'),
            ('3D?', check_if_nonzero('fe__draw_count'), ''),
            ('Comp?', check_if_nonzero('gr__dispatch_count'), ''),
            ('#WFI', counter('fe__output_ops_cmd_go_idle', 'sum'), 'format_sum'),
            ('#Prims', counter('pda__input_prims', 'sum'), 'format_sum'),
            ('#Pixels-Z', counter('prop__prop2zrop_pixels_realtime', 'sum'), 'format_sum'),
            ('#Pixels-C', counter('prop__prop2crop_pixels_realtime', 'sum'), 'format_sum'),
            ('SM%', throughput('sm'), 'format_pct'),
            ('L1TEX%', throughput('l1tex'), 'format_pct'),
            ('L2%', throughput('lts'), 'format_pct'),
            ('DRAM%', throughput('dram'), 'format_pct'),
            ('PCIe%', throughput('pcie'), 'format_pct'),
            ('PD%', throughput('pda'), 'format_pct'),
            ('PE%', primitive_engine, 'format_pct'),
            ('RSTR%', throughput('raster'), 'format_pct'),
            ('PROP%', throughput('prop'), 'format_pct'),
            ('ZROP%', throughput('zrop'), 'format_pct'),
            ('CROP%', throughput('crop'), 'format_pct'),
        )
        gen.cols = [gen.Col(header, expression, formatter, 'ra') for header, expression, formatter in layout]

        # Counters referenced by the expressions above.
        gen.required_counters = [
            'fe__draw_count',
            'fe__output_ops_cmd_go_idle',
            'gpu__time_duration',
            'gr__cycles_active',
            'gr__dispatch_count',
            'pda__input_prims',
            'prop__prop2crop_pixels_realtime',
            'prop__prop2zrop_pixels_realtime',
        ]
        gen.required_ratios = []
        # Throughputs referenced by the expressions above.
        gen.required_throughputs = [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'prop__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'vpc__throughput',
            'zrop__throughput',
        ]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
import pub.turing.tables_turing as tables_turing
def get_per_range_report_definition():
    """Assemble the per-range report.

    Builds the data sections in display order, renders them with
    tables_common.generate_range_html_common, and returns a ReportDefinition named
    'PerRangeReport' carrying the HTML plus the counters, ratios and throughputs
    collected from the sections.
    """
    def table(generator):
        # Every generator exposes its table through make_data_table().
        return generator.make_data_table()

    sections = []
    sections.append(DataSection([
        table(tables_turing.DevicePropertiesGenerator()),
        table(tables_common.ClocksGenerator()),
    ], inter_table_spacing=False))
    sections.append(DataSection([
        table(tables_common.TopLevelStatsGenerator()),
        table(tables_turing.TopThroughputsGenerator()),
        table(tables_common.CacheHitRates()),
    ], title='Overview Section'))
    sections.append(DataSection([
        table(tables_common.MainMemoryGenerator()),
        table(tables_turing.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True)),
        table(tables_turing.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False)),
        table(tables_common.L1TexThroughputsGenerator()),
        table(tables_common.L1TexTrafficBreakdownGenerator()),
    ], title='Memory Performance Section'))
    sections.append(DataSection([
        table(tables_turing.SmThroughputsGenerator_tu10x()),
        table(tables_turing.SmInstExecutedGenerator()),
        table(tables_common.SmShaderExecutionGenerator()),
        table(tables_turing.SmResourceUsageGenerator()),
        table(tables_common.SmWarpLaunchStallsGenerator()),
        table(tables_common.WarpIssueStallsGenerator()),
    ], title='Shader Performance Section'))
    sections.append(DataSection([
        table(tables_common.PrimitiveDataflowGenerator()),
        table(tables_turing.RasterDataflowGenerator()),
    ], title='3D Pipeline Section'))
    sections.append(DataSection([
        table(tables_turing.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False)),
        table(tables_turing.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False)),
    ], title='Additional L2 Traffic Breakdowns Section'))
    sections.append(DataSection([
        table(tables_common.AdditionalMetricsGenerator()),
        table(tables_common.AllCountersGenerator()),
        table(tables_common.AllRatiosGenerator()),
        table(tables_common.AllThroughputsGenerator()),
    ], title='Exhaustive Listings Section'))

    html = tables_common.generate_range_html_common(sections)
    return ReportDefinition(
        'PerRangeReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
def get_summary_report_definition():
    """Assemble the 'SummaryReport' definition.

    The report consists of two sections: an untitled one holding the
    collection-info table, and 'Summary of Measured Ranges' holding the
    Turing ranges-summary table.

    Returns:
        A ReportDefinition built from the summary HTML plus the counters,
        ratios and throughputs required by those sections.
    """
    collection_section = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_section = DataSection(
        [tables_turing.RangesSummaryGenerator().make_data_table()],
        title='Summary of Measured Ranges',
    )
    sections = [collection_section, ranges_section]

    # Call arguments are evaluated left to right: HTML first, then the
    # counter / ratio / throughput requirements.
    return ReportDefinition(
        'SummaryReport',
        tables_common.generate_summary_html_common(sections),
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )

View File

@@ -0,0 +1,79 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
import pub.turing.tables_turing as tables_turing
def get_per_range_report_definition():
    """Assemble the 'PerRangeReport' definition for the TU11x variant.

    The report is a fixed sequence of DataSection objects (device info,
    overview, memory, shader, 3D pipeline, extra L2 breakdowns, exhaustive
    listings). The SM throughput table uses the TU11x pipe list.

    Returns:
        A ReportDefinition built from the per-range HTML plus the counters,
        ratios and throughputs required by those sections.
    """
    sections = [
        # Untitled header section: device properties and clocks.
        DataSection([
            tables_turing.DevicePropertiesGenerator().make_data_table(),
            tables_common.ClocksGenerator().make_data_table(),
        ], inter_table_spacing=False),
        DataSection([
            tables_common.TopLevelStatsGenerator().make_data_table(),
            tables_turing.TopThroughputsGenerator().make_data_table(),
            tables_common.CacheHitRates().make_data_table(),
        ], title='Overview Section'),
        DataSection([
            tables_common.MainMemoryGenerator().make_data_table(),
            tables_turing.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
            tables_turing.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
            tables_common.L1TexThroughputsGenerator().make_data_table(),
            tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
        ], title='Memory Performance Section'),
        DataSection([
            tables_turing.SmThroughputsGenerator_tu11x().make_data_table(),
            tables_turing.SmInstExecutedGenerator().make_data_table(),
            tables_common.SmShaderExecutionGenerator().make_data_table(),
            tables_turing.SmResourceUsageGenerator().make_data_table(),
            tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
            tables_common.WarpIssueStallsGenerator().make_data_table(),
        ], title='Shader Performance Section'),
        DataSection([
            tables_common.PrimitiveDataflowGenerator().make_data_table(),
            tables_turing.RasterDataflowGenerator().make_data_table(),
        ], title='3D Pipeline Section'),
        # Title kept identical to the sibling Turing report so both variants
        # render the same section heading.
        DataSection([
            tables_turing.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
            tables_turing.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        ], title='Additional L2 Traffic Breakdowns Section'),
        DataSection([
            tables_common.AdditionalMetricsGenerator().make_data_table(),
            tables_common.AllCountersGenerator().make_data_table(),
            tables_common.AllRatiosGenerator().make_data_table(),
            tables_common.AllThroughputsGenerator().make_data_table(),
        ], title='Exhaustive Listings Section'),
    ]
    html = tables_common.generate_range_html_common(sections)
    required_counters = get_required_counters(sections)
    required_ratios = get_required_ratios(sections)
    required_throughputs = get_required_throughputs(sections)
    return ReportDefinition('PerRangeReport', html, required_counters, required_ratios, required_throughputs)
def get_summary_report_definition():
    """Assemble the 'SummaryReport' definition.

    Two sections are produced: an untitled section with the collection-info
    table, followed by 'Summary of Measured Ranges' with the Turing
    ranges-summary table.

    Returns:
        A ReportDefinition built from the summary HTML plus the counters,
        ratios and throughputs required by those sections.
    """
    collection_section = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_section = DataSection(
        [tables_turing.RangesSummaryGenerator().make_data_table()],
        title='Summary of Measured Ranges',
    )
    sections = [collection_section, ranges_section]

    # Call arguments are evaluated left to right: HTML first, then the
    # counter / ratio / throughput requirements.
    return ReportDefinition(
        'SummaryReport',
        tables_common.generate_summary_html_common(sections),
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )

View File

@@ -0,0 +1,610 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
class DevicePropertiesGenerator(tables_common.DevicePropertiesGenerator):
    """Device-properties table generator specialised for Turing.

    After the base initialisation it sets ``l2cacheSizePerLts`` to 128.
    """

    def __init__(gen):
        super().__init__()
        # NOTE(review): the unit is not visible in this file; presumably KiB
        # of L2 per LTS slice -- confirm against tables_common.
        turing_l2_size_per_lts = 128
        gen.l2cacheSizePerLts = turing_l2_size_per_lts
class TopThroughputsGenerator(tables_common.TopThroughputsGenerator):
    """Top-level unit throughput table for Turing.

    Appends one row per hardware unit to the rows prepared by the base class
    and registers the throughput metric each row reads.
    """

    def __init__(gen):
        super().__init__()
        # (group, anchor of the linked detail table, display label, throughput metric)
        unit_table = (
            ('Shader',      'SM-Instruction-Throughput', 'SM (Shader Cores)',  'sm__throughput'),
            ('Memory',      'L1TEX-Throughput',          'L1TEX Cache',        'l1tex__throughput'),
            ('Memory',      'L2-Sector-Traffic',         'L2 Cache',           'lts__throughput'),
            ('Memory',      'Main-Memory-Throughput',    'DRAM',               'dram__throughput'),
            ('Memory',      'Main-Memory-Throughput',    'PCIe',               'pcie__throughput'),
            ('World Pipe',  'Primitive-Data-Flow',       'PDA Index Fetch',    'pda__throughput'),
            ('World Pipe',  'Primitive-Data-Flow',       'Vertex Attr. Fetch', 'vaf__throughput'),
            ('World Pipe',  'Primitive-Data-Flow',       'Primitive Engine',   'pes__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow',          'RASTER',             'raster__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow',          'PROP (Pre-ROP)',     'prop__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow',          'ZROP (Depth-Test)',  'zrop__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow',          'CROP (Color Blend)', 'crop__throughput'),
        )
        gen.rows += [
            gen.Row(
                group,
                '<a href="#' + anchor + '">' + label + '</a>',
                "getThroughputPct('" + metric + "')",
            )
            for group, anchor, label, metric in unit_table
        ]
        # The required metrics are registered in alphabetical order.
        gen.required_throughputs += sorted(metric for _, _, _, metric in unit_table)
class SmThroughputsGenerator_tu10x(tables_common.SmThroughputsGenerator):
    """SM pipe throughput table for TU10x.

    Appends the TU10x pipe list to the pipes set up by the base class, then
    registers every counter name reported by each pipe's get_counter_names().
    """

    def __init__(gen):
        super().__init__()
        Pipe = gen.Pipe
        tu10x_pipes = [
            Pipe('adu', 'Computed branches and indexed constants'),
            Pipe('alu', 'INT32 except multiply; FP32 comparison'),
            Pipe('cbu', 'Divergent branches and control flow'),
            Pipe('fma', 'FP32 mul/add and INT32 multiply'),
            Pipe('fp16', 'FP16 mul/add'),
            Pipe('fp64', 'FP64 mul/add'),
            Pipe('ipa', 'Pixel shader attribute interpolation'),
            Pipe('lsu', 'Global, local, shared memory, and misc'),
            # NOTE(review): the third argument presumably names the
            # cycles-active counter explicitly -- confirm against Pipe.
            Pipe('shared', 'Shared Pipe Dispatch (FP16,Tensor)', 'sm__pipe_shared_cycles_active', hasInstExecuted=False),
            Pipe('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            Pipe('tex', 'Texture and surface memory'),
            Pipe('uniform', 'Warp-level scalar operations'),
            Pipe('xu', 'Transcendentals and float/int conversion'),
        ]
        gen.pipes += tu10x_pipes
        # Covers every pipe now in gen.pipes, not only the ones added above.
        gen.required_counters.extend(
            counter
            for pipe in gen.pipes
            for counter in pipe.get_counter_names()
        )
class SmThroughputsGenerator_tu11x(tables_common.SmThroughputsGenerator):
    """SM pipe throughput table for TU11x.

    Appends the TU11x pipe list to the pipes set up by the base class, the
    same way the TU10x variant does, then registers every counter name
    reported by each pipe's get_counter_names(). Compared with TU10x the
    list has no 'shared' pipe entry.
    """

    def __init__(gen):
        super().__init__()
        # The list is appended to, not reset first: clearing gen.pipes here
        # would silently drop anything the base class registered and would
        # diverge from the TU10x sibling.
        gen.pipes += [
            gen.Pipe('adu'    , 'Computed branches and indexed constants'),
            gen.Pipe('alu'    , 'INT32 except multiply; FP32 comparison'),
            gen.Pipe('cbu'    , 'Divergent branches and control flow'),
            gen.Pipe('fma'    , 'FP32 mul/add and INT32 multiply'),
            gen.Pipe('fp16'   , 'FP16 mul/add'),
            gen.Pipe('fp64'   , 'FP64 mul/add'),
            gen.Pipe('ipa'    , 'Pixel shader attribute interpolation'),
            gen.Pipe('lsu'    , 'Global, local, shared memory, and misc'),
            gen.Pipe('tensor' , 'Tensor matrix multiply (FP16, INT8/4/1)'),
            gen.Pipe('tex'    , 'Texture and surface memory'),
            gen.Pipe('uniform', 'Warp-level scalar operations'),
            gen.Pipe('xu'     , 'Transcendentals and float/int conversion'),
        ]
        # Covers every pipe now in gen.pipes, not only the ones added above.
        for pipe in gen.pipes:
            gen.required_counters.extend(pipe.get_counter_names())
class SmInstExecutedGenerator(tables_common.SmInstExecutedGenerator):
    """Per-pipe executed-instruction table for Turing.

    Replaces the pipe list with the Turing one ('total' first), then
    registers every counter name reported by each pipe's get_counter_names().
    """

    def __init__(gen):
        super().__init__()
        Pipe = gen.Pipe
        # NOTE(review): the meaning of the optional third positional argument
        # (True on some pipes) is defined by the base Pipe class -- not
        # visible here.
        turing_pipes = [
            Pipe('total', 'All instructions', True),
            Pipe('adu', 'Computed branches and indexed constants'),
            Pipe('alu', 'INT32 except multiply; FP32 comparison', True),
            Pipe('cbu', 'Divergent branches and control flow'),
            Pipe('fma', 'FP32 mul/add and INT32 multiply', True),
            Pipe('fp16', 'FP16 mul/add', True),
            Pipe('fp64', 'FP64 mul/add', True),
            Pipe('ipa', 'Pixel shader attribute interpolation', True),
            Pipe('lsu', 'Global, local, shared memory, and misc', True),
            Pipe('tensor', 'Tensor matrix multiply (FP16, INT8/4/1)'),
            Pipe('tex', 'Texture and surface memory'),
            Pipe('uniform', 'Warp-level scalar operations'),
            Pipe('xu', 'Transcendentals and float/int conversion', True),
        ]
        gen.pipes = turing_pipes
        gen.required_counters.extend(
            counter
            for pipe in gen.pipes
            for counter in pipe.get_counter_names()
        )
class L2TrafficByMemoryApertureShortBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Short L2 sector-traffic breakdown: memory aperture -> operation.

    Builds a two-level node tree (aperture, then read / write / atomic /
    reduction) over the ``lts__average_t_sector_aperture_*`` ratios and
    records the ratios the tree needs.

    Args:
        show_generic_workflow: forwarded unchanged to the base generator.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryApertureShort'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture-Short'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth to each destination, per operation. A <a href="#L2-Sector-Traffic-By-Memory-Aperture">more detailed version of this table</a> can be found below.'

        ratio_prefix = 'lts__average_t_sector_aperture_'
        # (display label, metric name fragment); this table lists system
        # memory before peer memory.
        apertures = (
            ('DRAM', 'device'),
            ('System Memory', 'sysmem'),
            ('Peer Memory', 'peer'),
        )
        operations = (
            ('Reads', 'read'),
            ('Writes', 'write'),
            ('Atomics', 'atom'),
            ('Reductions', 'red'),
        )
        gen.nodes = [
            gen.Node(aperture_label, [ratio_prefix + aperture], [
                gen.Node(op_label, [ratio_prefix + aperture + '_op_' + op], [])
                for op_label, op in operations
            ])
            for aperture_label, aperture in apertures
        ]
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficBySrcBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic breakdown: source node -> unit -> aperture -> operation.

    Builds a four-level node tree over the ``lts__average_t_sector_*`` ratios
    and records the ratios the tree needs. Only the L1TEX unit is broken down
    into atomics and reductions; every other unit lists reads and writes only.

    Args:
        show_generic_workflow: forwarded unchanged to the base generator.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownBySource'
        gen.table_id = 'L2-Sector-Traffic-By-Source'
        gen.column_names = [
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth from each source unit, to each destination, per operation. See also: these tables that prioritize <a href="#L2-Sector-Traffic-By-Memory-Aperture">destination Memory Aperture</a> and <a href="#L2-Sector-Traffic-By-Operation">Operation</a>.'

        prefix = 'lts__average_t_sector_'
        # (display label, metric name fragment)
        apertures = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def aperture_nodes(source, operations):
            # One node per aperture for `source`, each with one leaf per operation.
            return [
                gen.Node(aperture_label, [prefix + source + '_aperture_' + aperture], [
                    gen.Node(op_label, [prefix + source + '_aperture_' + aperture + '_op_' + op], [])
                    for op_label, op in operations
                ])
                for aperture_label, aperture in apertures
            ]

        def unit_node(label, source, operations):
            # A unit node carrying its own ratio plus the per-aperture subtree.
            return gen.Node(label, [prefix + source], aperture_nodes(source, operations))

        gen.nodes = [
            gen.Node('GPC Units', [prefix + 'srcnode_gpc'], [
                unit_node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'srcunit_tex', all_ops),
                unit_node('Primitive Engine', 'srcunit_pe', read_write),
                unit_node('other GPC units', 'srcunit_gpcother', read_write),
            ]),
            gen.Node('FBP Units', [prefix + 'srcnode_fbp'], [
                unit_node('ZROP', 'srcunit_zrop', read_write),
                unit_node('CROP', 'srcunit_crop', read_write),
            ]),
            # The two HUB parent nodes carry no ratio of their own. The
            # original note says the aggregate metric is buggy and the sum of
            # the children is used instead.
            # NOTE(review): that note names lts__average_t_sector_srcnode_fbp;
            # presumably ..._srcnode_hub was meant -- confirm.
            gen.Node('HUB Units', [], [
                gen.Node('all HUB Units', [], aperture_nodes('srcnode_hub', read_write)),
            ]),
        ]
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficByMemoryApertureBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Extended L2 sector-traffic breakdown: aperture -> source node -> unit -> operation.

    Builds the same subtree for each of the three apertures (DRAM, peer
    memory, system memory) over the ``lts__average_t_sector_*`` ratios and
    records the ratios the tree needs. Only the L1TEX unit is broken down
    into atomics and reductions; every other unit lists reads and writes only.

    Args:
        show_generic_workflow: forwarded unchanged to the base generator.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryAperture'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Op', 'Op'),
        ]
        gen.workflow += ' This is an extended breakdown of <a href="#L2-Sector-Traffic-By-Memory-Aperture-Short">L2 Traffic by destination</a>. It decomposes L2 bandwidth to each destination, from each source unit, per operation.'

        prefix = 'lts__average_t_sector_'
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def source_node(label, source, aperture, operations):
            # A unit node for (source, aperture) with one leaf per operation.
            return gen.Node(label, [prefix + source + '_aperture_' + aperture], [
                gen.Node(op_label, [prefix + source + '_aperture_' + aperture + '_op_' + op], [])
                for op_label, op in operations
            ])

        def aperture_node(label, aperture):
            # Full GPC / FBP / HUB subtree for one destination aperture.
            return gen.Node(label, [prefix + 'aperture_' + aperture], [
                gen.Node('GPC Units', [prefix + 'srcnode_gpc_aperture_' + aperture], [
                    source_node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'srcunit_tex', aperture, all_ops),
                    source_node('Primitive Engine', 'srcunit_pe', aperture, read_write),
                    source_node('other GPC units', 'srcunit_gpcother', aperture, read_write),
                ]),
                gen.Node('FBP Units', [prefix + 'srcnode_fbp_aperture_' + aperture], [
                    source_node('ZROP', 'srcunit_zrop', aperture, read_write),
                    source_node('CROP', 'srcunit_crop', aperture, read_write),
                ]),
                # The HUB parent and its single child use the same srcnode_hub ratio.
                gen.Node('HUB Units', [prefix + 'srcnode_hub_aperture_' + aperture], [
                    source_node('all HUB Units', 'srcnode_hub', aperture, read_write),
                ]),
            ])

        gen.nodes = [
            aperture_node('DRAM', 'device'),
            aperture_node('Peer Memory', 'peer'),
            aperture_node('System Memory', 'sysmem'),
        ]
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficByOperationBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """L2 sector-traffic breakdown: operation -> aperture -> source node -> unit.

    Builds a four-level node tree over the ``lts__average_t_sector_*`` ratios
    and records the ratios the tree needs. Reads and writes are broken down
    across GPC, FBP and HUB sources; atomics and reductions list only the
    GPC / L1TEX path.

    Args:
        show_generic_workflow: forwarded unchanged to the base generator.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByOperation'
        gen.table_id = 'L2-Sector-Traffic-By-Operation'
        gen.column_names = [
            ('Op', 'Op'),
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth per operation, to each destination, from each source unit.'

        prefix = 'lts__average_t_sector_'
        # (display label, metric name fragment)
        apertures = (
            ('DRAM', 'device'),
            ('Peer Memory', 'peer'),
            ('System Memory', 'sysmem'),
        )

        def leaf(label, ratio):
            # Terminal node: a single ratio and no children.
            return gen.Node(label, [ratio], [])

        def aperture_node(aperture_label, aperture, op, l1tex_only):
            # Subtree for one (operation, aperture) pair.
            tail = 'aperture_' + aperture + '_op_' + op
            gpc_units = [leaf('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', prefix + 'srcunit_tex_' + tail)]
            if not l1tex_only:
                gpc_units.append(leaf('Primitive Engine', prefix + 'srcunit_pe_' + tail))
                gpc_units.append(leaf('other GPC units', prefix + 'srcunit_gpcother_' + tail))
            sources = [gen.Node('GPC Units', [prefix + 'srcnode_gpc_' + tail], gpc_units)]
            if not l1tex_only:
                sources.append(gen.Node('FBP Units', [prefix + 'srcnode_fbp_' + tail], [
                    leaf('ZROP', prefix + 'srcunit_zrop_' + tail),
                    leaf('CROP', prefix + 'srcunit_crop_' + tail),
                ]))
                # The HUB parent and its single child use the same srcnode_hub ratio.
                sources.append(gen.Node('HUB Units', [prefix + 'srcnode_hub_' + tail], [
                    leaf('all HUB Units', prefix + 'srcnode_hub_' + tail),
                ]))
            return gen.Node(aperture_label, [prefix + tail], sources)

        def operation_node(op_label, op, l1tex_only):
            # Top-level node for one operation, with one child per aperture.
            return gen.Node(op_label, [prefix + 'op_' + op], [
                aperture_node(aperture_label, aperture, op, l1tex_only)
                for aperture_label, aperture in apertures
            ])

        gen.nodes = [
            operation_node('Reads', 'read', False),
            operation_node('Writes', 'write', False),
            operation_node('Atomics', 'atom', True),
            operation_node('Reductions', 'red', True),
        ]
        gen.required_ratios = gen.get_required_ratios()
class RasterDataflowGenerator(tables_common.RasterDataflowGenerator):
    """Raster data-flow table that reads ZROP/CROP pixel counts from the
    ``prop__prop2xbar_*_pixels_realtime`` counters.

    After the base initializer runs, this sets the two pixel-input expression
    strings and appends the counters they reference to ``required_counters``.
    """

    def __init__(gen):
        super().__init__()
        zrop_counter = 'prop__prop2xbar_zrop_pixels_realtime'
        crop_counter = 'prop__prop2xbar_crop_pixels_realtime'
        # Both inputs are the 'sum' roll-up of their counter; the template
        # yields e.g. getCounterValue('prop__prop2xbar_zrop_pixels_realtime', 'sum').
        sum_of = "getCounterValue('{}', 'sum')"
        gen.zrop_pixels_input = sum_of.format(zrop_counter)
        gen.crop_pixels_input = sum_of.format(crop_counter)
        # Every counter named in an expression must also be requested.
        gen.required_counters.extend([zrop_counter, crop_counter])
class SmResourceUsageGenerator(tables_common.SmResourceUsageGenerator):
    """SM resource-usage table: one row per resource, one counter per column.

    Replaces ``rows`` and ``required_counters`` after the base initializer
    runs. A cell holding 'NotApplicable' has no counter for that column.
    """

    def __init__(gen):
        super().__init__()
        na = 'NotApplicable'
        # Cell order: resource, total, gfx, vtg, ps, cs
        table = (
            ('Warps',
             'sm__warps_active', na,
             'tpc__warps_active_shader_vtg_realtime',
             'tpc__warps_active_shader_ps_realtime',
             'tpc__warps_active_shader_cs_realtime'),
            ('Registers',
             'tpc__sm_rf_registers_allocated', na,
             'tpc__sm_rf_registers_allocated_shader_vtg',
             'tpc__sm_rf_registers_allocated_shader_ps',
             'tpc__sm_rf_registers_allocated_shader_cs'),
            ('Attr/ShMem',
             na, na,
             'tpc__l1tex_mem_shared_data_isbe_bytes_allocated',
             'tpc__l1tex_mem_shared_data_tram_bytes_allocated',
             'tpc__l1tex_mem_shared_data_compute_bytes_allocated'),
            ('CTAs',
             na, na,
             na,
             na,
             'sm__ctas_active'),
        )
        gen.rows = [gen.Row(*cells) for cells in table]
        # Every real counter referenced by the rows above.
        gen.required_counters = [
            'sm__ctas_active',
            'sm__warps_active',
            'tpc__warps_active_shader_cs_realtime',
            'tpc__warps_active_shader_ps_realtime',
            'tpc__warps_active_shader_vtg_realtime',
            'tpc__sm_rf_registers_allocated',
            'tpc__sm_rf_registers_allocated_shader_vtg',
            'tpc__sm_rf_registers_allocated_shader_ps',
            'tpc__sm_rf_registers_allocated_shader_cs',
            'tpc__l1tex_mem_shared_data_isbe_bytes_allocated',
            'tpc__l1tex_mem_shared_data_tram_bytes_allocated',
            'tpc__l1tex_mem_shared_data_compute_bytes_allocated',
        ]
class RangesSummaryGenerator(tables_common.RangesSummaryGenerator):
    """Per-range summary table: duration, activity flags, and unit throughputs.

    Replaces ``cols`` with the column list below and declares the counters,
    ratios (none) and throughputs that the column expressions reference.
    """

    def __init__(gen):
        super().__init__()
        avg, pct, total, unformatted = 'format_avg', 'format_pct', 'format_sum', ''
        # (column title, value expression, formatter name)
        # NOTE(review): the expressions look like JavaScript evaluated by the
        # generated report (ternaries, Math.max) - confirm against tables_common.
        columns = (
            ('Duration μs', "getCounterValue('gpu__time_duration', 'avg')", avg),
            ('GR Active%', "getCounterPct('gr__cycles_active', 'avg')", pct),
            ('3D?', "getCounterValue('fe__draw_count', 'sum') ? '&#x2713;' : ''", unformatted),
            ('Comp?', "getCounterValue('gr__dispatch_count', 'sum') ? '&#x2713;' : ''", unformatted),
            ('#WFI', "getCounterValue('fe__output_ops_type_bundle_cmd_go_idle', 'sum')", total),
            ('#Prims', "getCounterValue('pda__input_prims', 'sum')", total),
            ('#Pixels-Z', "getCounterValue('prop__prop2xbar_zrop_pixels_realtime', 'sum')", total),
            ('#Pixels-C', "getCounterValue('prop__prop2xbar_crop_pixels_realtime', 'sum')", total),
            ('SM%', "getThroughputPct('sm__throughput')", pct),
            ('L1TEX%', "getThroughputPct('l1tex__throughput')", pct),
            ('L2%', "getThroughputPct('lts__throughput')", pct),
            ('DRAM%', "getThroughputPct('dram__throughput')", pct),
            ('PCIe%', "getThroughputPct('pcie__throughput')", pct),
            ('PD%', "getThroughputPct('pda__throughput')", pct),
            ('PE%', "Math.max(getThroughputPct('vaf__throughput'), getThroughputPct('vpc__throughput'), getThroughputPct('pes__throughput'))", pct),
            ('RSTR%', "getThroughputPct('raster__throughput')", pct),
            ('PROP%', "getThroughputPct('prop__throughput')", pct),
            ('ZROP%', "getThroughputPct('zrop__throughput')", pct),
            ('CROP%', "getThroughputPct('crop__throughput')", pct),
        )
        # Every column passes 'ra' as its last argument.
        gen.cols = [gen.Col(title, expression, formatter, 'ra')
                    for title, expression, formatter in columns]

        # Counters named by getCounterValue / getCounterPct above.
        gen.required_counters = [
            'fe__draw_count',
            'fe__output_ops_type_bundle_cmd_go_idle',
            'gpu__time_duration',
            'gr__cycles_active',
            'gr__dispatch_count',
            'pda__input_prims',
            'prop__prop2xbar_crop_pixels_realtime',
            'prop__prop2xbar_zrop_pixels_realtime',
        ]
        gen.required_ratios = []
        # Throughputs named by getThroughputPct above.
        gen.required_throughputs = [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'prop__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'vpc__throughput',
            'zrop__throughput',
        ]

View File

@@ -0,0 +1,75 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
import pub.volta.tables_gv100 as tables_gv100
def get_per_range_report_definition():
    """Assemble the 'PerRangeReport' definition.

    Builds the data sections in display order, renders them with
    ``tables_common.generate_range_html_common`` and collects the counters,
    ratios and throughputs the sections require.

    Returns:
        ReportDefinition named 'PerRangeReport'.
    """
    # Device properties and clocks, rendered without inter-table spacing.
    device_section = DataSection([
        tables_gv100.DevicePropertiesGenerator().make_data_table(),
        tables_common.ClocksGenerator().make_data_table(),
    ], inter_table_spacing=False)

    overview_section = DataSection([
        tables_common.TopLevelStatsGenerator().make_data_table(),
        tables_gv100.TopThroughputsGenerator().make_data_table(),
        tables_common.CacheHitRates().make_data_table(),
    ], title='Overview Section')

    # Only the first L2 table in this section shows the generic workflow text.
    memory_section = DataSection([
        tables_common.MainMemoryGenerator().make_data_table(),
        tables_gv100.L2TrafficByMemoryApertureShortBreakdownGenerator(show_generic_workflow=True).make_data_table(),
        tables_gv100.L2TrafficBySrcBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_common.L1TexThroughputsGenerator().make_data_table(),
        tables_common.L1TexTrafficBreakdownGenerator().make_data_table(),
    ], title='Memory Performance Section')

    shader_section = DataSection([
        tables_gv100.SmThroughputsGenerator().make_data_table(),
        tables_gv100.SmInstExecutedGenerator().make_data_table(),
        tables_common.SmShaderExecutionGenerator().make_data_table(),
        tables_common.SmWarpLaunchStallsGenerator().make_data_table(),
        tables_common.WarpIssueStallsGenerator().make_data_table(),
    ], title='Shader Performance Section')

    extra_l2_section = DataSection([
        tables_gv100.L2TrafficByMemoryApertureBreakdownGenerator(show_generic_workflow=False).make_data_table(),
        tables_gv100.L2TrafficByOperationBreakdownGenerator(show_generic_workflow=False).make_data_table(),
    ], title='Additional L2 Traffic Breakdowns Section')

    listings_section = DataSection([
        tables_common.AdditionalMetricsGenerator().make_data_table(),
        tables_common.AllCountersGenerator().make_data_table(),
        tables_common.AllRatiosGenerator().make_data_table(),
        tables_common.AllThroughputsGenerator().make_data_table(),
    ], title='Exhaustive Listings Section')

    sections = [
        device_section,
        overview_section,
        memory_section,
        shader_section,
        extra_l2_section,
        listings_section,
    ]
    html = tables_common.generate_range_html_common(sections)
    return ReportDefinition(
        'PerRangeReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )
def get_summary_report_definition():
    """Assemble the 'SummaryReport' definition.

    The report has an untitled collection-info section followed by the
    'Summary of Measured Ranges' table. It is rendered with
    ``tables_common.generate_summary_html_common``.

    Returns:
        ReportDefinition named 'SummaryReport'.
    """
    collection_info = DataSection([
        tables_common.CollectionInfoGenerator().make_data_table(),
    ])
    ranges_summary = DataSection([
        tables_gv100.RangesSummaryGenerator().make_data_table(),
    ], title='Summary of Measured Ranges')

    sections = [collection_info, ranges_summary]
    html = tables_common.generate_summary_html_common(sections)
    return ReportDefinition(
        'SummaryReport',
        html,
        get_required_counters(sections),
        get_required_ratios(sections),
        get_required_throughputs(sections),
    )

View File

@@ -0,0 +1,555 @@
# Copyright 2014-2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from profiler_report_types import *
import pub.tables_common as tables_common
class DevicePropertiesGenerator(tables_common.DevicePropertiesGenerator):
    """Device-properties table with a fixed per-LTS L2 cache size."""

    def __init__(gen):
        super().__init__()
        # NOTE(review): the unit is not visible here (presumably KiB per L2
        # slice) - confirm against tables_common.DevicePropertiesGenerator.
        gen.l2cacheSizePerLts = 96
class TopThroughputsGenerator(tables_common.TopThroughputsGenerator):
    """Top-level throughput table: one row per hardware unit.

    Appends to the ``rows`` and ``required_throughputs`` lists left by the
    base initializer. Each row's label links to another table of the report.
    """

    def __init__(gen):
        super().__init__()
        # (group, target anchor, link text, throughput metric)
        units = (
            ('Shader', 'SM-Instruction-Throughput', 'SM (Shader Cores)', 'sm__throughput'),
            ('Memory', 'L1TEX-Throughput', 'L1TEX Cache', 'l1tex__throughput'),
            ('Memory', 'L2-Sector-Traffic', 'L2 Cache', 'lts__throughput'),
            ('Memory', 'Main-Memory-Throughput', 'DRAM', 'dram__throughput'),
            ('Memory', 'Main-Memory-Throughput', 'PCIe', 'pcie__throughput'),
            ('World Pipe', 'Primitive-Data-Flow', 'PDA Index Fetch', 'pda__throughput'),
            ('World Pipe', 'Primitive-Data-Flow', 'Vertex Attr. Fetch', 'vaf__throughput'),
            ('World Pipe', 'Primitive-Data-Flow', 'Primitive Engine', 'pes__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow', 'RASTER', 'raster__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow', 'ZROP (Depth-Test)', 'zrop__throughput'),
            ('Screen Pipe', 'Raster-Data-Flow', 'CROP (Color Blend)', 'crop__throughput'),
        )
        link = '<a href="#{}">{}</a>'
        percent_of = "getThroughputPct('{}')"
        gen.rows += [
            gen.Row(group, link.format(anchor, text), percent_of.format(metric))
            for group, anchor, text, metric in units
        ]
        # Throughputs referenced by the rows above.
        gen.required_throughputs += [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'zrop__throughput',
        ]
class SmThroughputsGenerator(tables_common.SmThroughputsGenerator):
    """SM pipe throughput table.

    Appends this file's pipe list to ``pipes``, then adds the counter names
    reported by every pipe in ``pipes`` to ``required_counters``.
    """

    def __init__(gen):
        super().__init__()
        make_pipe = gen.Pipe
        # Arguments: pipe name, description, then for some pipes an explicit
        # cycles-active counter name.
        # NOTE(review): when the third argument is omitted the counter is
        # presumably derived from the pipe name - confirm in tables_common.
        gen.pipes += [
            make_pipe('adu', 'Computed branches and indexed constants'),
            make_pipe('alu', 'INT32 except multiply; FP32 comparison', 'sm__pipe_alu_cycles_active'),
            make_pipe('cbu', 'Divergent branches and control flow'),
            make_pipe('fma', 'FP32 mul/add and INT32 multiply', 'sm__pipe_fma_cycles_active'),
            make_pipe('fp16', 'FP16 mul/add'),
            make_pipe('fp64', 'FP64 mul/add'),
            make_pipe('ipa', 'Pixel shader attribute interpolation'),
            make_pipe('lsu', 'Global, local, shared memory, and misc'),
            make_pipe('shared', 'Shared Pipe Dispatch (FP64,FP16,Tensor)', 'sm__pipe_shared_cycles_active',
                      hasInstExecuted=False),
            make_pipe('tensor', 'Tensor matrix multiply (FP16)', 'sm__pipe_tensor_cycles_active'),
            make_pipe('tex', 'Texture and surface memory'),
            make_pipe('xu', 'Transcendentals and float/int conversion'),
        ]
        # Iterates over ALL pipes, including any the base class defined.
        for entry in gen.pipes:
            gen.required_counters.extend(entry.get_counter_names())
class SmInstExecutedGenerator(tables_common.SmInstExecutedGenerator):
    """SM instructions-executed table, one entry per pipe.

    Appends this file's pipe list to ``pipes``, then adds the counter names
    reported by every pipe in ``pipes`` to ``required_counters``.
    """

    def __init__(gen):
        super().__init__()
        # Positional arguments for gen.Pipe: name, description, optional flag.
        # NOTE(review): the meaning of the trailing True is defined by
        # tables_common.SmInstExecutedGenerator.Pipe and is not visible here.
        pipe_specs = (
            ('total', 'All instructions', True),
            ('adu', 'Computed branches and indexed constants'),
            ('alu', 'INT32 except multiply; FP32 comparison', True),
            ('cbu', 'Divergent branches and control flow'),
            ('fma', 'FP32 mul/add and INT32 multiply', True),
            ('fp16', 'FP16 mul/add', True),
            ('fp64', 'FP64 mul/add', True),
            ('ipa', 'Pixel shader attribute interpolation', True),
            ('lsu', 'Global, local, shared memory, and misc', True),
            ('tensor', 'Tensor matrix multiply (FP16)'),
            ('tex', 'Texture and surface memory'),
            ('xu', 'Transcendentals and float/int conversion', True),
        )
        gen.pipes += [gen.Pipe(*spec) for spec in pipe_specs]
        # Iterates over ALL pipes, including any the base class defined.
        for entry in gen.pipes:
            gen.required_counters.extend(entry.get_counter_names())
class L2TrafficByMemoryApertureShortBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Two-level L2 sector-traffic breakdown: memory aperture -> operation.

    Args:
        show_generic_workflow: forwarded unchanged to the base initializer.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryApertureShort'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture-Short'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += (
            ' This table decomposes L2 bandwidth to each destination, per operation.'
            ' A <a href="#L2-Sector-Traffic-By-Memory-Aperture">more detailed version of this table</a>'
            ' can be found below.'
        )

        # (row label, metric-name fragment), in display order.
        apertures = (('DRAM', 'device'), ('System Memory', 'sysmem'), ('Peer Memory', 'peer'))
        operations = (('Reads', 'read'), ('Writes', 'write'), ('Atomics', 'atom'), ('Reductions', 'red'))

        def aperture_node(label, aperture):
            # e.g. lts__average_t_sector_aperture_device, with children
            # lts__average_t_sector_aperture_device_op_read, ..._op_write, ...
            metric = 'lts__average_t_sector_aperture_' + aperture
            return gen.Node(label, [metric], [
                gen.Node(op_label, [metric + '_op_' + op], [])
                for op_label, op in operations
            ])

        gen.nodes = [aperture_node(label, aperture) for label, aperture in apertures]
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficBySrcBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Four-level L2 sector-traffic breakdown:
    source node -> source unit -> memory aperture -> operation.

    Args:
        show_generic_workflow: forwarded unchanged to the base initializer.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownBySource'
        gen.table_id = 'L2-Sector-Traffic-By-Source'
        gen.column_names = [
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Memory Aperture', 'To Memory'),
            ('Op', 'Op'),
        ]
        gen.workflow += (
            ' This table decomposes L2 bandwidth from each source unit, to each destination, per operation.'
            ' See also: these tables that prioritize'
            ' <a href="#L2-Sector-Traffic-By-Memory-Aperture">destination Memory Aperture</a>'
            ' and <a href="#L2-Sector-Traffic-By-Operation">Operation</a>.'
        )

        # (row label, metric-name fragment), in display order.
        apertures = (('DRAM', 'device'), ('Peer Memory', 'peer'), ('System Memory', 'sysmem'))
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        # Only the L1TEX unit also breaks out atomics and reductions.
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def aperture_nodes(prefix, ops):
            # One node per aperture, named <prefix>_aperture_<aperture>, each
            # with one leaf per op named <prefix>_aperture_<aperture>_op_<op>.
            built = []
            for aperture_label, aperture in apertures:
                metric = prefix + '_aperture_' + aperture
                leaves = [gen.Node(op_label, [metric + '_op_' + op], []) for op_label, op in ops]
                built.append(gen.Node(aperture_label, [metric], leaves))
            return built

        def unit_node(label, unit, ops=read_write):
            # e.g. unit 'tex' -> lts__average_t_sector_srcunit_tex[_aperture_*[_op_*]]
            prefix = 'lts__average_t_sector_srcunit_' + unit
            return gen.Node(label, [prefix], aperture_nodes(prefix, ops))

        gen.nodes = [
            gen.Node('GPC Units', ['lts__average_t_sector_srcnode_gpc'], [
                unit_node('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'tex', all_ops),
                unit_node('Primitive Engine', 'pe'),
                unit_node('other GPC units', 'gpcother'),
            ]),
            gen.Node('FBP Units', ['lts__average_t_sector_srcnode_fbp'], [
                unit_node('ZROP', 'zrop'),
                unit_node('CROP', 'crop'),
            ]),
            # Original note on both HUB nodes: "lts__average_t_sector_srcnode_fbp
            # is buggy, use sum of children instead" - hence the empty metric lists.
            # NOTE(review): on HUB nodes this presumably meant ..._srcnode_hub - confirm.
            gen.Node('HUB Units', [], [
                gen.Node('all HUB Units', [],
                         aperture_nodes('lts__average_t_sector_srcnode_hub', read_write)),
            ]),
        ]
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficByMemoryApertureBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Four-level L2 sector-traffic breakdown:
    memory aperture -> source node -> source unit -> operation.

    Args:
        show_generic_workflow: forwarded unchanged to the base initializer.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByMemoryAperture'
        gen.table_id = 'L2-Sector-Traffic-By-Memory-Aperture'
        gen.column_names = [
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
            ('Op', 'Op'),
        ]
        gen.workflow += (
            ' This is an extended breakdown of'
            ' <a href="#L2-Sector-Traffic-By-Memory-Aperture-Short">L2 Traffic by destination</a>.'
            ' It decomposes L2 bandwidth to each destination, from each source unit, per operation.'
        )

        # (row label, metric-name fragment), in display order.
        apertures = (('DRAM', 'device'), ('Peer Memory', 'peer'), ('System Memory', 'sysmem'))
        read_write = (('Reads', 'read'), ('Writes', 'write'))
        # Only the L1TEX unit also breaks out atomics and reductions.
        all_ops = read_write + (('Atomics', 'atom'), ('Reductions', 'red'))

        def aperture_node(aperture_label, aperture):
            def metric(source):
                # e.g. ('srcunit_tex', 'device') ->
                # lts__average_t_sector_srcunit_tex_aperture_device
                return 'lts__average_t_sector_' + source + '_aperture_' + aperture

            def unit(label, source, ops=read_write):
                # Unit node with one <metric>_op_<op> leaf per operation.
                name = metric(source)
                return gen.Node(label, [name], [
                    gen.Node(op_label, [name + '_op_' + op], [])
                    for op_label, op in ops
                ])

            return gen.Node(aperture_label, ['lts__average_t_sector_aperture_' + aperture], [
                gen.Node('GPC Units', [metric('srcnode_gpc')], [
                    unit('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'srcunit_tex', all_ops),
                    unit('Primitive Engine', 'srcunit_pe'),
                    unit('other GPC units', 'srcunit_gpcother'),
                ]),
                gen.Node('FBP Units', [metric('srcnode_fbp')], [
                    unit('ZROP', 'srcunit_zrop'),
                    unit('CROP', 'srcunit_crop'),
                ]),
                # The HUB source has a single child that reuses the node-level metric.
                gen.Node('HUB Units', [metric('srcnode_hub')], [
                    unit('all HUB Units', 'srcnode_hub'),
                ]),
            ])

        gen.nodes = [aperture_node(label, aperture) for label, aperture in apertures]
        gen.required_ratios = gen.get_required_ratios()
class L2TrafficByOperationBreakdownGenerator(tables_common.L2TrafficBreakdownGenerator):
    """Four-level L2 sector-traffic breakdown:
    operation -> memory aperture -> source node -> source unit.

    Args:
        show_generic_workflow: forwarded unchanged to the base initializer.
    """

    def __init__(gen, show_generic_workflow):
        super().__init__(show_generic_workflow=show_generic_workflow)
        gen.name = 'L2SectorTrafficBreakdownByOperation'
        gen.table_id = 'L2-Sector-Traffic-By-Operation'
        gen.column_names = [
            ('Op', 'Op'),
            ('Memory Aperture', 'To Memory'),
            ('Source Breakdown', 'From Source'),
            ('Unit Breakdown', 'From Unit'),
        ]
        gen.workflow += ' This table decomposes L2 bandwidth per operation, to each destination, from each source unit.'

        l1tex = ('<a href="#L1TEX-Sector-Traffic">L1TEX Cache</a>', 'srcunit_tex')
        # (source label, source metric fragment, ((unit label, unit fragment), ...))
        every_source = (
            ('GPC Units', 'srcnode_gpc', (
                l1tex,
                ('Primitive Engine', 'srcunit_pe'),
                ('other GPC units', 'srcunit_gpcother'),
            )),
            ('FBP Units', 'srcnode_fbp', (
                ('ZROP', 'srcunit_zrop'),
                ('CROP', 'srcunit_crop'),
            )),
            # The HUB source has a single child that reuses the node-level metric.
            ('HUB Units', 'srcnode_hub', (
                ('all HUB Units', 'srcnode_hub'),
            )),
        )
        # Atomics and reductions are broken down for GPC -> L1TEX only.
        l1tex_only = (('GPC Units', 'srcnode_gpc', (l1tex,)),)

        # (row label, metric-name fragment[, sources]), in display order.
        apertures = (('DRAM', 'device'), ('Peer Memory', 'peer'), ('System Memory', 'sysmem'))
        operations = (
            ('Reads', 'read', every_source),
            ('Writes', 'write', every_source),
            ('Atomics', 'atom', l1tex_only),
            ('Reductions', 'red', l1tex_only),
        )

        def metric(source, aperture, op):
            # e.g. lts__average_t_sector_srcunit_tex_aperture_device_op_read
            return 'lts__average_t_sector_{}_aperture_{}_op_{}'.format(source, aperture, op)

        gen.nodes = [
            gen.Node(op_label, ['lts__average_t_sector_op_' + op], [
                gen.Node(aperture_label, ['lts__average_t_sector_aperture_{}_op_{}'.format(aperture, op)], [
                    gen.Node(source_label, [metric(source, aperture, op)], [
                        gen.Node(unit_label, [metric(unit, aperture, op)], [])
                        for unit_label, unit in units
                    ])
                    for source_label, source, units in sources
                ])
                for aperture_label, aperture in apertures
            ])
            for op_label, op, sources in operations
        ]
        gen.required_ratios = gen.get_required_ratios()
class RasterDataflowGenerator(tables_common.RasterDataflowGenerator):
    """Raster dataflow generator wired to the PROP realtime pixel counters.

    On top of whatever the ``tables_common`` base class sets up, this
    variant:

    * sets ``zrop_pixels_input`` / ``crop_pixels_input`` to expressions that
      read the summed PROP->ZROP and PROP->CROP realtime pixel counters, and
    * appends those two counters to ``required_counters``.
    """

    def __init__(gen):
        # NOTE: this file names the instance parameter ``gen`` instead of
        # ``self``; zero-argument super() works with either name.
        super().__init__()

        zrop_counter = 'prop__prop2zrop_pixels_realtime'
        crop_counter = 'prop__prop2crop_pixels_realtime'

        # Expression strings stay as whole literals so they remain greppable.
        # NOTE(review): presumably evaluated by the generated report's script
        # layer rather than by Python -- confirm in tables_common.
        gen.zrop_pixels_input = "getCounterValue('prop__prop2zrop_pixels_realtime', 'sum')"
        gen.crop_pixels_input = "getCounterValue('prop__prop2crop_pixels_realtime', 'sum')"

        # Register the counters referenced by the two expressions above.
        gen.required_counters.extend([zrop_counter, crop_counter])
class RangesSummaryGenerator(tables_common.RangesSummaryGenerator):
    """Ranges summary table: one column per headline metric.

    Populates four attributes on the generator:

    * ``cols`` -- the ordered column definitions (built with ``gen.Col``),
    * ``required_counters`` -- every counter named in a ``getCounterValue`` /
      ``getCounterPct`` column expression,
    * ``required_ratios`` -- empty; no column here uses a ratio,
    * ``required_throughputs`` -- every throughput named in a
      ``getThroughputPct`` column expression.

    When adding or removing a column, keep the ``required_*`` lists in sync
    with the expressions.
    """

    def __init__(gen):
        super().__init__()

        # Each row is the positional argument list for gen.Col:
        #   (header text, value expression, formatter name, fourth argument)
        # NOTE(review): the fourth argument 'ra' is presumably a right-align
        # cell class, and the expressions look like JavaScript (ternary,
        # Math.max) -- confirm both against tables_common.
        column_specs = (
            ('Duration μs', "getCounterValue('gpu__time_duration', 'avg')", 'format_avg', 'ra'),
            ('GR Active%', "getCounterPct('gr__cycles_active', 'avg')", 'format_pct', 'ra'),
            # Check-mark columns: the cell shows a tick when the count is non-zero.
            ('3D?', "getCounterValue('fe__draw_count', 'sum') ? '&#x2713;' : ''", '', 'ra'),
            ('Comp?', "getCounterValue('gr__dispatch_count', 'sum') ? '&#x2713;' : ''", '', 'ra'),
            ('#WFI', "getCounterValue('fe__output_ops_type_bundle_cmd_go_idle', 'sum')", 'format_sum', 'ra'),
            ('#Prims', "getCounterValue('pda__input_prims', 'sum')", 'format_sum', 'ra'),
            ('SM%', "getThroughputPct('sm__throughput')", 'format_pct', 'ra'),
            ('L1TEX%', "getThroughputPct('l1tex__throughput')", 'format_pct', 'ra'),
            ('L2%', "getThroughputPct('lts__throughput')", 'format_pct', 'ra'),
            ('DRAM%', "getThroughputPct('dram__throughput')", 'format_pct', 'ra'),
            ('PCIe%', "getThroughputPct('pcie__throughput')", 'format_pct', 'ra'),
            ('PD%', "getThroughputPct('pda__throughput')", 'format_pct', 'ra'),
            # PE% reports the largest of the VAF, VPC and PES throughputs.
            ('PE%', "Math.max(getThroughputPct('vaf__throughput'), getThroughputPct('vpc__throughput'), getThroughputPct('pes__throughput'))", 'format_pct', 'ra'),
            ('RSTR%', "getThroughputPct('raster__throughput')", 'format_pct', 'ra'),
            ('ZROP%', "getThroughputPct('zrop__throughput')", 'format_pct', 'ra'),
            ('CROP%', "getThroughputPct('crop__throughput')", 'format_pct', 'ra'),
        )
        gen.cols = [gen.Col(*spec) for spec in column_specs]

        # Counters read by the getCounterValue / getCounterPct expressions.
        counters = [
            'fe__draw_count',
            'fe__output_ops_type_bundle_cmd_go_idle',
            'gpu__time_duration',
            'gr__cycles_active',
            'gr__dispatch_count',
            'pda__input_prims',
        ]
        gen.required_counters = counters

        # No ratio metrics are used by this table.
        gen.required_ratios = []

        # Throughputs read by the getThroughputPct expressions.
        throughputs = [
            'crop__throughput',
            'dram__throughput',
            'l1tex__throughput',
            'lts__throughput',
            'pcie__throughput',
            'pda__throughput',
            'pes__throughput',
            'raster__throughput',
            'sm__throughput',
            'vaf__throughput',
            'vpc__throughput',
            'zrop__throughput',
        ]
        gen.required_throughputs = throughputs

View File

@@ -0,0 +1,11 @@
This is an offline tool that generates the C++ "report definition" header.
Example command:
```
python3 profiler_report_generator.py --chip ga10x --outDir=PATH/TO/YOUR/OUTPUT_DIR --pypath pub/ampere
```
* This has been tested with Python 3.5.2
* Please use "profiler_report_generator.py --help" for more details
Pre-generated versions of the header files have been deployed to both the "/gen" sub-directory and the "NvPerfUtility" directory.