Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 40 additions & 23 deletions cmd/telemetry/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ package telemetry
import (
"fmt"
"log/slog"
"regexp"
"os"
"slices"
"strconv"
"strings"
Expand Down Expand Up @@ -61,12 +61,10 @@ var (
flagPower bool
flagTemperature bool
flagInstrMix bool
flagGaudi bool

flagNoSystemSummary bool

flagInstrMixPid int
flagInstrMixFilter []string
flagInstrMixFrequency int
)

Expand All @@ -87,12 +85,10 @@ const (
flagPowerName = "power"
flagTemperatureName = "temperature"
flagInstrMixName = "instrmix"
flagGaudiName = "gaudi"

flagNoSystemSummaryName = "no-summary"

flagInstrMixPidName = "instrmix-pid"
flagInstrMixFilterName = "instrmix-filter"
flagInstrMixFrequencyName = "instrmix-frequency"
)

Expand All @@ -110,9 +106,13 @@ var categories = []common.Category{
{FlagName: flagStorageName, FlagVar: &flagStorage, DefaultValue: false, Help: "monitor storage", TableNames: []string{report.DriveTelemetryTableName}},
{FlagName: flagIRQRateName, FlagVar: &flagIRQRate, DefaultValue: false, Help: "monitor IRQ rate", TableNames: []string{report.IRQRateTelemetryTableName}},
{FlagName: flagInstrMixName, FlagVar: &flagInstrMix, DefaultValue: false, Help: "monitor instruction mix", TableNames: []string{report.InstructionTelemetryTableName}},
{FlagName: flagGaudiName, FlagVar: &flagGaudi, DefaultValue: false, Help: "monitor gaudi", TableNames: []string{report.GaudiTelemetryTableName}},
}

// Default values for the instruction mix sampling frequency flag, expressed
// as the number of instructions between samples.
const (
	// instrmixFrequencyDefaultSystemWide is the flag's registered default and
	// applies when collecting system wide (no PID given).
	instrmixFrequencyDefaultSystemWide = 10_000_000
	// instrmixFrequencyDefaultPerPID is substituted when a specific PID is
	// monitored and the frequency flag was not set explicitly.
	instrmixFrequencyDefaultPerPID = 100_000
)

func init() {
// set up config category flags
for _, cat := range categories {
Expand All @@ -124,8 +124,7 @@ func init() {
Cmd.Flags().IntVar(&flagDuration, flagDurationName, 30, "")
Cmd.Flags().IntVar(&flagInterval, flagIntervalName, 2, "")
Cmd.Flags().IntVar(&flagInstrMixPid, flagInstrMixPidName, 0, "")
Cmd.Flags().StringSliceVar(&flagInstrMixFilter, flagInstrMixFilterName, []string{"SSE", "AVX", "AVX2", "AVX512", "AMX_TILE"}, "")
Cmd.Flags().IntVar(&flagInstrMixFrequency, flagInstrMixFrequencyName, 10000000, "") // 10 million
Cmd.Flags().IntVar(&flagInstrMixFrequency, flagInstrMixFrequencyName, instrmixFrequencyDefaultSystemWide, "")
Cmd.Flags().BoolVar(&flagNoSystemSummary, flagNoSystemSummaryName, false, "")

common.AddTargetFlags(Cmd)
Expand Down Expand Up @@ -193,13 +192,9 @@ func getFlagGroups() []common.FlagGroup {
Name: flagInstrMixPidName,
Help: "PID to monitor for instruction mix, no PID means all processes",
},
{
Name: flagInstrMixFilterName,
Help: "filter to apply to instruction mix",
},
{
Name: flagInstrMixFrequencyName,
Help: "number of instructions between samples when no PID specified",
Help: "number of instructions between samples, default is 10,000,000 when collecting system wide and 100,000 when collecting for a specific PID",
},
{
Name: flagNoSystemSummaryName,
Expand Down Expand Up @@ -259,16 +254,12 @@ func validateFlags(cmd *cobra.Command, args []string) error {
if flagDuration == 0 && (target != "" || targets != "") {
return common.FlagValidationError(cmd, "duration must be greater than 0 when collecting from a remote target")
}
if cmd.Flags().Lookup(flagInstrMixFilterName).Changed {
re := regexp.MustCompile("^[A-Z0-9_]+$")
for _, filter := range flagInstrMixFilter {
if !re.MatchString(filter) {
return common.FlagValidationError(cmd, fmt.Sprintf("invalid filter: %s, must be uppercase letters, numbers, and underscores", filter))
}
}
}
if flagInstrMixFrequency < 100000 { // 100,000 instructions is the minimum frequency
return common.FlagValidationError(cmd, "instruction mix frequency must be 100,000 or greater")
return common.FlagValidationError(cmd, "instruction mix frequency must be 100,000 or greater to limit overhead")
}
// warn if instruction mix frequency is low when collecting system wide
if flagInstrMix && flagInstrMixPid == 0 && flagInstrMixFrequency < instrmixFrequencyDefaultSystemWide {
slog.Warn("instruction mix frequency is set to a value lower than default for system wide collection, consider using a higher frequency to limit collection overhead", slog.Int("frequency", flagInstrMixFrequency))
}
// common target flags
if err := common.ValidateTargetFlags(cmd); err != nil {
Expand All @@ -289,6 +280,28 @@ func runCmd(cmd *cobra.Command, args []string) error {
tableNames = append(tableNames, cat.TableNames...)
}
}
// confirm proper default for instrmix frequency
if flagInstrMix {
if flagInstrMixPid != 0 && !cmd.Flags().Changed(flagInstrMixFrequencyName) {
// per-PID collection and frequency not changed, set to per-PID default
flagInstrMixFrequency = instrmixFrequencyDefaultPerPID
}
}
// hidden feature - Gaudi telemetry, only enabled when PERFSPECT_GAUDI_HLSMI_PATH is set
gaudiHlsmiPath := os.Getenv("PERFSPECT_GAUDI_HLSMI_PATH") // must be full path to hlsmi binary
if gaudiHlsmiPath != "" {
slog.Info("Gaudi telemetry enabled", slog.String("hlsmi_path", gaudiHlsmiPath))
tableNames = append(tableNames, report.GaudiTelemetryTableName)
}
// hidden feature - PDU telemetry, only enabled when four environment variables are set
pduHost := os.Getenv("PERFSPECT_PDU_HOST")
pduUser := os.Getenv("PERFSPECT_PDU_USER")
pduPassword := os.Getenv("PERFSPECT_PDU_PASSWORD")
pduOutlet := os.Getenv("PERFSPECT_PDU_OUTLET")
if pduHost != "" && pduUser != "" && pduPassword != "" && pduOutlet != "" {
slog.Info("PDU telemetry enabled", slog.String("host", pduHost), slog.String("outlet", pduOutlet))
tableNames = append(tableNames, report.PDUTelemetryTableName)
}
// include telemetry summary table if all telemetry options are selected
var summaryFunc common.SummaryFunc
if flagAll {
Expand All @@ -306,8 +319,12 @@ func runCmd(cmd *cobra.Command, args []string) error {
"Interval": strconv.Itoa(flagInterval),
"Duration": strconv.Itoa(flagDuration),
"InstrMixPID": strconv.Itoa(flagInstrMixPid),
"InstrMixFilter": strings.Join(flagInstrMixFilter, " "),
"InstrMixFrequency": strconv.Itoa(flagInstrMixFrequency),
"GaudiHlsmiPath": gaudiHlsmiPath,
"PDUHost": pduHost,
"PDUUser": pduUser,
"PDUPassword": pduPassword,
"PDUOutlet": pduOutlet,
},
TableNames: tableNames,
SummaryFunc: summaryFunc,
Expand Down
107 changes: 86 additions & 21 deletions internal/report/render_html.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ const datasetTemplate = `
backgroundColor: '{{.Color}}',
borderColor: '{{.Color}}',
borderWidth: 1,
showLine: true
showLine: true,
hidden: {{.Hidden}}
}
`
const lineChartTemplate = `<div class="chart-container" style="max-width: 900px">
Expand Down Expand Up @@ -715,20 +716,27 @@ func dimmTableHTMLRenderer(tableValues TableValues, targetName string) string {
return renderHTMLTable(socketTableHeaders, socketTableValues, "pure-table pure-table-bordered", [][]string{})
}

func renderChart(chartType string, allFormattedPoints []string, datasetNames []string, xAxisLabels []string, config chartTemplateStruct) string {
func renderChart(chartType string, allFormattedPoints []string, datasetNames []string, xAxisLabels []string, config chartTemplateStruct, datasetHiddenFlags []bool) string {
datasets := []string{}
for dataIdx, formattedPoints := range allFormattedPoints {
specValues := formattedPoints
dst := texttemplate.Must(texttemplate.New("datasetTemplate").Parse(datasetTemplate))
buf := new(bytes.Buffer)
// determine hidden flag for this dataset
hidden := "false"
if datasetHiddenFlags != nil && dataIdx < len(datasetHiddenFlags) && datasetHiddenFlags[dataIdx] {
hidden = "true"
}
err := dst.Execute(buf, struct {
Label string
Data string
Color string
Label string
Data string
Color string
Hidden string
}{
Label: datasetNames[dataIdx],
Data: specValues,
Color: getColor(dataIdx),
Label: datasetNames[dataIdx],
Data: specValues,
Color: getColor(dataIdx),
Hidden: hidden,
})
if err != nil {
slog.Error("error executing template", slog.String("error", err.Error()))
Expand Down Expand Up @@ -781,10 +789,10 @@ func renderScatterChart(data [][]scatterPoint, datasetNames []string, config cha
}
allFormattedPoints = append(allFormattedPoints, strings.Join(formattedPoints, ","))
}
return renderChart("scatter", allFormattedPoints, datasetNames, nil, config)
return renderChart("scatter", allFormattedPoints, datasetNames, nil, config, nil)
}

func renderLineChart(xAxisLabels []string, data [][]float64, datasetNames []string, config chartTemplateStruct) string {
func renderLineChart(xAxisLabels []string, data [][]float64, datasetNames []string, config chartTemplateStruct, datasetHiddenFlags []bool) string {
allFormattedPoints := []string{}
for dataIdx := range data {
formattedPoints := []string{}
Expand All @@ -793,7 +801,7 @@ func renderLineChart(xAxisLabels []string, data [][]float64, datasetNames []stri
}
allFormattedPoints = append(allFormattedPoints, strings.Join(formattedPoints, ","))
}
return renderChart("line", allFormattedPoints, datasetNames, xAxisLabels, config)
return renderChart("line", allFormattedPoints, datasetNames, xAxisLabels, config, datasetHiddenFlags)
}

func renderFrequencyTable(tableValues TableValues) (out string) {
Expand Down Expand Up @@ -908,7 +916,7 @@ func telemetryTableHTMLRenderer(tableValues TableValues, data [][]float64, datas
timestamps = append(timestamps, timestamp)
}
}
return renderLineChart(timestamps, data, datasetNames, chartConfig)
return renderLineChart(timestamps, data, datasetNames, chartConfig, nil)
}

func cpuUtilizationTelemetryTableHTMLRenderer(tableValues TableValues, targetName string) string {
Expand Down Expand Up @@ -1361,13 +1369,22 @@ func c6TelemetryTableHTMLRenderer(tableValues TableValues, targetName string) st
return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig)
}

// instructionTelemetryTableHTMLRenderer renders instruction set usage statistics
// Each category is a separate dataset within the chart.
// Categories with zero total usage are hidden by default.
func instructionTelemetryTableHTMLRenderer(tableValues TableValues, targetname string) string {
data := [][]float64{}
datasetNames := []string{}
for _, field := range tableValues.Fields[1:] {
// Collect entries with their sums so we can sort per requirements
type instrEntry struct {
name string
points []float64
sum float64
}
entries := []instrEntry{}
for _, field := range tableValues.Fields[1:] { // skip timestamp field
points := []float64{}
sum := 0.0
for _, val := range field.Values {
if val == "" {
if val == "" { // end of data for this category
break
}
stat, err := strconv.ParseFloat(val, 64)
Expand All @@ -1376,24 +1393,55 @@ func instructionTelemetryTableHTMLRenderer(tableValues TableValues, targetname s
return ""
}
points = append(points, stat)
sum += stat
}
if len(points) > 0 {
data = append(data, points)
datasetNames = append(datasetNames, field.Name)
if len(points) > 0 { // only include categories with at least one point
entries = append(entries, instrEntry{name: field.Name, points: points, sum: sum})
}
}
// Partition into non-zero and zero-sum groups
nonZero := []instrEntry{}
zero := []instrEntry{}
for _, e := range entries {
if e.sum > 0 {
nonZero = append(nonZero, e)
} else {
zero = append(zero, e)
}
}
sort.Slice(nonZero, func(i, j int) bool { return nonZero[i].name < nonZero[j].name })
sort.Slice(zero, func(i, j int) bool { return zero[i].name < zero[j].name })
ordered := append(nonZero, zero...)
data := make([][]float64, 0, len(ordered))
datasetNames := make([]string, 0, len(ordered))
hiddenFlags := make([]bool, 0, len(ordered))
for _, e := range ordered {
data = append(data, e.points)
datasetNames = append(datasetNames, e.name)
// hide zero-sum categories by default
hiddenFlags = append(hiddenFlags, e.sum == 0)
}
chartConfig := chartTemplateStruct{
ID: fmt.Sprintf("%s%d", tableValues.Name, util.RandUint(10000)),
XaxisText: "Time",
YaxisText: "% Samples",
TitleText: "",
DisplayTitle: "false",
DisplayLegend: "true",
AspectRatio: "2",
AspectRatio: "1", // extra tall due to large number of data sets
SuggestedMin: "0",
SuggestedMax: "0",
}
return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig)
// render directly using renderLineChart to supply hidden flags
tsFieldIdx := 0
var timestamps []string
for i := range tableValues.Fields[0].Values {
timestamp := tableValues.Fields[tsFieldIdx].Values[i]
if !slices.Contains(timestamps, timestamp) {
timestamps = append(timestamps, timestamp)
}
}
return renderLineChart(timestamps, data, datasetNames, chartConfig, hiddenFlags)
}

func renderGaudiStatsChart(tableValues TableValues, chartStatFieldName string, titleText string, yAxisText string, suggestedMax string) string {
Expand Down Expand Up @@ -1463,6 +1511,23 @@ func gaudiTelemetryTableHTMLRenderer(tableValues TableValues, targetName string)
return out
}

// pduTelemetryTableHTMLRenderer returns the HTML produced by
// telemetryTableHTMLRenderer for the PDU telemetry table.
//
// No datasets are assembled here: empty data and dataset-name slices are
// handed to the shared renderer together with a generic "Time" vs "Value"
// chart configuration. targetName is accepted to match the renderer signature
// but is not used.
func pduTelemetryTableHTMLRenderer(tableValues TableValues, targetName string) string {
	// Chart element ID: table name followed by a value from util.RandUint(10000).
	chartID := fmt.Sprintf("%s%d", tableValues.Name, util.RandUint(10000))
	var (
		series      = [][]float64{}
		seriesNames = []string{}
	)
	return telemetryTableHTMLRenderer(tableValues, series, seriesNames, chartTemplateStruct{
		ID:            chartID,
		XaxisText:     "Time",
		YaxisText:     "Value",
		TitleText:     "",
		DisplayTitle:  "false",
		DisplayLegend: "true",
		AspectRatio:   "2",
		SuggestedMin:  "0",
		SuggestedMax:  "0",
	})
}

func callStackFrequencyTableHTMLRenderer(tableValues TableValues, targetName string) string {
out := `<style>

Expand Down
15 changes: 15 additions & 0 deletions internal/report/table_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ const (
PowerTelemetryTableName = "Power Telemetry"
TemperatureTelemetryTableName = "Temperature Telemetry"
GaudiTelemetryTableName = "Gaudi Telemetry"
PDUTelemetryTableName = "PDU Telemetry"
// config table names
ConfigurationTableName = "Configuration"
// flamegraph table names
Expand All @@ -156,6 +157,7 @@ const (
PowerTelemetryMenuLabel = "Power"
TemperatureTelemetryMenuLabel = "Temperature"
GaudiTelemetryMenuLabel = "Gaudi"
PDUTelemetryMenuLabel = "PDU"
)

const (
Expand Down Expand Up @@ -790,6 +792,15 @@ var tableDefinitions = map[string]TableDefinition{
NoDataFound: "No Gaudi telemetry found. Gaudi devices and the hl-smi tool must be installed on the target system to collect Gaudi stats.",
FieldsFunc: gaudiTelemetryTableValues,
HTMLTableRendererFunc: gaudiTelemetryTableHTMLRenderer},
PDUTelemetryTableName: {
Name: PDUTelemetryTableName,
MenuLabel: PDUTelemetryMenuLabel,
HasRows: true,
ScriptNames: []string{
script.PDUTelemetryScriptName,
},
FieldsFunc: pduTelemetryTableValues,
HTMLTableRendererFunc: pduTelemetryTableHTMLRenderer},
//
// flamegraph tables
//
Expand Down Expand Up @@ -2756,6 +2767,10 @@ func gaudiTelemetryTableValues(outputs map[string]script.ScriptOutput) []Field {
return fields
}

// pduTelemetryTableValues is the FieldsFunc for the PDU telemetry table.
// It does not read outputs and always returns an empty, non-nil slice of fields.
func pduTelemetryTableValues(outputs map[string]script.ScriptOutput) []Field {
	fields := make([]Field, 0)
	return fields
}

func callStackFrequencyTableValues(outputs map[string]script.ScriptOutput) []Field {
fields := []Field{
{Name: "Native Stacks", Values: []string{nativeFoldedFromOutput(outputs)}},
Expand Down
Loading