Skip to content

Commit 5fc187e

Browse files
authored
run all scripts with controller (#596)
* run all scripts with controller Signed-off-by: Harper, Jason M <[email protected]> * clean up errors and logs Signed-off-by: Harper, Jason M <[email protected]> * update turbostat tests to expect errors for empty output scenarios Signed-off-by: Harper, Jason M <[email protected]> * no sudo if user is root Signed-off-by: Harper, Jason M <[email protected]> * restore option to stop running sequential scripts on failure Signed-off-by: Harper, Jason M <[email protected]> --------- Signed-off-by: Harper, Jason M <[email protected]>
1 parent c4c7f11 commit 5fc187e

File tree

11 files changed

+335
-274
lines changed

11 files changed

+335
-274
lines changed

cmd/config/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ func runCmd(cmd *cobra.Command, args []string) error {
221221
// prepareTarget prepares the target for configuration changes
222222
// almost all set scripts require the msr kernel module to be loaded and
223223
// use wrmsr and rdmsr, so we do that here so that the goroutines for the
224-
// set scripts can run in parallel without conflicts
224+
// set scripts can run concurrently without conflicts
225225
func prepareTarget(myTarget target.Target, localTempDir string) (err error) {
226226
prepareScript := script.ScriptDefinition{
227227
Name: "prepare-target",

cmd/flamegraph/flamegraph_tables.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ func callStackFrequencyTableValues(outputs map[string]script.ScriptOutput) []tab
4141
}
4242

4343
func javaFoldedFromOutput(outputs map[string]script.ScriptOutput) string {
44+
if outputs[script.CollapsedCallStacksScriptName].Stdout == "" {
45+
slog.Warn("collapsed call stack output is empty")
46+
return ""
47+
}
4448
sections := common.GetSectionsFromOutput(outputs[script.CollapsedCallStacksScriptName].Stdout)
4549
if len(sections) == 0 {
4650
slog.Warn("no sections in collapsed call stack output")
@@ -79,6 +83,10 @@ func javaFoldedFromOutput(outputs map[string]script.ScriptOutput) string {
7983
}
8084

8185
func nativeFoldedFromOutput(outputs map[string]script.ScriptOutput) string {
86+
if outputs[script.CollapsedCallStacksScriptName].Stdout == "" {
87+
slog.Warn("collapsed call stack output is empty")
88+
return ""
89+
}
8290
sections := common.GetSectionsFromOutput(outputs[script.CollapsedCallStacksScriptName].Stdout)
8391
if len(sections) == 0 {
8492
slog.Warn("no sections in collapsed call stack output")
@@ -104,6 +112,10 @@ func nativeFoldedFromOutput(outputs map[string]script.ScriptOutput) string {
104112
}
105113

106114
func maxRenderDepthFromOutput(outputs map[string]script.ScriptOutput) string {
115+
if outputs[script.CollapsedCallStacksScriptName].Stdout == "" {
116+
slog.Warn("collapsed call stack output is empty")
117+
return ""
118+
}
107119
sections := common.GetSectionsFromOutput(outputs[script.CollapsedCallStacksScriptName].Stdout)
108120
if len(sections) == 0 {
109121
slog.Warn("no sections in collapsed call stack output")

cmd/metrics/metadata.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ func (c *X86MetadataCollector) CollectMetadata(t target.Target, noRoot bool, noS
157157
if err != nil {
158158
return Metadata{}, fmt.Errorf("failed to get number of general purpose counters: %v", err)
159159
}
160-
// the rest of the metadata is retrieved by running scripts in parallel
160+
// the rest of the metadata is retrieved by running scripts concurrently
161161
metadataScripts, err := getMetadataScripts(noRoot, noSystemSummary, metadata.NumGeneralPurposeCounters)
162162
if err != nil {
163163
return Metadata{}, fmt.Errorf("failed to get metadata scripts: %v", err)
@@ -336,7 +336,7 @@ func (c *ARMMetadataCollector) CollectMetadata(t target.Target, noRoot bool, noS
336336
if err != nil {
337337
return Metadata{}, fmt.Errorf("failed to get number of general purpose counters: %v", err)
338338
}
339-
// the rest of the metadata is retrieved by running scripts in parallel and then parsing the output
339+
// the rest of the metadata is retrieved by running scripts concurrently and then parsing the output
340340
metadataScripts, err := getMetadataScripts(noRoot, noSystemSummary, metadata.NumGeneralPurposeCounters)
341341
if err != nil {
342342
return Metadata{}, fmt.Errorf("failed to get metadata scripts: %v", err)
@@ -393,7 +393,7 @@ func (c *ARMMetadataCollector) CollectMetadata(t target.Target, noRoot bool, noS
393393
}
394394

395395
func getMetadataScripts(noRoot bool, noSystemSummary bool, numGPCounters int) (metadataScripts []script.ScriptDefinition, err error) {
396-
// reduce startup time by running the metadata scripts in parallel
396+
// reduce startup time by running the metadata scripts concurrently
397397
metadataScriptDefs := []script.ScriptDefinition{
398398
{
399399
Name: "get architecture",

cmd/telemetry/telemetry_tables.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,9 +609,13 @@ func instructionTelemetryTableValues(outputs map[string]script.ScriptOutput) []t
609609
// first two lines are not part of the CSV output, they are the start time and interval
610610
var startTime time.Time
611611
var interval int
612+
if len(outputs[script.InstructionTelemetryScriptName].Stdout) == 0 {
613+
slog.Warn("instruction mix output is empty")
614+
return []table.Field{}
615+
}
612616
lines := strings.Split(outputs[script.InstructionTelemetryScriptName].Stdout, "\n")
613617
if len(lines) < 4 {
614-
slog.Warn("no data found in instruction mix output")
618+
slog.Warn("no data lines found in instruction mix output")
615619
return []table.Field{}
616620
}
617621
// TIME

internal/common/common.go

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -288,16 +288,31 @@ func (rc *ReportingCommand) Run() error {
288288
return nil
289289
}
290290

291+
func signalProcessOnTarget(t target.Target, pidStr string, sigStr string) error {
292+
var cmd *exec.Cmd
293+
// prepend "-" to the signal string if not already present
294+
if !strings.HasPrefix(sigStr, "-") {
295+
sigStr = "-" + sigStr
296+
}
297+
if !t.IsSuperUser() && t.CanElevatePrivileges() {
298+
cmd = exec.Command("sudo", "kill", sigStr, pidStr)
299+
} else {
300+
cmd = exec.Command("kill", sigStr, pidStr)
301+
}
302+
_, _, _, err := t.RunCommandEx(cmd, 5, false, true) // #nosec G204
303+
return err
304+
}
305+
291306
// configureSignalHandler sets up a signal handler to catch SIGINT and SIGTERM
292307
//
293308
// When perfspect receives ctrl-c while in the shell, the shell propagates the
294309
// signal to all our children. But when perfspect is run in the background or disowned and
295310
// then receives SIGINT, e.g., from a script, we need to send the signal to our children
296311
//
297-
// Also, when running scripts in parallel using the parallel_master.sh script, we need to
298-
// send the signal to the parallel_master.sh script on each target so that it can clean up
299-
// its child processes. This is because the parallel_master.sh script is run in its own process group
300-
// and does not receive the signal when perfspect receives it.
312+
// When running scripts using the controller.sh script, we need to send the signal to the
313+
// controller.sh script on each target so that it can clean up its child processes. This is
314+
// because the controller.sh script is run in its own process group and does not receive the
315+
// signal when perfspect receives it.
301316
//
302317
// Parameters:
303318
// - myTargets: The list of targets to send the signal to.
@@ -308,48 +323,38 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
308323
go func() {
309324
sig := <-sigChannel
310325
slog.Debug("received signal", slog.String("signal", sig.String()))
311-
// Scripts that are run in parallel using the parallel_master.sh script and a few other sequential scripts need to be handled specially
312-
// because they are run in their own process group, we need to send the signal directly to the PID of the script.
313-
// For every target, look for the primary_collection_script PID file and send SIGINT to it.
326+
// The controller.sh script is run in its own process group, so we need to send the signal
327+
// directly to the PID of the controller. For every target, look for the primary_collection_script
328+
// PID file and send SIGINT to it.
329+
// The controller script is run in its own process group, so we need to send the signal
330+
// directly to the PID of the controller. For every target, look for the controller
331+
// PID file and send SIGINT to it.
314332
for _, t := range myTargets {
315333
if statusFunc != nil {
316334
_ = statusFunc(t.GetName(), "Signal received, cleaning up...")
317335
}
318-
pidFilePath := filepath.Join(t.GetTempDirectory(), "primary_collection_script.pid")
336+
pidFilePath := filepath.Join(t.GetTempDirectory(), script.ControllerPIDFileName)
319337
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
320338
if err != nil {
321-
slog.Error("error retrieving target primary_collection_script PID", slog.String("target", t.GetName()), slog.String("error", err.Error()))
339+
slog.Error("error retrieving target controller PID", slog.String("target", t.GetName()), slog.String("error", err.Error()))
322340
}
323341
if exitcode == 0 {
324342
pidStr := strings.TrimSpace(stdout)
325-
_, _, _, err := t.RunCommandEx(exec.Command("sudo", "kill", "-SIGINT", pidStr), 5, false, true) // #nosec G204
343+
err = signalProcessOnTarget(t, pidStr, "SIGINT")
326344
if err != nil {
327-
slog.Error("error sending signal to target primary_collection_script", slog.String("target", t.GetName()), slog.String("error", err.Error()))
345+
slog.Error("error sending SIGINT signal to target controller", slog.String("target", t.GetName()), slog.String("error", err.Error()))
328346
}
329347
}
330348
}
331-
// now wait until all primary collection scripts have exited
332-
slog.Debug("waiting for primary_collection_script scripts to exit")
349+
// now wait until all controller scripts have exited
350+
slog.Debug("waiting for controller scripts to exit")
333351
for _, t := range myTargets {
334352
// create a per-target timeout context
335353
targetTimeout := 10 * time.Second
336354
ctx, cancel := context.WithTimeout(context.Background(), targetTimeout)
337355
timedOut := false
338-
pidFilePath := filepath.Join(t.GetTempDirectory(), "primary_collection_script.pid")
356+
pidFilePath := filepath.Join(t.GetTempDirectory(), script.ControllerPIDFileName)
339357
for {
340-
// check for timeout
341-
select {
342-
case <-ctx.Done():
343-
if statusFunc != nil {
344-
_ = statusFunc(t.GetName(), "cleanup timeout exceeded")
345-
}
346-
slog.Warn("signal handler cleanup timeout exceeded for target", slog.String("target", t.GetName()))
347-
timedOut = true
348-
default:
349-
}
350-
if timedOut {
351-
break
352-
}
353358
// read the pid file
354359
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
355360
if err != nil || exitcode != 0 {
@@ -362,6 +367,23 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
362367
if err != nil || exitcode != 0 {
363368
break // process no longer exists, script has exited
364369
}
370+
// check for timeout
371+
select {
372+
case <-ctx.Done():
373+
timedOut = true
374+
default:
375+
}
376+
if timedOut {
377+
if statusFunc != nil {
378+
_ = statusFunc(t.GetName(), "cleanup timeout exceeded, sending kill signal")
379+
}
380+
slog.Warn("signal handler cleanup timeout exceeded for target, sending SIGKILL", slog.String("target", t.GetName()))
381+
err = signalProcessOnTarget(t, pidStr, "SIGKILL")
382+
if err != nil {
383+
slog.Error("error sending SIGKILL signal to target controller", slog.String("target", t.GetName()), slog.String("error", err.Error()))
384+
}
385+
break
386+
}
365387
// sleep for a short time before checking again
366388
time.Sleep(500 * time.Millisecond)
367389
}

internal/common/targets.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -815,15 +815,15 @@ func RunScript(t target.Target, s script.ScriptDefinition, localTempDir string,
815815
return script.RunScript(t, s, localTempDir)
816816
}
817817

818-
func RunScripts(t target.Target, s []script.ScriptDefinition, ignoreScriptErrors bool, localTempDir string, statusUpdate progress.MultiSpinnerUpdateFunc, collectingStatusMsg string, noRoot bool) (map[string]script.ScriptOutput, error) {
818+
func RunScripts(t target.Target, s []script.ScriptDefinition, continueOnScriptError bool, localTempDir string, statusUpdate progress.MultiSpinnerUpdateFunc, collectingStatusMsg string, noRoot bool) (map[string]script.ScriptOutput, error) {
819819
supportedScripts, err := FilterScriptsForTarget(t, s, localTempDir, noRoot)
820820
if err != nil {
821821
return nil, fmt.Errorf("failed to check if scripts are supported on target %v", err)
822822
}
823823
if len(supportedScripts) == 0 {
824824
return nil, fmt.Errorf("zero scripts are supported on target %s", t.GetName())
825825
}
826-
return script.RunScripts(t, supportedScripts, ignoreScriptErrors, localTempDir, statusUpdate, collectingStatusMsg)
826+
return script.RunScripts(t, supportedScripts, continueOnScriptError, localTempDir, statusUpdate, collectingStatusMsg)
827827
}
828828

829829
func RunScriptStream(t target.Target, s script.ScriptDefinition, localTempDir string, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, errorChannel chan error, cmdChannel chan *exec.Cmd, noRoot bool) {

internal/common/turbostat.go

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -91,19 +91,18 @@ func parseTurbostatOutput(output string) ([]map[string]string, error) {
9191
// The "platform" rows are those where Package, Die, Core, and CPU are all "-".
9292
// The first column is the sample time, and the rest are the values for the specified fields.
9393
func TurbostatPlatformRows(turboStatScriptOutput string, fieldNames []string) ([][]string, error) {
94+
if turboStatScriptOutput == "" {
95+
return nil, fmt.Errorf("turbostat output is empty")
96+
}
9497
if len(fieldNames) == 0 {
95-
err := fmt.Errorf("no field names provided")
96-
slog.Error(err.Error())
97-
return nil, err
98+
return nil, fmt.Errorf("no field names provided")
9899
}
99100
rows, err := parseTurbostatOutput(turboStatScriptOutput)
100101
if err != nil {
101-
err := fmt.Errorf("unable to parse turbostat output: %w", err)
102-
return nil, err
102+
return nil, fmt.Errorf("unable to parse turbostat output: %w", err)
103103
}
104104
if len(rows) == 0 {
105-
slog.Warn("no platform rows found in turbostat output")
106-
return nil, nil
105+
return nil, fmt.Errorf("no platform rows found in turbostat output")
107106
}
108107
// filter the rows to the summary rows only
109108
var fieldValues [][]string
@@ -145,18 +144,18 @@ func isPlatformRow(row map[string]string) bool {
145144
// for each package, for the specified field names.
146145
// The first column is the sample time, and the rest are the values for the specified fields.
147146
func TurbostatPackageRows(turboStatScriptOutput string, fieldNames []string) ([][][]string, error) {
147+
if turboStatScriptOutput == "" {
148+
return nil, fmt.Errorf("turbostat output is empty")
149+
}
148150
if len(fieldNames) == 0 {
149-
err := fmt.Errorf("no field names provided")
150-
return nil, err
151+
return nil, fmt.Errorf("no field names provided")
151152
}
152153
rows, err := parseTurbostatOutput(turboStatScriptOutput)
153154
if err != nil {
154-
err := fmt.Errorf("unable to parse turbostat output: %w", err)
155-
return nil, err
155+
return nil, fmt.Errorf("unable to parse turbostat output: %w", err)
156156
}
157157
if len(rows) == 0 {
158-
slog.Warn("no package rows found in turbostat output")
159-
return nil, nil
158+
return nil, fmt.Errorf("no package rows found in turbostat output")
160159
}
161160
var packageRows [][][]string
162161
for _, row := range rows {
@@ -195,8 +194,7 @@ func TurbostatPackageRows(turboStatScriptOutput string, fieldNames []string) ([]
195194
}
196195
}
197196
if len(packageRows) == 0 {
198-
err := fmt.Errorf("no data found in turbostat output for fields: %s", fieldNames)
199-
return nil, err
197+
return nil, fmt.Errorf("no data found in turbostat output for fields: %s", fieldNames)
200198
}
201199
return packageRows, nil
202200
}

internal/common/turbostat_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,23 +98,23 @@ func TestTurbostatPlatformRows(t *testing.T) {
9898
fieldNames: []string{"Avg_MHz", "Busy%"},
9999
wantFirst: nil,
100100
wantLen: 0,
101-
expectErr: false,
101+
expectErr: true,
102102
},
103103
{
104104
name: "No output",
105105
turbostatOutput: "",
106106
fieldNames: []string{"Avg_MHz", "Busy%"},
107107
wantFirst: nil,
108108
wantLen: 0,
109-
expectErr: false,
109+
expectErr: true,
110110
},
111111
{
112112
name: "Only time and interval, no turbostat data",
113113
turbostatOutput: strings.Join(strings.Split(turbostatOutput, "\n")[0:2], "\n"), // Only header and no data
114114
fieldNames: []string{"Avg_MHz", "Busy%"},
115115
wantFirst: nil,
116116
wantLen: 0,
117-
expectErr: false,
117+
expectErr: true,
118118
},
119119
}
120120

@@ -547,7 +547,7 @@ X 0 0 1000 10
547547
turbostatOutput: "",
548548
fieldNames: []string{"Avg_MHz"},
549549
want: nil,
550-
wantErr: false,
550+
wantErr: true,
551551
},
552552
{
553553
name: "Only headers, no data",
@@ -558,7 +558,7 @@ Package Core CPU Avg_MHz Busy%
558558
`,
559559
fieldNames: []string{"Avg_MHz"},
560560
want: nil,
561-
wantErr: false,
561+
wantErr: true,
562562
},
563563
}
564564

0 commit comments

Comments
 (0)