diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..a490b61
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "githubPullRequests.ignoredPullRequestBranches": [
+        "main"
+    ]
+}
\ No newline at end of file
diff --git a/cmd/diagnostic/PyTorch.go b/cmd/diagnostic/PyTorch.go
new file mode 100644
--- /dev/null
+++ b/cmd/diagnostic/PyTorch.go
@@ -0,0 +1,245 @@
+// Package diagnostic implements GPU diagnostic commands for RunPod pods.
+package diagnostic
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/url"
+	"os"
+	"os/exec"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+)
+
+// podAPITimeout bounds the machine-ID lookup so an unreachable API cannot hang
+// the diagnostics run.
+const podAPITimeout = 15 * time.Second
+
+// cudaTestScript is the Python program run once per GPU; %d is the CUDA device
+// index.
+const cudaTestScript = `
+import torch
+device = torch.device('cuda:%d')
+torch.cuda.set_device(device)
+x = torch.rand(10, 10, device=device)
+y = torch.rand(10, 10, device=device)
+z = x + y
+print("Success: CUDA is working correctly.")
+`
+
+// Patterns for the plain-text nvidia-smi table, compiled once at package scope.
+// gpuNameRegex accepts either On or Off in the column after the GPU name so the
+// name is still found when that column reads Off.
+var (
+	cudaVersionRegex   = regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
+	driverVersionRegex = regexp.MustCompile(`Driver Version: (\d+\.\d+\.\d+)`)
+	gpuNameRegex       = regexp.MustCompile(`\|\s+\d+\s+([^\|]+?)\s+(?:On|Off)\s+\|`)
+)
+
+// PodQueryData is the JSON body of a GraphQL request: the query text and its
+// variables.
+type PodQueryData struct {
+	Query     string            `json:"query"`
+	Variables map[string]string `json:"variables"`
+}
+
+// PodQueryResponse mirrors the part of the GraphQL response that carries the
+// pod's machine ID.
+type PodQueryResponse struct {
+	Data struct {
+		Pod struct {
+			MachineID string `json:"machineId"`
+		} `json:"pod"`
+	} `json:"data"`
+}
+
+// getPodMachineID asks the RunPod GraphQL API for the machine ID of podID.
+//
+// It returns "" on any failure (request construction, transport error, non-200
+// status, undecodable body) after printing the reason.
+func getPodMachineID(podID, apiKey string) string {
+	// Escape the key so characters such as '&' or '#' cannot corrupt the query
+	// string.
+	endpoint := "https://api.runpod.io/graphql?api_key=" + url.QueryEscape(apiKey)
+	query := `
+    query Pod($podId: String!) {
+        pod(input: { podId: $podId }) {
+            machineId
+        }
+    }
+    `
+	jsonData, err := json.Marshal(PodQueryData{
+		Query:     query,
+		Variables: map[string]string{"podId": podID},
+	})
+	if err != nil {
+		fmt.Printf("Error marshalling JSON: %v\n", err)
+		return ""
+	}
+
+	req, err := http.NewRequest("POST", endpoint, bytes.NewBuffer(jsonData))
+	if err != nil {
+		fmt.Printf("Error creating request: %v\n", err)
+		return ""
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	client := &http.Client{Timeout: podAPITimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		// client.Do wraps failures in *url.Error, whose message embeds the full
+		// request URL. Print only the underlying cause so the API key in the
+		// query string is not echoed to the terminal.
+		var urlErr *url.Error
+		if errors.As(err, &urlErr) {
+			err = urlErr.Err
+		}
+		fmt.Printf("Failed to fetch machineId: %v\n", err)
+		return ""
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		fmt.Printf("Failed to fetch machineId: unexpected status %s\n", resp.Status)
+		return ""
+	}
+
+	var result PodQueryResponse
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		fmt.Printf("Error decoding response: %v\n", err)
+		return ""
+	}
+	return result.Data.Pod.MachineID
+}
+
+// collectEnvInfo gathers the pod's GPU-related environment variables plus the
+// machine ID returned by getPodMachineID. Empty values are replaced with
+// "Not Available".
+func collectEnvInfo() map[string]string {
+	fmt.Println("Collecting environment information...")
+	podID := os.Getenv("RUNPOD_POD_ID")
+	envInfo := map[string]string{
+		"RUNPOD_POD_ID":              podID,
+		"Template CUDA_VERSION":      os.Getenv("CUDA_VERSION"),
+		"NVIDIA_DRIVER_CAPABILITIES": os.Getenv("NVIDIA_DRIVER_CAPABILITIES"),
+		"NVIDIA_VISIBLE_DEVICES":     os.Getenv("NVIDIA_VISIBLE_DEVICES"),
+		"NVIDIA_PRODUCT_NAME":        os.Getenv("NVIDIA_PRODUCT_NAME"),
+		"RUNPOD_GPU_COUNT":           os.Getenv("RUNPOD_GPU_COUNT"),
+		"machineId":                  getPodMachineID(podID, os.Getenv("RUNPOD_API_KEY")),
+	}
+	for k, v := range envInfo {
+		if v == "" {
+			envInfo[k] = "Not Available"
+		}
+	}
+	return envInfo
+}
+
+// parseNvidiaSMIOutput extracts the CUDA version, driver version and the first
+// matching GPU name from nvidia-smi's text output. Fields that do not match
+// are reported as "Not Available".
+func parseNvidiaSMIOutput(output string) map[string]string {
+	info := map[string]string{
+		"CUDA Version":   "Not Available",
+		"Driver Version": "Not Available",
+		"GPU Name":       "Not Available",
+	}
+	if m := cudaVersionRegex.FindStringSubmatch(output); len(m) > 1 {
+		info["CUDA Version"] = m[1]
+	}
+	if m := driverVersionRegex.FindStringSubmatch(output); len(m) > 1 {
+		info["Driver Version"] = m[1]
+	}
+	if m := gpuNameRegex.FindStringSubmatch(output); len(m) > 1 {
+		info["GPU Name"] = strings.TrimSpace(m[1])
+	}
+	return info
+}
+
+// getNvidiaSMIInfo runs nvidia-smi and returns the parsed fields, or a single
+// "Error" entry when the command cannot be run or exits with a failure.
+func getNvidiaSMIInfo() map[string]string {
+	output, err := exec.Command("nvidia-smi").Output()
+	if err != nil {
+		return map[string]string{"Error": fmt.Sprintf("Failed to fetch nvidia-smi info: %v", err)}
+	}
+	return parseNvidiaSMIOutput(string(output))
+}
+
+// getSystemInfo assembles the environment and nvidia-smi sections of the
+// diagnostics report.
+func getSystemInfo() map[string]interface{} {
+	return map[string]interface{}{
+		"Environment Info":  collectEnvInfo(),
+		"Host Machine Info": getNvidiaSMIInfo(),
+	}
+}
+
+// runCUDATest runs cudaTestScript once for every GPU index below
+// RUNPOD_GPU_COUNT and returns, keyed by "GPU <n>", either the script's output
+// or a description of the failure.
+func runCUDATest() map[string]string {
+	fmt.Println("Performing CUDA operation tests on all available GPUs...")
+	// An unset, non-numeric or non-positive RUNPOD_GPU_COUNT means no GPUs.
+	gpuCount, err := strconv.Atoi(os.Getenv("RUNPOD_GPU_COUNT"))
+	if err != nil || gpuCount <= 0 {
+		return map[string]string{"Error": "No GPUs found."}
+	}
+
+	results := make(map[string]string)
+	for gpuID := 0; gpuID < gpuCount; gpuID++ {
+		key := fmt.Sprintf("GPU %d", gpuID)
+		cmd := exec.Command("python", "-c", fmt.Sprintf(cudaTestScript, gpuID))
+		output, err := cmd.CombinedOutput()
+		text := strings.TrimSpace(string(output))
+		switch {
+		case err == nil:
+			results[key] = text
+		case text == "":
+			results[key] = fmt.Sprintf("Error: %v", err)
+		default:
+			// Keep whatever the script printed (e.g. a Python traceback); the exit
+			// status alone rarely explains the failure.
+			results[key] = fmt.Sprintf("Error: %v: %s", err, text)
+		}
+	}
+	return results
+}
+
+// saveInfoToFile writes info to filename as indented JSON and prints where it
+// was saved. If encoding or writing fails it prints the error instead of
+// reporting success.
+func saveInfoToFile(info map[string]interface{}, filename string) {
+	jsonData, err := json.MarshalIndent(info, "", "    ")
+	if err != nil {
+		fmt.Printf("Error encoding diagnostics information: %v\n", err)
+		return
+	}
+	if err := ioutil.WriteFile(filename, jsonData, 0644); err != nil {
+		fmt.Printf("Error saving diagnostics information to %s: %v\n", filename, err)
+		return
+	}
+	fmt.Printf("Diagnostics information saved to %s. Please share this file with RunPod Tech Support for further assistance.\n", filename)
+}
+
+// GpuDiagnosticsCmd is the "PyTorch" subcommand: it collects system info, runs
+// the per-GPU CUDA test and saves the report to /workspace/gpu_diagnostics.json.
+var GpuDiagnosticsCmd = &cobra.Command{
+	Use:   "PyTorch",
+	Short: "Run PyTorch CUDA test",
+	Long:  `This command performs a series of diagnostics tests on the GPUs available in your system for RunPod.`,
+	Run: func(cmd *cobra.Command, args []string) {
+		fmt.Println("RunPod GPU Diagnostics Tool")
+		systemInfo := getSystemInfo()
+		systemInfo["CUDA Test Result"] = runCUDATest()
+		saveInfoToFile(systemInfo, "/workspace/gpu_diagnostics.json")
+	},
+}
diff --git a/cmd/gputest.go b/cmd/gputest.go
new file mode 100644
--- /dev/null
+++ b/cmd/gputest.go
@@ -0,0 +1,26 @@
+package cmd
+
+import (
+	// NOTE(review): "cmd/diagnostic" has no module prefix, so the go tool will
+	// look for it in the standard library; it presumably needs to be
+	// "<module path>/cmd/diagnostic" -- confirm against go.mod.
+	"cmd/diagnostic"
+
+	"github.com/spf13/cobra"
+)
+
+// gpuTestCmd is the "gpu-test" parent command; init attaches the diagnostic
+// subcommand to it.
+//
+// This file is named gputest.go rather than gpu_test.go because files ending
+// in _test.go are compiled only by "go test", which would leave gpuTestCmd
+// undefined in a normal build.
+var gpuTestCmd = &cobra.Command{
+	Use:   "gpu-test",
+	Short: "GPU test commands",
+	Long:  "Commands for testing GPU functionality",
+}
+
+func init() {
+	gpuTestCmd.AddCommand(diagnostic.GpuDiagnosticsCmd)
+}
diff --git a/cmd/root.go b/cmd/root.go
index 904bbcf..07f4a8f 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -45,6 +45,9 @@ func registerCommands() {
 	rootCmd.AddCommand(updateCmd)
 	rootCmd.AddCommand(sshCmd)
 
+	//Diagnostic tools
+	rootCmd.AddCommand(gpuTestCmd)
+
 	// Remote File Execution
 	rootCmd.AddCommand(execCmd)
 