Frequent RuntimeError: MUSA error: unknown error when running torch on musa #75

Open
@RichardAwesome

Description

When running the test code on an MTT S80, the test cannot complete. The process's GPU memory usage is only about 53 MB, and total GPU memory usage is about 1.6 GB. The errors below are observed; however, if the data is loaded onto MUSA each time inside the program, it runs normally.

Basic information
  • GPU model: MTT S80
  • Driver version: 2.7.0-rc0717
  • Container toolkit version: 1.9.0-1
  • torch_on_musa version: Docker image registry.mthreads.com/mcconline/musa-pytorch-release-public:rc3.0.1-v1.2.1-S80-py310
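
For reference, the environment can be checked with a minimal sketch like the one below before running the full script; it assumes torch_musa exposes a torch.musa namespace mirroring torch.cuda (is_available()/device_count()), so adjust the calls if the installed build differs.

import torch
import torch_musa  # registers the "musa" device with PyTorch

# Assumed to mirror torch.cuda.is_available()/device_count().
print("musa available:", torch.musa.is_available())
print("musa device count:", torch.musa.device_count())

# Allocate a small tensor on the device and copy it back to the host.
x = torch.ones(4, device="musa")
print((x * 2).cpu())
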
Kernel error log

[24078.925037] MTGPU:(Error): 56377: PID 1747 experienced error Guilty Lockup (0x1) caller[CheckFWCCB CTX Reset Noticication] [188]
[24082.310090] MTGPU:(Error): 56377: PID 1747 experienced error Guilty Lockup (0x1) caller[CheckFWCCB CTX Reset Noticication] [188]
[24082.376608] MTGPU:(Error): 56377: PID 1747 experienced error Guilty Lockup (0x1) caller[CheckFWCCB CTX Reset Noticication] [188]
[24082.378837] MTGPU:(Error): 56377: PID 115094 experienced error Innocent Lockup (0x2) caller[CheckFWCCB CTX Reset Noticication] [188]
[24082.378858] MTGPU:(Error): 56377: PID 1747 experienced error Innocent Lockup (0x2) caller[CheckFWCCB CTX Reset Noticication] [188]

Runtime error

Traceback (most recent call last):
  File "/img/Ai-Learn-master/深度学习-PyTorch框架/PyTorch框架实战/第四章:图像识别核心模块实战解读/test.py", line 71, in <module>
    loss = criterion(outputs, labels.to(device))
  File "/opt/conda/envs/py310/lib/python3.10/site-packages/torch_musa/core/tensor_attrs.py", line 41, in _to
    return self.orig_to(*args, **kwargs)
RuntimeError: MUSA error: unknown error

import sys
import torch
import torch_musa
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## 2. build network
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

if __name__ == "__main__": 
    ## 1. prepare dataset
    transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    batch_size = 5
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                            shuffle=True, num_workers=2)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                            shuffle=False, num_workers=2)
    classes = ('plane', 'car', 'bird', 'cat',
            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    device = torch.device("musa")

    net = Net().to(device)

    ## 3. define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    ## 4. train
    for epoch in range(2):  # loop over the dataset multiple times
        running_loss = 0.0
        # torch_musa.empty_cache()
        size = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # torch_musa.empty_cache()
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            size += sys.getsizeof(inputs)
            inputs = inputs.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            
            if i % 200 == 199 :
                print("data size is ", size)
            
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

    print('Finished Training')

    PATH = './cifar_net.pth'
    torch.save(net.state_dict(), PATH)

    net.load_state_dict(torch.load(PATH))

    ## 5. test
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in testloader:
            # torch_musa.empty_cache()
            images, labels = data
            # calculate outputs by running images through the network
            outputs = net(images.to(device))
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(device)).sum().item()

    print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
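
Debugging note: as with CUDA, MUSA kernels appear to execute asynchronously, so the "unknown error" reported at labels.to(device) may actually originate from an earlier operation (for example the forward pass). Below is a minimal sketch of the training step with explicit synchronization after each stage so the error surfaces where it is produced; torch.musa.synchronize() is assumed to mirror torch.cuda.synchronize() and has not been confirmed against the installed torch_musa build.

import torch
import torch_musa  # noqa: F401  (registers the "musa" device)

def debug_train_step(net, criterion, optimizer, inputs, labels, device):
    """One training step with a sync point after every stage (sketch)."""
    optimizer.zero_grad()
    inputs = inputs.to(device)
    torch.musa.synchronize()   # failure here -> host-to-device copy of inputs
    outputs = net(inputs)
    torch.musa.synchronize()   # failure here -> forward-pass kernels
    loss = criterion(outputs, labels.to(device))
    torch.musa.synchronize()   # failure here -> label copy or loss kernels
    loss.backward()
    torch.musa.synchronize()   # failure here -> backward-pass kernels
    optimizer.step()
    return loss.item()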
