0

我创建了一个简单的模型和训练函数,用来训练一个简单的线性回归模型。在 CPU 上运行时,代码可以正常运行;但在 CUDA 上运行时,大约有一半的概率会随机崩溃,我无法弄清楚到底是什么问题。

这是模型:

import torch.nn as nn
import torch


class LinearRegression1D:
    """Configuration holder and factory for a single ``nn.Linear`` layer.

    The instance stores the layer dimensions, the compute device and the loss
    function. Calling the instance builds a brand-new ``nn.Linear`` each time.
    """

    def __init__(self, in_feature=1, out_feature=1):
        """Store the layer sizes; the device starts as ``cuda`` and the loss as MSE."""
        self.in_feature, self.out_feature = in_feature, out_feature
        self.loss_func = nn.MSELoss()
        self.device = torch.device("cuda")

    def set_device(self, device: str):
        """Select the compute device by name.

        :param device: either ``"cuda"`` or ``"cpu"``
        :raises ValueError: if ``device`` is any other value
        """
        allowed = ["cuda", "cpu"]
        if device in allowed:
            self.device = torch.device(device)
            return
        raise ValueError("Only {} is valid as device name".format(" and ".join(allowed)))

    def get_device(self):
        """Return the currently selected ``torch.device``."""
        return self.device

    def get_loss_func(self):
        """Return the loss module created in ``__init__``."""
        return self.loss_func

    def __call__(self, *args, **kwargs):
        """Build and return a new ``nn.Linear``; positional/keyword arguments are ignored."""
        return nn.Linear(self.in_feature, self.out_feature)

下面是训练函数:

import torch
from utills import move_to
from tqdm import tqdm


def train_simple_network(
    model,
    loss_func,
    training_loader,
    epochs=20,
    device="cpu"
):
    """
    Train ``model`` with SGD (lr=0.001) and report the loss of every epoch.

    :param model: module exposing ``parameters()``, ``to()`` and ``train()``
    :param loss_func: callable taking ``(prediction, labels)`` and returning a
        scalar loss that supports ``backward()`` and ``item()``
    :param training_loader: iterable yielding ``(inputs, labels)`` batches
    :param epochs: number of full passes over ``training_loader``
    :param device: compute device the model and every batch are moved to
    :return: list with one entry per epoch, each the sum of the batch losses
    """
    # Move the model first and only then build the optimizer: the torch.optim
    # documentation asks for this order so the optimizer is guaranteed to hold
    # the parameters that actually live on the target device.
    model = model.to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    epoch_losses = []
    for _ in tqdm(range(epochs), desc="Epoch"):
        model = model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(training_loader, desc="Batch", leave=False):
            inputs = move_to(inputs, device)
            labels = move_to(labels, device)

            optimizer.zero_grad()
            y_hat = model(inputs)
            loss = loss_func(y_hat, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Previously the accumulated loss was discarded; keep it for the caller.
        epoch_losses.append(running_loss)
    return epoch_losses

move_to 函数是一个通用函数,用于将任意数据结构移动到所选设备:

def move_to(obj, device):
    """
    Recursively relocate a python object (or its contents) to a compute device.

    Lists, tuples, sets and dicts are rebuilt with every element (and, for
    dicts, every key and value) passed through ``move_to``. Any other object
    that has a ``to`` attribute is moved with ``obj.to(device)``. Everything
    else is returned untouched.

    :param obj: the python object to move to a device, or to move its contents to a device
    :param device: the compute device to move objects to
    :return: python obj
    """
    if isinstance(obj, (list, tuple, set)):
        moved_items = [move_to(item, device) for item in obj]
        if isinstance(obj, list):
            return moved_items
        if isinstance(obj, tuple):
            return tuple(moved_items)
        return set(moved_items)

    if isinstance(obj, dict):
        relocated = {}
        for key, value in obj.items():
            # The value is converted before the key, matching the evaluation
            # order of a subscript assignment.
            new_value = move_to(value, device)
            relocated[move_to(key, device)] = new_value
        return relocated

    return obj.to(device) if hasattr(obj, "to") else obj

这是最终的执行:

import torch
from torch.utils.data import DataLoader
import seaborn as sns

from chapter2.dataloaders import Simple1DRegressionDataSet
from chapter2.datagenerators import generate_1d_data
from chapter2.models import LinearRegression1D
from chapter2.train import train_simple_network

# Generate the data: one feature set and one target set.
features, targets = generate_1d_data()
# sns.scatterplot(x=features, y=targets)

# Wrap the data in the dataset object, then expose it as a shuffling iterator.
dataset = Simple1DRegressionDataSet(features=features, targets=targets)
training_loader = DataLoader(dataset, shuffle=True)

# Helper object that holds the device, the loss function and builds the model.
model_instance = LinearRegression1D()

# If cuda is used it throws error: Process finished with exit code 139 (interrupted by signal 11: SIGSEGV)
model_instance.set_device("cpu")

# Pull the training ingredients out of the helper.
device = model_instance.get_device()
loss_func = model_instance.get_loss_func()
model = model_instance()

train_simple_network(model, loss_func, training_loader, device=device)

为了便于复用,我已将代码拆分到多个文件中。各部分代码可以在以下 GitHub 仓库中找到,最终的主程序代码可以在该仓库的以下文件中找到。

有人可以帮我找出问题所在吗?

4

0 回答 0