首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >基本的 IPU 示例因 ValueError 崩溃:Expected a parent

基本的 IPU 示例因 ValueError 崩溃:Expected a parent
EN

Stack Overflow用户
提问于 2022-10-13 08:58:16
回答 1查看 42关注 0票数 1

我想在 Paperspace 上测试免费的 IPU 运行时,因此我注册了一个免费帐户,并选择了 HuggingFace + IPU 笔记本。

之后,我创建了以下非常简单的笔记本,使用 PyTorch Lightning 在 MNIST 上做分类(最简单的例子):

代码语言:javascript
复制
!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning


import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch

class LitClassifier(pl.LightningModule):
    """Minimal MNIST classifier: two fully connected layers with ReLU."""

    def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
        super().__init__()
        self.save_hyperparameters()
        # 28x28 images are flattened to 784 features; 10 digit classes out.
        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        flat = x.view(x.size(0), -1)
        hidden = torch.relu(self.l1(flat))
        return torch.relu(self.l2(hidden))

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        # The step runs on the IPU; return the per-batch accuracy so the
        # host can average it later in validation_epoch_end.
        return self.accuracy(self(inputs), targets)

    def test_step(self, batch, batch_idx):
        inputs, targets = batch
        return self.accuracy(self(inputs), targets)

    def accuracy(self, logits, y):
        # poptorch on the IPU does not yet implicitly convert bools to
        # tensors, so accuracy is computed explicitly instead of via a
        # metric object. Once fixed in poptorch the metric can be used.
        hits = torch.eq(torch.argmax(logits, -1), y).to(torch.float32)
        return torch.sum(hits) / len(y)

    def validation_epoch_end(self, outputs) -> None:
        # The step functions ran on the IPU device, so the average must be
        # computed and logged here on the host.
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.learning_rate)
        return optimizer
    

training_batch_size = 10

# A plain torch DataLoader over torchvision's MNIST train split.
# NOTE: this is a DataLoader, *not* a LightningDataModule.
dm = DataLoader(
     torchvision.datasets.MNIST('mnist_data/',
                                train=True,
                                download=True,
                                transform=torchvision.transforms.Compose([
                                    torchvision.transforms.ToTensor(),
                                    torchvision.transforms.Normalize(
                                        (0.1307, ), (0.3081, ))
                                ])),
     batch_size=training_batch_size,
     shuffle=True)

model = LitClassifier()

print(model)
trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")

# BUG FIX: `dm` is a DataLoader, so it must be passed as `train_dataloaders`
# (or positionally). Passing it via `datamodule=` makes Lightning treat it
# as a LightningDataModule; configuration validation then calls
# is_overridden(..., instance=trainer.datamodule), which cannot resolve a
# LightningDataModule parent class for a DataLoader and raises
# "ValueError: Expected a parent" -- the exact crash in the traceback below.
trainer.fit(model, train_dataloaders=dm)

代码崩溃,库内部抛出的错误如下:

代码语言:javascript
复制
LitClassifier(
  (l1): Linear(in_features=784, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=10, bias=True)
)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: True, using: 4 IPUs
HPU available: False, using: 0 HPUs
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-329fa233a013> in <module>
     83 trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
     84 
---> 85 trainer.fit(model, datamodule=dm)

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    694         """
    695         self.strategy.model = model
--> 696         self._call_and_handle_interrupt(
    697             self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    698         )

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
    648                 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
    649             else:
--> 650                 return trainer_fn(*args, **kwargs)
    651         # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise
    652         except KeyboardInterrupt as exception:

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    733             ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
    734         )
--> 735         results = self._run(model, ckpt_path=self.ckpt_path)
    736 
    737         assert self.state.stopped

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
   1089         self._callback_connector._attach_model_logging_functions()
   1090 
-> 1091         verify_loop_configurations(self)
   1092 
   1093         # hook

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in verify_loop_configurations(trainer)
     57     _check_on_pretrain_routine(model)
     58     # TODO: Delete CheckpointHooks off LightningDataModule in v1.8
---> 59     _check_datamodule_checkpoint_hooks(trainer)
     60     _check_setup_method(trainer)
     61 

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in _check_datamodule_checkpoint_hooks(trainer)
    291 
    292 def _check_datamodule_checkpoint_hooks(trainer: "pl.Trainer") -> None:
--> 293     if is_overridden(method_name="on_save_checkpoint", instance=trainer.datamodule):
    294         rank_zero_deprecation(
    295             "`LightningDataModule.on_save_checkpoint` was deprecated in"

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/model_helpers.py in is_overridden(method_name, instance, parent)
     32             parent = pl.Callback
     33         if parent is None:
---> 34             raise ValueError("Expected a parent")
     35 
     36     instance_attr = getattr(instance, method_name, None)

ValueError: Expected a parent

这是库版本不兼容的问题吗?我尝试在谷歌上搜索这个错误,但只找到了这个问题:pytorch - Model_heplers.py in is_overridden > raise ValueError("Expected a parent"),但我不认为那是我的问题,因为我只是使用内置的 DataLoader,并且我的网络继承自 pl.LightningModule。Graphcore 文档中的示例:https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/example.html 可以工作,但它不使用 PyTorch Lightning。

有什么方法可以让它在 PyTorch Lightning 中正常运行吗?

EN

回答 1

Stack Overflow用户

发布于 2022-10-13 09:19:00

为了让 PyTorch Lightning 正确处理 dataloader,在这种情况下必须把它放进 LitClassifier 类内部。

代码语言:javascript
复制
!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning


import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch

class LitClassifier(pl.LightningModule):
    """Two-layer MLP for MNIST digit classification.

    The train/test dataloaders are defined on the module itself so that
    ``Trainer.fit(model)`` needs no external data arguments.
    """

    def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
        super().__init__()
        self.save_hyperparameters()
        # 784 flattened pixels -> hidden layer -> 10 digit classes.
        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        flat = x.view(x.size(0), -1)
        hidden = torch.relu(self.l1(flat))
        return torch.relu(self.l2(hidden))

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        # Steps execute on the IPU; only the per-batch accuracy is shipped
        # back to the host, where it is averaged in validation_epoch_end.
        return self.accuracy(self(inputs), targets)

    def test_step(self, batch, batch_idx):
        inputs, targets = batch
        return self.accuracy(self(inputs), targets)

    def accuracy(self, logits, y):
        # poptorch does not yet implicitly convert bools to tensors on the
        # IPU, so the accuracy is computed explicitly rather than with the
        # accuracy metric. Once poptorch fixes this, the metric can be used.
        hits = torch.eq(torch.argmax(logits, -1), y).to(torch.float32)
        return torch.sum(hits) / len(y)

    def validation_epoch_end(self, outputs) -> None:
        # Aggregate on the host: the step functions ran on the IPU device.
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.learning_rate)
        return optimizer

    def _mnist_loader(self, train, batch_size, shuffle):
        # Shared builder for the train/test MNIST dataloaders.
        transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
        ])
        dataset = torchvision.datasets.MNIST('mnist_data/',
                                             train=train,
                                             download=True,
                                             transform=transform)
        # NOTE(review): num_workers=240 assumes a host with very many CPU
        # cores (e.g. a Paperspace Graphcore machine) -- confirm before
        # running elsewhere.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          num_workers=240,
                          shuffle=shuffle)

    def train_dataloader(self):
        return self._mnist_loader(train=True, batch_size=100, shuffle=True)

    def test_dataloader(self):
        return self._mnist_loader(train=False, batch_size=100, shuffle=False)




# The dataloaders now live on the LightningModule itself, so fit() needs
# no extra data arguments (and no datamodule).
model = LitClassifier()
print(model)

trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
trainer.fit(model)
票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/74053078

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档