首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >基本的 IPU 示例因 ValueError 崩溃:Expected a parent

基本的 IPU 示例因 ValueError 崩溃:Expected a parent
EN

Stack Overflow用户
提问于 2022-10-13 08:58:16
回答 1查看 42关注 0票数 1

我想在 Paperspace 上测试免费的 IPU 运行时,因此我注册了一个免费帐户,并选择了 HuggingFace + IPU 笔记本。

之后,我创建了以下非常简单的笔记本,使用 PyTorch Lightning 在 MNIST 上做分类(最简单的例子):

代码语言:javascript
复制
!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning


import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch

class LitClassifier(pl.LightningModule):
    """Minimal MNIST classifier: two fully connected layers with ReLU."""

    def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
        super().__init__()
        self.save_hyperparameters()
        # 28x28 images are flattened to 784 features; 10 digit classes out.
        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        flat = x.view(x.size(0), -1)
        hidden = torch.relu(self.l1(flat))
        return torch.relu(self.l2(hidden))

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        # The step runs on the IPU; return the per-batch accuracy so the
        # host can average it later in validation_epoch_end.
        return self.accuracy(self(inputs), targets)

    def test_step(self, batch, batch_idx):
        inputs, targets = batch
        return self.accuracy(self(inputs), targets)

    def accuracy(self, logits, y):
        # poptorch on the IPU does not yet implicitly convert bools to
        # tensors, so accuracy is computed explicitly instead of via a
        # metric object. Once fixed in poptorch the metric can be used.
        hits = torch.eq(torch.argmax(logits, -1), y).to(torch.float32)
        return torch.sum(hits) / len(y)

    def validation_epoch_end(self, outputs) -> None:
        # The step functions ran on the IPU device, so the average must be
        # computed and logged here on the host.
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.learning_rate)
        return optimizer
    

training_batch_size = 10

# A plain torch DataLoader over torchvision's MNIST train split.
# NOTE: this is a DataLoader, *not* a LightningDataModule.
dm = DataLoader(
     torchvision.datasets.MNIST('mnist_data/',
                                train=True,
                                download=True,
                                transform=torchvision.transforms.Compose([
                                    torchvision.transforms.ToTensor(),
                                    torchvision.transforms.Normalize(
                                        (0.1307, ), (0.3081, ))
                                ])),
     batch_size=training_batch_size,
     shuffle=True)

model = LitClassifier()

print(model)
trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")

# BUG FIX: `dm` is a DataLoader, so it must be passed as `train_dataloaders`
# (or positionally). Passing it via `datamodule=` makes Lightning treat it
# as a LightningDataModule; configuration validation then calls
# is_overridden(..., instance=trainer.datamodule), which cannot resolve a
# LightningDataModule parent class for a DataLoader and raises
# "ValueError: Expected a parent" -- the exact crash in the traceback below.
trainer.fit(model, train_dataloaders=dm)

代码崩溃,库内部抛出的错误如下:

代码语言:javascript
复制
LitClassifier(
  (l1): Linear(in_features=784, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=10, bias=True)
)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: True, using: 4 IPUs
HPU available: False, using: 0 HPUs
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-329fa233a013> in <module>
     83 trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
     84 
---> 85 trainer.fit(model, datamodule=dm)

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    694         """
    695         self.strategy.model = model
--> 696         self._call_and_handle_interrupt(
    697             self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    698         )

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
    648                 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
    649             else:
--> 650                 return trainer_fn(*args, **kwargs)
    651         # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise
    652         except KeyboardInterrupt as exception:

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    733             ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
    734         )
--> 735         results = self._run(model, ckpt_path=self.ckpt_path)
    736 
    737         assert self.state.stopped

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
   1089         self._callback_connector._attach_model_logging_functions()
   1090 
-> 1091         verify_loop_configurations(self)
   1092 
   1093         # hook

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in verify_loop_configurations(trainer)
     57     _check_on_pretrain_routine(model)
     58     # TODO: Delete CheckpointHooks off LightningDataModule in v1.8
---> 59     _check_datamodule_checkpoint_hooks(trainer)
     60     _check_setup_method(trainer)
     61 

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in _check_datamodule_checkpoint_hooks(trainer)
    291 
    292 def _check_datamodule_checkpoint_hooks(trainer: "pl.Trainer") -> None:
--> 293     if is_overridden(method_name="on_save_checkpoint", instance=trainer.datamodule):
    294         rank_zero_deprecation(
    295             "`LightningDataModule.on_save_checkpoint` was deprecated in"

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/model_helpers.py in is_overridden(method_name, instance, parent)
     32             parent = pl.Callback
     33         if parent is None:
---> 34             raise ValueError("Expected a parent")
     35 
     36     instance_attr = getattr(instance, method_name, None)

ValueError: Expected a parent

这是库版本不兼容的问题吗?我尝试在谷歌上搜索这个错误,但只找到了这个问题:pytorch - Model_heplers.py in is_overridden > raise ValueError("Expected a parent"),但我不认为那是我的问题,因为我只是使用内置的 DataLoader,并且我的网络继承自 pl.LightningModule。Graphcore 文档中的示例:https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/example.html 可以工作,但它不使用 PyTorch Lightning。

有什么方法可以让它在 PyTorch Lightning 中正常运行吗?

EN

回答 1

Stack Overflow用户

发布于 2022-10-13 09:19:00

为了让 PyTorch Lightning 正确处理 dataloader,在这种情况下必须把它放进 LitClassifier 类内部。

代码语言:javascript
复制
!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning


import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch

class LitClassifier(pl.LightningModule):
    """Two-layer MLP for MNIST digit classification.

    The train/test dataloaders are defined on the module itself so that
    ``Trainer.fit(model)`` needs no external data arguments.
    """

    def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
        super().__init__()
        self.save_hyperparameters()
        # 784 flattened pixels -> hidden layer -> 10 digit classes.
        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        flat = x.view(x.size(0), -1)
        hidden = torch.relu(self.l1(flat))
        return torch.relu(self.l2(hidden))

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        # Steps execute on the IPU; only the per-batch accuracy is shipped
        # back to the host, where it is averaged in validation_epoch_end.
        return self.accuracy(self(inputs), targets)

    def test_step(self, batch, batch_idx):
        inputs, targets = batch
        return self.accuracy(self(inputs), targets)

    def accuracy(self, logits, y):
        # poptorch does not yet implicitly convert bools to tensors on the
        # IPU, so the accuracy is computed explicitly rather than with the
        # accuracy metric. Once poptorch fixes this, the metric can be used.
        hits = torch.eq(torch.argmax(logits, -1), y).to(torch.float32)
        return torch.sum(hits) / len(y)

    def validation_epoch_end(self, outputs) -> None:
        # Aggregate on the host: the step functions ran on the IPU device.
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.learning_rate)
        return optimizer

    def _mnist_loader(self, train, batch_size, shuffle):
        # Shared builder for the train/test MNIST dataloaders.
        transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
        ])
        dataset = torchvision.datasets.MNIST('mnist_data/',
                                             train=train,
                                             download=True,
                                             transform=transform)
        # NOTE(review): num_workers=240 assumes a host with very many CPU
        # cores (e.g. a Paperspace Graphcore machine) -- confirm before
        # running elsewhere.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          num_workers=240,
                          shuffle=shuffle)

    def train_dataloader(self):
        return self._mnist_loader(train=True, batch_size=100, shuffle=True)

    def test_dataloader(self):
        return self._mnist_loader(train=False, batch_size=100, shuffle=False)




# The dataloaders now live on the LightningModule itself, so fit() needs
# no extra data arguments (and no datamodule).
model = LitClassifier()
print(model)

trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
trainer.fit(model)
票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/74053078

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档