有人能解释一下为什么这段代码(我从这里获取的):
## Standard libraries
import os
import json
import math
import numpy as np
import time
## Imports for plotting
import matplotlib.pyplot as plt
#%matplotlib inline
#from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()
sns.set()
import torch_geometric
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
## Progress bar
from tqdm.notebook import tqdm
## PyTorch
import torch
import torchmetrics
from torchmetrics.functional import precision_recall
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
# Torchvision
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms
# PyTorch Lightning
import pytorch_lightning as pl
from ray import tune
# NOTE(review): this initializer sits at module level in the snippet -- no
# ``class LightningMNISTClassifier(...)`` header precedes it, so the name that
# it and ``train_tune`` refer to is never bound (the NameError in the trace).
def __init__(self, config):
    """Copy the hyperparameters out of ``config`` onto the instance.

    Args:
        config: Mapping that provides ``layer_1_size``, ``layer_2_size``,
            ``lr`` and ``batch_size``.
    """
    super(LightningMNISTClassifier, self).__init__()
    self.layer_1_size = config["layer_1_size"]
    self.layer_2_size = config["layer_2_size"]
    self.lr = config["lr"]
    self.batch_size = config["batch_size"]
from ray.tune.integration.pytorch_lightning import TuneReportCallback
# Report Lightning's "val_loss" / "val_accuracy" to Tune under the names
# "loss" / "mean_accuracy", triggered on the "validation_end" hook.
callback = TuneReportCallback(
    {"loss": "val_loss", "mean_accuracy": "val_accuracy"},
    on="validation_end",
)
def train_tune(config, epochs=10, gpus=0):
    """Build a ``LightningMNISTClassifier`` from ``config`` and fit it.

    Args:
        config: Hyperparameter mapping handed to the model constructor.
        epochs: Forwarded to the trainer as ``max_epochs``.
        gpus: Forwarded to the trainer as ``gpus``.
    """
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=gpus,
        # Presumably silences the progress bar -- TODO confirm for the
        # installed Lightning version.
        progress_bar_refresh_rate=0,
        # Module-level callback object; the same instance is used on every call.
        callbacks=[callback],
    )
    trainer.fit(model)
# Hyperparameter search space that is handed to ``tune.run`` as ``config``.
config = dict(
    layer_1_size=tune.choice([32, 64, 128]),
    layer_2_size=tune.choice([64, 128, 256]),
    lr=tune.loguniform(1e-4, 1e-1),
    batch_size=tune.choice([32, 64, 128]),
)
# NOTE(review): verbatim redefinition of the ``train_tune`` that appears
# earlier in this snippet; this later binding is the one ``tune.run`` receives.
def train_tune(config, epochs=10, gpus=0):
    """Run one training job: construct the model from ``config`` and fit it.

    Args:
        config: Hyperparameter mapping passed to ``LightningMNISTClassifier``.
        epochs: Value used for the trainer's ``max_epochs``.
        gpus: Value used for the trainer's ``gpus``.
    """
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=gpus,
        progress_bar_refresh_rate=0,
        # Uses the module-level ``callback`` object.
        callbacks=[callback],
    )
    trainer.fit(model)
from functools import partial
tune.run(
partial(train_tune, epochs=10, gpus=0),
config=config,
num_samples=10)
会产生以下错误:
Traceback (most recent call last):
File "example_hpo_working.py", line 89, in <module>
num_samples=10)
File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/tune.py", line 741, in run
raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [train_tune_6f362_00000, train_tune_6f362_00001, train_tune_6f362_00002, train_tune_6f362_00003, train_tune_6f362_00004, train_tune_6f362_00005, train_tune_6f362_00006, train_tune_6f362_00007, train_tune_6f362_00008, train_tune_6f362_00009])
我看到这里有人问过一个类似的问题,但没有得到回答(我的最终目的是使用 Ray 对一个 PyTorch 网络进行超参数优化)。
以下是运行这段代码时的完整输出和错误跟踪:
2022-08-16 15:44:08,204 WARNING function_runner.py:604 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.
2022-08-16 15:44:08,411 ERROR syncer.py:147 -- Log sync requires rsync to be installed.
== Status ==
Memory usage on this node: 16.8/86.4 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/64 CPUs, 0/0 GPUs, 0.0/62.79 GiB heap, 0.0/9.31 GiB objects
Result logdir: /root/ray_results/train_tune_2022-08-16_15-44-08
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------+----------+------------------+--------------+----------------+----------------+-------------+
| Trial name | status | loc | batch_size | layer_1_size | layer_2_size | lr |
|------------------------+----------+------------------+--------------+----------------+----------------+-------------|
| train_tune_43fd5_00000 | RUNNING | 172.17.0.2:41684 | 64 | 64 | 256 | 0.00233834 |
| train_tune_43fd5_00001 | PENDING | | 64 | 64 | 256 | 0.00155955 |
| train_tune_43fd5_00002 | PENDING | | 128 | 128 | 64 | 0.00399358 |
| train_tune_43fd5_00003 | PENDING | | 128 | 128 | 64 | 0.000184477 |
...deleted a few similar lines here
..and then there's:
(func pid=41684) 2022-08-16 15:44:10,774 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41684) Traceback (most recent call last):
(func pid=41684) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41684) self._entrypoint()
(func pid=41684) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41684) self._status_reporter.get_checkpoint(),
(func pid=41684) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41684) return method(self, *_args, **_kwargs)
(func pid=41684) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41684) output = fn()
(func pid=41684) File "example_hpo_working.py", line 76, in train_tune
(func pid=41684) model = LightningMNISTClassifier(config)
(func pid=41684) NameError: name 'LightningMNISTClassifier' is not defined
2022-08-16 15:44:10,977 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00000: Error processing event.
NoneType: None
Result for train_tune_43fd5_00000:
date: 2022-08-16_15-44-10
experiment_id: c8977e85cbf84a9badff15fb2de6f516
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41684
timestamp: 1660664650
trial_id: 43fd5_00000
(func pid=41722) 2022-08-16 15:44:13,241 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41722) Traceback (most recent call last):
(func pid=41722) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41722) self._entrypoint()
(func pid=41722) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41722) self._status_reporter.get_checkpoint(),
(func pid=41722) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41722) return method(self, *_args, **_kwargs)
(func pid=41722) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41722) output = fn()
(func pid=41722) File "example_hpo_working.py", line 76, in train_tune
(func pid=41722) model = LightningMNISTClassifier(config)
(func pid=41722) NameError: name 'LightningMNISTClassifier' is not defined
(func pid=41720) 2022-08-16 15:44:13,253 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41720) Traceback (most recent call last):
(func pid=41720) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41720) self._entrypoint()
(func pid=41720) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41720) self._status_reporter.get_checkpoint(),
(func pid=41720) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41720) return method(self, *_args, **_kwargs)
(func pid=41720) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41720) output = fn()
(func pid=41720) File "example_hpo_working.py", line 76, in train_tune
(func pid=41720) model = LightningMNISTClassifier(config)
(func pid=41720) NameError: name 'LightningMNISTClassifier' is not defined
(func pid=41718) 2022-08-16 15:44:13,253 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41718) Traceback (most recent call last):
(func pid=41718) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41718) self._entrypoint()
(func pid=41718) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41718) self._status_reporter.get_checkpoint(),
(func pid=41718) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41718) return method(self, *_args, **_kwargs)
(func pid=41718) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41718) output = fn()
(func pid=41718) File "example_hpo_working.py", line 76, in train_tune
(func pid=41718) model = LightningMNISTClassifier(config)
(func pid=41718) NameError: name 'LightningMNISTClassifier' is not defined
(func pid=41734) 2022-08-16 15:44:13,340 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41734) Traceback (most recent call last):
(func pid=41734) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41734) self._entrypoint()
(func pid=41734) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41734) self._status_reporter.get_checkpoint(),
(func pid=41734) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41734) return method(self, *_args, **_kwargs)
(func pid=41734) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41734) output = fn()
(func pid=41734) File "example_hpo_working.py", line 76, in train_tune
(func pid=41734) model = LightningMNISTClassifier(config)
(func pid=41734) NameError: name 'LightningMNISTClassifier' is not defined
(func pid=41732) 2022-08-16 15:44:13,325 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41732) Traceback (most recent call last):
(func pid=41732) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41732) self._entrypoint()
(func pid=41732) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41732) self._status_reporter.get_checkpoint(),
(func pid=41732) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41732) return method(self, *_args, **_kwargs)
(func pid=41732) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41732) output = fn()
(func pid=41732) File "example_hpo_working.py", line 76, in train_tune
(func pid=41732) model = LightningMNISTClassifier(config)
(func pid=41732) NameError: name 'LightningMNISTClassifier' is not defined
(func pid=41728) 2022-08-16 15:44:13,309 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41728) Traceback (most recent call last):
(func pid=41728) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41728) self._entrypoint()
(func pid=41728) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41728) self._status_reporter.get_checkpoint(),
(func pid=41728) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41728) return method(self, *_args, **_kwargs)
(func pid=41728) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41728) output = fn()
(func pid=41728) File "example_hpo_working.py", line 76, in train_tune
(func pid=41728) model = LightningMNISTClassifier(config)
(func pid=41728) NameError: name 'LightningMNISTClassifier' is not defined
(func pid=41730) 2022-08-16 15:44:13,272 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41730) Traceback (most recent call last):
(func pid=41730) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41730) self._entrypoint()
(func pid=41730) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41730) self._status_reporter.get_checkpoint(),
(func pid=41730) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41730) return method(self, *_args, **_kwargs)
(func pid=41730) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41730) output = fn()
(func pid=41730) File "example_hpo_working.py", line 76, in train_tune
(func pid=41730) model = LightningMNISTClassifier(config)
(func pid=41730) NameError: name 'LightningMNISTClassifier' is not defined
2022-08-16 15:44:13,444 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00003: Error processing event.
NoneType: None
Result for train_tune_43fd5_00003:
date: 2022-08-16_15-44-13
experiment_id: 02204d81b72943e3bbfcc822d35f02a0
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41722
timestamp: 1660664653
trial_id: 43fd5_00003
(func pid=41724) 2022-08-16 15:44:13,457 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41724) Traceback (most recent call last):
(func pid=41724) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41724) self._entrypoint()
(func pid=41724) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41724) self._status_reporter.get_checkpoint(),
(func pid=41724) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41724) return method(self, *_args, **_kwargs)
(func pid=41724) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41724) output = fn()
(func pid=41724) File "example_hpo_working.py", line 76, in train_tune
(func pid=41724) model = LightningMNISTClassifier(config)
(func pid=41724) NameError: name 'LightningMNISTClassifier' is not defined
== Status ==
Current time: 2022-08-16 15:44:13 (running for 00:00:05.24)
Memory usage on this node: 17.6/86.4 GiB
Using FIFO scheduling algorithm.
Resources requested: 8.0/64 CPUs, 0/0 GPUs, 0.0/62.79 GiB heap, 0.0/9.31 GiB objects
Result logdir: /root/ray_results/train_tune_2022-08-16_15-44-08
Number of trials: 10/10 (2 ERROR, 8 RUNNING)
+------------------------+----------+------------------+--------------+----------------+----------------+-------------+
| Trial name | status | loc | batch_size | layer_1_size | layer_2_size | lr |
|------------------------+----------+------------------+--------------+----------------+----------------+-------------|
| train_tune_43fd5_00001 | RUNNING | 172.17.0.2:41718 | 64 | 64 | 256 | 0.00155955 |
| train_tune_43fd5_00002 | RUNNING | 172.17.0.2:41720 | 128 | 128 | 64 | 0.00399358 |
| train_tune_43fd5_00004 | RUNNING | 172.17.0.2:41724 | 128 | 64 | 128 | 0.0221855 |
| train_tune_43fd5_00005 | RUNNING | 172.17.0.2:41726 | 64 | 128 | 128 | 0.00041038 |
| train_tune_43fd5_00006 | RUNNING | 172.17.0.2:41728 | 64 | 64 | 256 | 0.0105243 |
| train_tune_43fd5_00007 | RUNNING | 172.17.0.2:41730 | 128 | 32 | 256 | 0.000929454 |
| train_tune_43fd5_00008 | RUNNING | 172.17.0.2:41732 | 64 | 64 | 128 | 0.00176483 |
| train_tune_43fd5_00009 | RUNNING | 172.17.0.2:41734 | 128 | 32 | 256 | 0.000113077 |
| train_tune_43fd5_00000 | ERROR | 172.17.0.2:41684 | 64 | 64 | 256 | 0.00233834 |
| train_tune_43fd5_00003 | ERROR | 172.17.0.2:41722 | 128 | 128 | 64 | 0.000184477 |
+------------------------+----------+------------------+--------------+----------------+----------------+-------------+
Number of errored trials: 2
+------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| train_tune_43fd5_00000 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00000_0_batch_size=64,layer_1_size=64,layer_2_size=256,lr=0.0023_2022-08-16_15-44-08/error.txt |
| train_tune_43fd5_00003 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00003_3_batch_size=128,layer_1_size=128,layer_2_size=64,lr=0.0002_2022-08-16_15-44-10/error.txt |
+------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
2022-08-16 15:44:13,487 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00001: Error processing event.
NoneType: None
Result for train_tune_43fd5_00001:
date: 2022-08-16_15-44-13
experiment_id: e738348e77c64919931d70c916cbfaf8
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41718
timestamp: 1660664653
trial_id: 43fd5_00001
2022-08-16 15:44:13,490 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00007: Error processing event.
NoneType: None
Result for train_tune_43fd5_00007:
date: 2022-08-16_15-44-13
experiment_id: f79be7b9e98a43f1a41893071c4e1f6b
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41730
timestamp: 1660664653
trial_id: 43fd5_00007
2022-08-16 15:44:13,493 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00002: Error processing event.
NoneType: None
Result for train_tune_43fd5_00002:
date: 2022-08-16_15-44-13
experiment_id: 8e7422287e3e44f9b2e7b249a8ae18cd
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41720
timestamp: 1660664653
trial_id: 43fd5_00002
2022-08-16 15:44:13,512 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00006: Error processing event.
NoneType: None
Result for train_tune_43fd5_00006:
date: 2022-08-16_15-44-13
experiment_id: 2d56b152a6a34e1f9e26dad1aec25d00
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41728
timestamp: 1660664653
trial_id: 43fd5_00006
2022-08-16 15:44:13,527 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00008: Error processing event.
NoneType: None
Result for train_tune_43fd5_00008:
date: 2022-08-16_15-44-13
experiment_id: b2158026b3b947bfbb9c3da4e6f7b977
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41732
timestamp: 1660664653
trial_id: 43fd5_00008
2022-08-16 15:44:13,543 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00009: Error processing event.
NoneType: None
Result for train_tune_43fd5_00009:
date: 2022-08-16_15-44-13
experiment_id: 6b5a73f09241440085bd6c09f6f681e9
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41734
timestamp: 1660664653
trial_id: 43fd5_00009
(func pid=41726) 2022-08-16 15:44:13,484 ERROR function_runner.py:286 -- Runner Thread raised error.
(func pid=41726) Traceback (most recent call last):
(func pid=41726) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 277, in run
(func pid=41726) self._entrypoint()
(func pid=41726) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 352, in entrypoint
(func pid=41726) self._status_reporter.get_checkpoint(),
(func pid=41726) File "/root/miniconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(func pid=41726) return method(self, *_args, **_kwargs)
(func pid=41726) File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
(func pid=41726) output = fn()
(func pid=41726) File "example_hpo_working.py", line 76, in train_tune
(func pid=41726) model = LightningMNISTClassifier(config)
(func pid=41726) NameError: name 'LightningMNISTClassifier' is not defined
2022-08-16 15:44:13,660 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00004: Error processing event.
NoneType: None
Result for train_tune_43fd5_00004:
date: 2022-08-16_15-44-13
experiment_id: 60f51e072c7942bdb5d9298e0e147555
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41724
timestamp: 1660664653
trial_id: 43fd5_00004
2022-08-16 15:44:13,687 ERROR trial_runner.py:886 -- Trial train_tune_43fd5_00005: Error processing event.
NoneType: None
Result for train_tune_43fd5_00005:
date: 2022-08-16_15-44-13
experiment_id: 79701d1c19ac4c55b5a73746c1872724
hostname: 0e26c6a24ffa
node_ip: 172.17.0.2
pid: 41726
timestamp: 1660664653
trial_id: 43fd5_00005
== Status ==
Current time: 2022-08-16 15:44:13 (running for 00:00:05.46)
Memory usage on this node: 16.4/86.4 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/64 CPUs, 0/0 GPUs, 0.0/62.79 GiB heap, 0.0/9.31 GiB objects
Result logdir: /root/ray_results/train_tune_2022-08-16_15-44-08
Number of trials: 10/10 (10 ERROR)
+------------------------+----------+------------------+--------------+----------------+----------------+-------------+
| Trial name | status | loc | batch_size | layer_1_size | layer_2_size | lr |
|------------------------+----------+------------------+--------------+----------------+----------------+-------------|
| train_tune_43fd5_00000 | ERROR | 172.17.0.2:41684 | 64 | 64 | 256 | 0.00233834 |
| train_tune_43fd5_00001 | ERROR | 172.17.0.2:41718 | 64 | 64 | 256 | 0.00155955 |
| train_tune_43fd5_00002 | ERROR | 172.17.0.2:41720 | 128 | 128 | 64 | 0.00399358 |
| train_tune_43fd5_00003 | ERROR | 172.17.0.2:41722 | 128 | 128 | 64 | 0.000184477 |
| train_tune_43fd5_00004 | ERROR | 172.17.0.2:41724 | 128 | 64 | 128 | 0.0221855 |
| train_tune_43fd5_00005 | ERROR | 172.17.0.2:41726 | 64 | 128 | 128 | 0.00041038 |
| train_tune_43fd5_00006 | ERROR | 172.17.0.2:41728 | 64 | 64 | 256 | 0.0105243 |
| train_tune_43fd5_00007 | ERROR | 172.17.0.2:41730 | 128 | 32 | 256 | 0.000929454 |
| train_tune_43fd5_00008 | ERROR | 172.17.0.2:41732 | 64 | 64 | 128 | 0.00176483 |
| train_tune_43fd5_00009 | ERROR | 172.17.0.2:41734 | 128 | 32 | 256 | 0.000113077 |
+------------------------+----------+------------------+--------------+----------------+----------------+-------------+
Number of errored trials: 10
+------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| train_tune_43fd5_00000 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00000_0_batch_size=64,layer_1_size=64,layer_2_size=256,lr=0.0023_2022-08-16_15-44-08/error.txt |
| train_tune_43fd5_00001 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00001_1_batch_size=64,layer_1_size=64,layer_2_size=256,lr=0.0016_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00002 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00002_2_batch_size=128,layer_1_size=128,layer_2_size=64,lr=0.0040_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00003 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00003_3_batch_size=128,layer_1_size=128,layer_2_size=64,lr=0.0002_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00004 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00004_4_batch_size=128,layer_1_size=64,layer_2_size=128,lr=0.0222_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00005 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00005_5_batch_size=64,layer_1_size=128,layer_2_size=128,lr=0.0004_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00006 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00006_6_batch_size=64,layer_1_size=64,layer_2_size=256,lr=0.0105_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00007 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00007_7_batch_size=128,layer_1_size=32,layer_2_size=256,lr=0.0009_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00008 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00008_8_batch_size=64,layer_1_size=64,layer_2_size=128,lr=0.0018_2022-08-16_15-44-10/error.txt |
| train_tune_43fd5_00009 | 1 | /root/ray_results/train_tune_2022-08-16_15-44-08/train_tune_43fd5_00009_9_batch_size=128,layer_1_size=32,layer_2_size=256,lr=0.0001_2022-08-16_15-44-10/error.txt |
+------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Traceback (most recent call last):
File "example_hpo_working.py", line 89, in <module>
num_samples=10)
File "/root/miniconda3/lib/python3.7/site-packages/ray/tune/tune.py", line 741, in run
raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [train_tune_43fd5_00000, train_tune_43fd5_00001, train_tune_43fd5_00002, train_tune_43fd5_00003, train_tune_43fd5_00004, train_tune_43fd5_00005, train_tune_43fd5_00006, train_tune_43fd5_00007, train_tune_43fd5_00008, train_tune_43fd5_00009])
发布于 2022-08-16 22:39:27
我相信你的代码中有一个错误:
def __init__(self, config):
super(LightningMNISTClassifier, self).__init__()
self.layer_1_size = config["layer_1_size"]
self.layer_2_size = config["layer_2_size"]
self.lr = config["lr"]
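self.batch_size = config["batch_size"]
这段 __init__ 没有写在任何类定义之内,因此 LightningMNISTClassifier 这个名字从未被定义,这正是日志中 NameError 的原因。您需要正确地定义 LightningMNISTClassifier 类(例如让它继承 pl.LightningModule,并把 __init__ 放在类体中)。也许可以试着参考这个例子?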
请注意,您可能需要安装Ray的最新版本。
https://stackoverflow.com/questions/73374386
复制相似问题