
Commit f11232b

Authored Nov 17, 2024
Merge branch 'master' into feature/comet-logger-update
2 parents: 4ba9bf2 + c110f4f

File tree: 23 files changed, +760 −102 lines

 

‎.github/checkgroup.yml

+25-24
@@ -19,7 +19,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "pl-cpu (macOS-13, lightning, 3.9, 2.1, oldest)"
+      - "pl-cpu (macOS-14, lightning, 3.9, 2.1, oldest)"
       - "pl-cpu (macOS-14, lightning, 3.10, 2.1)"
       - "pl-cpu (macOS-14, lightning, 3.11, 2.2.2)"
       - "pl-cpu (macOS-14, lightning, 3.11, 2.3)"
@@ -40,9 +40,9 @@ subprojects:
       - "pl-cpu (macOS-14, pytorch, 3.9, 2.1)"
       - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 2.1)"
       - "pl-cpu (windows-2022, pytorch, 3.9, 2.1)"
-      - "pl-cpu (macOS-13, pytorch, 3.10, 2.1)"
-      - "pl-cpu (ubuntu-22.04, pytorch, 3.10, 2.1)"
-      - "pl-cpu (windows-2022, pytorch, 3.10, 2.1)"
+      - "pl-cpu (macOS-14, pytorch, 3.12, 2.5.1)"
+      - "pl-cpu (ubuntu-22.04, pytorch, 3.12, 2.5.1)"
+      - "pl-cpu (windows-2022, pytorch, 3.12, 2.5.1)"
 
   - id: "pytorch_lightning: Azure GPU"
     paths:
@@ -89,14 +89,15 @@ subprojects:
     checks:
       - "lightning.Benchmarks"
 
-  - id: "pytorch-lightning: TPU workflow"
-    paths:
-      # tpu CI availability is very limited, so we only require tpu tests
-      # to pass when their configurations are modified
-      - ".github/workflows/tpu-tests.yml"
-      - "tests/tests_pytorch/run_tpu_tests.sh"
-    checks:
-      - "test-on-tpus (pytorch, pjrt, v4-8)"
+  # Temporarily disabled
+  # - id: "pytorch-lightning: TPU workflow"
+  #   paths:
+  #     # tpu CI availability is very limited, so we only require tpu tests
+  #     # to pass when their configurations are modified
+  #     - ".github/workflows/tpu-tests.yml"
+  #     - "tests/tests_pytorch/run_tpu_tests.sh"
+  #   checks:
+  #     - "test-on-tpus (pytorch, pjrt, v4-8)"
 
   - id: "fabric: Docs"
     paths:
@@ -171,7 +172,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "fabric-cpu (macOS-13, lightning, 3.9, 2.1, oldest)"
+      - "fabric-cpu (macOS-14, lightning, 3.9, 2.1, oldest)"
       - "fabric-cpu (macOS-14, lightning, 3.10, 2.1)"
       - "fabric-cpu (macOS-14, lightning, 3.11, 2.2.2)"
       - "fabric-cpu (macOS-14, lightning, 3.11, 2.3)"
@@ -192,9 +193,9 @@ subprojects:
       - "fabric-cpu (macOS-14, fabric, 3.9, 2.1)"
      - "fabric-cpu (ubuntu-20.04, fabric, 3.9, 2.1)"
       - "fabric-cpu (windows-2022, fabric, 3.9, 2.1)"
-      - "fabric-cpu (macOS-13, fabric, 3.10, 2.1)"
-      - "fabric-cpu (ubuntu-22.04, fabric, 3.10, 2.1)"
-      - "fabric-cpu (windows-2022, fabric, 3.10, 2.1)"
+      - "fabric-cpu (macOS-14, fabric, 3.12, 2.5.1)"
+      - "fabric-cpu (ubuntu-22.04, fabric, 3.12, 2.5.1)"
+      - "fabric-cpu (windows-2022, fabric, 3.12, 2.5.1)"
 
   - id: "lightning_fabric: Azure GPU"
     paths:
@@ -266,14 +267,14 @@ subprojects:
       - "install-pkg (ubuntu-22.04, lightning, 3.11)"
       - "install-pkg (ubuntu-22.04, notset, 3.9)"
       - "install-pkg (ubuntu-22.04, notset, 3.11)"
-      - "install-pkg (macOS-13, fabric, 3.9)"
-      - "install-pkg (macOS-13, fabric, 3.11)"
-      - "install-pkg (macOS-13, pytorch, 3.9)"
-      - "install-pkg (macOS-13, pytorch, 3.11)"
-      - "install-pkg (macOS-13, lightning, 3.9)"
-      - "install-pkg (macOS-13, lightning, 3.11)"
-      - "install-pkg (macOS-13, notset, 3.9)"
-      - "install-pkg (macOS-13, notset, 3.11)"
+      - "install-pkg (macOS-14, fabric, 3.9)"
+      - "install-pkg (macOS-14, fabric, 3.11)"
+      - "install-pkg (macOS-14, pytorch, 3.9)"
+      - "install-pkg (macOS-14, pytorch, 3.11)"
+      - "install-pkg (macOS-14, lightning, 3.9)"
+      - "install-pkg (macOS-14, lightning, 3.11)"
+      - "install-pkg (macOS-14, notset, 3.9)"
+      - "install-pkg (macOS-14, notset, 3.11)"
       - "install-pkg (windows-2022, fabric, 3.9)"
       - "install-pkg (windows-2022, fabric, 3.11)"
       - "install-pkg (windows-2022, pytorch, 3.9)"

‎.github/workflows/ci-pkg-install.yml

+1-1
@@ -42,7 +42,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ["ubuntu-22.04", "macOS-13", "windows-2022"]
+        os: ["ubuntu-22.04", "macOS-14", "windows-2022"]
         pkg-name: ["fabric", "pytorch", "lightning", "notset"]
         python-version: ["3.9", "3.11"]
     steps:

‎.github/workflows/ci-tests-fabric.yml

+8-5
@@ -56,11 +56,11 @@ jobs:
           - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
           # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues
-          - { os: "macOS-13", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.1" }
-          - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.1" }
-          - { os: "windows-2022", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.1" }
+          - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.5.1" }
+          - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.5.1" }
+          - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.5.1" }
           # "oldest" versions tests, only on minimum Python
-          - { os: "macOS-13", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" }
           - {
               os: "ubuntu-20.04",
               pkg-name: "lightning",
@@ -101,7 +101,10 @@ jobs:
 
       - name: Set min. dependencies
         if: ${{ matrix.requires == 'oldest' }}
-        run: python .actions/assistant.py replace_oldest_ver
+        run: |
+          python .actions/assistant.py replace_oldest_ver
+          pip install "cython<3.0" wheel
+          pip install "pyyaml==5.4" --no-build-isolation
 
       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.requires != 'oldest' }}

‎.github/workflows/ci-tests-pytorch.yml

+8-5
@@ -60,11 +60,11 @@ jobs:
           - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
           # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues
-          - { os: "macOS-13", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.1" }
-          - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.1" }
-          - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.1" }
+          - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.5.1" }
+          - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.5.1" }
+          - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.5.1" }
           # "oldest" versions tests, only on minimum Python
-          - { os: "macOS-13", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" }
           - {
               os: "ubuntu-20.04",
               pkg-name: "lightning",
@@ -106,7 +106,10 @@ jobs:
 
       - name: Set min. dependencies
         if: ${{ matrix.requires == 'oldest' }}
-        run: python .actions/assistant.py replace_oldest_ver
+        run: |
+          python .actions/assistant.py replace_oldest_ver
+          pip install "cython<3.0" wheel
+          pip install "pyyaml==5.4" --no-build-isolation
 
       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.requires != 'oldest' }}

‎examples/fabric/build_your_own_trainer/run.py

+2-1
@@ -41,7 +41,8 @@ def training_step(self, batch, batch_idx: int):
 
     def configure_optimizers(self):
         optim = torch.optim.Adam(self.parameters(), lr=1e-4)
-        return optim, {
+        return {
+            "optimizer": optim,
             "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="max", verbose=True),
             "monitor": "val_accuracy",
             "interval": "epoch",

‎src/lightning/fabric/accelerators/cpu.py

+4-4
@@ -39,13 +39,13 @@ def teardown(self) -> None:
 
     @staticmethod
     @override
-    def parse_devices(devices: Union[int, str, List[int]]) -> int:
+    def parse_devices(devices: Union[int, str]) -> int:
         """Accelerator device parsing logic."""
         return _parse_cpu_cores(devices)
 
     @staticmethod
     @override
-    def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]:
+    def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]:
         """Gets parallel devices for the Accelerator."""
         devices = _parse_cpu_cores(devices)
         return [torch.device("cpu")] * devices
@@ -72,12 +72,12 @@ def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None:
     )
 
 
-def _parse_cpu_cores(cpu_cores: Union[int, str, List[int]]) -> int:
+def _parse_cpu_cores(cpu_cores: Union[int, str]) -> int:
     """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the
     :class:`~lightning.pytorch.trainer.trainer.Trainer`.
 
     Args:
-        cpu_cores: An int > 0.
+        cpu_cores: An int > 0 or a string that can be converted to an int > 0.
 
     Returns:
         An int representing the number of processes
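
The change above narrows the accepted `devices` values for the CPU accelerator to int or str. As a rough illustration of the documented contract (an int > 0, or a string convertible to one), here is a standalone sketch that mimics what a parser like _parse_cpu_cores has to enforce; it is not the library's implementation.

# Standalone sketch of the narrowed Union[int, str] contract; the real parser
# lives in src/lightning/fabric/accelerators/cpu.py.
from typing import Union


def parse_cpu_cores(cpu_cores: Union[int, str]) -> int:
    if isinstance(cpu_cores, str) and cpu_cores.strip().isdigit():
        cpu_cores = int(cpu_cores)
    if not isinstance(cpu_cores, int) or cpu_cores <= 0:
        raise TypeError("`devices` selected with `CPUAccelerator` should be an int > 0")
    return cpu_cores


assert parse_cpu_cores(4) == 4
assert parse_cpu_cores("8") == 8  # strings that parse to a positive int are accepted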

‎src/lightning/fabric/utilities/throughput.py

+8
@@ -347,6 +347,14 @@ def measure_flops(
         torch.int8: 389.9e12,
         "int4": 779.8e12,
     },
+    "rtx 4080 super": {
+        torch.float32: 52.2e12,
+        "tfloat32": 52.2e12,
+        torch.bfloat16: 52.2e12,
+        torch.float16: 52.2e12,
+        torch.int8: 417.6e12,
+        "int4": 835.2e12,
+    },
     "l4": {
         torch.float32: 30.3e12,
         "tfloat32": 60e12,

‎src/lightning/pytorch/accelerators/cpu.py

+2-2
@@ -48,13 +48,13 @@ def teardown(self) -> None:
 
     @staticmethod
     @override
-    def parse_devices(devices: Union[int, str, List[int]]) -> int:
+    def parse_devices(devices: Union[int, str]) -> int:
         """Accelerator device parsing logic."""
         return _parse_cpu_cores(devices)
 
     @staticmethod
     @override
-    def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]:
+    def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]:
         """Gets parallel devices for the Accelerator."""
         devices = _parse_cpu_cores(devices)
         return [torch.device("cpu")] * devices

‎src/lightning/pytorch/loggers/mlflow.py

+1-1
@@ -299,7 +299,7 @@ def save_dir(self) -> Optional[str]:
 
         """
         if self._tracking_uri.startswith(LOCAL_FILE_URI_PREFIX):
-            return self._tracking_uri.lstrip(LOCAL_FILE_URI_PREFIX)
+            return self._tracking_uri[len(LOCAL_FILE_URI_PREFIX) :]
         return None
 
     @property
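
The one-line fix above replaces str.lstrip with slicing because lstrip(prefix) strips any leading characters contained in the argument (treated as a character set), not the literal prefix. A short standalone demonstration of the difference, using a made-up tracking URI:

# str.lstrip(chars) removes leading characters from the set `chars`, so a path
# that happens to start with letters from "file:" gets over-trimmed; slicing by
# the prefix length removes exactly the prefix.
LOCAL_FILE_URI_PREFIX = "file:"

uri = "file:filters/mlruns"  # example URI; the path intentionally starts with "fil"
print(uri.lstrip(LOCAL_FILE_URI_PREFIX))  # "ters/mlruns"    (wrong)
print(uri[len(LOCAL_FILE_URI_PREFIX):])   # "filters/mlruns" (correct)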

‎src/lightning/pytorch/loops/evaluation_loop.py

+37-1
@@ -15,6 +15,7 @@
 import shutil
 import sys
 from collections import ChainMap, OrderedDict, defaultdict
+from dataclasses import dataclass
 from typing import Any, DefaultDict, Iterable, Iterator, List, Optional, Tuple, Union
 
 from lightning_utilities.core.apply_func import apply_to_collection
@@ -45,6 +46,12 @@
 from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature
 
 
+@dataclass
+class RestartStage:
+    NONE = "none"
+    RESTARTED_MID_EVALUATION = "restarted_mid_evaluation"
+
+
 class _EvaluationLoop(_Loop):
     """Top-level loop where validation/testing starts."""
 
@@ -73,6 +80,7 @@ def __init__(
         self._seen_batches_per_dataloader: DefaultDict[int, int] = defaultdict(int)
         self._last_val_dl_reload_epoch = float("-inf")
         self._module_mode = _ModuleMode()
+        self._restart_stage = RestartStage.NONE
 
     @property
     def num_dataloaders(self) -> int:
@@ -137,7 +145,7 @@ def run(self) -> List[_OUT_DICT]:
                 # this needs to wrap the `*_step` call too (not just `next`) for `dataloader_iter` support
                 break
             finally:
-                self._restarting = False
+                self.on_iteration_done()
         self._store_dataloader_outputs()
         return self.on_run_end()
 
@@ -197,6 +205,24 @@ def setup_data(self) -> None:
         # this depends on the data used, so reset it too
         self._seen_batches_per_dataloader = defaultdict(int)
 
+    @property
+    def restarted_mid_evaluation(self) -> bool:
+        return self._restart_stage == RestartStage.RESTARTED_MID_EVALUATION
+
+    def update_restart_stage(self) -> None:
+        if (
+            self.restarting
+            and self.batch_progress.total.started == self.batch_progress.total.ready
+            and self.batch_progress.total.processed == self.batch_progress.total.started - 1
+            and self.batch_progress.total.completed == self.batch_progress.total.processed
+        ):
+            self._restart_stage = RestartStage.RESTARTED_MID_EVALUATION
+        else:
+            self._restart_stage = RestartStage.NONE
+
+    def reset_restart_stage(self) -> None:
+        self._restart_stage = RestartStage.NONE
+
     def reset(self) -> None:
         """Resets the internal state of the loop."""
         trainer = self.trainer
@@ -236,6 +262,16 @@ def reset(self) -> None:
             data_fetcher._stop_profiler = self._on_after_fetch
         self._data_fetcher = data_fetcher
 
+    def increment_progress_to_evaluation_end(self) -> None:
+        self.setup_data()
+        if self.skip:
+            return
+        self.reset()
+        max_batch = int(max(self.max_batches))
+        if max_batch == -1:
+            return
+        self.batch_progress.increment_by(max_batch, True)
+
     def on_run_start(self) -> None:
         """Runs the ``_on_evaluation_model_eval``, ``_on_evaluation_start`` and ``_on_evaluation_epoch_start``
         hooks."""
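
To make the counter arithmetic in update_restart_stage above easier to follow: a checkpoint written mid-evaluation leaves exactly one batch that was fetched and started but never processed or completed. A toy model of that invariant (not Lightning's BatchProgress class) is sketched below.

# Toy model of the progress counters checked by update_restart_stage; a restart
# is "mid-evaluation" when one batch was started but not processed/completed.
from dataclasses import dataclass


@dataclass
class Totals:
    ready: int = 0
    started: int = 0
    processed: int = 0
    completed: int = 0


def restarted_mid_evaluation(total: Totals, restarting: bool) -> bool:
    return (
        restarting
        and total.started == total.ready
        and total.processed == total.started - 1
        and total.completed == total.processed
    )


# e.g. the fifth batch had started when the run was interrupted:
print(restarted_mid_evaluation(Totals(ready=5, started=5, processed=4, completed=4), restarting=True))  # True
print(restarted_mid_evaluation(Totals(ready=5, started=5, processed=5, completed=5), restarting=True))  # False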
