Merge branch 'main' into sekyondaMeta-Deadlink-update-1
svekars authored Nov 15, 2024
2 parents 93d9713 + 69475d6 commit c76db51
Showing 7 changed files with 816 additions and 5 deletions.
5 changes: 4 additions & 1 deletion .jenkins/metadata.json
@@ -33,7 +33,7 @@
},
"recipes_source/torch_export_aoti_python.py": {
"needs": "linux.g5.4xlarge.nvidia.gpu"
},
},
"advanced_source/pendulum.py": {
"needs": "linux.g5.4xlarge.nvidia.gpu",
"_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run."
@@ -58,6 +58,9 @@
"intermediate_source/scaled_dot_product_attention_tutorial.py": {
"needs": "linux.g5.4xlarge.nvidia.gpu"
},
"intermediate_source/transformer_building_blocks.py": {
"needs": "linux.g5.4xlarge.nvidia.gpu"
},
"recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": {
"needs": "linux.g5.4xlarge.nvidia.gpu"
},
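Each entry in `.jenkins/metadata.json` maps a tutorial source file to the CI runner it needs, which is how the new `transformer_building_blocks.py` tutorial gets scheduled onto a GPU machine. A minimal sketch of how a build script might consume these entries is below; the lookup helper and the default runner name are assumptions for illustration, not the repository's actual build code.

```python
# Sketch (assumption): how CI tooling might read .jenkins/metadata.json to pick
# a runner for each tutorial. The default runner name below is hypothetical.
import json

with open(".jenkins/metadata.json") as f:
    metadata = json.load(f)

entry = metadata.get("intermediate_source/transformer_building_blocks.py", {})
runner = entry.get("needs", "linux.2xlarge")  # fall back to a default CPU runner
print(runner)  # -> linux.g5.4xlarge.nvidia.gpu
```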
1 change: 1 addition & 0 deletions .jenkins/validate_tutorials_built.py
@@ -25,6 +25,7 @@
"intermediate_source/mnist_train_nas", # used by ax_multiobjective_nas_tutorial.py
"intermediate_source/fx_conv_bn_fuser",
"intermediate_source/_torch_export_nightly_tutorial", # does not work on release
"intermediate_source/transformer_building_blocks", # does not work on release
"advanced_source/super_resolution_with_onnxruntime",
"advanced_source/usb_semisup_learn", # fails with CUDA OOM error, should try on a different worker
"prototype_source/fx_graph_mode_ptq_dynamic",
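The entry added above keeps `transformer_building_blocks` on the list of tutorials that are built but not executed for the release branch. A rough sketch of how such a skip list might be applied during validation follows; the `NOT_RUN` name and helper are illustrative assumptions rather than the file's exact logic.

```python
# Sketch (assumption): applying a skip list like the one in
# validate_tutorials_built.py; names and logic here are illustrative only.
NOT_RUN = [
    "intermediate_source/transformer_building_blocks",  # does not work on release
]

def should_have_executed(tutorial_stem: str) -> bool:
    """Return True if the built tutorial is expected to contain executed output."""
    return tutorial_stem not in NOT_RUN

print(should_have_executed("intermediate_source/transformer_building_blocks"))  # False
print(should_have_executed("beginner_source/basics/intro"))  # True
```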
2 changes: 1 addition & 1 deletion beginner_source/ddp_series_intro.rst
@@ -7,7 +7,7 @@
Distributed Data Parallel in PyTorch - Video Tutorials
======================================================

Authors: `Suraj Subramanian <https://github.com/suraj813>`__
Authors: `Suraj Subramanian <https://github.com/subramen>`__

Follow along with the video below or on `youtube <https://www.youtube.com/watch/-K3bZYHYHEA>`__.

19 changes: 19 additions & 0 deletions en-wordlist.txt
@@ -1,5 +1,6 @@
ACL
ADI
ALiBi
AOT
AOTInductor
APIs
@@ -79,6 +80,7 @@ FX
FX's
FairSeq
Fastpath
FFN
FloydHub
FloydHub's
Frobenius
@@ -127,6 +129,7 @@ Kihyuk
Kiuk
Kubernetes
Kuei
KV
LRSchedulers
LSTM
LSTMs
@@ -162,6 +165,7 @@ NLP
NTK
NUMA
NaN
NaNs
NanoGPT
Netron
NeurIPS
@@ -231,6 +235,7 @@ Sigmoid
SoTA
Sohn
Spacy
SwiGLU
TCP
THP
TIAToolbox
@@ -276,6 +281,7 @@ Xcode
Xeon
Yidong
YouTube
Zipf
accelerometer
accuracies
activations
@@ -305,6 +311,7 @@ bbAP
benchmarked
benchmarking
bitwise
bool
boolean
breakpoint
broadcasted
@@ -333,6 +340,7 @@ csv
cuDNN
cuda
customizable
customizations
datafile
dataflow
dataframe
@@ -377,6 +385,7 @@ fbgemm
feedforward
finetune
finetuning
FlexAttention
fp
frontend
functionalized
@@ -431,6 +440,7 @@ mAP
macos
manualSeed
matmul
matmuls
matplotlib
memcpy
memset
@@ -446,6 +456,7 @@ modularized
mpp
mucosa
multihead
MultiheadAttention
multimodal
multimodality
multinode
@@ -456,7 +467,11 @@ multithreading
namespace
natively
ndarrays
nheads
nightlies
NJT
NJTs
NJT's
num
numericalize
numpy
@@ -532,6 +547,7 @@ runtime
runtime
runtimes
scalable
SDPA
sharded
softmax
sparsified
@@ -591,12 +607,14 @@ tradeoff
tradeoffs
triton
uint
UX
umap
uncomment
uncommented
underflowing
unfused
unimodal
unigram
unnormalized
unoptimized
unparametrized
@@ -618,6 +636,7 @@ warmstarted
warmstarting
warmup
webp
wikitext
wsi
wsis
Meta's
8 changes: 8 additions & 0 deletions index.rst
@@ -664,6 +664,14 @@ Welcome to PyTorch Tutorials
:link: beginner/knowledge_distillation_tutorial.html
:tags: Model-Optimization,Image/Video


.. customcarditem::
:header: Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile()
:card_description: This tutorial goes over recommended best practices for implementing Transformers with native PyTorch.
:image: _static/img/thumbnails/cropped/pytorch-logo.png
:link: intermediate/transformer_building_blocks.html
:tags: Transformer
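
The card above points to the new nested-tensor tutorial. As a loose illustration of the data structure it is built around, here is a small sketch of a jagged nested tensor holding variable-length sequences; this is an assumption-level example for recent PyTorch releases with NJT support, not an excerpt from the tutorial.

```python
import torch

# Illustrative sketch (not from the tutorial): a "jagged" nested tensor packs
# variable-length sequences without padding, which is what the tutorial's
# transformer layers operate on.
seqs = [torch.randn(3, 8), torch.randn(5, 8)]  # two sequences, lengths 3 and 5
njt = torch.nested.nested_tensor(seqs, layout=torch.jagged)

print(njt.is_nested)                    # True
print([t.shape for t in njt.unbind()])  # [torch.Size([3, 8]), torch.Size([5, 8])]
```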

.. Parallel-and-Distributed-Training
5 changes: 2 additions & 3 deletions intermediate_source/process_group_cpp_extension_tutorial.rst
@@ -25,9 +25,8 @@ Basics

PyTorch collective communications power several widely adopted distributed
training features, including
`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`__,
`ZeroRedundancyOptimizer <https://pytorch.org/docs/stable/distributed.optim.html#torch.distributed.optim.ZeroRedundancyOptimizer>`__,
`FullyShardedDataParallel <https://github.com/pytorch/pytorch/blob/master/torch/distributed/_fsdp/fully_sharded_data_parallel.py>`__.
`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`__ and
`ZeroRedundancyOptimizer <https://pytorch.org/docs/stable/distributed.optim.html#torch.distributed.optim.ZeroRedundancyOptimizer>`__.
In order to make the same collective communication API work with
different communication backends, the distributed package abstracts collective
communication operations into a
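Since this paragraph concerns the collective communication layer that DistributedDataParallel and ZeroRedundancyOptimizer build on, a minimal single-process sketch of that API is shown below; the `gloo` backend and world size of 1 are assumptions chosen so the snippet runs standalone, not a recommendation from the tutorial.

```python
import os
import torch
import torch.distributed as dist

# Sketch (assumption): a single-process "gloo" group, just to show the
# collective API; real jobs launch multiple ranks, e.g. with torchrun.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

t = torch.ones(4)
dist.all_reduce(t, op=dist.ReduceOp.SUM)  # sums the tensor across all ranks
print(t)  # with a single rank the values are unchanged: tensor([1., 1., 1., 1.])

dist.destroy_process_group()
```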