
Commit fa532c0

Merge pull request #139 from georgian-io/flash-attn
Flash Attention Implementation & Fuller Config Options
2 parents 30ec177 + 0f683f6 commit fa532c0

5 files changed: +102 -18 lines changed


README.md

+24

@@ -51,6 +51,30 @@ Then the second command initiates the fine-tuning process using the settings spe
 
 The configuration file is the central piece that defines the behavior of the toolkit. It is written in YAML format and consists of several sections that control different aspects of the process, such as data ingestion, model definition, training, inference, and quality assurance. We highlight some of the critical sections.
 
+#### Flash Attention 2
+
+To enable Flash Attention 2 for [supported models](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2), first install `flash-attn`:
+
+**pipx**
+
+```shell
+pipx inject llm-toolkit flash-attn --pip-args=--no-build-isolation
+```
+
+**pip**
+
+```shell
+pip install flash-attn --no-build-isolation
+```
+
+Then add it to the config file:
+
+```yaml
+model:
+  torch_dtype: "bfloat16"  # or "float16" if using an older GPU
+  attn_implementation: "flash_attention_2"
+```
+
 #### Data Ingestion
 
 An example of what the data ingestion may look like:
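For context, these two `model` keys map onto keyword arguments of Hugging Face `from_pretrained`. A minimal sketch of the equivalent plain `transformers` call, assuming a recent `transformers` release with `attn_implementation` support and `flash-attn` installed (the checkpoint name is just the example from the sample config):

```python
import torch
from transformers import AutoModelForCausalLM

# Rough equivalent of torch_dtype: "bfloat16" + attn_implementation: "flash_attention_2"
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-hf",              # example checkpoint from config.yml
    torch_dtype=torch.bfloat16,                # use torch.float16 on GPUs without bf16 support
    attn_implementation="flash_attention_2",   # requires the flash-attn package
    device_map="auto",
)
```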

config.yml

+2

@@ -24,6 +24,8 @@ data:
 # Model Definition -------------------
 model:
   hf_model_ckpt: "NousResearch/Llama-2-7b-hf"
+  torch_dtype: "bfloat16"
+  attn_implementation: "flash_attention_2"
   quantize: true
   bitsandbytes:
     load_in_4bit: true
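Flash Attention 2 requires a half-precision dtype, and bf16 in turn requires an Ampere-or-newer GPU, so the right `torch_dtype` value depends on the hardware. A small check you can run to choose between the two values (standard PyTorch API):

```python
import torch

# bf16 needs a GPU with compute capability >= 8.0 (Ampere); otherwise fall back to fp16.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    print('set model.torch_dtype to "bfloat16"')
else:
    print('set model.torch_dtype to "float16"')
```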

llmtune/finetune/lora.py

+2

@@ -74,6 +74,8 @@ def _get_model(self):
             ),
             use_cache=False,
             device_map=self.device_map,
+            torch_dtype=self._model_config.casted_torch_dtype,
+            attn_implementation=self._model_config.attn_implementation,
         )
 
         model.config.pretraining_tp = 1
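Because the sample config also enables 4-bit quantization, the fine-tuning path now combines the bitsandbytes settings with the new dtype and attention arguments. A rough standalone sketch of such a combined load, with illustrative values rather than the toolkit's exact call (the inference path below makes the analogous call through PEFT's `AutoPeftModelForCausalLM`):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # mirrors the bitsandbytes section of config.yml
    bnb_4bit_compute_dtype=torch.bfloat16,   # compute dtype for the quantized layers
)

model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-hf",
    quantization_config=bnb_config,
    use_cache=False,                         # caching is disabled during training
    device_map="auto",
    torch_dtype=torch.bfloat16,              # what ModelConfig.casted_torch_dtype resolves to
    attn_implementation="flash_attention_2",
)
```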

llmtune/inference/lora.py

+3 -7

@@ -40,17 +40,13 @@ def _get_merged_model(self, weights_path: str):
         torch.cuda.empty_cache()
 
         # Load from path
-        dtype = (
-            torch.float16
-            if self.config.training.training_args.fp16
-            else (torch.bfloat16 if self.config.training.training_args.bf16 else torch.float32)
-        )
 
         self.model = AutoPeftModelForCausalLM.from_pretrained(
             weights_path,
-            torch_dtype=dtype,
-            device_map=self.device_map,
+            torch_dtype=self.config.model.casted_torch_dtype,
             quantization_config=(BitsAndBytesConfig(**self.config.model.bitsandbytes.model_dump())),
+            device_map=self.device_map,
+            attn_implementation=self.config.model.attn_implementation,
         )
 
         """TODO: figure out multi-gpu

llmtune/pydantic_models/config_model.py

+71 -11

@@ -77,7 +77,13 @@ class ModelConfig(BaseModel):
         description="Path to the model (huggingface repo or local path)",
     )
     device_map: Optional[str] = Field("auto", description="device onto which to load the model")
+    torch_dtype: Optional[str] = Field("auto", description="torch dtype to use for model weights")
+    attn_implementation: Optional[str] = Field(
+        None,
+        description="set desired attention implementation; leave None for default. E.g. `flash_attention_2` (please ensure `torch_dtype` is either float16 or bfloat16).",
+    )
 
+    # Quantization Config
     quantize: Optional[bool] = Field(False, description="Flag to enable quantization")
     bitsandbytes: BitsAndBytesConfig = Field(None, description="Bits and Bytes configuration")
 
@@ -99,6 +105,18 @@ def set_device_map_to_none(cls, v, values, **kwargs):
             return None
         return v
 
+    @property
+    def casted_torch_dtype(self) -> Union[str, torch.dtype]:
+        if self.torch_dtype == "auto":
+            return self.torch_dtype
+
+        try:
+            torch_dtype = getattr(torch, self.torch_dtype)
+        except AttributeError:
+            raise ValueError(f"{self.torch_dtype} is not a valid torch data type")
+
+        return torch_dtype
+
 
 class LoraConfig(BaseModel):
     r: Optional[int] = Field(8, description="Lora rank")
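This new `casted_torch_dtype` property is what `finetune/lora.py` and `inference/lora.py` now pass as `torch_dtype`. A standalone mirror of its behavior (a plain function rather than the pydantic property), showing how the config string resolves to a `torch.dtype`:

```python
import torch

def casted_torch_dtype(torch_dtype: str):
    """Standalone mirror of ModelConfig.casted_torch_dtype."""
    if torch_dtype == "auto":
        return torch_dtype  # "auto" is passed through; transformers resolves it from the checkpoint
    try:
        return getattr(torch, torch_dtype)
    except AttributeError:
        raise ValueError(f"{torch_dtype} is not a valid torch data type")

print(casted_torch_dtype("bfloat16"))  # torch.bfloat16
print(casted_torch_dtype("float16"))   # torch.float16
print(casted_torch_dtype("auto"))      # auto
```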
@@ -126,7 +144,6 @@ class LoraConfig(BaseModel):
     # )
 
 
-# TODO: Get comprehensive Args!
 class TrainingArgs(BaseModel):
     num_train_epochs: Optional[int] = Field(1, description="Number of training epochs")
     per_device_train_batch_size: Optional[int] = Field(1, description="Batch size per training device")
@@ -141,9 +158,12 @@ class TrainingArgs(BaseModel):
     max_grad_norm: Optional[float] = Field(0.3, description="Maximum gradient norm")
     warmup_ratio: Optional[float] = Field(0.03, description="Warmup ratio")
     lr_scheduler_type: Optional[str] = Field("constant", description="Learning rate scheduler type")
+    save_steps: Optional[Union[int, float]] = Field(
+        500,
+        description="Number of update steps before checkpoint saves. Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as a ratio of total training steps.",
+    )
 
 
-# TODO: Get comprehensive Args!
 class SftArgs(BaseModel):
     max_seq_length: Optional[int] = Field(None, description="Maximum sequence length")
     neftune_noise_alpha: Optional[float] = Field(
@@ -157,16 +177,56 @@ class TrainingConfig(BaseModel):
     sft_args: SftArgs
 
 
-# TODO: Get comprehensive Args!
 class InferenceConfig(BaseModel):
-    max_new_tokens: Optional[int] = Field(None, description="Maximum new tokens")
-    use_cache: Optional[bool] = Field(True, description="Flag to enable cache usage")
-    do_sample: Optional[bool] = Field(True, description="Flag to enable sampling")
-    top_p: Optional[float] = Field(1.0, description="Top p value")
-    temperature: Optional[float] = Field(0.1, description="Temperature value")
-    epsilon_cutoff: Optional[float] = Field(0.0, description="epsilon cutoff value")
-    eta_cutoff: Optional[float] = Field(0.0, description="eta cutoff value")
-    top_k: Optional[int] = Field(50, description="top-k sampling")
+    # Length
+    max_length: Optional[int] = Field(None, description="The maximum length the generated tokens can have.")
+    max_new_tokens: Optional[int] = Field(None, description="The maximum number of tokens to generate.")
+    min_length: Optional[int] = Field(0, description="The minimum length of the sequence to be generated.")
+    min_new_tokens: Optional[int] = Field(None, description="The minimum number of tokens to generate.")
+    early_stopping: Optional[Union[bool, str]] = Field(
+        False, description="Controls the stopping condition for beam search."
+    )
+    max_time: Optional[float] = Field(None, description="The maximum amount of time for the computation in seconds.")
+
+    # Generation Strategy
+    do_sample: Optional[bool] = Field(False, description="Whether or not to use sampling.")
+    num_beams: Optional[int] = Field(1, description="Number of beams for beam search.")
+    num_beam_groups: Optional[int] = Field(1, description="Number of groups for diversity among beams.")
+    penalty_alpha: Optional[float] = Field(None, description="Balances model confidence and degeneration penalty.")
+    use_cache: Optional[bool] = Field(
+        True,
+        description="Whether to use past key/values attentions to speed up decoding.",
+    )
+
+    # Manipulation of Model Output Logits
+    temperature: Optional[float] = Field(1.0, description="Modulates the next token probabilities.")
+    top_k: Optional[int] = Field(
+        50,
+        description="Number of highest probability tokens to keep for top-k-filtering.",
+    )
+    top_p: Optional[float] = Field(
+        1.0,
+        description="Keeps the smallest set of most probable tokens summing up to top_p.",
+    )
+    typical_p: Optional[float] = Field(1.0, description="Local typicality measure.")
+    epsilon_cutoff: Optional[float] = Field(0.0, description="Minimum conditional probability for token sampling.")
+    eta_cutoff: Optional[float] = Field(0.0, description="Hybrid of locally typical sampling and epsilon sampling.")
+    diversity_penalty: Optional[float] = Field(
+        0.0, description="Penalty for token repetition across different beam groups."
+    )
+    repetition_penalty: Optional[float] = Field(1.0, description="Penalty for token repetition.")
+    encoder_repetition_penalty: Optional[float] = Field(
+        1.0, description="Penalty on sequences not in the original input."
+    )
+    length_penalty: Optional[float] = Field(1.0, description="Exponential penalty to the length for beam search.")
+    no_repeat_ngram_size: Optional[int] = Field(0, description="Size of ngrams that cannot occur more than once.")
+    bad_words_ids: Optional[List[List[int]]] = Field(None, description="Tokens that are not allowed to be generated.")
+    force_words_ids: Optional[List[Union[List[int], List[List[int]]]]] = Field(
+        None, description="Tokens that must be generated."
+    )
+    renormalize_logits: Optional[bool] = Field(
+        False, description="Whether to renormalize logits after all processors."
+    )
 
 
 class AblationConfig(BaseModel):
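The expanded `InferenceConfig` field names line up with the parameters of Hugging Face's `GenerationConfig` and `model.generate()`. A hedged sketch of how such settings can be applied at generation time (the diff does not show exactly how the toolkit forwards them; the model and prompt here are placeholders):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-2-7b-hf", device_map="auto")

# Field names in InferenceConfig match GenerationConfig keyword arguments.
gen_config = GenerationConfig(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.1,
)

inputs = tokenizer("### Instruction: Summarize the dialogue ...", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, generation_config=gen_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```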

0 commit comments
