diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index cb9dc99..1433d7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,4 @@ -auto_mix_prep -datasets -icecream numpy -pandas -prompt_toolkit -pydantic torch tqdm transformers diff --git a/spectrum.py b/spectrum.py index 38ae27c..02d855f 100644 --- a/spectrum.py +++ b/spectrum.py @@ -1,32 +1,43 @@ # spectrum.py -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -import numpy as np -import json -from prompt_toolkit.shortcuts import checkboxlist_dialog, input_dialog import argparse -from tqdm import tqdm +import json import os import time +import numpy as np +import torch +from tqdm import tqdm +from transformers import AutoConfig, AutoModelForCausalLM + + class ModelModifier: - def __init__(self, model_name=None, top_percent=50, batch_size=1): + def __init__(self, model_name=None, top_percent=50, batch_size=1, cuda=False): self.model_name = model_name self.top_percent = top_percent self.batch_size = batch_size if model_name: try: - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch.float32, - low_cpu_mem_usage=True, - trust_remote_code=True, - device_map="auto" - ) + if cuda: + torch.cuda.empty_cache() + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float32, + low_cpu_mem_usage=True, + trust_remote_code=True, + device_map="auto", + ) + else: + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float32, + trust_remote_code=True, + device_map="cpu", + ) except KeyError as e: print(f"Error loading model: {e}") print("Attempting to load with custom configuration...") + config = AutoConfig.from_pretrained(model_name) config.rope_scaling = {"type": "linear", "factor": 1.0} self.model = AutoModelForCausalLM.from_pretrained( @@ -35,64 +46,87 @@ def __init__(self, model_name=None, 
top_percent=50, batch_size=1): torch_dtype=torch.float32, low_cpu_mem_usage=True, trust_remote_code=True, - device_map="auto" + device_map="auto", ) - + except (RuntimeError, NotImplementedError) as e: + print(f"Error loading model: {e}") + print("Try passing --cuda=False!") + # Check if the model config has rope_scaling - if not hasattr(self.model.config, 'rope_scaling'): - self.model.config.rope_scaling = {'type': 'linear'} + if not hasattr(self.model.config, "rope_scaling"): + self.model.config.rope_scaling = {"type": "linear"} elif not isinstance(self.model.config.rope_scaling, dict): - self.model.config.rope_scaling = {'type': 'linear'} - elif 'type' not in self.model.config.rope_scaling: - self.model.config.rope_scaling['type'] = 'linear' + self.model.config.rope_scaling = {"type": "linear"} + elif "type" not in self.model.config.rope_scaling: + self.model.config.rope_scaling["type"] = "linear" else: self.model = None - self.layer_snr = {} - self.layer_types = [] + self.layer_snr = dict() def get_weight_types(self): weight_types = set() + for name, module in self.model.named_modules(): - parts = name.split('.') - if any(hasattr(module, attr) for attr in ['weight', 'bias','inv_freq']): - layer_index = next((i for i, part in enumerate(parts) if part.isdigit()), -1) - weight_type = '.'.join(parts[layer_index + 1:]) if layer_index != -1 else name + parts = name.split(".") + + if any(hasattr(module, attr) for attr in ["weight", "bias", "inv_freq"]): + layer_index = next( + (i for i, part in enumerate(parts) if part.isdigit()), -1 + ) + weight_type = (".".join(parts[layer_index + 1 :]) if layer_index != -1 else name) weight_types.add(weight_type) + return list(weight_types) - def interactive_select_weights(self): + def select_weights(self): weight_types = self.get_weight_types() - sorted_weight_types = self.sort_weight_types(weight_types) - selected_types = checkboxlist_dialog( - title="Select Weight Types", - text="Deselect the weight types you do not want to scan 
for SNR:", - values=[(wt, wt) for wt in sorted_weight_types], - default_values=sorted_weight_types - ).run() - self.layer_types = selected_types - return selected_types - - def sort_weight_types(self, weight_types): - categories = {} + categories = dict() + + def get_layer(weight: list): + try: + return weight[1] + except IndexError: + return None + for wt in weight_types: - category = wt.split('.')[0] - categories.setdefault(category, []).append(wt) - sorted_categories = {k: sorted(v) for k, v in sorted(categories.items(), key=lambda item: item[0])} - sorted_weight_types = [wt for sublist in sorted_categories.values() for wt in sublist] + arr = wt.split(".") + category = arr[0] + layer = get_layer(arr) + + if category in ["mlp", "self_attn"] and layer != "rotary_emb": + categories.setdefault(category, list()).append(wt) + + sorted_categories = { + k: sorted(v) + for k, v in sorted(categories.items(), key=lambda item: item[0]) + } + sorted_weight_types = [ + wt for sublist in sorted_categories.values() for wt in sublist + ] + return sorted_weight_types def calculate_snr_for_layer(self, layer_type): - layers = [(name, module) for name, module in self.model.named_modules() if layer_type in name and hasattr(module, 'weight')] + layers = [ + (name, module) + for name, module in self.model.named_modules() + if layer_type in name and hasattr(module, "weight") + ] num_batches = (len(layers) + self.batch_size - 1) // self.batch_size - with tqdm(total=num_batches, unit='batch', desc=f'Calculating SNR for {layer_type}') as progress_bar: + with tqdm( + total=num_batches, unit="batch", desc=f"Calculating SNR for {layer_type}" + ) as progress_bar: for i in range(0, len(layers), self.batch_size): - batch_layers = layers[i:i + self.batch_size] + batch_layers = layers[i : i + self.batch_size] + for name, module in batch_layers: weights = module.weight.detach() + if weights.ndim < 2: weights = weights.unsqueeze(0) + S = torch.linalg.svdvals(weights) max_singular_value = S[0] 
sigma_estimated = self.estimate_sigma_with_full_iqr(S) @@ -100,9 +134,10 @@ def calculate_snr_for_layer(self, layer_type): mp_threshold = self.marchenko_pastur_threshold(sigma_estimated, n, m) signal = S[S > mp_threshold].sum() noise = S[S <= mp_threshold].sum() - snr = signal / noise if noise != 0 else float('inf') + snr = signal / noise if noise != 0 else float("inf") snr_ratio = snr / max_singular_value - self.layer_snr[name] = {'type': layer_type, 'snr': snr_ratio.item()} + self.layer_snr[name] = {"type": layer_type, "snr": snr_ratio.item()} + progress_bar.update(1) @staticmethod @@ -120,10 +155,14 @@ def estimate_sigma_with_full_iqr(S): return sigma_estimated def assess_layers_snr(self, selected_weight_types): - total_layers = sum(1 for name, module in self.model.named_modules() if any(layer_type in name for layer_type in selected_weight_types) and hasattr(module, 'weight')) + # total_layers = sum(1 for name, module in self.model.named_modules() if any(layer_type in name for layer_type in selected_weight_types) and hasattr(module, 'weight')) start_time = time.time() - with tqdm(total=len(selected_weight_types), unit='type', desc='Calculating SNR for types') as progress_bar: + with tqdm( + total=len(selected_weight_types), + unit="type", + desc="Calculating SNR for types", + ) as progress_bar: for layer_type in selected_weight_types: self.calculate_snr_for_layer(layer_type) progress_bar.update(1) @@ -133,64 +172,90 @@ def assess_layers_snr(self, selected_weight_types): print(f"Total time taken: {total_time:.2f} seconds") def save_snr_to_json(self): - model_name_slug = self.model_name.replace('/', '-').replace('_', '-') - directory = 'model_snr_results' - filename = os.path.join(directory, f'snr_results_{model_name_slug}.json') - - # Ensure the directory exists - if not os.path.exists(directory): - os.makedirs(directory) - - serializable_data = {} + model_name_slug = ( + os.path.basename(self.model_name).replace("_", "-") + if os.path.exists(self.model_name) + 
else self.model_name.replace("/", "-").replace("_", "-") + ) + directory = "model_snr_results" + filename = os.path.join(directory, f"snr_results_{model_name_slug}.json") + + os.makedirs(directory, exist_ok=True) + + serializable_data = dict() + for layer_name, info in self.layer_snr.items(): - snr_value = info['snr'].item() if isinstance(info['snr'], torch.Tensor) else info['snr'] - layer_type = str(info['type']) - serializable_data[layer_name] = {'snr': snr_value, 'type': layer_type} - - with open(filename, 'w') as file: + snr_value = ( + info["snr"].item() + if isinstance(info["snr"], torch.Tensor) + else info["snr"] + ) + layer_type = str(info["type"]) + serializable_data[layer_name] = {"snr": snr_value, "type": layer_type} + + with open(filename, "w") as file: json.dump(serializable_data, file, indent=4) - + print(f"Results saved to {filename}") self.save_top_snr_ratios_to_json(filename) self.generate_unfrozen_params_yaml(filename) def generate_unfrozen_params_yaml(self, json_filename, top_percent=None): top_percent = top_percent if top_percent is not None else self.top_percent - with open(json_filename, 'r') as file: + + with open(json_filename, "r") as file: snr_data = json.load(file) - unfrozen_parameters = {} + + unfrozen_parameters = dict() + for layer_name, info in snr_data.items(): - layer_type = info['type'] + layer_type = info["type"] + if layer_type not in unfrozen_parameters: - unfrozen_parameters[layer_type] = [] - unfrozen_parameters[layer_type].append((layer_name, info['snr'])) - top_layers_by_type = {} + unfrozen_parameters[layer_type] = list() + + unfrozen_parameters[layer_type].append((layer_name, info["snr"])) + + top_layers_by_type = dict() + for layer_type, layers in unfrozen_parameters.items(): layers_sorted = sorted(layers, key=lambda x: x[1], reverse=True) num_top_layers = int(len(layers) * top_percent / 100) - top_layers_by_type[layer_type] = [layer[0] for layer in layers_sorted[:num_top_layers]] + top_layers_by_type[layer_type] = [ + 
layer[0] for layer in layers_sorted[:num_top_layers] + ] + # Modify the yaml_filename to include the input json name and top_percent json_file_base = os.path.splitext(os.path.basename(json_filename))[0] yaml_filename = f"{json_file_base}_unfrozenparameters_{top_percent}percent.yaml" - with open(yaml_filename, 'w') as file: + + with open(yaml_filename, "w") as file: file.write("unfrozen_parameters:\n") file.write("- ^lm_head.weight$\n") file.write("- ^model.embed_tokens.weight$\n") + for layer_type, layer_names in top_layers_by_type.items(): file.write(f"# {layer_type} layers\n") + for layer_name in layer_names: file.write(f"- {layer_name}\n") + print(f"Top {top_percent}% SNR layers saved to {yaml_filename}") def save_top_snr_ratios_to_json(self, json_filename, filename=None): - with open(json_filename, 'r') as file: + with open(json_filename, "r") as file: snr_data = json.load(file) - all_snr_layers = {} + + all_snr_layers = dict() + for layer_name, info in snr_data.items(): - layer_type = info['type'] + layer_type = info["type"] + if layer_type not in all_snr_layers: - all_snr_layers[layer_type] = [] - all_snr_layers[layer_type].append((layer_name, info['snr'])) + all_snr_layers[layer_type] = list() + + all_snr_layers[layer_type].append((layer_name, info["snr"])) + for layer_type, layers in all_snr_layers.items(): layers_sorted = sorted(layers, key=lambda x: x[1], reverse=True) all_snr_layers[layer_type] = {layer[0]: layer[1] for layer in layers_sorted} @@ -198,37 +263,48 @@ def save_top_snr_ratios_to_json(self, json_filename, filename=None): json_file_base = os.path.splitext(os.path.basename(json_filename))[0] filename = f"{json_file_base}_sorted.json" if filename is None else filename - with open(filename, 'w') as file: + with open(filename, "w") as file: json.dump(all_snr_layers, file, indent=4) + print(f"All SNR layers sorted and saved to {filename}") -def main(): - # Handle command-line arguments - parser = argparse.ArgumentParser(description="Process SNR 
data for layers.") - parser.add_argument('--model-name', type=str, required=True, help='Model name or path to the model') - parser.add_argument('--top-percent', type=int, default=None, help='Top percentage of layers to select, overriding the default') - args = parser.parse_args() - # Check for existing SNR results file - model_name_slug = args.model_name.replace('/', '-').replace('_', '-') - snr_file_path = os.path.join('model_snr_results', f'snr_results_{model_name_slug}.json') +def calculate_model_snr(model: str, top_percent: int, batch_size: int = 1, cuda: bool = False): + model_name_slug = ( + os.path.basename(model).replace("_", "-") + if os.path.exists(model) + else model.replace("/", "-").replace("_", "-") + ) + snr_file_path = os.path.join("model_snr_results", f"snr_results_{model_name_slug}.json") if os.path.exists(snr_file_path): - print(f"Found existing SNR results file for {args.model_name}") - modifier = ModelModifier(top_percent=args.top_percent) - modifier.generate_unfrozen_params_yaml(snr_file_path, args.top_percent) + print(f"Found existing SNR results file for {model}") + modifier = ModelModifier(top_percent=top_percent, cuda=cuda) + modifier.generate_unfrozen_params_yaml(snr_file_path, top_percent) else: - print(f"No existing SNR results file found for {args.model_name}. Proceeding with SNR calculation.") - batch_size = input_dialog(title="Batch Size", text="Enter the batch size:").run() - batch_size = int(batch_size) if batch_size else 1 - modifier = ModelModifier(model_name=args.model_name, batch_size=batch_size) - selected_weight_types = modifier.interactive_select_weights() + print(f"No existing SNR results file found for {model}. 
Proceeding with SNR calculation.") + modifier = ModelModifier(model_name=model, batch_size=batch_size, cuda=cuda) + selected_weight_types = modifier.select_weights() + if selected_weight_types: modifier.assess_layers_snr(selected_weight_types) modifier.save_snr_to_json() - print("Finished SNR scanning and data saved.") + print("Finished SNR rating.") else: print("No weight types selected.") + +def main(): + # Handle command-line arguments + parser = argparse.ArgumentParser(description="Process SNR data for layers.") + parser.add_argument("--model-name", type=str, required=True, help="Model name or path to the model") + parser.add_argument("--top-percent", type=int, default=50, help="Top percentage of layers to select") + parser.add_argument("--batch-size", type=int, default=1, help="Job batch size") + parser.add_argument("--cuda", type=lambda x: x.lower() in ("1", "true", "yes"), default=False, help="Whether to use the GPU") + args = parser.parse_args() + + calculate_model_snr(args.model_name, args.top_percent, args.batch_size, args.cuda) + + if __name__ == "__main__": main()