Commit e1fea43

Initial repo with basic instructions, can load model and do inference (#1)

* add first basic inference code
* add stripedhyena
* continue to simplify and test
* save pip installs
* clean up model loading, add auto ckpt download
* cleanup evo
* continue to polish the repo
* add example scripts
* add todos to readme, add to readme
* fix links in readme
* fix subbullets

1 parent b466872

29 files changed: +3136 -1 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
*.egg-info/
__pycache__/
build/
dist/

README.md

Lines changed: 108 additions & 1 deletion
@@ -1 +1,108 @@
# Evo: DNA foundation modeling from molecular to genome scale

Tasks remaining:
- [ ] Upload checkpoints to the web and finalize auto-downloading code.
- [ ] Verify logits are the same as private repo.
- [ ] Package and upload to PyPI.
- [ ] Update with preprint info, blog info, Together API info, and HF info.

Evo is a biological foundation model capable of long-context modeling and design.
Evo uses the [StripedHyena architecture](https://github.com/togethercomputer/stripedhyena) to enable modeling of sequences at single-nucleotide, byte-level resolution with near-linear scaling of compute and memory relative to context length.
Evo has 7 billion parameters and is trained on OpenGenome, a prokaryotic whole-genome dataset containing 260 billion tokens.

Technical details about Evo can be found in our preprint and the accompanying blog.

We provide the following model checkpoints:
- `evo-1_stripedhyena_pretrained_8k`: A model pretrained with 8k context. We use this model as the base model for molecular-scale finetuning tasks.
- `evo-1_stripedhyena_pretrained_131k`: A model pretrained with 131k context, using `evo-1_stripedhyena_pretrained_8k` as the base model. We use this model to reason about and generate sequences at the genome scale.
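
For instance, genome-scale work would load the long-context checkpoint; this is a one-line variation on the Python API shown under [Usage](#usage):

```python
from evo import Evo

# Load the 131k-context checkpoint for genome-scale reasoning and generation.
evo_model = Evo('evo-1_stripedhyena_pretrained_131k')
```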

## Contents

- [Setup](#setup)
  - [Requirements](#requirements)
  - [Installation](#installation)
- [Usage](#usage)
- [Web API](#web-api)
- [HuggingFace](#huggingface-integration)
- [Citation](#citation)

## Setup

### Requirements

Evo uses [FlashAttention-2](https://github.com/Dao-AILab/flash-attention), which may not work on all GPU architectures.
Please consult the [FlashAttention GitHub repository](https://github.com/Dao-AILab/flash-attention#installation-and-features) for the current list of supported GPUs.

Evo also uses PyTorch. Make sure the correct [PyTorch version is installed](https://pytorch.org/) on your system.
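
If you are unsure whether your GPU is supported, the sketch below is a quick heuristic check with PyTorch. It assumes FlashAttention-2's requirement of NVIDIA compute capability 8.0 or higher (Ampere, Ada, Hopper); treat it as a hint, not an official compatibility test.

```python
import torch

# Heuristic check: FlashAttention-2 generally targets NVIDIA GPUs with
# compute capability >= 8.0. Consult the FlashAttention repository for
# the authoritative list of supported hardware.
if not torch.cuda.is_available():
    print('No CUDA device detected.')
else:
    major, minor = torch.cuda.get_device_capability(0)
    print(f'{torch.cuda.get_device_name(0)}: compute capability {major}.{minor}')
    if (major, minor) >= (8, 0):
        print('Likely supported by FlashAttention-2.')
    else:
        print('Likely unsupported; see the FlashAttention repository.')
```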

### Installation

You can install Evo using `pip`:
```bash
pip install evo-model
```
or directly from the GitHub source:
```bash
git clone https://github.com/evo-design/evo.git
cd evo/
pip install .
```

## Usage

You can download Evo and use it locally through the Python API. For example:
```python
from evo import Evo
import torch

device = 'cuda:0'

evo_model = Evo('evo-1_stripedhyena_pretrained_8k')
model, tokenizer = evo_model.model, evo_model.tokenizer
model.to(device)
model.eval()

sequence = 'ACGT'
input_ids = torch.tensor(
    tokenizer.tokenize(sequence),
    dtype=torch.int,
).to(device).unsqueeze(0)
logits, _ = model(input_ids)  # (batch, length, vocab)

print('Logits: ', logits)
print('Shape (batch, length, vocab): ', logits.shape)
```
Examples of batched inference can be found in [`scripts/example_inference.py`](scripts/example_inference.py); a minimal sketch is also shown below.
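
As a rough sketch of what batched inference looks like (hedged: this reuses the `prepare_batch` helper that `evo/generation.py` imports from `evo.scoring` in this commit; its exact behavior may differ from this illustration):

```python
import torch

from evo import Evo
from evo.scoring import prepare_batch  # helper used by evo/generation.py in this commit

device = 'cuda:0'

evo_model = Evo('evo-1_stripedhyena_pretrained_8k')
model, tokenizer = evo_model.model, evo_model.tokenizer

# Tokenize multiple sequences into a single padded batch of input IDs.
sequences = ['ACGT', 'CCGGTT', 'AAAA']
input_ids = prepare_batch(sequences, tokenizer, prepend_bos=True, device=device)[0]

with torch.inference_mode():
    logits, _ = model(input_ids)  # (batch, length, vocab)
print('Shape (batch, length, vocab): ', logits.shape)
```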

We provide an example script showing how to prompt the model and sample a set of sequences given the prompt:
```bash
python scripts/generate.py \
    --model-name evo-1_stripedhyena_pretrained_8k \
    --prompt ACGT \
    --n-samples 10 \
    --n-tokens 100 \
    --temperature 1. \
    --top-k 4 \
    --device cuda:0
```

We also provide an example script for using the model to score the log-likelihoods of a set of sequences.
```bash
python scripts/score.py \
    --input-fasta examples/example_seqs.fasta \
    --output-tsv scores.tsv \
    --model-name evo-1_stripedhyena_pretrained_8k \
    --device cuda:0
```

## Web API

We are working with [Together.AI](https://www.together.ai/) on a web API that will provide logits and sampling functionality for Evo.

## HuggingFace integration

We are working on an integration with [HuggingFace](https://huggingface.co/).

## Citation

We will make a preprint publicly available soon.

evo/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
from .version import version as __version__

from .models import Evo

from .generation import generate
from .scoring import score_sequences, positional_entropies

evo/generation.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
import numpy as np
import sys
import torch
from typing import List, Optional, Tuple, Union

from .models import load_checkpoint
from .scoring import logits_to_logprobs, prepare_batch
from .stripedhyena.src.generation import Generator
from .stripedhyena.src.model import StripedHyena
from .stripedhyena.src.tokenizer import CharLevelTokenizer


def generate(
    prompt_seqs: List[str],
    model: StripedHyena,
    tokenizer: CharLevelTokenizer,
    n_tokens: int = 100,
    temperature: float = 0.,
    top_k: int = 1,
    top_p: float = 1.,
    skipped_tokens: Optional[Union[str, List[str], List[int]]] = None,
    batched: bool = True,
    prepend_bos: bool = True,
    cached_generation: bool = False,
    verbose: int = 1,
    device: str = 'cuda:0',
    **kwargs,
) -> Tuple[List[str], List[float]]:
    """
    Performs generation from a list of prompts.
    If all prompts are the same length, this can do batched generation.
    Also supports cached generation for efficient sampling.
    """
    model.eval()

    g = Generator(
        model,
        tokenizer,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
    )

    uniform_lengths = all(len(s) == len(prompt_seqs[0]) for s in prompt_seqs)

    if batched and uniform_lengths:
        # All prompts have the same length, so generate from a single batch.
        input_ids_list = [
            prepare_batch(
                prompt_seqs,
                tokenizer,
                prepend_bos=prepend_bos,
                device=device,
            )[0]
        ]
    else:
        if verbose:
            if not uniform_lengths:
                sys.stderr.write('Note: Prompts are of different lengths.\n')
            sys.stderr.write('Note: Will not do batched generation.\n')
        # Generate from each prompt individually.
        input_ids_list = [
            prepare_batch(
                [ prompt_seq ],
                tokenizer,
                prepend_bos=prepend_bos,
                device=device,
            )[0]
            for prompt_seq in prompt_seqs
        ]

    generated_seqs, generated_scores = [], []
    for input_ids in input_ids_list:
        batch_size = input_ids.shape[0]

        output_ids, logits = g.generate(
            input_ids=input_ids,
            num_tokens=n_tokens,
            cached_generation=cached_generation,
            device=device,
            print_generation=True,
            verbose=(verbose > 1),
            skipped_tokens=skipped_tokens,
            stop_at_eos=False,
        )
        if verbose > 1:
            print('input_ids.shape', input_ids.shape)
            print('output_ids.shape', output_ids.shape)
            print('logits.shape', logits.shape)

        generated_seqs_batch = list(tokenizer.detokenize_batch(output_ids))
        assert len(generated_seqs_batch) == batch_size
        generated_seqs += generated_seqs_batch

        # Score each generated sequence by its mean token log-probability.
        logprobs = logits_to_logprobs(logits, output_ids, trim_bos=prepend_bos)
        logprobs = logprobs.float().cpu().numpy()

        generated_scores += [ np.mean(logprobs[idx]) for idx in range(batch_size) ]

    assert len(generated_seqs) == len(generated_scores) == len(prompt_seqs)
    if verbose:
        for seq, score, prompt in zip(generated_seqs, generated_scores, prompt_seqs):
            print(f'Prompt: "{prompt}",\tOutput: "{seq}",\tScore: {score}')

    return generated_seqs, generated_scores
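
For orientation, a minimal sketch of calling this `generate` helper from Python (the prompts and sampling settings below are illustrative, and the checkpoint must be available for download):

```python
from evo import Evo, generate

evo_model = Evo('evo-1_stripedhyena_pretrained_8k')
model, tokenizer = evo_model.model, evo_model.tokenizer

# Two same-length prompts, so generation can run as a single batch.
seqs, scores = generate(
    ['ACGT', 'TGCA'],
    model,
    tokenizer,
    n_tokens=100,
    temperature=1.,
    top_k=4,
    device='cuda:0',
)
```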

evo/models.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import os
2+
import requests
3+
import torch
4+
from typing import List, Tuple
5+
import yaml
6+
7+
from .stripedhyena.src.utils import dotdict
8+
from .stripedhyena.src.model import StripedHyena
9+
from .stripedhyena.src.tokenizer import CharLevelTokenizer
10+
11+
12+
VALID_MODEL_NAMES = [
13+
'evo-1_stripedhyena_pretrained_8k',
14+
'evo-1_stripedhyena_pretrained_131k',
15+
]
16+
17+
18+
class Evo:
19+
def __init__(self, model_name: str, device: str = 'cuda:0'):
20+
"""
21+
Loads an Evo model checkpoint given a model name.
22+
If the checkpoint does not exist, automatically downloads the model to
23+
`~/.cache/torch/hub/checkpoints`.
24+
"""
25+
26+
if model_name not in VALID_MODEL_NAMES:
27+
raise ValueError(
28+
f'Invalid model name {model_name}. Should be one of: '
29+
f'{", ".join(VALID_MODEL_NAMES)}.'
30+
)
31+
32+
# Download checkpoint.
33+
34+
home_directory = os.path.expanduser('~')
35+
download_url = f'https://TODO/checkpoints/{model_name}.pt'
36+
cache_dir = f'{home_directory}/.cache/torch/hub/checkpoints'
37+
checkpoint_path = f'{cache_dir}/{model_name}.pt'
38+
39+
if not os.path.exists(checkpoint_path):
40+
print(f'Downloading {download_url} to {cache_dir}...')
41+
42+
if not os.path.exists(cache_dir):
43+
os.makedirs(cache_dir, exist_ok=True)
44+
45+
response = requests.get(download_url, stream=True)
46+
if response.status_code == 200:
47+
with open(checkpoint_path, 'wb') as f:
48+
f.write(response.raw.read())
49+
else:
50+
raise Exception(f'Failed to download the file. Status code: {response.status_code}')
51+
52+
# Load correct config file.
53+
54+
if model_name == 'evo-1_stripedhyena_pretrained_8k':
55+
config_path = 'evo/stripedhyena/configs/sh_inference_config_7b.yml'
56+
elif model_name == 'evo-1_stripedhyena_pretrained_131k':
57+
config_path = 'evo/stripedhyena/configs/sh_inference_config_7b_rotary_scale_16.yml'
58+
else:
59+
raise ValueError(f'Invalid model name {model_name}.')
60+
61+
# Load model.
62+
63+
self.model, self.tokenizer = load_checkpoint(
64+
checkpoint_path,
65+
model_type='stripedhyena',
66+
config_path=config_path,
67+
device=device,
68+
)
69+
self.model = self.model.to(device).eval()
70+
71+
self.device = device
72+
73+
74+
def load_checkpoint(
75+
ckpt_path: str,
76+
config_path: str = './evo/stripedhyena/configs/sh_inference_config_7b.yml',
77+
verbose: int = 0,
78+
device: str = 'cuda:0',
79+
**kwargs: dict,
80+
) -> Tuple[StripedHyena, CharLevelTokenizer]:
81+
"""
82+
Loads a checkpoint from a path and corresponding config.
83+
"""
84+
global_config = dotdict(yaml.load(open(config_path), Loader=yaml.FullLoader))
85+
86+
model = StripedHyena(global_config)
87+
tokenizer = CharLevelTokenizer(512)
88+
89+
model.load_state_dict(torch.load(ckpt_path), strict=True)
90+
91+
model.to_bfloat16_except_poles_residues()
92+
93+
return model, tokenizer
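
And a hedged sketch of loading a checkpoint directly via `load_checkpoint`, assuming the checkpoint file has already been downloaded to the default cache directory (paths are illustrative):

```python
import os

from evo.models import load_checkpoint

# Illustrative paths; adjust to wherever the checkpoint and config live.
ckpt_path = os.path.expanduser(
    '~/.cache/torch/hub/checkpoints/evo-1_stripedhyena_pretrained_8k.pt'
)
model, tokenizer = load_checkpoint(
    ckpt_path,
    config_path='evo/stripedhyena/configs/sh_inference_config_7b.yml',
    device='cuda:0',
)
```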
