
Commit 9d50f6a ("push")

1 parent d7cd619

File tree: 11 files changed, +128 −72 lines


bloom_inference/generator.py

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ class Generator:
     def __init__(
         self,
         model_parallel_submesh=(1, 2, 4, 1),  # for v4-64
-        ckpt="bigscience/bloom-6b3",
+        ckpt="bigscience/bloom",
         t5x_path="gs://bloom-jax-us-central2-b/bloom-176B-scan-t5x/checkpoint_0",
         max_len=256,
         max_input_len=64,
@@ -62,14 +62,14 @@ def __init__(
         self.max_input_len = max_input_len
 
         config = BloomConfig.from_pretrained(ckpt, max_length=max_len, do_sample=True, num_beams=1, top_p=0.9)
-        model = FlaxBloomForCausalLM(config, _do_init=False, dtype=jnp.bfloat16, use_scan=True)
+        self.model = FlaxBloomForCausalLM(config, _do_init=False, dtype=jnp.bfloat16, use_scan=True)
 
         def init_state():
             input_shape = (1, 1)
             input_ids = jnp.zeros(input_shape, dtype="i4")
             attention_mask = jnp.ones_like(input_ids)
             rng = jax.random.PRNGKey(0)
-            initial_vars = model.module.init(rng, input_ids, attention_mask, return_dict=False)
+            initial_vars = self.model.module.init(rng, input_ids, attention_mask, return_dict=False)
             return InferenceState.create(initial_vars)
 
         state_shapes = jax.eval_shape(init_state)
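
Note on the change: init_state is a closure traced by jax.eval_shape, so binding the model to self.model both exposes it to later methods and keeps it alive after __init__ returns. A minimal standalone sketch of the shape-only tracing idea (the dummy dict stands in for model.module.init; this is not the repo's code):

import jax
import jax.numpy as jnp

def init_state():
    # same dummy shapes as the Generator uses
    input_ids = jnp.zeros((1, 1), dtype="i4")
    attention_mask = jnp.ones_like(input_ids)
    # stand-in for model.module.init(rng, input_ids, attention_mask, ...)
    return {"params": jnp.ones((4, 4)) * input_ids.sum() * attention_mask.sum()}

# eval_shape traces abstractly: it returns a pytree of ShapeDtypeStruct
# without allocating any real arrays, which is what makes computing the
# 176B parameter shapes cheap on the host.
state_shapes = jax.eval_shape(init_state)
print(state_shapes)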

bloom_inference/host_worker.py

Lines changed: 14 additions & 2 deletions
@@ -1,13 +1,15 @@
+import os
 import ray
 import time
 from queue import Queue
 
 
 @ray.remote(resources={"tpu": 1})
+# @ray.remote
 class TPUHostWorker(object):
     def __init__(
         self,
-        ckpt="bigscience/bloom-6b3",
+        ckpt="bigscience/bloom",
         t5x_path="gs://bloom-jax-us-central2-b/bloom-176B-scan-t5x/checkpoint_0",
         max_len=256,
         max_input_len=64,
@@ -22,14 +24,24 @@ def __init__(
         self.input_q = Queue(maxsize=1)
         self.output_q = Queue(maxsize=1)
 
+        self._is_cpu = os.path.exists("/home/suraj_huggingface_co/bloom-jax-inference/is_cpu.txt")
+
+    def is_cpu(self):
+        return self._is_cpu
+
     def run(self):
         # import here so JAX and the Generator are only loaded on the TPU host worker, not the CPU manager
         import jax
         from bloom_inference.generator import Generator, head_print
 
         print(f"jax runtime initialization starting")
         start = time.time()
-        head_print(f"jax devices: {jax.device_count()}")
+        device_count = jax.device_count()
+        if device_count == 1:
+            head_print("TPU not found. Returning")
+            ray.shutdown()
+            return
+        head_print(f"jax devices: {device_count}")
         head_print(f"jax runtime initialized in {time.time() - start:.06}s")
 
         # load model and params
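
The commit guards TPU-only work twice: a sentinel file (is_cpu.txt) marks CPU-only hosts before JAX is even imported, and the jax.device_count() == 1 check catches hosts where the TPU runtime did not come up. A hedged, standalone sketch of the sentinel pattern (the path and actor name are illustrative, not the repo's):

import os
import ray

@ray.remote
class SentinelWorker:
    def __init__(self, sentinel="/tmp/is_cpu.txt"):  # hypothetical path
        # cheap filesystem check; no JAX import happens here
        self._is_cpu = os.path.exists(sentinel)

    def is_cpu(self):
        return self._is_cpu

if __name__ == "__main__":
    ray.init()
    worker = SentinelWorker.remote()
    # True on hosts carrying the marker file, so the manager can discard them
    print(ray.get(worker.is_cpu.remote()))
    ray.shutdown()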

bloom_inference/tpu_manager.py

Lines changed: 21 additions & 4 deletions
@@ -7,14 +7,14 @@ class TPUManager:
     def __init__(
         self,
         node_count=8,
-        ckpt="bigscience/bloom-6b3",
+        ckpt="bigscience/bloom",
         t5x_path="gs://bloom-jax-us-central2-b/bloom-176B-scan-t5x/checkpoint_0",
         max_len=256,
         max_input_len=64,
         model_parallel_submesh=(1, 2, 4, 1),  # for v4-64
     ):
         # needs a valid ray cluster to start
-        assert ray.is_initialized(), "ray not initialised"
+        # assert ray.is_initialized(), "ray not initialised"
 
         from bloom_inference.host_worker import TPUHostWorker
 
@@ -29,16 +29,33 @@ def __init__(
 
         start = time.time()
 
-        for i in range(node_count):
+        # for i in range(node_count):
+        #     worker = TPUHostWorker.options(max_concurrency=2).remote(
+        #         ckpt,
+        #         t5x_path,
+        #         max_len,
+        #         max_input_len,
+        #         model_parallel_submesh,
+        #     )
+        #     is_cpu = ray.get(worker.is_cpu.remote())
+        #     print(is_cpu)
+        #     if not is_cpu:
+        #         self.nodes.append(worker)
+
+        while len(self.nodes) < node_count:
             worker = TPUHostWorker.options(max_concurrency=2).remote(
                 ckpt,
                 t5x_path,
                 max_len,
                 max_input_len,
                 model_parallel_submesh,
             )
-            self.nodes.append(worker)
+            is_cpu = ray.get(worker.is_cpu.remote())
+            print(is_cpu)
+            if not is_cpu:
+                self.nodes.append(worker)
 
+        assert len(self.nodes) == node_count
         for node in self.nodes:
             node.run.remote()

dump.rdb

88 Bytes
Binary file not shown.

is_cpu.txt

Whitespace-only changes.

launch_generate.sh

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+INSTANCE=bloom-tpu-v4-64
+ZONE=us-central2-b
+PROJECT=huggingface-ml
+
+# run run_generate.sh on every TPU worker
+gcloud alpha compute tpus tpu-vm ssh $INSTANCE --project=$PROJECT --zone=$ZONE \
+    --force-key-file-overwrite --strict-host-key-checking=no \
+    --worker=all \
+    --command="bash ~/bloom-jax-inference/run_generate.sh"

ray_tpu.py

Lines changed: 14 additions & 7 deletions
@@ -46,21 +46,28 @@ def get_connection(
 
 def start_ray(conn, address):
     # start afresh each launch (temporarily)
-    conn.run("sudo rm -rf *.py bloom_inference")
+    conn.run("sudo rm -rf *.py bloom-jax-inference")
     # make directory of structure: bloom_inference/bloom_inference/modeling_bloom
-    conn.run("mkdir bloom_inference bloom_inference/bloom_inference bloom_inference/bloom_inference/modeling_bloom -p")
-
+    conn.run("mkdir bloom-jax-inference bloom-jax-inference/bloom_inference bloom-jax-inference/bloom_inference/modeling_bloom -p")
+
     # copy run files into bloom_inference
     for i in glob.glob("*.py"):
-        conn.put(i, "bloom_inference/")
+        conn.put(i, "bloom-jax-inference/")
 
     # copy CPU/TPU manager files into bloom_inference/bloom_inference
     for i in glob.glob("bloom_inference/*.py"):
-        conn.put(i, "bloom_inference/bloom_inference/")
+        conn.put(i, "bloom-jax-inference/bloom_inference/")
 
     # copy modeling files into bloom_inference/bloom_inference/modeling_bloom
     for i in glob.glob("bloom_inference/modeling_bloom/*.py"):
-        conn.put(i, "bloom_inference/bloom_inference/modeling_bloom/")
+        conn.put(i, "bloom-jax-inference/bloom_inference/modeling_bloom/")
+
+    # copy shell scripts into bloom-jax-inference
+    for i in glob.glob("*.sh"):
+        conn.put(i, "bloom-jax-inference/")
+
+    # copy key file into bloom-jax-inference
+    conn.put("key.json", "bloom-jax-inference/")
 
     # transfer start-up script from CPU -> hosts and give permissions
     conn.put("scripts/ray_tpu.sh", "/tmp/ray-tpu.sh")
@@ -74,6 +81,6 @@ def start_ray(conn, address):
     time.sleep(1)
 
     # run start-up script
-    out = conn.run(f"bash /tmp/ray-tpu.sh {address}", hide=True)
+    out = conn.run(f"bash /tmp/ray-tpu.sh {address}", hide=False)
     # display result
     print(out)
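
For context, conn here is a fabric-style SSH connection, and the rename only retargets the remote directory from bloom_inference to bloom-jax-inference. A hedged sketch of the put/run pattern (host name, address, and script path are illustrative; assumes the fabric package):

import glob
from fabric import Connection  # assumes fabric is installed

def push_and_launch(host="tpu-host-0", address="10.0.0.1:6379"):  # hypothetical host/address
    conn = Connection(host)
    conn.run("mkdir -p bloom-jax-inference/bloom_inference")
    # upload local sources to the remote directory
    for path in glob.glob("*.py"):
        conn.put(path, "bloom-jax-inference/")
    # hide=False streams remote stdout/stderr back to the local terminal,
    # which is what this commit switches to for easier debugging
    conn.run(f"bash /tmp/ray-tpu.sh {address}", hide=False)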

run.py

Lines changed: 43 additions & 44 deletions
@@ -7,50 +7,49 @@
 
 from bloom_inference.tpu_manager import TPUManager
 
-num_mp_partitions = 8
-
-#tpu_name = "suraj-tpu-v3-32"
-# tpu_name = "patrick-tpu-v3-32"
-# region = "europe-west4-a"
 tpu_name="bloom-tpu-v4-64"
 region="us-central2-b"
 
-ckpt = "bigscience/bloom-6b3",
-t5x_path = "gs://bloom-jax-us-central2-b/bloom-176B-scan-t5x/checkpoint_0",
-max_len = 256,
-max_input_len = 64,
-model_parallel_submesh = (1, 2, 4, 1), # for v4-64
-
-
-# get Python list of TPU hosts
-conns = get_connection(tpu_name, region)
-
-head_info = ray.init(include_dashboard=False, object_store_memory=10**9)
-address = head_info.address_info['address']
-
-# start ray CPU<->TPU on all hosts
-with pool.ThreadPool(processes=len(conns)) as p:
-    p.map(functools.partial(start_ray, address=address), conns)
-
-# initialise TPU manager
-t = TPUManager(
-    len(conns),
-    ckpt=ckpt,
-    t5x_path=t5x_path,
-    max_len=max_len,
-    max_input_len=max_input_len,
-    model_parallel_submesh=model_parallel_submesh,
-)
-
-# benchmark compile step
-start = time.time()
-print(t.generate(4*['Recipe for coconut pasta:']))
-print(f"Generations completed in {time.time() - start:.06}s")
-
-# benchmark generate
-start = time.time()
-print(t.generate(4*['Recipe for coconut pasta:']))
-print(f"Generations completed in {time.time() - start:.06}s")
-
-# shutdown ray rpc
-ray.shutdown()
+ckpt = "bigscience/bloom"
+t5x_path = "gs://bloom-jax-us-central2-b/bloom-176B-scan-t5x/checkpoint_0"
+max_len = 128
+max_input_len = 64
+model_parallel_submesh = (1, 2, 4, 1)  # for v4-64
+
+
+def setup():
+    # get Python list of TPU hosts
+    conns = get_connection(tpu_name, region)
+    print(len(conns))
+    address = '10.130.0.10:8080'
+    head_info = ray.init(include_dashboard=False, address="auto")
+    # object_store_memory=10**9,
+
+    # start ray CPU<->TPU on all hosts
+    with pool.ThreadPool(processes=len(conns)) as p:
+        p.map(functools.partial(start_ray, address=address), conns)
+
+def init_manager():
+    # initialise TPU manager
+    t = TPUManager(
+        8,
+        ckpt=ckpt,
+        t5x_path=t5x_path,
+        max_len=max_len,
+        max_input_len=max_input_len,
+        model_parallel_submesh=model_parallel_submesh,
+    )
+    return t
+
+# # benchmark compile step
+# start = time.time()
+# print(t.generate(4*['Recipe for coconut pasta:']))
+# print(f"Generations completed in {time.time() - start:.06}s")
+
+# # benchmark generate
+# start = time.time()
+# print(t.generate(4*['Recipe for coconut pasta:']))
+# print(f"Generations completed in {time.time() - start:.06}s")
+
+# # shutdown ray rpc
+# ray.shutdown()
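
With this refactor, run.py no longer executes end-to-end; the benchmark block is commented out and the logic lives in two callables, which suggests driving it from a REPL. A hedged usage sketch (the module import and prompt are illustrative; assumes a Ray head node is already running, matching address="auto"):

# hypothetical interactive session, not part of the commit
from run import setup, init_manager

setup()             # SSH to all TPU hosts and join them to the Ray cluster
t = init_manager()  # collect 8 TPU workers and start their run() loops
print(t.generate(4 * ["Recipe for coconut pasta:"]))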

run_generate.sh

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+source ~/venv/bin/activate
+export GOOGLE_APPLICATION_CREDENTIALS=~/bloom-jax-inference/key.json
+python ~/bloom-jax-inference/run_speed.py

run_speed.py

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 import argparse
-from time import time
+import time
 
 import numpy as np
 import jax
@@ -28,7 +28,7 @@
 input_len = args.input_len
 
 config = BloomConfig.from_pretrained(ckpt)
-model, params = FlaxBloomForCausalLM(config, _do_init=False, dtype=jnp.bfloat16, use_scan=True)
+model = FlaxBloomForCausalLM(config, _do_init=False, dtype=jnp.bfloat16, use_scan=True)
 tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-350m", use_fast=False)
 
 
@@ -102,7 +102,7 @@ def generate(params, input_ids, attention_mask):
 # This will auto-magically run in mesh context
 start = time.time()
 gen_ids = p_generate(loaded_state.params, inputs["input_ids"], inputs["attention_mask"])
-generated_text = tokenizer.batch_decode(gen_ids.local_shards[0].data, skip_special_tokens=False)
+generated_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=False)
 if jax.process_index() == 0:
     print("Compilation time:", time.time() - start)
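
The batch_decode fix works because the tokenizer accepts any (batch, seq_len) array-like of token ids, so the array returned by the pjit-ed generate call can be decoded directly rather than via .local_shards. A minimal standalone sketch (same tokenizer checkpoint as the script; the prompt is illustrative):

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-350m", use_fast=False)
# build a (1, seq_len) array of token ids, standing in for gen_ids
ids = np.array(tokenizer("Recipe for coconut pasta:")["input_ids"])[None, :]
print(tokenizer.batch_decode(ids, skip_special_tokens=True))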
