Commit 9d8dab4

update

1 parent: 153915a
2 files changed (+97, −0)


llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md

Lines changed: 8 additions & 0 deletions
@@ -36,6 +36,14 @@ python basic.py -m <path_to_model> -d <cpu_or_xpu> -q <query_to_LLM>
> pip install -U transformers==4.37.0 tokenizers==0.15.2
> ```

### Low Bit Example

The example [low_bit.py](./low_bit.py) shows how to save and load a low-bit model with `IpexLLM` on Intel CPU or GPU and conduct tasks such as text completion. Run the example as follows:

```bash
python low_bit.py -m <path_to_model> -d <cpu_or_xpu> -q <query_to_LLM>
```

### More Data Types Example

By default, `IpexLLM` loads the model in int4 format. To load a model in a different data format, such as `sym_int5` or `sym_int8`, you can use the `load_in_low_bit` option in `IpexLLM`. To load the model on a different device, such as `cpu` or `xpu`, you can use the `device_map` option in `IpexLLM`.
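As a quick illustration of those two options, here is a minimal sketch of loading a model in `sym_int8` format on an Intel GPU. It reuses the zephyr checkpoint from the example below; the exact values are placeholders:

```python
# Minimal sketch of the load_in_low_bit / device_map options described above;
# the model id and device are illustrative placeholders.
from llama_index.llms.ipex_llm import IpexLLM

llm = IpexLLM.from_model_id(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    load_in_low_bit="sym_int8",  # instead of the default int4 format
    device_map="xpu",  # or "cpu"
)
```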
llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/low_bit.py

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
```python
def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# Transform a list of chat messages into zephyr-specific input
def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"

    return prompt


from llama_index.llms.ipex_llm import IpexLLM
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="IpexLLM Low-Bit Usage Example")
    parser.add_argument(
        "--model-name",
        "-m",
        type=str,
        default="HuggingFaceH4/zephyr-7b-alpha",
        help="The huggingface repo id for the LLM model to be downloaded"
        ", or the path to the huggingface checkpoint folder",
    )
    parser.add_argument(
        "--device",
        "-d",
        type=str,
        default="cpu",
        choices=["cpu", "xpu"],
        help="The device (Intel CPU or Intel GPU) the LLM model runs on",
    )
    parser.add_argument(
        "--query",
        "-q",
        type=str,
        default="What is IPEX-LLM?",
        help="The sentence you prefer for querying the LLM",
    )

    args = parser.parse_args()
    model_name = args.model_name
    device = args.device
    query = args.query

    # Load the model in the default int4 (low-bit) format
    llm = IpexLLM.from_model_id(
        model_name=model_name,
        tokenizer_name=model_name,
        context_window=512,
        max_new_tokens=128,
        generate_kwargs={"do_sample": False},
        completion_to_prompt=completion_to_prompt,
        messages_to_prompt=messages_to_prompt,
        device_map=device,
    )

    saved_lowbit_model_path = "./zephyr-7b-alpha-low-bit"  # path to save low-bit model

    llm._model.save_low_bit(saved_lowbit_model_path)
    del llm

    # Reload the saved low-bit model; the tokenizer is still loaded from the
    # original model path
    llm_lowbit = IpexLLM.from_model_id_low_bit(
        model_name=saved_lowbit_model_path,
        tokenizer_name=model_name,
        # tokenizer_name=saved_lowbit_model_path,  # copy the tokenizers to saved path if you want to use it this way
        context_window=512,
        max_new_tokens=64,
        completion_to_prompt=completion_to_prompt,
        generate_kwargs={"do_sample": False},
        device_map=device,
    )

    # Stream the completion token by token
    response_iter = llm_lowbit.stream_complete(query)
    for response in response_iter:
        print(response.delta, end="", flush=True)
```
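A note on the commented-out `tokenizer_name=saved_lowbit_model_path` line above: for that variant to work, the tokenizer files must first be copied into the save directory. One way to do that, sketched here under the assumption of a standard Hugging Face tokenizer (`AutoTokenizer` comes from `transformers` and is not part of this example):

```python
# Hypothetical helper step, not part of low_bit.py: save the tokenizer next
# to the low-bit weights so both can later be loaded from the same path.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(saved_lowbit_model_path)
```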
