-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathstep2_generate_instruction.py
122 lines (102 loc) · 5.59 KB
/
step2_generate_instruction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from tqdm import tqdm
from openai import OpenAI
from beir.datasets.data_loader import GenericDataLoader
import os
import json
import pickle
import openai
import time
import argparse
# User-turn instructions sent with every query; the query itself is appended
# on a final "Query: ..." line. The wording is part of the generation recipe,
# so it is kept verbatim.
_STEP2_USER_PROMPT = """Your task is to generate a set of scenarios for the provided search query.
Here is the specification for the scenario generation task:
- The scenario should reflect a very specific scenario where a user is interacting with an AI search engine.
- Within the scenario, the user could write about his/her job, background, situation, location, occupation, hobbies, interests, or goals of doing the search. Also, the user could explicitly reflect about his/her preference regarding the document to be searched.
- The scenario SHOULD be written from a first person’s view point. For example, it should start with phrases like “I am a {job},”, “I am in a situation ….”, “During my {situation}”.
- While the provided query is about "what" is being searched for, the scenario you will generate should be about "how" the search should be approached and what values or criteria should be prioritized in that search. This distinction makes the role of the scenario clear as a guiding framework for the search process, as opposed to the query which is more about the specific target of the search.
- However, the scenario should be RELATED with the provided query. In other words, it shouldn’t be applicable to other queries in general.
- You should generate based on the following format (note that there is a phrase “[END]” after each scenario being generated):
Scenario 1: {generate the first scenario} [END]
Scenario 2: {generate the second scenario} [END]
Scenario 10: {generate the last scenario} [END]
- Please do not generate any other opening, closing, and explanations. Just generate the set of scenarios!"""


def _request_scenarios(client, model, temperature, system_prompt, query,
                       max_retries=3, initial_backoff=30):
    """Ask the chat model for scenarios for one query, retrying on API errors.

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.
        system_prompt: Text of the system message (stripped before sending).
        query: Search query appended to the fixed user prompt.
        max_retries: Number of retries after the first attempt.
        initial_backoff: Seconds to wait before the first retry; the wait is
            multiplied by 1.5 after every failed attempt.

    Returns:
        The completion text, or ``None`` when no attempt succeeded or the
        model returned no text.
    """
    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": _STEP2_USER_PROMPT + f"\nQuery: {query}"},
    ]
    backoff_time = initial_backoff
    for attempt in range(max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
            )
        except openai.APIError as e:
            print(f"OpenAIError: {e}.")
            if "Please reduce your prompt" in str(e):
                # The prompt is fixed apart from the query, so resending the
                # same request would fail the same way; give up immediately.
                print("Prompt is too long for the model, skipping this query.")
                return None
            if attempt == max_retries:
                # No attempt left, so waiting again would only waste time.
                break
            print(f"Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            backoff_time *= 1.5
        else:
            return completion.choices[0].message.content
    return None


def _parse_scenarios(generated_text):
    """Extract the scenario texts from a raw ``Scenario N: ... [END]`` reply.

    Chunks that do not mention ``Scenario``, have no ``:`` separator, or have
    nothing after the separator are ignored instead of raising.
    """
    scenarios = []
    for chunk in (generated_text or "").split('[END]'):
        if 'Scenario' not in chunk:
            continue
        _label, colon, body = chunk.partition(':')
        body = body.strip()
        if colon and body:
            scenarios.append(body)
    return scenarios


def main(args: argparse.Namespace) -> None:
    """Generate search scenarios for every query in the qrels and save them.

    Loads the "test" split found under ``args.data_path``, asks the chat model
    for a set of scenarios per query id present in the qrels, and writes two
    JSON files into ``args.save_path``:

    * ``raw_dataset.step2_generate_instruction.json`` - the raw model replies.
    * ``processed_dataset.step2_generate_instruction.json`` - the replies
      split into one list of scenario strings per query.

    Queries for which no reply could be obtained are reported and left out of
    both files.

    Args:
        args: Parsed command-line arguments providing ``data_path``,
            ``save_path`` and ``open_ai_api_key``.
    """
    corpus, queries, qrels = GenericDataLoader(data_folder=args.data_path).load(split="test")
    print("len(corpus),len(queries),len(qrels):", len(corpus), len(queries), len(qrels))

    client = OpenAI(
        api_key=args.open_ai_api_key,
    )
    step2_prompt = "You are a helpful, respectful and creative assistant."
    model = 'gpt-4-1106-preview'
    temperature = 1.0

    results = []
    failed_qids = []
    for qid in tqdm(qrels):
        query = queries[qid]
        response = _request_scenarios(client, model, temperature, step2_prompt, query)
        if response is None:
            # Never record a reply that belongs to a different query.
            failed_qids.append(qid)
            continue
        results.append({
            "query_id": qid,
            "input": query,
            "generated_text": response,
            "metadata": {
                "input_type": 'step2',
                "model": model,
                # NOTE: key spelling kept as-is so already-written files and
                # their readers stay compatible.
                "temparature": temperature,
            }
        })

    print(f"Processing Done - total: {len(results)}")
    if failed_qids:
        print(f"No response generated for {len(failed_qids)} queries: {failed_qids}")

    os.makedirs(args.save_path, exist_ok=True)
    raw_path = os.path.join(args.save_path, "raw_dataset.step2_generate_instruction.json")
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # must be explicit rather than the platform default.
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(results, indent=4, ensure_ascii=False))

    # * Post processing
    processed_results = [
        {
            "query_id": pair['query_id'],
            "query": pair['input'],
            "instructions": _parse_scenarios(pair['generated_text']),
        }
        for pair in results
    ]
    print(len(processed_results))

    processed_path = os.path.join(args.save_path, "processed_dataset.step2_generate_instruction.json")
    with open(processed_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(processed_results, indent=4, ensure_ascii=False))
if __name__ == "__main__":
    # Script entry point: declare the command-line options, parse them
    # (unrecognised options are tolerated) and run the generation step.
    option_specs = (
        ("--open_ai_api_key", {"required": True, "help": "OPENAI_API_KEY"}),
        ("--data_path", {"default": "./msmarco/filtered_version",
                         "help": "filtered dataset folder after stage 1"}),
        ("--save_path", {"default": "generated_data/step2_generate_instruction",
                         "help": "save path"}),
    )
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for flag, extra in option_specs:
        # Every option is a plain string; only default/required/help differ.
        cli.add_argument(flag, type=str, **extra)
    parsed_args = cli.parse_known_args()[0]
    main(parsed_args)