example_wt_usage_toolkit.py
#!/usr/bin/env python
# tokens_words_counter.py
#
# Description: A tool to count tokens and words in text files
# using the Anthropic API and calculate thinking budgets.
#
# Usage:
# python -B tokens_words_counter.py --text_file input.txt
# python -B tokens_words_counter.py --text_file input.txt --context_window 200000 --thinking_budget_tokens 32000
import anthropic
import argparse
import sys
import io
import time
from datetime import datetime
# Make stdout line-buffered (effectively the same as flush=True for each print)
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, line_buffering=True)
def parse_arguments():
parser = argparse.ArgumentParser(
description='Count tokens and words in a text file using Claude API.',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Example usages:
python -B tokens_words_counter.py --text_file input.txt
python -B tokens_words_counter.py --text_file input.txt --context_window 200000 --thinking_budget_tokens 32000
python -B tokens_words_counter.py --text_file input.txt --desired_output_tokens 20000
"""
)
# Create argument groups with section headers
input_group = parser.add_argument_group('Input Files')
api_group = parser.add_argument_group('Claude API Configuration')
output_group = parser.add_argument_group('Output Configuration')
# Add arguments to the Input Files group
input_group.add_argument('--text_file', type=str, required=True,
help="File containing the text to analyze (required)")
# Add arguments to the Claude API Configuration group
api_group.add_argument('--context_window', type=int, default=200000,
help='Context window for Claude 3.7 Sonnet (default: 200000)')
api_group.add_argument('--betas_max_tokens', type=int, default=128000,
help='Maximum tokens for AI output (default: 128000)')
api_group.add_argument('--thinking_budget_tokens', type=int, default=32000,
help='Maximum tokens for AI thinking (default: 32000)')
api_group.add_argument('--desired_output_tokens', type=int, default=12000,
        help='Desired number of visible output tokens to generate before stopping (default: 12000)')
api_group.add_argument('--request_timeout', type=int, default=300,
help='Maximum timeout for each *streamed chunk* of output (default: 300 seconds)')
# Add arguments to the Output Configuration group
    # NOTE: --save_dir is currently unused; this tool only prints counts
output_group.add_argument('--save_dir', type=str, default=".",
help='Directory to save character analysis reports (default: current directory)')
return parser.parse_args()
def read_text_file(file_path):
"""Read text file content with error handling."""
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
if not content.strip():
print(f"Error: text file '{file_path}' is empty.")
if __name__ == "__main__":
sys.exit(1)
else:
return None
return content
except FileNotFoundError:
print(f"Error: text file '{file_path}' not found.")
if __name__ == "__main__":
sys.exit(1)
else:
return None
except Exception as e:
print(f"Error reading text file '{file_path}': {e}")
if __name__ == "__main__":
sys.exit(1)
else:
return None
def count_tokens(client, text):
"""Count tokens in text using Anthropic API."""
try:
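        # The count_tokens endpoint only tallies the prompt; it does not generate any
        # output. The `thinking` and `betas` fields are included so the count is taken
        # under the same extended-thinking / extended-output settings as a real request.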
response = client.beta.messages.count_tokens(
model="claude-3-7-sonnet-20250219",
thinking={
"type": "enabled",
"budget_tokens": 128000
},
messages=[{"role": "user", "content": text}],
betas=["output-128k-2025-02-19"]
)
return response.input_tokens
except Exception as e:
print(f"Token counting error: {e}")
if __name__ == "__main__":
sys.exit(1)
else:
return 0
def count_words(text):
"""Count the number of words in a text string."""
return len(text.split())
def main(file_path=None, context_window=200000, betas_max_tokens=128000,
thinking_budget_tokens=32000, desired_output_tokens=12000,
request_timeout=300, return_results=False):
"""
Main function that can be called directly or via command line.
Args:
file_path (str, optional): Path to the text file to analyze
context_window (int): Context window size for Claude
betas_max_tokens (int): Maximum tokens for AI output
thinking_budget_tokens (int): Maximum tokens for AI thinking
desired_output_tokens (int): User desired output tokens
request_timeout (int): API request timeout in seconds
return_results (bool): Whether to return results as a dictionary
Returns:
        bool or dict: If return_results is False, returns True when the thinking
            budget is sufficient, or False on failure.
            If return_results is True, returns a dictionary of analysis results,
            or None on failure.
"""
# Parse arguments from command line if file_path not provided
if file_path is None:
args = parse_arguments()
file_path = args.text_file
context_window = args.context_window
betas_max_tokens = args.betas_max_tokens
thinking_budget_tokens = args.thinking_budget_tokens
desired_output_tokens = args.desired_output_tokens
request_timeout = args.request_timeout
# Initialize Anthropic client
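    # (anthropic.Anthropic() reads the ANTHROPIC_API_KEY environment variable by default)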
client = anthropic.Anthropic(
timeout=request_timeout,
max_retries=0
)
# Read text file
text = read_text_file(file_path)
if text is None:
return None if return_results else False
# Count words
word_count = count_words(text)
# Count tokens
print(f"Counting tokens for text file: {file_path}")
prompt_token_count = count_tokens(client, text)
if prompt_token_count == 0:
return None if return_results else False
# Calculate available tokens after prompt
prompt_tokens = prompt_token_count
available_tokens = context_window - prompt_tokens
# For API call, max_tokens must respect the API limit
max_tokens = min(available_tokens, betas_max_tokens)
# Thinking budget must be LESS than max_tokens to leave room for visible output
thinking_budget = max_tokens - desired_output_tokens
if thinking_budget > 32000:
print(f"Warning: thinking budget is larger than 32K, reset to 32K. Use batch for larger thinking budgets.")
thinking_budget = 32000
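    # Worked example with hypothetical numbers: context_window=200000 and a
    # 6,000-token prompt give available_tokens=194000; max_tokens=min(194000, 128000)
    # =128000; thinking_budget=128000-12000=116000, which is then capped at 32000.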
# Display results
print(f"\nToken stats:")
print(f"Word count: {word_count}")
print(f"Max AI model context window: [{context_window}] tokens")
print(f"Input prompt tokens: [{prompt_tokens}]")
print(f"Available tokens: [{available_tokens}] = {context_window} - {prompt_tokens}")
print(f"Desired output tokens: [{desired_output_tokens}]")
print(f"AI model thinking budget: [{thinking_budget}] tokens")
print(f"Max output tokens (max_tokens): [{max_tokens}] tokens")
has_sufficient_budget = thinking_budget >= thinking_budget_tokens
if not has_sufficient_budget:
print(f"Error: prompt is too large to have a {thinking_budget_tokens} thinking budget!")
if __name__ == "__main__" and not return_results:
sys.exit(1)
else:
print(f"✓ Thinking budget is sufficient!")
print(f"✓ Text is ready for use with requested thinking budget of {thinking_budget_tokens} tokens")
# Words per token ratio
words_per_token = word_count / prompt_tokens if prompt_tokens > 0 else 0
print(f"Words per token ratio: {words_per_token:.2f}\n")
# Simple summary output at the end
print(f"\n***************************************************************************")
print(f"Counts for text file: {file_path}")
print(f"\n{word_count} words\n")
print(f"\n{prompt_tokens} tokens using 'client.beta.messages.count_tokens'")
print(f"\n***************************************************************************")
if return_results:
return {
"word_count": word_count,
"token_count": prompt_tokens,
"available_tokens": available_tokens,
"thinking_budget": thinking_budget,
"max_tokens": max_tokens,
"words_per_token": words_per_token,
"has_sufficient_budget": has_sufficient_budget
}
return has_sufficient_budget
if __name__ == "__main__":
main()
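
# Example (illustrative): using this file as a module. The import name below assumes
# the script is saved as tokens_words_counter.py, as in the header comment.
#
#   from tokens_words_counter import main
#
#   results = main(file_path="input.txt", return_results=True)
#   if results:
#       print(results["word_count"], "words,", results["token_count"], "tokens")
#       if not results["has_sufficient_budget"]:
#           print("Prompt too large for the requested thinking budget")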