-
Notifications
You must be signed in to change notification settings - Fork 99
/
Copy pathexport_logs_py.py
256 lines (206 loc) · 9.12 KB
/
export_logs_py.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# Copyright 2020 IBM All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author: Simon O'Doherty
# Contact: [email protected]
#
# Note: The code below is a sample provided to illustrate one way
# to approach this issue and is used as is and at your own risk. In order
# for this example to perform as intended, the script must be laid out exactly
# as indicated below. Product Support cannot customize this script for specific
# environments or applications.
""" Notes on running.
Just run the program for the command line options.
* No longer backward compatible. Only apikey authentication supported.
* apikey + url you can get from your service credentials page in IBM Cloud.
* Default is to read your workspace. Use --logtype to change if you want assistant or deployment.
* All log types require an ID that relates to the log you are trying to pull.
* Using the filter option will negate the following settings and you have to set yourself
* language, logtype, id.
* If you scroll down you can hard code your defaults easily.
* built using python 3.8.
Example command lines:
* python export_logs.py apikey workspace_id test.json
* python export_logs.py apikey assistant_id test.xlsx --logtype ASSISTANT --filetype XLSX --url service_url
* python export_logs.py apikey deployment_id test.csv --logtype DEPLOYMENT --filetype CSV --strip --url service_url
"""
import pandas as pd
import argparse
import json
from ibm_watson import AssistantV1 as WatsonAssistant
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from urllib.parse import urlparse, parse_qs
# Log-source types selectable via --logtype.
C_DEPLOYMENT = 'DEPLOYMENT'
C_ASSISTANT = 'ASSISTANT'
C_WORKSPACE = 'WORKSPACE'
# Output file formats selectable via --filetype.
C_CSV = 'CSV'
C_TSV = 'TSV'
C_XLSX = 'XLSX'
C_JSON = 'JSON'
# Keys used when reading the JSON log records returned by the API.
c_RESPONSE = 'response'
c_CONTEXT = 'context'
c_SYSTEM = 'system'
c_INPUT = 'input'
c_OUTPUT = 'output'
c_INTENTS = 'intents'
c_INTENT = 'intent'
c_TEXT = 'text'
c_BRANCH_EXITED_REASON = 'branch_exited_reason'
c_LOG_MESSAGING = 'log_messaging'
c_CONFIDENCE = 'confidence'
c_LOGS = 'logs'
# Pagination keys: next_url carries the cursor query parameter for the next page.
c_PAGINATION = 'pagination'
c_NEXT_URL = 'next_url'
c_CURSOR = 'cursor'
# If you want to hard code your main defaults.
default_version = '2020-04-01'
default_url = 'https://gateway.watsonplatform.net/assistant/api'
default_logtype = C_WORKSPACE
default_language = 'en'
# Command-line interface. Positional args are required; the rest fall back
# to the hard-coded defaults above.
parser = argparse.ArgumentParser()
parser.add_argument('apikey', help='Watson Assistant API Key.', type=str)
parser.add_argument('id', help=f'identifier for logtype. For example workspace_id if {C_WORKSPACE} was specified.', type=str)
parser.add_argument('filename', help='Output file name.', type=str)
parser.add_argument('--logtype', help=f'What logs to pull. Default is {default_logtype}.',
                    type=str, default=default_logtype, choices=[C_ASSISTANT, C_WORKSPACE, C_DEPLOYMENT])
parser.add_argument('--language', help=f'Default is {default_language}.', type=str, default=default_language)
parser.add_argument('--filetype', help=f'Output file type. Can be: {C_CSV}, {C_TSV}, {C_XLSX}, {C_JSON} (default)',
                    type=str, default='JSON', choices=[C_CSV, C_TSV, C_XLSX, C_JSON])
parser.add_argument('--url', help=f'Default is {default_url}.', type=str, default=default_url)
parser.add_argument('--version', help=f'Default is {default_version}.', type=str, default=default_version)
parser.add_argument('--totalpages', help='Maximum number of pages to pull. Default is 999', type=int, default=999)
parser.add_argument('--pagelimit', help='Maximum number of records to a page. Default is 200.', type=int, default=200)
parser.add_argument('--filter', help='Search filter to use. This overrides logtype, so you will need to manually set.',
                    type=str, default=None)
# Bug fix: argparse's type=bool requires a value and treats ANY non-empty
# string (including "False") as True, so the documented bare "--strip" usage
# in the module docstring failed. store_true makes it a proper flag that
# defaults to False, matching the documented command lines.
parser.add_argument('--strip', help='Strip newlines from output text. Default is false.', action='store_true')
args = parser.parse_args()
# Output column headings for the flattened log records.
f_conversation_id = 'conversation_id'
f_request_timestamp = 'request_timestamp'
f_response_timestamp = 'response_timestamp'
f_user_input = 'User Input'
f_output = 'Output'
f_intent = 'Intent'
f_confidence = 'Confidence'
f_exit_reason = 'Exit Reason'
f_logging = 'Logging'
f_context = 'Context'
# Column order for the generated dataframe (and therefore the output file).
columns = [
    f_conversation_id, f_request_timestamp, f_response_timestamp,
    f_user_input, f_output, f_intent, f_confidence, f_exit_reason, f_logging, f_context
]
# Saving methods.
def save_json(data=None, file_name=None):
    """Write the raw log pages to *file_name* as one JSON document.

    :param data: list of log-record pages (list of lists of dicts).
    :param file_name: path of the output file.
    """
    # Explicit utf-8 so the result does not depend on the platform's
    # default encoding (json.dump escapes non-ASCII by default, so the
    # bytes written are unchanged).
    with open(file_name, 'w', encoding='utf-8') as out:
        json.dump(data, out)
def save_xsv(data=None, sep=',', file_name=None):
    """Flatten the log pages and write them as a delimited text file.

    :param data: list of log-record pages.
    :param sep: field delimiter (',' for CSV, '\\t' for TSV).
    :param file_name: path of the output file.
    """
    df = convert_json_to_dataframe(data)
    if df is not None:
        # Bug fix: previously wrote to args.filename, silently ignoring the
        # file_name parameter that callers pass in.
        df.to_csv(file_name, encoding='utf8', sep=sep, index=False)
def save_xlsx(data=None, file_name=None):
    """Flatten the log pages and write them as an Excel workbook.

    :param data: list of log-record pages.
    :param file_name: path of the output file.
    """
    df = convert_json_to_dataframe(data)
    if df is not None:
        # Bug fix: previously wrote to args.filename, silently ignoring the
        # file_name parameter that callers pass in.
        df.to_excel(file_name, index=False)
def convert_json_to_dataframe(data=None):
    """Flatten raw Watson Assistant log pages into a pandas DataFrame.

    :param data: list of pages, each a list of log-record dicts as
        returned by ``list_all_logs``.
    :returns: DataFrame with one row per log record, sorted by
        conversation ID then request time, or None when no logs exist.
    """
    # Generalized empty check: the original compared against exactly [[]],
    # which missed the multi-empty-page case; any() is falsy when every
    # page is empty (or there are no pages at all).
    if not any(data):
        print('No Logs found. :(')
        return None
    rows = []
    for data_records in data:
        for o in data_records:
            row = {}
            # Shorthand for the response object and its system context.
            r = o[c_RESPONSE]
            s = r[c_CONTEXT][c_SYSTEM]
            row[f_conversation_id] = r[c_CONTEXT][f_conversation_id]
            row[f_request_timestamp] = o[f_request_timestamp]
            row[f_response_timestamp] = o[f_response_timestamp]
            if c_TEXT in r[c_INPUT]:
                row[f_user_input] = r[c_INPUT][c_TEXT]
            if c_TEXT in r[c_OUTPUT]:
                row[f_output] = ' '.join(r[c_OUTPUT][c_TEXT])
                # Guarded by the output-text check: stripping a row with no
                # output text would otherwise raise KeyError.
                if args.strip:
                    # Bug fix: the original also replaced '\l', which is not
                    # an escape sequence — it removed literal backslash-l
                    # pairs. Only real line breaks are stripped now.
                    row[f_output] = row[f_output].replace('\n', '').replace('\r', '')
            if len(r[c_INTENTS]) > 0:
                row[f_confidence] = r[c_INTENTS][0][c_CONFIDENCE]
                row[f_intent] = r[c_INTENTS][0][c_INTENT]
            if c_BRANCH_EXITED_REASON in s:
                row[f_exit_reason] = s[c_BRANCH_EXITED_REASON]
            if c_LOG_MESSAGING in r[c_OUTPUT]:
                row[f_logging] = r[c_OUTPUT][c_LOG_MESSAGING]
            row[f_context] = json.dumps(r[c_CONTEXT])
            rows.append(row)
    # Build the dataframe.
    df = pd.DataFrame(rows, columns=columns)
    # Replace NaN so missing optional fields render as empty cells.
    df = df.fillna('')
    # to_excel cannot serialize timezone-aware datetimes, so only convert
    # the timestamp columns for non-XLSX output.
    if args.filetype != C_XLSX:
        df[f_request_timestamp] = pd.to_datetime(df[f_request_timestamp])
        df[f_response_timestamp] = pd.to_datetime(df[f_response_timestamp])
    # Sort by conversation then request time so transcripts read in order.
    df = df.sort_values([f_conversation_id, f_request_timestamp], ascending=[True, True])
    return df
# Make connection to Watson Assistant.
# Authenticate with the IAM apikey from the command line, then point the
# AssistantV1 client at the requested service URL and API version.
authenticator = IAMAuthenticator(args.apikey)
c = WatsonAssistant(version=args.version, authenticator=authenticator)
c.set_service_url(args.url)
# Determine how logs will be pulled: either an explicit --filter string,
# or a filter built from the logtype, language and ID.
logtype = None
pull_filter = None
if args.filter is not None:
    # An explicit filter wins over the logtype/id/language settings.
    print(f'Reading using filter: {args.filter}')
    pull_filter = args.filter
else:
    args.logtype = args.logtype.upper()
    # Map each log type to the search field that carries its ID.
    field_for_logtype = {
        C_WORKSPACE: 'workspace_id',
        C_ASSISTANT: 'request.context.system.assistant_id',
        C_DEPLOYMENT: 'request.context.metadata.deployment',
    }
    logtype = field_for_logtype.get(args.logtype)
    if logtype is None:
        # Defensive: argparse choices should already prevent this.
        print("Error: I don't understand logtype {}. Exiting.".format(args.logtype))
        exit(1)
    print(f'Reading {args.logtype} using ID {args.id}.')
    pull_filter = 'language::{},{}::{}'.format(args.language, logtype, args.id)
# Download the logs, one page at a time, following pagination cursors
# until the service reports no more pages or --totalpages is reached.
j = []
page_count = 1
cursor = None
count = 0
x = {c_PAGINATION: 'DUMMY'}
while x[c_PAGINATION]:
    if page_count > args.totalpages:
        break
    print('Reading page {}.'.format(page_count))
    x = c.list_all_logs(filter=pull_filter, cursor=cursor, page_limit=args.pagelimit).result
    j.append(x[c_LOGS])
    count = count + len(x[c_LOGS])
    page_count = page_count + 1
    if c_PAGINATION in x and c_NEXT_URL in x[c_PAGINATION]:
        # Pull the cursor for the next page out of next_url's query string.
        p = x[c_PAGINATION][c_NEXT_URL]
        u = urlparse(p)
        query = parse_qs(u.query)
        cursor = query[c_CURSOR][0]
    else:
        # Bug fix: without this break, a truthy pagination object that
        # lacks next_url left the cursor unchanged and re-fetched the
        # same page forever.
        break
# Determine how the file should be saved, dispatching on --filetype
# (JSON is the fallback for anything else, matching the default).
args.filetype = args.filetype.upper()
if args.filetype == C_CSV:
    save_xsv(data=j, sep=',', file_name=args.filename)
elif args.filetype == C_TSV:
    save_xsv(data=j, sep='\t', file_name=args.filename)
elif args.filetype == C_XLSX:
    save_xlsx(data=j, file_name=args.filename)
else:
    # Bug fix: removed the stray trailing comma, which made this call
    # expression a 1-tuple (harmless at runtime but a latent typo).
    save_json(data=j, file_name=args.filename)
print('Writing {} records to: {} as file type: {}'.format(count, args.filename, args.filetype))