Skip to content

Commit

Permalink
[Python]Update for chatting sample (#2677)
Browse files Browse the repository at this point in the history
* update content assessment and score calculations.

* add feedback on grammar and vocabulary

---------

Co-authored-by: v-liukun <[email protected]>
  • Loading branch information
l17813182030 and v-liukun authored Dec 3, 2024
1 parent a4323b1 commit e392c9c
Showing 1 changed file with 114 additions and 44 deletions.
158 changes: 114 additions & 44 deletions samples/python/console/chatting/chatting_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import time
import threading
import wave
import string
from typing import Literal

import numpy as np
import soundfile as sf
Expand Down Expand Up @@ -111,9 +113,70 @@ def chatting_from_file():
"""Performs chatting with Azure OAI and Azure Pronunciation Assessment asynchronously from audio files.
See more information at https://aka.ms/csspeech/pa"""

topic = "describe working dogs"
input_files = ["resources/chat_input_1.wav", "resources/chat_input_2.wav"]
reference_text = ""
chat_history = [
{
"role": "system",
"content": (
"You are a voice assistant, and when you answer questions,"
"your response should not exceed 25 words."
)
}
]
sample_sentence1 = "OK the movie i like to talk about is the cove it is very say phenomenal sensational documentary about adopting hunting practices in japan i think the director is called well i think the name escapes me anyway but well let's talk about the movie basically it's about dolphin hunting practices in japan there's a small village where where villagers fisherman Q almost twenty thousand dolphins on a yearly basis which is brutal and just explain massacre this book has influenced me a lot i still remember the first time i saw this movie i think it was in middle school one of my teachers showed it to all the class or the class and i remember we were going through some really boring topics like animal protection at that time it was really boring to me but right before everyone was going to just sleep in the class the teacher decided to put the textbook down and show us a clear from this document documentary we were shocked speechless to see the has of the dolphins chopped off and left on the beach and the C turning bloody red with their blood which is i felt sick i couldn't need fish for a whole week and it was lasting impression if not scarring impression and i think this movie is still very meaningful and it despite me a lot especially on wildlife protection dolphins or search beautiful intelligent animals of the sea and why do villagers and fishermen in japan killed it i assume there was a great benefit to its skin or some scientific research but the ironic thing is that they only kill them for the meat because the meat taste great that sickens me for awhile and i think the book inspired me to do a lot of different to do a lot of things about well i protection i follow news like"
sample_sentence2 = "yes i can speak how to this movie is it is worth young wolf young man this is this movie from korea it's a crime movies the movies on the movies speaker speaker or words of young man love hello a cow are you saying they end so i have to go to the go to the america or ha ha ha lots of years a go on the woman the woman is very old he talk to korea he carpool i want to go to the this way this whole home house this house is a is hey so what's your man and at the end the girl cause so there's a woman open open hum finally finds other wolf so what's your young man so the young man don't so yeah man the young man remember he said here's a woman also so am i it's very it's very very sad she is she is a crack credit thank you "
sample_sentence3 = "yes i want i want to talk about the TV series are enjoying watching a discount name is a friends and it's uh accommodate in the third decades decades an it come out the third decades and its main characters about a six friends live in the NYC but i watched it a long time ago i can't remember the name of them and the story is about what they are happening in their in their life and there are many things treating them and how the friendship are hard friendship and how the french how the strong strongly friendship they obtain them and they always have some funny things happen they only have happened something funny things and is a comedy so that was uh so many and i like this be cause of first adult cause it has a funding it has a farming serious and it can improve my english english words and on the other hand it can i can know about a lot of cultures about the united states and i i first hear about death TV series it's come out of a website and i took into and i watch it after my after my finish my studies and when i was a bad mood when i when i'm in a bad mood or i "
type_of_feedback = {
"comment_on_vocabulary": "lexical",
"comment_on_grammar": "grammatical",
}

def get_request_data(send_text, scenario=Literal["chat", "content_assess", "comment_on_vocabulary", "comment_on_grammar"]):
promptForFeedback = (
'From a professional perspective, provide a {type_of_feedback} evaluation of the following passage: "{passage}" First, give a summary evaluation, then list the issues and provide suggestions, keeping it within 50 words. output format as '
'"Summary Evaluation: *'
'Issues and Suggestions:'
' 1. *'
' ..."'
)
if scenario == "content_assess":
data = [
{
"role": "system",
"content":
'You are an English teacher and please help to grade a student\'s essay from vocabulary and grammar relevance on how well the essay aligns, and output format as: {"vocabulary": *.**(0-100), "grammar": *.**(0-100)}.',
},
{
"role": "user",
"content": (
f'Example1: this essay: "{sample_sentence1}" has vocabulary and grammar scores of 80 and 80, respectively.'
f'Example2: this essay: "{sample_sentence2}" has vocabulary and grammar scores of 40 and 43, respectively.'
f'Example3: this essay: "{sample_sentence3}" has vocabulary and grammar scores of 50 and 50, respectively.'
f'The essay for you to score is "{send_text}".'
'The script is from speech recognition so that please first add punctuations when needed, remove duplicates and unnecessary un uh from oral speech, then find all the misuse of words and grammar errors in this essay, find advanced words and grammar usages, and finally give scores based on this information. Please only response as this format {"vocabulary": *.**(0-100), "grammar": *.**(0-100)}.'
)
}
]
return data
elif scenario == "comment_on_vocabulary":
return [{
"role": "user",
"content": promptForFeedback.format(type_of_feedback=type_of_feedback["comment_on_vocabulary"], passage=send_text)
}]
elif scenario == "comment_on_grammar":
return [{
"role": "user",
"content": promptForFeedback.format(type_of_feedback=type_of_feedback["comment_on_grammar"], passage=send_text)
}]
else:
chat_history.append(
{
"role": "user",
"content": send_text
}
)
return chat_history

def stt(filename):
result_text = []
Expand Down Expand Up @@ -148,30 +211,25 @@ def stop_cb(evt: speechsdk.SessionEventArgs):
print("YOU: ", text)
return text

def call_gpt(send_text):
def call_gpt(send_text, scenario=Literal["chat", "content_assess", "comment_on_vocabulary", "comment_on_grammar"]):
url = (
f"https://{oai_resource_name}.openai.azure.com/openai/deployments/{oai_deployment_name}/"
f"chat/completions?api-version={oai_api_version}"
)
headers = {"Content-Type": "application/json", "api-key": oai_api_key}
data = {"messages": [
{
"role": "system",
"content": (
"You are a voice assistant, and when you answer questions,"
"your response should not exceed 25 words."
)
},
{
"role": "user",
"content": send_text
}
]}
data = {"messages": get_request_data(send_text, scenario)}

text = json.loads(
requests.post(url=url, headers=headers, data=json.dumps(data)).content
)["choices"][0]["message"]["content"]
print("GPT: ", text)
if scenario == "chat":
chat_history.append(
{
"role": "assistant",
"content": text
}
)
print("GPT: ", text)
return text

def tts(text, output_path, tag=None):
Expand Down Expand Up @@ -218,7 +276,6 @@ def pronunciation_assessment():
}
pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(json_string))
pronunciation_config.enable_prosody_assessment()
pronunciation_config.enable_content_assessment_with_topic(topic)
pronunciation_config.reference_text = reference_text.strip()

# Creates a speech recognizer using a file as audio input.
Expand All @@ -236,9 +293,10 @@ def pronunciation_assessment():
prosody_scores = []
fluency_scores = []
durations = []
results = []
json_words = []
display_text = ""
startOffset = 0
endOffset = 0

def stop_cb(evt):
"""callback that signals to stop continuous recognition upon receiving an event `evt`"""
Expand All @@ -247,17 +305,19 @@ def stop_cb(evt):

def recognized(evt):
pronunciation_result = speechsdk.PronunciationAssessmentResult(evt.result)
nonlocal recognized_words, prosody_scores, fluency_scores, durations, results, json_words, display_text
results.append(pronunciation_result)
nonlocal recognized_words, prosody_scores, fluency_scores, durations, json_words, display_text, startOffset, endOffset
recognized_words += pronunciation_result.words
fluency_scores.append(pronunciation_result.fluency_score)
json_result = evt.result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult)
jo = json.loads(json_result)
nb = jo["NBest"][0]
display_text += nb["Display"]
display_text += nb["Display"] + " "
json_words += nb["Words"]
prosody_scores.append(pronunciation_result.prosody_score)
durations.append(sum([int(w["Duration"]) for w in nb["Words"]]))
durations.extend([int(w["Duration"]) + 100000 for w in nb["Words"]])
if startOffset == 0:
startOffset = nb["Words"][0]["Offset"]
endOffset = nb["Words"][-1]["Offset"] + nb["Words"][-1]["Duration"] + 100000

# Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognized.connect(recognized)
Expand All @@ -277,6 +337,11 @@ def recognized(evt):
speech_recognizer.stop_continuous_recognition()
push_stream_writer_thread.join()

# Get content scores
content_result = json.loads(call_gpt(display_text, "content_assess"))

durations_sum = sum([d for w, d in zip(recognized_words, durations) if w.error_type == "None"])

# We can calculate whole accuracy by averaging
final_accuracy_scores = []
for word in recognized_words:
Expand All @@ -293,46 +358,45 @@ def recognized(evt):
prosody_score = sum(prosody_scores) / len(prosody_scores)

# Re-calculate fluency score
fluency_score = sum([x * y for (x, y) in zip(fluency_scores, durations)]) / sum(durations)
fluency_score = 0
if startOffset > 0:
fluency_score = durations_sum / (endOffset - startOffset) * 100

pron_score = accuracy_score * 0.6 + fluency_score * 0.2 + prosody_score * 0.2
sorted_scores = sorted([accuracy_score, prosody_score, fluency_score])
pron_score = sorted_scores[0] * 0.6 + sorted_scores[1] * 0.2 + sorted_scores[2] * 0.2
print(f"Pronunciation score: {pron_score:.1f}")
print(f"Accuracy Score: {accuracy_score:.1f}")
if not math.isnan(prosody_score):
print(f"Prosody Score: {prosody_score:.1f}")
print(f"Fluency Score: {fluency_score:.1f}")
# Content assessment result is in the last pronunciation assessment block
assert results[-1].content_assessment_result is not None
content_result = results[-1].content_assessment_result
print(f"Vocabulary score: {content_result.vocabulary_score:.1f}")
print(f"Grammar score: {content_result.grammar_score:.1f}")
print(f"Topic score: {content_result.topic_score:.1f}")
print(f"Vocabulary score: {content_result['vocabulary']:.1f}")
print(f"Grammar score: {content_result['grammar']:.1f}")

comment_result(
{
"accuracy score": accuracy_score,
"fluency score": fluency_score,
"prosody score": prosody_score,
"vocabulary score": content_result.vocabulary_score,
"grammar score": content_result.grammar_score,
"topic score": content_result.topic_score
"vocabulary score": content_result["vocabulary"],
"grammar score": content_result["grammar"],
},
set_punctuation(json_words, display_text),
[
word for word in json_words
if word["PronunciationAssessment"]["ErrorType"] == "Mispronunciation"
or word["PronunciationAssessment"]["AccuracyScore"] < 60
],
merged_audio_path
merged_audio_path,
display_text
)

def set_punctuation(json_words, display_text):
for idx, word in enumerate(display_text.split(" ")):
if not word[-1].isalpha() and not word[-1].isdigit():
json_words[idx]["is_punctuation"] = True
if word.strip() and word.strip()[-1] in string.punctuation:
json_words[idx]["has_punctuation"] = True
return json_words

def comment_result(scores_dict, json_words, mis_pronunciation_words, merged_audio_path):
def comment_result(scores_dict, json_words, mis_pronunciation_words, merged_audio_path, content):
message_dict = {
"Excellent": [],
"Good": [],
Expand Down Expand Up @@ -366,10 +430,10 @@ def get_prosody_error(error_type, word, last_word):
if break_error_info == "null":
return False
if error_type == "MissingBreak":
if break_error_info["Confidence"] >= threshold and last_word.get("is_punctuation", False):
if break_error_info["Confidence"] >= threshold and last_word.get("has_punctuation", False):
return True
if error_type == "UnexpectedBreak":
if break_error_info["Confidence"] >= threshold and not last_word.get("is_punctuation", False):
if break_error_info["Confidence"] >= threshold and not last_word.get("has_punctuation", False):
return True
return False
elif error_type == "Monotone" and \
Expand All @@ -392,18 +456,18 @@ def get_error_message(error_types):
for error_type in error_types:
if len(error_dict[error_type]) != 0:
message += f"{error_type} count is {len(error_dict[error_type])}. near the word "
message += f"{' '.join([word['Word'].strip() for word in error_dict[error_type]])}. "
message += f"{', '.join([word['Word'].strip() for word in error_dict[error_type]])}. "

return message

def get_report(json_words, mis_pronunciation_words, merged_audio_path):
def get_report(json_words, mis_pronunciation_words, merged_audio_path, content):

set_error_dict(json_words)

report_audio_list = []
accuracy_report_audio_list = []
report_path = "output/chat_report.wav"
if len(mis_pronunciation_words) != 0:
accuracy_report_audio_list = []
accuracy_report_path = "output/accuracy_report.wav"
for idx, mis_word in enumerate(mis_pronunciation_words):
origin_content = ""
Expand Down Expand Up @@ -440,6 +504,12 @@ def get_report(json_words, mis_pronunciation_words, merged_audio_path):
tts(origin_content, fluency_prosody_report_path)
report_audio_list.append(fluency_prosody_report_path)

if content.strip() != "":
vocabulary_feedback_path = "output/vocabulary_feedback.wav"
grammar_feedback_path = "output/grammar_feedback.wav"
tts(call_gpt(content, "comment_on_vocabulary"), vocabulary_feedback_path, "vocabulary feedback")
tts(call_gpt(content, "comment_on_grammar"), grammar_feedback_path, "grammar feedback")

merge_wav(report_audio_list, report_path, "report")

def get_score_comment(scores_dict):
Expand All @@ -455,12 +525,12 @@ def get_score_comment(scores_dict):
tts(messages, "output/chat_score_comment.wav", "score comment")

get_score_comment(scores_dict)
get_report(json_words, mis_pronunciation_words, merged_audio_path)
get_report(json_words, mis_pronunciation_words, merged_audio_path, content)

if not os.path.exists("output"):
os.makedirs("output")
for idx, file in enumerate(input_files):
tts(call_gpt(stt(file)), f"output/gpt_output_{idx+1}.wav", "GPT output")
tts(call_gpt(stt(file), "chat"), f"output/gpt_output_{idx+1}.wav", "GPT output")
print("Generate the final report ......")
pronunciation_assessment()

Expand Down

0 comments on commit e392c9c

Please sign in to comment.