Dev #795 (Merged)

Merged 5 commits on Aug 1, 2023.
209 changes: 206 additions & 3 deletions backend/functions/tasks.py
@@ -6,6 +6,7 @@
from .utils import (
get_batch_translations,
get_batch_ocr_predictions,
get_batch_asr_predictions,
)
from django.db import transaction, DataError, IntegrityError
from dataset.models import DatasetInstance
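
Note: the call sites in the new task below assume that get_batch_asr_predictions (imported above from .utils; its implementation is not part of this diff) returns a dict with "status" and "output" keys. A minimal sketch of that assumed contract, where call_asr_api is a hypothetical stand-in for the actual API call:

def get_batch_asr_predictions(curr_id, audio_url, api_type, language):
    # Assumed interface, inferred from the call sites in this diff:
    # {"status": "Success", "output": <predictions>} on success, any
    # other status on failure. call_asr_api is hypothetical.
    try:
        output = call_asr_api(audio_url, api_type, language)
        return {"status": "Success", "output": output}
    except Exception as e:
        return {"status": "Failure", "error": str(e)}
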
@@ -361,21 +362,66 @@ def generate_ocr_prediction_json(
try:
ocr_data_items = dataset_models.OCRDocument.objects.filter(
instance_id=dataset_instance_id
).values_list("id", "image_url", "ocr_prediction_json")
).values_list(
"id",
"metadata_json",
"draft_data_json",
"file_type",
"file_url",
"image_url",
"page_number",
"language",
"ocr_type",
"ocr_domain",
"ocr_transcribed_json",
"ocr_prediction_json",
"image_details_json",
"parent_data",
)
except Exception as e:
ocr_data_items = []

# Convert the fetched data items to a pandas DataFrame.
ocr_data_items_df = pd.DataFrame(
ocr_data_items,
columns=["id", "image_url", "ocr_prediction_json"],
columns=[
"id",
"metadata_json",
"draft_data_json",
"file_type",
"file_url",
"image_url",
"page_number",
"language",
"ocr_type",
"ocr_domain",
"ocr_transcribed_json",
"ocr_prediction_json",
"image_details_json",
"parent_data",
],
)

# Check if the dataframe is empty
if ocr_data_items_df.shape[0] == 0:
raise Exception("The OCR data is empty.")

required_columns = {"id", "image_url", "ocr_prediction_json"}
required_columns = {
"id",
"metadata_json",
"draft_data_json",
"file_type",
"file_url",
"image_url",
"page_number",
"language",
"ocr_type",
"ocr_domain",
"ocr_transcribed_json",
"ocr_prediction_json",
"image_details_json",
"parent_data",
}
if not required_columns.issubset(ocr_data_items_df.columns):
missing_columns = required_columns - set(ocr_data_items_df.columns)
raise ValueError(
@@ -408,8 +454,19 @@
ocr_document = dataset_models.OCRDocument(
instance_id_id=dataset_instance_id,
id=curr_id,
metadata_json=row["metadata_json"],
draft_data_json=row["draft_data_json"],
file_type=row["file_type"],
file_url=row["file_url"],
image_url=image_url,
page_number=row["page_number"],
language=row["language"],
ocr_type=row["ocr_type"],
ocr_domain=row["ocr_domain"],
ocr_transcribed_json=row["ocr_transcribed_json"],
ocr_prediction_json=ocr_predictions_json,
image_details_json=row["image_details_json"],
parent_data=row["parent_data"],
)
with transaction.atomic():
ocr_document.save()
@@ -430,6 +487,152 @@ def generate_ocr_prediction_json(
return f"{success_count} out of {total_count} populated"
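
Design note: constructing a Django model instance with only some fields set and calling save() writes the full row, resetting every column that was not supplied. That is presumably why this PR copies all OCRDocument fields through the DataFrame instead of only id, image_url, and ocr_prediction_json. A lighter alternative sketch (not part of this PR) that updates just the prediction column in place:

ocr_document = dataset_models.OCRDocument.objects.get(
    instance_id_id=dataset_instance_id, id=curr_id
)
ocr_document.ocr_prediction_json = ocr_predictions_json
ocr_document.save(update_fields=["ocr_prediction_json"])  # updates one column only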


@shared_task(bind=True)
def generate_asr_prediction_json(
self, dataset_instance_id, api_type, automate_missing_data_items
):
"""Function to generate ASR prediction data and to save to the same data item.
Args:
dataset_instance_id (int): ID of the dataset instance.
api_type (str): Type of API to be used for translation. (default: dhruva_asr)
Example - [dhruva_asr, indic-trans, google, indic-trans-v2, azure, blank]
automate_missing_data_items (bool): "Boolean to translate only missing data items"
"""
# Fetching the data items for the given dataset instance.
success_count, total_count = 0, 0
try:
asr_data_items = dataset_models.SpeechConversation.objects.filter(
instance_id=dataset_instance_id
).values_list(
"id",
"metadata_json",
"draft_data_json",
"domain",
"scenario",
"speaker_count",
"speakers_json",
"language",
"transcribed_json",
"machine_transcribed_json",
"audio_url",
"audio_duration",
"reference_raw_transcript",
"prediction_json",
"parent_data",
)
except Exception as e:
asr_data_items = []

# Convert the fetched data items to a pandas DataFrame.
asr_data_items_df = pd.DataFrame(
asr_data_items,
columns=[
"id",
"metadata_json",
"draft_data_json",
"domain",
"scenario",
"speaker_count",
"speakers_json",
"language",
"transcribed_json",
"machine_transcribed_json",
"audio_url",
"audio_duration",
"reference_raw_transcript",
"prediction_json",
"parent_data",
],
)

# Check if the dataframe is empty
if asr_data_items_df.shape[0] == 0:
raise Exception("The ASR data is empty.")

required_columns = {
"id",
"metadata_json",
"draft_data_json",
"domain",
"scenario",
"speaker_count",
"speakers_json",
"language",
"transcribed_json",
"machine_transcribed_json",
"audio_url",
"audio_duration",
"reference_raw_transcript",
"prediction_json",
"parent_data",
}
if not required_columns.issubset(asr_data_items_df.columns):
missing_columns = required_columns - set(asr_data_items_df.columns)
raise ValueError(
f"The following required columns are missing: {missing_columns}"
)

# Update the prediction_json field for each row in the DataFrame
for index, row in asr_data_items_df.iterrows():
curr_id = row["id"]
if "audio_url" not in row:
print(f"The ASR item with {curr_id} has missing audio_url.")
continue
audio_url = row["audio_url"]
language = row["language"]

# When automating only missing data items, skip rows that already
# have ASR predictions.
if automate_missing_data_items and row["prediction_json"]:
continue
total_count += 1
asr_predictions = get_batch_asr_predictions(
curr_id, audio_url, api_type, language
)
if asr_predictions["status"] == "Success":
success_count += 1
prediction_json = asr_predictions["output"]

# Update the prediction_json column and save the SpeechConversation data item with the new ASR predictions
try:
asr_data_items_df.at[index, "prediction_json"] = prediction_json
asr_document = dataset_models.SpeechConversation(
instance_id_id=dataset_instance_id,
id=curr_id,
metadata_json=row["metadata_json"],
draft_data_json=row["draft_data_json"],
domain=row["domain"],
scenario=row["scenario"],
speaker_count=row["speaker_count"],
speakers_json=row["speakers_json"],
language=row["language"],
transcribed_json=row["transcribed_json"],
machine_transcribed_json=row["machine_transcribed_json"],
audio_url=audio_url,
audio_duration=row["audio_duration"],
reference_raw_transcript=row["reference_raw_transcript"],
prediction_json=prediction_json,
parent_data=row["parent_data"],
)
with transaction.atomic():
asr_document.save()
except IntegrityError as e:
# Handling unique constraint violations or other data integrity issues
print(f"Error while saving dataset id- {curr_id}, IntegrityError: {e}")
except DataError as e:
# Handling data-related issues like incorrect data types, etc.
print(f"Error while saving dataset id- {curr_id}, DataError: {e}")
except Exception as e:
# Handling other unexpected exceptions.
print(f"Error while saving dataset id- {curr_id}, Error message: {e}")

else:
print(
f"The {api_type} API has not generated predictions for data item with id-{curr_id}"
)
print(f"{success_count} out of {total_count} populated")


@shared_task(bind=True)
def populate_draft_data_json(self, pk, fields_list):
try:
4 changes: 4 additions & 0 deletions backend/functions/urls.py
@@ -23,6 +23,10 @@
"schedule_ocr_prediction_json_population",
schedule_ocr_prediction_json_population,
),
path(
"schedule_asr_prediction_json_population",
schedule_asr_prediction_json_population,
),
]

# urlpatterns = format_suffix_patterns(urlpatterns)
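
Usage sketch: the new route mirrors the existing OCR scheduling endpoint. The view body and URL prefix are not shown in this diff, so the host, path prefix, payload keys, and auth scheme below are assumptions based on the task signature:

import requests

resp = requests.post(
    "http://localhost:8000/functions/schedule_asr_prediction_json_population",
    json={
        "dataset_instance_id": 42,
        "api_type": "dhruva_asr",
        "automate_missing_data_items": True,
    },
    headers={"Authorization": "Token <api-token>"},  # auth scheme assumed
)
print(resp.status_code, resp.text)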