-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpostprocessing.py
102 lines (76 loc) · 3.12 KB
/
postprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import json
import jiwer
from datasets import load_dataset
avg_acc = 0
def clean_json_outputs(input_file, output_file):
"""
Read JSON file, clean the output fields by removing special characters,
and save to a new JSON file.
Args:
input_file (str): Path to input JSON file
output_file (str): Path to save cleaned JSON file
"""
dataset_name = 'mozilla-foundation/common_voice_11_0'
global avg_acc
# Read the JSON file
with open(input_file, 'r') as f:
data = json.load(f)
# Clean each output field
for key in data:
# Get the output string
output = data[key]['output']
# Remove the special characters at the beginning
# Find the first regular character (after the UTF-8 control characters)
clean_start = 4 # Skip the first 4 characters (control sequence)
cleaned_output = output[clean_start:]
# Update the output field with cleaned text
data[key]['output'] = cleaned_output.strip()
acc = jiwer.wer(data[key]['actual_output'], data[key]['output'])
avg_acc = avg_acc + acc
data[key]['accuracy'] = acc
# Save the cleaned data to a new JSON file
with open(output_file, 'w') as f:
json.dump(data, f, ensure_ascii=False)
return data
def reverse_transform_filename_to_path(input_path):
"""
Transform a filename back to its original path structure by:
1. Removing the base directory (handlers/triton/audio/)
2. Splitting the remaining path by '-' up to the last component
3. Reconstructing the huggingface cache path
4. Converting .wav to .mp3
Args:
input_path (str): The input path with transformed filename
Returns:
tuple: (directory_path, filename)
"""
# Remove the base directory if it exists
if input_path.startswith('handlers/triton/audio/'):
filename = input_path.replace('handlers/triton/audio/', '')
else:
filename = input_path
# Split the filename by '-'
parts = filename.split('-')
# The hash is the first part
hash_value = parts[0]
# The second to last part is the directory
directory = parts[-2]
# The last part is the filename (need to change extension)
file_name = parts[-1].replace('.wav', '.mp3')
# Construct the full directory path
directory_path = f"/root/.cache/huggingface/datasets/downloads/extracted/{hash_value}/{directory}/{file_name}"
return directory_path
# Example usage
if __name__ == "__main__":
input_file = "/root/whisper_large_v2/output/2024-10-28T11:09:28Z.json"
output_file = "cleaned_output.json"
try:
cleaned_data = clean_json_outputs(input_file, output_file)
print(avg_acc / 1000)
print(f"\nProcessing complete. Cleaned data saved to {output_file}")
except FileNotFoundError:
print(f"Error: Could not find input file {input_file}")
except json.JSONDecodeError:
print("Error: Invalid JSON format in input file")
except Exception as e:
print(f"An error occurred: {str(e)}")