Files
2025-08-16 23:00:51 +05:30

96 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import difflib
import os
import string
import uuid

import pronouncing
import uvicorn
import whisper
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
# ASGI application object; uvicorn serves it when this module is run as a script.
app = FastAPI()

# CORS is configured fully open: every origin, method and header is accepted,
# and credentialed requests are allowed.
# NOTE(review): FastAPI's CORS documentation states that allow_origins,
# allow_methods and allow_headers must not be ["*"] when allow_credentials is
# True. Either list the real front-end origins explicitly or turn credentials
# off -- confirm which one the client actually needs.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory (relative to the working directory) that receives uploaded audio;
# it is created at import time if it does not exist yet.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)
'''
The Whisper model "small" is loaded by default.
Users can switch to another model size ("tiny", "base", "medium" or "large")
depending on their system's processing power.
'''
# Loaded once, when the module is imported, and reused for every transcription.
# NOTE(review): loading happens before the server starts accepting requests and
# presumably fetches the weights on first use -- confirm for deployment.
model = whisper.load_model("small")
def calculate_similarity(expected: str, actual: str) -> float:
    """Return how closely two whitespace-separated token sequences match.

    Both strings are split on whitespace and the resulting token lists are
    compared with ``difflib.SequenceMatcher``.

    Args:
        expected: Reference sequence, tokens separated by whitespace.
        actual: Observed sequence in the same format.

    Returns:
        The matcher's ratio in ``[0.0, 1.0]``: ``2 * M / T`` where ``M`` is the
        number of matched tokens and ``T`` the total number of tokens in both
        sequences. Two empty sequences score ``1.0``.
    """
    # autojunk must be disabled: with the default heuristic, any token that
    # repeats often in a sequence of 200+ items is ignored as "popular".
    # Phoneme sequences draw from a tiny alphabet, so on longer passages nearly
    # every token would be discarded and the score would collapse towards 0.
    matcher = difflib.SequenceMatcher(
        None, expected.split(), actual.split(), autojunk=False
    )
    return matcher.ratio()
def text_to_phonemes(text: str) -> str:
    """Convert text into a space-separated phoneme string.

    The text is lower-cased and split on whitespace. Punctuation surrounding
    each word is removed before the dictionary lookup, so tokens such as
    ``"hello,"`` or ``"world."`` resolve to the same entry as the bare word.
    Apostrophes inside a word (``"don't"``) are kept. Tokens made up only of
    punctuation carry no pronunciation and are skipped.

    Args:
        text: Free-form text, e.g. a prompt or a speech transcript.

    Returns:
        The first pronunciation that ``pronouncing.phones_for_word`` returns
        for each word, joined by single spaces. A word with no pronunciation
        contributes the placeholder ``"[UNK]"``.
    """
    phoneme_list = []
    for raw_word in text.lower().split():
        # Strip only the edges so in-word apostrophes and hyphens survive.
        word = raw_word.strip(string.punctuation)
        if not word:
            continue
        phones = pronouncing.phones_for_word(word)
        phoneme_list.append(phones[0] if phones else "[UNK]")
    return " ".join(phoneme_list)
def phoneme_diff(expected: str, actual: str):
    """Label each phoneme according to how the two sequences align.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``.

    Args:
        expected: Reference phonemes, separated by whitespace.
        actual: Observed phonemes in the same format.

    Returns:
        A list of ``(phoneme, status)`` tuples in alignment order, where
        ``status`` is one of:

        * ``"match"``    - expected phoneme that is present in ``actual``;
        * ``"mismatch"`` - expected phoneme that was replaced by something
          else (the replacing phonemes from ``actual`` are not reported);
        * ``"missing"``  - expected phoneme with no counterpart in ``actual``;
        * ``"extra"``    - phoneme from ``actual`` with no counterpart in
          ``expected``.
    """
    expected_list = expected.split()
    actual_list = actual.split()
    diff_result = []
    # autojunk is disabled because phonemes repeat heavily: the default
    # heuristic drops frequent tokens once a sequence reaches 200 items, which
    # would turn long, mostly-correct passages into one big "mismatch" run.
    matcher = difflib.SequenceMatcher(
        None, expected_list, actual_list, autojunk=False
    )
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == "equal":
            diff_result.extend((p, "match") for p in expected_list[i1:i2])
        elif opcode == "replace":
            diff_result.extend((p, "mismatch") for p in expected_list[i1:i2])
        elif opcode == "delete":
            diff_result.extend((p, "missing") for p in expected_list[i1:i2])
        elif opcode == "insert":
            diff_result.extend((p, "extra") for p in actual_list[j1:j2])
    return diff_result
@app.post("/upload")
async def upload_audio(
    file: UploadFile = File(...),
    expected_text: str = Form(...)
):
    """Transcribe an uploaded recording and compare it with the expected text.

    The audio is saved under ``UPLOAD_DIR``, transcribed with the module-level
    Whisper ``model``, and both the transcript and ``expected_text`` are
    converted to phonemes and compared.

    Args:
        file: Uploaded audio recording (multipart form field ``file``).
        expected_text: The text the speaker was supposed to say.

    Returns:
        A JSON-serialisable dict with the transcript, the expected text, both
        phoneme strings, the similarity score rounded to three decimals, and
        the per-phoneme diff.
    """
    # file.filename is chosen by the client, so it must never be used as a
    # path: "../x" or an absolute path would escape UPLOAD_DIR, and two
    # uploads with the same name would overwrite each other. Store the audio
    # under a server-generated name and keep only a plain alphanumeric
    # extension from the original.
    client_name = os.path.basename(file.filename or "")
    extension = os.path.splitext(client_name)[1]
    if not extension[1:].isalnum():
        extension = ""
    file_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4().hex}{extension}")

    # Read the body first so a failed read does not leave an empty file behind.
    audio_bytes = await file.read()
    with open(file_path, "wb") as f:
        f.write(audio_bytes)

    # NOTE: transcribe() is a synchronous call made inside an async handler,
    # so the event loop is blocked until the transcription finishes.
    result = model.transcribe(file_path)
    transcript = result["text"]

    expected_phonemes = text_to_phonemes(expected_text)
    actual_phonemes = text_to_phonemes(transcript)
    similarity = calculate_similarity(expected_phonemes, actual_phonemes)
    differences = phoneme_diff(expected_phonemes, actual_phonemes)

    return {
        "message": "Audio processed successfully",
        "transcript": transcript,
        "expected_text": expected_text,
        "expected_phonemes": expected_phonemes,
        "actual_phonemes": actual_phonemes,
        "similarity_score": round(similarity, 3),
        "phoneme_diff": differences
    }
# When run as a script, serve the app with uvicorn on port 8000.
# host="0.0.0.0" makes the server listen on every network interface.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)