mirror of
https://github.com/Manoj-HV30/PhonoCoach.git
synced 2026-05-16 19:35:26 +00:00
96 lines
2.8 KiB
Python
96 lines
2.8 KiB
Python
from fastapi import FastAPI, File, UploadFile, Form
|
||
import uvicorn
|
||
import os
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
import whisper
|
||
import difflib
|
||
import pronouncing
|
||
|
||
# FastAPI application. The CORS middleware is configured fully open: every
# origin, method and header is accepted and credentials are allowed.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive -- confirm this is intended outside local development.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory that receives uploaded audio files; created at import time when
# it does not exist yet.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# The Whisper "small" model is loaded by default. Change the name to another
# model ("tiny", "base", "medium", "large") to suit the processing power of
# the machine running the service.
model = whisper.load_model("small")
|
||
|
||
def calculate_similarity(expected: str, actual: str) -> float:
    """Return the difflib similarity ratio of two whitespace-separated strings.

    Both arguments are split on whitespace and the resulting token lists are
    compared with ``difflib.SequenceMatcher`` (no junk function).

    Args:
        expected: Reference tokens, separated by whitespace.
        actual: Observed tokens, separated by whitespace.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the two token lists.
    """
    reference_tokens = expected.split()
    observed_tokens = actual.split()
    comparison = difflib.SequenceMatcher(a=reference_tokens, b=observed_tokens)
    return comparison.ratio()
|
||
|
||
|
||
def text_to_phonemes(text: str) -> str:
    """Convert text into a space-separated phoneme string.

    The text is lower-cased and split on whitespace. Each word has leading and
    trailing punctuation removed before it is looked up with
    ``pronouncing.phones_for_word``; without that step a word such as
    ``"hello,"`` or ``"world."`` is not found and is reported as unknown.
    Punctuation inside a word (for example the apostrophe in ``"don't"``) is
    kept.

    Args:
        text: Free-form text, e.g. a prompt sentence or a transcript.

    Returns:
        The first pronunciation returned for every word, joined with single
        spaces. Words with no pronunciation contribute ``"[UNK]"``; tokens that
        consist only of punctuation contribute nothing.
    """
    # Function-scope import keeps this block self-contained.
    import string

    # ASCII punctuation plus the typographic quotes, dashes and ellipsis that
    # commonly appear in transcribed text.
    edge_punctuation = string.punctuation + "“”‘’…—–"

    phoneme_list = []
    for raw_word in text.lower().split():
        word = raw_word.strip(edge_punctuation)
        if not word:
            # The token was punctuation only; it has no pronunciation.
            continue
        phones = pronouncing.phones_for_word(word)
        phoneme_list.append(phones[0] if phones else "[UNK]")
    return " ".join(phoneme_list)
|
||
|
||
|
||
def phoneme_diff(expected: str, actual: str):
    """Label each token according to how ``expected`` aligns with ``actual``.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. Every opcode of the alignment contributes
    ``(token, label)`` pairs:

    * ``"equal"``   -> expected tokens labelled ``"match"``
    * ``"replace"`` -> expected tokens labelled ``"mismatch"``
    * ``"delete"``  -> expected tokens labelled ``"missing"``
    * ``"insert"``  -> actual tokens labelled ``"extra"``

    Args:
        expected: Whitespace-separated reference tokens.
        actual: Whitespace-separated observed tokens.

    Returns:
        A list of ``(token, label)`` tuples in alignment order.
    """
    reference = expected.split()
    observed = actual.split()

    # Opcodes whose tokens are taken from the reference side.
    reference_labels = {
        "equal": "match",
        "replace": "mismatch",
        "delete": "missing",
    }

    labelled = []
    opcodes = difflib.SequenceMatcher(None, reference, observed).get_opcodes()
    for tag, ref_start, ref_end, obs_start, obs_end in opcodes:
        if tag == "insert":
            # Insertions exist only on the observed side.
            labelled += [(tok, "extra") for tok in observed[obs_start:obs_end]]
        elif tag in reference_labels:
            label = reference_labels[tag]
            labelled += [(tok, label) for tok in reference[ref_start:ref_end]]
    return labelled
|
||
|
||
def _safe_upload_path(filename) -> str:
    """Map a client-supplied filename to a path directly inside UPLOAD_DIR."""
    # The filename comes from the request and is untrusted. Keep only its last
    # path component so values such as "../../x" or "/etc/x" cannot escape the
    # upload directory. Backslashes are normalised first so Windows-style
    # prefixes are stripped as well.
    name = os.path.basename((filename or "").replace("\\", "/"))
    if name in ("", ".", ".."):
        # Missing name, or one that would resolve to a directory.
        name = "upload"
    return os.path.join(UPLOAD_DIR, name)


@app.post("/upload")
async def upload_audio(
    file: UploadFile = File(...),
    expected_text: str = Form(...),
):
    """Transcribe an uploaded recording and compare it with the expected text.

    The upload is written into ``UPLOAD_DIR``, transcribed with the loaded
    Whisper model, and both the transcript and ``expected_text`` are converted
    to phonemes for comparison.

    Args:
        file: The uploaded audio file (multipart form field ``file``).
        expected_text: The text the speaker was asked to read.

    Returns:
        A dict with the transcript, the expected text, both phoneme strings,
        the similarity score rounded to three decimals, and the per-phoneme
        diff.
    """
    file_path = _safe_upload_path(file.filename)
    with open(file_path, "wb") as f:
        f.write(await file.read())

    # NOTE(review): transcribe() is called synchronously inside this async
    # handler -- confirm that blocking for the duration is acceptable.
    result = model.transcribe(file_path)
    transcript = result["text"]

    expected_phonemes = text_to_phonemes(expected_text)
    actual_phonemes = text_to_phonemes(transcript)

    similarity = calculate_similarity(expected_phonemes, actual_phonemes)
    differences = phoneme_diff(expected_phonemes, actual_phonemes)

    return {
        "message": "Audio processed successfully",
        "transcript": transcript,
        "expected_text": expected_text,
        "expected_phonemes": expected_phonemes,
        "actual_phonemes": actual_phonemes,
        "similarity_score": round(similarity, 3),
        "phoneme_diff": differences,
    }
|
||
|
||
# Script entry point: when this module is executed directly, serve the app
# with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )
|