Initial commit of PhonoCoach project

2026-05-16 19:35:26 +00:00 · 2025-08-16 03:37:24 +05:30
parent f14485a833
commit 5326b855dd
8 changed files with 436 additions and 0 deletions
@@ -0,0 +1,39 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # Python virtual environments and local environment files (.env, direnv)
 venv/
 .env
 .envrc
 # Distribution / packaging
 build/
 dist/
 *.egg-info/
 *.egg
 *.manifest
 *.spec
 # Logs
 *.log
 # IDE / Editor files
 .vscode/
 .idea/
 *.swp
 *.swo
 # Jupyter Notebook
 .ipynb_checkpoints
 # OS-specific
 .DS_Store
 Thumbs.db
 # Chrome extension build outputs
 *.zip
 # Other temporary files
 *.tmp
@@ -0,0 +1,91 @@
 from fastapi import FastAPI, File, UploadFile, Form
 import uvicorn
 import os
 from fastapi.middleware.cors import CORSMiddleware
 import whisper
 import difflib
 import pronouncing
 # FastAPI application exposing the pronunciation-scoring endpoint.
 app = FastAPI()
 # The browser extension calls this API cross-origin, so every origin is allowed.
 # Credentials stay disabled: FastAPI's CORS documentation states that wildcard
 # origins/methods/headers must not be combined with allow_credentials=True, and
 # the extension's upload request does not rely on cookies or HTTP auth.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Directory where uploaded recordings are written before transcription.
 UPLOAD_DIR = "uploads"
 os.makedirs(UPLOAD_DIR, exist_ok=True)
 # Load the Whisper "small" model once at import time; the upload handler reuses it.
 model = whisper.load_model("small")
 def calculate_similarity(expected: str, actual: str) -> float:
    """Return the difflib similarity ratio of two whitespace-tokenised strings.

    Both inputs are split on whitespace and compared token by token. Identical
    token sequences (including two empty strings) give 1.0; sequences sharing
    no tokens give 0.0.
    """
    expected_tokens = expected.split()
    actual_tokens = actual.split()
    return difflib.SequenceMatcher(None, expected_tokens, actual_tokens).ratio()
 def text_to_phonemes(text: str) -> str:
    """Convert text to a space-separated string of phonemes.

    Each word is lower-cased, stripped of surrounding punctuation and looked up
    with ``pronouncing.phones_for_word``; the first listed pronunciation is
    used. Words without a known pronunciation contribute the marker ``[UNK]``.

    Args:
        text: Free-form text, e.g. a page selection or an ASR transcript.

    Returns:
        The phoneme strings of all words joined by single spaces.
    """
    # Transcripts and page text carry punctuation ("world." / "hello,") that
    # would otherwise make the dictionary lookup fail and yield "[UNK]".
    edge_punctuation = ".,!?;:\"'()[]{}<>-\u2013\u2014\u2026\u201c\u201d\u2018"
    # Typographic apostrophes ("don\u2019t") are normalised so contractions match.
    normalised = text.lower().replace("\u2019", "'")
    phoneme_list = []
    for raw_word in normalised.split():
        word = raw_word.strip(edge_punctuation)
        if not word:
            # Token was pure punctuation (e.g. a lone dash): nothing to pronounce.
            continue
        phones = pronouncing.phones_for_word(word)
        # Mark words with no known pronunciation instead of dropping them.
        phoneme_list.append(phones[0] if phones else "[UNK]")
    return " ".join(phoneme_list)
 def phoneme_diff(expected: str, actual: str):
    """Label each phoneme by how the expected and actual sequences align.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. The result is a list of ``(phoneme, label)``
    tuples where the label is ``"match"``, ``"mismatch"``, ``"missing"``
    (only in ``expected``) or ``"extra"`` (only in ``actual``). For replaced
    spans only the expected phonemes are reported.
    """
    wanted = expected.split()
    heard = actual.split()
    # Opcodes whose reported phonemes come from the expected sequence.
    expected_side_labels = {
        "equal": "match",
        "replace": "mismatch",
        "delete": "missing",
    }
    labelled = []
    opcodes = difflib.SequenceMatcher(None, wanted, heard).get_opcodes()
    for tag, exp_start, exp_end, act_start, act_end in opcodes:
        if tag == "insert":
            for phoneme in heard[act_start:act_end]:
                labelled.append((phoneme, "extra"))
        elif tag in expected_side_labels:
            label = expected_side_labels[tag]
            for phoneme in wanted[exp_start:exp_end]:
                labelled.append((phoneme, label))
    return labelled
@app.post("/upload")
async def upload_audio(
    file: UploadFile = File(...),
    expected_text: str = Form(...)
):
    """Transcribe an uploaded recording and score it against the expected text.

    Args:
        file: The uploaded audio recording.
        expected_text: The text the speaker was supposed to read.

    Returns:
        A dict with the transcript, the expected and actual phoneme strings,
        the similarity score rounded to three decimals, and the per-phoneme diff.
    """
    import uuid  # local import: only this handler needs it

    # Never use the client-supplied filename as a path: it is untrusted and
    # could contain directory components. Keep only a plain alphanumeric
    # extension and generate the rest, which also stops concurrent uploads
    # with the same name from overwriting each other.
    extension = os.path.splitext(os.path.basename(file.filename or ""))[1]
    if not extension[1:].isalnum():
        extension = ""
    file_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4().hex}{extension}")
    try:
        with open(file_path, "wb") as f:
            f.write(await file.read())
        # NOTE(review): transcribe() is a synchronous call inside an async
        # handler; consider running it in a worker thread if requests overlap.
        result = model.transcribe(file_path)
    finally:
        # The recording is only needed for transcription; remove it so the
        # uniquely named files do not accumulate on disk.
        if os.path.exists(file_path):
            os.remove(file_path)
    transcript = result["text"]
    expected_phonemes = text_to_phonemes(expected_text)
    actual_phonemes = text_to_phonemes(transcript)
    similarity = calculate_similarity(expected_phonemes, actual_phonemes)
    differences = phoneme_diff(expected_phonemes, actual_phonemes)
    return {
        "message": "Audio processed successfully",
        "transcript": transcript,
        "expected_text": expected_text,
        "expected_phonemes": expected_phonemes,
        "actual_phonemes": actual_phonemes,
        "similarity_score": round(similarity, 3),
        "phoneme_diff": differences
    }
 if __name__ == "__main__":
    # Executed directly: serve the app with uvicorn on all interfaces, port 8000.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )
@@ -0,0 +1,8 @@
 // Most recent non-empty text selection seen on this page.
 let selectedText = "";
 // After each mouse release, persist the current selection (if any) so the
 // popup can read it from extension storage.
 document.addEventListener("mouseup", function rememberSelection() {
  const picked = window.getSelection().toString().trim();
  if (!picked) {
    return;
  }
  selectedText = picked;
  chrome.storage.local.set({ selectedText });
 });
@@ -0,0 +1,17 @@
 {
  "manifest_version": 3,
  "name": "PhonoCoach",
  "version": "1.0",
  "description": "Lets you practise pronunciation on any webpage you're on",
  "permissions": ["storage", "activeTab", "scripting"],
  "host_permissions": ["http://localhost:8000/*", "http://127.0.0.1:8000/*"],
  "action": {
    "default_popup": "popup.html"
  },
  "content_scripts": [
    {
      "matches": ["<all_urls>"],
      "js": ["content.js"]
    }
  ]
 }
@@ -0,0 +1,120 @@
 /* Popup page: fixed 350px width on a light gradient background. */
 body {
    font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
    margin: 10px;
    width: 350px;
    background: linear-gradient(135deg, #f9f9f9, #e0f7fa);
    color: #333;
 }
 /* Centered section heading. */
 h3 {
    text-align: center;
    color: #2c3e50;
    margin-bottom: 12px;
 }
 /* Scrollable box showing the text the user is expected to read aloud. */
 #displayText {
    display: block;
    width: 100%;
    /* Keep padding and border inside the 100% width so the box does not
       overflow the fixed-width popup body. */
    box-sizing: border-box;
    white-space: normal;
    word-wrap: break-word;
    border: 1px solid #ccc;
    padding: 10px;
    background-color: #ffffffcc;
    min-height: 50px;
    max-height: 150px;
    overflow-y: auto;
    margin-bottom: 10px;
    font-size: 1.4em;
    border-radius: 8px;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1);
 }
 /* Full-width record/stop button. */
 #recordBtn {
    display: block;
    width: 100%;
    padding: 10px;
    background: #03a9f4;
    color: white;
    border: none;
    border-radius: 8px;
    font-size: 1.2em;
    cursor: pointer;
    transition: background 0.3s ease;
 }
 /* Darker blue while hovered; animated by the transition above. */
 #recordBtn:hover {
    background: #0288d1;
 }
 /* Status line under the button (recording / uploading / result messages). */
 #status {
    text-align: center;
    font-weight: 500;
    margin-top: 8px;
    color: #555;
 }
 /* Accuracy percentage shown after a successful upload. */
 #similarityScore {
    font-weight: bold;
    color: #2a9d8f;
    margin-top: 10px;
    text-align: center;
    font-size: 1.3em;
 }
 /* Wrapping, scrollable flex container for the per-phoneme result chips. */
 #phonemeDiff {
    margin-top: 10px;
    display: flex;
    flex-wrap: wrap;
    gap: 8px;
    max-height: 150px;
    overflow-y: auto;
 }
 /* Individual phoneme chip.
    NOTE(review): popup.js appends plain <span> elements without this class,
    so these rules may currently match nothing; confirm whether the class
    should be applied there. */
 .phoneme {
    padding: 10px 16px;
    border-radius: 8px;
    font-size: 1.6em; /* bigger font size for phonemes */
    font-weight: 700;
    transition: transform 0.2s;
 }
 /* Slight zoom on hover. */
 .phoneme:hover {
    transform: scale(1.1);
    cursor: pointer;
 }
 /* Short coaching message shown with the score. */
 #pronunciationTips {
    margin-top: 10px;
    font-style: italic;
    color: #555;
    text-align: center;
 }
 /* Card holding a heading and a bulleted list.
    NOTE(review): popup.html has no #generalTips element; the legend with an
    <h4> and <ul> is rendered into #colorLegend. Confirm whether this selector
    is stale. */
 #generalTips {
    margin-top: 12px;
    padding: 10px;
    background: #f0f4f8;
    border-radius: 8px;
    box-shadow: 0 1px 4px rgba(0, 0, 0, 0.1);
    color: #333;
 }
 /* Card heading. */
 #generalTips h4 {
    margin-top: 0;
    margin-bottom: 8px;
    font-weight: 600;
    color: #2c3e50;
    font-size: 1.1em;
 }
 /* Bulleted list inside the card. */
 #generalTips ul {
    padding-left: 20px;
    margin: 0;
 }
 #generalTips ul li {
    margin-bottom: 6px;
    line-height: 1.4;
    font-size: 1.1em;
    list-style-type: disc;
 }
@@ -0,0 +1,26 @@
 <!doctype html>
 <html lang="en">
    <head>
        <meta charset="UTF-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <title>PhonoCoach</title>
        <link rel="stylesheet" href="popup.css" />
    </head>
    <body>
        <h3>Selected Text</h3>
        <!-- Filled by popup.js from extension storage; the placeholder matches
             the fallback text popup.js writes when nothing is stored. -->
        <div id="displayText">No text selected</div>
        <button id="recordBtn">🎙 Record</button>
        <!-- Recording / upload status messages -->
        <p id="status"></p>
        <!-- Results, populated by popup.js after a successful upload -->
        <div id="similarityScore"></div>
        <p id="pronunciationTips"></p>
        <div id="phonemeDiff"></div>
        <!-- Color legend (shown after accuracy) -->
        <div id="colorLegend"></div>
        <script src="popup.js"></script>
    </body>
 </html>
@@ -0,0 +1,135 @@
 // Show the selection stored under "selectedText", or a placeholder when
 // nothing has been stored.
 chrome.storage.local.get("selectedText", ({ selectedText }) => {
  const displayBox = document.getElementById("displayText");
  displayBox.textContent = selectedText || "No text selected";
 });
 // Recording state used by the record button's click handler.
 let mediaRecorder;
 let audioChunks = [];
 let isRecording = false;
 // Popup elements updated while recording and uploading.
 const recordBtn = document.getElementById("recordBtn");
 const status = document.getElementById("status");
 const similarityScoreElem = document.getElementById("similarityScore");
 /**
  * Check, and if necessary request, microphone permission.
  *
  * Resolves to true when the permission is already granted or the user accepts
  * the prompt, and to false when it is denied or any API call fails.
  */
 async function requestMicPermission() {
  try {
    const permissionStatus = await navigator.permissions.query({
      name: "microphone",
    });
    switch (permissionStatus.state) {
      case "granted":
        return true;
      case "prompt": {
        // Opening a stream triggers the browser prompt. The stream is only a
        // probe, so stop its tracks right away; otherwise the microphone
        // stays captured after this function returns.
        const probeStream = await navigator.mediaDevices.getUserMedia({
          audio: true,
        });
        probeStream.getTracks().forEach((track) => track.stop());
        return true;
      }
      default:
        return false;
    }
  } catch (err) {
    // Covers both an unsupported Permissions API and a rejected prompt.
    console.error("Permission API error:", err);
    return false;
  }
 }
 /**
  * Render a scoring result returned by the /upload endpoint: the accuracy
  * percentage, a coaching tip, the color legend and the per-phoneme diff.
  */
 function renderFeedback(result) {
  const score = result.similarity_score;
  similarityScoreElem.textContent = `Pronunciation Accuracy: ${(score * 100).toFixed(1)}%`;
  const tipsElem = document.getElementById("pronunciationTips");
  if (score > 0.9) {
    tipsElem.textContent =
      "Great job! Keep practicing to maintain your clear pronunciation.";
  } else if (score > 0.7) {
    tipsElem.textContent =
      "Good effort! Try slowing down and emphasizing each word.";
  } else {
    tipsElem.textContent =
      "Keep practicing! Focus on vowel sounds and word stress.";
  }
  // Static markup only, so assigning innerHTML is safe here.
  const legendElem = document.getElementById("colorLegend");
  legendElem.innerHTML = `
                            <h4>Color Coding:</h4>
                            <ul>
                                <li style="color: green;">Green: Correct phoneme</li>
                                <li style="color: red;">Red: Incorrect phoneme</li>
                                <li style="color: orange;">Orange: Missing phoneme</li>
                                <li style="color: blue;">Blue: Extra phoneme</li>
                                <li style="color: gray;">[UNK]: Unknown/Unrecognized phoneme</li>
                            </ul>
                                <h3>Note: This extension leverages OpenAI’s Whisper ASR model for automatic speech recognition, enabling accurate transcription and pronunciation analysis.</h3>`;
  const phonemeDiffElem = document.getElementById("phonemeDiff");
  phonemeDiffElem.innerHTML = "";
  const colorByVerdict = new Map([
    ["match", "green"],
    ["mismatch", "red"],
    ["missing", "orange"],
    ["extra", "blue"],
  ]);
  // "verdict" rather than "status" so the status element is not shadowed.
  result.phoneme_diff.forEach(([phoneme, verdict]) => {
    const span = document.createElement("span");
    // Server-provided text goes through textContent, never innerHTML.
    span.textContent = phoneme + " ";
    const color = colorByVerdict.get(verdict);
    if (color) span.style.color = color;
    phonemeDiffElem.appendChild(span);
  });
 }
 // Toggle recording: the first click starts capturing the microphone, the
 // second click stops it, which uploads the audio for scoring.
 recordBtn.addEventListener("click", async () => {
  if (isRecording) {
    mediaRecorder.stop();
    isRecording = false;
    recordBtn.textContent = "🎙 Record";
    return;
  }
  // Read the expected text from storage instead of the display element, so
  // the "no selection" placeholder is never scored as if it were the target.
  const { selectedText } = await chrome.storage.local.get("selectedText");
  const expectedText = (selectedText || "").trim();
  if (!expectedText) {
    status.textContent = "⚠️ Select some text on the page first.";
    return;
  }
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Local references keep this recording's callbacks independent of the
    // shared variables, which a later click may reassign.
    const recorder = new MediaRecorder(stream);
    const chunks = [];
    mediaRecorder = recorder;
    audioChunks = chunks;
    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        chunks.push(event.data);
      }
    };
    recorder.onstop = async () => {
      // Release the microphone as soon as recording ends.
      stream.getTracks().forEach((track) => track.stop());
      recordBtn.disabled = true;
      // MediaRecorder chooses its own container (typically WebM/Opus, not
      // WAV), so label the blob and file with the format actually recorded.
      const mimeType = recorder.mimeType || "audio/webm";
      const extension = mimeType.split(";")[0].split("/")[1] || "webm";
      const audioBlob = new Blob(chunks, { type: mimeType });
      const formData = new FormData();
      formData.append("file", audioBlob, `recording.${extension}`);
      formData.append("expected_text", expectedText);
      status.textContent = "Uploading...";
      similarityScoreElem.textContent = "";
      try {
        const response = await fetch("http://127.0.0.1:8000/upload", {
          method: "POST",
          body: formData,
        });
        if (response.ok) {
          renderFeedback(await response.json());
          status.textContent = "✅ Uploaded successfully!";
        } else {
          status.textContent = "❌ Upload failed.";
        }
      } catch (err) {
        console.error(err);
        status.textContent = "⚠️ Error uploading.";
      } finally {
        recordBtn.disabled = false;
      }
    };
    recorder.start();
    isRecording = true;
    recordBtn.textContent = "⏹ Stop";
    status.textContent = "🎙 Recording...";
    similarityScoreElem.textContent = "";
  } catch (err) {
    console.error(err);
    // If the stream was opened but the recorder failed, do not leave the
    // microphone captured.
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
    }
    status.textContent = "⚠️ Microphone access denied.";
  }
 });