Skip to main content
This recipe shows the moving pieces for browser microphone transcription. In production, combine it with Browser WebSockets so the browser uses a short-lived token instead of an API key.

Flow

  1. Ask the user for microphone permission.
  2. Capture audio with the Web Audio API.
  3. Convert float samples to 16-bit mono PCM.
  4. Send base64 audio chunks over wss://api.gradium.ai/api/speech/asr.
  5. Render text messages and use step messages for turn-taking.

Browser Client

/**
 * Requests a Gradium token from this app's own `/gradium-token` endpoint.
 *
 * @returns {Promise<any>} The parsed JSON body of the response. Callers in
 *   this recipe read a `token` property from it.
 * @throws {Error} When the endpoint answers with a non-OK HTTP status.
 */
async function getGradiumToken() {
  const reply = await fetch("/gradium-token", {method: "POST"});
  if (reply.ok) {
    return reply.json();
  }
  throw new Error("Could not get Gradium token");
}

/**
 * Converts float audio samples to 16-bit signed PCM and base64-encodes the
 * resulting bytes.
 *
 * Each sample is clamped to [-1, 1]. Negative samples are scaled by 32768 and
 * non-negative ones by 32767 so both ends of the int16 range are reachable;
 * the fractional part is dropped when the value is stored as an int16.
 * Bytes are emitted in the platform's native byte order.
 *
 * @param {Float32Array|number[]} float32 - Samples, nominally in [-1, 1].
 * @returns {string} Base64 text of the PCM bytes ("" for empty input).
 */
function pcm16Base64(float32) {
  const pcm = Int16Array.from(float32, (value) => {
    const clamped = Math.min(1, Math.max(-1, value));
    return clamped * (clamped < 0 ? 32768 : 32767);
  });

  // btoa() takes a "binary string": one character per byte.
  const chars = Array.from(new Uint8Array(pcm.buffer), (byte) => String.fromCharCode(byte));
  return btoa(chars.join(""));
}

/**
 * Placeholder transcript sink: logs the recognized text to the console.
 * Replace the body with real UI rendering.
 *
 * @param {string} text - Text taken from a "text" message.
 */
function appendTranscript(text) {
  const label = "transcript";
  console.log(label, text);
}

/**
 * Placeholder turn-taking hook: logs that the speaker may have finished.
 * Replace the body with real turn-end handling.
 */
function markPossibleTurnEnd() {
  const note = "possible turn end";
  console.log(note);
}

/**
 * Streams microphone audio to the Gradium speech-to-text WebSocket.
 *
 * Steps: fetch a token, ask for microphone access, capture audio through a
 * 24 kHz AudioContext + ScriptProcessorNode, and send every captured buffer
 * as a base64 PCM "audio" message. Incoming "text" messages are passed to
 * `appendTranscript`; incoming "step" messages call `markPossibleTurnEnd`
 * when the last `vad` entry reports `inactivity_prob > 0.5`.
 *
 * @returns {Promise<{stop: () => void}>} Handle whose `stop()` signals end of
 *   stream (when the socket is open) and releases the microphone and audio
 *   graph. `stop()` is safe to call at any time and more than once.
 * @throws Whatever `getGradiumToken`, `getUserMedia`, or construction of the
 *   audio graph / WebSocket throws. The microphone is released before the
 *   error propagates.
 */
async function startTranscription() {
  const {token} = await getGradiumToken();
  const url = new URL("wss://api.gradium.ai/api/speech/asr");
  url.searchParams.set("token", token);

  const stream = await navigator.mediaDevices.getUserMedia({audio: true});

  let audioContext;
  let source;
  let processor;
  let ws;
  let released = false;

  // Tears down everything that keeps the microphone or audio thread busy.
  // Idempotent because it is reachable from stop(), from the socket closing,
  // and from the setup-failure path below.
  const releaseAudio = () => {
    if (released) return;
    released = true;
    if (processor) {
      processor.onaudioprocess = null;
      processor.disconnect();
    }
    source?.disconnect();
    stream.getTracks().forEach((track) => track.stop());
    if (audioContext && audioContext.state !== "closed") {
      // Browsers cap the number of live AudioContexts, so always close it.
      audioContext.close().catch((err) => {
        console.warn("Could not close AudioContext", err);
      });
    }
  };

  try {
    audioContext = new AudioContext({sampleRate: 24000});
    source = audioContext.createMediaStreamSource(stream);
    processor = audioContext.createScriptProcessor(2048, 1, 1);
    ws = new WebSocket(url);
  } catch (err) {
    // The microphone is already live at this point; do not leave it on.
    releaseAudio();
    throw err;
  }

  ws.addEventListener("open", () => {
    ws.send(JSON.stringify({
      type: "setup",
      model_name: "default",
      input_format: "pcm",
      json_config: {language: "en"},
    }));
  });

  ws.addEventListener("message", (event) => {
    const msg = JSON.parse(event.data);
    if (msg.type === "text") {
      appendTranscript(msg.text);
    } else if (msg.type === "step") {
      // Tolerate step messages that carry no (or an empty) vad array.
      const horizon = msg.vad?.[msg.vad.length - 1];
      if (horizon?.inactivity_prob > 0.5) markPossibleTurnEnd();
    }
  });

  // Once the socket is gone nothing can be sent, so stop capturing.
  ws.addEventListener("close", releaseAudio);

  processor.onaudioprocess = (event) => {
    if (ws.readyState !== WebSocket.OPEN) return;
    const samples = event.inputBuffer.getChannelData(0);
    ws.send(JSON.stringify({
      type: "audio",
      audio: pcm16Base64(samples),
    }));
  };

  source.connect(processor);
  // The processor is connected to the destination as well as the source.
  processor.connect(audioContext.destination);

  let stopped = false;
  return {
    stop() {
      if (stopped) return;
      stopped = true;
      if (ws.readyState === WebSocket.OPEN) {
        // The socket is left open here rather than closed by this code.
        ws.send(JSON.stringify({type: "end_of_stream"}));
      } else if (ws.readyState === WebSocket.CONNECTING) {
        // send() would throw in this state; abort the handshake instead.
        ws.close();
      }
      releaseAudio();
    },
  };
}
ScriptProcessorNode is easy to read but deprecated. For production, prefer an AudioWorklet so audio capture stays reliable under UI load.

Audio Format Notes

  • input_format: "pcm" means 24 kHz, 16-bit signed mono PCM.
  • If your browser audio graph runs at 48 kHz, either resample to 24 kHz or send input_format: "pcm_48000".
  • Send small chunks, around 80-100 ms, to keep latency low. The 2048-frame buffer used above is about 85 ms at 24 kHz.
  • Do not send compressed browser formats unless you explicitly set a supported Gradium input format such as opus.

Speech-to-Text WebSocket

Message types, VAD, flushing, and direct WebSocket examples.

Turn-taking with VAD

Use semantic VAD to decide when a speaker has finished.