Transcription
Speech-to-text (ASR) transcription via whisper.cpp, i.e., using a speech recognition model to generate text transcriptions from audio input.
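
At a glance, the flow is: load a model, transcribe, unload. A minimal sketch, assuming the configuration fields shown under Usage below are optional:

import {
  loadModel,
  unloadModel,
  transcribe,
  WHISPER_TINY,
  VAD_SILERO_5_1_2,
} from "@qvac/sdk";

const modelId = await loadModel({
  modelSrc: WHISPER_TINY,
  modelType: "whisper",
  vadModelSrc: VAD_SILERO_5_1_2,
});
const text = await transcribe({
  modelId,
  audioChunk: "examples/audio/sample-16khz.wav",
});
await unloadModel({ modelId });
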
Usage
import {
  loadModel,
  unloadModel,
  transcribe,
  WHISPER_TINY,
  VAD_SILERO_5_1_2,
} from "@qvac/sdk";

try {
  console.log("🎤 Starting Whisper transcription with prompt example...");

  // Load the Whisper model with VAD
  console.log("📥 Loading Whisper model...");
  const modelId = await loadModel({
    modelSrc: WHISPER_TINY,
    modelType: "whisper",
    vadModelSrc: VAD_SILERO_5_1_2,
    modelConfig: {
      audio_format: "f32le",
      // Sampling strategy
      strategy: "greedy",
      n_threads: 4,
      // Transcription options
      language: "en",
      translate: false,
      no_timestamps: false,
      single_segment: false,
      print_timestamps: true,
      token_timestamps: true,
      // Quality settings
      temperature: 0.0,
      suppress_blank: true,
      suppress_nst: true,
      // Advanced tuning
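      // (Assumption: these map to whisper.cpp's decode fallbacks; a segment
      // whose average token entropy exceeds entropy_thold, or whose average
      // log-probability falls below logprob_thold, is retried at a higher
      // temperature.)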
      entropy_thold: 2.4,
      logprob_thold: -1.0,
      // VAD configuration
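      // (Assumption: these map to whisper.cpp's Silero VAD parameters:
      // threshold is the speech-probability cutoff in 0..1, the *_ms
      // fields are milliseconds, max_speech_duration_s is seconds, and
      // samples_overlap is the overlap in seconds between segments.)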
      vad_params: {
        threshold: 0.35,
        min_speech_duration_ms: 200,
        min_silence_duration_ms: 150,
        max_speech_duration_s: 30.0,
        speech_pad_ms: 600,
        samples_overlap: 0.3,
      },
      // Context parameters for GPU
      contextParams: {
        use_gpu: true,
        flash_attn: true,
        gpu_device: 0,
      },
    },
    onProgress: (progress) => {
      console.log(progress);
    },
  });
  console.log(`✅ Whisper model loaded with ID: ${modelId}`);

  // Perform transcription
  console.log("🎧 Transcribing audio...");
  const text = await transcribe({
    modelId,
    audioChunk: "examples/audio/sample-16khz.wav",
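    // The prompt biases decoding toward its style and vocabulary
    // (e.g. punctuation, domain terms); it does not appear in the output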
    prompt:
      "This is a test recording with clear speech and proper punctuation.",
  });
  console.log("📝 Transcription result:");
  console.log(text);

  // Unload the model when done
  console.log("🧹 Unloading Whisper model...");
  await unloadModel({ modelId });
  console.log("✅ Whisper model unloaded successfully");
  process.exit(0);
} catch (error) {
  console.error("❌ Error:", error);
  process.exit(1);
}
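
To transcribe several files, load the model once and reuse the same modelId across transcribe calls, releasing it in a finally block so the model is unloaded even if a call fails. A minimal sketch: the transcribeAll helper and the reduced modelConfig are illustrative, and it assumes transcribe may be called repeatedly with one modelId:

import {
  loadModel,
  unloadModel,
  transcribe,
  WHISPER_TINY,
  VAD_SILERO_5_1_2,
} from "@qvac/sdk";

// Illustrative helper: batch transcription with a single loaded model
async function transcribeAll(files: string[]): Promise<string[]> {
  const modelId = await loadModel({
    modelSrc: WHISPER_TINY,
    modelType: "whisper",
    vadModelSrc: VAD_SILERO_5_1_2,
    modelConfig: { audio_format: "f32le", language: "en" },
  });
  try {
    const results: string[] = [];
    for (const file of files) {
      // Reuse the same modelId for every call (assumed to be supported)
      results.push(await transcribe({ modelId, audioChunk: file }));
    }
    return results;
  } finally {
    // Release the model even if a transcription throws
    await unloadModel({ modelId });
  }
}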