{
"type": "object",
"properties": {
"fileUrl": {
"type": "string",
"description": "URL of the audio or video file to transcribe"
},
"model": {
"type": "string",
"enum": [
"nova-3",
"nova-3-general",
"nova-3-medical",
"nova-2",
"nova-2-general",
"nova-2-meeting",
"nova-2-phonecall",
"nova-2-voicemail",
"nova-2-finance",
"nova-2-conversationalai",
"nova-2-video",
"nova-2-medical",
"nova-2-drivethru",
"nova-2-automotive",
"nova",
"enhanced",
"base",
"whisper-tiny",
"whisper-base",
"whisper-small",
"whisper-medium",
"whisper-large"
],
"default": "nova-3",
"description": "Deepgram model. nova-3 is the latest (required for keyterm). Whisper tiers available."
},
"languageCode": {
"type": "string",
"description": "Language code (e.g., \"en\", \"es\", \"fr\"). Omit for auto-detect."
},
"detectLanguage": {
"type": "boolean",
"description": "Explicitly request language auto-detection (most models detect by default when languageCode is omitted)"
},
"smartFormat": {
"type": "boolean",
"description": "Smart formatting of numbers, dates, times, etc. Default true."
},
"numerals": {
"type": "boolean",
"description": "Convert number words to numerals"
},
"profanityFilter": {
"type": "boolean",
"description": "Filter profanity"
},
"dictation": {
"type": "boolean",
"description": "Interpret spoken punctuation commands"
},
"measurements": {
"type": "boolean",
"description": "Normalize measurement abbreviations"
},
"fillerWords": {
"type": "boolean",
"description": "Include \"um\" / \"uh\" / etc. in transcript"
},
"enableDiarization": {
"type": "boolean",
"default": false,
"description": "Identify different speakers. Auto-enables utterances with speaker-attributed timestamps."
},
"diarizationSpeakerCount": {
"type": "number",
"description": "Hint for expected number of speakers"
},
"diarizeVersion": {
"type": "string",
"description": "Pin diarization algorithm version"
},
"enableUtterances": {
"type": "boolean",
"description": "Group output into utterances with start/end timestamps per phrase/speaker. Auto-enabled when diarization is on. Compact — does not include per-word timestamps."
},
"uttSplit": {
"type": "number",
"description": "Silence threshold in seconds for splitting utterances (default 0.8)"
},
"enableParagraphs": {
"type": "boolean",
"default": false,
"description": "Format output into paragraphs with timestamps"
},
"multichannel": {
"type": "boolean",
"description": "Transcribe each audio channel separately. Critical for stereo call recordings."
},
"alternatives": {
"type": "number",
"description": "Return N transcription candidates per channel"
},
"keywords": {
"type": "array",
"items": {
"type": "string"
},
"description": "Keyword boosting. Each entry may be \"word\" or \"word:intensifier\". Pre-Nova-3 models."
},
"keyterm": {
"type": "array",
"items": {
"type": "string"
},
"description": "Nova-3 keyterm boosting (superior to keywords; supports multi-word phrases). English-only."
},
"search": {
"type": "array",
"items": {
"type": "string"
},
"description": "Search for phrases; hits returned with timestamps"
},
"replace": {
"type": "array",
"items": {
"type": "string"
},
"description": "Find-and-replace in transcript. Format: \"find:replace\"."
},
"enableSummary": {
"type": "boolean",
"default": false,
"description": "Generate a short text summary of the audio. Requires 50+ words of audio — shorter inputs return the original text. Returns a summary.short field."
},
"enableTopics": {
"type": "boolean",
"default": false,
"description": "Detect topics discussed in the audio. Returns topic segments with labels and confidence scores (0-1). Example topics: \"healthcare\", \"data collection\", \"budget\". English only."
},
"customTopics": {
"type": "array",
"items": {
"type": "string"
},
"description": "Provide up to 100 custom topics to detect. By default uses extended mode — returns your topics plus auto-detected ones. Requires enableTopics. Example: [\"sales\", \"support\", \"billing\", \"technical issue\"]"
},
"enableIntents": {
"type": "boolean",
"description": "Detect speaker intents — what speakers are trying to do. Returns intent segments with verb-form labels and confidence scores (0-1). Example intents: \"schedule a meeting\", \"request a refund\". English only."
},
"customIntents": {
"type": "array",
"items": {
"type": "string"
},
"description": "Provide up to 100 custom intents to detect. By default uses extended mode — returns your intents plus auto-detected ones. Requires enableIntents. Example: [\"purchase\", \"cancel subscription\", \"get status update\", \"file complaint\"]"
},
"enableSentiment": {
"type": "boolean",
"default": false,
"description": "Analyze sentiment per segment — tags each part as positive, negative, or neutral with a confidence score (0-1). Also provides an average sentiment for the entire transcript. Useful for call center analysis."
},
"redact": {
"type": "array",
"items": {
"type": "string",
"enum": [
"pci",
"pii",
"phi",
"numbers",
"ssn",
"aggressive_numbers",
"credit_card",
"credit_card_expiration",
"cvv",
"email_address",
"phone_number",
"account_number",
"age",
"date",
"date_interval",
"dob",
"driver_license",
"healthcare_number",
"ip_address",
"location",
"location_address",
"location_zip",
"location_coordinate",
"money",
"numerical_pii",
"passport_number",
"password",
"time",
"vehicle_id",
"statistics",
"bank_account",
"routing_number"
]
},
"description": "Redaction options. Common: pci, pii, phi, numbers, ssn. Also supports specific entity types."
},
"mipOptOut": {
"type": "boolean",
"description": "Opt out of Deepgram model improvement program"
},
"tag": {
"type": "array",
"items": {
"type": "string"
},
"description": "Labels for Deepgram console analytics filtering"
},
"extra": {
"type": "array",
"items": {
"type": "string"
},
"description": "Arbitrary \"key:value\" metadata passthrough"
},
"outputMode": {
"type": "string",
"enum": [
"transcription",
"transcription_with_timestamps"
],
"default": "transcription",
"description": "Output mode. \"transcription\" (default): text only (~5KB). \"transcription_with_timestamps\": includes utterance-level timestamps with start/end per phrase/speaker (~15KB). Use timestamps mode when you need to know when things were said."
},
"includeDetailedResults": {
"type": "boolean",
"description": "Include the full raw Deepgram response with word-level timestamps. Warning: very large output (200KB+ per minute of audio). Only use when you need per-word timing data."
}
},
"required": [
"fileUrl"
]
}