{ "swagger": "2.0", "info": { "title": "speechapi", "description": "Speech APIs enable you to recognize speech and convert it to text using advanced machine learning, and also to convert text to speech.", "version": "v1" }, "paths": { "/speech/recognize/file": { "post": { "tags": [ "Recognize" ], "summary": "Recognize audio input as text using Advanced AI", "description": "Uses advanced AI to convert input audio to text. Supports WAV, MP3, M4A, FLAC, OGG, and WMA formats. Consumes 1 API call per second of audio in Fast mode, 5 API calls per second in Normal mode, and 10 API calls per second in Advanced mode.", "consumes": [ "multipart/form-data" ], "produces": [ "text/plain", "application/json", "text/json" ], "parameters": [ { "in": "header", "name": "languageCode", "description": "ISO 639-3 three-letter language code (e.g. eng, spa, fra). Empty for auto-detect.", "type": "string", "default": "" }, { "in": "header", "name": "recognitionMode", "description": "Recognition mode: Fast, Normal (default), or Advanced. Advanced is only available on Private Cloud and Managed Instance deployments.", "type": "string", "default": "Normal" }, { "in": "formData", "name": "speechFile", "type": "file" } ], "responses": { "200": { "description": "OK", "schema": { "$ref": "#/definitions/SpeechRecognitionResult" } } } } }, "/speech/speak/text/voice/basic/audio": { "post": { "tags": [ "Speak" ], "summary": "Generate audio from text using Advanced AI", "description": "Converts text to speech using advanced AI. Supports English, Spanish, French, Hindi, Italian, Japanese, Portuguese, and Chinese. Specify language with LanguageCode (ISO 639-3, default: eng) and gender with Gender (Male or Female, default: Female). Output format is controlled by the Format field (mp3 or wav, default: mp3). \nConsumes 1 API call per second of generated audio.", "consumes": [ "application/json", "text/json", "application/*+json" ], "produces": [ "application/octet-stream" ], "parameters": [ { "in": "body", "name": "body", "description": "Text-to-speech input request", "schema": { "$ref": "#/definitions/TextToSpeechRequest" } } ], "responses": { "200": { "description": "OK", "schema": { "format": "binary", "type": "string" } } } } } }, "definitions": { "SpeechRecognitionResult": { "description": "Result of recognizing speech", "type": "object", "properties": { "TextResult": { "description": "Recognition result in text format", "type": "string", "example": "Hello world, this is a test." }, "Timestamps": { "description": "Token-level timestamps with character offsets into TextResult", "type": "array", "items": { "$ref": "#/definitions/TokenTimestamp" } } }, "additionalProperties": false }, "TextToSpeechRequest": { "description": "Input to a text-to-speech request", "type": "object", "properties": { "Text": { "description": "Text to be converted to speech", "type": "string", "example": "Hello, this is a test of the text to speech system." }, "Format": { "description": "File format for output audio file: wav or mp3, default is mp3", "type": "string", "example": "mp3" }, "LanguageCode": { "description": "ISO 639-3 three-letter language code (e.g. eng, spa, fra, hin, ita, jpn, por, zho). Default is eng (English).", "type": "string", "example": "eng" }, "Gender": { "description": "Voice gender: Male or Female. Default is Female. \nNote: Male is not available for French and Chinese.", "type": "string", "example": "Female" } }, "additionalProperties": false }, "TokenTimestamp": { "description": "A single token with its time range and position in the output text", "type": "object", "properties": { "Token": { "description": "The token text (word or subword)", "type": "string", "example": " Hello" }, "CharacterOffsetStart": { "format": "int32", "description": "Start character offset (0-based) in TextResult", "type": "integer", "example": 0 }, "CharacterOffsetEnd": { "format": "int32", "description": "End character offset (exclusive) in TextResult", "type": "integer", "example": 6 }, "StartTimeMs": { "format": "int64", "description": "Start time of this token in the audio, in milliseconds", "type": "integer", "example": 0 }, "EndTimeMs": { "format": "int64", "description": "End time of this token in the audio, in milliseconds", "type": "integer", "example": 500 } }, "additionalProperties": false } }, "securityDefinitions": { "Apikey": { "type": "apiKey", "name": "Apikey", "in": "header", "description": "API key sent in the Apikey request header" } }, "security": [ { "Apikey": [ ] } ] }