首页
学习
活动
专区
圈层
工具
发布
首页
学习
活动
专区
圈层
工具
MCP广场
社区首页 >问答首页 >Microsoft Cognitive Services语音保存选项

Microsoft Cognitive Services语音保存选项
EN

Stack Overflow用户
提问于 2021-01-21 01:02:37
回答 1查看 221关注 0票数 0

我已经编写了以下代码,它可以将文本转换为语音并通过扬声器播放。我正在使用Microsoft Cognitive Services。

代码语言:javascript
运行
复制
<html lang="en">
<head>
  <title>Microsoft Cognitive Services Speech SDK JavaScript Sample for Speech Synthesis</title>
  <meta charset="utf-8" />
  <style>
    body {
      font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, 'Roboto', 'Helvetica Neue', sans-serif;
      font-size: 14px;
    }

    table, th, td {
      border: 1px solid #f1f1f1;
      border-collapse: collapse;
    }

    th, td {
      padding: 10px;
    }

    textarea {
      font-family: Arial,sans-serif;
    }

    .mode {
      font-size: 18px;
    }

    .highlight{
      background-color: yellow;
    }

    input:not(disabled) {
      font-weight: bold;
      color: black;
    }

    button {
      padding: 4px 8px;
      background: #0078d4;
      color: #ffffff;
    }

    button:disabled {
      padding: 4px 8px;
      background: #ccc;
      color: #666;
    }

    input[type=radio] {
      position: relative;
      z-index: 1;
    }

    input[type=radio] + label {
      padding: 8px 4px 8px 30px;
      margin-left: -30px;
    }

    input[type=radio]:checked + label {
      background: #0078d4;
      color: #ffffff;
    }
  </style>
</head>
<body>
  <div id="warning">
    <h1 style="font-weight:500;">Speech Speech SDK not found
      (microsoft.cognitiveservices.speech.sdk.bundle.js missing).</h1>
  </div>
  
  <div id="content" style="display:none">
    <table>
      <tr>
        <td></td>
        <td><h1 style="font-weight:500;">Microsoft Cognitive Services</h1></td>
      </tr>
      <tr>
        <td align="right">
          <label for="subscriptionKey">
            <a href="https://docs.microsoft.com/azure/cognitive-services/speech-service/get-started"
               rel="noreferrer noopener"
               target="_blank">Subscription Key</a>
          </label>
        </td>
        <td><input id="subscriptionKey" type="text" size="40" placeholder="YourSubscriptionKey"></td>
      </tr>
      <tr>
        <td align="right"><label for="regionOptions">Region</label></td>
        <td>
<!--          see https://aka.ms/csspeech/region for more details-->
          <select id="regionOptions">
            <option value="uksouth">UK South</option>
          </select>
        </td>
      </tr>
      <tr>
        <td align="right"><label for="voiceOptions">Voice</label></td>
        <td>
          <button id="updateVoiceListButton">Update Voice List</button>
          <select id="voiceOptions" disabled>
            <option>Please update voice list first.</option>
          </select>
        </td>
      </tr>
      <tr>
        <td align="right"><label for="isSSML">Is SSML</label><br></td>
        <td>
          <input type="checkbox" id="isSSML" name="isSSML" value="ssml">
        </td>
      </tr>
      <tr>
        <td align="right"><label for="synthesisText">Text</label></td>
        <td>
          <textarea id="synthesisText" style="display: inline-block;width:500px;height:100px"
                 placeholder="Input text or ssml for synthesis."></textarea>
        </td>
      </tr>
      <tr>
        <td></td>
        <td>
          <button id="startSynthesisAsyncButton">Start synthesis</button>
          <button id="pauseButton">Pause</button>
          <button id="resumeButton">Resume</button>
        </td>
      </tr>
      <tr>
        <td align="right" valign="top"><label for="resultsDiv">Results</label></td>
        <td><textarea id="resultsDiv" readonly style="display: inline-block;width:500px;height:50px"></textarea></td>
      </tr>
      <tr>
        <td align="right" valign="top"><label for="eventsDiv">Events</label></td>
        <td><textarea id="eventsDiv" readonly style="display: inline-block;width:500px;height:200px"></textarea></td>
      </tr>
      <tr>
        <td align="right" valign="top"><label for="highlightDiv">Highlight</label></td>
        <td><div id="highlightDiv" style="display: inline-block;width:800px;"></div></td>
      </tr>
    </table>
  </div>

  <!-- Speech SDK reference sdk. -->
  <script src="microsoft.cognitiveservices.speech.sdk.bundle.js"></script>

  <!-- Speech SDK Authorization token -->
  <script>
  // Note: Replace the URL with a valid endpoint to retrieve
  //       authorization tokens for your subscription.
  var authorizationEndpoint = "token.php";

  function RequestAuthorizationToken() {
    if (authorizationEndpoint) {
      var a = new XMLHttpRequest();
      a.open("GET", authorizationEndpoint);
      a.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
      a.send("");
      a.onload = function() {
          var token = JSON.parse(atob(this.responseText.split(".")[1]));
          serviceRegion.value = token.region;
          authorizationToken = this.responseText;
          subscriptionKey.disabled = true;
          subscriptionKey.value = "using authorization token (hit F5 to refresh)";
          console.log("Got an authorization token: " + token);
      }
    }
  }
  </script>

  <!-- Speech SDK USAGE -->
  <script>
    // On document load resolve the Speech SDK dependency
    function Initialize(onComplete) {
      if (!!window.SpeechSDK) {
        document.getElementById('content').style.display = 'block';
        document.getElementById('warning').style.display = 'none';
        onComplete(window.SpeechSDK);
      }
    }
  </script>

  <!-- Browser Hooks -->
  <script>
    // status fields and start button in UI
    var resultsDiv, eventsDiv;
    var highlightDiv;
    var startSynthesisAsyncButton, pauseButton, resumeButton;
    var updateVoiceListButton;

    // subscription key and region for speech services.
    var subscriptionKey, regionOptions;
    var authorizationToken;
    var voiceOptions, isSsml;
    var SpeechSDK;
    var synthesisText;
    var synthesizer;
    var player;
    var wordBoundaryList = [];

    document.addEventListener("DOMContentLoaded", function () {
      startSynthesisAsyncButton = document.getElementById("startSynthesisAsyncButton");
      updateVoiceListButton = document.getElementById("updateVoiceListButton");
      pauseButton = document.getElementById("pauseButton");
      resumeButton = document.getElementById("resumeButton");
      subscriptionKey = document.getElementById("subscriptionKey");
      regionOptions = document.getElementById("regionOptions");
      resultsDiv = document.getElementById("resultsDiv");
      eventsDiv = document.getElementById("eventsDiv");
      voiceOptions = document.getElementById("voiceOptions");
      isSsml = document.getElementById("isSSML");
      highlightDiv = document.getElementById("highlightDiv");

      setInterval(function () {
        if (player !== undefined) {
          const currentTime = player.currentTime;
          var wordBoundary;
          for (const e of wordBoundaryList) {
            if (currentTime * 1000 > e.audioOffset / 10000) {
              wordBoundary = e;
            } else {
              break;
            }
          }
          if (wordBoundary !== undefined) {
            highlightDiv.innerHTML = synthesisText.value.substr(0, wordBoundary.textOffset) +
                    "<span class='highlight'>" + wordBoundary.text + "</span>" +
                    synthesisText.value.substr(wordBoundary.textOffset + wordBoundary.wordLength);
          } else {
            highlightDiv.innerHTML = synthesisText.value;
          }
        }
      }, 50);

      updateVoiceListButton.addEventListener("click", function () {
        var request = new XMLHttpRequest();
        request.open('GET',
                'https://' + regionOptions.value + ".tts.speech." +
                (regionOptions.value.startsWith("china") ? "azure.cn" : "microsoft.com") +
                        "/cognitiveservices/voices/list", true);
        if (authorizationToken) {
          request.setRequestHeader("Authorization", "Bearer " + authorizationToken);
        } else {
          if (subscriptionKey.value === "" || subscriptionKey.value === "subscription") {
            alert("Please enter your Microsoft Cognitive Services Speech subscription key!");
            return;
          }
          request.setRequestHeader("Ocp-Apim-Subscription-Key", subscriptionKey.value);
        }

        request.onload = function() {
          if (request.status >= 200 && request.status < 400) {
            const response = this.response;
            const neuralSupport = (response.indexOf("JessaNeural") > 0);
            const defaultVoice = neuralSupport ? "JessaNeural" : "JessaRUS";
            let selectId;
            const data = JSON.parse(response);
            voiceOptions.innerHTML = "";
            data.forEach((voice, index) => {
              voiceOptions.innerHTML += "<option value=\"" + voice.Name + "\">" + voice.Name + "</option>";
              if (voice.Name.indexOf(defaultVoice) > 0) {
                selectId = index;
              }
            });
            voiceOptions.selectedIndex = selectId;
            voiceOptions.disabled = false;
          } else {
            window.console.log(this);
            eventsDiv.innerHTML += "cannot get voice list, code: " + this.status + " detail: " + this.statusText + "\r\n";
          }
        };

        request.send()
      });

      pauseButton.addEventListener("click", function () {
        player.pause();
        pauseButton.disabled = true;
        resumeButton.disabled = false;
      });

      resumeButton.addEventListener("click", function () {
        player.resume();
        pauseButton.disabled = false;
        resumeButton.disabled = true;
      });

      startSynthesisAsyncButton.addEventListener("click", function () {
        startSynthesisAsyncButton.disabled = true;
        resultsDiv.innerHTML = "";
        eventsDiv.innerHTML = "";
        wordBoundaryList = [];
        synthesisText = document.getElementById("synthesisText");

        // if we got an authorization token, use the token. Otherwise use the provided subscription key
        var speechConfig;
        if (authorizationToken) {
          speechConfig = SpeechSDK.SpeechConfig.fromAuthorizationToken(authorizationToken, serviceRegion.value);
        } else {
          if (subscriptionKey.value === "" || subscriptionKey.value === "subscription") {
            alert("Please enter your Microsoft Cognitive Services Speech subscription key!");
            return;
          }
          speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey.value, regionOptions.value);
        }

        speechConfig.speechSynthesisVoiceName = voiceOptions.value;
        // The SDK uses Media Source Extensions (https://www.w3.org/TR/media-source/) for playback.
        // Mp3 format is supported in most browsers.
        speechConfig.speechSynthesisOutputFormat = SpeechSDK.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
        player = new SpeechSDK.SpeakerAudioDestination();
        player.onAudioEnd = function (_) {
          window.console.log("playback finished");
          eventsDiv.innerHTML += "playback finished" + "\r\n";
          startSynthesisAsyncButton.disabled = false;
          pauseButton.disabled = true;
          resumeButton.disabled = true;
          wordBoundaryList = [];
        };

        var audioConfig  = SpeechSDK.AudioConfig.fromSpeakerOutput(player);
        var audioConfig = AudioConfig.FromWavFileOutput("/var/www/html/unitypst/customfiles/tts/ttsfile.wav");
        
        synthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig);

        // The event synthesizing signals that a synthesized audio chunk is received.
        // You will receive one or more synthesizing events as a speech phrase is synthesized.
        // You can use this callback to streaming receive the synthesized audio.
        synthesizer.synthesizing = function (s, e) {
          window.console.log(e);
          eventsDiv.innerHTML += "(synthesizing) Reason: " + SpeechSDK.ResultReason[e.result.reason] +
                  "Audio chunk length: " + e.result.audioData.byteLength + "\r\n";
        };

        // The synthesis started event signals that the synthesis is started.
        synthesizer.synthesisStarted = function (s, e) {
          window.console.log(e);
          eventsDiv.innerHTML += "(synthesis started)" + "\r\n";
          pauseButton.disabled = false;
        };

        // The event synthesis completed signals that the synthesis is completed.
        synthesizer.synthesisCompleted = function (s, e) {
          console.log(e);
          eventsDiv.innerHTML += "(synthesized)  Reason: " + SpeechSDK.ResultReason[e.result.reason] +
                  " Audio length: " + e.result.audioData.byteLength + "\r\n";
        };

        // The event signals that the service has stopped processing speech.
        // This can happen when an error is encountered.
        synthesizer.SynthesisCanceled = function (s, e) {
          const cancellationDetails = SpeechSDK.CancellationDetails.fromResult(e.result);
          let str = "(cancel) Reason: " + SpeechSDK.CancellationReason[cancellationDetails.reason];
          if (cancellationDetails.reason === SpeechSDK.CancellationReason.Error) {
            str += ": " + e.result.errorDetails;
          }
          window.console.log(e);
          eventsDiv.innerHTML += str + "\r\n";
          startSynthesisAsyncButton.disabled = false;
          pauseButton.disabled = true;
          resumeButton.disabled = true;
        };

        // This event signals that word boundary is received. This indicates the audio boundary of each word.
        // The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds.
        synthesizer.wordBoundary = function (s, e) {
          window.console.log(e);
          eventsDiv.innerHTML += "(WordBoundary), Text: " + e.text + ", Audio offset: " + e.audioOffset / 10000 + "ms." + "\r\n";
          wordBoundaryList.push(e);
        };

        const complete_cb = function (result) {
          if (result.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) {
            resultsDiv.innerHTML += "synthesis finished";
          } else if (result.reason === SpeechSDK.ResultReason.Canceled) {
            resultsDiv.innerHTML += "synthesis failed. Error detail: " + result.errorDetails;
          }
          window.console.log(result);
          synthesizer.close();
          synthesizer = undefined;
        };
        const err_cb = function (err) {
          startSynthesisAsyncButton.disabled = false;
          phraseDiv.innerHTML += err;
          window.console.log(err);
          synthesizer.close();
          synthesizer = undefined;
        };
        if (isSsml.checked) {
          synthesizer.speakSsmlAsync(synthesisText.value,
                  complete_cb,
                  err_cb);
        } else {
          synthesizer.speakTextAsync(synthesisText.value,
                  complete_cb,
                  err_cb);
        }
      });

      Initialize(function (speechSdk) {
        SpeechSDK = speechSdk;
        startSynthesisAsyncButton.disabled = false;
        pauseButton.disabled = true;
        resumeButton.disabled = true;
        saveButton.disabled = true;

        // in case we have a function for getting an authorization token, call it.
        if (typeof RequestAuthorizationToken === "function") {
          RequestAuthorizationToken();
        }
      });
    });
  </script>
</body>
</html>

我现在要做的是添加选项,将合成的语音数据保存到磁盘,并提示用户下载文件。使用Microsoft Cognitive Services Speech SDK可以实现吗?我应该如何修改代码以支持“保存”或“下载”选项?

EN

回答 1

Stack Overflow用户

发布于 2021-01-26 20:06:48

我们有a sample来展示如何使用Azure Speech SDK在浏览器下保存TTS音频。

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/65814124

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档