HTML Speech API Examples

Examples based on http://lists.w3.org/Archives/Public/public-xg-htmlspeech/2011Oct/att-0064/speechwepapi_1_.html.

The following code extracts illustrate how to use speech input in various cases:

Web search by voice, with auto-submit

    <script type="text/javascript">
      // Click handler for the "Speak" button: creates a speech input request,
      // routes its result event to handleSpeechInput(), and starts it.
      function startSpeech(event) {
        var request = new SpeechInputRequest();
        request.onresult = handleSpeechInput;
        request.start();
      }
      // Result handler: copies the interpretation of the first recognition
      // hypothesis into the "q" search field, then submits that field's form.
      function handleSpeechInput(event) {
        var searchField = document.getElementById("q");
        var bestHypothesis = event.result.item(0);
        searchField.value = bestHypothesis.interpretation;
        searchField.form.submit();
      }
    </script>

    <!-- Search form. The "Speak" button calls startSpeech(); the inline handler
         must invoke the function, since the attribute value is the handler body. -->
    <form action="http://www.google.com/search">
    <input type="search" id="q" name="q">
    <input type="button" value="Speak" onclick="startSpeech(event)">
    </form>
   
Behavior
  1. User clicks button.
  2. Audio is captured and the speech recognizer runs.
  3. If some speech was recognized, the first hypothesis is put in the text field and the form is submitted.
  4. Search results are loaded.

Web search by voice, with "Did you say..."

This example uses the second best result. The search results page will display a link with the text "Did you say $second_best?".

   <script type="text/javascript">
      // Click handler for the "Speak" button: builds a new speech input request,
      // registers handleSpeechInput() for its result event, and starts it.
      function startSpeech(event) {
        var speechRequest = new SpeechInputRequest();
        speechRequest.onresult = handleSpeechInput;
        speechRequest.start();
      }
      // Result handler: puts the best hypothesis in the "q" search field and,
      // when a second hypothesis exists, stores it in the hidden "second_best"
      // field before submitting the form.
      function handleSpeechInput(event) {
        var results = event.result;
        var q = document.getElementById("q");
        q.value = results.item(0).interpretation;

        // Only fill in the alternative when the recognizer returned one.
        // Hypotheses are read through item() throughout, matching the access
        // used for the first hypothesis.
        if (results.length > 1) {
          var second = results.item(1).interpretation;
          document.getElementById("second_best").value = second;
        }

        q.form.submit();
      }
    </script>

    <!-- Search form. The "Speak" button calls startSpeech(); the hidden
         "second_best" field is filled in by handleSpeechInput() and submitted
         together with the query. -->
    <form action="http://www.google.com/search">
    <input type="search" id="q" name="q">
    <input type="button" value="Speak" onclick="startSpeech(event)">
    <input type="hidden" name="second_best" id="second_best">
    </form>
   

Speech translator

    <script type="text/javascript" src="http://www.google.com/jsapi"></script>
    <script type="text/javascript">
      // Load the translator JS library.
      google.load("language", "1");

      // Source and target languages; these will most likely be set in a UI.
      var fromLang = "en",
          toLang = "es";

      // Click handler for the "Speak" button: creates a speech input request,
      // attaches handleSpeechInput() as its result handler, and starts it.
      function startSpeech(event) {
        var recognition = new SpeechInputRequest();
        recognition.onresult = handleSpeechInput;
        recognition.start();
      }
      // Result handler: sends the interpretation of the first hypothesis to the
      // translation service (fromLang -> toLang) and speaks the translation in
      // toLang when the service returns one.
      function handleSpeechInput(event) {
        var recognized = event.result.item(0).interpretation;

        // Invoked by the translation service with its result object.
        function onTranslated(translationResult) {
          if (translationResult.translation) {
            speak(translationResult.translation, toLang);
          }
        }

        google.language.translate(recognized, fromLang, toLang, onTranslated);
      }

      // Plays `output` through a TTS object configured for language `lang`.
      function speak(output, lang) {
        var synthesizer = new TTS();
        // NOTE: the text and lang attributes don't seem to be in the proposal.
        synthesizer.text = output;
        synthesizer.lang = lang;
        synthesizer.play();
      }
    </script>

    <!-- The "Speak" button calls startSpeech(); the inline handler must invoke
         the function, since the attribute value is the handler body. -->
    <form>
    <input type="button" value="Speak" onclick="startSpeech(event)">
    </form>
   
Behavior
  1. User clicks button and speaks in English.
  2. System recognizes the text in English.
  3. A web service translates the text from English to Spanish.
  4. System synthesizes and speaks the translated text in Spanish.

Turn-by-turn navigation

HTML:
    <script type="text/javascript">
      var directions;

      // Click handler for the "Speak" button: starts a speech input request
      // that uses the navigation grammar (grammar-nav-en.grxml) and reports its
      // result to handleSpeechInput().
      function startSpeech(event) {
        var request = new SpeechInputRequest();
        var navGrammar = new SpeechGrammar();
        navGrammar.src = "grammar-nav-en.grxml";
        request.grammars[0] = navGrammar;
        request.onresult = handleSpeechInput;
        request.start();
      }

      // Result handler: reads the destination from the semantic interpretation
      // of the first hypothesis and requests directions to it.
      //
      // Nothing is spoken here: `directions` stays undefined until
      // handleDirections() has stored the answer, so the first instruction is
      // spoken from that callback instead.
      function handleSpeechInput(event) {
        var command = event.result.item(0).interpretation;
        getDirections(command.destination, handleDirections);
      }

      // Placeholder: looks up directions for `query` and passes them to the
      // `handleDirections` callback argument.
      function getDirections(query, handleDirections) {
        // Get location, then get directions from server, pass to handleDirections
      }

      // Callback for getDirections(): stores the new directions and speaks the
      // first instruction now that `directions` is populated.
      function handleDirections(newDirections) {
        directions = newDirections;
        speakNextInstruction();

        // Listen for location changes and call speakNextInstruction()
        // when appropriate
      }

      // Speaks the next instruction taken from `directions`, if there is one.
      function speakNextInstruction() {
        // `directions` is undefined until handleDirections() has run, and is
        // empty once every instruction has been spoken; nothing to say then.
        if (!directions || directions.length === 0) {
          return;
        }

        // NOTE(review): pop() removes the LAST array element, so this assumes
        // `directions` is stored with the next step at the end - confirm.
        var instruction = directions.pop();
        var tts = new TTS();
        tts.text = instruction;
        tts.play();
      }
    </script>

    <!-- The "Speak" button calls startSpeech(); the inline handler must invoke
         the function, since the attribute value is the handler body. -->
    <form>
    <input type="button" value="Speak" onclick="startSpeech(event)">
    </form>
   
English SRGS XML Grammar (grammar-nav-en.grxml):
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
                      "http://www.w3.org/TR/speech-grammar/grammar.dtd">
    <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="en"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
             xsi:schemaLocation="http://www.w3.org/2001/06/grammar 
                                 http://www.w3.org/TR/speech-grammar/grammar.xsd"
             version="1.0" mode="voice" root="nav_cmd"
             tag-format="semantics/1.0">

    <!-- Root rule (public): a navigation phrase from #nav_action followed by
         the builtin:search rule. The tag sets out.action to "navigate_to" and
         takes out.destination from rules.latest(). -->
    <rule id="nav_cmd" scope="public">
      <example> navigate to 76 Buckingham Palace Road, London </example>
      <example> go to Phantom of the Opera </example>
      <item>
        <ruleref uri="#nav_action" />
        <ruleref uri="builtin:search" />
        <tag>out.action="navigate_to"; out.destination=rules.latest();</tag>
      </item>
    </rule>

    <!-- Phrases that introduce a navigation command; referenced by nav_cmd. -->
    <rule id="nav_action">
      <one-of>
        <item>navigate to</item>
        <item>go to</item>
      </one-of>
    </rule>

    </grammar>
   

Speech shell

This uses an SRGS grammar to declare the commands that are supported, and uses SISR semantics so that the JavaScript code does not have to care about the language-specific representation.
Other similar examples: Speech-controlled E-mail client.

HTML:
    <script type="text/javascript">
      // Click handler for the "Speak" button: starts a speech input request
      // that uses the command grammar (commands.grxml) and reports its result
      // to handleSpeechInput().
      function startSpeech(event) {
        var request = new SpeechInputRequest();
        var commandGrammar = new SpeechGrammar();
        commandGrammar.src = "commands.grxml";
        request.grammars[0] = commandGrammar;
        request.onresult = handleSpeechInput;
        request.start();
      }

      // Result handler: dispatches on the "action" property of the semantic
      // interpretation of the first hypothesis. Interpretations with any other
      // action are ignored.
      function handleSpeechInput(event) {
        var command = event.result.item(0).interpretation;
        if (command.action === "call_contact") {
          var number = getContactNumber(command.contact);
          callNumber(number);
        } else if (command.action === "call_number") {
          callNumber(command.number);
        } else if (command.action === "calculate") {
          // NOTE(review): evaluate() is not defined in this example; it is
          // assumed to be provided elsewhere - confirm.
          say(evaluate(command.expression));
        } else if (command.action === "search") {
          search(command.query);
        }
      }
      // Navigates the window to the "tel:" URL for `number`.
      function callNumber(number) {
        var telUrl = "tel:" + number;
        window.location = telUrl;
      }
      // Placeholder for the "search" command: the example leaves the
      // implementation out. Called by handleSpeechInput() with command.query.
      function search(query) {
        // Start web search for query.
      }
      // Placeholder for the contact lookup: the example leaves the
      // implementation out. handleSpeechInput() passes the returned value to
      // callNumber().
      function getContactNumber(contact) {
        // Get the phone number of the contact.
      }
      // Placeholder for speech output: the example leaves the implementation
      // out. Called by handleSpeechInput() with the result of evaluate().
      function say(text) {
        // Speak text.
      }
    </script>

    <!-- The "Speak" button calls startSpeech(); the inline handler must invoke
         the function, since the attribute value is the handler body. -->
    <form>
    <input type="button" value="Speak" onclick="startSpeech(event)">
    </form>

   
English SRGS XML Grammar (commands.grxml):
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
                      "http://www.w3.org/TR/speech-grammar/grammar.dtd">
    <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="en"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
             xsi:schemaLocation="http://www.w3.org/2001/06/grammar 
                                 http://www.w3.org/TR/speech-grammar/grammar.xsd"
             version="1.0" mode="voice" root="command"
             tag-format="semantics/1.0">

    <!-- Root rule (public): each alternative matches one kind of command. Its
         tag sets out.action to the command name read by the JavaScript handler
         and stores rules.latest() in a command-specific property. -->
    <rule id="command" scope="public">
      <example> call Bob </example>
      <example> calculate 4 plus 3 </example>
      <example> search for pictures of the Golden Gate bridge </example>

      <one-of>
        <item>
          call <ruleref uri="contacts.grxml"/>
          <tag>out.action="call_contact"; out.contact=rules.latest();</tag>
        </item>
        <item>
          call <ruleref uri="phonenumber.grxml"/>
          <tag>out.action="call_number"; out.number=rules.latest();</tag>
        </item>
        <!-- NOTE(review): no rule with id "expression" is defined in this
             grammar as shown; one must be added for "#expression" to resolve. -->
        <item>
          calculate <ruleref uri="#expression"/>
          <tag>out.action="calculate"; out.expression=rules.latest();</tag>
        </item>
        <item>
          search for <ruleref uri="http://grammar.example.com/search-ngram-model.xml"/>
          <tag>out.action="search"; out.query=rules.latest();</tag>
        </item>
      </one-of>
    </rule>
    </grammar>