W3C

HTML+SPEECH Examples


 

1. Multimodal Example

From the use cases:

 

3.3.14 Multimodal Interaction

The ability to mix and integrate input from multiple modalities such as by saying "I want to go from here to there" while tapping two points on a touch screen map.

Example

In the example below the various speech APIs are used to do a simple multimodal example where the user is presented with a series of buttons and may click one while they are speaking. The click event is sent to the speech service using the sendInfo method on the SpeechInputRequest. The result coming back from the speech service is the integration of the voice command and gesture. The combined multimodal result is represented in the EMMA document along with the EMMA interpretations for the individual modalities.

 

Simple Multimodal Example JS API Only

<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Simple Multimodal Example</title>
  </head>
  <body>
    <script>
      // Shared state for the page. sendClick() must see the SAME request
      // object that speechClick() creates, so both live at script scope.
      // (BUG FIX: "sir" was previously a local of speechClick(), making the
      // sir.sendInfo() call in sendClick() a ReferenceError.)
      var sir = null;
      // Only send gestures if sir is active, i.e. start() has been called.
      var sir_started = false;

      // Begin a speech recognition request when the SPEAK button is pressed.
      function speechClick() {
        sir = new SpeechInputRequest();

        // Indicate the reco service to be used
        sir.serviceURI = "https://example.org/recoService/multimodal";

        // Set any parameters
        sir.speedVsAccuracy = 0.75;
        sir.completeTimeout = 2000;
        sir.incompleteTimeout = 3000;

        // Specify the grammar
        var g = new SpeechGrammar();
        g.src = "http://example.org/commands.srgs";
        sir.grammars[0] = g;

        // On a match: show the recognized utterance, then act on the
        // combined (voice + gesture) interpretation from the service.
        sir.onresult = function(event) {
          sir_started = false;
          var af = document.getElementById('asr_feedback');
          af.value = event.result.item(0).utterance;
          // code to pull out interpretation and execute;
          var interp = event.result.item(0).interpretation;
          //  ....
        };

        // Also do something on a nomatch.
        // (BUG FIX: was "sir.onnomatch() = function…", a syntax error --
        // the handler must be ASSIGNED to the property, not called.)
        sir.onnomatch = function(event) {
          // even though it is a no match we might have a result
          alert("no match: " + event.result.item(0).interpretation);
        };

        // start() will call open() for us. If instead we wanted to open
        // the request at page load, we would do the initial permission
        // checking there.
        sir.start();
        sir_started = true;
      }

      // Called when the user clicks a contact button during speech capture.
      // Uses sendInfo() to add the click to the stream sent to the service.
      function sendClick(payload) {
        if (sir_started) {
          sir.sendInfo('text/xml', payload);
        }
      }
    </script>

    <div>
      <input id="asr_feedback" name="asr_feedback" type="text">
      <button type="button" name="mic" onclick="speechClick()">SPEAK</button>
      <br>
      <br>
      <!-- BUG FIX: the original onclick values were missing the closing ")"
           of the sendClick(...) call, and the first was also missing the
           ">" of its closing </click> tag. -->
      <button type="button" name="item1" onclick="sendClick('<click>joe_bloggs</click>')">Joe Bloggs</button>
      <button type="button" name="item2" onclick="sendClick('<click>pam_brown</click>')">Pam Brown</button>
      <button type="button" name="item3" onclick="sendClick('<click>peter_smith</click>')">Peter Smith</button>
    </div>

  </body>
</html>

SRGS Grammar (http://example.org/commands.srgs)

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
"http://www.w3.org/TR/speech-grammar/grammar.dtd">
<grammar version="1.0" mode="voice" root="top" tag-format="semantics/1.0">

<!-- Top-level rule: a call or email command, with or without the explicit
     "this person" phrase. _obj is always DEICTIC: the target person is
     supplied by the accompanying GUI click and fused by the service. -->
<rule id="top" scope="public">
<one-of>
  <item>call this person<tag>out._cmd="CALL";out._obj="DEICTIC";</tag></item>
  <!-- BUG FIX: the two "email" items previously set _cmd="CALL"
       (copy-paste error); an email command must yield _cmd="EMAIL". -->
  <item>email this person<tag>out._cmd="EMAIL";out._obj="DEICTIC";</tag></item>
  <item>call<tag>out._cmd="CALL";out._obj="DEICTIC";</tag></item>
  <item>email<tag>out._cmd="EMAIL";out._obj="DEICTIC";</tag></item>
</one-of>
</rule>
</grammar>

EMMA Returned

<!-- Example EMMA document returned by the reco service: the fused
     multimodal result plus, under emma:derivation, the per-modality
     inputs it was derived from.
     NOTE(review): version="1.1" but schemaLocation points to the
     EMMA 1.0 (2009) schema - confirm which spec version is intended. -->
<emma:emma version="1.1"
xmlns:emma="http://www.w3.org/2003/04/emma"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/2003/04/emma
http://www.w3.org/TR/2009/REC-emma-20090210/emma.xsd"
xmlns="http://www.example.com/example">
<emma:grammar id="gram1" grammar-type="application/srgs-xml" ref="http://example.org/commands.srgs"/>
<!-- Combined interpretation: voice command + GUI click, integrated by the
     service. medium/mode list both contributing modalities. -->
<emma:interpretation id="multimodal1"
emma:confidence="0.6"
emma:start="1087995961500"
emma:end="1087995962542"
emma:medium="acoustic tactile"
emma:mode="voice gui"
emma:function="dialog"
emma:verbal="true"
emma:lang="en-US"
emma:result-format="application/json">
<!-- Links back to the unimodal interpretations below; composite="true"
     marks this as a fusion of multiple inputs. -->
<emma:derived-from resource="#voice1" composite="true"/>
<emma:derived-from resource="#gui1" composite="true"/>
<!-- The deictic _obj has been resolved to the clicked contact. -->
<emma:literal><![CDATA[
{_cmd:"CALL",
_obj:"joe_bloggs"}]]></emma:literal>
</emma:interpretation>
<!-- Per-modality inputs the combined result was derived from. -->
<emma:derivation>
<!-- Voice input: SRGS match "call this person", _obj still DEICTIC. -->
<emma:interpretation id="voice1"
emma:start="1087995961500"
emma:end="1087995962542"
emma:process="https://example.org/recoService/multimodal"
emma:grammar-ref="gram1"
emma:confidence="0.6"
emma:medium="acoustic"
emma:mode="voice"
emma:function="dialog"
emma:verbal="true"
emma:lang="en-US"
emma:tokens="call this person"
emma:result-format="application/json">
<emma:literal><![CDATA[
{_cmd:"CALL",
_obj:"DEICTIC"}]]></emma:literal>
</emma:interpretation>
<!-- GUI input: the <click> payload the page sent via sendInfo(). -->
<emma:interpretation id="gui1"
emma:medium="tactile"
emma:mode="gui"
emma:function="dialog"
emma:verbal="false">
<click>joe_bloggs</click>
</emma:interpretation>
</emma:derivation>
</emma:emma>