Copyright © 2011 W3C® (MIT, ERCIM, Keio), All Rights Reserved. W3C liability, trademark and document use rules apply.
From the use cases:
3.3.14 Multimodal Interaction
The ability to mix and integrate input from multiple modalities such as by saying "I want to go from here to there" while tapping two points on a touch screen map.
In the example below the various speech APIs are used to do a simple multimodal example where the user is presented with a series of buttons and may click one while they are speaking. The click event is sent to the speech service using the sendInfo method on the SpeechInputRequest. The result coming back from the speech service is the integration of the voice command and gesture. The combined multimodal result is represented in the EMMA document along with the EMMA interpretations for the individual modalities.
<!DOCTYPE html>
<html>
<head>
<title>Simple Multimodal Example</title>
</head>
<body>
<script type="text/javascript">
// Script-level recognition state. The active request must live at script
// scope because sendClick() below also reads it — the original `var sir`
// inside speechClick() was function-scoped and invisible to sendClick().
var sir = null;
// Only send gestures while recognition is active, i.e. start() has been called
var sir_started = false;
// Build and start a multimodal SpeechInputRequest when the SPEAK button
// is clicked: configures the service, timeouts, and grammar, then wires
// result/nomatch handlers before starting capture.
function speechClick() {
  sir = new SpeechInputRequest();
  // Indicate the reco service to be used
  sir.serviceURI = "https://example.org/recoService/multimodal";
  // Set any parameters
  sir.speedVsAccuracy = 0.75;
  sir.completeTimeout = 2000;
  sir.incompleteTimeout = 3000;
  // Specifying the grammar
  var g = new SpeechGrammar();
  g.src = "http://example.org/commands.srgs";
  sir.grammars[0] = g;
  // Say what happens on a match
  sir.onresult = function(event) {
    sir_started = false;
    var af = document.getElementById('asr_feedback');
    af.value = event.result.item(0).utterance;
    // code to pull out interpretation and execute;
    var interp = event.result.item(0).interpretation;
    // ....
  };
  // Also do something on a nomatch.
  // BUG FIX: the original `sir.onnomatch() = function…` invoked onnomatch
  // and assigned to its return value (a runtime error); the handler must be
  // assigned as a plain property instead.
  sir.onnomatch = function(event) {
    // even though it is a no match we might have a result
    alert("no match: " + event.result.item(0).interpretation);
  };
  //
  // start() will call open for us; if we wanted to open the
  // sir on page start we would have to do initial permission checking
  sir.start();
  sir_started = true;
}
// Forward a button click to the speech service while capture is active.
// Relies on the script-level `sir` request and `sir_started` flag; uses
// sendInfo() so the click payload is merged into the recognition stream.
function sendClick(payload) {
  if (!sir_started) {
    return; // no active capture — drop the gesture
  }
  sir.sendInfo('text/xml', payload);
}
</script>
<div>
<input id="asr_feedback" name="asr_feedback" type="text"/>
<button name="mic" onclick="speechClick()">SPEAK</button>
<br/>
<br/>
<button name="item1" onclick="sendClick('<click>joe_bloggs</click>')">Joe Bloggs</button>
<button name="item2" onclick="sendClick('<click>pam_brown</click>')">Pam Brown</button>
<button name="item3" onclick="sendClick('<click>peter_smith</click>')">Peter Smith</button>
</div>
</body>
</html>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
"http://www.w3.org/TR/speech-grammar/grammar.dtd">
<grammar version="1.0" mode="voice" root="top" tag-format="semantics/1.0">
<rule id="top" scope="public">
<one-of>
<item>call this person<tag>out._cmd="CALL";out._obj="DEICTIC";</tag></item>
<item>email this person<tag>out._cmd="EMAIL";out._obj="DEICTIC";</tag></item>
<item>call<tag>out._cmd="CALL";out._obj="DEICTIC";</tag></item>
<item>email<tag>out._cmd="EMAIL";out._obj="DEICTIC";</tag></item>
</one-of>
</rule>
</grammar>
<emma:emma version="1.1"
xmlns:emma="http://www.w3.org/2003/04/emma"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/2003/04/emma
http://www.w3.org/TR/2009/REC-emma-20090210/emma.xsd"
xmlns="http://www.example.com/example">
<emma:grammar id="gram1" grammar-type="application/srgs-xml" ref="http://example.org/commands.srgs"/>
<emma:interpretation id="multimodal1"
emma:confidence="0.6"
emma:start="1087995961500"
emma:end="1087995962542"
emma:medium="acoustic tactile"
emma:mode="voice gui"
emma:function="dialog"
emma:verbal="true"
emma:lang="en-US"
emma:result-format="application/json">
<emma:derived-from resource="#voice1" composite="true"/>
<emma:derived-from resource="#gui1" composite="true"/>
<emma:literal><![CDATA[
{_cmd:"CALL",
_obj:"joe_bloggs"}]]></emma:literal>
</emma:interpretation>
<emma:derivation>
<emma:interpretation id="voice1"
emma:start="1087995961500"
emma:end="1087995962542"
emma:process="https://example.org/recoService/multimodal"
emma:grammar-ref="gram1"
emma:confidence="0.6"
emma:medium="acoustic"
emma:mode="voice"
emma:function="dialog"
emma:verbal="true"
emma:lang="en-US"
emma:tokens="call this person"
emma:result-format="application/json">
<emma:literal><![CDATA[
{_cmd:"CALL",
_obj:"DEICTIC"}]]></emma:literal>
</emma:interpretation>
<emma:interpretation id="gui1"
emma:medium="tactile"
emma:mode="gui"
emma:function="dialog"
emma:verbal="false">
<click>joe_bloggs</click>
</emma:interpretation>
</emma:derivation>
</emma:emma>