FW: Examples from Robert Brown on 2011-11-02 (public-xg-htmlspeech@w3.org from November 2011)

From: Robert Brown <Robert.Brown@microsoft.com>
Date: Wed, 2 Nov 2011 22:37:19 +0000
To: HTML Speech XG <public-xg-htmlspeech@w3.org>
CC: "Dan Burnett (Voxeo)" <dburnett@voxeo.com>
Message-ID: <113BCF28740AF44989BE7D3F84AE18DD1B2F5561@TK5EX14MBXC118.redmond.corp.microsoft.>
Apparently this got bounced yesterday. Trying again...
________________________________
From: Robert Brown
Sent: Tuesday, November 01, 2011 4:36 PM
To: public-xg-htmlspeech@w3.org
Subject: Examples

Can’t say I’m super thrilled with my examples, but here’s what I have.

3.3.3 Domain Specific Grammars Contingent on Earlier Inputs
A use case exists around collecting multiple domain specific inputs sequentially where the later inputs depend on the results of the earlier inputs. For instance, changing which cities are in a grammar of cities in response to the user saying in which state they are located.
This seems trivial, assuming there are a variety of suitable grammars to choose from.

    <input id="txtState" type="text"/>
    <button id="btnStateMic" type="button" onclick="stateMicClicked()">
      <img src="microphone.png"/>
    </button>

    <br/>

    <input id="txtCity" type="text"/>
    <button id="btnCityMic" type="button" onclick="cityMicClicked()">
      <img src="microphone.png"/>
    </button>


    <script type="text/javascript">

    // Click handler for the state microphone button.
    // Builds a recognition request whose only grammar is "states.grxml",
    // copies the top interpretation into the "txtState" input when a result
    // arrives, and starts recognition.
    function stateMicClicked() {
        var sr = new SpeechInputRequest();

        // add the grammar that contains all the states:
        var g = new SpeechGrammar();
        g.src = "states.grxml";
        sr.speechGrammars.push(g);

        // Hook onresult, the event used by every other example in this
        // message, rather than the one-off "onmatch".
        sr.onresult = function (e) {
            // assume the grammar returns a standard string for each state
            // in which case we just need to get the interpretation
            document.getElementById("txtState").value =
                e.result.item(0).interpretation;
        };
        sr.start();
    }

      // Click handler for the city microphone button.
      // Picks the city grammar from the current value of the "txtState" input,
      // starts recognition, and copies the top interpretation into the
      // "txtCity" input when a result arrives.
      function cityMicClicked() {
        var sr = new SpeechInputRequest();

        // The cities grammar depends on what state has been selected
        // If state is blank, use major US cities
        // otherwise use cities in that state

        var state = document.getElementById("txtState").value;
        var g = new SpeechGrammar();
        g.src = state === "" ? "majorUScities.grxml" : "citiesin" + state + ".grxml";
        sr.speechGrammars.push(g);

        sr.onresult = function (e) {
          // The recognized city belongs in the city field; writing it to
          // "txtState" would overwrite the state the grammar was chosen from.
          document.getElementById("txtCity").value =
            e.result.item(0).interpretation;
        };
        sr.start();
      }
</script>

3.3.7 Rerecognition
Some sophisticated applications will re-use the same utterance in two or more recognition turns in what appears to the user as one turn. For example, an application may ask "how may I help you?", to which the user responds "find me a round trip from New York to San Francisco on Monday morning, returning Friday afternoon". An initial recognition against a broad language model may be sufficient to understand that the user wants the "flight search" portion of the app. Rather than get the user to repeat themselves, the application will just re-use the existing utterance for the flight search recognition.

<script type="text/javascript">
    // Recognizes an utterance in two passes on one request object:
    //   1. against "broadClassifier.grxml" to decide which task was asked for;
    //   2. against that task's grammar, handing the interpretation to the
    //      matching process* function.
    // The grammar's "modal" flag is what the result handler uses to tell which
    // pass it is in. NOTE(review): presumably a modal grammar is the only one
    // the recognizer applies - confirm against the API draft.
    // Throws an Error from the result handler when the classifier returns a
    // value other than "flightsearch", "hotelbooking" or "carrental".
    function listenAndClassifyThenReco() {

        var sr = new SpeechInputRequest();
        sr.speechGrammars[0] = new SpeechGrammar();
        sr.speechGrammars[0].src = "broadClassifier.grxml";
        sr.speechGrammars[0].modal = true;
        sr.speechGrammars[1] = new SpeechGrammar();
        sr.speechGrammars[1].src = "flightsearch.grxml";
        sr.speechGrammars[2] = new SpeechGrammar();
        sr.speechGrammars[2].src = "hotelbooking.grxml";
        sr.speechGrammars[3] = new SpeechGrammar();
        sr.speechGrammars[3].src = "carrental.grxml";

        sr.onresult = function (e) {
            var interpretation = e.result.item(0).interpretation;

            if (sr.speechGrammars[0].modal) {
                // First pass: the classifier names the task to run.
                switch (interpretation) {
                    case "flightsearch":
                        sr.speechGrammars[1].modal = true;
                        break;
                    case "hotelbooking":
                        sr.speechGrammars[2].modal = true;
                        break;
                    case "carrental":
                        sr.speechGrammars[3].modal = true;
                        break;
                    default:
                        throw new Error("cannot interpret:" + interpretation);
                }
                // Classification is done. Clear the classifier's modal flag;
                // otherwise this branch is taken again for the next result
                // and the task handlers below are never reached.
                sr.speechGrammars[0].modal = false;
                // how do I tell it to use the retained audio rather than record again?
                sr.start();
            }
            else if (sr.speechGrammars[1].modal) {
                processFlightSearch(interpretation);
            }
            else if (sr.speechGrammars[2].modal) {
                processHotelBooking(interpretation);
            }
            else if (sr.speechGrammars[3].modal) {
                processCarRental(interpretation);
            }
        };

        sr.saveForRereco = true;
        sr.start();
    }
</script>



3.3.12 Speech Enabled Email Client
The application reads out subjects and contents of email and also listens for commands, for instance, "archive", "reply: ok, let's meet at 2 pm", "forward to bob", "read message". Some commands may relate to VCR-like controls of the message being read back, for instance, "pause", "skip forwards", "skip back", or "faster". Some of those controls may include controls related to parts of speech, such as, "repeat last sentence" or "next paragraph".

This is the other end of the spectrum. It’s a huge app, if it works as described. But here’s a simplistic version of some of the logic…

    <script type="text/javascript">
        // Shared speech recognition request; assigned in initializeSpeech()
        // and started from onMicClicked().
        var sr;
        // Shared text-to-speech object; assigned in initializeSpeech() and
        // used by onMicClicked() and readout().
        var tts;

        // Page-load setup for the two shared speech objects: creates the
        // recognizer with the global command grammar and its result/error
        // callbacks, then creates the TTS object.
        function initializeSpeech() {
            sr = new SpeechInputRequest();

            var grammars = sr.speechGrammars;
            grammars[0] = new SpeechGrammar();
            grammars[0].src = "globalcommands.grxml";

            sr.onresult = doSRResult;
            sr.onerror = doSRError;

            // Naming gripe for the API: the recognizer gets a big long name
            // while synthesis gets only an acronym.
            tts = new TTS();
        }

        // Click handler for the microphone button.
        // Does nothing unless the recognizer reports the idle state. When it
        // is idle, cancels any speech output that is currently playing and
        // then starts recognition. The TBD_* names are placeholders for API
        // members that the draft does not define yet.
        function onMicClicked() {
            if (TBD_SPEECH_IDLE !== sr.TBD_some_check_of_current_state) {
                return;
            }

            // stop TTS if there is any...
            // This must be a comparison: a single "=" would overwrite the
            // TBD_is_playing constant and cancel whenever the state is truthy.
            if (TBD_is_playing === tts.TBD_state) {
                tts.controller.um_where_is_the_cancel_method(); // all I can see is pause()
            }
            sr.start();
        }

        // Error callback; initializeSpeech() assigns it to sr.onerror.
        // Left as an empty stub in this example: the argument e is received
        // but not used.
        function doSRError(e) {
            // do something informative...
        }

        // Result callback; initializeSpeech() assigns it to sr.onresult.
        // Ignores results that are not final, then dispatches on the
        // "command" property of the top interpretation:
        //   "reply"   -> composeReply(currentMessage, message)
        //   "compose" -> composeNewMessage(subject, recipients, message)
        //   "send"    -> currentComposition.Send()
        //   "readout" -> readout(currentMessage)
        // Throws an Error for any other command.
        function doSRResult(e) {
            // I don't want to write this line. How do I avoid it?
            if (!e.result.final) {
                return;
            }

            // Assume the interpretation is an object with a
            // pile of properties that have been glommed together
            // by the grammar
            var semantics = e.result.item(0).interpretation;
            switch (semantics.command) {
                case "reply":
                    composeReply(currentMessage, semantics.message);
                    break;
                case "compose":
                    composeNewMessage(semantics.subject, semantics.recipients, semantics.message);
                    break;
                case "send":
                    currentComposition.Send();
                    break;
                case "readout":
                    readout(currentMessage);
                    break;
                default:
                    // Throw an Error object rather than a bare string so the
                    // failure carries a stack trace.
                    throw new Error("cannot interpret:" + semantics.command);
            }
        }

        // Speaks a message through the shared tts object.
        // message: object whose "sendername" and "body" properties are read.
        // The spoken text is handed to tts as a "data:" URI, so it is
        // percent-encoded first; raw characters such as "#", "%" or "?" in a
        // message would otherwise corrupt or truncate the URI.
        function readout(message) {
            var text = "message from " + message.sendername + ". Message content: " + message.body;
            tts.src = "data:," + encodeURIComponent(text);
            tts.play();
        }
Received on Wednesday, 2 November 2011 22:38:01 UTC