- From: Young, Milan <Milan.Young@nuance.com>
- Date: Wed, 2 Nov 2011 17:00:52 -0700
- To: Dan Burnett <dburnett@voxeo.com>, <public-xg-htmlspeech@w3.org>
- Message-ID: <1AA381D92997964F898DF2A3AA4FF9AD0D55B97A@SUN-EXCH01.nuance.com>
This example shows the protocol interactions on the "Testing this
example, and launching this missile. <cough>" user utterance.  I still
need to flesh out the API calls and events, but wanted to get some early
feedback, particularly from the protocol team on the following issues:
 
  * What Content-Type do we want to use on an empty message?  The use
case is nulling out a previous candidate recognition (see the "Retracts
spurious result" message below).
  * I am skeptical about changing established MRCP event/method names.
I sort of agree that LISTEN is better than RECOGNIZE, but do not think
the reasons are good enough to warrant the ensuing churn.
  * We need a way to index the recognition results.  I suggest using a
Result-Index header.
  * It was awkward to use a RECOGNITION-COMPLETE message presumably with
a COMPLETE status during continuous speech.  Instead, I used
INTERMEDIATE-RESULT with a new Result-Status header set to final.
  * Perhaps Source-Time should also be required on final results.
  * Wanted to confirm that channel identification is being handled by
the WebSocket container.
  * I noticed that Completion-Cause was missing from Robert's spec
example in section 4.2.
 
 
 
// Preload large grammar
 
C->S: web-speech/1.0 DEFINE-GRAMMAR 257
  Resource-ID:recognizer
  Content-Type:text/uri-list
  Content-ID:Precompile-123
 
  builtin:dictation?type=x-acme-military-control
 
 
S->C: web-speech/1.0 257 200 COMPLETE
  Completion-Cause: 000 Success
 
 
C->S: web-speech/1.0 LISTEN 258
  Resource-ID:recognizer
  Source-Time: 6023
  Listen-Mode: continuous
  Active-Grammars: session:Precompile-123
 
 
S->C: web-speech/1.0 258 200 IN-PROGRESS
 
 
// User starts speaking
 
User Speech: "test"
 
 
S->C: web-speech/1.0 START-OF-SPEECH 258 IN-PROGRESS
  Source-Time: 6035
 
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.65"
       emma:tokens="text"/>
      <emma:interpretation id="int2" emma:confidence="0.60"
       emma:tokens="test"/>
    </emma:one-of>
  </emma:emma>
 
 
API Event1: [0] {"text", "test"}-I
Aggregate result: [{"text", "test"}-I]
 
 
// Modifying top choice
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.65"
       emma:tokens="test"/>
      <emma:interpretation id="int2" emma:confidence="0.50"
       emma:tokens="text"/>
    </emma:one-of>
  </emma:emma>
 
 
API Event2: [0] {"test", "text"}-I
Aggregate result: [{"test", "text"}-I]
 
 
User Speech: "ing"
 
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.70"
       emma:tokens="test sting"/>
      <emma:interpretation id="int2" emma:confidence="0.50"
       emma:tokens="texting"/>
    </emma:one-of>
  </emma:emma>
 
 
API Event3: [0] {"test sting", "texting"}-I
Aggregate result: [ {"test sting", "texting"}-I ]
 
 
User Speech: "this"
 
 
// Combining words within a phrase
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.75"
       emma:tokens="testing this"/>
      <emma:interpretation id="int2" emma:confidence="0.50"
       emma:tokens="texting this"/>
    </emma:one-of>
  </emma:emma>
 
 
API Event4: [0] {"testing this", "texting this"}-I
Aggregate result: [ {"testing this", "texting this"}-I ]
 
 
User Speech: "example <small pause> and"
 
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.75"
       emma:tokens="testing this example and"/>
      <emma:interpretation id="int2" emma:confidence="0.50"
       emma:tokens="texting this sample ant"/>
    </emma:one-of>
  </emma:emma>
 
 
API Event5: [0] {"testing this example and", "texting this sample
ant"}-I
Aggregate result: [ {"testing this example and", "texting this sample
ant"}-I ]
 
 
User Speech: "launching"
 
 
// Moving words across phrase boundaries
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.75"
       emma:tokens="testing this example"/>
      <emma:interpretation id="int2" emma:confidence="0.50"
       emma:tokens="texting this sample"/>
    </emma:one-of>
  </emma:emma>
 
 
API Event6: [0] {"testing this example", "texting this sample"}-I
Aggregate result: [ {"testing this example", "texting this sample"}-I ]
 
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 10032
  Result-Index: 1
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.65"
       emma:tokens="and launching"/>
      <emma:interpretation id="int2" emma:confidence="0.45"
       emma:tokens="ant lunching"/>
    </emma:one-of>
  </emma:emma>
 
 
Event7: [1] {"and launching", "ant lunching"}-I
Aggregate result: [ {"testing this example", "texting this sample"}-I,
{"and launching", "ant lunching"}-I ]
 
 
User Speech: "this missile"
 
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 10032
  Result-Index: 1
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.70"
       emma:tokens="and launching this missile"/>
      <emma:interpretation id="int2" emma:confidence="0.40"
       emma:tokens="ant lunching this thistle"/>
    </emma:one-of>
  </emma:emma>
 
 
Event8: [1] {"and launching this missile", "ant lunching this
thistle"}-I
Aggregate result: [ {"testing this example", "texting this sample"}-I,
{"and launching this missile", "ant lunching this thistle"}-I ]
 
 
// First result is finalized
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS 
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 6035
  Result-Index: 0
  Result-Status: final
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.80"
       emma:tokens="testing this example"/>
      <emma:interpretation id="int2" emma:confidence="0.45"
       emma:tokens="texting this sample"/>
    </emma:one-of>
  </emma:emma>
 
 
Event8: [0] {"testing this example", "texting this sample"}-F
Aggregate result: [ {"testing this example", "texting this sample"}-F,
{"and launching this missile", "ant lunching this thistle"}-I ]
 
 
User Speech: "<cough>"
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Source-Time: 13022
  Result-Index: 2
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.45"
       emma:tokens="confirm"/>
    </emma:one-of>
  </emma:emma>
 
 
Event9: [2] {"confirm"}-I
Aggregate result: [ {"testing this example", "texting this sample"}-F,
{"and launching this missile", "ant lunching this thistle"}-I,
{"confirm"}-I ]
 
 
// Retracts spurious result
 
S->C: web-speech/1.0 INTERMEDIATE-RESULT 258 IN-PROGRESS
  Resource-ID:recognizer
  Result-Index: 2
  Source-Time: 13022
  Result-Status: final
 
 
Event10: [2] {}-F
Aggregate result: [ {"testing this example", "texting this sample"}-F,
{"and launching this missile", "ant lunching this thistle"}-I ]
 
 
// Second result finalized, which completes transaction
 
S->C: web-speech/1.0 RECOGNITION-COMPLETE 258 COMPLETE
  Resource-ID:recognizer
  Content-Type:application/emma+xml
  Result-Index: 1
  Source-Time: 10032
  Result-Status: final
 
  <emma:emma version="1.0" ...>
    <emma:one-of id="r1" emma:mode="voice">
      <emma:interpretation id="int1" emma:confidence="0.75"
       emma:tokens="and launching this missile"/>
      <emma:interpretation id="int2" emma:confidence="0.40"
       emma:tokens="ant lunching this thistle"/>
    </emma:one-of>
  </emma:emma>
 
 
Event11: [1] {"and launching this missile", "ant lunching this
thistle"}-F
Aggregate result: [ {"testing this example", "texting this sample"}-F,
{"and launching this missile", "ant lunching this thistle"}-F ]
 
 
 
 
-----Original Message-----
From: Dan Burnett [mailto:dburnett@voxeo.com] 
Sent: Wednesday, November 02, 2011 4:50 AM
To: public-xg-htmlspeech@w3.org
Subject: Reminder: sample code due **now**
 
If you volunteered (or were volunteered, like Bjorn and Satish) to write
some sample code to address use cases, please send that in immediately.
We will begin reviewing those in tomorrow's f2f meeting.
 
-- dan
 
 
Received on Thursday, 3 November 2011 00:09:34 UTC