Skip to content

Commit c2017e1

Browse files
committed
Merge branch 'main' into feat/audio-levels
2 parents 6c041b4 + 1da7819 commit c2017e1

6 files changed

+73
-24
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,9 @@ ExpoSpeechRecognitionModule.start({
237237
// The maximum number of alternative transcriptions to return.
238238
maxAlternatives: 1,
239239
// [Default: false] Continuous recognition.
240-
// If false on iOS, recognition will run until no speech is detected for 3 seconds.
240+
// If false:
241+
// - on iOS 17-, recognition will run until no speech is detected for 3 seconds.
242+
// - on iOS 18+ and Android, recognition will run until a final result is received.
241243
// Not supported on Android 12 and below.
242244
continuous: true,
243245
// [Default: false] Prevent device from sending audio over the network. Only enabled if the device supports it.

ios/ExpoSpeechRecognitionException.swift

+6-6
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ language-not-supported
2626
The user agent does not support the language specified in the value of lang attribute of the SpeechRecognition object. The set of supported languages is browser-dependent, and from frontend code there is no way to programmatically determine what languages a user's browser supports for speech recognition.
2727
*/
2828

29-
internal class NilRecognizerException: Exception {
29+
internal final class NilRecognizerException: Exception {
3030
override var code: String {
3131
"audio-capture"
3232
}
@@ -36,7 +36,7 @@ internal class NilRecognizerException: Exception {
3636
}
3737
}
3838

39-
internal class PermissionException: Exception {
39+
internal final class PermissionException: Exception {
4040
override var code: String {
4141
"not-allowed"
4242
}
@@ -45,7 +45,7 @@ internal class PermissionException: Exception {
4545
}
4646
}
4747

48-
internal class NotAuthorizedException: Exception {
48+
internal final class NotAuthorizedException: Exception {
4949
override var code: String {
5050
"not-allowed"
5151
}
@@ -55,7 +55,7 @@ internal class NotAuthorizedException: Exception {
5555
}
5656
}
5757

58-
internal class NotPermittedToRecordException: Exception {
58+
internal final class NotPermittedToRecordException: Exception {
5959
override var code: String {
6060
"not-allowed"
6161
}
@@ -64,7 +64,7 @@ internal class NotPermittedToRecordException: Exception {
6464
}
6565
}
6666

67-
internal class InvalidAudioModeException: GenericException<String> {
67+
internal final class InvalidAudioModeException: GenericException<String> {
6868
override var code: String {
6969
"audio-capture"
7070
}
@@ -73,7 +73,7 @@ internal class InvalidAudioModeException: GenericException<String> {
7373
}
7474
}
7575

76-
internal class RecognizerUnavilableException: Exception {
76+
internal final class RecognizerUnavilableException: Exception {
7777
override var code: String {
7878
"service-not-allowed"
7979
}

ios/ExpoSpeechRecognitionModule.swift

+30-5
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ public class ExpoSpeechRecognitionModule: Module {
4141
// This is a temporary workaround until the issue is fixed in a future iOS release
4242
var hasSeenFinalResult: Bool = false
4343

44+
// Hack for iOS 18 to avoid sending a "nomatch" event after the final-final result
45+
// Example event order emitted in iOS 18:
46+
// [
47+
// { isFinal: false, transcripts: ["actually", "final", "results"], metadata: { duration: 1500 } },
48+
// { isFinal: true, transcripts: [] }
49+
// ]
50+
var previousResult: SFSpeechRecognitionResult?
51+
4452
public func definition() -> ModuleDefinition {
4553
// Sets the name of the module that JavaScript code will use to refer to the module. Takes a string as an argument.
4654
// Can be inferred from module's class name, but it's recommended to set it explicitly for clarity.
@@ -132,6 +140,9 @@ public class ExpoSpeechRecognitionModule: Module {
132140
do {
133141
let currentLocale = await speechRecognizer?.getLocale()
134142

143+
// Reset the previous result
144+
self.previousResult = nil
145+
135146
// Re-create the speech recognizer when locales change
136147
if self.speechRecognizer == nil || currentLocale != options.lang {
137148
guard let locale = resolveLocale(localeIdentifier: options.lang) else {
@@ -363,12 +374,14 @@ public class ExpoSpeechRecognitionModule: Module {
363374

364375
func sendErrorAndStop(error: String, message: String) {
365376
hasSeenFinalResult = false
377+
previousResult = nil
366378
sendEvent("error", ["error": error, "message": message])
367379
sendEvent("end")
368380
}
369381

370382
func handleEnd() {
371383
hasSeenFinalResult = false
384+
previousResult = nil
372385
sendEvent("end")
373386
}
374387

@@ -427,11 +440,21 @@ public class ExpoSpeechRecognitionModule: Module {
427440
}
428441

429442
if isFinal && results.isEmpty {
430-
// https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition/nomatch_event
431-
// The nomatch event of the Web Speech API is fired
432-
// when the speech recognition service returns a final result with no significant recognition.
433-
sendEvent("nomatch")
434-
return
443+
// Hack for iOS 18 to avoid sending a "nomatch" event after the final-final result
444+
var previousResultWasFinal = false
445+
var previousResultHadTranscriptions = false
446+
if #available(iOS 18.0, *), let previousResult = previousResult {
447+
previousResultWasFinal = previousResult.speechRecognitionMetadata?.speechDuration ?? 0 > 0
448+
previousResultHadTranscriptions = !previousResult.transcriptions.isEmpty
449+
}
450+
451+
if !previousResultWasFinal || !previousResultHadTranscriptions {
452+
// https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition/nomatch_event
453+
// The nomatch event of the Web Speech API is fired
454+
// when the speech recognition service returns a final result with no significant recognition.
455+
sendEvent("nomatch")
456+
return
457+
}
435458
}
436459

437460
sendEvent(
@@ -441,6 +464,8 @@ public class ExpoSpeechRecognitionModule: Module {
441464
"results": results.map { $0.toDictionary() },
442465
]
443466
)
467+
468+
previousResult = result
444469
}
445470

446471
func handleRecognitionError(_ error: Error) {

ios/ExpoSpeechRecognizer.swift

+29-10
Original file line numberDiff line numberDiff line change
@@ -295,8 +295,8 @@ actor ExpoSpeechRecognizer: ObservableObject {
295295
)
296296
}
297297

298-
// Don't run any timers if the audio source is from a file
299-
let continuous = options.continuous || isSourcedFromFile
298+
// Run timers in non-continuous mode, as long as the audio source is the mic
299+
let shouldRunTimers = !options.continuous && !isSourcedFromFile
300300
let audioEngine = self.audioEngine
301301

302302
self.task = recognizer.recognitionTask(
@@ -309,18 +309,20 @@ actor ExpoSpeechRecognizer: ObservableObject {
309309
}
310310
}
311311

312-
// Result handler
312+
// Handle the result
313313
self?.recognitionHandler(
314314
audioEngine: audioEngine,
315315
result: result,
316316
error: error,
317317
resultHandler: resultHandler,
318318
errorHandler: errorHandler,
319-
continuous: continuous
319+
continuous: options.continuous,
320+
shouldRunTimers: shouldRunTimers,
321+
canEmitInterimResults: options.interimResults
320322
)
321323
})
322324

323-
if !continuous {
325+
if shouldRunTimers {
324326
invalidateAndScheduleTimer()
325327
}
326328

@@ -463,7 +465,10 @@ actor ExpoSpeechRecognizer: ObservableObject {
463465
request = SFSpeechAudioBufferRecognitionRequest()
464466
}
465467

466-
request.shouldReportPartialResults = options.interimResults
468+
// We also force-enable partial results on non-continuous mode,
469+
// which will allow us to re-schedule timers when text is detected
470+
// These won't get emitted to the user, however
471+
request.shouldReportPartialResults = options.interimResults || !options.continuous
467472

468473
if recognizer.supportsOnDeviceRecognition {
469474
request.requiresOnDeviceRecognition = options.requiresOnDeviceRecognition
@@ -695,12 +700,25 @@ actor ExpoSpeechRecognizer: ObservableObject {
695700
error: Error?,
696701
resultHandler: @escaping (SFSpeechRecognitionResult) -> Void,
697702
errorHandler: @escaping (Error) -> Void,
698-
continuous: Bool
703+
continuous: Bool,
704+
shouldRunTimers: Bool,
705+
canEmitInterimResults: Bool
699706
) {
707+
// When a final result is returned, we should expect the task to be idle or stopping
700708
let receivedFinalResult = result?.isFinal ?? false
701709
let receivedError = error != nil
702710

703-
if let result: SFSpeechRecognitionResult {
711+
// Hack for iOS 18 to detect final results
712+
// See: https://forums.developer.apple.com/forums/thread/762952 for more info
713+
// This can be emitted multiple times during a continuous session, unlike `result.isFinal` which is only emitted once
714+
var receivedFinalLikeResult: Bool = receivedFinalResult
715+
if #available(iOS 18.0, *), !receivedFinalLikeResult {
716+
receivedFinalLikeResult = result?.speechRecognitionMetadata?.speechDuration ?? 0 > 0
717+
}
718+
719+
let shouldEmitResult = receivedFinalResult || canEmitInterimResults || receivedFinalLikeResult
720+
721+
if let result: SFSpeechRecognitionResult, shouldEmitResult {
704722
Task { @MainActor in
705723
let taskState = await task?.state
706724
// Make sure the task is running before emitting the result
@@ -720,15 +738,16 @@ actor ExpoSpeechRecognizer: ObservableObject {
720738
}
721739
}
722740

723-
if receivedFinalResult || receivedError {
741+
if (receivedFinalLikeResult && !continuous) || receivedError || receivedFinalResult {
724742
Task { @MainActor in
725743
await reset()
726744
}
745+
return
727746
}
728747

729748
// Non-continuous speech recognition
730749
// Stop the speech recognizer if the timer fires after not receiving a result for 3 seconds
731-
if !continuous && !receivedError {
750+
if shouldRunTimers && !receivedError {
732751
invalidateAndScheduleTimer()
733752
}
734753
}

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "expo-speech-recognition",
3-
"version": "0.2.22",
3+
"version": "0.2.23",
44
"description": "Speech Recognition for React Native Expo projects",
55
"main": "build/index.js",
66
"types": "build/index.d.ts",

src/ExpoSpeechRecognitionModule.types.ts

+4-1
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,10 @@ export type ExpoSpeechRecognitionOptions = {
157157
*
158158
* Not supported on Android 12 and below.
159159
*
160-
* If false on iOS, recognition will run until no speech is detected for 3 seconds.
160+
 * If false, the behavior is as follows:
161+
*
162+
* - on iOS 17-, recognition will run until no speech is detected for 3 seconds.
163+
* - on iOS 18+ and Android, recognition will run until a result with `isFinal: true` is received.
161164
*/
162165
continuous?: boolean;
163166
/** [Default: false] Prevent device from sending audio over the network. Only enabled if the device supports it.

0 commit comments

Comments
 (0)