diff --git a/.changeset/dull-ligers-bow.md b/.changeset/dull-ligers-bow.md
new file mode 100644
index 00000000000..95e813f3fb6
--- /dev/null
+++ b/.changeset/dull-ligers-bow.md
@@ -0,0 +1,6 @@
+---
+'firebase': minor
+'@firebase/ai': minor
+---
+
+Add `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` to the `LiveSession` class, and deprecate `sendMediaChunks()` and `sendMediaStream()`.
diff --git a/common/api-review/ai.api.md b/common/api-review/ai.api.md
index d3c43d906fd..91bc421b516 100644
--- a/common/api-review/ai.api.md
+++ b/common/api-review/ai.api.md
@@ -991,12 +991,18 @@ export class LiveSession {
     constructor(webSocketHandler: WebSocketHandler, serverMessages: AsyncGenerator);
     close(): Promise<void>;
     inConversation: boolean;
+    inVideoRecording: boolean;
     isClosed: boolean;
     receive(): AsyncGenerator;
     send(request: string | Array, turnComplete?: boolean): Promise<void>;
+    sendAudioRealtime(blob: GenerativeContentBlob): Promise<void>;
     sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
+    // @deprecated
     sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
+    // @deprecated (undocumented)
     sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
+    sendTextRealtime(text: string): Promise<void>;
+    sendVideoRealtime(blob: GenerativeContentBlob): Promise<void>;
 }
 
 // @public
@@ -1279,6 +1285,14 @@ export interface StartChatParams extends BaseParams {
     tools?: Tool[];
 }
 
+// @beta
+export function startVideoRecording(liveSession: LiveSession, options?: StartVideoRecordingOptions): Promise<VideoRecordingController>;
+
+// @beta
+export interface StartVideoRecordingOptions {
+    videoSource?: 'camera' | 'screen';
+}
+
 // @public
 export class StringSchema extends Schema {
     constructor(schemaParams?: SchemaParams, enumValues?: string[]);
@@ -1390,6 +1404,11 @@ export interface VideoMetadata {
     startOffset: string;
 }
 
+// @beta
+export interface VideoRecordingController {
+    stop: () => Promise<void>;
+}
+
 // @beta
 export interface VoiceConfig {
     prebuiltVoiceConfig?: PrebuiltVoiceConfig;
diff --git a/docs-devsite/_toc.yaml b/docs-devsite/_toc.yaml
index 04d65f6c333..6b0428cc37f 100644
--- a/docs-devsite/_toc.yaml
+++ b/docs-devsite/_toc.yaml
@@ -194,6 +194,8 @@ toc:
     path: /docs/reference/js/ai.startaudioconversationoptions.md
   - title: StartChatParams
     path: /docs/reference/js/ai.startchatparams.md
+  - title: StartVideoRecordingOptions
+    path: /docs/reference/js/ai.startvideorecordingoptions.md
   - title: StringSchema
     path: /docs/reference/js/ai.stringschema.md
   - title: TextPart
@@ -216,6 +218,8 @@ toc:
     path: /docs/reference/js/ai.vertexaibackend.md
   - title: VideoMetadata
     path: /docs/reference/js/ai.videometadata.md
+  - title: VideoRecordingController
+    path: /docs/reference/js/ai.videorecordingcontroller.md
   - title: VoiceConfig
     path: /docs/reference/js/ai.voiceconfig.md
diff --git a/docs-devsite/ai.livesession.md b/docs-devsite/ai.livesession.md
index 558c5eb3bd6..c54c9d5fe3f 100644
--- a/docs-devsite/ai.livesession.md
+++ b/docs-devsite/ai.livesession.md
@@ -29,7 +29,8 @@ export declare class LiveSession
 
 | Property | Modifiers | Type | Description |
 | --- | --- | --- | --- |
-| [inConversation](./ai.livesession.md#livesessioninconversation) | | boolean | (Public Preview) Indicates whether this Live session is being controlled by an AudioConversationController. |
+| [inConversation](./ai.livesession.md#livesessioninconversation) | | boolean | (Public Preview) Indicates whether this Live session is being controlled by an [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface). |
+| [inVideoRecording](./ai.livesession.md#livesessioninvideorecording) | | boolean | (Public Preview) Indicates whether this Live session is being controlled by a [VideoRecordingController](./ai.videorecordingcontroller.md#videorecordingcontroller_interface). |
 | [isClosed](./ai.livesession.md#livesessionisclosed) | | boolean | (Public Preview) Indicates whether this Live session is closed. |
 
 ## Methods
 
@@ -39,16 +40,19 @@
 | [close()](./ai.livesession.md#livesessionclose) | | (Public Preview) Closes this session. All methods on this session will throw an error once this resolves. |
 | [receive()](./ai.livesession.md#livesessionreceive) | | (Public Preview) Yields messages received from the server. This can only be used by one consumer at a time. |
 | [send(request, turnComplete)](./ai.livesession.md#livesessionsend) | | (Public Preview) Sends content to the server. |
+| [sendAudioRealtime(blob)](./ai.livesession.md#livesessionsendaudiorealtime) | | (Public Preview) Sends audio data to the server in realtime. |
 | [sendFunctionResponses(functionResponses)](./ai.livesession.md#livesessionsendfunctionresponses) | | (Public Preview) Sends function responses to the server. |
 | [sendMediaChunks(mediaChunks)](./ai.livesession.md#livesessionsendmediachunks) | | (Public Preview) Sends realtime input to the server. |
-| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). |
+| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) |
+| [sendTextRealtime(text)](./ai.livesession.md#livesessionsendtextrealtime) | | (Public Preview) Sends text to the server in realtime. |
+| [sendVideoRealtime(blob)](./ai.livesession.md#livesessionsendvideorealtime) | | (Public Preview) Sends video data to the server in realtime. |
 
 ## LiveSession.inConversation
 
 > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
 >
 
-Indicates whether this Live session is being controlled by an `AudioConversationController`.
+Indicates whether this Live session is being controlled by an [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface).
 
 Signature:
 
 ```typescript
 inConversation: boolean;
 ```
 
+## LiveSession.inVideoRecording
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Indicates whether this Live session is being controlled by a [VideoRecordingController](./ai.videorecordingcontroller.md#videorecordingcontroller_interface).
+
+Signature:
+
+```typescript
+inVideoRecording: boolean;
+```
+
 ## LiveSession.isClosed
 
 > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
 >
 
@@ -135,6 +152,45 @@
 Promise<void>
 
 If this session has been closed.
 
+## LiveSession.sendAudioRealtime()
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Sends audio data to the server in realtime.
+
+The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz little-endian.
+
+Signature:
+
+```typescript
+sendAudioRealtime(blob: GenerativeContentBlob): Promise<void>;
+```
+
+#### Parameters
+
+| Parameter | Type | Description |
+| --- | --- | --- |
+| blob | [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface) | The base64-encoded PCM data to send to the server in realtime. |
+
+Returns:
+
+Promise<void>
+
+#### Exceptions
+
+If this session has been closed.
+
+### Example
+
+```javascript
+// const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian.
+const blob = { mimeType: "audio/pcm", data: pcmData };
+liveSession.sendAudioRealtime(blob);
+
+```
+
 ## LiveSession.sendFunctionResponses()
 
 > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
 >
 
@@ -167,6 +223,11 @@
 > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
 >
 
+> Warning: This API is now obsolete.
+>
+> Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+>
+
 Sends realtime input to the server.
 
 Signature:
 
@@ -194,7 +255,12 @@
 > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
 >
 
-Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface).
+> Warning: This API is now obsolete.
+>
+> Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+>
+> Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface).
+>
 
 Signature:
 
@@ -216,3 +282,77 @@
 Promise<void>
 
 If this session has been closed.
 
+## LiveSession.sendTextRealtime()
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Sends text to the server in realtime.
+
+Signature:
+
+```typescript
+sendTextRealtime(text: string): Promise<void>;
+```
+
+#### Parameters
+
+| Parameter | Type | Description |
+| --- | --- | --- |
+| text | string | The text data to send. |
+
+Returns:
+
+Promise<void>
+
+#### Exceptions
+
+If this session has been closed.
+
+### Example
+
+```javascript
+liveSession.sendTextRealtime("Hello, how are you?");
+
+```
+
+## LiveSession.sendVideoRealtime()
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Sends video data to the server in realtime.
+
+The server requires that the video is sent as individual video frames at 1 FPS. It is recommended to set `mimeType` to `image/jpeg`.
+
+Signature:
+
+```typescript
+sendVideoRealtime(blob: GenerativeContentBlob): Promise<void>;
+```
+
+#### Parameters
+
+| Parameter | Type | Description |
+| --- | --- | --- |
+| blob | [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface) | The base64-encoded video data to send to the server in realtime. |
+
+Returns:
+
+Promise<void>
+
+#### Exceptions
+
+If this session has been closed.
+
+### Example
+
+```javascript
+// const videoFrame = ... base64-encoded JPEG data
+const blob = { mimeType: "image/jpeg", data: videoFrame };
+liveSession.sendVideoRealtime(blob);
+
+```
+
diff --git a/docs-devsite/ai.md b/docs-devsite/ai.md
index c6e8fa365f7..52357256c24 100644
--- a/docs-devsite/ai.md
+++ b/docs-devsite/ai.md
@@ -24,6 +24,7 @@ The Firebase AI Web SDK.
 | [getLiveGenerativeModel(ai, modelParams)](./ai.md#getlivegenerativemodel_f2099ac) | (Public Preview) Returns a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) class for real-time, bidirectional communication.The Live API is only supported in modern browser windows and Node >= 22. |
 | function(liveSession, ...) |
 | [startAudioConversation(liveSession, options)](./ai.md#startaudioconversation_01c8e7f) | (Public Preview) Starts a real-time, bidirectional audio conversation with the model. This helper function manages the complexities of microphone access, audio recording, playback, and interruptions. |
+| [startVideoRecording(liveSession, options)](./ai.md#startvideorecording_762a78a) | (Public Preview) Starts a real-time, unidirectional video stream to the model. This helper function manages the complexities of video source access, frame capture, and encoding. |
 
 ## Classes
 
@@ -131,6 +132,7 @@ The Firebase AI Web SDK.
 | [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | (Public Preview) Configures speech synthesis. |
 | [StartAudioConversationOptions](./ai.startaudioconversationoptions.md#startaudioconversationoptions_interface) | (Public Preview) Options for [startAudioConversation()](./ai.md#startaudioconversation_01c8e7f). |
 | [StartChatParams](./ai.startchatparams.md#startchatparams_interface) | Params for [GenerativeModel.startChat()](./ai.generativemodel.md#generativemodelstartchat). |
+| [StartVideoRecordingOptions](./ai.startvideorecordingoptions.md#startvideorecordingoptions_interface) | (Public Preview) Options for startVideoRecording. |
 | [TextPart](./ai.textpart.md#textpart_interface) | Content part interface if the part represents a text string. |
 | [ThinkingConfig](./ai.thinkingconfig.md#thinkingconfig_interface) | Configuration for "thinking" behavior of compatible Gemini models.Certain models utilize a thinking process before generating a response. This allows them to reason through complex problems and plan a more coherent and accurate answer. |
 | [ToolConfig](./ai.toolconfig.md#toolconfig_interface) | Tool config. This config is shared for all tools provided in the request. |
@@ -140,6 +142,7 @@ The Firebase AI Web SDK.
 | [URLMetadata](./ai.urlmetadata.md#urlmetadata_interface) | (Public Preview) Metadata for a single URL retrieved by the [URLContextTool](./ai.urlcontexttool.md#urlcontexttool_interface) tool. |
 | [UsageMetadata](./ai.usagemetadata.md#usagemetadata_interface) | Usage metadata about a [GenerateContentResponse](./ai.generatecontentresponse.md#generatecontentresponse_interface). |
 | [VideoMetadata](./ai.videometadata.md#videometadata_interface) | Describes the input video content. |
+| [VideoRecordingController](./ai.videorecordingcontroller.md#videorecordingcontroller_interface) | (Public Preview) A controller for managing an active video recording session. |
 | [VoiceConfig](./ai.voiceconfig.md#voiceconfig_interface) | (Public Preview) Configuration for the voice to be used in speech synthesis. |
 | [WebAttribution](./ai.webattribution.md#webattribution_interface) | |
 | [WebGroundingChunk](./ai.webgroundingchunk.md#webgroundingchunk_interface) | A grounding chunk from the web.Important: If using Grounding with Google Search, you are required to comply with the [Service Specific Terms](https://cloud.google.com/terms/service-terms) for "Grounding with Google Search". |
@@ -410,6 +413,66 @@ async function startConversation() {
 }
 ```
 
+### startVideoRecording(liveSession, options) {:#startvideorecording_762a78a}
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Starts a real-time, unidirectional video stream to the model. This helper function manages the complexities of video source access, frame capture, and encoding.
+
+Important: This function must be called in response to a user gesture (e.g., a button click) to comply with browser security policies for accessing camera or screen content. The backend requires video frames to be sent at 1 FPS as individual JPEGs. This helper enforces that constraint.
+
+Signature:
+
+```typescript
+export declare function startVideoRecording(liveSession: LiveSession, options?: StartVideoRecordingOptions): Promise<VideoRecordingController>;
+```
+
+#### Parameters
+
+| Parameter | Type | Description |
+| --- | --- | --- |
+| liveSession | [LiveSession](./ai.livesession.md#livesession_class) | An active [LiveSession](./ai.livesession.md#livesession_class) instance. |
+| options | [StartVideoRecordingOptions](./ai.startvideorecordingoptions.md#startvideorecordingoptions_interface) | Configuration options for the video recording. |
+
+Returns:
+
+Promise<[VideoRecordingController](./ai.videorecordingcontroller.md#videorecordingcontroller_interface)>
+
+A `Promise` that resolves with a `VideoRecordingController`.
+
+#### Exceptions
+
+`AIError` if the environment is unsupported, a recording is active, or the session is closed.
+
+`DOMException` if issues occur with media access (e.g., permissions denied).
+
+### Example
+
+```javascript
+const liveSession = await model.connect();
+let videoController;
+
+// This function must be called from within a click handler.
+async function startRecording() {
+  try {
+    videoController = await startVideoRecording(liveSession, {
+      videoSource: 'screen' // or 'camera'
+    });
+  } catch (e) {
+    // Handle AI-specific errors, DOMExceptions for permissions, etc.
+    console.error("Failed to start video recording:", e);
+  }
+}
+
+// To stop the recording later:
+// if (videoController) {
+//   await videoController.stop();
+// }
+
+```
+
 ## AIErrorCode
 
 Standardized error codes that [AIError](./ai.aierror.md#aierror_class) can have.
diff --git a/docs-devsite/ai.startvideorecordingoptions.md b/docs-devsite/ai.startvideorecordingoptions.md
new file mode 100644
index 00000000000..1165390ca75
--- /dev/null
+++ b/docs-devsite/ai.startvideorecordingoptions.md
@@ -0,0 +1,41 @@
+Project: /docs/reference/js/_project.yaml
+Book: /docs/reference/_book.yaml
+page_type: reference
+
+{% comment %}
+DO NOT EDIT THIS FILE!
+This is generated by the JS SDK team, and any local changes will be
+overwritten. Changes should be made in the source code at
+https://github.com/firebase/firebase-js-sdk
+{% endcomment %}
+
+# StartVideoRecordingOptions interface
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Options for `startVideoRecording`.
+
+Signature:
+
+```typescript
+export interface StartVideoRecordingOptions
+```
+
+## Properties
+
+| Property | Type | Description |
+| --- | --- | --- |
+| [videoSource](./ai.startvideorecordingoptions.md#startvideorecordingoptionsvideosource) | 'camera' \| 'screen' | (Public Preview) Specifies the source of the video stream. Can be either the user's camera or their screen (screen, window, or a tab). Defaults to 'camera'. |
+
+## StartVideoRecordingOptions.videoSource
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Specifies the source of the video stream. Can be either the user's camera or their screen (screen, window, or a tab). Defaults to `'camera'`.
+
+Signature:
+
+```typescript
+videoSource?: 'camera' | 'screen';
+```
diff --git a/docs-devsite/ai.videorecordingcontroller.md b/docs-devsite/ai.videorecordingcontroller.md
new file mode 100644
index 00000000000..f173ed1d5b5
--- /dev/null
+++ b/docs-devsite/ai.videorecordingcontroller.md
@@ -0,0 +1,41 @@
+Project: /docs/reference/js/_project.yaml
+Book: /docs/reference/_book.yaml
+page_type: reference
+
+{% comment %}
+DO NOT EDIT THIS FILE!
+This is generated by the JS SDK team, and any local changes will be
+overwritten. Changes should be made in the source code at
+https://github.com/firebase/firebase-js-sdk
+{% endcomment %}
+
+# VideoRecordingController interface
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+A controller for managing an active video recording session.
+
+Signature:
+
+```typescript
+export interface VideoRecordingController
+```
+
+## Properties
+
+| Property | Type | Description |
+| --- | --- | --- |
+| [stop](./ai.videorecordingcontroller.md#videorecordingcontrollerstop) | () => Promise<void> | (Public Preview) Stops the video recording, closes the media connection, and cleans up resources. Returns a promise that resolves when cleanup is complete. |
+
+## VideoRecordingController.stop
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Stops the video recording, closes the media connection, and cleans up resources. Returns a promise that resolves when cleanup is complete.
+
+Signature:
+
+```typescript
+stop: () => Promise<void>;
+```
diff --git a/packages/ai/integration/live.test.ts b/packages/ai/integration/live.test.ts
index caa18970ab7..0af5bef2242 100644
--- a/packages/ai/integration/live.test.ts
+++ b/packages/ai/integration/live.test.ts
@@ -154,6 +154,45 @@ describe('Live', function () {
     });
   });
 
+  describe('sendTextRealtime()', () => {
+    it('should send a single text chunk and receive a response', async () => {
+      const model = getLiveGenerativeModel(testConfig.ai, {
+        model: testConfig.model,
+        generationConfig: textLiveGenerationConfig
+      });
+      const session = await model.connect();
+      const responsePromise = nextTurnText(session.receive());
+
+      await session.sendTextRealtime('Are you an AI? Yes or No.');
+
+      const responseText = await responsePromise;
+      expect(responseText).to.include('Yes');
+
+      await session.close();
+    });
+  });
+
+  describe('sendAudioRealtime()', () => {
+    it('should send a single audio chunk and receive a response', async () => {
+      const model = getLiveGenerativeModel(testConfig.ai, {
+        model: testConfig.model,
+        generationConfig: textLiveGenerationConfig
+      });
+      const session = await model.connect();
+      const responsePromise = nextTurnText(session.receive());
+
+      await session.sendAudioRealtime({
+        data: HELLO_AUDIO_PCM_BASE64, // "Hey, can you hear me?"
+        mimeType: 'audio/pcm'
+      });
+
+      const responseText = await responsePromise;
+      expect(responseText).to.include('Yes');
+
+      await session.close();
+    });
+  });
+
   describe('sendMediaChunks()', () => {
     it('should send a single audio chunk and receive a response', async () => {
       const model = getLiveGenerativeModel(testConfig.ai, {
diff --git a/packages/ai/src/api.ts b/packages/ai/src/api.ts
index fc789f303a4..4237cdaf1db 100644
--- a/packages/ai/src/api.ts
+++ b/packages/ai/src/api.ts
@@ -49,7 +49,10 @@ export { Backend, VertexAIBackend, GoogleAIBackend } from './backend';
 export {
   startAudioConversation,
   AudioConversationController,
-  StartAudioConversationOptions
+  StartAudioConversationOptions,
+  startVideoRecording,
+  StartVideoRecordingOptions,
+  VideoRecordingController
 } from './methods/live-session-helpers';
 
 declare module '@firebase/component' {
diff --git a/packages/ai/src/methods/live-session-helpers.test.ts b/packages/ai/src/methods/live-session-helpers.test.ts
index cad0475b358..a62315c701d 100644
--- a/packages/ai/src/methods/live-session-helpers.test.ts
+++ b/packages/ai/src/methods/live-session-helpers.test.ts
@@ -65,7 +65,7 @@ class MockLiveSession {
   isClosed = false;
   inConversation = false;
   send = sinon.stub();
-  sendMediaChunks = sinon.stub();
+  sendAudioRealtime = sinon.stub();
   sendFunctionResponses = sinon.stub();
   messageGenerator = new MockMessageGenerator();
   receive = (): MockMessageGenerator => this.messageGenerator;
@@ -226,8 +226,8 @@ describe('Audio Conversation Helpers', () => {
 
       await clock.tickAsync(1);
 
-      expect(liveSession.sendMediaChunks).to.have.been.calledOnce;
-      const [sentChunk] = liveSession.sendMediaChunks.getCall(0).args[0];
+      expect(liveSession.sendAudioRealtime).to.have.been.calledOnce;
+      const sentChunk = liveSession.sendAudioRealtime.getCall(0).args[0];
       expect(sentChunk.mimeType).to.equal('audio/pcm');
       expect(sentChunk.data).to.be.a('string');
       await controller.stop();
diff --git a/packages/ai/src/methods/live-session-helpers.ts b/packages/ai/src/methods/live-session-helpers.ts
index b3907d6219b..08df04799dd 100644
--- a/packages/ai/src/methods/live-session-helpers.ts
+++ b/packages/ai/src/methods/live-session-helpers.ts
@@ -29,6 +29,7 @@ import { Deferred } from '@firebase/util';
 
 const SERVER_INPUT_SAMPLE_RATE = 16_000;
 const SERVER_OUTPUT_SAMPLE_RATE = 24_000;
+const VIDEO_FRAME_RATE = 1; // Backend requires 1 FPS.
 
 const AUDIO_PROCESSOR_NAME = 'audio-processor';
@@ -184,7 +185,7 @@
         mimeType: 'audio/pcm',
         data: base64
       };
-      void this.liveSession.sendMediaChunks([chunk]);
+      void this.liveSession.sendAudioRealtime(chunk);
     };
   }
 
@@ -495,3 +496,248 @@ export async function startAudioConversation(
     );
   }
 }
+
+/**
+ * A controller for managing an active video recording session.
+ *
+ * @beta
+ */
+export interface VideoRecordingController {
+  /**
+   * Stops the video recording, closes the media connection, and
+   * cleans up resources. Returns a promise that resolves when cleanup is complete.
+   */
+  stop: () => Promise<void>;
+}
+
+/**
+ * Options for `startVideoRecording`.
+ *
+ * @beta
+ */
+export interface StartVideoRecordingOptions {
+  /**
+   * Specifies the source of the video stream. Can be either the user's camera or their screen (screen, window, or a tab). Defaults to `'camera'`.
+   */
+  videoSource?: 'camera' | 'screen';
+}
+
+/**
+ * Encapsulates the core logic of a video recording session, managing the
+ * capture loop and resource cleanup.
+ * @internal
+ */
+class VideoRecordingRunner {
+  private isStopped = false;
+  private readonly mediaStream: MediaStream;
+  private readonly videoElement: HTMLVideoElement;
+  private readonly canvasElement: HTMLCanvasElement;
+  private readonly intervalId: number;
+
+  constructor(
+    private readonly liveSession: LiveSession,
+    mediaStream: MediaStream,
+    videoElement: HTMLVideoElement
+  ) {
+    this.mediaStream = mediaStream;
+    this.videoElement = videoElement;
+    this.liveSession.inVideoRecording = true;
+
+    this.canvasElement = document.createElement('canvas');
+
+    // Start a loop to capture and send a frame every second (1 FPS),
+    // adhering to the backend's requirement.
+    this.intervalId = setInterval(() => {
+      this.captureAndSendFrame();
+    }, 1000 / VIDEO_FRAME_RATE) as unknown as number; // The Node setInterval returns a Timeout, but we're using the browser API, which returns a number.
+  }
+
+  /**
+   * Captures a single frame from the video stream, encodes it as a JPEG,
+   * and sends it to the LiveSession.
+   */
+  private captureAndSendFrame(): void {
+    if (this.isStopped) {
+      return;
+    }
+    // readyState < 2 means the video is not yet ready to play or has not loaded metadata.
+    // videoWidth === 0 is another check to ensure dimensions are available.
+    if (
+      this.videoElement.readyState < 2 ||
+      this.videoElement.videoWidth === 0
+    ) {
+      return;
+    }
+    this.canvasElement.width = this.videoElement.videoWidth;
+    this.canvasElement.height = this.videoElement.videoHeight;
+    const context = this.canvasElement.getContext('2d');
+    if (!context) {
+      // This should realistically never happen in a supported browser environment.
+      logger.error(
+        'Could not get 2D context from canvas to capture video frame.'
+      );
+      return;
+    }
+    context.drawImage(this.videoElement, 0, 0);
+
+    // Convert the canvas content to a base64 JPEG. The '0.8' is a quality setting,
+    // balancing file size and image quality.
+    const dataUrl = this.canvasElement.toDataURL('image/jpeg', 0.8);
+    // The data URL includes a prefix like "data:image/jpeg;base64," which must be removed.
+    const base64Data = dataUrl.split(',')[1];
+    if (!base64Data) {
+      logger.warn('Failed to extract base64 data from captured video frame.');
+      return;
+    }
+    const blob: GenerativeContentBlob = {
+      mimeType: 'image/jpeg',
+      data: base64Data
+    };
+    // Send the frame to the server. We use 'void' as this is a fire-and-forget
+    // operation within the loop. Errors at the session level should be handled by the user.
+    void this.liveSession.sendVideoRealtime(blob);
+  }
+
+  /**
+   * Stops the video capture loop and cleans up all associated resources.
+   */
+  async stop(): Promise<void> {
+    if (this.isStopped) {
+      return;
+    }
+    this.isStopped = true;
+    clearInterval(this.intervalId);
+
+    // Stop all tracks on the media stream (e.g., camera light turns off, screen sharing UI disappears).
+    this.mediaStream.getTracks().forEach(track => track.stop());
+
+    // Clean up in-memory video element to release resources.
+    this.videoElement.srcObject = null;
+
+    // Reset the session flag to allow a new recording to start on the same session.
+    this.liveSession.inVideoRecording = false;
+  }
+}
+
+/**
+ * Starts a real-time, unidirectional video stream to the model. This helper function manages
+ * the complexities of video source access, frame capture, and encoding.
+ *
+ * @remarks
+ * Important: This function must be called in response to a user gesture
+ * (e.g., a button click) to comply with browser security policies for
+ * accessing camera or screen content. The backend requires video frames to be
+ * sent at 1 FPS as individual JPEGs. This helper enforces that constraint.
+ *
+ * @example
+ * ```javascript
+ * const liveSession = await model.connect();
+ * let videoController;
+ *
+ * // This function must be called from within a click handler.
+ * async function startRecording() {
+ *   try {
+ *     videoController = await startVideoRecording(liveSession, {
+ *       videoSource: 'screen' // or 'camera'
+ *     });
+ *   } catch (e) {
+ *     // Handle AI-specific errors, DOMExceptions for permissions, etc.
+ *     console.error("Failed to start video recording:", e);
+ *   }
+ * }
+ *
+ * // To stop the recording later:
+ * // if (videoController) {
+ * //   await videoController.stop();
+ * // }
+ * ```
+ *
+ * @param liveSession - An active {@link LiveSession} instance.
+ * @param options - Configuration options for the video recording.
+ * @returns A `Promise` that resolves with a `VideoRecordingController`.
+ * @throws `AIError` if the environment is unsupported, a recording is active, or the session is closed.
+ * @throws `DOMException` if issues occur with media access (e.g., permissions denied).
+ *
+ * @beta
+ */
+export async function startVideoRecording(
+  liveSession: LiveSession,
+  options: StartVideoRecordingOptions = {}
+): Promise<VideoRecordingController> {
+  if (liveSession.isClosed) {
+    throw new AIError(
+      AIErrorCode.SESSION_CLOSED,
+      'Cannot start video recording on a closed LiveSession.'
+    );
+  }
+
+  if (liveSession.inVideoRecording) {
+    throw new AIError(
+      AIErrorCode.REQUEST_ERROR,
+      'A video recording is already in progress for this session.'
+    );
+  }
+
+  // Check for necessary browser API support.
+  if (
+    typeof navigator === 'undefined' ||
+    !navigator.mediaDevices ||
+    typeof document === 'undefined' // Used for creating in-memory elements
+  ) {
+    throw new AIError(
+      AIErrorCode.UNSUPPORTED,
+      'Video recording is not supported in this environment. It requires a browser with MediaDevices and DOM support.'
+    );
+  }
+
+  let mediaStream: MediaStream | undefined;
+  try {
+    const videoSource = options.videoSource ?? 'camera';
+
+    // The browser will prompt the user for permission and to select a source.
+    // This can throw DOMExceptions for permission denial, hardware issues, etc.
+    if (videoSource === 'camera') {
+      mediaStream = await navigator.mediaDevices.getUserMedia({ video: true });
+    } else {
+      mediaStream = await navigator.mediaDevices.getDisplayMedia({
+        video: true
+      });
+    }
+
+    // Create and configure the video element in memory.
+    const videoElement = document.createElement('video');
+    videoElement.srcObject = mediaStream;
+    videoElement.muted = true;
+
+    // Await the play() promise. This is required to handle potential
+    // playback errors and to satisfy browser autoplay policies, although
+    // the user gesture requirement for this function should prevent most issues.
+    await videoElement.play();
+
+    // All async setup is successful; now create the runner to manage the loop.
+    const runner = new VideoRecordingRunner(
+      liveSession,
+      mediaStream,
+      videoElement
+    );
+    return { stop: () => runner.stop() };
+  } catch (e) {
+    // If we successfully acquired a media stream but a subsequent step failed
+    // (e.g., videoElement.play()), we must clean up the stream.
+    if (mediaStream) {
+      mediaStream.getTracks().forEach(track => track.stop());
+    }
+
+    // Re-throw specific, known error types that the user might want to handle differently,
+    // such as permission errors from `getUserMedia`.
+    if (e instanceof AIError || e instanceof DOMException) {
+      throw e;
+    }
+
+    // Wrap any other unexpected errors in a standard AIError for consistency.
+    throw new AIError(
+      AIErrorCode.ERROR,
+      `Failed to initialize video recording: ${(e as Error).message}`
+    );
+  }
+}
diff --git a/packages/ai/src/methods/live-session.test.ts b/packages/ai/src/methods/live-session.test.ts
index 7454b1208c9..428e92ec770 100644
--- a/packages/ai/src/methods/live-session.test.ts
+++ b/packages/ai/src/methods/live-session.test.ts
@@ -110,6 +110,42 @@ describe('LiveSession', () => {
     });
   });
 
+  describe('sendTextRealtime()', () => {
+    it('should send a correctly formatted realtimeInput message', async () => {
+      const text = 'foo';
+      await session.sendTextRealtime(text);
+      expect(mockHandler.send).to.have.been.calledOnce;
+      const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]);
+      expect(sentData).to.deep.equal({
+        realtimeInput: { text }
+      });
+    });
+  });
+
+  describe('sendAudioRealtime()', () => {
+    it('should send a correctly formatted realtimeInput message', async () => {
+      const blob = { data: 'abcdef', mimeType: 'audio/pcm' };
+      await session.sendAudioRealtime(blob);
+      expect(mockHandler.send).to.have.been.calledOnce;
+      const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]);
+      expect(sentData).to.deep.equal({
+        realtimeInput: { audio: blob }
+      });
+    });
+  });
+
+  describe('sendVideoRealtime()', () => {
+    it('should send a correctly formatted realtimeInput message', async () => {
+      const blob = { data: 'abcdef', mimeType: 'image/jpeg' };
+      await session.sendVideoRealtime(blob);
+      expect(mockHandler.send).to.have.been.calledOnce;
+      const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]);
+      expect(sentData).to.deep.equal({
+        realtimeInput: { video: blob }
+      });
+    });
+  });
+
   describe('sendMediaChunks()', () => {
     it('should send a correctly formatted realtimeInput message', async () => {
       const chunks = [{ data: 'base64', mimeType: 'audio/webm' }];
diff --git a/packages/ai/src/methods/live-session.ts b/packages/ai/src/methods/live-session.ts
index 92d325e2f0d..b64447be15e 100644
--- a/packages/ai/src/methods/live-session.ts
+++ b/packages/ai/src/methods/live-session.ts
@@ -49,13 +49,21 @@ export class LiveSession {
    * @beta
    */
   isClosed = false;
+
   /**
-   * Indicates whether this Live session is being controlled by an `AudioConversationController`.
+   * Indicates whether this Live session is being controlled by an {@link AudioConversationController}.
    *
    * @beta
    */
   inConversation = false;
+
+  /**
+   * Indicates whether this Live session is being controlled by a {@link VideoRecordingController}.
+   *
+   * @beta
+   */
+  inVideoRecording = false;
+
   /**
    * @internal
    */
@@ -96,14 +104,19 @@
   }
 
   /**
-   * Sends realtime input to the server.
+   * Sends text to the server in realtime.
    *
-   * @param mediaChunks - The media chunks to send.
+   * @example
+   * ```javascript
+   * liveSession.sendTextRealtime("Hello, how are you?");
+   * ```
+   *
+   * @param text - The text data to send.
    * @throws If this session has been closed.
    *
    * @beta
    */
-  async sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void> {
+  async sendTextRealtime(text: string): Promise<void> {
     if (this.isClosed) {
       throw new AIError(
         AIErrorCode.REQUEST_ERROR,
@@ -111,27 +124,33 @@
       );
     }
 
-    // The backend does not support sending more than one mediaChunk in one message.
-    // Work around this limitation by sending mediaChunks in separate messages.
-    mediaChunks.forEach(mediaChunk => {
-      const message: _LiveClientRealtimeInput = {
-        realtimeInput: { mediaChunks: [mediaChunk] }
-      };
-      this.webSocketHandler.send(JSON.stringify(message));
-    });
+    const message: _LiveClientRealtimeInput = {
+      realtimeInput: {
+        text
+      }
+    };
+    this.webSocketHandler.send(JSON.stringify(message));
   }
 
   /**
-   * Sends function responses to the server.
+   * Sends audio data to the server in realtime.
    *
-   * @param functionResponses - The function responses to send.
+   * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz
+   * little-endian.
+   *
+   * @example
+   * ```javascript
+   * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian.
+   * const blob = { mimeType: "audio/pcm", data: pcmData };
+   * liveSession.sendAudioRealtime(blob);
+   * ```
+   *
+   * @param blob - The base64-encoded PCM data to send to the server in realtime.
    * @throws If this session has been closed.
    *
    * @beta
    */
-  async sendFunctionResponses(
-    functionResponses: FunctionResponse[]
-  ): Promise<void> {
+  async sendAudioRealtime(blob: GenerativeContentBlob): Promise<void> {
     if (this.isClosed) {
       throw new AIError(
         AIErrorCode.REQUEST_ERROR,
@@ -139,25 +158,32 @@
       );
     }
 
-    const message: _LiveClientToolResponse = {
-      toolResponse: {
-        functionResponses
+    const message: _LiveClientRealtimeInput = {
+      realtimeInput: {
+        audio: blob
       }
     };
     this.webSocketHandler.send(JSON.stringify(message));
   }
 
   /**
-   * Sends a stream of {@link GenerativeContentBlob}.
+   * Sends video data to the server in realtime.
    *
-   * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
+   * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It
+   * is recommended to set `mimeType` to `image/jpeg`.
+   *
+   * @example
+   * ```javascript
+   * // const videoFrame = ... base64-encoded JPEG data
+   * const blob = { mimeType: "image/jpeg", data: videoFrame };
+   * liveSession.sendVideoRealtime(blob);
+   * ```
+   * @param blob - The base64-encoded video data to send to the server in realtime.
    * @throws If this session has been closed.
    *
    * @beta
    */
-  async sendMediaStream(
-    mediaChunkStream: ReadableStream<GenerativeContentBlob>
-  ): Promise<void> {
+  async sendVideoRealtime(blob: GenerativeContentBlob): Promise<void> {
     if (this.isClosed) {
       throw new AIError(
         AIErrorCode.REQUEST_ERROR,
@@ -165,25 +191,38 @@
       );
     }
 
-    const reader = mediaChunkStream.getReader();
-    while (true) {
-      try {
-        const { done, value } = await reader.read();
+    const message: _LiveClientRealtimeInput = {
+      realtimeInput: {
+        video: blob
+      }
+    };
+    this.webSocketHandler.send(JSON.stringify(message));
+  }
 
-        if (done) {
-          break;
-        } else if (!value) {
-          throw new Error('Missing chunk in reader, but reader is not done.');
-        }
+  /**
+   * Sends function responses to the server.
+   *
+   * @param functionResponses - The function responses to send.
+   * @throws If this session has been closed.
+   *
+   * @beta
+   */
+  async sendFunctionResponses(
+    functionResponses: FunctionResponse[]
+  ): Promise<void> {
+    if (this.isClosed) {
+      throw new AIError(
+        AIErrorCode.REQUEST_ERROR,
+        'This LiveSession has been closed and cannot be used.'
+      );
+    }
 
-        await this.sendMediaChunks([value]);
-      } catch (e) {
-        // Re-throw any errors that occur during stream consumption or sending.
-        const message =
-          e instanceof Error ? e.message : 'Error processing media stream.';
-        throw new AIError(AIErrorCode.REQUEST_ERROR, message);
+    const message: _LiveClientToolResponse = {
+      toolResponse: {
+        functionResponses
       }
-    }
+    };
+    this.webSocketHandler.send(JSON.stringify(message));
   }
 
   /**
@@ -259,4 +298,73 @@ export class LiveSession {
     await this.webSocketHandler.close(1000, 'Client closed session.');
   }
+
+  /**
+   * Sends realtime input to the server.
+   *
+   * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+   *
+   * @param mediaChunks - The media chunks to send.
+   * @throws If this session has been closed.
+   *
+   * @beta
+   */
+  async sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void> {
+    if (this.isClosed) {
+      throw new AIError(
+        AIErrorCode.REQUEST_ERROR,
+        'This LiveSession has been closed and cannot be used.'
+      );
+    }
+
+    // The backend does not support sending more than one mediaChunk in one message.
+    // Work around this limitation by sending mediaChunks in separate messages.
+    mediaChunks.forEach(mediaChunk => {
+      const message: _LiveClientRealtimeInput = {
+        realtimeInput: { mediaChunks: [mediaChunk] }
+      };
+      this.webSocketHandler.send(JSON.stringify(message));
+    });
+  }
+
+  /**
+   * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+   *
+   * Sends a stream of {@link GenerativeContentBlob}.
+   *
+   * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
+   * @throws If this session has been closed.
+   *
+   * @beta
+   */
+  async sendMediaStream(
+    mediaChunkStream: ReadableStream<GenerativeContentBlob>
+  ): Promise<void> {
+    if (this.isClosed) {
+      throw new AIError(
+        AIErrorCode.REQUEST_ERROR,
+        'This LiveSession has been closed and cannot be used.'
+      );
+    }
+
+    const reader = mediaChunkStream.getReader();
+    while (true) {
+      try {
+        const { done, value } = await reader.read();
+
+        if (done) {
+          break;
+        } else if (!value) {
+          throw new Error('Missing chunk in reader, but reader is not done.');
+        }
+
+        await this.sendMediaChunks([value]);
+      } catch (e) {
+        // Re-throw any errors that occur during stream consumption or sending.
+        const message =
+          e instanceof Error ? e.message : 'Error processing media stream.';
+        throw new AIError(AIErrorCode.REQUEST_ERROR, message);
+      }
+    }
+  }
 }
diff --git a/packages/ai/src/types/live-responses.ts b/packages/ai/src/types/live-responses.ts
index d1870fa109f..6b69a0ea350 100644
--- a/packages/ai/src/types/live-responses.ts
+++ b/packages/ai/src/types/live-responses.ts
@@ -44,7 +44,14 @@ export interface _LiveClientContent {
 // eslint-disable-next-line @typescript-eslint/naming-convention
 export interface _LiveClientRealtimeInput {
   realtimeInput: {
-    mediaChunks: GenerativeContentBlob[];
+    text?: string;
+    audio?: GenerativeContentBlob;
+    video?: GenerativeContentBlob;
+
+    /**
+     * @deprecated Use `text`, `audio`, and `video` instead.
+     */
+    mediaChunks?: GenerativeContentBlob[];
  };
}
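
Migration note: the changeset deprecates `sendMediaChunks()` and `sendMediaStream()` in favor of one method per modality. The sketch below is not part of the diff; it is a minimal migration example assuming an already-connected `LiveSession`, where `pcmChunk` and `jpegFrame` are hypothetical base64-encoded payloads standing in for real captured data. The wire shapes in the comments mirror the expectations in `live-session.test.ts` above.

```typescript
import type { LiveSession, GenerativeContentBlob } from 'firebase/ai';

async function sendRealtimeInput(
  liveSession: LiveSession,
  pcmChunk: string, // base64-encoded 16-bit PCM at 16kHz little-endian
  jpegFrame: string // base64-encoded JPEG frame
): Promise<void> {
  // Before: every modality went through sendMediaChunks(), one chunk per message.
  // await liveSession.sendMediaChunks([{ mimeType: 'audio/pcm', data: pcmChunk }]);

  // After: text is sent as { realtimeInput: { text } }.
  await liveSession.sendTextRealtime('Describe what you see and hear.');

  // Audio is sent as { realtimeInput: { audio: blob } }.
  const audio: GenerativeContentBlob = { mimeType: 'audio/pcm', data: pcmChunk };
  await liveSession.sendAudioRealtime(audio);

  // Video is sent as { realtimeInput: { video: blob } }, one JPEG frame at 1 FPS.
  const video: GenerativeContentBlob = { mimeType: 'image/jpeg', data: jpegFrame };
  await liveSession.sendVideoRealtime(video);
}
```

For camera or screen input, the `startVideoRecording()` helper added in this diff covers the whole video path (source access, 1 FPS capture, JPEG encoding) and calls `sendVideoRealtime()` internally, so most callers should prefer it over hand-rolling a frame loop.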