Skip to content

Commit 3ac0303

Browse files
authored
Merge pull request #242 from aws-samples/fix/python-sdk-backend
fix(python-backend): Fix Nova Sonic protocol and SDK v0.1.0 compatibility
2 parents 45464a3 + e555829 commit 3ac0303

File tree

5 files changed

+81
-25
lines changed

5 files changed

+81
-25
lines changed

samples/speech-to-speech/backend/python_app/clients/bedrock_client.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import logging
1616
import warnings
1717
import os
18-
import time
1918
from aws_sdk_bedrock_runtime.client import (
2019
BedrockRuntimeClient,
2120
InvokeModelWithBidirectionalStreamOperationInput,
@@ -24,15 +23,10 @@
2423
InvokeModelWithBidirectionalStreamInputChunk,
2524
BidirectionalInputPayloadPart,
2625
)
27-
from aws_sdk_bedrock_runtime.config import (
28-
Config,
29-
HTTPAuthSchemeResolver,
30-
SigV4AuthScheme,
31-
)
32-
from smithy_aws_core.credentials_resolvers.environment import (
33-
EnvironmentCredentialsResolver,
34-
)
35-
from smithy_aws_core.credentials_resolvers.container import ContainerCredentialsResolver
26+
from aws_sdk_bedrock_runtime.config import Config
27+
from smithy_aws_core.auth.sigv4 import SigV4AuthScheme
28+
from smithy_aws_core.identity.environment import EnvironmentCredentialsResolver
29+
from smithy_aws_core.identity.container import ContainerCredentialsResolver
3630
from smithy_http.aio.aiohttp import AIOHTTPClient, AIOHTTPClientConfig
3731

3832
# Configure logging
@@ -100,8 +94,7 @@ def initialize_client(self):
10094
endpoint_uri=f"https://bedrock-runtime.{self.region}.amazonaws.com",
10195
region=self.region,
10296
aws_credentials_identity_resolver=resolver,
103-
http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
104-
http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()},
97+
auth_schemes={"aws.auth#sigv4": SigV4AuthScheme(service="bedrock")},
10598
)
10699
self.bedrock_client = BedrockRuntimeClient(config=config)
107100
logger.info(

samples/speech-to-speech/backend/python_app/core/audio_processor.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,40 @@ async def queue_audio(self, prompt_name: str, content_name: str, audio_base64: s
8383
)
8484
return
8585

86+
# Add audio validation to detect silence
87+
try:
88+
import base64
89+
audio_bytes = base64.b64decode(audio_base64)
90+
91+
# Check if audio is silence (all zeros or very low values)
92+
if len(audio_bytes) > 0:
93+
# Convert bytes to 16-bit signed integers for analysis
94+
import struct
95+
num_samples = len(audio_bytes) // 2 # 16-bit samples
96+
97+
if num_samples > 0:
98+
# Unpack first few samples to check for silence
99+
sample_check_count = min(100, num_samples)
100+
samples = struct.unpack(f'{sample_check_count}h', audio_bytes[:sample_check_count * 2])
101+
102+
# Calculate max absolute value
103+
max_val = max(abs(s) for s in samples)
104+
105+
# Threshold for silence detection (16-bit audio range is -32768 to 32767)
106+
# Only warn on truly silent audio (complete zeros)
107+
if max_val == 0:
108+
self.logger.warning(
109+
"Detected completely silent audio chunk (all zeros). "
110+
"Frontend may not be sending real microphone audio."
111+
)
112+
elif max_val < 50:
113+
# Very low audio - only log at debug level
114+
self.logger.debug(f"Very low audio level - max amplitude: {max_val}")
115+
# Normal audio - no logging needed (reduces verbosity)
116+
117+
except Exception as e:
118+
self.logger.error(f"Error validating audio: {e}")
119+
86120
# Use put_nowait() like original code to avoid blocking/timing delays
87121
self.audio_input_queue.put_nowait(
88122
{

samples/speech-to-speech/backend/python_app/core/websocket_handler.py

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@ async def handle_connection(self, websocket):
123123
# Start audio processor
124124
await self.audio_processor.start(self.bedrock_client, self.stream)
125125

126-
# Start response processing task
127-
self.response_task = asyncio.create_task(self._process_responses(websocket))
128-
129126
# Register message handlers
130127
self._register_message_handlers()
131128

129+
# Start response processing task AFTER setup is complete
130+
self.response_task = asyncio.create_task(self._process_responses(websocket))
131+
132132
# Main message processing loop
133133
async for message in websocket:
134134
await self._handle_message(websocket, message)
@@ -235,14 +235,42 @@ async def _handle_message(self, websocket, message):
235235
list(data.get("event", {}).keys())[0] if "event" in data else None
236236
)
237237

238-
# Store session information
239-
if event_type == "promptStart":
240-
self.prompt_name = data["event"]["promptStart"]["promptName"]
241-
elif (
242-
event_type == "contentStart"
243-
and data["event"]["contentStart"].get("type") == "AUDIO"
244-
):
245-
self.audio_content_name = data["event"]["contentStart"]["contentName"]
238+
# Event sequence tracking (reduced verbosity)
239+
if event_type:
240+
if event_type == "sessionStart":
241+
self.logger.info("Session started")
242+
elif event_type == "promptStart":
243+
self.prompt_name = data["event"]["promptStart"]["promptName"]
244+
self.logger.debug("Prompt started")
245+
elif event_type == "contentStart":
246+
content_type = data["event"]["contentStart"].get("type")
247+
role = data["event"]["contentStart"].get("role", "UNSPECIFIED")
248+
content_name = data["event"]["contentStart"].get("contentName")
249+
250+
# Only log at debug level for routine content
251+
self.logger.debug(f"Content start: type={content_type}, role={role}")
252+
253+
# Validate first content block has SYSTEM role
254+
if not hasattr(self, '_first_content_received'):
255+
self._first_content_received = True
256+
if role != "SYSTEM":
257+
self.logger.error(f"First content block must have SYSTEM role, received {role}")
258+
await websocket.send(json.dumps({
259+
"type": "error",
260+
"message": f"First content block must have SYSTEM role, received {role}"
261+
}))
262+
263+
# Store audio content name
264+
if content_type == "AUDIO":
265+
self.audio_content_name = content_name
266+
if role != "USER":
267+
self.logger.warning(f"Audio content should have USER role, received {role}")
268+
269+
elif event_type == "sessionEnd":
270+
self.logger.info("Session ended")
271+
# Other events logged at debug level only
272+
elif event_type in ["textInput", "contentEnd", "promptEnd"]:
273+
self.logger.debug(f"Event: {event_type}")
246274

247275
# Send event to Bedrock
248276
await self.bedrock_client.send_event(self.stream, data)

samples/speech-to-speech/backend/python_app/events/s2s_events.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ def content_start_audio(
157157
"promptName": prompt_name,
158158
"contentName": content_name,
159159
"type": "AUDIO",
160+
"role": "USER", # Required by Nova Sonic API
160161
"interactive": True,
161162
"audioInputConfiguration": audio_input_config,
162163
}

samples/speech-to-speech/backend/python_app/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Core dependencies
2-
aws_sdk_bedrock_runtime==0.0.2
3-
smithy-aws-core==0.0.3
2+
aws_sdk_bedrock_runtime==0.1.0
3+
smithy-aws-core==0.1.0
44
python-dotenv>=1.1.0
55
aiohttp>=3.8.0
66

0 commit comments

Comments
 (0)