I'm currently trying to make a system that can transcribe a phone call in real time and then display the conversation in my command line. To do this, im using a Twilio phone number which sends out a http request when called. Then using Flask, Ngrok and Websockets to compile my server code, make my local port public and to transfer the data, the TwiML verb "Stream" is used to stream the audio data to the Google Cloud Speech-Text API. I have so far used Twilio's python demo on GitHub (https://github.com/twilio/media-streams/tree/master/python/realtime-transcriptions).
My server code:
from flask import Flask, render_template
from flask_sockets import Sockets
from SpeechClientBridge import SpeechClientBridge
from google.cloud.speech_v1 import enums
from google.cloud.speech_v1 import types
import json
import base64
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./<KEY>.json"
HTTP_SERVER_PORT = 8080
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.MULAW,
sample_rate_hertz=8000,
language_code='en-US')
streaming_config = types.StreamingRecognitionConfig(
config=config,
interim_results=True)
app = Flask(__name__)
sockets = Sockets(app)
@app.route('/home')
def home():
return render_template("index.html")
@app.route('/twiml', methods=['POST'])
def return_twiml():
print("POST TwiML")
return render_template('streams.xml')
def on_transcription_response(response):
if not response.results:
return
result = response.results[0]
if not result.alternatives:
return
transcription = result.alternatives[0].transcript
print("Transcription: " + transcription)
@sockets.route('/')
def transcript(ws):
print("WS connection opened")
bridge = SpeechClientBridge(
streaming_config,
on_transcription_response
)
while not ws.closed:
message = ws.receive()
if message is None:
bridge.terminate()
break
data = json.loads(message)
if data["event"] in ("connected", "start"):
print(f"Media WS: Received event '{data['event']}': {message}")
continue
if data["event"] == "media":
media = data["media"]
chunk = base64.b64decode(media["payload"])
bridge.add_request(chunk)
if data["event"] == "stop":
print(f"Media WS: Received event 'stop': {message}")
print("Stopping...")
break
bridge.terminate()
print("WS connection closed")
if __name__ == '__main__':
from gevent import pywsgi
from geventwebsocket.handler import WebSocketHandler
server = pywsgi.WSGIServer(('', HTTP_SERVER_PORT), app, handler_class=WebSocketHandler)
print("Server listening on: http://localhost:" + str(HTTP_SERVER_PORT))
server.serve_forever()
streams.xml:
<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Say> Thanks for calling!</Say>
<Start>
<Stream url="wss://<ngrok-URL/.ngrok.io/"/>
</Start>
<Pause length="40"/>
</Response>
Twilio WebHook:
http://<ngrok-URL>.ngrok.io/twiml
Im am getting the following error when I run the server code and then call the Twilio number:
C:\Users\Max\Python\Twilio>python server.py
Server listening on: http://localhost:8080
POST TwiML
WS connection opened
Media WS: Received event 'connected': {"event":"connected","protocol":"Call","version":"0.2.0"}
Media WS: Received event 'start': {"event":"start","sequenceNumber":"1","start":{"accountSid":"AC8abc5aa74496a227d3eb489","streamSid":"MZe6245f23e2385aa2ea7b397","callSid":"CA5864313b4992607d3fe46","tracks":["inbound"],"mediaFormat":{"encoding":"audio/x-mulaw","sampleRate":8000,"channels":1}},"streamSid":"MZe6245f2397c1285aa2ea7b397"}
Exception in thread Thread-4:
Traceback (most recent call last):
File "C:\Users\Max\AppData\Local\Programs\Python\Python37\lib\site-packages\google\api_core\grpc_helpers.py", line 96, in next
return six.next(self._wrapped)
File "C:\Users\Max\AppData\Local\Programs\Python\Python37\lib\site-packages\grpc\_channel.py", line 416, in __next__
return self._next()
File "C:\Users\Max\AppData\Local\Programs\Python\Python37\lib\site-packages\grpc\_channel.py", line 689, in _next
raise self
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.OUT_OF_RANGE
details = "Audio Timeout Error: Long duration elapsed without audio. Audio should be sent close to real time."
debug_error_string = "{"created":"@1591738676.565000000","description":"Error received from peer ipv6:[2a00:1450:4009:807::200a]:443","file":"src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Audio Timeout Error: Long duration elapsed without audio. Audio should be sent close to real time.","grpc_status":11}"
>
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\Max\AppData\Local\Programs\Python\Python37\lib\threading.py", line 917, in _bootstrap_inner
self.run()
File "C:\Users\Max\AppData\Local\Programs\Python\Python37\lib\threading.py", line 865, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\Max\Python\Twilio\SpeechClientBridge.py", line 37, in process_responses_loop
for response in responses:
File "C:\Users\Max\AppData\Local\Programs\Python\Python37\lib\site-packages\google\api_core\grpc_helpers.py", line 99, in next
six.raise_from(exceptions.from_grpc_error(exc), exc)
File "<string>", line 3, in raise_from
google.api_core.exceptions.OutOfRange: 400 Audio Timeout Error: Long duration elapsed without audio. Audio should be sent close to real time.
Media WS: Received event 'stop': {"event":"stop","sequenceNumber":"752","streamSid":"MZe6245f2397c125aa2ea7b397","stop":{"accountSid":"AC8abc5aa74496a60227d3eb489","callSid":"CA5842bc6431314d502607d3fe46"}}
Stopping...
WS connection closed
I cant work out why im getting the audio timeout error? Is it a firewall issue with Twilio and Google? An encoding issue?
Any help would be greatly appreciated.
System: Windows 10 Python 3.7.1 ngrok 2.3.35 Flask 1.1.2