Synthesizes speech with AWS Polly.
Possible English speakers include:
US English en-US {'Kevin', 'Salli', 'Matthew', 'Kimberly', 'Kendra',
'Justin', 'Joey', 'Joanna', 'Ivy'}
New Zealand English en-NZ {'Aria'}
South African English en-ZA {'Ayanda'}
British English en-GB {'Emma', 'Brian', 'Amy', 'Arthur'}
Australian English en-AU {'Olivia'}
Indian English en-IN {'Kajal'}
Source code in backend/app/utils/tts/backends/aws_polly_tts.py
class PollySpeak():
    """Synthesizes speech with AWS Polly.

    Possible English speakers include:
        US English en-US {'Kevin', 'Salli', 'Matthew', 'Kimberly', 'Kendra',
                          'Justin', 'Joey', 'Joanna', 'Ivy'}
        New Zealand English en-NZ {'Aria'}
        South African English en-ZA {'Ayanda'}
        British English en-GB {'Emma', 'Brian', 'Amy', 'Arthur'}
        Australian English en-AU {'Olivia'}
        Indian English en-IN {'Kajal'}
    """

    def __init__(self, default_path: str = "./output/temp.wav") -> None:
        """Store the Polly settings and resolve the default output path.

        Args:
            default_path: Output file location, joined onto the directory
                that contains this module.
        """
        self.engine = "neural"
        # NOTE(review): audio is requested as mp3 while the default file
        # name ends in ".wav" -- confirm this mismatch is intended.
        self.audio_format = "mp3"
        self.polly_client = polly
        self.path = Path(__file__).parent
        self.save_path = os.path.join(self.path, default_path)

    def synthesize(self, text: str, speaker_id: str = "", save_path: str = None):
        """Turns text into audio and visemes.

        Two Polly requests are made with the same voice settings: one for
        the audio (written to ``self.save_path``) and one for the viseme
        speech marks.

        Args:
            text: Text sent to Polly as ``Text``.
            speaker_id: Voice name looked up in ``english_speaker_map``.
                A name not found there falls back to 'Kendra' / "en-US".
            save_path: Optional output file path. When given it replaces
                ``self.save_path`` and stays in effect for later calls.

        Returns:
            A tuple ``(outstream, viseme_list, sleep_times)``:
            ``outstream`` is an unbuffered binary read handle on the
            written audio file (the caller is responsible for closing
            it), ``viseme_list`` holds each mark's ``"value"``, and
            ``sleep_times`` holds, per viseme, the gap to the previous
            mark's ``"time"`` divided by 1000 (the first gap is measured
            from 0).

        Raises:
            ClientError: Printed and re-raised when a Polly request fails.
        """
        if save_path:
            self.save_path = save_path

        # Find the language code that owns the requested voice.
        lang_code = None
        for key, names in english_speaker_map.items():
            if speaker_id in names:
                lang_code = key
                voice = speaker_id
        if not lang_code:
            lang_code = "en-US"
            voice = 'Kendra'

        try:
            kwargs = {
                'Engine': self.engine,
                'OutputFormat': self.audio_format,
                'Text': text,
                'VoiceId': voice,
                'LanguageCode': lang_code,
            }
            response = self.polly_client.synthesize_speech(**kwargs)
            output = self.save_path
            with closing(response['AudioStream']) as stream:
                with open(output, "wb") as file:
                    file.write(stream.read())

            # Second request: same text and voice, but ask for viseme
            # speech marks instead of audio.
            kwargs['OutputFormat'] = 'json'
            kwargs['SpeechMarkTypes'] = ['viseme']
            response = self.polly_client.synthesize_speech(**kwargs)
            with closing(response['AudioStream']) as stream:
                raw_marks = stream.read().decode()
        except ClientError as exc:
            print(exc)
            raise

        # One JSON object per line; splitting on lines (not on arbitrary
        # whitespace) keeps objects intact even if they contain spaces.
        visemes = [json.loads(line)
                   for line in raw_marks.splitlines() if line.strip()]
        viseme_list = [mark["value"] for mark in visemes]
        time_list = [mark["time"] for mark in visemes]

        # Gap between consecutive mark times, scaled by 1/1000.
        sleep_times = [
            (float(current) - float(previous)) / 1000
            for previous, current in zip([0] + time_list[:-1], time_list)
        ]

        # Open the read handle last so nothing above can fail while it is
        # open and leave it unclosed.
        outstream = io.open(output, 'rb', buffering=0)
        return outstream, viseme_list, sleep_times
|
synthesize(text, speaker_id='', save_path=None)
Turns text into an audio file, a list of viseme values, and the delays between consecutive visemes.
Source code in backend/app/utils/tts/backends/aws_polly_tts.py
def synthesize(self, text: str, speaker_id: str = "", save_path: str = None):
    """Turns text into audio and visemes.

    Two Polly requests are made with the same voice settings: one for the
    audio (written to ``self.save_path``) and one for the viseme speech
    marks.

    Args:
        text: Text sent to Polly as ``Text``.
        speaker_id: Voice name looked up in ``english_speaker_map``.
            A name not found there falls back to 'Kendra' / "en-US".
        save_path: Optional output file path. When given it replaces
            ``self.save_path`` and stays in effect for later calls.

    Returns:
        A tuple ``(outstream, viseme_list, sleep_times)``: ``outstream``
        is an unbuffered binary read handle on the written audio file
        (the caller is responsible for closing it), ``viseme_list`` holds
        each mark's ``"value"``, and ``sleep_times`` holds, per viseme,
        the gap to the previous mark's ``"time"`` divided by 1000 (the
        first gap is measured from 0).

    Raises:
        ClientError: Printed and re-raised when a Polly request fails.
    """
    if save_path:
        self.save_path = save_path

    # Find the language code that owns the requested voice.
    lang_code = None
    for key, names in english_speaker_map.items():
        if speaker_id in names:
            lang_code = key
            voice = speaker_id
    if not lang_code:
        lang_code = "en-US"
        voice = 'Kendra'

    try:
        kwargs = {
            'Engine': self.engine,
            'OutputFormat': self.audio_format,
            'Text': text,
            'VoiceId': voice,
            'LanguageCode': lang_code,
        }
        response = self.polly_client.synthesize_speech(**kwargs)
        output = self.save_path
        with closing(response['AudioStream']) as stream:
            with open(output, "wb") as file:
                file.write(stream.read())

        # Second request: same text and voice, but ask for viseme speech
        # marks instead of audio.
        kwargs['OutputFormat'] = 'json'
        kwargs['SpeechMarkTypes'] = ['viseme']
        response = self.polly_client.synthesize_speech(**kwargs)
        with closing(response['AudioStream']) as stream:
            raw_marks = stream.read().decode()
    except ClientError as exc:
        print(exc)
        raise

    # One JSON object per line; splitting on lines (not on arbitrary
    # whitespace) keeps objects intact even if they contain spaces.
    visemes = [json.loads(line)
               for line in raw_marks.splitlines() if line.strip()]
    viseme_list = [mark["value"] for mark in visemes]
    time_list = [mark["time"] for mark in visemes]

    # Gap between consecutive mark times, scaled by 1/1000.
    sleep_times = [
        (float(current) - float(previous)) / 1000
        for previous, current in zip([0] + time_list[:-1], time_list)
    ]

    # Open the read handle last so nothing above can fail while it is open
    # and leave it unclosed.
    outstream = io.open(output, 'rb', buffering=0)
    return outstream, viseme_list, sleep_times
|