Skip to content

speaker

Utility module for creating audio and visemes from text.

There are two supported options for generating speech, Coqui and Polly. You must choose between the two with the backend parameter at load time. Both support a wide variety of voice accents/genders/ages that can be chosen at runtime by selecting the speaker_identifier.

Speaker

Speaker takes text and returns an audio stream and a viseme list with timing

Speaker backend is set at runtime, but speaker ID can change dynamically.

Parameters:

Name Type Description Default
backend str

Which backend to use. Defaults to "polly".

'polly'
Source code in backend/app/utils/tts/speaker.py
class Speaker:
    """Speaker takes text and returns an audio stream and a viseme list with timing

    The backend is fixed when the Speaker is constructed, but the speaker ID
    can change dynamically on every call to ``synthesize``.

    Args:
        backend (str, optional): Which backend to use, "polly" or "coqui".
            Defaults to "polly".

    Raises:
        ValueError: If ``backend`` is not one of ``SUPPORTED_BACKENDS``.
    """

    # Backend names accepted by the constructor.
    SUPPORTED_BACKENDS = ("polly", "coqui")

    def __init__(self, backend="polly") -> None:
        # Fail fast: without this check an unknown backend would leave
        # self.speaker unset and only blow up later inside synthesize().
        if backend not in self.SUPPORTED_BACKENDS:
            raise ValueError(
                f"Unsupported backend {backend!r}; "
                f"expected one of {self.SUPPORTED_BACKENDS}")
        self.backend = backend
        if backend == "polly":
            self.speaker = PollySpeak()
        else:
            self.speaker = CoquiSpeak()
        self.viseme_generator = VisemeGenerator()

    def synthesize(self, input_text: str, speaker_identifier: str,
                   save_path: str = "./tts/backends/output/temp.wav") -> tuple:
        """Takes in text and a speaker id and returns speech and visemes and timings

        Args:
            input_text (str): input text to say
            speaker_identifier (str): key for speaker voice
            save_path (str, optional): path to save wav file. Only forwarded
                to the "polly" backend; the "coqui" branch does not use it.
                Defaults to "./tts/backends/output/temp.wav".

        Returns:
            tuple: the audio stream, the visemes, and the viseme timings

        Raises:
            ValueError: If ``self.backend`` is not a supported backend.
        """
        if self.backend == "polly":
            audio_stream, visemes, delays = self.speaker.synthesize(
                input_text,
                speaker_id=speaker_identifier,
                save_path=save_path)

            # The polly backend returns its own viseme labels; convert them
            # with the viseme generator before handing them back.
            visemes = self.viseme_generator.convert_aws_visemes(visemes)

        elif self.backend == "coqui":
            audio_stream, speaking_time = self.speaker.synthesize_wav(
                input_text,
                speaker_id=speaker_identifier)

            # Visemes must be generated and timed manually for coqui:
            # every viseme gets the same share of the speaking time.
            visemes = self.viseme_generator.get_visemes(input_text)
            # NOTE(review): the "+ 1" keeps the division safe when there are
            # no visemes and makes the delays sum to less than speaking_time;
            # presumably intentional -- confirm.
            viseme_length = speaking_time / (len(visemes) + 1)
            delays = [viseme_length] * len(visemes)

        else:
            # Reachable only if self.backend was changed after construction.
            raise ValueError(
                f"Unsupported backend {self.backend!r}; "
                f"expected one of {self.SUPPORTED_BACKENDS}")

        return audio_stream, visemes, delays

synthesize(input_text, speaker_identifier, save_path='./tts/backends/output/temp.wav')

Takes in text and a speaker id and returns speech and visemes and timings

Parameters:

Name Type Description Default
input_text str

input text to say

required
speaker_identifier str

key for speaker voice

required
save_path str

path to save wav file. Defaults to "./tts/backends/output/temp.wav".

'./tts/backends/output/temp.wav'

Returns:

Name Type Description
tuple tuple

the audio stream, the visemes, and the viseme timings

Source code in backend/app/utils/tts/speaker.py
def synthesize(self, input_text: str, speaker_identifier: str,
               save_path: str = "./tts/backends/output/temp.wav") -> tuple:
    """Takes in text and a speaker id and returns speech and visemes and timings

    Args:
        input_text (str): input text to say
        speaker_identifier (str): key for speaker voice
        save_path (str, optional): path to save wav file. Only forwarded to
            the "polly" backend; the "coqui" branch does not use it.
            Defaults to "./tts/backends/output/temp.wav".

    Returns:
        tuple: the audio stream, the visemes, and the viseme timings

    Raises:
        ValueError: If ``self.backend`` is neither "polly" nor "coqui".
    """
    if self.backend == "polly":
        audio_stream, visemes, delays = self.speaker.synthesize(
            input_text,
            speaker_id=speaker_identifier,
            save_path=save_path)

        # The polly backend returns its own viseme labels; convert them
        # with the viseme generator before handing them back.
        visemes = self.viseme_generator.convert_aws_visemes(visemes)

    elif self.backend == "coqui":
        audio_stream, speaking_time = self.speaker.synthesize_wav(
            input_text,
            speaker_id=speaker_identifier)

        # Visemes must be generated and timed manually for coqui:
        # every viseme gets the same share of the speaking time.
        visemes = self.viseme_generator.get_visemes(input_text)
        # NOTE(review): the "+ 1" keeps the division safe when there are no
        # visemes and makes the delays sum to less than speaking_time;
        # presumably intentional -- confirm.
        viseme_length = speaking_time / (len(visemes) + 1)
        delays = [viseme_length] * len(visemes)

    else:
        # Without this branch an unknown backend would surface as an
        # UnboundLocalError on the return statement below.
        raise ValueError(
            f"Unsupported backend {self.backend!r}; "
            'expected "polly" or "coqui"')

    return audio_stream, visemes, delays

main()

Run speaker on a text string

To hear the sound play the file at the save path "./tts/backends/output/temp.wav"

Source code in backend/app/utils/tts/speaker.py
def main():
    """Run speaker on a text string

    Parses the command-line options, synthesizes the requested text with the
    chosen backend and voice, and prints each (viseme, delay) pair.

    To hear the sound play the file at the save path given by --savepath
    (default "./tts/backends/output/temp.wav").
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--backend", default="polly", help="Backend to use",
                        choices=["polly", "coqui"])
    parser.add_argument("--text", default="This is what I sound like",
                        help="Text to say")
    # The help text previously described a file to transcribe, which was a
    # copy-paste error: this option is the voice key passed to synthesize().
    parser.add_argument("--speakerid", default="Kevin",
                        help="Speaker voice identifier to use")
    parser.add_argument("--savepath", default="./tts/backends/output/temp.wav",
                        help="location of file to save audio")

    args = parser.parse_args()
    speaker = Speaker(backend=args.backend)
    # The audio stream itself is not needed here; only the timing data is shown.
    _, visemes, delays = speaker.synthesize(args.text, args.speakerid,
                                            save_path=args.savepath)
    for viseme_delay in zip(visemes, delays):
        print(viseme_delay)