Categories
Blog Knowledge Base

Transcribing Voicemail with Google Speech api

This is part 2, the rather long-awaited description of how to transcribe voicemails and deliver them by email with the text and an attached MP3.

You will need to install the files from here https://zaf.github.io/asterisk-speech-recog/ and also have a Google Developers account.

Also create this directory:

/var/lib/asterisk/sounds/catline

Let's begin.

  • Script to create the mp3 and the file for transcription
#!/bin/bash
#
# File a recorded message into an Asterisk voicemail INBOX, transcribe it
# with the Google Cloud Speech API and mail the text plus an MP3 copy to the
# address configured for the mailbox in /etc/asterisk/voicemail.conf.
#
# Arguments (positional):
#   $1 callerchan   channel the caller arrived on
#   $2 callerid     caller's number
#   $3 origdate     human readable date of the call
#   $4 origtime     epoch time of the call
#   $5 origmailbox  mailbox number the message is for
#   $6 origdir      base name (no extension) of the recording in $RECDIR
#   $7 duration     length of the recording in seconds
#
# bash is required: the script uses a here-string and "read" field splitting.

# Voicemail spool root. Deliberately not called PATH: assigning to PATH
# replaces the shell's command search path for the rest of the script.
VMROOT=/var/spool/asterisk/voicemail/default/
# Directory the dialplan records into.
RECDIR=/var/lib/asterisk/sounds/catline

callerchan=$1
callerid=$2
origdate=$3
origtime=$4
origmailbox=$5
origdir=$6
duration=$7
# Quoted so that the whole placeholder is one assignment; unquoted, the shell
# would try to run "GOOGLE" as a command.
apikey="YOUR GOOGLE SPEECH API KEY"

INBOX="${VMROOT}${origmailbox}/INBOX"
RECORDING="${RECDIR}/${origdir}.wav"

mkdir -p "$INBOX"

# The next message number is the count of message descriptor files present.
FILENUM=$(/bin/ls "$INBOX" | /bin/grep txt | /usr/bin/wc -l)

# msgNNNN, zero padded to four digits.
FILENAME=$(printf 'msg%04d' "$FILENUM")
MSGBASE="${INBOX}/${FILENAME}"

# Mailbox definition line looks like: "2000 => password,Name,email,...".
# Anchor the match so mailbox 2000 does not also match 12000, and take only
# the first hit.
IN=$(/bin/grep -m 1 -E "^[[:space:]]*${origmailbox}[[:space:]]*=>" /etc/asterisk/voicemail.conf)
# The e-mail address is the third comma separated field. IFS is changed for
# this one "read" only, so later expansions are not split on commas.
IFS=',' read -r _ _ email _ <<< "$IN"

# Message descriptor read by Asterisk's voicemail application.
{
    printf '%s\n' "[message]"
    printf '%s\n' "origmailbox=${origmailbox}"
    printf '%s\n' "context=demo"
    printf '%s\n' "macrocontext="
    printf '%s\n' "exten=s"
    printf '%s\n' "priority=11"
    printf '%s\n' "callerchan=${callerchan}"
    printf '%s\n' "callerid=${callerid}"
    printf '%s\n' "origdate=${origdate}"
    printf '%s\n' "origtime=${origtime}"
    printf '%s\n' "category="
    printf '%s\n' "duration=${duration}"
} >> "${MSGBASE}.txt"

# FLAC copy with silence trimmed, used only for the transcription request.
/bin/nice /usr/bin/sox "$RECORDING" "${MSGBASE}.flac" silence -l 1 0.1 1% -1 0.3 1%

# Small mono MP3 for the e-mail attachment.
# NOTE(review): "-q 9-resample" is kept exactly as published, but it looks
# like a garbled "-q 9 --resample <kHz>"; confirm the intended options.
/bin/nice /usr/bin/lame -b 16 -m m -q 9-resample "$RECORDING" "${MSGBASE}.mp3"

voicemailbody=$(/usr/bin/perl -w /usr/src/asterisk-speech-recog-cloud_api/cli/speech-recog-cli.pl -k "$apikey" -o detailed -r 8000 -n 1 "${MSGBASE}.flac")

# The untouched recording becomes the message audio in the mailbox.
/bin/cp "$RECORDING" "${MSGBASE}.wav"

# Only mail when the mailbox actually has an address configured.
if [ -n "$email" ]; then
    echo "You have a new voicemail from ${callerid} it was left on ${origdate} and is ${duration} seconds long ${voicemailbody}" | /bin/mail -s "A new voicemail has arrived from ${callerid}" -a "${MSGBASE}.mp3" "$email"
fi

# The FLAC and MP3 are intermediates; only the .txt and .wav stay in the INBOX.
/bin/rm -f "${MSGBASE}.flac"
/bin/rm -f "${MSGBASE}.mp3"
  • Asterisk Dialplan to pass the call to the above script
; Voicemail-to-text context: plays the mailbox greeting, records the caller
; into sounds/catline/<UNIQUEID>.wav and, on hangup, passes the call details
; to /usr/local/sbin/makevmal.sh.
[vmail2text]
; Entry for an explicit 4-digit mailbox, e.g. vmail2text,2000,1
exten => _XXXX,1,Set(__EXTTOCALL=${EXTEN})
exten => _XXXX,n,Noop(${EXTTOCALL})
exten => _XXXX,n,Goto(s,1)

; Entry at s: EXTTOCALL is not set here, so it must already be set on the channel.
exten => s,1,Answer()
exten => s,n,Noop(${EXTTOCALL} , ${DIALSTATUS} , ${SV_DIALSTATUS})
; Pick the greeting: "busy" when the dial result was BUSY, "unavail" otherwise.
exten => s,n,GotoIf($["${DIALSTATUS}"="BUSY"]?busy:bnext)
exten => s,n(busy),Set(greeting=busy)
exten => s,n,Goto(carryon)
exten => s,n(bnext),GotoIf($["${DIALSTATUS}"="NOANSWER"]?unavail:unext)
exten => s,n(unavail),Set(greeting=unavail)
exten => s,n,Goto(carryon)
exten => s,n(unext),Set(greeting=unavail)
exten => s,n,Goto(carryon)
exten => s,n(carryon),Set(origmailbox=${EXTTOCALL})
; msg is 1 when the mailbox has its own recorded greeting file.
exten => s,n,Set(msg=${STAT(e,${ASTSPOOLDIR}/voicemail/default/${origmailbox}/${greeting}.wav)})
; start stays 0 until recording begins; the h extension uses that to skip
; calls that hung up before the beep.
exten => s,n,Set(__start=0)
exten => s,n,Set(__end=0)
exten => s,n,NoOp(${UNIQUEID})
exten => s,n,Set(origdate=${STRFTIME(${EPOCH},,%a %b %d %r %Z %G)})
exten => s,n,Set(origtime=${EPOCH})
exten => s,n,Set(callerchan=${CHANNEL})
; The caller ID is supplied by the caller and ends up on a shell command line
; in the h extension, so keep only letters, digits and "+".
exten => s,n,Set(callerid=${FILTER(+0-9a-zA-Z,${CALLERID(num)})})
exten => s,n,GotoIf($["${msg}"="1"]?msgy:msgn)
exten => s,n(msgy),Playback(${ASTSPOOLDIR}/voicemail/default/${origmailbox}/${greeting}) ; (local/catreq/how_did)
exten => s,n,Goto(beep)
exten => s,n(msgn),Playback(vm-intro)
exten => s,n(beep),System(/bin/touch /var/lib/asterisk/sounds/catline/${UNIQUEID}.wav)
exten => s,n,Playback(beep)
exten => s,n,Set(__start=${EPOCH})
exten => s,n,Record(catline/${UNIQUEID}.wav,3,60,kaq)
exten => s,n,Playback(beep)
exten => s,n,Hangup()
; On hangup: if recording started, work out its length and hand over to the script.
exten => h,1,Noop(${start} ${end})
exten => h,n,GotoIf($["${start}"!="0"]?ok:end)
exten => h,n(ok),Set(end=${EPOCH})
exten => h,n,Set(duration=${MATH(${end}-${start},int)})
; Every argument is quoted so an empty value (e.g. withheld caller ID) cannot
; shift the script's positional parameters.
exten => h,n,System(/usr/local/sbin/makevmal.sh "${callerchan}" "${callerid}" "${origdate}" "${origtime}" "${origmailbox}" "${UNIQUEID}" "${duration}")
exten => h,n(end),Noop(finished)
  • Modified API script. Note the language and enhanced-model settings.
    • For these to work you need “data logging” enabled in the Dialogflow API settings.
#!/usr/bin/env perl

#
# Render speech to text using Google's Cloud Speech API.
#
# Copyright (C) 2011 - 2016, Lefteris Zafiris <zaf@fastmail.com>
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
# at the top of the source tree.
#
# This has been altered to work with Googles new Speech models
#

use strict;
use warnings;
use File::Temp qw(tempfile);
use Getopt::Std;
use File::Basename;
use LWP::UserAgent;
use LWP::ConnCache;
use JSON;
use MIME::Base64;

# Command-line switches, filled in by getopts() below.
my %options;
my $flac;    # not referenced anywhere in the code visible in this script
my $key;
my $url        = "https://speech.googleapis.com/v1p1beta1/speech";
my $samplerate = 16000;
my $language   = "en-US";
my $output     = "detailed";
my $results    = 1;
my $pro_filter = "false";
my $error      = 0;
# Placeholders shown in the summary when nothing was transcribed.
my $thetext = ".";
my $score = ".";
getopts('k:l:o:r:n:fhq', \%options);

VERSION_MESSAGE() if (defined $options{h} || !@ARGV);

parse_options();

# Recognition settings sent with every request. Numbers and booleans are
# coerced here so that encode_json() emits real JSON numbers/booleans rather
# than the strings that come from the command line.
my %config = (
        "encoding"        => "FLAC",
        "sampleRateHertz" => int($samplerate),
        "languageCode"    => $language,
        "profanityFilter" => ($pro_filter eq "true") ? JSON::true() : JSON::false(),
        "maxAlternatives" => int($results),
        "model"           => "phone_call",
        "useEnhanced"     => JSON::true(),
);

# One user agent for all files: verified TLS, proxy settings taken from the
# environment, connection reuse and a 60 second timeout.
my $ua = LWP::UserAgent->new(ssl_opts => {verify_hostname => 1});
$ua->agent("CLI speech recognition script");
$ua->env_proxy;
$ua->conn_cache(LWP::ConnCache->new());
$ua->timeout(60);

# send each sound file to Google and get the recognition results #
foreach my $file (@ARGV) {
        my ($filename, $dir, $ext) = fileparse($file, qr/\.[^.]*/);
        if ($ext ne ".flac" && $ext ne ".wav") {
                say_msg("Unsupported file-type: $ext");
                ++$error;
                next;
        }
        if ($ext eq ".wav") {
                # WAV input is converted to a temporary FLAC file first
                if (($file = encode_flac($file)) eq '-1') {
                        ++$error;
                        next;
                }
        }
        my $audio;
        # :raw because FLAC is binary data
        if (open(my $fh, "<:raw", $file)) {
                $audio = do { local $/; <$fh> };
                close($fh);
        } else {
                say_msg("Cant read file $file");
                ++$error;
                next;
        }
        my %audio = ( "content" => encode_base64($audio, "") );
        my %json = (
                "config" => \%config,
                "audio"  => \%audio,
        );
        my $response = $ua->post(
                "$url:recognize?key=$key",
                Content_Type => "application/json",
                Content      => encode_json(\%json),
        );
        if (!$response->is_success) {
                say_msg("Failed to get data for file: $file");
                ++$error;
                next;
        }
        if ($output eq "raw") {
                print $response->content;
                next;
        }
        # decode_json() dies on malformed input; treat that as a failed file
        my $jdata = eval { decode_json($response->content) };
        if (ref($jdata) ne 'HASH') {
                say_msg("Invalid response for file: $file");
                ++$error;
                next;
        }
        # The API returns one result per consecutive portion of the audio,
        # each with its alternatives ordered most probable first. Keep only
        # results that actually carry alternatives.
        my @portions = grep {
                ref($_) eq 'HASH'
                        && ref($_->{"alternatives"}) eq 'ARRAY'
                        && @{$_->{"alternatives"}}
        } @{ ref($jdata->{"results"}) eq 'ARRAY' ? $jdata->{"results"} : [] };

        if ($output eq "detailed") {
                # Stitch the best alternative of every portion together so
                # long messages are not cut off after the first portion.
                my (@parts, @scores);
                foreach my $portion (@portions) {
                        my $best = $portion->{"alternatives"}[0];
                        next if (!defined $best->{"transcript"});
                        (my $text = $best->{"transcript"}) =~ s/^\s+|\s+$//g;
                        push(@parts, $text);
                        push(@scores, $best->{"confidence"}) if (defined $best->{"confidence"});
                }
                $thetext = join(" ", @parts) if (@parts);
                if (@scores) {
                        # overall confidence = mean of the per-portion values
                        my $sum = 0;
                        $sum += $_ foreach (@scores);
                        $score = $sum / @scores;
                }
        } elsif ($output eq "compact") {
                foreach my $portion (@portions) {
                        foreach my $alt (@{$portion->{"alternatives"}}) {
                                print $alt->{"transcript"}."\n" if (defined $alt->{"transcript"});
                        }
                }
        }
}

# The human readable summary belongs to "detailed" output only; appending it
# to raw JSON or compact output would corrupt those formats.
if ($output eq "detailed") {
        print "\n\nThe transcription of message is below:\n\n$thetext\n\nWe are $score out of 1 sure its correct\n\nTranscribed using Googles Cloud Speech API ";
}

exit(($error) ? 1 : 0);

sub encode_flac {
# Encode a WAV file to a temporary FLAC file using the external "flac" #
# program. Returns the temporary file name, or -1 on failure. The file #
# is removed automatically when the script exits.                      #
        my ($wav) = @_;
        my ($fh, $tmpname) = tempfile(
                "recg_XXXXXX",
                TMPDIR => 1,
                SUFFIX => '.flac',
                UNLINK => 1,
        );
        close($fh);
        # list form of system(): the file names never pass through a shell
        if (system("flac", "-8", "-f", "--totally-silent", "-o", $tmpname, $wav) != 0) {
                say_msg("flac failed to encode file $wav");
                return -1;
        }
        return $tmpname;
}

sub parse_options {
# Command line options parsing #
# Copies validated values from %options into the script settings ($key,  #
# $language, $output, $results, $samplerate, $pro_filter). Exits with    #
# status 1 when no usable API key is given; any other invalid value is   #
# reported and the existing default is kept.                             #
        if (defined $options{k} && length $options{k}) {
        # check API key #
                $key = $options{k};
        } else {
                say_msg("Invalid or missing API key.\n");
                exit 1;
        }
        if (defined $options{l}) {
        # check if language setting is valid #
                if ($options{l} =~ /^[a-z]{2}(-[a-zA-Z]{2,6})?$/) {
                        $language = $options{l};
                } else {
                        say_msg("Invalid language setting. Using default.\n");
                }
        }
        if (defined $options{o}) {
        # check if output setting is valid #
                if ($options{o} =~ /^(detailed|compact|raw)$/) {
                        $output = $options{o};
                } else {
                        say_msg("Invalid output formatting setting. Using default.\n");
                }
        }
        if (defined $options{n}) {
        # set number or results: digits only, so values like "x3" are rejected #
                if ($options{n} =~ /\A\d+\z/) {
                        $results = $options{n};
                } else {
                        say_msg("Invalid number of results. Using default.\n");
                }
        }
        if (defined $options{r}) {
        # set audio sampling rate: digits only, so values like "8000Hz" are rejected #
                if ($options{r} =~ /\A\d+\z/) {
                        $samplerate = $options{r};
                } else {
                        say_msg("Invalid sample rate. Using default.\n");
                }
        }
        # set profanity filter #
        $pro_filter = "true" if (defined $options{f});

        return;
}

sub say_msg {
# Pass the given text to warn() (STDERR), unless the -q switch was given #
        return if (defined $options{q});
        warn @_;
        return;
}

sub VERSION_MESSAGE {
# Write the usage summary to STDOUT, then end the script with status 1 #
        my @usage = (
                "Speech recognition using Google Cloud Speech API.\n\n",
                "Usage: $0 [options] [file(s)]\n\n",
                "Supported options:\n",
                " -k <key>       specify the Speech API key\n",
                " -l <lang>      specify the language to use (default 'en-US')\n",
                " -o <type>      specify the type of output formatting\n",
                "    detailed    print detailed output with info like confidence (default)\n",
                "    compact     print only the transcripted string\n",
                "    raw         raw JSON output\n",
                " -r <rate>      specify the audio sample rate in Hz (default 16000)\n",
                " -n <number>    specify the maximum number of results (default 1)\n",
                " -f             filter out profanities\n",
                " -q             don't print any error messages or warnings\n",
                " -h             this help message\n\n",
        );
        print @usage;
        exit(1);
}
  • In FreePBX, create a Custom Destination of “vmail2text,s,1”. If you need certain queues to go to a specific mailbox, create another one such as “vmail2text,2000,1”, which sends calls to mailbox 2000.
  • Then, in each extension that should use transcription, set the “Optional Destinations” to the custom destination.

And that's it. Enjoy.