Voice Detection (Beta)

This service takes an audio stream as input and tries to distinguish human voice from everything else. If you want to detect noise in general, rather than human voice specifically, use Sound Detection instead.

Getting Started

Using the Angus python SDK:

# -*- coding: utf-8 -*-
import angus.client.cloud

conn = angus.client.connect()

service = conn.services.get_service('voice_detection', version=1)

# Open the WAV file in a context manager so the handle is always closed,
# even if the request fails. The result is read inside the block so the
# file is still open for as long as the job may need it.
with open("./sound.wav", 'rb') as sound:
    job = service.process({'sound': sound, 'sensitivity': 0.7})

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(job.result)

Input

{'sound' : file,
 'sensitivity' : 0.3}
  • sound : a Python file object, as returned for example by open(), or a StringIO buffer, containing a WAV file in the following format: PCM 16-bit, mono, with no constraint on the sample rate.
  • sensitivity : a value in the range [0, 1] that controls the ability of the algorithm to detect quiet voices. The higher the value, the better the algorithm detects quiet voices, but the more sensitive it becomes to background noise.

Output

Events will be pushed to your client in the following format:

{
  "voice_activity" : "SILENCE"
}
  • voice_activity : this field takes one of 4 values: SILENCE when no voice is detected, VOICE when voice is detected, ON when a transition occurs from SILENCE to VOICE, and OFF when a transition occurs from VOICE to SILENCE.

Code Sample

requirements: PyAudio, numpy

This code sample retrieves the audio stream of a webcam and displays the result of the voice_detection service.

# -*- coding: utf-8 -*-
import Queue
import StringIO
import wave
import time
import angus.client
import pyaudio
import sys
import numpy as np


# Number of frames requested from PyAudio per buffer (passed as
# frames_per_buffer when the input stream is opened).
CHUNK = 8192
# Sample format used when opening the PyAudio stream: signed 16-bit integers.
PYAUDIO_FORMAT = pyaudio.paInt16
# numpy dtype used to decode the captured buffers; 16-bit to match
# PYAUDIO_FORMAT.
NUMPY_FORMAT = np.int16

def list_inputs():
    """Print the index and name of every audio device that can record.

    A device is listed when it reports at least one input channel
    (``maxInputChannels > 0``).
    """
    p = pyaudio.PyAudio()
    try:
        for i in range(p.get_device_count()):
            info = p.get_device_info_by_index(i)
            if info['maxInputChannels'] > 0:
                print("Device index={} name={}".format(info['index'], info['name']))
    finally:
        # Release the PortAudio resources held by this PyAudio instance;
        # the caller creates its own instance afterwards.
        p.terminate()

def prepare(in_data, channels, rate):
    """Convert a raw interleaved capture buffer to mono samples at 16 kHz.

    :param in_data: raw buffer of interleaved samples encoded as NUMPY_FORMAT.
    :param channels: number of interleaved channels in ``in_data``.
    :param rate: sample rate of ``in_data`` in Hz.
    :return: 1-D NUMPY_FORMAT array holding the first channel, resampled to
        16000 Hz by linear interpolation.
    :raises ValueError: if the number of samples in ``in_data`` is not a
        multiple of ``channels``.
    """
    # Extract first channel.
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    samples = np.frombuffer(in_data, dtype=NUMPY_FORMAT)
    # Let numpy infer the frame count (-1) so that a buffer shorter or longer
    # than the nominal chunk size is still handled.
    samples = samples.reshape(-1, channels)[:, 0]

    # Down sample if needed: evaluate the signal on a grid whose step is
    # rate / 16000 source samples (a step of 1.0 leaves the data unchanged).
    srcx = np.arange(0, samples.size, 1)
    tgtx = np.arange(0, samples.size, float(rate) / float(16000))

    return np.interp(tgtx, srcx, samples).astype(NUMPY_FORMAT)


def main(stream_index):
    """Stream audio from an input device to the voice_detection service.

    Opens the PyAudio input device ``stream_index``, converts every captured
    buffer with ``prepare``, wraps it in a mono 16 kHz WAV container and
    sends it to the Angus ``voice_detection`` service. A banner is printed
    each time the service answers ``VOICE``. The loop runs until the process
    is interrupted; the audio stream and PyAudio are released on the way out.

    :param stream_index: PyAudio index of the input device to record from.
    :raises RuntimeError: if the device has no input channel or if its
        default sample rate is below 16000 Hz.
    """
    p = pyaudio.PyAudio()
    stream = None
    try:
        # Device configuration
        conf = p.get_device_info_by_index(stream_index)
        channels = int(conf['maxInputChannels'])
        if channels < 1:
            raise RuntimeError("Bad device, no input channel")

        rate = int(conf['defaultSampleRate'])
        if rate < 16000:
            raise RuntimeError("Bad device, sample rate is too low")

        # Angus
        conn = angus.client.connect()
        service = conn.services.get_service('voice_detection', version=1)
        service.enable_session()

        # Record process: the PyAudio callback hands converted buffers to
        # the sending loop below through a queue.
        stream_queue = Queue.Queue()

        def chunk_callback(in_data, frame_count, time_info, status):
            in_data = prepare(in_data, channels, rate)
            # tobytes() is the supported name of the deprecated tostring().
            stream_queue.put(in_data.tobytes())
            return (in_data, pyaudio.paContinue)

        stream = p.open(format=PYAUDIO_FORMAT,
                        channels=channels,
                        rate=rate,
                        input=True,
                        frames_per_buffer=CHUNK,
                        input_device_index=stream_index,
                        stream_callback=chunk_callback)
        stream.start_stream()

        # Loop invariant: the sample width never changes.
        sample_width = p.get_sample_size(PYAUDIO_FORMAT)

        # Get data and send to Angus.ai
        while True:
            try:
                # A short timeout keeps the wait interruptible (Ctrl-C).
                data = stream_queue.get(timeout=0.01)
            except Queue.Empty:
                continue

            # Wrap the raw samples in an in-memory WAV file.
            buff = StringIO.StringIO()
            wf = wave.open(buff, 'wb')
            try:
                wf.setnchannels(1)
                wf.setsampwidth(sample_width)
                wf.setframerate(16000)
                wf.writeframes(data)
            finally:
                wf.close()

            job = service.process(
                {'sound': StringIO.StringIO(buff.getvalue()), 'sensitivity': 0.2})

            res = job.result["voice_activity"]

            if res == "VOICE":
                print("\033[A                                             \033[A")
                print("***************************")
                print("*****   VOICE !!!!   ******")
                print("***************************")
    finally:
        # Runs on any exit from the loop (error or interruption), unlike
        # cleanup code placed after an infinite loop.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()


if __name__ == "__main__":
    # Take the device index from the command line, or list the available
    # input devices and ask for one interactively.
    if len(sys.argv) < 2:
        list_inputs()
        index = raw_input("Please select a device number:")
    else:
        index = sys.argv[1]

    # Only the conversion is guarded: a ValueError raised later while
    # streaming must not be reported as an invalid index.
    try:
        index = int(index)
    except ValueError:
        print("Not a valid index")
        sys.exit(1)

    main(index)