
GMM on MFCC

Initial evaluation version.


Prepare 30 seconds of audio data for each speaker.
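If a source recording is longer than 30 seconds, one way to cut the training segment at load time (a minimal sketch: the input name raw_hozumi.wav is hypothetical, and the soundfile package is assumed to be installed):

import librosa
import soundfile as sf

# Minimal sketch: keep only the first 30 seconds of a longer recording.
# "raw_hozumi.wav" is a hypothetical input file name.
audio, rate = librosa.load("raw_hozumi.wav", sr=None, duration=30.0)
sf.write("hozumi.wav", audio, rate)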

import pickle
import warnings

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import python_speech_features as mfcc
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture  # GMM was removed in scikit-learn 0.20

warnings.filterwarnings("ignore")

def wav2mfcc(path):
    # librosa resamples to 22,050 Hz by default.
    audio, rate = librosa.load(path)
    # 20 MFCCs per frame: 25 ms window, 10 ms step, with the 0th
    # coefficient replaced by log frame energy.
    mfcc_data = mfcc.mfcc(audio, rate, winlen=0.025, winstep=0.01,
                          numcep=20, appendEnergy=True)
    # Standardize each coefficient to zero mean and unit variance.
    mfcc_data = preprocessing.scale(mfcc_data)
    return (mfcc_data, rate)

def drawGraph(data, rate):
    # specshow expects (features, frames), so transpose the MFCC matrix.
    librosa.display.specshow(data.T, sr=rate, x_axis='time')
    plt.title('MFCC')
    plt.colorbar()
    plt.tight_layout()
    plt.show()

names = ["hozumi", "okuyama", "takeo", "takano"]

# Train one diagonal-covariance GMM per speaker and save it to disk.
for name in names:
    (mfcc_data, rate) = wav2mfcc(name + ".wav")
    # GaussianMixture replaces the old GMM class; n_iter is now max_iter.
    gmm = GaussianMixture(n_components=16, max_iter=200,
                          covariance_type='diag', n_init=3)
    gmm.fit(mfcc_data)
    with open(name + ".gmm", 'wb') as f:
        pickle.dump(gmm, f)

# Load the trained models back from disk.
models = []
for name in names:
    with open(name + ".gmm", 'rb') as f:
        models.append(pickle.load(f))

# Score the test utterance against every speaker model; the model with
# the highest total log-likelihood wins.
(test_data, rate) = wav2mfcc("hozumi_test.wav")
total_scores = np.zeros(len(names))

for i, gmm in enumerate(models):
    # score_samples returns per-frame log-likelihoods
    # (GaussianMixture.score returns only their mean).
    total_scores[i] = gmm.score_samples(test_data).sum()

speaker = np.argmax(total_scores)
print("Speaker is", names[speaker])
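As a quick sanity check, the same scoring loop can be run over one held-out test file per speaker and tallied into an accuracy count. A minimal sketch, assuming a "<name>_test.wav" recording exists for every entry in names (only hozumi_test.wav appears above):

# Minimal sketch: classify each speaker's held-out test file and count hits.
# Assumes "<name>_test.wav" exists for every name in `names`.
correct = 0
for true_name in names:
    test_data, _ = wav2mfcc(true_name + "_test.wav")
    scores = [gmm.score_samples(test_data).sum() for gmm in models]
    if names[int(np.argmax(scores))] == true_name:
        correct += 1
print("Accuracy:", correct, "/", len(names))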