メルスペクトログラムがPythonでどう扱われるか確認する

cf. スペクトル、スペクトログラムって何だろう？（音声、音の初心者向け）

スペクトログラムとして表現するのは人が理解しやすいため

人の理解の仕方を機械で再現するために行う?

この辺りは、信号処理の知識も含めて自分の理解が不足している

メルスペクトログラムは行列として扱われているみたい

ページ下部のコード群が該当

二次元の行列で、

各次元がそれぞれ（周波数帯の数, 時間フレーム数）に対応

各値は信号の強さを表現

実際のコード（sed_trainer_pretrained.py）ではどう扱われている?

生の波形をメルスペクトログラムに変換

code: mel

# Front end that turns a raw waveform into a mel spectrogram.
# Every numeric setting is read from the ``feat_params`` mapping.
# NOTE(review): MelSpectrogram is presumably torchaudio.transforms.MelSpectrogram
# -- confirm against the imports of the original file.
self.mel_spec = MelSpectrogram(
    sample_rate=feat_params["sample_rate"],
    # The FFT size and the window length share the same "n_window" setting.
    n_fft=feat_params["n_window"],
    win_length=feat_params["n_window"],
    hop_length=feat_params["hop_length"],
    f_min=feat_params["f_min"],
    f_max=feat_params["f_max"],
    n_mels=feat_params["n_mels"],
    # Hamming window, built with periodic=False via the window kwargs.
    window_fn=torch.hamming_window,
    wkwargs={"periodic": False},
    # power=1 pairs with AmplitudeToDB(stype="amplitude") used by take_log.
    power=1,
)

対数変換と正規化

take_logとscaler

code: python

# sed_trainer_pretrained.py L.374-380

def detect(self, mel_feats, model, embeddings=None, **kwargs):
    """Run ``model`` on log-scaled, normalised mel features.

    The features go through ``self.take_log`` and then ``self.scaler``
    before being handed to the model.

    Args:
        mel_feats: mel spectrogram features to feed to the model.
        model: callable invoked with the processed features.
        embeddings: optional extra input; forwarded to ``model`` as the
            ``embeddings`` keyword only when it is not None.
        **kwargs: additional keyword arguments forwarded to ``model``.

    Returns:
        Whatever ``model`` returns.
    """
    # Both branches need the same preprocessing, so compute it once.
    feats = self.scaler(self.take_log(mel_feats))
    if embeddings is None:
        return model(feats, **kwargs)
    return model(feats, embeddings=embeddings, **kwargs)

code: take_log.py

def take_log(self, mels):
    """Apply the log transformation to mel spectrograms.

    Args:
        mels: torch.Tensor, mel spectrograms for which to apply log.

    Returns:
        Tensor: logarithmic mel spectrogram of the mel spectrogram given
        as input, clamped to the range [-50, 80].
    """
    amp_to_db = AmplitudeToDB(stype="amplitude")
    # Override the floor after construction: amin=1e-5 as in librosa.
    amp_to_db.amin = 1e-5
    # Clamp to reproduce old code.
    return amp_to_db(mels).clamp(min=-50, max=80)

code: scaler.py

def _init_scaler(self):

"""Scaler inizialization

Raises:

NotImplementedError: in case of not Implemented scaler

Returns:

TorchScaler: returns the scaler

"""

if self.hparams"scaler""statistic" == "instance":

scaler = TorchScaler(

"instance",

self.hparams"scaler""normtype",

self.hparams"scaler""dims",

)

return scaler

elif self.hparams"scaler""statistic" == "dataset":

# we fit the scaler

scaler = TorchScaler(

"dataset",

self.hparams"scaler""normtype",

self.hparams"scaler""dims",

)

else:

raise NotImplementedError

if self.hparams"scaler""savepath" is not None:

if os.path.exists(self.hparams"scaler""savepath"):

scaler = torch.load(self.hparams"scaler""savepath")

print(

"Loaded Scaler from previous checkpoint from {}".format(

self.hparams"scaler""savepath"

)

return scaler

self.train_loader = self.train_dataloader()

scaler.fit(

self.train_loader,

transform_func=lambda x: self.take_log(self.mel_spec(x0)),

)

if self.hparams"scaler""savepath" is not None:

torch.save(scaler, self.hparams"scaler""savepath")

print(

"Saving Scaler from previous checkpoint at {}".format(

self.hparams"scaler""savepath"

)

return scaler

code: python

import librosa

import numpy as np

import matplotlib.pyplot as plt

import librosa.display

# 1. Generate a dummy audio signal (a 440 Hz sine wave lasting one second).
sr = 22050  # sampling rate
duration = 1.0
frequency = 440.0
# NOTE: np.linspace includes the endpoint by default, so the sample spacing
# here is duration / (N - 1) rather than exactly 1 / sr.
y = np.sin(2 * np.pi * frequency * np.linspace(0, duration, int(sr * duration)))

# 2. Compute the mel spectrogram.
# n_mels: number of mel frequency bands (vertical resolution of the image)
# fmax: highest frequency to analyse
mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)

# 3. Inspect the type, shape and a few values.
print("メルスペクトログラムのデータ型:", type(mel_spectrogram))
print("形状 (周波数帯の数, 時間フレーム数):", mel_spectrogram.shape)
print("\n--- 中身の一部 (左上の5x5) ---")
# Convert power to decibels so the values are easier to read.
S_dB = librosa.power_to_db(mel_spectrogram, ref=np.max)
# Top-left 5x5 corner: first five mel bands by first five time frames.
print(S_dB[:5, :5])

# 4. Visualise the result.
fig, ax = plt.subplots()
img = librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', fmax=8000, ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB', label='Intensity (dB)')
ax.set(title='Mel-frequency spectrogram')
plt.show()

code: result

メルスペクトログラムのデータ型: <class 'numpy.ndarray'>

形状 (周波数帯の数, 時間フレーム数): (128, 44)

--- 中身の一部 (左上の5x5) ---

[[-33.26704372 -39.28939481 -80.         -80.         -80.        ]

 [-33.19220112 -39.21592788 -80.         -80.         -80.        ]

 [-33.06758116 -39.09083289 -80.         -80.         -80.        ]

 [-32.89105294 -38.91372578 -80.         -80.         -80.        ]

 [-32.6291356  -38.65397701 -80.         -80.         -80.        ]]

https://gyazo.com/fbc0142f422a45e544fb042ad7c466d9

#Python #inbox