# Automatically reload imported modules that are changed outside this notebook
%load_ext autoreload
%autoreload 2
# More pixels in figures
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.dpi"] = 200
# Init PRNG with fixed seed for reproducibility
import numpy as np
np_rng = np.random.default_rng(1)
import tensorflow as tf
# Seed TensorFlow's global RNG from the NumPy generator so that TF ops are reproducible too
tf.random.set_seed(np_rng.integers(0, tf.int64.max))
2020-11-21
This example expands common-voice-augmenting
by implementing language vector classification.
So far, we have used the x-vector neural network as an end-to-end classifier, making classification decisions based on its log-softmax outputs.
However, it can also be used for representation learning by adding a second step after training.
Once we have found reasonably optimal weights for the network, we extract all speech data as fixed-length vectors and train a separate, back-end classifier on these vectors.
These vectors are also called embeddings.
As explained in the original x-vector paper, one benefit of this approach is that we could first train a single neural network on vast amounts of data in hundreds of languages, which can then be used as a feature extractor for producing training data for arbitrary back-end classifiers.
These back-end classifiers could be trained on any subset of languages from the larger training set.
This example uses the same data as in the common-voice-small
example.
import urllib.parse
from IPython.display import display, Markdown
# BCP-47 codes of the four Common Voice languages used in this example
languages = """
et
mn
ta
tr
""".split()
# Sort for a stable label ordering across runs
languages = sorted(l.strip() for l in languages)
display(Markdown("### Languages"))
display(Markdown('\n'.join("* `{}`".format(l) for l in languages)))
# External validator that describes each BCP-47 language code
bcp47_validator_url = 'https://schneegans.de/lv/?tags='
display(Markdown("See [this tool]({}) for a description of the BCP-47 language codes."
    .format(bcp47_validator_url + urllib.parse.quote('\n'.join(languages)))))
import os
import pandas as pd
from lidbox.meta import (
common_voice,
generate_label2target,
verify_integrity,
read_audio_durations,
random_oversampling_on_split
)
# Local working directory for caches; datadir is the read-only Common Voice source.
workdir = "/data/exp/cv4-embed"
datadir = "/mnt/data/speech/common-voice/downloads/2020/cv-corpus"
print("work dir:", workdir)
print("data source dir:", datadir)
print()
os.makedirs(workdir, exist_ok=True)
assert os.path.isdir(datadir), datadir + " does not exist"
# One subdirectory per language; sort by name for deterministic listing order.
dirs = sorted((entry for entry in os.scandir(datadir) if entry.is_dir()),
              key=lambda entry: entry.name)
print(datadir)
for lang_dir in dirs:
    if lang_dir.name not in languages:
        continue
    print(' ', lang_dir.name)
    for child in os.scandir(lang_dir):
        print(' ', child.name)
# Fail early if any requested language has no data directory.
missing_languages = set(languages) - {d.name for d in dirs}
assert missing_languages == set(), "missing languages: {}".format(missing_languages)
# Load Common Voice metadata for the selected languages and map each
# language label to an integer classification target.
meta = common_voice.load_all(datadir, languages)
meta, lang2target = generate_label2target(meta)
print("\nsize of all metadata", meta.shape)
# Drop rows that contain missing values before any further processing
meta = meta.dropna()
print("after dropping NaN rows", meta.shape)
print("verifying integrity")
verify_integrity(meta)
print("ok\n")
# Read the duration of every audio file into a new metadata column
print("reading audio durations")
meta["duration"] = read_audio_durations(meta)
# Balance the per-language sample counts by oversampling the training split;
# oversampled rows are flagged (see the "is_copy" column used later).
print("balancing the label distributions")
meta = random_oversampling_on_split(meta, "train")
work dir: /data/exp/cv4-embed data source dir: /mnt/data/speech/common-voice/downloads/2020/cv-corpus /mnt/data/speech/common-voice/downloads/2020/cv-corpus et validated.tsv invalidated.tsv other.tsv dev.tsv train.tsv clips test.tsv reported.tsv mn validated.tsv invalidated.tsv other.tsv dev.tsv train.tsv clips test.tsv reported.tsv ta validated.tsv invalidated.tsv other.tsv dev.tsv train.tsv clips test.tsv reported.tsv tr validated.tsv invalidated.tsv other.tsv dev.tsv train.tsv clips test.tsv reported.tsv size of all metadata (23842, 6) after dropping NaN rows (23842, 6) verifying integrity ok reading audio durations balancing the label distributions
from lidbox.features import audio, cmvn
import lidbox.data.steps as ds_steps
import scipy.signal
# Let tf.data tune the level of parallelism dynamically
TF_AUTOTUNE = tf.data.experimental.AUTOTUNE
def metadata_to_dataset_input(meta):
    """Convert metadata DataFrame columns into a dict of tensors for
    tf.data.Dataset.from_tensor_slices. The utterance id comes from the index."""
    columns = [
        ("id", meta.index, tf.string),
        ("path", meta.path, tf.string),
        ("label", meta.label, tf.string),
        ("target", meta.target, tf.int32),
        ("split", meta.split, tf.string),
        ("is_copy", meta.is_copy, tf.bool),
    ]
    return {key: tf.constant(values, dtype) for key, values, dtype in columns}
def read_mp3(x):
    """Decode the mp3 at x["path"], resample to 16 kHz, peak-normalize to -3 dBFS,
    and trim silence. Returns a copy of x with "signal" and "sample_rate" added."""
    target_rate = 16000
    signal, source_rate = audio.read_mp3(x["path"])
    signal = audio.resample(signal, source_rate, target_rate)
    signal = audio.peak_normalize(signal, dBFS=-3.0)
    signal = audio.remove_silence(signal, target_rate)
    return dict(x, signal=signal, sample_rate=target_rate)
def random_filter(x):
    """Augmentation: convolve the signal with a random 10-tap FIR filter,
    then peak-normalize back to -3 dBFS."""
    def _np_random_fir(signal, num_taps=10):
        # Coefficients drawn from N(0, 1); filtering runs on the CPU via SciPy.
        coeffs = np_rng.normal(0, 1, num_taps)
        filtered = scipy.signal.lfilter(coeffs, 1.0, signal).astype(np.float32)
        return filtered, coeffs
    # Output dtypes must match what _np_random_fir returns:
    # float32 filtered signal, float64 coefficients (the latter is discarded).
    filtered, _ = tf.numpy_function(
        _np_random_fir,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    filtered = tf.cast(filtered, tf.float32)
    filtered = audio.peak_normalize(filtered, dBFS=-3.0)
    return dict(x, signal=filtered)
def random_speed_change(ds):
    # Augmentation: resample signals with a speed ratio drawn uniformly from [0.9, 1.1].
    # The "is_copy" flag controls which elements are affected — presumably only the
    # oversampled copies; confirm against lidbox.data.steps documentation.
    return ds_steps.random_signal_speed_change(ds, min=0.9, max=1.1, flag="is_copy")
def batch_extract_features(x):
    """Compute mean-normalized log-Mel spectrograms for a batch of signals on the GPU."""
    with tf.device("GPU"):
        signals = x["signal"]
        # All elements share the 16 kHz rate set in read_mp3, so the first entry suffices.
        sample_rate = x["sample_rate"][0]
        spectra = audio.spectrograms(signals, sample_rate)
        mel_spectra = audio.linear_to_mel(spectra, sample_rate)
        # Small epsilon avoids log(0)
        logmel = tf.math.log(mel_spectra + 1e-6)
        # Mean-normalize only; variance is left untouched
        logmel = cmvn(logmel, normalize_variance=False)
        return dict(x, logmelspec=logmel)
def pipeline_from_meta(data, split):
    """Build the tf.data pipeline for one split: decode mp3s, augment (train/dev only),
    extract log-Mel features, and cache results under cachedir."""
    if split == "train":
        # Shuffle the metadata rows once so training examples arrive in random order.
        data = data.sample(frac=1, random_state=np_rng.bit_generator)
    decoded = (tf.data.Dataset
        .from_tensor_slices(metadata_to_dataset_input(data))
        .map(read_mp3, num_parallel_calls=TF_AUTOTUNE))
    if split == "test":
        # No augmentation for the test set: extract features and cache them directly.
        return (decoded
            .batch(1)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch()
            .cache(os.path.join(cachedir, "data", split))
            .prefetch(1000))
    # train/dev: cache the decoded signals, then apply the random augmentations
    # and feature extraction on top of the cache.
    return (decoded
        .cache(os.path.join(cachedir, "data", split))
        .prefetch(1000)
        .apply(random_speed_change)
        .map(random_filter, num_parallel_calls=TF_AUTOTUNE)
        .batch(1)
        .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
        .unbatch())
# Cache directory for extracted features.
cachedir = os.path.join(workdir, "cache")
# exist_ok=True so the notebook can be re-run without failing on an existing
# directory (consistent with the workdir creation earlier in this notebook).
os.makedirs(os.path.join(cachedir, "data"), exist_ok=True)
# One feature-extraction pipeline per dataset split (train/dev/test).
split2ds = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}
2020-11-21 22:41:48.452 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.900, 1.100] 2020-11-21 22:41:48.659 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.900, 1.100]
# Iterate every dataset once so that all extracted features are written into the caches.
for split, ds in split2ds.items():
    print("filling", split, "cache")
    _ = ds_steps.consume(ds, log_interval=2000)
filling test cache 2020-11-21 22:41:48.782 I lidbox.data.steps: Exhausting the dataset iterator by iterating over all elements, log_interval = 2000 2020-11-21 22:42:03.744 I lidbox.data.steps: 2000 done, 133.685 elements per second. 2020-11-21 22:42:16.297 I lidbox.data.steps: 4000 done, 159.326 elements per second. 2020-11-21 22:42:26.627 I lidbox.data.steps: 6000 done, 193.626 elements per second. 2020-11-21 22:42:35.177 I lidbox.data.steps: 7569 done, 183.536 elements per second. filling train cache 2020-11-21 22:42:35.180 I lidbox.data.steps: Exhausting the dataset iterator by iterating over all elements, log_interval = 2000 2020-11-21 22:42:52.866 I lidbox.data.steps: 2000 done, 113.084 elements per second. 2020-11-21 22:43:05.246 I lidbox.data.steps: 4000 done, 161.561 elements per second. 2020-11-21 22:43:17.708 I lidbox.data.steps: 6000 done, 160.505 elements per second. 2020-11-21 22:43:30.061 I lidbox.data.steps: 8000 done, 161.917 elements per second. 2020-11-21 22:43:40.296 I lidbox.data.steps: 10000 done, 195.420 elements per second. 2020-11-21 22:43:54.022 I lidbox.data.steps: 12000 done, 145.721 elements per second. 2020-11-21 22:44:06.270 I lidbox.data.steps: 14000 done, 163.292 elements per second. 2020-11-21 22:44:17.324 I lidbox.data.steps: 16000 done, 180.954 elements per second. 2020-11-21 22:44:18.804 I lidbox.data.steps: 16728 done, 492.213 elements per second. filling dev cache 2020-11-21 22:44:18.805 I lidbox.data.steps: Exhausting the dataset iterator by iterating over all elements, log_interval = 2000 2020-11-21 22:44:40.290 I lidbox.data.steps: 2000 done, 93.092 elements per second. 2020-11-21 22:44:51.641 I lidbox.data.steps: 4000 done, 176.203 elements per second. 2020-11-21 22:45:01.042 I lidbox.data.steps: 6000 done, 212.746 elements per second. 2020-11-21 22:45:04.535 I lidbox.data.steps: 7451 done, 415.563 elements per second.
We already have a trained instance of the x-vector model from common-voice-augmenting
so we can skip training the model.
from lidbox.models import xvector
previous_cachedir = "/data/exp/cv4-augment/cache"
def load_trained_model(num_freq_bins=40, num_labels=len(lang2target)):
    """Recreate the x-vector model and restore its trained weights from the
    previous experiment's cache directory.

    NOTE: num_labels default is evaluated once at definition time from lang2target.
    """
    m = xvector.create(
        input_shape=[None, num_freq_bins],
        num_outputs=num_labels,
        channel_dropout_rate=0.8)
    m.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5))
    # Only the model weights are needed for inference; expect_partial() silences
    # the "Unresolved object in checkpoint: (root).optimizer.*" warnings about
    # optimizer slots that are deliberately left unrestored.
    _ = m.load_weights(os.path.join(previous_cachedir, "model", m.name)).expect_partial()
    return m
# Sanity check: restore the trained x-vector model and print its architecture.
model = load_trained_model()
model.summary()
Model: "x-vector" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input (InputLayer) [(None, None, 40)] 0 _________________________________________________________________ channel_dropout (SpatialDrop (None, None, 40) 0 _________________________________________________________________ frame1 (Conv1D) (None, None, 512) 102912 _________________________________________________________________ frame2 (Conv1D) (None, None, 512) 786944 _________________________________________________________________ frame3 (Conv1D) (None, None, 512) 786944 _________________________________________________________________ frame4 (Conv1D) (None, None, 512) 262656 _________________________________________________________________ frame5 (Conv1D) (None, None, 1500) 769500 _________________________________________________________________ stats_pooling (GlobalMeanStd (None, 3000) 0 _________________________________________________________________ segment1 (Dense) (None, 512) 1536512 _________________________________________________________________ segment2 (Dense) (None, 512) 262656 _________________________________________________________________ outputs (Dense) (None, 4) 2052 _________________________________________________________________ log_softmax (Activation) (None, 4) 0 ================================================================= Total params: 4,510,176 Trainable params: 4,510,176 Non-trainable params: 0 _________________________________________________________________
import pandas as pd
from lidbox.util import evaluate_testset_with_model
from lidbox.visualize import draw_confusion_matrix
def display_classification_report(report):
    """Print aggregate metrics, display a per-language metric table (with row means),
    and draw the confusion matrix from a lidbox classification report dict."""
    for metric in ("avg_detection_cost", "avg_equal_error_rate", "accuracy"):
        print("{}: {:.3f}".format(metric, report[metric]))
    # Keys of the report that are language codes hold the per-language metrics.
    per_language = {lang: metrics for lang, metrics in report.items() if lang in lang2target}
    lang_metrics = pd.DataFrame.from_dict(per_language)
    lang_metrics["mean"] = lang_metrics.mean(axis=1)
    display(lang_metrics.T)
    fig, ax = draw_confusion_matrix(report["confusion_matrix"], lang2target)
# Baseline: evaluate the end-to-end x-vector classifier on the test set
# before switching to back-end classification on embeddings.
report = evaluate_testset_with_model(
    model=load_trained_model(),
    # The evaluation expects the log-Mel spectrogram under the key "input".
    test_ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(1),
    test_meta=meta[meta["split"]=="test"],
    lang2target=lang2target)
display_classification_report(report)
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.iter
2020-11-21 22:45:18.860 W tensorflow: Unresolved object in checkpoint: (root).optimizer.iter
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.beta_1
2020-11-21 22:45:18.861 W tensorflow: Unresolved object in checkpoint: (root).optimizer.beta_1
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.beta_2
2020-11-21 22:45:18.862 W tensorflow: Unresolved object in checkpoint: (root).optimizer.beta_2
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.decay
2020-11-21 22:45:18.863 W tensorflow: Unresolved object in checkpoint: (root).optimizer.decay
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.learning_rate
2020-11-21 22:45:18.864 W tensorflow: Unresolved object in checkpoint: (root).optimizer.learning_rate
WARNING:tensorflow:A checkpoint was restored (e.g. tf.train.Checkpoint.restore or tf.keras.Model.load_weights) but not all checkpointed values were used. See above for specific issues. Use expect_partial() on the load status object, e.g. tf.train.Checkpoint.restore(...).expect_partial(), to silence these warnings, or use assert_consumed() to make the check explicit. See https://www.tensorflow.org/guide/checkpoint#loading_mechanics for details.
2020-11-21 22:45:18.865 W tensorflow: A checkpoint was restored (e.g. tf.train.Checkpoint.restore or tf.keras.Model.load_weights) but not all checkpointed values were used. See above for specific issues. Use expect_partial() on the load status object, e.g. tf.train.Checkpoint.restore(...).expect_partial(), to silence these warnings, or use assert_consumed() to make the check explicit. See https://www.tensorflow.org/guide/checkpoint#loading_mechanics for details.
avg_detection_cost: 0.093 avg_equal_error_rate: 0.085 accuracy: 0.848
precision | recall | f1-score | support | equal_error_rate | |
---|---|---|---|---|---|
et | 0.925676 | 0.882803 | 0.903731 | 2483.00 | 0.072945 |
mn | 0.937107 | 0.740884 | 0.827522 | 1810.00 | 0.083001 |
ta | 0.827945 | 0.922466 | 0.872654 | 1638.00 | 0.061204 |
tr | 0.707969 | 0.840659 | 0.768630 | 1638.00 | 0.121733 |
mean | 0.849674 | 0.846703 | 0.843134 | 1892.25 | 0.084721 |
In previous examples we stopped here, but this time we'll make use of the internal representation our neural network has learned.
As described in the x-vector paper, the language vectors should be extracted from the first fully connected layer, without activations.
Let's create a new feature extractor model that uses the same inputs as the trained x-vector model, but uses the segment1
layer as its output layer.
We also freeze the model by converting it into a tf.function
.
from lidbox.util import model2function
model = load_trained_model()
# Use the first fully connected segment layer as the embedding output.
# Clearing its activation yields the pre-activation x-vector, as in the x-vector paper.
xvec_layer = model.get_layer(name="segment1")
xvec_layer.activation = None
# Freeze the truncated model into a tf.function for embedding extraction.
xvec_extractor = model2function(
    tf.keras.Model(inputs=model.inputs, outputs=xvec_layer.output))
print("extractor:", str(xvec_extractor))
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.iter
2020-11-21 22:45:19.086 W tensorflow: Unresolved object in checkpoint: (root).optimizer.iter
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.beta_1
2020-11-21 22:45:19.088 W tensorflow: Unresolved object in checkpoint: (root).optimizer.beta_1
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.beta_2
2020-11-21 22:45:19.088 W tensorflow: Unresolved object in checkpoint: (root).optimizer.beta_2
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.decay
2020-11-21 22:45:19.089 W tensorflow: Unresolved object in checkpoint: (root).optimizer.decay
WARNING:tensorflow:Unresolved object in checkpoint: (root).optimizer.learning_rate
2020-11-21 22:45:19.090 W tensorflow: Unresolved object in checkpoint: (root).optimizer.learning_rate
WARNING:tensorflow:A checkpoint was restored (e.g. tf.train.Checkpoint.restore or tf.keras.Model.load_weights) but not all checkpointed values were used. See above for specific issues. Use expect_partial() on the load status object, e.g. tf.train.Checkpoint.restore(...).expect_partial(), to silence these warnings, or use assert_consumed() to make the check explicit. See https://www.tensorflow.org/guide/checkpoint#loading_mechanics for details.
2020-11-21 22:45:19.090 W tensorflow: A checkpoint was restored (e.g. tf.train.Checkpoint.restore or tf.keras.Model.load_weights) but not all checkpointed values were used. See above for specific issues. Use expect_partial() on the load status object, e.g. tf.train.Checkpoint.restore(...).expect_partial(), to silence these warnings, or use assert_consumed() to make the check explicit. See https://www.tensorflow.org/guide/checkpoint#loading_mechanics for details.
extractor: ConcreteFunction <lambda>(x) Args: x: float32 Tensor, shape=(None, None, 40) Returns: float32 Tensor, shape=(None, 512)
from lidbox.visualize import plot_embedding_vector
def is_not_copy(x):
    """Predicate for Dataset.filter: keep only elements that are not oversampled copies."""
    is_copy = x["is_copy"]
    return not is_copy
def batch_extract_embeddings(x):
    # Run the frozen extractor on the GPU; the embedding has shape (batch, 512).
    with tf.device("GPU"):
        return dict(x, embedding=xvec_extractor(x["logmelspec"]))
# Demo: extract embeddings for 12 non-augmented training samples and visualize each vector.
embedding_demo_ds = (split2ds["train"]
    .filter(is_not_copy)
    .take(12)
    .batch(1)
    .map(batch_extract_embeddings)
    .unbatch())
for x in embedding_demo_ds.as_numpy_iterator():
    print(x["id"].decode("utf-8"), x["embedding"].shape)
    plot_embedding_vector(x["embedding"], figsize=(10, 0.2))
common_voice_mn_18589626 (512,)
common_voice_ta_19171903 (512,)
common_voice_et_20800798 (512,)
common_voice_tr_17348534 (512,)
common_voice_mn_18603608 (512,)
common_voice_mn_18593842 (512,)
common_voice_mn_18596637 (512,)
common_voice_mn_18909588 (512,)
common_voice_et_20838416 (512,)
common_voice_ta_19171901 (512,)
common_voice_mn_18586690 (512,)
common_voice_mn_18725186 (512,)
Let's extend our existing tf.data.Dataset
feature extraction pipelines by appending a step that extracts language vectors (embeddings) with the trained model.
We can add all embeddings into our metadata table, under a column called embedding
in order to keep everything neatly in one location.
def ds_to_embeddings(ds):
    """Extract (utterance id, embedding) pairs from a dataset and collect them
    into a DataFrame indexed by the utterance id.

    Raises ValueError (from verify_integrity) if the ids are not unique.
    """
    to_pair = lambda x: (x["id"], x["embedding"])
    ds = (ds
        .batch(1)
        .map(batch_extract_embeddings, num_parallel_calls=TF_AUTOTUNE)
        .unbatch()
        .map(to_pair, num_parallel_calls=TF_AUTOTUNE))
    ids = []
    embeddings = []
    # "utt_id" rather than "id" so we don't shadow the builtin id().
    for utt_id, embedding in ds.as_numpy_iterator():
        ids.append(utt_id.decode("utf-8"))
        embeddings.append(embedding.astype(np.float32))
    df = pd.DataFrame.from_dict({"id": ids, "embedding": embeddings})
    return df.set_index("id", drop=True, verify_integrity=True)
# Extract embeddings for every split lazily and attach them all to the metadata table.
embeddings_by_split = (ds_to_embeddings(ds) for ds in split2ds.values())
meta = meta.join(pd.concat(embeddings_by_split, verify_integrity=True), how="outer")
# Every metadata row must have received an embedding.
assert not meta.embedding.isna().any(axis=None), "Missing embeddings, some rows contained NaN values"
Now, let's extract all embeddings and integer targets into NumPy-data and preprocess them with scikit-learn.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from lidbox.embed.sklearn_utils import PLDA
def embeddings_as_numpy_data(df):
    """Return (X, y): embeddings stacked into a float32 matrix of shape (N, dim)
    and the integer targets as an int32 vector of shape (N,)."""
    X = np.vstack(df.embedding.to_numpy()).astype(np.float32)
    y = np.asarray(df.target, dtype=np.int32)
    return X, y
def random_sample(X, y, sample_size_ratio):
    """Return a uniform random subset (without replacement) of
    int(sample_size_ratio * len(X)) rows from X and the matching entries of y."""
    num_rows = X.shape[0]
    num_sampled = int(sample_size_ratio*num_rows)
    chosen = np_rng.choice(np.arange(num_rows), size=num_sampled, replace=False)
    return X[chosen], y[chosen]
def pca_3d_scatterplot_by_label(data, targets, split_name):
    """Draw a 3D scatter plot of (already 3D-reduced) vectors, one color per language.

    data: array of shape (N, 3); targets: integer language targets of length N;
    split_name: label used in the plot title.
    """
    # Invert lang2target to map integer targets back to language codes.
    target2lang = {t: l for l, t in lang2target.items()}
    df = pd.DataFrame.from_dict({
        "x": data[:,0],
        "y": data[:,1],
        "z": data[:,2],
        "lang": [target2lang[t] for t in targets],
    })
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')
    # One scatter call per language so each gets its own color and legend entry.
    for lang, g in df.groupby("lang"):
        ax.scatter(g.x, g.y, g.z, label=lang)
    ax.legend()
    ax.set_title("3D PCA scatter plot of {} set language vectors".format(split_name))
    plt.show()
# Convert embeddings and targets to NumPy arrays, split by train/test.
train_X, train_y = embeddings_as_numpy_data(meta[meta["split"]=="train"])
print("training vectors", train_X.shape, train_y.shape)
test_X, test_y = embeddings_as_numpy_data(meta[meta["split"]=="test"])
print("test vectors", test_X.shape, test_y.shape)
# Standardize all vectors using training set statistics
scaler = StandardScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)
# Reduce dimensions
pre_shape = train_X.shape
# Supervised dimensionality reduction with PLDA (512 -> 3 here, see printed output)
plda = PLDA()
plda.fit(train_X, train_y)
train_X = plda.transform(train_X)
test_X = plda.transform(test_X)
print("PLDA reduced dimensions from {} to {}".format(pre_shape, train_X.shape))
# L2-normalize vectors to surface of a unit sphere
train_X = normalize(train_X)
test_X = normalize(test_X)
# Map vectors to 3D with PCA and plot scatterplots of 10% random samples
pca = PCA(n_components=3, whiten=False)
pca.fit(train_X)
X, y = random_sample(pca.transform(train_X), train_y, 0.1)
pca_3d_scatterplot_by_label(X, y, "training")
X, y = random_sample(pca.transform(test_X), test_y, 0.1)
pca_3d_scatterplot_by_label(X, y, "test")
training vectors (16728, 512) (16728,) test vectors (7569, 512) (7569,) PLDA reduced dimensions from (16728, 512) to (16728, 3)
Finally, we train a classifier on the training set vectors and predict some language scores on the test set vectors, from which we compute all metrics as before.
from sklearn.naive_bayes import GaussianNB
from lidbox.util import classification_report
# Fit classifier
clf = GaussianNB()
clf.fit(train_X, train_y)
# Predict scores on test set with classifier and compute metrics
test_pred = clf.predict_log_proba(test_X)
# Clamp -infs to -100
# (log-probabilities of exactly 0 would otherwise appear as -inf in the scores;
# presumably clamped to keep downstream metric computations finite)
test_pred = np.maximum(-100, test_pred)
report = classification_report(test_y, test_pred, lang2target)
display_classification_report(report)
avg_detection_cost: 0.110 avg_equal_error_rate: 0.096 accuracy: 0.825
precision | recall | f1-score | support | equal_error_rate | |
---|---|---|---|---|---|
et | 0.923465 | 0.884414 | 0.903518 | 2483.00 | 0.071766 |
mn | 0.918353 | 0.727072 | 0.811594 | 1810.00 | 0.108005 |
ta | 0.889579 | 0.786935 | 0.835115 | 1638.00 | 0.073343 |
tr | 0.624513 | 0.880342 | 0.730682 | 1638.00 | 0.131681 |
mean | 0.838977 | 0.819691 | 0.820227 | 1892.25 | 0.096199 |
We were unable to improve our classification results by training a separate back-end classifier on the internal representation of the x-vector neural network. However, this technique can be useful if you have a pre-trained neural network and want to train a classifier on new data.