Dive into Weight Initialization
In short, good initialization lets the signal propagate all the way through a deep network.

Important: if the weights in a network start too small, the signal shrinks as it passes through each layer until it is too tiny to be useful; if they start too large, the signal grows with each layer until it is too massive to be useful.
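Before reaching for Keras, here is a minimal NumPy sketch of the effect (an illustration assuming plain linear layers, not part of the experiments below). Each matrix multiply scales the signal's standard deviation by roughly weight_std * sqrt(fan_in), so a scale that is even slightly off compounds exponentially with depth.

import numpy as np

rng = np.random.default_rng(0)
for scale in [0.01, 1.0]:
    h = rng.standard_normal((100, 100))  # unit-variance input signal
    print(f"weight std = {scale}")
    for layer in range(5):
        W = rng.standard_normal((100, 100)) * scale
        h = h @ W  # each layer multiplies the std by about scale * sqrt(100)
        print(f"  after layer {layer}: signal std = {h.std():.3g}")

With scale 0.01 the std drops by about 10x per layer (vanishing); with scale 1.0 it grows by about 10x per layer (exploding). The experiments below reproduce this in Keras, with sigmoid and ReLU nonlinearities in the path.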
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import initializers
First experiment: weights drawn from a normal distribution with standard deviation 1, feeding sigmoid units. With 100 inputs per layer, the pre-activations have a standard deviation of about 10, so the sigmoids saturate and the outputs pile up at 0 and 1.

# Five sigmoid layers with weights ~ N(0, 1): far too large for fan_in = 100.
init = initializers.RandomNormal(mean=0.0, stddev=1.0)
inputs = keras.Input(shape=(100,))
x1 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x4)
model_1 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_1.predict(x)

# Histogram of each layer's activations.
fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten())):
    ax.hist(o.flatten(), bins=30, range=[0, 1])
    ax.set_title(f"layer-{i}")
Second experiment: the same stack, but with standard deviation 0.01. Now the pre-activations are tiny (std about 0.1 at the first layer, shrinking from there), so every sigmoid output collapses onto 0.5 and the deeper layers carry almost no information.

# Five sigmoid layers with weights ~ N(0, 0.01^2): far too small.
init = initializers.RandomNormal(mean=0.0, stddev=0.01)
inputs = keras.Input(shape=(100,))
x1 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x4)
model_2 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_2.predict(x)

# Histogram of each layer's activations.
fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten())):
    ax.hist(o.flatten(), bins=30, range=[0, 1])
    ax.set_title(f"layer-{i}")
Glorot (Xavier) initialization picks the scale automatically: stddev ≈ sqrt(2 / (fan_in + fan_out)), which is 0.1 here, chosen so the activation variance stays roughly constant from layer to layer. The sigmoid outputs now keep a healthy spread around 0.5 through all five layers.

# Five sigmoid layers with Glorot (Xavier) normal initialization.
init = initializers.glorot_normal()
inputs = keras.Input(shape=(100,))
x1 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x4)
model_3 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_3.predict(x)

# Histogram of each layer's activations.
fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten())):
    ax.hist(o.flatten(), bins=30, range=[0, 1])
    ax.set_title(f"layer-{i}")
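As a quick sanity check (an aside, not part of the original walkthrough), we can sample the initializer directly and compare its empirical standard deviation with the nominal sqrt(2 / (fan_in + fan_out)) = 0.1. Keras draws Glorot normal from a truncated normal distribution, so the measured value lands slightly below nominal.

w = initializers.glorot_normal()(shape=(100, 100)).numpy()
print(w.std())                   # close to, but a bit under, 0.1 (truncation)
print(np.sqrt(2 / (100 + 100)))  # nominal value: 0.1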
Glorot initialization assumes an activation that is roughly linear around zero, which holds for sigmoid and tanh but not for ReLU: ReLU zeroes half of its inputs, so with Glorot weights the activation variance roughly halves at every layer and the signal steadily shrinks toward zero.

# Five ReLU layers with Glorot initialization: variance decays layer by layer.
init = initializers.glorot_normal()
inputs = keras.Input(shape=(100,))
x1 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x4)
model_4 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_4.predict(x)

# Histogram of each layer's activations.
fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten())):
    # ReLU outputs above 1 fall outside this [0, 1] range.
    ax.hist(o.flatten(), bins=30, range=[0, 1])
    ax.set_title(f"layer-{i}")
He initialization fixes this for ReLU by doubling the variance: stddev ≈ sqrt(2 / fan_in), compensating for the half of the units that ReLU silences. The activations now keep a stable spread across all five layers.

# Five ReLU layers with He initialization: variance stays stable with depth.
init = initializers.he_normal()
inputs = keras.Input(shape=(100,))
x1 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x4)
model_5 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_5.predict(x)

# Histogram of each layer's activations.
fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten())):
    # ReLU outputs above 1 fall outside this [0, 1] range.
    ax.hist(o.flatten(), bins=30, range=[0, 1])
    ax.set_title(f"layer-{i}")
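To read the result off numerically rather than from the histograms, here is a small hypothetical addition (not in the original): print each layer's activation standard deviation. Run it after any of the five experiments; under He initialization the std stays roughly constant, while the earlier setups show it collapsing or saturating.

# Per-layer activation std for whichever model was run last.
for i, o in enumerate(outputs):
    print(f"layer-{i}: activation std = {o.std():.4f}")

In practice you rarely need to construct initializer objects by hand: Keras Dense layers default to 'glorot_uniform', and passing a string such as kernel_initializer='he_normal' selects He initialization for ReLU stacks.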