Important: If the weights in a network start too small, then the signal shrinks as it passes through each layer until it’s too tiny to be useful. If the weights in a network start too large, then the signal grows as it passes through each layer until it’s too massive to be useful.
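
To see the effect in isolation, before setting up the Keras experiments, here is a minimal NumPy sketch (the layer width of 100 and the two weight scales are only illustrative): it pushes a batch of standard-normal signals through five purely linear layers and prints how the spread of the signal changes when the weights start too small versus too large.
import numpy as np

rng = np.random.default_rng(0)
signal = rng.standard_normal((100, 100))       # batch of 100 examples, 100 features each

for scale in (0.01, 1.0):                      # "too small" vs. "too large" for 100 inputs
    h = signal
    print(f"weight stddev = {scale}")
    for layer in range(1, 6):
        W = rng.standard_normal((100, 100)) * scale   # weights ~ N(0, scale^2)
        h = h @ W                                     # one linear layer, no activation
        print(f"  layer-{layer}: output stddev = {h.std():.2e}")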
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import initializers

# Model 1: five sigmoid layers with weights drawn from N(0, 1) -- too large for
# 100 inputs, so the pre-activations saturate the sigmoid and the outputs pile up near 0 and 1.
inputs = keras.Input(shape=(100,))
init = initializers.RandomNormal(mean=0.0, stddev=1.0)
x1 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x4)
model_1 = keras.Model(inputs, [x1, x2, x3, x4, x5])

# Push a batch of standard-normal inputs through the network and collect every layer's activations.
x = np.random.randn(100, 100)
outputs = model_1.predict(x)

# Histogram of the activations at each layer.
fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten()), start=1):
    ax.hist(o.flatten(), 30, range=[0, 1])
    ax.set_title(f"layer-{i}")

# Model 2: same architecture, but weights drawn from N(0, 0.01^2) -- too small,
# so the activations collapse into an ever-narrower band around 0.5 as depth grows.
inputs = keras.Input(shape=(100,))
init = initializers.RandomNormal(mean=0.0, stddev=0.01)
x1 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x4)
model_2 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_2.predict(x)

fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten()), start=1):
    ax.hist(o.flatten(), 30, range=[0, 1])
    ax.set_title(f"layer-{i}")

# Model 3: sigmoid layers with Glorot (Xavier) initialization, which scales the weights
# to the layer size and keeps the activations well spread out at every depth.
inputs = keras.Input(shape=(100,))
init = initializers.glorot_normal()
x1 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='sigmoid', use_bias=False, kernel_initializer=init)(x4)
model_3 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_3.predict(x)

fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten()), start=1):
    ax.hist(o.flatten(), 30, range=[0, 1])
    ax.set_title(f"layer-{i}")

# Model 4: the same Glorot initialization, but with ReLU activations. ReLU zeroes out
# roughly half the activations, so the spread tends to shrink as the network gets deeper.
inputs = keras.Input(shape=(100,))
init = initializers.glorot_normal()
x1 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x4)
model_4 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_4.predict(x)

fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten()), start=1):
    ax.hist(o.flatten(), 30)  # ReLU outputs are unbounded above, so don't clip the histogram to [0, 1]
    ax.set_title(f"layer-{i}")

# Model 5: ReLU layers with He initialization, which compensates for ReLU and keeps
# the activation spread roughly constant across all five layers.
inputs = keras.Input(shape=(100,))
init = initializers.he_normal()
x1 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(inputs)
x2 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x1)
x3 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x2)
x4 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x3)
x5 = layers.Dense(100, activation='relu', use_bias=False, kernel_initializer=init)(x4)
model_5 = keras.Model(inputs, [x1, x2, x3, x4, x5])

x = np.random.randn(100, 100)
outputs = model_5.predict(x)

fig, axs = plt.subplots(1, 5, sharey=True, figsize=(12, 5))
for i, (o, ax) in enumerate(zip(outputs, axs.flatten()), start=1):
    ax.hist(o.flatten(), 30)  # again, no [0, 1] clipping for ReLU outputs
    ax.set_title(f"layer-{i}")