From a3a92082117e919b699154357b87ccdf9547e5bc Mon Sep 17 00:00:00 2001
From: Jan Mandel
Date: Thu, 7 Jul 2022 23:52:28 -0600
Subject: [PATCH] stateful model with batch consisting of one sample rewritten
 as loop over model calls

---
 rnn_tutorial_jm.ipynb                              |  88 +++++++++++++--
 ...torial_jm.ipynb => rnn_tutorial_jm_output.ipynb | 123 +++++++++++++++++++--
 2 files changed, 197 insertions(+), 14 deletions(-)
 copy rnn_tutorial_jm.ipynb => rnn_tutorial_jm_output.ipynb (82%)

diff --git a/rnn_tutorial_jm.ipynb b/rnn_tutorial_jm.ipynb
index a9319e1..7ebda33 100644
--- a/rnn_tutorial_jm.ipynb
+++ b/rnn_tutorial_jm.ipynb
@@ -461,9 +461,22 @@
    "outputs": []
   },
   {
+   "cell_type": "markdown",
+   "source": [
+    "### Stateful model\n",
+    "\n",
+    "In a stateful model, the model object remembers its final hidden state from one call to the next: when the model is called again, it starts from the hidden state left by the previous call. \n",
+    "\n",
+    "Note that the model remembers only the hidden state from the last timestep, not the sequence of hidden states for all timesteps, even if the model was built with `return_sequences=True`. The hidden states from all timesteps except the last one are lost. "
+   ],
+   "metadata": {
+    "id": "8DLr5jfaKc8E"
+   }
+  },
+  {
    "cell_type": "code",
    "source": [
-    "# stateful\n",
+    "# stateful model\n",
     "demo_model = create_RNN_functional(hidden_units=2, dense_units=1, \n",
     "        activation=['linear', 'linear'],return_sequences=True,\n",
     "        stateful=True,batch_shape=(1,3,1))\n",
@@ -475,18 +488,39 @@
     "w = demo_model.get_weights()\n",
     "\n",
     "x = np.array([1, 2, 3])\n",
-    "# Reshape the input to the required sample_size x time_steps x features \n",
+    "# Reshape the input to the required batch_size x time_steps x features \n",
     "x_input = np.reshape(x,(1, 3, 1))\n",
-    "y_pred_model = demo_model.predict(x_input)\n",
+    "y_pred_model1 = demo_model.predict(x_input)\n",
+    "y_pred_model2 = demo_model.predict(x_input)\n",
+    "y_pred_model3 = demo_model.predict(x_input)\n",
     "\n",
+    "# batch 1\n",
     "h = np.zeros(2)\n",
-    "o = np.empty(3)\n",
+    "o1 = np.empty(3)\n",
     "for i in range(3):\n",
     "    h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
-    "    o[i]=np.dot(h, w[3]) + w[4]\n",
+    "    o1[i]=np.dot(h, w[3]) + w[4]\n",
     "\n",
-    "print(\"Prediction from network \", y_pred_model)\n",
-    "print(\"Prediction from our computation \", o)"
+    "# batch 2\n",
+    "# we do not zero out h - stateful model remembers h \n",
+    "o2 = np.empty(3)\n",
+    "for i in range(3):\n",
+    "    h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
+    "    o2[i]=np.dot(h, w[3]) + w[4]\n",
+    "\n",
+    "# batch 3\n",
+    "# we do not zero out h - stateful model remembers h \n",
+    "o3 = np.empty(3)\n",
+    "for i in range(3):\n",
+    "    h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
+    "    o3[i]=np.dot(h, w[3]) + w[4]\n",
+    "\n",
+    "print(\"Prediction from network 1\",y_pred_model1)\n",
+    "print(\"Prediction from network 2\",y_pred_model2)\n",
+    "print(\"Prediction from network 3\",y_pred_model3)\n",
+    "print(\"Prediction from our computation 1\", o1)\n",
+    "print(\"Prediction from our computation 2\", o2)\n",
+    "print(\"Prediction from our computation 3\", o3)"
    ],
    "metadata": {
     "id": "SLYRVhFGHa4r"
@@ -495,6 +529,46 @@
    "outputs": []
   },
   {
+   "cell_type": "code",
+   "source": [
+    "# stateful model with batch consisting of one sample\n",
+    "# rewritten as loop over model calls\n",
+    "demo_model = create_RNN_functional(hidden_units=2, dense_units=1, \n",
+    "        activation=['linear', 'linear'],return_sequences=True,\n",
+    "        stateful=True,batch_shape=(1,3,1))\n",
+    "print(demo_model.summary())\n",
+    "from keras.utils.vis_utils import plot_model\n",
+    "plot_model(demo_model, to_file='model_plot.png', \n",
+    "    show_shapes=True, show_layer_names=True)\n",
+    "\n",
+    "w = demo_model.get_weights()\n",
+    "\n",
+    "x = np.array([1, 2, 3])\n",
+    "# Reshape the input to the required batch_size x time_steps x features \n",
+    "x_input = np.reshape(x,(1, 3, 1))\n",
+    "y_pred_model1 = demo_model.predict(x_input)\n",
+    "y_pred_model2 = demo_model.predict(x_input)\n",
+    "\n",
+    "# the hidden state h is initialized once and carried across both batches\n",
+    "h = np.zeros(2)\n",
+    "o = np.empty((2,3))\n",
+    "for j in range(2): # loop over batches\n",
+    "    # only one sample in batch\n",
+    "    for i in range(3): # loop over timesteps\n",
+    "        h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
+    "        o[j,i]=np.dot(h, w[3]) + w[4]\n",
+    "\n",
+    "print(\"Prediction from network 1\",y_pred_model1)\n",
+    "print(\"Prediction from network 2\",y_pred_model2)\n",
+    "print(\"Prediction from our computation\", o)"
+   ],
+   "metadata": {
+    "id": "lSg-HiqAQVWI"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
    "cell_type": "markdown",
    "metadata": {
     "id": "JopR12ZaVAyS"
diff --git a/rnn_tutorial_jm.ipynb b/rnn_tutorial_jm_output.ipynb
similarity index 82%
copy from rnn_tutorial_jm.ipynb
copy to rnn_tutorial_jm_output.ipynb
index a9319e1..cb756a1 100644
--- a/rnn_tutorial_jm.ipynb
+++ b/rnn_tutorial_jm_output.ipynb
@@ -461,9 +461,22 @@
    "outputs": []
   },
   {
+   "cell_type": "markdown",
+   "source": [
+    "### Stateful model\n",
+    "\n",
+    "In a stateful model, the model object remembers its final hidden state from one call to the next: when the model is called again, it starts from the hidden state left by the previous call. \n",
+    "\n",
+    "Note that the model remembers only the hidden state from the last timestep, not the sequence of hidden states for all timesteps, even if the model was built with `return_sequences=True`. The hidden states from all timesteps except the last one are lost. "
+   ],
+   "metadata": {
+    "id": "8DLr5jfaKc8E"
+   }
+  },
+  {
    "cell_type": "code",
    "source": [
-    "# stateful\n",
+    "# stateful model\n",
     "demo_model = create_RNN_functional(hidden_units=2, dense_units=1, \n",
     "        activation=['linear', 'linear'],return_sequences=True,\n",
     "        stateful=True,batch_shape=(1,3,1))\n",
@@ -475,18 +488,39 @@
     "w = demo_model.get_weights()\n",
     "\n",
     "x = np.array([1, 2, 3])\n",
-    "# Reshape the input to the required sample_size x time_steps x features \n",
+    "# Reshape the input to the required batch_size x time_steps x features \n",
     "x_input = np.reshape(x,(1, 3, 1))\n",
-    "y_pred_model = demo_model.predict(x_input)\n",
+    "y_pred_model1 = demo_model.predict(x_input)\n",
+    "y_pred_model2 = demo_model.predict(x_input)\n",
+    "y_pred_model3 = demo_model.predict(x_input)\n",
     "\n",
+    "# batch 1\n",
     "h = np.zeros(2)\n",
-    "o = np.empty(3)\n",
+    "o1 = np.empty(3)\n",
     "for i in range(3):\n",
     "    h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
-    "    o[i]=np.dot(h, w[3]) + w[4]\n",
+    "    o1[i]=np.dot(h, w[3]) + w[4]\n",
     "\n",
-    "print(\"Prediction from network \", y_pred_model)\n",
-    "print(\"Prediction from our computation \", o)"
+    "# batch 2\n",
+    "# we do not zero out h - stateful model remembers h \n",
+    "o2 = np.empty(3)\n",
+    "for i in range(3):\n",
+    "    h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
+    "    o2[i]=np.dot(h, w[3]) + w[4]\n",
+    "\n",
+    "# batch 3\n",
+    "# we do not zero out h - stateful model remembers h \n",
+    "o3 = np.empty(3)\n",
+    "for i in range(3):\n",
+    "    h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
+    "    o3[i]=np.dot(h, w[3]) + w[4]\n",
+    "\n",
+    "print(\"Prediction from network 1\",y_pred_model1)\n",
+    "print(\"Prediction from network 2\",y_pred_model2)\n",
+    "print(\"Prediction from network 3\",y_pred_model3)\n",
+    "print(\"Prediction from our computation 1\", o1)\n",
+    "print(\"Prediction from our computation 2\", o2)\n",
+    "print(\"Prediction from our computation 3\", o3)"
    ],
    "metadata": {
     "id": "SLYRVhFGHa4r"
@@ -495,6 +529,81 @@
    "outputs": []
   },
   {
+   "cell_type": "code",
+   "source": [
+    "# stateful model with batch consisting of one sample\n",
+    "# rewritten as loop over model calls\n",
+    "demo_model = create_RNN_functional(hidden_units=2, dense_units=1, \n",
+    "        activation=['linear', 'linear'],return_sequences=True,\n",
+    "        stateful=True,batch_shape=(1,3,1))\n",
+    "print(demo_model.summary())\n",
+    "from keras.utils.vis_utils import plot_model\n",
+    "plot_model(demo_model, to_file='model_plot.png', \n",
+    "    show_shapes=True, show_layer_names=True)\n",
+    "\n",
+    "w = demo_model.get_weights()\n",
+    "\n",
+    "x = np.array([1, 2, 3])\n",
+    "# Reshape the input to the required batch_size x time_steps x features \n",
+    "x_input = np.reshape(x,(1, 3, 1))\n",
+    "y_pred_model1 = demo_model.predict(x_input)\n",
+    "y_pred_model2 = demo_model.predict(x_input)\n",
+    "\n",
+    "# the hidden state h is initialized once and carried across both batches\n",
+    "h = np.zeros(2)\n",
+    "o = np.empty((2,3))\n",
+    "for j in range(2): # loop over batches\n",
+    "    # only one sample in batch\n",
+    "    for i in range(3): # loop over timesteps\n",
+    "        h = np.dot(x[i], w[0]) + np.dot(h, w[1]) + w[2]\n",
+    "        o[j,i]=np.dot(h, w[3]) + w[4]\n",
+    "\n",
+    "print(\"Prediction from network 1\",y_pred_model1)\n",
+    "print(\"Prediction from network 2\",y_pred_model2)\n",
+    "print(\"Prediction from our computation\", o)"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "lSg-HiqAQVWI",
+    "outputId": "fe0bb01b-40e8-4afd-ad48-3a1f55b106cb"
+   },
+   "execution_count": 46,
+   "outputs": [
+    {
+     "output_type": "stream",
"name": "stdout", + "text": [ + "Model: \"model_13\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " input_14 (InputLayer) [(1, 3, 1)] 0 \n", + " \n", + " simple_rnn_13 (SimpleRNN) (1, 3, 2) 8 \n", + " \n", + " dense_13 (Dense) (1, 3, 1) 3 \n", + " \n", + "=================================================================\n", + "Total params: 11\n", + "Trainable params: 11\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "None\n", + "Prediction from network 1 [[[0.5394513 ]\n", + " [0.81535405]\n", + " [0.9701195 ]]]\n", + "Prediction from network 2 [[[-0.03764684]\n", + " [ 0.761226 ]\n", + " [ 1.6331315 ]]]\n", + "Prediction from our computation [[ 0.53945129 0.81535404 0.97011949]\n", + " [-0.03764684 0.76122602 1.63313155]]\n" + ] + } + ] + }, + { "cell_type": "markdown", "metadata": { "id": "JopR12ZaVAyS" -- 2.11.4.GIT