Kalman Filter
    
    Consider the Hidden Markov Model structure, there are latent variables \(\{x_t\}^T_{t=1}\) and observation variables \(\{y_t\}^T_{t=1}\), where \(x_t,y_t\in \mathcal{R}^D\). The followings are the transition between latents, and the emission between latent and observed states, 
    
    $$
    x_t=Ax_{t-1}+\epsilon\leftarrow\epsilon\sim\mathcal{N}(0,Q) \\
    p(x_t|x_{t-1}) = \mathcal{N}(x_t;Ax_{t-1},Q)\\
    y_t=Cx_t+\sigma\leftarrow\sigma\sim\mathcal{N}(0,R) \\
    p(y_t|x_t) = \mathcal{N}(y_t,Cx_t,R)
    $$
    
    Update procedures, firstly predict the hidden state,
    
    
    \begin{align*}
    p(x_{t-1}|y_{1:t-1}) &= \mathcal{N}(x_{t-1};x^{t-1}_{t-1},V^{t-1}_{t-1}) \\
    p(x_t|y_{1:t-1})&=\int p(x_t,x_{t-1}|y_{1:t-1}) dx_{t-1}=\int p(x_t|x_{t-1})p(x_{t-1}|y_{1:t-1}) dx_{t-1} \\
    &=\mathcal{N}(x_t;x_t^{t-1},V_t^{t-1})
    \end{align*}
    
    Derive the expressions of parameters, \(dx_t=d\epsilon\)
    
    \begin{align*}
    &x_t^{t-1}=\int dx_t\int dx_{t-1}x_tp(x_t|x_{t-1})p(x_{t-1}|y_{1:t-1}) \\ 
    &=\int d\epsilon\int dx_{t-1}(Ax_{t-1}+\epsilon)\mathcal{N}(\epsilon;0,Q)\mathcal{N}(x_{t-1};x^{t-1}_{t-1},V^{t-1}_{t-1})\\
    &=Ax^{t-1}_{t-1}\\
    &V_t^{t-1}=\int dx_t\int dx_{t-1}x_tx_t^\top p(x_t|x_{t-1})p(x_{t-1}|y_{1:t-1}) \\ 
    &=\int d\epsilon\int dx_{t-1}(Ax_{t-1}x_{t-1}^\top A^\top+\epsilon\epsilon^\top)\mathcal{N}(\epsilon;0,Q)\mathcal{N}(x_{t-1};x^{t-1}_{t-1},V^{t-1}_{t-1})\\
    &=AV^{t-1}_{t-1}A^\top+Q\\
    \end{align*}
    
    Update the observed state,
    
    \begin{align*}
    p(x_t|y_{1:t})&=p(x_t|y_t,y_{1:t-1})\propto\underbrace{p(y_t|x_t)}_{\text{likelihood}}\underbrace{p(x_t|y_{1:t-1}}_{\text{prior}})=\mathcal{N}(x_t;x_t^t,V^t_t)
    \end{align*}
    
    First derive \(V^t_t\), if consider the log-product of likelihood and prior, and log-probability of marginal filter, 
    
    \begin{align*}
    &L.H.S = log(\mathcal{N}(y_t;Cx_t,R)\mathcal{N}(x_t;x_t^{t-1},V_t^{t-1})) \\
    &\approx(y_t-Cx_t)^\top R^{-1}(y_t-Cx_t)+(x_t-x_t^{t-1})^\top (V_t^{t-1})^{-1}(x_t-x_t^{t-1})\\
    &\approx x^\top_tC^\top R^{-1}Cx_t-2x^\top_tC^\top R^{-1}y_t+x^\top_t(V_t^{t-1})^{-1}x_t-2x^\top_t(V_t^{t-1})^{-1}x_t^{t-1}\\
    &=x^\top_t((V_t^{t-1})^{-1}+C^\top R^{-1}C)x_t-2x_t^\top(C^\top R^{-1}y_t+(V_t^{t-1})^{-1}x_t^{t-1}) \\
    &R.H.S=log\mathcal{N}(x_t;x^t_t,V^t_t)\approx (x_t-x_t^t)^\top (V^t_t)^{-1}(x_t-x^t_t) \\
    &\approx x_t^\top (V^t_t)^{-1}x_t-2x_t^\top(V^t_t)^{-1}x^t_t
    \end{align*}
    
    So,
    
    \begin{align*}
    &V^t_t=((V_t^{t-1})^{-1}+C^\top R^{-1}C)^{-1}=V_t^{t-1}-V_t^{t-1}C^\top(CV_t^{t-1}C^\top+R)^{-1}CV_t^{t-1}\\
    &=V_t^{t-1}-K_tCV_t^{t-1}\\
    &K_t = V_t^{t-1}C^\top(CV_t^{t-1}C^\top+R)^{-1}
    \end{align*}
    
    For \(x^t_t\),
    
    \begin{align*}
    &\arg\max_{x_t}p(x_t|y_{1:t})=\arg\min_{x_t}(y_t-Cx_t)^\top R^{-1}(y_t-Cx_t)+(x_t-x_t^{t-1})^\top (V_t^{t-1})^{-1}(x_t-x_t^{t-1})\\
    &\approx\arg\min_{x_t}x^\top_t((V_t^{t-1})^{-1}+C^\top R^{-1}C)x_t-2x_t^\top(C^\top R^{-1}y_t+(V_t^{t-1})^{-1}x_t^{t-1}) \\
    \end{align*}
    
    To find the expression, differentiate the equation (laxy typing \(V=V_t^{t-1},x=x_t^{t-1}\)), 
    
    \begin{align*}
    &\frac{\partial}{\partial x_t}\big(x^\top_t((V_t^{t-1})^{-1}+C^\top R^{-1}C)x_t-2x_t^\top(C^\top R^{-1}y_t+(V_t^{t-1})^{-1}x_t^{t-1})\big) \\
    &=2(V^{-1}+C^\top R^{-1}C)x_t-2(C^\top R^{-1}y_t+V^{-1}x)=0\\
    &x_t=(V^{-1}+C^\top R^{-1}C)^{-1}(C^\top R^{-1}y_t+V^{-1}x)\\
    &=(V-VC^\top(R+CVC^\top)^{-1}CV)(C^\top R^{-1}y_t+V^{-1}x)\\
    &=(V-K_tCV)(C^\top R^{-1}y_t+V^{-1}x)\\
    &=x-K_tCx+(V-K_tCV)C^\top R^{-1}y_t\\
    \end{align*}
    
    Consider the form of \(K_t\),
    
    \begin{align*}
    K_t &= VC^\top(CVC^\top+R)^{-1} \\
    K_t(CVC^\top+R) &=K_tCVC^\top+K_tR =VC^\top \\
    (K_tC-I)VC^\top+K_tR &= 0 \\
    K_tR &= (I-K_tC)VC^\top\\
    K_t &=(V-K_tCV)C^\top R^{-1}\\
    \end{align*}
    
    So, 
    
    
    \begin{align*}
    x_t &= x-K_tCx+K_ty_t = x+K_t(y_t-Cx)\\
    x_t &= x^{t-1}_t+K_t(y_t-Cx_t^{t-1})
    \end{align*}
    
    References