%matplotlib inline

import numpy as np, matplotlib as mpl, scipy as sp, scipy.stats as st
from matplotlib import pylab, mlab, pyplot as plt
from matplotlib.pylab import plot, scatter, contour, xlabel, ylabel, title, legend, subplot, fill_between
from matplotlib.animation import FuncAnimation

from numpy import sqrt, pi, exp, log, floor, ceil, sin, cos
from numpy import linspace, logspace, arange, empty, zeros, ones, empty_like, zeros_like, ones_like, full, full_like
from numpy import diff, meshgrid, mean, std, argmin, array, eye, amin, amax, fmin, fmax
from numpy.linalg import norm

import re, multiprocessing as mp, networkx as nx, json
from collections import defaultdict
from tqdm.notebook import tqdm, trange

plt.rcParams.update({
    'image.cmap': 'coolwarm',
    'animation.html': 'jshtml',
    'animation.embed_limit': 40, # 40 mb
})
rng = np.random.default_rng()

# To run this cell change the type back to "code".

# Compute frequencies, and save to file
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ. '
list_alphabet = list(alphabet)
index = { s: n for n, s in enumerate(alphabet) }

def sanitize(s):
    s = re.sub( '[^A-Za-z. ]', ' ', s )
    s = re.sub( r'\s+', ' ', s ).upper()    
    
    return list(s)

lang_sample_file = 'data/few-books.txt'
with open( lang_sample_file ) as f:
    lang_sample = f.read()

txt = sanitize( lang_sample )

P = defaultdict( lambda: defaultdict( lambda: 0 ) )
for n in trange( len(txt) -1 ):
    P[ txt[n] ][txt[n+1]] += 1

μ={}
for c in P:
    μ[c] = sum( P[c].values() )
    P[c] = { k: v/μ[c] for k, v in P[c].items() }

T = sum( μ.values() )
μ = { k:v/T for k, v in μ.items() }

with open( 'data/letter-frequencies.json', 'w') as f:
    json.dump( { 'P': dict(P), 'μ': μ }, f )

# Load letter freqencies
with open( 'data/letter-frequencies.json' ) as f:
    data = json.load( f )

# P[x] = { y1: p1, y2: p2 ... }, where P[x][y1] = probability that y1 follows x in our English text sample
# μ[x] = frequency of x occuring in our English text sample.
P, μ = data['P'], data['μ']
del data

# Make a graph with the above edge weights
G = nx.DiGraph()
G.add_nodes_from( P.keys() )
for a in G.nodes():
    for b, p in P[a].items():
        G.add_edge( a, b, weight=p )

# Visualize the transition probabilities P
widths = nx.get_edge_attributes(G, 'weight')
nodelist = G.nodes()
#plt.figure(figsize=(12,8))

pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G,pos, nodelist=nodelist, node_size=300, edgecolors='black', alpha=.5)
nx.draw_networkx_edges(G,pos, edgelist = widths.keys(), alpha=array(list(widths.values())) )
_ = nx.draw_networkx_labels(G, pos=pos, verticalalignment='center_baseline',
        labels=dict(zip(nodelist,nodelist)) )

def compute_π( G, P, α, ε ):
    """Compute the stationary distribution π with mean square error at most ε^2

    Returns a dictionary π where π[x] is the computed value of the stationary distribution
    """

# If you want to test you got things correct, here are a few plots of π as α varies
keys = sorted( μ, key = lambda k: μ[k], reverse=True )
def plot_dist( μ, keys, **kwargs ):
    plot( keys, [μ[k] for k in keys], **kwargs )

plot_dist( μ, keys, label='μ' )
for α in [ .25, .5, .85]:
    π = compute_π( G, P, α, .01 )
    plot_dist( π, keys, label=f'α={α:.2f}' )

legend()

  0%|          | 0/240000 [00:00<?, ?it/s]

  0%|          | 0/480000 [00:00<?, ?it/s]

  0%|          | 0/2047212 [00:00<?, ?it/s]

<matplotlib.legend.Legend at 0x75133c7f7bf0>

Computing the Stationary Distribution¶

Question: Compute the stationary distribution¶