#!/usr/bin/env python3
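"""Tabular value iteration on a small gridworld.

Every cell carries a step reward (including a penalized column), the top-left
cell (0, 0) is terminal, and moves that would leave the grid keep the agent in
place. The script sweeps the Bellman optimality backup to convergence, extracts
the greedy policy, and prints both as grids.
"""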
import numpy as np

def value_iteration(grid_size=4, discount=0.9, theta=1e-9):
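    """Solve a grid_size x grid_size gridworld by value iteration.

    Returns the converged state values and the greedy policy (indices into
    `actions`), each reshaped to the grid layout.
    """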
    num_states = grid_size ** 2
    actions = [(0, -1), (0, 1), (-1, 0), (1, 0)]  # (row, col) offsets: left, right, up, down
    rewards = np.full((grid_size, grid_size), -1.0)
    rewards[:-1, 1] = -10  # High-cost column (column 1, all rows but the last); not impassable
    rewards[0, 0] = 0  # Terminal state

    # Flattened rewards
    R = rewards.flatten()
    V = np.zeros(num_states)  # Value function
    Q = np.zeros((num_states, len(actions)))  # State-action value function

    # Transition function
    def next_state(s, a):
        x, y = divmod(s, grid_size)
        dx, dy = actions[a]
        nx, ny = x + dx, y + dy
        if 0 <= nx < grid_size and 0 <= ny < grid_size:
            return nx * grid_size + ny
        return s  # Stay in place if out of bounds

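    # Repeatedly apply the Bellman optimality backup, updating V in place
    # (Gauss-Seidel style):  V(s) <- max_a [ R(s) + discount * V(s') ],
    # where s' = next_state(s, a). Stop once the largest change falls below theta.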
    while True:
        delta = 0
        for s in range(num_states):
            if s == 0:  # Terminal state
                continue
            # Compute Q-values for all actions
            Q[s] = [R[s] + discount * V[next_state(s, a)] for a in range(len(actions))]
            new_value = Q[s].max()  # Optimal value for state s
            delta = max(delta, abs(new_value - V[s]))
            V[s] = new_value

        if delta < theta:  # Convergence check
            break

    # Derive optimal policy
    policy = np.argmax(Q, axis=1)
    return V.reshape(grid_size, grid_size), policy.reshape(grid_size, grid_size)

# Execute and display results
values, policy = value_iteration()

# Convert policy to arrows for clarity; the terminal state's Q-row is never
# updated, so its greedy action is meaningless and is marked separately.
directions = ['←', '→', '↑', '↓']
policy_arrows = np.vectorize(lambda p: directions[p])(policy)
policy_arrows[0, 0] = 'T'  # Terminal state: no action needed

print("Optimal Values:\n", values)
print("Optimal Policy:\n", policy_arrows)