# import gym  # legacy Gym package; superseded by Gymnasium below
import gymnasium as gym

# DQN requires a discrete action space; A2C and PPO handle both discrete and
# continuous ones. On MountainCar-v0, DQN and A2C succeed only intermittently:
# the sparse reward (-1 per step until the goal is reached) makes exploration
# hard within a short training budget.
from stable_baselines3 import A2C, DQN
from stable_baselines3 import PPO  # worked well here; one run finished training in about 8 seconds
from stable_baselines3 import TD3   # continuous action spaces only
from stable_baselines3 import DDPG  # continuous action spaces only
from stable_baselines3 import SAC   # Soft Actor-Critic; continuous action spaces only
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt

import os
import time
start_time = time.time()
log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)  # make sure the Monitor log directory exists

render_mode = None
# render_mode = "human"  # live rendering during training (much slower); see the optional demo loop at the end
# env = Monitor(gym.make("MountainCarContinuous-v0", render_mode=render_mode), log_dir)
env = Monitor(gym.make("MountainCar-v0", render_mode=render_mode), log_dir)

algo = DQN("MlpPolicy", env, verbose=1)  # DQN works only with discrete action spaces
# algo = DQN("MlpPolicy", env, verbose=1, learning_rate=1e-3, buffer_size=10000, batch_size=64, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01)
# algo = TD3("MlpPolicy", env, verbose=1)  # only with MountainCarContinuous-v0
# algo = PPO("MlpPolicy", env, verbose=1)
algo.learn(total_timesteps=20000)  # one timestep = one environment step (an action), not one episode

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")  # training only; plt.show() below blocks until the window is closed
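
# Optional: evaluate the trained policy (a minimal sketch using SB3's built-in
# evaluate_policy helper; 10 evaluation episodes is an arbitrary choice).
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, std_reward = evaluate_policy(algo, env, n_eval_episodes=10)
print(f"Mean episode reward: {mean_reward:.1f} +/- {std_reward:.1f}")

# Optional: persist the model for later reuse; the filename is an arbitrary example.
# algo.save("dqn_mountaincar")
# loaded = DQN.load("dqn_mountaincar", env=env)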

# plotting
x, y = ts2xy(load_results(log_dir), 'timesteps')  # x: cumulative timesteps, y: per-episode rewards
plt.plot(x, y)
plt.xlabel('Timesteps')
plt.ylabel('Episode reward')
plt.title('Learning Curve')
plt.grid()
plt.show()
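
# Optional: the raw per-episode curve is noisy; a moving average (window size 50
# is an arbitrary choice) makes the trend easier to read.
import numpy as np
window = 50
if len(y) >= window:
    y_smooth = np.convolve(y, np.ones(window) / window, mode="valid")
    plt.plot(x[window - 1:], y_smooth)
    plt.xlabel('Timesteps')
    plt.ylabel('Episode reward (moving average)')
    plt.title('Smoothed Learning Curve')
    plt.grid()
    plt.show()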


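# Optional: watch the trained agent in a live window (a sketch; Gymnasium's
# step() returns obs, reward, terminated, truncated, info). Uncomment to run.
# demo_env = gym.make("MountainCar-v0", render_mode="human")
# obs, _ = demo_env.reset()
# for _ in range(1000):
#     action, _ = algo.predict(obs, deterministic=True)
#     obs, reward, terminated, truncated, _ = demo_env.step(action)
#     if terminated or truncated:
#         obs, _ = demo_env.reset()
# demo_env.close()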