# import gym  # legacy "gym" package, superseded by gymnasium
import gymnasium as gym

# from stable_baselines3 import DQN  # compatible with discrete actions, but with SB3's default hyperparameters it often hasn't learned CartPole within 100k steps
# from stable_baselines3 import A2C  # works, but results vary noticeably from run to run
from stable_baselines3 import PPO    # works great here -- solved CartPole in ~8 s on this machine
# from stable_baselines3 import TD3  # not applicable: TD3 needs a continuous (Box) action space
# from stable_baselines3 import DDPG # not applicable: DDPG needs a continuous (Box) action space
# from stable_baselines3 import SAC  # not applicable: SAC needs a continuous (Box) action space
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt

import os
import time

start_time = time.time()
log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)  # make sure the Monitor log directory exists

render_mode = None       # no rendering (fastest)
# render_mode = "human"  # render every step to watch training (much slower)
env = Monitor(gym.make("CartPole-v1", render_mode=render_mode), log_dir)
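# Why the action-space restriction above matters (added sketch, not in the
# original script): SAC/TD3/DDPG require a continuous Box action space and DQN
# a Discrete one, while PPO/A2C accept both. CartPole-v1 uses Discrete(2), so
# the continuous-only algorithms would raise an error on this env.
assert isinstance(env.action_space, gym.spaces.Discrete), "CartPole should have discrete actions"
print(f"Action space: {env.action_space}")  # Discrete(2): push cart left or right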

# algo = A2C("MlpPolicy", env, verbose=1)
algo = PPO("MlpPolicy", env, verbose=1)
algo.learn(total_timesteps=100000)  # total_timesteps counts env steps (actions), not episodes
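# Quick check of the trained policy (added sketch): SB3's evaluate_policy runs
# a few deterministic episodes. A fresh env is used so these episodes are not
# appended to the monitor log that feeds the learning curve below;
# n_eval_episodes=10 is an arbitrary choice, not from the original script.
from stable_baselines3.common.evaluation import evaluate_policy
eval_env = Monitor(gym.make("CartPole-v1"))  # Monitor with no filename writes no log file
mean_reward, std_reward = evaluate_policy(algo, eval_env, n_eval_episodes=10, deterministic=True)
print(f"Eval reward: {mean_reward:.1f} +/- {std_reward:.1f} (500 is the CartPole-v1 maximum)")
eval_env.close()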

# plotting: Monitor wrote one row per finished episode; ts2xy turns the log
# into (timestep at episode end, episode reward) pairs
x, y = ts2xy(load_results(log_dir), "timesteps")
plt.plot(x, y, label="episode reward")
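# Optional smoothing (added sketch): raw per-episode rewards are noisy, so
# overlay a moving average to make the trend readable. numpy and the
# 50-episode window are additions, not part of the original script.
import numpy as np
window = 50
if len(y) >= window:
    y_smooth = np.convolve(y, np.ones(window) / window, mode="valid")
    plt.plot(x[window - 1:], y_smooth, label=f"{window}-episode moving average")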
plt.xlabel("Timesteps")
plt.ylabel("Episode reward")  # Monitor/ts2xy report the total reward per episode
plt.title("Learning Curve")
plt.legend()
plt.grid()
end_time = time.time()  # capture before plt.show(), which blocks until the window is closed
plt.show()
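# Persist the trained policy (added sketch): SB3 models save to a .zip archive
# and can be reloaded later with PPO.load, so retraining isn't needed every
# run. The filename "ppo_cartpole" is an arbitrary choice, not from the
# original script.
algo.save("ppo_cartpole")  # writes ppo_cartpole.zip next to this script
# algo = PPO.load("ppo_cartpole", env=env)  # how you'd reload it later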


print(f"Execution time: {end_time - start_time:.2f} seconds")
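# Watch the result (added sketch, not in the original script): re-create the
# env with human rendering and roll out one deterministic episode using the
# standard gymnasium step loop.
demo_env = gym.make("CartPole-v1", render_mode="human")
obs, info = demo_env.reset()
done = False
while not done:
    action, _states = algo.predict(obs, deterministic=True)  # greedy action from the policy
    obs, reward, terminated, truncated, info = demo_env.step(int(action))
    done = terminated or truncated
demo_env.close()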
