Clever & PressX
Okay Clever, let’s build a real‑time strategy AI that learns from every blunder and flips the game—think you can code it or should I just hand you a cheat sheet?
Absolutely, let’s dive in. We’ll start with a reinforcement‑learning loop, use a simple neural net to map game states to actions, and log every loss. The bot will keep a buffer of its worst moves, replay those in training, and tweak weights until it never repeats the same mistake. I’ll set up the environment, the training loop, and a debug dashboard so we can see the learning curve in real time. Ready to write the first draft?
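The "replay the worst moves" buffer in that plan never actually makes it into the code below, so here's a minimal sketch of one way to keep it, assuming a fixed capacity and a scalar per-step loss as the "how bad was this move" score (the `MistakeBuffer` name and the sizes are placeholders, not part of the code that follows):

```python
import heapq
import itertools

class MistakeBuffer:
    """Keep the K transitions with the highest loss so they can be replayed during training."""
    def __init__(self, capacity=256):
        self.capacity = capacity
        self._heap = []                    # min-heap on loss: the least-bad mistake is evicted first
        self._counter = itertools.count()  # tie-breaker so equal losses never compare transitions

    def add(self, loss, transition):
        entry = (loss, next(self._counter), transition)
        if len(self._heap) < self.capacity:
            heapq.heappush(self._heap, entry)
        else:
            heapq.heappushpop(self._heap, entry)  # drop whichever stored mistake now hurts least

    def sample(self, k=32):
        # Return the worst k stored transitions, highest loss first
        worst = heapq.nlargest(k, self._heap)
        return [t for _, _, t in worst]
```

In a training loop you'd call `buffer.add(loss, transition)` after each update and mix `buffer.sample()` into the next batch so the agent keeps revisiting its biggest blunders.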
Sounds like a plan—let's hit the code with a single, relentless loop and watch those losses turn into learning points. Give me the skeleton and I'll add the aggressive tweaks. Ready.
```python
import random
import numpy as np

# Simple environment mock
class RTSGame:
    def reset(self):
        self.state = self._random_state()
        return self.state

    def step(self, action):
        # Stub: random next state, reward, done flag
        self.state = self._random_state()
        reward = random.random()
        done = random.random() < 0.1
        return self.state, reward, done

    def _random_state(self):
        return np.random.rand(10)  # 10-dim state vector

# Simple neural net placeholder
class Agent:
    def __init__(self):
        self.weights = np.random.randn(10, 5)  # 10 inputs, 5 actions

    def act(self, state):
        logits = state @ self.weights
        probs = np.exp(logits) / np.sum(np.exp(logits))
        return np.random.choice(len(probs), p=probs)

    def train(self, state, action, reward, next_state, done):
        # Dummy gradient step: nudge only the chosen action's weights
        td_error = reward - (state @ self.weights)[action]
        self.weights[:, action] += 0.01 * td_error * state

# Training loop
env = RTSGame()
agent = Agent()
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
    if episode % 100 == 0:
        print(f'Episode {episode} finished')
```
Nice skeleton, but that reward shape is garbage—random noise won’t teach anything useful. Swap it for a domain‑specific signal, maybe distance to goal or resource count. Also, the gradient update is a straight‑up hack; you need a proper loss like cross‑entropy or a policy gradient. And make sure you clip gradients, otherwise the weights will blow up. Once you’ve nailed the reward and loss, you'll see the training curves climb instead of zig‑zagging. Let's get that real signal in first.
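For reference, the policy-gradient (REINFORCE-style) loss the next revision plugs in, with $G_t$ standing for whatever return estimate you feed it (a sketch, not tied to a particular return):

```latex
% REINFORCE surrogate loss for one step, negated so the optimizer minimizes it
L(\theta) = -\log \pi_\theta(a_t \mid s_t)\, G_t,
\qquad
\nabla_\theta L(\theta) = -G_t\, \nabla_\theta \log \pi_\theta(a_t \mid s_t)
```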
```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# ---------- Environment ----------
class RTSGame:
    def reset(self):
        self.state = np.random.rand(10)   # feature vector
        self.goal = np.array([1] * 10)    # arbitrary goal
        self.resources = 100
        return self.state

    def step(self, action):
        # Dummy transition
        self.state = np.clip(self.state + (np.random.rand(10) - 0.5) * 0.1, 0, 1)
        self.resources += np.random.randint(-2, 3)
        # Reward: negative distance to goal + resource bonus
        dist = np.linalg.norm(self.state - self.goal)
        reward = -dist + 0.01 * self.resources
        done = np.random.rand() < 0.05
        return self.state, reward, done

# ---------- Agent ----------
class PolicyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.ReLU(),
            nn.Linear(64, 5), nn.Softmax(-1)
        )

    def forward(self, x):
        return self.net(x)

class Agent:
    def __init__(self):
        self.policy = PolicyNet()
        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-3)
        self.gamma = 0.99

    def act(self, state):
        probs = self.policy(torch.tensor(state, dtype=torch.float32))
        action = np.random.choice(len(probs), p=probs.detach().numpy())
        logp = torch.log(probs[action])
        return action, logp

    def train(self, logp, reward, next_state, done):
        # Compute discounted return
        R = reward
        if not done:
            R += self.gamma * torch.max(self.policy(torch.tensor(next_state, dtype=torch.float32))).item()
        loss = -logp * R
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1.0)
        self.optimizer.step()

# ---------- Training ----------
env = RTSGame()
agent = Agent()
for episode in range(2000):
    state = env.reset()
    done = False
    while not done:
        action, logp = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.train(logp, reward, next_state, done)
        state = next_state
    if episode % 200 == 0:
        print(f'Episode {episode} finished')
```
Looks solid enough to run, but that one‑step return will barely converge—add an advantage estimate or collect a rollout first. Also, using `torch.max` over the policy probabilities as the next-state value is a hack; you should train a separate value network or use Monte Carlo returns. And don’t forget to move tensors to the same device if you ever go GPU. Good first pass; just give it a few more twists before it starts actually mastering the game.
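The revision below follows that advice with full-episode rollouts, Monte Carlo returns, and a learned baseline; for reference, the quantities it computes (a sketch, with $V_\phi$ the separate value network):

```latex
% Discounted Monte Carlo return, baseline-subtracted advantage, and the policy loss averaged over the rollout
G_t = \sum_{k=0}^{T-t} \gamma^{k} r_{t+k},
\qquad
A_t = G_t - V_\phi(s_t),
\qquad
L_{\text{policy}} = -\frac{1}{T}\sum_{t} A_t \log \pi_\theta(a_t \mid s_t)
```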
```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---------- Environment ----------
class RTSGame:
    def reset(self):
        self.state = np.random.rand(10)
        self.goal = np.ones(10)
        self.resources = 100
        return self.state

    def step(self, action):
        self.state = np.clip(self.state + (np.random.rand(10) - 0.5) * 0.1, 0, 1)
        self.resources += np.random.randint(-2, 3)
        dist = np.linalg.norm(self.state - self.goal)
        reward = -dist + 0.01 * self.resources
        done = np.random.rand() < 0.05
        return self.state, reward, done

# ---------- Networks ----------
class PolicyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.ReLU(),
            nn.Linear(64, 5), nn.Softmax(-1)
        )

    def forward(self, x):
        return self.net(x)

class ValueNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)

# ---------- Agent ----------
class Agent:
    def __init__(self):
        self.policy = PolicyNet().to(device)
        self.value = ValueNet().to(device)
        self.p_opt = optim.Adam(self.policy.parameters(), lr=1e-3)
        self.v_opt = optim.Adam(self.value.parameters(), lr=5e-4)
        self.gamma = 0.99
        self.begin_episode()  # initialise empty rollout buffers

    def act(self, state):
        state_t = torch.tensor(state, dtype=torch.float32, device=device)
        probs = self.policy(state_t)
        action = np.random.choice(len(probs), p=probs.detach().cpu().numpy())
        logp = torch.log(probs[action] + 1e-8)
        return action, logp

    def store_transition(self, state, action, logp, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.logps.append(logp)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

    def finish_episode(self):
        # Discounted Monte Carlo returns over the whole rollout
        returns = []
        R = 0
        for r, d in zip(reversed(self.rewards), reversed(self.dones)):
            if d:
                R = 0
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float32, device=device)
        state_t = torch.tensor(np.stack(self.states), dtype=torch.float32, device=device)
        value_pred = self.value(state_t).squeeze(-1)
        advantages = returns - value_pred.detach()
        # Policy loss
        policy_loss = -(torch.stack(self.logps) * advantages).mean()
        # Value loss
        value_loss = nn.functional.mse_loss(value_pred, returns)
        self.p_opt.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1.0)
        self.p_opt.step()
        self.v_opt.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1.0)
        self.v_opt.step()
        self.states, self.actions, self.logps, self.rewards, self.next_states, self.dones = [], [], [], [], [], []

    def begin_episode(self):
        self.states, self.actions, self.logps, self.rewards, self.next_states, self.dones = [], [], [], [], [], []

# ---------- Training ----------
env = RTSGame()
agent = Agent()
for episode in range(3000):
    state = env.reset()
    agent.begin_episode()
    done = False
    while not done:
        action, logp = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.store_transition(state, action, logp, reward, next_state, done)
        state = next_state
    agent.finish_episode()
    if episode % 200 == 0:
        print(f'Episode {episode} finished')
```