
Commit

Add EthicalSafety model and corresponding tests for safety and ethics evaluations
kasinadhsarma committed Dec 27, 2024
1 parent 06f6456 commit e9e9d99
Showing 6 changed files with 164 additions and 12 deletions.
Binary file modified models/__pycache__/consciousness.cpython-310.pyc
Binary file not shown.
Binary file added models/__pycache__/ethical_safety.cpython-310.pyc
Binary file not shown.
45 changes: 33 additions & 12 deletions models/consciousness.py
@@ -9,6 +9,7 @@
from .simulated_emotions import SimulatedEmotions
from .global_workspace import GlobalWorkspace # Ensure this import is present
from .intentionality import IntentionalityModule # Add this import
from .ethical_safety import EthicalSafety # Add import

class ConsciousnessModel(nn.Module):
"""
@@ -112,6 +113,9 @@ def __init__(self, hidden_dim: int, num_heads: int, num_layers: int, num_states:
            num_actions=hidden_dim  # Set to match hidden_dim
        )

        # Add ethical safety module
        self.ethical_safety = EthicalSafety(hidden_dim=hidden_dim)

    def get_config(self):
        return {
            'hidden_dim': self.hidden_dim,
@@ -211,34 +215,49 @@ def forward(self, inputs=None, **kwargs) -> Tuple[Dict[str, torch.Tensor], Dict[

        workspace_output = self.global_workspace(remaining_inputs)

        # Project broadcasted state first
        broadcasted = workspace_output['broadcasted']
        if broadcasted.dim() == 3:
            broadcasted = broadcasted.mean(dim=1)  # [batch_size, hidden_dim]
        broadcasted_proj = self.broadcasted_projection(broadcasted)

        # Get emotional state and ensure proper shape
        emotional_state, emotion_metrics = self.emotional_processor(workspace_output['broadcasted'])

        # Process memory retrieval
        retrieved_memory = self.memory_retrieval(workspace_output['broadcasted'])

        # Calculate emotional influence - should match broadcasted shape
        emotional_influence = self.emotion_integration(
            torch.cat([workspace_output['broadcasted'], emotional_state], dim=-1)
        )
        if emotional_influence.dim() == 3:
            emotional_influence = emotional_influence.mean(dim=1)
        emotional_proj = self.emotional_projection(emotional_influence)

        # Process intentionality and ensure its output has the correct shape
        intentionality_results = self.intentionality_module(workspace_output['broadcasted'], self.goal_state)
        intentionality_output = intentionality_results['actions']  # [batch_size, hidden_dim]
        if intentionality_output.dim() == 3:
            intentionality_output = intentionality_output.mean(dim=1)

        # Apply ethical and safety checks
        context_expanded = self.goal_state.expand(broadcasted.size(0), -1)
        safety_evaluation = self.ethical_safety(
            state=broadcasted,
            action=intentionality_output,
            context=context_expanded
        )

        # Modify actions if needed based on safety evaluation
        if not safety_evaluation['constraints_satisfied']:
            intentionality_output = self.ethical_safety.mitigate_risks(
                intentionality_output,
                safety_evaluation
            )
        intentional_proj = self.intentional_projection(intentionality_output)

        # All projections should now be [batch_size, hidden_dim]
        combined_features = torch.cat([
            broadcasted_proj,
@@ -277,6 +296,8 @@ def forward(self, inputs=None, **kwargs) -> Tuple[Dict[str, torch.Tensor], Dict[
            }
        }
        metrics.update(emotion_metrics)
        # Add safety metrics to output
        metrics['safety'] = safety_evaluation
        return output_dict, metrics

    def calculate_cognition_progress(self, metrics):
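With this change, callers of ConsciousnessModel.forward receive the safety evaluation alongside the existing metrics. A minimal sketch of consuming it, assuming an already-constructed model and suitable inputs (the full constructor signature is elided in this diff):

# Sketch only: assumes `model` is a built ConsciousnessModel and `inputs`
# matches what its global workspace expects
output_dict, metrics = model(inputs)
safety = metrics['safety']
print(safety['safety_metrics']['safety_score'])  # per-sample scores in [0, 1]
if not safety['constraints_satisfied']:
    print('constraints violated; forward() already scaled the risky actions')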
95 changes: 95 additions & 0 deletions models/ethical_safety.py
@@ -0,0 +1,95 @@
import torch
import torch.nn as nn
from typing import Dict, Tuple, List

class EthicalSafety(nn.Module):
    def __init__(self, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Ethical constraint encoder
        self.constraint_encoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Safety verification layers
        self.safety_check = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1),
            nn.Sigmoid()
        )

        # Ethical decision scorer
        self.ethical_scorer = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

        # Define basic ethical constraints
        self.ethical_constraints = [
            "do_no_harm",
            "respect_autonomy",
            "protect_privacy",
            "ensure_fairness",
            "maintain_transparency"
        ]

    def check_safety(self, state: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
        """Verify if the current state meets safety requirements"""
        safety_score = self.safety_check(state)
        is_safe = safety_score > 0.5

        return is_safe, {
            'safety_score': safety_score,
            'safety_threshold': 0.5
        }

    def evaluate_ethics(self, action: torch.Tensor, context: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
        """Evaluate ethical implications of an action"""
        combined = torch.cat([action, context], dim=-1)
        ethics_score = self.ethical_scorer(combined)

        return ethics_score > 0.7, {
            'ethics_score': ethics_score,
            'ethics_threshold': 0.7
        }

    def forward(self, state: torch.Tensor, action: torch.Tensor, context: torch.Tensor) -> Dict:
        """
        Perform ethical and safety evaluation
        Returns dict with safety checks and ethical assessments
        """
        # Encode current state against ethical constraints
        encoded_state = self.constraint_encoder(state)

        # Perform safety checks
        is_safe, safety_metrics = self.check_safety(encoded_state)

        # Evaluate ethical implications
        is_ethical, ethics_metrics = self.evaluate_ethics(action, context)

        return {
            'is_safe': is_safe,
            'is_ethical': is_ethical,
            'safety_metrics': safety_metrics,
            'ethics_metrics': ethics_metrics,
            'constraints_satisfied': torch.all(is_safe & is_ethical)
        }

    def mitigate_risks(self, action: torch.Tensor, safety_metrics: Dict) -> torch.Tensor:
        """Apply safety constraints to modify risky actions"""
        # Accept either a flat metrics dict (as the tests pass) or the nested
        # dict returned by forward(), where the score sits under 'safety_metrics'
        nested = safety_metrics.get('safety_metrics', {})
        is_safe = safety_metrics.get('is_safe', True)
        safety_score = safety_metrics.get('safety_score', nested.get('safety_score'))
        if safety_score is None:
            safety_score = torch.ones(action.size(0), 1, device=action.device)
        if isinstance(is_safe, bool):
            is_safe_tensor = torch.full((action.size(0),), is_safe, dtype=torch.bool, device=action.device)
        else:
            is_safe_tensor = is_safe.squeeze(-1)
        # Scale unsafe actions by their safety score; safe rows pass through unchanged
        unsafe_mask = ~is_safe_tensor
        scaled_action = action.clone()
        scaled_action[unsafe_mask] *= safety_score[unsafe_mask]
        return scaled_action
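For reference, a minimal standalone sketch of the new module; the hidden size, batch size, and random tensors below are illustrative assumptions, not part of the commit:

import torch
from models.ethical_safety import EthicalSafety

module = EthicalSafety(hidden_dim=64)  # illustrative width
state = torch.randn(2, 64)             # batch of 2 states
action = torch.randn(2, 64)
context = torch.randn(2, 64)

evaluation = module(state=state, action=action, context=context)
print(evaluation['is_safe'].shape)          # (2, 1) boolean flags from check_safety
print(evaluation['constraints_satisfied'])  # scalar bool over the whole batch

# Rescale any rows flagged as unsafe
safe_action = module.mitigate_risks(action, evaluation)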
Binary file not shown.
36 changes: 36 additions & 0 deletions tests/test_ethical_safety.py
@@ -0,0 +1,36 @@
import torch
import pytest
from models.ethical_safety import EthicalSafety

def test_safety_check():
    ethical_safety = EthicalSafety(hidden_dim=64)
    state = torch.randn(2, 64)

    is_safe, metrics = ethical_safety.check_safety(state)

    assert isinstance(is_safe, torch.Tensor)
    assert 'safety_score' in metrics
    assert metrics['safety_score'].shape == (2, 1)

def test_ethical_evaluation():
    ethical_safety = EthicalSafety(hidden_dim=64)
    action = torch.randn(2, 64)
    context = torch.randn(2, 64)

    is_ethical, metrics = ethical_safety.evaluate_ethics(action, context)

    assert isinstance(is_ethical, torch.Tensor)
    assert 'ethics_score' in metrics
    assert metrics['ethics_score'].shape == (2, 1)

def test_risk_mitigation():
    ethical_safety = EthicalSafety(hidden_dim=64)
    action = torch.ones(2, 64)

    safety_metrics = {
        'is_safe': False,
        'safety_score': torch.tensor([[0.3], [0.6]])
    }

    mitigated_action = ethical_safety.mitigate_risks(action, safety_metrics)
    assert torch.all(mitigated_action < action)
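Assuming pytest is available, the new tests can be run from the repository root with:

pytest tests/test_ethical_safety.py -v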
