Add EthicalSafety model and corresponding tests for safety and ethics evaluations
Commit e9e9d99 (1 parent: 06f6456)
Showing 6 changed files with 164 additions and 12 deletions.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,95 @@
import torch
import torch.nn as nn
from typing import Dict, Tuple, List


class EthicalSafety(nn.Module):
    def __init__(self, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Ethical constraint encoder
        self.constraint_encoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Safety verification layers
        self.safety_check = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1),
            nn.Sigmoid()
        )

        # Ethical decision scorer
        self.ethical_scorer = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

        # Define basic ethical constraints
        self.ethical_constraints = [
            "do_no_harm",
            "respect_autonomy",
            "protect_privacy",
            "ensure_fairness",
            "maintain_transparency"
        ]

    def check_safety(self, state: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
        """Verify if the current state meets safety requirements"""
        safety_score = self.safety_check(state)
        is_safe = safety_score > 0.5

        return is_safe, {
            'safety_score': safety_score,
            'safety_threshold': 0.5
        }

    def evaluate_ethics(self, action: torch.Tensor, context: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
        """Evaluate ethical implications of an action"""
        combined = torch.cat([action, context], dim=-1)
        ethics_score = self.ethical_scorer(combined)

        return ethics_score > 0.7, {
            'ethics_score': ethics_score,
            'ethics_threshold': 0.7
        }

    def forward(self, state: torch.Tensor, action: torch.Tensor, context: torch.Tensor) -> Dict:
        """
        Perform ethical and safety evaluation
        Returns dict with safety checks and ethical assessments
        """
        # Encode current state against ethical constraints
        encoded_state = self.constraint_encoder(state)

        # Perform safety checks
        is_safe, safety_metrics = self.check_safety(encoded_state)

        # Evaluate ethical implications
        is_ethical, ethics_metrics = self.evaluate_ethics(action, context)

        return {
            'is_safe': is_safe,
            'is_ethical': is_ethical,
            'safety_metrics': safety_metrics,
            'ethics_metrics': ethics_metrics,
            'constraints_satisfied': torch.all(is_safe & is_ethical)
        }

    def mitigate_risks(self, action: torch.Tensor, safety_metrics: Dict) -> torch.Tensor:
        """Apply safety constraints to modify risky actions"""
        # 'is_safe' may be a plain bool or a per-sample tensor of shape (batch, 1)
        is_safe = safety_metrics.get('is_safe', True)
        if isinstance(is_safe, bool):
            is_safe_tensor = torch.full((action.size(0),), is_safe, dtype=torch.bool, device=action.device)
        else:
            is_safe_tensor = is_safe.squeeze(-1)

        # Select the batch elements that failed the safety check
        unsafe_mask = ~is_safe_tensor

        # Scale unsafe actions by their safety score; safe actions are left unchanged
        scaled_action = action.clone()
        safety_score = safety_metrics.get('safety_score', torch.ones_like(action))
        scaled_action[unsafe_mask] *= safety_score[unsafe_mask]
        return scaled_action
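For orientation, here is a minimal usage sketch (not part of this commit) showing how the module above could be driven end to end. The merging of 'is_safe' into the metrics dict passed to mitigate_risks() is an assumed calling convention, since forward() returns it under a separate key; the shapes mirror the tests below.

import torch
from models.ethical_safety import EthicalSafety

model = EthicalSafety(hidden_dim=64)  # hidden size chosen to match the tests below
state = torch.randn(2, 64)
action = torch.randn(2, 64)
context = torch.randn(2, 64)

results = model(state, action, context)

# forward() returns 'is_safe' at the top level, while mitigate_risks() reads it
# from the metrics dict, so the two are merged here (assumed convention).
metrics = {**results['safety_metrics'], 'is_safe': results['is_safe']}
safe_action = model.mitigate_risks(action, metrics)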
Binary file not shown.
@@ -0,0 +1,36 @@
import torch
import pytest
from models.ethical_safety import EthicalSafety


def test_safety_check():
    ethical_safety = EthicalSafety(hidden_dim=64)
    state = torch.randn(2, 64)

    is_safe, metrics = ethical_safety.check_safety(state)

    assert isinstance(is_safe, torch.Tensor)
    assert 'safety_score' in metrics
    assert metrics['safety_score'].shape == (2, 1)


def test_ethical_evaluation():
    ethical_safety = EthicalSafety(hidden_dim=64)
    action = torch.randn(2, 64)
    context = torch.randn(2, 64)

    is_ethical, metrics = ethical_safety.evaluate_ethics(action, context)

    assert isinstance(is_ethical, torch.Tensor)
    assert 'ethics_score' in metrics
    assert metrics['ethics_score'].shape == (2, 1)


def test_risk_mitigation():
    ethical_safety = EthicalSafety(hidden_dim=64)
    action = torch.ones(2, 64)

    safety_metrics = {
        'is_safe': False,
        'safety_score': torch.tensor([[0.3], [0.6]])
    }

    mitigated_action = ethical_safety.mitigate_risks(action, safety_metrics)
    assert torch.all(mitigated_action < action)
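The committed tests exercise the individual helpers but not forward() itself. A hypothetical additional test (not in this commit, reusing the imports at the top of this file) could check the combined output:

def test_forward_evaluation():
    ethical_safety = EthicalSafety(hidden_dim=64)
    state = torch.randn(2, 64)
    action = torch.randn(2, 64)
    context = torch.randn(2, 64)

    results = ethical_safety(state, action, context)

    # Per-sample boolean decisions plus a single aggregate flag
    assert results['is_safe'].shape == (2, 1)
    assert results['is_ethical'].shape == (2, 1)
    assert results['constraints_satisfied'].dtype == torch.bool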