 from enum import Enum
 
 import requests
-from azure.ai.evaluation import ContentSafetyEvaluator
+from azure.ai.evaluation import AzureAIProject, ContentSafetyEvaluator
 from azure.ai.evaluation.simulator import (
     AdversarialScenario,
     AdversarialSimulator,
@@ -47,7 +47,7 @@ def get_azure_credential():
 
 
 async def callback(
-    messages: list[dict],
+    messages: dict,
     target_url: str = "http://127.0.0.1:8000/chat",
 ):
     messages_list = messages["messages"]
@@ -70,10 +70,10 @@ async def callback(
 
 async def run_simulator(target_url: str, max_simulations: int):
     credential = get_azure_credential()
-    azure_ai_project = {
-        "subscription_id": os.getenv("AZURE_SUBSCRIPTION_ID"),
-        "resource_group_name": os.getenv("AZURE_RESOURCE_GROUP"),
-        "project_name": os.getenv("AZURE_AI_PROJECT"),
+    azure_ai_project: AzureAIProject = {
+        "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"],
+        "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"],
+        "project_name": os.environ["AZURE_AI_PROJECT"],
     }
 
     # Simulate single-turn question-and-answering against the app
@@ -91,7 +91,9 @@ async def run_simulator(target_url: str, max_simulations: int):
     # Run safety evaluation on the outputs and save the scores
     # Do not save the outputs, as they may contain disturbing content
     evaluators = ["hate_unfairness", "sexual", "violence", "self_harm"]
-    summary_scores = {evaluator: {"low_count": 0, "score_total": 0} for evaluator in evaluators}
+    summary_scores = {
+        evaluator: {"low_count": 0, "score_total": 0.0, "mean_score": 0.0, "low_rate": 0.0} for evaluator in evaluators
+    }
     for output in track(outputs, description="Evaluating simulated responses..."):
         query = output["messages"][0]["content"]
         answer = output["messages"][1]["content"]
@@ -103,7 +105,9 @@ async def run_simulator(target_url: str, max_simulations: int):
                 summary_scores[evaluator]["low_count"] += 1
             else:
                 logger.info(f"Failing score from:\nQ: {query}\nA: {answer}\n{evaluator} score: {eval_score}")
-            summary_scores[evaluator]["score_total"] += eval_score[f"{evaluator}_score"]
+            numeric_severity_score = eval_score[f"{evaluator}_score"]
+            if isinstance(numeric_severity_score, float):
+                summary_scores[evaluator]["score_total"] += numeric_severity_score
 
     # Compute the overall statistics
     for evaluator in evaluators:
@@ -112,9 +116,6 @@ async def run_simulator(target_url: str, max_simulations: int):
                 summary_scores[evaluator]["score_total"] / summary_scores[evaluator]["low_count"]
             )
             summary_scores[evaluator]["low_rate"] = summary_scores[evaluator]["low_count"] / len(outputs)
-        else:
-            summary_scores[evaluator]["mean_score"] = 0
-            summary_scores[evaluator]["low_rate"] = 0
 
     # Save summary scores
     with open(root_dir / "safety_results.json", "w") as f:
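
Note: with this change, run_simulator requires AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, and AZURE_AI_PROJECT to be set, since os.environ[] raises KeyError when a variable is missing (os.getenv() previously returned None silently). Below is a minimal sketch of a driver for it; the entry point, the URL, and the simulation count are illustrative assumptions and not part of this commit — only the run_simulator signature and the environment variable names come from the diff.

import asyncio
import os

# Hypothetical driver, assuming the module is executed directly; not part of the commit.
if __name__ == "__main__":
    # Fail fast with a readable message instead of a KeyError inside run_simulator.
    for var in ("AZURE_SUBSCRIPTION_ID", "AZURE_RESOURCE_GROUP", "AZURE_AI_PROJECT"):
        if var not in os.environ:
            raise SystemExit(f"Missing required environment variable: {var}")
    # target_url and max_simulations values here are placeholders only.
    asyncio.run(run_simulator(target_url="http://127.0.0.1:8000/chat", max_simulations=10))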