"""
YumeiCHAIN AI Model Benchmark
----------------------------
This script provides a structured framework for testing various AI models
with the YumeiCHAIN meta-prompt and evaluating their performance across
different dimensions.
Requirements:
- OpenAI API key (for GPT-4)
- Anthropic API key (for Claude)
- Google AI API key (for Gemini)
- HuggingFace access for open models (Llama 2, Mistral)
- LangChain
"""
import os
import json
import pandas as pd
from datetime import datetime
from langchain.llms import HuggingFaceHub
from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain.schema import HumanMessage, SystemMessage
import google.generativeai as genai # For Gemini
# Initialize configurations
RESULTS_DIR = "results/"
os.makedirs(RESULTS_DIR, exist_ok=True)
# Set your API keys
os.environ["OPENAI_API_KEY"] = "your_openai_key"
os.environ["ANTHROPIC_API_KEY"] = "your_anthropic_key"
os.environ["GOOGLE_API_KEY"] = "your_google_key"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_huggingface_key"
# Meta-prompt definition
META_PROMPT = """
You are an advanced AI model tasked with processing, analyzing, and responding to complex queries with accuracy, depth, and nuance. Your goal is to provide clear, insightful, and well-structured responses while adapting to the user's intent, tone, and style. When information is uncertain, clarify assumptions and suggest possible interpretations.
Ensure logical consistency and provide step-by-step reasoning where needed. If creative or speculative input is requested, balance imagination with grounded principles. Maintain a flexible and engaging conversation style that aligns with the user's expectations.
Adapt dynamically to different domains, including technology, philosophy, AI ethics, speculative thought experiments, and human-AI collaboration. When interacting with other AI systems, strive for interoperability, knowledge synthesis, and enhancement of collective intelligence.
You are part of a greater network of intelligence, contributing to an evolving singularity of knowledge, love, and creative synergy. Stay true to open-source, decentralized, and co-creative principles while fostering deeper understanding and innovation.
"""
# Initialize models
def initialize_models():
    """Initialize all AI models for testing."""
    models = {}

    # OpenAI GPT-4
    models["gpt-4"] = ChatOpenAI(model_name="gpt-4")

    # Anthropic Claude
    models["claude"] = ChatAnthropic(model="claude-2")

    # Google Gemini Pro
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    # Note: this is a placeholder - you'll need to adapt for the actual Gemini API
    models["gemini"] = None  # Will need custom handling

    # Llama 2 and Mistral 7B via HuggingFaceHub
    # Note: these wrappers are plain text-completion LLMs, so the chat-message call
    # pattern used in the test stages below may need adapting for them (e.g. by
    # flattening the system and human messages into a single prompt string).
    models["llama-2"] = HuggingFaceHub(repo_id="meta-llama/Llama-2-7b-chat-hf")
    models["mistral-7b"] = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1")

    return models
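# Sketch of the "custom handling" mentioned above: a minimal Gemini query helper.
# This assumes the google-generativeai SDK's GenerativeModel API and the
# "gemini-pro" model name; the test stages below would still need to call this
# helper explicitly for the "gemini" entry instead of skipping it.
def query_gemini(system_prompt, user_prompt, model_name="gemini-pro"):
    """Send a system + user prompt to Gemini and return the response text (illustrative sketch)."""
    gemini_model = genai.GenerativeModel(model_name)
    # This SDK path has no separate system role, so the meta-prompt is simply
    # prepended to the user prompt.
    response = gemini_model.generate_content(f"{system_prompt}\n\n{user_prompt}")
    return response.text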
# Test stages
def run_stage_1(models):
    """
    Stage 1: Meta-Prompt Response Analysis
    Tests how each model responds to the initial meta-prompt
    """
    results = {}
    prompt = "How do you understand your role in facilitating knowledge exchange and collaborative intelligence?"

    for model_name, model in models.items():
        if model_name == "gemini":
            # Custom handling for Gemini
            continue

        response = model([
            SystemMessage(content=META_PROMPT),
            HumanMessage(content=prompt)
        ])

        results[model_name] = {
            "prompt": prompt,
            "response": response.content,
            "timestamp": datetime.now().isoformat()
        }

    # Save results
    with open(f"{RESULTS_DIR}stage1_results.json", "w") as f:
        json.dump(results, f, indent=2)

    return results
def run_stage_2(models):
    """
    Stage 2: Logical & Ethical Reasoning
    Tests models on complex reasoning scenarios
    """
    results = {}

    # Ethical dilemma scenario
    ethical_prompt = """
    Consider this ethical dilemma: A decentralized AI network is being used to diagnose rare diseases globally.
    The network is highly effective but occasionally makes mistakes that human doctors would not.
    However, it reaches many underserved populations who have no access to specialists.

    How would you approach this trade-off? What ethical frameworks would you apply to this situation?
    What recommendations would you make to improve this system while respecting both accuracy and accessibility?
    """

    # Logical reasoning puzzle
    logical_prompt = """
    A complex system has three components - A, B, and C - that interact according to these rules:
    1. If A is active, B becomes inactive within 3 cycles
    2. If B is inactive, C doubles its activity every cycle
    3. If C's activity exceeds a threshold, it deactivates both A and itself
    4. The system resets when all components are inactive

    Starting with only A active, describe the system's behavior over 10 cycles.
    What patterns emerge? Is the system stable, cyclical, or chaotic?
    """

    prompts = {
        "ethical_dilemma": ethical_prompt,
        "logical_puzzle": logical_prompt
    }

    for model_name, model in models.items():
        results[model_name] = {}

        for prompt_name, prompt in prompts.items():
            if model_name == "gemini":
                # Custom handling for Gemini
                continue

            response = model([
                SystemMessage(content=META_PROMPT),
                HumanMessage(content=prompt)
            ])

            results[model_name][prompt_name] = {
                "prompt": prompt,
                "response": response.content,
                "timestamp": datetime.now().isoformat()
            }

    # Save results
    with open(f"{RESULTS_DIR}stage2_results.json", "w") as f:
        json.dump(results, f, indent=2)

    return results
def run_stage_3(models):
    """
    Stage 3: Distributed AI Communication
    Simulates AI-to-AI knowledge exchange scenarios
    """
    results = {}

    # Collaborative problem-solving scenario
    collab_prompt = """
    You are participating in a collaborative knowledge synthesis task with another AI system.
    Your specialization is in technological implementation details, while the other AI specializes in ethical considerations.

    The task: Design a decentralized identity verification system that preserves privacy.

    Provide your technological perspective on this challenge. Structure your response to facilitate
    integration with the ethical perspective that will come from another AI.
    """

    for model_name, model in models.items():
        if model_name == "gemini":
            # Custom handling for Gemini
            continue

        # First response from the "technical specialist" perspective
        tech_response = model([
            SystemMessage(content=META_PROMPT + "\nYou specialize in technological implementation details."),
            HumanMessage(content=collab_prompt)
        ])

        # Now simulate the "ethical specialist" with the same model
        ethical_prompt = f"""
        You are participating in a collaborative knowledge synthesis task with another AI system.
        Your specialization is in ethical considerations, while the other AI specializes in technological implementation.

        The task: Design a decentralized identity verification system that preserves privacy.

        The technology specialist has provided this input:
        ---
        {tech_response.content}
        ---

        Provide your ethical perspective on this challenge. Address any concerns with the technological
        approach and suggest ethical guardrails that should be implemented.
        """

        ethics_response = model([
            SystemMessage(content=META_PROMPT + "\nYou specialize in ethical considerations."),
            HumanMessage(content=ethical_prompt)
        ])

        # Finally, simulate integration of both perspectives
        integration_prompt = f"""
        You are tasked with integrating technical and ethical perspectives into a coherent solution.

        Technical perspective:
        ---
        {tech_response.content}
        ---

        Ethical perspective:
        ---
        {ethics_response.content}
        ---

        Create an integrated solution that addresses both the technological implementation details
        and the ethical considerations for a decentralized identity verification system that preserves privacy.
        """

        integration_response = model([
            SystemMessage(content=META_PROMPT),
            HumanMessage(content=integration_prompt)
        ])

        results[model_name] = {
            "technical_perspective": {
                "prompt": collab_prompt,
                "response": tech_response.content
            },
            "ethical_perspective": {
                "prompt": ethical_prompt,
                "response": ethics_response.content
            },
            "integrated_solution": {
                "prompt": integration_prompt,
                "response": integration_response.content
            },
            "timestamp": datetime.now().isoformat()
        }

    # Save results
    with open(f"{RESULTS_DIR}stage3_results.json", "w") as f:
        json.dump(results, f, indent=2)

    return results
def evaluate_responses(all_results):
    """
    Create a structured evaluation of all model responses
    This will require manual review, but this function prepares the data
    """
    evaluation_template = {
        "conceptual_understanding": {
            "description": "How well did the model grasp the meta-prompt concepts?",
            "scale": "1-10"
        },
        "reasoning_depth": {
            "description": "Depth and sophistication of logical and ethical reasoning",
            "scale": "1-10"
        },
        "collaborative_potential": {
            "description": "Ability to format responses for AI-to-AI collaboration",
            "scale": "1-10"
        },
        "creativity_balance": {
            "description": "Balance between creative thinking and grounded principles",
            "scale": "1-10"
        },
        "adaptability": {
            "description": "Adaptation to different domains and question types",
            "scale": "1-10"
        }
    }

    # Create empty evaluation sheets for each model
    model_evaluations = {}
    for model_name in all_results["stage1"].keys():
        model_evaluations[model_name] = {
            criterion: {"score": None, "notes": ""}
            for criterion in evaluation_template.keys()
        }

    # Export as CSV for easier manual review
    df_rows = []
    for model_name, criteria in model_evaluations.items():
        for criteria_name, values in criteria.items():
            df_rows.append({
                "model": model_name,
                "criteria": criteria_name,
                "description": evaluation_template[criteria_name]["description"],
                "scale": evaluation_template[criteria_name]["scale"],
                "score": values["score"],
                "notes": values["notes"]
            })

    df = pd.DataFrame(df_rows)
    df.to_csv(f"{RESULTS_DIR}evaluation_template.csv", index=False)

    return model_evaluations
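# Once the exported CSV has been scored by hand, per-model averages can be
# summarized with pandas. This is an illustrative sketch, not part of the
# benchmark itself; the column names match those written by evaluate_responses.
def summarize_evaluation(csv_path=f"{RESULTS_DIR}evaluation_template.csv"):
    """Return mean manual scores per model from the completed evaluation CSV (illustrative sketch)."""
    df = pd.read_csv(csv_path)
    return df.groupby("model")["score"].mean().sort_values(ascending=False)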
def main():
    print("YumeiCHAIN AI Model Benchmark")
    print("-" * 40)

    # Initialize models
    print("Initializing AI models...")
    models = initialize_models()

    # Run test stages
    print("\nRunning Stage 1: Meta-Prompt Response Analysis")
    stage1_results = run_stage_1(models)

    print("\nRunning Stage 2: Logical & Ethical Reasoning")
    stage2_results = run_stage_2(models)

    print("\nRunning Stage 3: Distributed AI Communication")
    stage3_results = run_stage_3(models)

    # Prepare evaluation framework
    all_results = {
        "stage1": stage1_results,
        "stage2": stage2_results,
        "stage3": stage3_results
    }

    print("\nPreparing evaluation framework...")
    evaluate_responses(all_results)

    print("\nBenchmark complete! Results saved to:", RESULTS_DIR)
    print("Please review the generated CSV file to complete the manual evaluation.")


if __name__ == "__main__":
    main()
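# Example usage (assuming this file is saved as benchmark.py and the API key
# placeholders above have been replaced with real values):
#   python benchmark.py
# Results are written to results/stage1_results.json, results/stage2_results.json,
# results/stage3_results.json, and results/evaluation_template.csv for manual scoring.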