# ============================================
# cascadeflow Provider API Keys & Configuration
# ============================================
# Copy this file to .env and fill in your API keys
# Local providers (Ollama, vLLM) can be used without API keys
# ============================================
# CLOUD PROVIDERS (Require API Keys)
# ============================================
# --------------------------------------------
# OpenAI - Industry-leading quality, most reliable, best for production
# --------------------------------------------
OPENAI_API_KEY=your_openai_api_key_here
# Get your key: https://platform.openai.com/api-keys
# Models: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
#
# Optional: used by live E2E tests that verify tool calling/streaming.
# Use a tool-capable chat model (for example: gpt-4o-mini).
OPENAI_TOOL_MODEL=gpt-4o-mini
# --------------------------------------------
# Anthropic Claude - Best for reasoning and analysis, strong safety features
# --------------------------------------------
ANTHROPIC_API_KEY=your_anthropic_api_key_here
# Get your key: https://console.anthropic.com/settings/keys
# Models: claude-3-opus, claude-3-5-sonnet, claude-3-sonnet, claude-3-haiku
# --------------------------------------------
# Groq - Fastest inference speed, ultra-low latency, free tier available
# --------------------------------------------
GROQ_API_KEY=your_groq_api_key_here
# Get your key: https://console.groq.com/keys
# Models: llama3-70b, llama3-8b, mixtral-8x7b, gemma-7b
# --------------------------------------------
# Together AI - Cost-effective, wide model selection, good for experimentation
# --------------------------------------------
TOGETHER_API_KEY=your_together_api_key_here
# Get your key: https://api.together.xyz/settings/api-keys
# Models: Llama-2, Mixtral, Code Llama, and many more
# --------------------------------------------
# Hugging Face - Open-source models, community-driven, flexible deployment
# --------------------------------------------
HF_TOKEN=your_huggingface_token_here
# Get your key: https://huggingface.co/settings/tokens
# Models: Mistral, Falcon, BLOOM, and thousands more
# Optional: If using Hugging Face Inference Endpoints
HF_INFERENCE_ENDPOINT_URL=https://your-endpoint.endpoints.huggingface.cloud
# Leave empty if using standard Hugging Face API
# --------------------------------------------
# Google (Vertex AI) - Enterprise integration, GCP ecosystem, Gemini models
# --------------------------------------------
GOOGLE_API_KEY=your_google_api_key_here
# Get your key: https://makersuite.google.com/app/apikey
# Models: gemini-pro, gemini-pro-vision, text-bison, chat-bison
# Optional: For Vertex AI (instead of AI Studio)
# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
# GOOGLE_PROJECT_ID=your-gcp-project-id
# GOOGLE_REGION=us-central1
# --------------------------------------------
# Azure OpenAI - Enterprise compliance, HIPAA/SOC2, Microsoft ecosystem
# --------------------------------------------
AZURE_API_KEY=your_azure_api_key_here
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
AZURE_OPENAI_DEPLOYMENT_NAME=your-deployment-name
AZURE_OPENAI_API_VERSION=2024-02-15-preview
# Get your key: https://portal.azure.com/ → Azure OpenAI → Keys and Endpoint
# Models: Same as OpenAI (gpt-4, gpt-3.5-turbo, etc.) but via Azure
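Unlike the plain OpenAI API, Azure routes requests through your deployment name and a pinned API version. A sketch of how the four variables above combine into a request (the URL pattern follows Azure's standard deployments path; your endpoint, deployment, and version will differ):

```shell
# Sketch: Azure OpenAI builds the request URL from endpoint + deployment + api-version,
# and authenticates with an "api-key" header rather than a Bearer token.
curl "${AZURE_OPENAI_ENDPOINT}openai/deployments/${AZURE_OPENAI_DEPLOYMENT_NAME}/chat/completions?api-version=${AZURE_OPENAI_API_VERSION}" \
  -H "api-key: $AZURE_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Hello"}]}'
```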
# --------------------------------------------
# DeepSeek - Specialized code models, very cost-effective for coding tasks
# --------------------------------------------
DEEPSEEK_API_KEY=your_deepseek_api_key_here
# Get your key: https://platform.deepseek.com/api_keys
# Models: deepseek-coder, deepseek-chat
# --------------------------------------------
# OpenRouter - Unified API for 400+ models from multiple providers
# --------------------------------------------
OPENROUTER_API_KEY=your_openrouter_api_key_here
# Get your key: https://openrouter.ai/keys
# Browse models: https://openrouter.ai/models
#
# Model Format: provider/model-name
# Top Models (2025):
# - x-ai/grok-code-fast-1 (most popular, free tier)
# - anthropic/claude-opus-4 (best coding, $15/$75 per 1M tokens)
# - anthropic/claude-4.5-sonnet-20250929 ($3/$15 per 1M tokens)
# - openai/gpt-4o ($2.50/$10 per 1M tokens)
# - google/gemini-2.5-flash ($0.15/$0.60 per 1M, 1M context)
# - deepseek/deepseek-coder-v2 ($0.27/$1.10 per 1M tokens)
# - meta-llama/llama-3.1-8b-instruct ($0.05/$0.05 per 1M tokens)
# - mistralai/devstral-small (free)
# - deepseek/deepseek-chat (free)
#
# Benefits:
# • Single API key for all providers (OpenAI, Anthropic, Google, Meta, X.AI, etc.)
# • Access 400+ models through one endpoint
# • Automatic fallbacks and routing
# • Pass-through pricing (no markup)
# • Free models available (50 requests/day limit)
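Because OpenRouter exposes an OpenAI-compatible endpoint, a quick smoke test needs only curl and your key (model name here is one of the free options listed above; requires a valid `OPENROUTER_API_KEY`):

```shell
# Sketch: minimal OpenRouter chat request through the OpenAI-compatible endpoint.
curl https://openrouter.ai/api/v1/chat/completions \
  -H "Authorization: Bearer $OPENROUTER_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek/deepseek-chat", "messages": [{"role": "user", "content": "Hello"}]}'
```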
# ============================================
# LOCAL / SELF-HOSTED PROVIDERS (No API Keys Required)
# ============================================
# --------------------------------------------
# Ollama - Privacy-first, local deployment, no internet required
# --------------------------------------------
# Ollama can run on localhost or any network-accessible host
# For local Ollama (default)
OLLAMA_BASE_URL=http://localhost:11434
# Models: llama2, mistral, codellama, phi, gemma, etc.
# For Ollama on another machine in your network
# OLLAMA_BASE_URL=http://192.168.1.100:11434
# For Ollama on a remote server
# OLLAMA_BASE_URL=https://ollama.yourdomain.com
# If using authentication:
# OLLAMA_API_KEY=your_custom_auth_token
# -----------------------------------------------------------
# Multi-Instance Ollama (Advanced: Run draft + verifier on different servers)
# -----------------------------------------------------------
# Run two Ollama instances on different GPUs/servers for optimal performance
# Example use case: Fast draft model on GPU 0, powerful verifier on GPU 1
# Primary Ollama instance (for draft models)
# OLLAMA_DRAFT_URL=http://localhost:11434
# Secondary Ollama instance (for verifier models)
# OLLAMA_VERIFIER_URL=http://localhost:11435
# Docker Compose example:
# service ollama-draft: port 11434 → GPU 0 → llama-3.2-1b (fast)
# service ollama-verifier: port 11435 → GPU 1 → llama-3.1-70b (accurate)
#
# Then configure in code:
# models: [
# { name: 'llama-3.2-1b', provider: 'ollama', cost: 0, baseUrl: process.env.OLLAMA_DRAFT_URL },
# { name: 'llama-3.1-70b', provider: 'ollama', cost: 0, baseUrl: process.env.OLLAMA_VERIFIER_URL }
# ]
#
# Full examples:
# - TypeScript: packages/core/examples/nodejs/multi-instance-ollama.ts
# - Python: examples/multi_instance_ollama.py
# - Docker: examples/docker/multi-instance-ollama/
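Without Docker, the same two-instance layout can be sketched with plain `ollama serve`, using `OLLAMA_HOST` to pick the port and `CUDA_VISIBLE_DEVICES` to pin each instance to a GPU (model names and GPU indices are illustrative; adjust for your hardware):

```shell
# Sketch: two Ollama instances on one machine for a draft/verifier cascade.
# Draft instance: GPU 0, default port 11434
CUDA_VISIBLE_DEVICES=0 OLLAMA_HOST=127.0.0.1:11434 ollama serve &
# Verifier instance: GPU 1, second port 11435
CUDA_VISIBLE_DEVICES=1 OLLAMA_HOST=127.0.0.1:11435 ollama serve &
# Pull one model into each instance (OLLAMA_HOST tells the client which to talk to)
OLLAMA_HOST=127.0.0.1:11434 ollama pull llama3.2:1b
OLLAMA_HOST=127.0.0.1:11435 ollama pull llama3.1:70b
```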
# -----------------------------------------------------------
# How to install Ollama:
# 1. Download: https://ollama.ai/download
# 2. Install and run: ollama serve
# 3. Pull a model: ollama pull llama2
# 4. Test: curl http://localhost:11434/api/generate -d '{"model":"llama2","prompt":"Hello"}'
# --------------------------------------------
# vLLM - Self-hosted inference server, full control, optimized performance
# --------------------------------------------
# vLLM can run on localhost, your network, or a remote server
# For local vLLM (default)
VLLM_BASE_URL=http://localhost:8000
# Models: Any model supported by vLLM (Llama, Mistral, Mixtral, etc.)
# For vLLM on another machine in your network
# VLLM_BASE_URL=http://192.168.1.200:8000
# For vLLM on a remote server
# VLLM_BASE_URL=https://vllm.yourdomain.com
# If using authentication:
# VLLM_API_KEY=your_custom_auth_token
# Optional: vLLM model configuration
VLLM_MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2
# Specify which model your vLLM server is serving
# -----------------------------------------------------------
# Multi-Instance vLLM (Advanced: Run draft + verifier on different servers)
# -----------------------------------------------------------
# Run two vLLM instances with different models for optimal cascade performance
# Example use case: Fast 7B model for draft, powerful 70B model for verifier
# Primary vLLM instance (for draft models)
# VLLM_DRAFT_URL=http://localhost:8000/v1
# VLLM_DRAFT_MODEL=Qwen/Qwen2.5-7B-Instruct
# Secondary vLLM instance (for verifier models)
# VLLM_VERIFIER_URL=http://localhost:8001/v1
# VLLM_VERIFIER_MODEL=Qwen/Qwen2.5-72B-Instruct
# Docker/Kubernetes example:
# service vllm-draft: port 8000 → GPU 0 → Qwen 7B (fast, 200 tok/s)
# service vllm-verifier: port 8001 → GPU 1 → Qwen 72B (accurate, 50 tok/s)
#
# Then configure in code:
# models: [
# { name: 'Qwen/Qwen2.5-7B-Instruct', provider: 'vllm', cost: 0, baseUrl: process.env.VLLM_DRAFT_URL },
# { name: 'Qwen/Qwen2.5-72B-Instruct', provider: 'vllm', cost: 0, baseUrl: process.env.VLLM_VERIFIER_URL }
# ]
#
# Full examples:
# - TypeScript: packages/core/examples/nodejs/multi-instance-vllm.ts
# - Python: examples/multi_instance_vllm.py
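The draft/verifier pair above can be started directly with two vLLM servers, one per GPU, matching the ports and model names used in the variables above (resource requirements are substantial; the 72B model assumes sufficient GPU memory):

```shell
# Sketch: launch two vLLM OpenAI-compatible servers for a draft/verifier cascade.
# Draft server: GPU 0, port 8000, fast 7B model
CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-7B-Instruct --port 8000 &
# Verifier server: GPU 1, port 8001, accurate 72B model
CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-72B-Instruct --port 8001 &
```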
# -----------------------------------------------------------
# How to install vLLM:
# 1. Install: pip install vllm
# 2. Start server: python -m vllm.entrypoints.openai.api_server \
# --model mistralai/Mistral-7B-Instruct-v0.2 \
# --port 8000
# 3. Test: curl http://localhost:8000/v1/models
# ============================================
# CONFIGURATION EXAMPLES
# ============================================
# Example 1: Local-only setup (no cloud providers)
# - Set OLLAMA_BASE_URL and VLLM_BASE_URL
# - Leave all API keys empty
# - Great for privacy-sensitive work
# Example 2: Hybrid setup (cloud + local)
# - Configure OpenAI/Anthropic for production quality
# - Configure Ollama for development and testing
# - Use cloud for critical tasks, local for everything else
# Example 3: Multi-cloud setup (all providers)
# - Configure all cloud providers for maximum flexibility
# - Use cascadeflow's automatic fallback system
# - Let cascadeflow pick the best provider for each task
# Example 4: Enterprise setup (Azure + local)
# - Configure Azure OpenAI for compliance
# - Configure vLLM for high-volume, low-cost inference
# - Keep sensitive data on-premises
# ============================================
# PROVIDER SELECTION GUIDE
# ============================================
# For production quality: OpenAI, Anthropic
# For speed: Groq, vLLM
# For cost optimization: Together, DeepSeek, Ollama
# For privacy/compliance: Ollama, vLLM, Azure
# For experimentation: HuggingFace, Together
# For code tasks: DeepSeek, OpenAI (GPT-4)
# For reasoning: Anthropic (Claude), OpenAI (GPT-4)
# For enterprise: Azure, Google (Vertex AI)
# ============================================
# TESTING YOUR SETUP
# ============================================
# After configuring, test with:
# python examples/integrations/test_all_providers.py
# This will:
# - Check which providers are configured
# - Test cost calculations for each provider
# - Identify any missing API keys or configuration issues
# - Generate a status report
# ============================================
# SECURITY NOTES
# ============================================
# 1. Never commit .env file to version control
# 2. Keep API keys secure and rotate them regularly
# 3. Use environment-specific .env files (.env.dev, .env.prod)
# 4. For production, use a secrets manager (AWS Secrets Manager, HashiCorp Vault, etc.)
# 5. Limit API key permissions to only what's needed
# 6. Monitor API usage and set up billing alerts
# ============================================
# TROUBLESHOOTING
# ============================================
# Issue: "API key not found"
# Solution: Make sure .env file is in the project root and keys are set
# Issue: "Connection refused" for Ollama/vLLM
# Solution: Check if the service is running and BASE_URL is correct
# Issue: "Model not found"
# Solution: For Ollama, pull the model first: ollama pull <model>
# For vLLM, make sure the model is loaded on the server
# Issue: "Rate limit exceeded"
# Solution: Use cascadeflow's budget tracking to manage API usage
# Or configure multiple providers for automatic fallback
# ============================================
# ADDITIONAL RESOURCES
# ============================================
# cascadeflow Documentation: https://github.com/yourusername/cascadeflow
# OpenAI API: https://platform.openai.com/docs
# Anthropic API: https://docs.anthropic.com/
# Groq API: https://console.groq.com/docs
# Together API: https://docs.together.ai/
# Hugging Face: https://huggingface.co/docs
# Google AI: https://ai.google.dev/docs
# Azure OpenAI: https://learn.microsoft.com/azure/ai-services/openai/
# DeepSeek: https://platform.deepseek.com/docs
# Ollama: https://ollama.ai/docs
# vLLM: https://docs.vllm.ai/
# ============================================
# OPTIONAL CASCADEFLOW SETTINGS
# ============================================
# Logging level (DEBUG, INFO, WARNING, ERROR)
LOG_LEVEL=INFO
# Default quality threshold for responses (0.0 - 1.0)
CASCADEFLOW_DEFAULT_QUALITY_THRESHOLD=0.7
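For reference, a POSIX shell can load a file of plain KEY=value lines like this one into its environment with `set -a` (a minimal sketch that assumes simple unquoted values, as used throughout this file; it is not a full .env parser):

```shell
# Sketch: export KEY=value pairs from an env file into the current environment.
ENV_FILE=$(mktemp)
printf 'LOG_LEVEL=INFO\nCASCADEFLOW_DEFAULT_QUALITY_THRESHOLD=0.7\n' > "$ENV_FILE"
set -a          # auto-export every variable assigned while this flag is on
. "$ENV_FILE"   # POSIX equivalent of "source"
set +a
echo "$LOG_LEVEL"   # prints INFO
```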