最近在做一个需要调用多个大模型的项目踩了不少坑。把方案整理出来分享给同样在搞多模型接入的同学。本文包含完整的Python代码可以直接跑。涉及DeepSeek、通义千问、智谱GLM三个模型的统一接入和智能路由。一、为什么要做多模型统一接入项目需求是这样的一个知识库问答系统不同问题要路由到不同模型。简单问题走便宜的快模型复杂推理走深度模型代码相关走擅长编程的模型。如果每个模型单独对接你会面对三个痛点每家请求格式不一样DeepSeek用OpenAI兼容格式通义千问有自己的SDKGLM又是一套每家的计费规则和Token计算方式不同月底对账头疼某家模型挂了没有自动降级用户体验直接崩解决方案就是在业务层和模型之间加一层抽象——统一请求格式、统一错误处理、统一成本核算。二、架构设计核心设计决策三、完整代码实现3.1 项目结构3.2 配置文件# config.yamlmodels:deepseek:endpoint:https://api.deepseek.com/v1/chat/completionsmodel_name:deepseek-chatprice_input:0.001# 元/千Tokenprice_output:0.002max_retries:3timeout:30qwen:endpoint:https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completionsmodel_name:qwen-plusprice_input:0.0008price_output:0.002glm:endpoint:https://open.bigmodel.cn/api/paas/v4/chat/completionsmodel_name:glm-4price_input:0.001price_output:0.001routing:default_model:deepseekfallback_chain:[deepseek,qwen,glm]cost_limit_per_call:0.05credentials:deepseek:${DEEPSEEK_API_KEY}qwen:${DASHSCOPE_API_KEY}glm:${GLM_API_KEY}这里说一句接入凭证千万别硬编码到代码里。用环境变量读取生产环境接密钥管理服务。3.3 统一数据结构#gateway/base.pyfrom abc import ABC,abstractmethod from dataclasses import dataclass from typing import Optional,Dict,Any,List fromenumimport Enum import httpx import os import yaml classRole(str,Enum):SYSTEMsystemUSERuserASSISTANTassistantdataclass class Message:role:Role content:strdataclass class ChatRequest:统一请求体——不管底层调哪个模型业务层只用这个messages:List[Message]model:Optional[str]None # 指定模型None则走默认路由 temperature:float0.7max_tokens:int4096dataclass class TokenUsage:input_tokens:intoutput_tokens:inttotal_tokens:intdataclass class ChatResponse:统一响应体content:str model:str usage:TokenUsage cost:float# 本次调用费用单位元 latency_ms:float# 响应耗时 classBaseAdapter(ABC):模型适配器基类——所有模型适配器继承这个def__init__(self,config:Dict[str,Any]):self.endpointconfig[endpoint]self.model_nameconfig[model_name]self.input_priceconfig[price_input]self.output_priceconfig[price_output]self.max_retriesconfig.get(max_retries,3)self.timeoutconfig.get(timeout,30)# 
接入凭证从环境变量读取self.api_keyos.environ.get(config.get(env_key,))abstractmethod defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:将统一请求转为该模型的原生格式passabstractmethod defparse_response(self,raw:Dict[str,Any])-ChatResponse:将原生响应转为统一格式pass defcalc_cost(self,input_tokens:int,output_tokens:int)-float:计算本次调用费用returnround(input_tokens*self.input_price/1000output_tokens*self.output_price/1000,6)async defcall(self,request:ChatRequest)-ChatResponse:发起HTTP请求带重试headers{Authorization:fBearer {self.api_key},Content-Type:application/json}payloadself.build_payload(request)async with httpx.AsyncClient(timeoutself.timeout)as client:forattemptinrange(self.max_retries):try:respawait client.post(self.endpoint,jsonpayload,headersheaders)resp.raise_for_status()returnself.parse_response(resp.json())except Exception as e:ifattemptself.max_retries-1:raisecontinue3.4 具体适配器实现DeepSeek、通义千问、GLM都兼容OpenAI格式但请求体有细微差异# gateway/adapters/deepseek.pyclassDeepSeekAdapter(BaseAdapter):defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:return{model:self.model_name,messages:[{role:m.role.value,content:m.content}forminrequest.messages],temperature:request.temperature,max_tokens:request.max_tokens,}defparse_response(self,raw:Dict[str,Any])-ChatResponse:choiceraw[choices][0]usageraw[usage]input_tusage[prompt_tokens]output_tusage[completion_tokens]returnChatResponse(contentchoice[message][content],modeldeepseek-chat,usageTokenUsage(input_t,output_t,usage[total_tokens]),costself.calc_cost(input_t,output_t),latency_ms0.0,)# gateway/adapters/qwen.pyclassQwenAdapter(BaseAdapter):通义千问——兼容OpenAI格式但model字段名称不同defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:return{model:self.model_name,# 
qwen-plusmessages:[{role:m.role.value,content:m.content}forminrequest.messages],temperature:request.temperature,max_tokens:request.max_tokens,}defparse_response(self,raw:Dict[str,Any])-ChatResponse:choiceraw[choices][0]usageraw[usage]input_tusage.get(input_tokens,usage.get(prompt_tokens,0))output_tusage.get(output_tokens,usage.get(completion_tokens,0))returnChatResponse(contentchoice[message][content],modelqwen-plus,usageTokenUsage(input_t,output_t,input_toutput_t),costself.calc_cost(input_t,output_t),latency_ms0.0,)# gateway/adapters/glm.pyclassGLMAdapter(BaseAdapter):智谱GLM——部分字段名不同defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:return{model:self.model_name,# glm-4messages:[{role:m.role.value,content:m.content}forminrequest.messages],temperature:request.temperature,max_tokens:request.max_tokens,}defparse_response(self,raw:Dict[str,Any])-ChatResponse:choiceraw[choices][0]usageraw[usage]input_tusage.get(prompt_tokens,0)output_tusage.get(completion_tokens,0)returnChatResponse(contentchoice[message][content],modelglm-4,usageTokenUsage(input_t,output_t,usage.get(total_tokens,input_toutput_t)),costself.calc_cost(input_t,output_t),latency_ms0.0,)3.5 核心路由器这是整个方案的心脏——根据任务类型选模型失败自动降级# gateway/router.pyimporttimefromcollectionsimportdefaultdictclassModelRouter:多模型路由器——统一入口智能选模型自动降级ADAPTER_MAP{deepseek:DeepSeekAdapter,qwen:QwenAdapter,glm:GLMAdapter,}def__init__(self,config_path:strconfig.yaml):withopen(config_path)asf:self.configyaml.safe_load(f)self.routing_configself.config[routing]self.default_modelself.routing_config[default_model]self.fallback_chainself.routing_config[fallback_chain]# 初始化各模型适配器self.adapters{}forname,clsinself.ADAPTER_MAP.items():ifnameinself.config[models]:model_configself.config[models][name]model_config[env_key]list(self.config[credentials].values())[list(self.config[credentials].keys()).index(name)].strip(${})self.adapters[name]cls(model_config)# 
成本统计self.cost_logdefaultdict(float)self.call_countdefaultdict(int)asyncdefchat(self,request:ChatRequest)-ChatResponse:统一调用入口首选模型 → 失败按降级链路依次尝试preferredrequest.modelifrequest.modelinself.adapterselseself.default_model# 构建候选链首选 降级链中排除首选的candidates[preferred][mforminself.fallback_chainifm!preferredandminself.adapters]last_errorNoneformodel_nameincandidates:try:starttime.time()responseawaitself.adapters[model_name].call(request)response.latency_ms(time.time()-start)*1000# 记录成本self.cost_log[model_name]response.cost self.call_count[model_name]1returnresponseexceptExceptionase:print(f[路由器]{model_name}调用失败:{e}尝试下一个...)last_erroreraiseRuntimeError(f所有模型调用失败最后错误:{last_error})defstats(self)-Dict[str,Any]:查看各模型调用统计和成本total_costsum(self.cost_log.values())return{per_model:{name:{calls:self.call_count[name],cost:round(self.cost_log[name],4),}fornameinself.adapters},total_cost:round(total_cost,4),}3.6 使用示例# demo.pyimportasynciofromgateway.baseimportChatRequest,Message,Rolefromgateway.routerimportModelRouterasyncdefmain():routerModelRouter(config.yaml)# 场景1自动路由走默认模型reqChatRequest(messages[Message(Role.SYSTEM,你是一个Python专家),Message(Role.USER,写一个异步爬虫用aiohttp),])respawaitrouter.chat(req)print(f模型:{resp.model})print(f内容:{resp.content[:100]}...)print(fToken: 输入{resp.usage.input_tokens}/ 输出{resp.usage.output_tokens})print(f费用: ¥{resp.cost})print(f耗时:{resp.latency_ms:.0f}ms\n)# 场景2指定模型req2ChatRequest(messages[Message(Role.USER,解释一下什么是RAG)],modelqwen# 指定走通义千问)resp2awaitrouter.chat(req2)print(f模型:{resp2.model}| 费用: ¥{resp2.cost})# 场景3模拟降级指定一个不存在的模型会走fallbackreq3ChatRequest(messages[Message(Role.USER,你好)],modelnonexistent# 不存在自动走默认模型)resp3awaitrouter.chat(req3)print(f降级后使用:{resp3.model})# 查看统计print(f\n调用统计:{router.stats()})asyncio.run(main())运行结果四、实测成本对比成本下降的三个来源智能路由省约15%简单任务走低价模型复杂任务走深度推理语义缓存省约35%知识库场景大量相似问题缓存命中直接返回成本可视化省约10%有了明细才发现某功能每天无意义重复调用200次五、踩坑记录坑1各家Token计数不一致。同一句话DeepSeek算100 
Token,通义千问可能算120。解决方案是在网关层用tiktoken统一计算,不依赖各家返回的usage。坑2:连接池管理。httpx默认连接池太小,高并发下会排队。建议max_connections=20、max_keepalive_connections=10、read_timeout=60秒。坑3:降级链别超过3个。超过3个说明你的模型选型有问题,而且链路太长会拖慢响应。总超时设30秒,超过直接返回错误,比无限等待好。坑4:缓存不是万能的。时间敏感的问题、需要实时信息的场景、需要随机性的创意写作,这些场景加黑名单,不走缓存。六、总结:核心就三件事——统一接口:一套代码调所有模型,新增模型只需写50行适配器;智能路由:按任务类型自动选模型,失败自动降级;成本可控:每次调用都有明细,月底对账不头疼。完整代码已开源,有问题评论区交流。如果觉得有用,点赞收藏支持一下。
2025多模型API统一接入实战:DeepSeek/通义千问/GLM一个接口搞定
发布时间:2026/7/3 4:06:31
最近在做一个需要调用多个大模型的项目踩了不少坑。把方案整理出来分享给同样在搞多模型接入的同学。本文包含完整的Python代码可以直接跑。涉及DeepSeek、通义千问、智谱GLM三个模型的统一接入和智能路由。一、为什么要做多模型统一接入项目需求是这样的一个知识库问答系统不同问题要路由到不同模型。简单问题走便宜的快模型复杂推理走深度模型代码相关走擅长编程的模型。如果每个模型单独对接你会面对三个痛点每家请求格式不一样DeepSeek用OpenAI兼容格式通义千问有自己的SDKGLM又是一套每家的计费规则和Token计算方式不同月底对账头疼某家模型挂了没有自动降级用户体验直接崩解决方案就是在业务层和模型之间加一层抽象——统一请求格式、统一错误处理、统一成本核算。二、架构设计核心设计决策三、完整代码实现3.1 项目结构3.2 配置文件# config.yamlmodels:deepseek:endpoint:https://api.deepseek.com/v1/chat/completionsmodel_name:deepseek-chatprice_input:0.001# 元/千Tokenprice_output:0.002max_retries:3timeout:30qwen:endpoint:https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completionsmodel_name:qwen-plusprice_input:0.0008price_output:0.002glm:endpoint:https://open.bigmodel.cn/api/paas/v4/chat/completionsmodel_name:glm-4price_input:0.001price_output:0.001routing:default_model:deepseekfallback_chain:[deepseek,qwen,glm]cost_limit_per_call:0.05credentials:deepseek:${DEEPSEEK_API_KEY}qwen:${DASHSCOPE_API_KEY}glm:${GLM_API_KEY}这里说一句接入凭证千万别硬编码到代码里。用环境变量读取生产环境接密钥管理服务。3.3 统一数据结构#gateway/base.pyfrom abc import ABC,abstractmethod from dataclasses import dataclass from typing import Optional,Dict,Any,List fromenumimport Enum import httpx import os import yaml classRole(str,Enum):SYSTEMsystemUSERuserASSISTANTassistantdataclass class Message:role:Role content:strdataclass class ChatRequest:统一请求体——不管底层调哪个模型业务层只用这个messages:List[Message]model:Optional[str]None # 指定模型None则走默认路由 temperature:float0.7max_tokens:int4096dataclass class TokenUsage:input_tokens:intoutput_tokens:inttotal_tokens:intdataclass class ChatResponse:统一响应体content:str model:str usage:TokenUsage cost:float# 本次调用费用单位元 latency_ms:float# 响应耗时 classBaseAdapter(ABC):模型适配器基类——所有模型适配器继承这个def__init__(self,config:Dict[str,Any]):self.endpointconfig[endpoint]self.model_nameconfig[model_name]self.input_priceconfig[price_input]self.output_priceconfig[price_output]self.max_retriesconfig.get(max_retries,3)self.timeoutconfig.get(timeout,30)# 
接入凭证从环境变量读取self.api_keyos.environ.get(config.get(env_key,))abstractmethod defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:将统一请求转为该模型的原生格式passabstractmethod defparse_response(self,raw:Dict[str,Any])-ChatResponse:将原生响应转为统一格式pass defcalc_cost(self,input_tokens:int,output_tokens:int)-float:计算本次调用费用returnround(input_tokens*self.input_price/1000output_tokens*self.output_price/1000,6)async defcall(self,request:ChatRequest)-ChatResponse:发起HTTP请求带重试headers{Authorization:fBearer {self.api_key},Content-Type:application/json}payloadself.build_payload(request)async with httpx.AsyncClient(timeoutself.timeout)as client:forattemptinrange(self.max_retries):try:respawait client.post(self.endpoint,jsonpayload,headersheaders)resp.raise_for_status()returnself.parse_response(resp.json())except Exception as e:ifattemptself.max_retries-1:raisecontinue3.4 具体适配器实现DeepSeek、通义千问、GLM都兼容OpenAI格式但请求体有细微差异# gateway/adapters/deepseek.pyclassDeepSeekAdapter(BaseAdapter):defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:return{model:self.model_name,messages:[{role:m.role.value,content:m.content}forminrequest.messages],temperature:request.temperature,max_tokens:request.max_tokens,}defparse_response(self,raw:Dict[str,Any])-ChatResponse:choiceraw[choices][0]usageraw[usage]input_tusage[prompt_tokens]output_tusage[completion_tokens]returnChatResponse(contentchoice[message][content],modeldeepseek-chat,usageTokenUsage(input_t,output_t,usage[total_tokens]),costself.calc_cost(input_t,output_t),latency_ms0.0,)# gateway/adapters/qwen.pyclassQwenAdapter(BaseAdapter):通义千问——兼容OpenAI格式但model字段名称不同defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:return{model:self.model_name,# 
qwen-plusmessages:[{role:m.role.value,content:m.content}forminrequest.messages],temperature:request.temperature,max_tokens:request.max_tokens,}defparse_response(self,raw:Dict[str,Any])-ChatResponse:choiceraw[choices][0]usageraw[usage]input_tusage.get(input_tokens,usage.get(prompt_tokens,0))output_tusage.get(output_tokens,usage.get(completion_tokens,0))returnChatResponse(contentchoice[message][content],modelqwen-plus,usageTokenUsage(input_t,output_t,input_toutput_t),costself.calc_cost(input_t,output_t),latency_ms0.0,)# gateway/adapters/glm.pyclassGLMAdapter(BaseAdapter):智谱GLM——部分字段名不同defbuild_payload(self,request:ChatRequest)-Dict[str,Any]:return{model:self.model_name,# glm-4messages:[{role:m.role.value,content:m.content}forminrequest.messages],temperature:request.temperature,max_tokens:request.max_tokens,}defparse_response(self,raw:Dict[str,Any])-ChatResponse:choiceraw[choices][0]usageraw[usage]input_tusage.get(prompt_tokens,0)output_tusage.get(completion_tokens,0)returnChatResponse(contentchoice[message][content],modelglm-4,usageTokenUsage(input_t,output_t,usage.get(total_tokens,input_toutput_t)),costself.calc_cost(input_t,output_t),latency_ms0.0,)3.5 核心路由器这是整个方案的心脏——根据任务类型选模型失败自动降级# gateway/router.pyimporttimefromcollectionsimportdefaultdictclassModelRouter:多模型路由器——统一入口智能选模型自动降级ADAPTER_MAP{deepseek:DeepSeekAdapter,qwen:QwenAdapter,glm:GLMAdapter,}def__init__(self,config_path:strconfig.yaml):withopen(config_path)asf:self.configyaml.safe_load(f)self.routing_configself.config[routing]self.default_modelself.routing_config[default_model]self.fallback_chainself.routing_config[fallback_chain]# 初始化各模型适配器self.adapters{}forname,clsinself.ADAPTER_MAP.items():ifnameinself.config[models]:model_configself.config[models][name]model_config[env_key]list(self.config[credentials].values())[list(self.config[credentials].keys()).index(name)].strip(${})self.adapters[name]cls(model_config)# 
成本统计self.cost_logdefaultdict(float)self.call_countdefaultdict(int)asyncdefchat(self,request:ChatRequest)-ChatResponse:统一调用入口首选模型 → 失败按降级链路依次尝试preferredrequest.modelifrequest.modelinself.adapterselseself.default_model# 构建候选链首选 降级链中排除首选的candidates[preferred][mforminself.fallback_chainifm!preferredandminself.adapters]last_errorNoneformodel_nameincandidates:try:starttime.time()responseawaitself.adapters[model_name].call(request)response.latency_ms(time.time()-start)*1000# 记录成本self.cost_log[model_name]response.cost self.call_count[model_name]1returnresponseexceptExceptionase:print(f[路由器]{model_name}调用失败:{e}尝试下一个...)last_erroreraiseRuntimeError(f所有模型调用失败最后错误:{last_error})defstats(self)-Dict[str,Any]:查看各模型调用统计和成本total_costsum(self.cost_log.values())return{per_model:{name:{calls:self.call_count[name],cost:round(self.cost_log[name],4),}fornameinself.adapters},total_cost:round(total_cost,4),}3.6 使用示例# demo.pyimportasynciofromgateway.baseimportChatRequest,Message,Rolefromgateway.routerimportModelRouterasyncdefmain():routerModelRouter(config.yaml)# 场景1自动路由走默认模型reqChatRequest(messages[Message(Role.SYSTEM,你是一个Python专家),Message(Role.USER,写一个异步爬虫用aiohttp),])respawaitrouter.chat(req)print(f模型:{resp.model})print(f内容:{resp.content[:100]}...)print(fToken: 输入{resp.usage.input_tokens}/ 输出{resp.usage.output_tokens})print(f费用: ¥{resp.cost})print(f耗时:{resp.latency_ms:.0f}ms\n)# 场景2指定模型req2ChatRequest(messages[Message(Role.USER,解释一下什么是RAG)],modelqwen# 指定走通义千问)resp2awaitrouter.chat(req2)print(f模型:{resp2.model}| 费用: ¥{resp2.cost})# 场景3模拟降级指定一个不存在的模型会走fallbackreq3ChatRequest(messages[Message(Role.USER,你好)],modelnonexistent# 不存在自动走默认模型)resp3awaitrouter.chat(req3)print(f降级后使用:{resp3.model})# 查看统计print(f\n调用统计:{router.stats()})asyncio.run(main())运行结果四、实测成本对比成本下降的三个来源智能路由省约15%简单任务走低价模型复杂任务走深度推理语义缓存省约35%知识库场景大量相似问题缓存命中直接返回成本可视化省约10%有了明细才发现某功能每天无意义重复调用200次五、踩坑记录坑1各家Token计数不一致。同一句话DeepSeek算100 
Token,通义千问可能算120。解决方案是在网关层用tiktoken统一计算,不依赖各家返回的usage。坑2:连接池管理。httpx默认连接池太小,高并发下会排队。建议max_connections=20、max_keepalive_connections=10、read_timeout=60秒。坑3:降级链别超过3个。超过3个说明你的模型选型有问题,而且链路太长会拖慢响应。总超时设30秒,超过直接返回错误,比无限等待好。坑4:缓存不是万能的。时间敏感的问题、需要实时信息的场景、需要随机性的创意写作,这些场景加黑名单,不走缓存。六、总结:核心就三件事——统一接口:一套代码调所有模型,新增模型只需写50行适配器;智能路由:按任务类型自动选模型,失败自动降级;成本可控:每次调用都有明细,月底对账不头疼。完整代码已开源,有问题评论区交流。如果觉得有用,点赞收藏支持一下。