"""Hybrid retrieval: combining keyword search and semantic search.

Preface
-------
A single retrieval method often cannot satisfy complex requirements.
Combining keyword retrieval with semantic retrieval can significantly
improve retrieval quality, covering both exact matching and semantic
understanding.  I have implemented hybrid retrieval in several search
systems; this file shares some practical experience.
"""

from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional

import numpy as np

# NOTE: rank_bm25, jieba and scikit-learn are imported lazily inside the
# classes that need them, so the core retrievers can be imported and used
# with custom backends even when those optional packages are not installed.

Result = Dict[str, Any]


# ---------------------------------------------------------------------------
# Hybrid retrieval architecture: core architecture
# ---------------------------------------------------------------------------

class HybridRetriever:
    """Hybrid retriever that fuses vector hits and keyword hits.

    Both backends are expected to expose ``search(query, top_k=...)`` and
    return a list of dicts carrying at least ``"id"`` and ``"score"``
    (that is the only part of the hit schema this class reads).
    """

    def __init__(self, vector_db, keyword_db=None):
        """Store the two backends and the default fusion weights.

        Args:
            vector_db: Semantic search backend.
            keyword_db: Keyword search backend; a fresh ``BM25Index`` is
                created when this is ``None``.
        """
        self.vector_db = vector_db
        # Compare against None explicitly: a backend that happens to be
        # falsy (e.g. defines __len__ and is still empty) must not be
        # silently replaced by a new index.
        self.keyword_db = BM25Index() if keyword_db is None else keyword_db
        self.vector_weight = 0.6
        self.keyword_weight = 0.4

    def set_weights(self, vector_weight, keyword_weight):
        """Set the weights applied to the vector and keyword scores."""
        self.vector_weight = vector_weight
        self.keyword_weight = keyword_weight

    def search(self, query: str, query_embedding: np.ndarray, top_k: int = 10):
        """Run both retrievers, fuse their hits and return the best ones.

        Args:
            query: Raw query text, passed to the keyword backend.
            query_embedding: Query vector, passed to the vector backend.
            top_k: Maximum number of fused results to return.

        Returns:
            At most ``top_k`` result dicts sorted by ``"hybrid_score"``
            in descending order; an empty list when ``top_k <= 0``.
        """
        if top_k <= 0:
            return []

        # Over-fetch from each source so fusion has candidates to promote.
        vector_results = self.vector_db.search(query_embedding, top_k=top_k * 2)
        keyword_results = self.keyword_db.search(query, top_k=top_k * 2)

        # Fuse the two result lists, then re-rank by the combined score.
        combined = self._merge_results(vector_results, keyword_results)
        return self._rerank(combined, query, top_k)

    def _merge_results(self, vector_results, keyword_results):
        """Merge both hit lists into one entry per document id.

        Each merged entry is a copy of the hit with ``"vector_score"``
        and ``"keyword_score"`` added; the score of a source that did
        not return the document is ``0.0``.
        """
        results_dict: Dict[Any, Result] = {}

        for item in vector_results:
            results_dict[item["id"]] = {
                **item,
                "vector_score": item["score"],
                "keyword_score": 0.0,
            }

        for item in keyword_results:
            doc_id = item["id"]
            if doc_id in results_dict:
                results_dict[doc_id]["keyword_score"] = item["score"]
            else:
                results_dict[doc_id] = {
                    **item,
                    "vector_score": 0.0,
                    "keyword_score": item["score"],
                }

        return list(results_dict.values())

    def _rerank(self, results, query, top_k):
        """Score each entry with the weighted sum and keep the top ``top_k``.

        ``results`` is sorted in place and every entry gains a
        ``"hybrid_score"`` key.  ``query`` is unused here.
        """
        for item in results:
            item["hybrid_score"] = (
                self.vector_weight * item["vector_score"]
                + self.keyword_weight * item["keyword_score"]
            )

        # Sort by the hybrid score, best first.
        results.sort(key=lambda x: x["hybrid_score"], reverse=True)
        return results[:top_k]


# ---------------------------------------------------------------------------
# Keyword retrieval implementation
# ---------------------------------------------------------------------------

class BM25Index:
    """BM25 keyword index built on ``rank_bm25.BM25Okapi``."""

    def __init__(self, tokenizer: Optional[Callable[[str], Iterable[str]]] = None):
        """Create an empty index.

        Args:
            tokenizer: Optional callable turning text into tokens.  When
                omitted, ``jieba.cut`` is used.
        """
        self._tokenizer = tokenizer
        self.bm25 = None
        self.documents: List[Result] = []
        self.tokenized_docs: List[List[str]] = []

    def add_document(self, doc_id: str, content: str, metadata: Optional[Dict] = None):
        """Add a document and invalidate any previously built index."""
        self.documents.append({
            "id": doc_id,
            "content": content,
            "metadata": metadata or {},
        })
        self.tokenized_docs.append(self._tokenize(content))
        # The BM25 model is a snapshot of the corpus; drop it so the next
        # search rebuilds it and the new document becomes retrievable.
        self.bm25 = None

    def _tokenize(self, text: str):
        """Split ``text`` into a list of tokens."""
        if self._tokenizer is not None:
            return list(self._tokenizer(text))
        import jieba  # optional dependency, only needed for the default tokenizer
        return list(jieba.cut(text))

    def build_index(self):
        """Build the BM25 model from the documents added so far."""
        from rank_bm25 import BM25Okapi  # optional dependency
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def search(self, query: str, top_k: int = 10):
        """Return up to ``top_k`` documents ranked by BM25 score.

        Returns:
            A list of dicts with ``"id"``, ``"content"``, ``"metadata"``
            and ``"score"``; empty when the index holds no documents or
            ``top_k <= 0``.
        """
        # Nothing to rank; also avoids building a model over an empty
        # corpus (presumably unsupported by rank_bm25 - TODO confirm).
        if top_k <= 0 or not self.documents:
            return []

        if self.bm25 is None:
            self.build_index()

        scores = self.bm25.get_scores(self._tokenize(query))

        # Indices of the top-k scores, best first.
        top_indices = np.argsort(scores)[::-1][:top_k]

        results = []
        for idx in top_indices:
            doc = self.documents[int(idx)]
            results.append({
                "id": doc["id"],
                "content": doc["content"],
                "metadata": doc["metadata"],
                "score": float(scores[idx]),
            })
        return results


# ---------------------------------------------------------------------------
# Advanced fusion strategies: fusion based on score normalization
# ---------------------------------------------------------------------------

class NormalizedHybridRetriever(HybridRetriever):
    """Hybrid retriever that min-max normalizes each source before fusing."""

    @staticmethod
    def _min_max(scores):
        """Min-max scale ``scores`` to [0, 1]; all-equal scores map to 1.0."""
        if not scores:
            return []
        low, high = min(scores), max(scores)
        if high == low:
            return [1.0] * len(scores)
        span = high - low
        return [(score - low) / span for score in scores]

    def _merge_results(self, vector_results, keyword_results):
        """Normalize each source's scores, then merge per document id.

        Every merged entry carries ``"vector_score"``, ``"keyword_score"``,
        ``"norm_vector_score"`` and ``"norm_keyword_score"``.  Documents
        returned by both sources keep BOTH normalized scores; the input
        hit dicts are not modified.
        """
        merged: Dict[Any, Result] = {}

        vector_norms = self._min_max([item["score"] for item in vector_results])
        for item, norm in zip(vector_results, vector_norms):
            merged[item["id"]] = {
                **item,
                "vector_score": item["score"],
                "keyword_score": 0.0,
                "norm_vector_score": norm,
                "norm_keyword_score": 0.0,
            }

        keyword_norms = self._min_max([item["score"] for item in keyword_results])
        for item, norm in zip(keyword_results, keyword_norms):
            entry = merged.get(item["id"])
            if entry is None:
                merged[item["id"]] = {
                    **item,
                    "vector_score": 0.0,
                    "keyword_score": item["score"],
                    "norm_vector_score": 0.0,
                    "norm_keyword_score": norm,
                }
            else:
                # Carry the normalized keyword score over as well;
                # otherwise a document matched by both retrievers would be
                # ranked as if it had no keyword match at all.
                entry["keyword_score"] = item["score"]
                entry["norm_keyword_score"] = norm

        return list(merged.values())

    def _rerank(self, results, query, top_k):
        """Re-rank using the weighted sum of the normalized scores.

        ``results`` is sorted in place and every entry gains a
        ``"hybrid_score"`` key.  ``query`` is unused here.
        """
        for item in results:
            item["hybrid_score"] = (
                self.vector_weight * item.get("norm_vector_score", 0.0)
                + self.keyword_weight * item.get("norm_keyword_score", 0.0)
            )

        results.sort(key=lambda x: x["hybrid_score"], reverse=True)
        return results[:top_k]


# ---------------------------------------------------------------------------
# Learning-based re-ranking
# ---------------------------------------------------------------------------

class LearnedReranker:
    """Re-ranker that orders results with a trained classifier."""

    def __init__(self):
        """Create an untrained gradient-boosting model and feature scaler."""
        # Optional dependency, only needed when a learned re-ranker is used.
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.preprocessing import StandardScaler

        self.model = GradientBoostingClassifier()
        self.scaler = StandardScaler()
        self.is_trained = False

    def extract_features(self, query, doc, vector_score, keyword_score):
        """Build the feature vector for one (query, document) pair.

        Features, in order: vector score, keyword score, query/content
        length ratio (0 for empty content), number of whitespace-separated
        query terms contained in the content, and the product of the two
        scores.  A document without a ``"content"`` key is treated as
        having empty content.
        """
        # Hits coming only from the vector backend may carry no content.
        content = doc.get("content") or ""
        return [
            vector_score,
            keyword_score,
            len(query) / len(content) if content else 0,
            sum(1 for term in query.split() if term in content),
            vector_score * keyword_score,
        ]

    def train(self, queries, docs, labels):
        """Fit the scaler and the model.

        Args:
            queries: Query strings.
            docs: One dict per query with parallel lists under ``"docs"``,
                ``"vector_scores"`` and ``"keyword_scores"``.
            labels: One list of relevance labels per query, aligned with
                that query's candidate documents.
        """
        X = []
        y = []
        for query, doc_candidates, relevance in zip(queries, docs, labels):
            for doc, vec_score, key_score, rel in zip(
                doc_candidates["docs"],
                doc_candidates["vector_scores"],
                doc_candidates["keyword_scores"],
                relevance,
            ):
                X.append(self.extract_features(query, doc, vec_score, key_score))
                y.append(rel)

        X = self.scaler.fit_transform(X)
        self.model.fit(X, y)
        self.is_trained = True

    def rerank(self, query, results):
        """Sort ``results`` in place by predicted relevance and return them.

        Results are returned unchanged when the model is untrained or the
        list is empty.  Each scored entry gains a ``"learned_score"`` key.
        """
        # An empty feature matrix cannot be scaled, so bail out early.
        if not self.is_trained or not results:
            return results

        X = [
            self.extract_features(
                query, item, item["vector_score"], item["keyword_score"]
            )
            for item in results
        ]
        X = self.scaler.transform(X)
        # Column 1 is taken as the relevance probability; this assumes the
        # model was trained on binary labels - TODO confirm against the
        # training data.
        scores = self.model.predict_proba(X)[:, 1]

        for item, score in zip(results, scores):
            item["learned_score"] = float(score)

        results.sort(key=lambda x: x["learned_score"], reverse=True)
        return results


# ---------------------------------------------------------------------------
# Complete retrieval pipeline
# ---------------------------------------------------------------------------

class CompleteSearchSystem:
    """Complete search system: indexing, hybrid retrieval, optional re-rank."""

    def __init__(self, embedding_model, vector_db):
        """Wire the embedding model, vector store and keyword index together.

        Args:
            embedding_model: Object whose ``encode(text)`` returns an embedding.
            vector_db: Vector store exposing ``upsert`` and ``search``.
        """
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.keyword_index = BM25Index()
        self.hybrid_retriever = NormalizedHybridRetriever(vector_db, self.keyword_index)
        # Assign a trained re-ranker (e.g. LearnedReranker) to enable re-ranking.
        self.reranker = None

    def index_document(self, doc_id: str, content: str, metadata: Optional[Dict] = None):
        """Index one document in both the keyword and the vector index."""
        # Add to the keyword index.
        self.keyword_index.add_document(doc_id, content, metadata)

        # Add to the vector index.
        embedding = self.embedding_model.encode(content)
        self.vector_db.upsert(doc_id, embedding, metadata)

    def index_batch(self, documents):
        """Index each document dict (``"id"``, ``"content"``, optional ``"metadata"``)."""
        for doc in documents:
            self.index_document(doc["id"], doc["content"], doc.get("metadata"))

    def search(self, query: str, top_k: int = 10):
        """Search with hybrid retrieval, then re-rank if a re-ranker is set."""
        # Generate the query embedding.
        query_embedding = self.embedding_model.encode(query)

        # Hybrid retrieval.
        results = self.hybrid_retriever.search(query, query_embedding, top_k=top_k)

        # Optional: learned re-ranking.
        if self.reranker:
            results = self.reranker.rerank(query, results)

        return results


# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Key points of hybrid retrieval:
#   - Multiple retrieval sources: keyword + semantic, a double safeguard.
#   - Score fusion: adjustable weights adapt flexibly to the use case.
#   - Normalization: makes scores from different sources comparable.
#   - Learned re-ranking: further improves quality.
#
# Key practices:
#   - Start with simple weighted fusion.
#   - Tune the weights for your scenario.
#   - Consider using normalized scores.
#   - Add learned re-ranking once you have data.