本文不生产技术，只做技术的搬运工！

## 前言

最近朋友有个近似文本查找的需求，询问本人是否能帮忙实现，要求匹配速度快，准确率有基本保证即可。想到了之前写的以图搜图demo，决定以此作为技术路线，简单实现一版。

## 环境配置

```bash
pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117
pip install transformers==4.30.2
pip install faiss-cpu
```

## 模型转换

这里作者使用onnx进行部署，因此需要将cn-clip转成onnx模型，转换方法参考以下链接：

[https://github.com/OFA-Sys/Chinese-CLIP/blob/master/cn_clip/deploy/pytorch_to_onnx.py](https://github.com/OFA-Sys/Chinese-CLIP/blob/master/cn_clip/deploy/pytorch_to_onnx.py)

## 实现

### 数据入库

这里作者的标准文件是一个std.json文件，各自的读取方式可能不同，需要修改。

```python
from transformers import AutoTokenizer
import torch
import onnxruntime
import numpy as np
import json
import faiss


def load_standard_points(json_file_path):
    """
    读取 standard_points JSON 文件

    Args:
        json_file_path: JSON 文件路径

    Returns:
        dict: 键为 name，值为 displayName 的字典
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    names = []
    display_names = []
    display_type_names = []
    for point in data.get('standard_points', []):
        name = point.get('name')
        display_name = point.get('displayName')
        display_type_name = point.get('deviceTypeId')
        names.append(name)
        display_names.append(display_name)
        display_type_names.append(display_type_name)
    return names, display_names, display_type_names


def text_encode(model_name, text, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # 设置固定长度为52,padding到最大长度,截断超过部分
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        max_length=52,
        truncation=True
    ).to(device)
    return inputs


def create_faiss_index(embeddings, std_names, std_displaynames, std_displaytypenames, output_path):
    dimension = len(embeddings[0])
    index = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIDMap(index)
    vectors = np.array(embeddings).astype(np.float32)
    # Add vectors to the index with IDs
    index.add_with_ids(vectors, np.array(range(len(embeddings))))
    # Save the index
    faiss.write_index(index, output_path)
    print(f"Index created and saved to {output_path}")
    # Save image paths
    with open(output_path + '.names', 'w') as f:
        for std_name in std_names:
            f.write(std_name + '\n')
    with open(output_path + '.displaynames', 'w') as f:
        for std_displayname in std_displaynames:
            f.write(std_displayname + '\n')
    with open(output_path + '.displaytypenames', 'w') as f:
        for std_displaytypename in std_displaytypenames:
            f.write(std_displaytypename + '\n')
    return index


if __name__ == "__main__":
    model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
    model_path = "/home/workspace/rag/output.txt.fp32.onnx"
    json_path = r"/home/workspace/rag/standard_points_202604291727.json"
    output_path = r"/home/workspace/rag/database_stdpts/stdpts_name.index"

    std_names, std_displaynames, std_displaytypenames = load_standard_points(json_path)
    if (len(std_names) != len(std_displaynames) or len(std_displaynames) != len(std_displaytypenames)):
        print("std_names and std_displaynames length not equal")
        exit()

    embeddings = []
    i = 1
    length = len(std_names)
    session = onnxruntime.InferenceSession(model_path)
    text_model_inputs = session.get_inputs()[0].name
    text_model_outputs = session.get_outputs()[0].name
    for name, display_name, display_type_name in zip(std_names, std_displaynames, std_displaytypenames):
        print(i, "/", length)
        inputs = text_encode(model_name, display_name, torch.device("cpu"))
        text = np.array(inputs["input_ids"])
        tensor = session.run([text_model_outputs], {text_model_inputs: text})[0]
        tensor = np.squeeze(tensor)
        tensor = tensor / np.linalg.norm(tensor)
        embeddings.append(tensor)
        i = i + 1
    index = create_faiss_index(embeddings, std_names, std_displaynames, std_displaytypenames, output_path)
```

### 数据查询

```python
from transformers import AutoTokenizer
import torch
import onnxruntime
import numpy as np
import faiss


def text_encode(model_name, text, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # 设置固定长度为52,padding到最大长度,截断超过部分
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        max_length=52,
        truncation=True
    ).to(device)
    return inputs


def load_faiss_index(index_path):
    index = faiss.read_index(index_path)
    with open(index_path + '.names', 'r') as f:
        names = [line.strip() for line in f]
    with open(index_path + '.displaynames', 'r') as f:
        display_names = [line.strip() for line in f]
    with open(index_path + '.displaytypenames', 'r') as f:
        display_typenames = [line.strip() for line in f]
    return index, names, display_names, display_typenames


if __name__ == "__main__":
    model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
    model_path = "/home/workspace/rag/output.txt.fp32.onnx"
    index_path = r"/home/workspace/rag/database_stdpts/stdpts_name.index"
    need_search_text = "运行状态"
    topk = 15

    session = onnxruntime.InferenceSession(model_path)
    text_model_inputs = session.get_inputs()[0].name
    text_model_outputs = session.get_outputs()[0].name
    inputs = text_encode(model_name, need_search_text, torch.device("cpu"))
    text = np.array(inputs["input_ids"])
    tensor = session.run([text_model_outputs], {text_model_inputs: text})[0]
    tensor = tensor / np.linalg.norm(tensor)

    index, names, display_names, display_typenames = load_faiss_index(index_path)
    distances, indices = index.search(tensor, topk)
    for i in range(topk):
        print(names[indices[0][i]], display_names[indices[0][i]], display_typenames[indices[0][i]], distances[0][i])
```
Chinese-Clip实现文本匹配
发布时间:2026/6/3 17:46:00
本文不生产技术，只做技术的搬运工！

## 前言

最近朋友有个近似文本查找的需求，询问本人是否能帮忙实现，要求匹配速度快，准确率有基本保证即可。想到了之前写的以图搜图demo，决定以此作为技术路线，简单实现一版。

## 环境配置

```bash
pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117
pip install transformers==4.30.2
pip install faiss-cpu
```

## 模型转换

这里作者使用onnx进行部署，因此需要将cn-clip转成onnx模型，转换方法参考以下链接：

[https://github.com/OFA-Sys/Chinese-CLIP/blob/master/cn_clip/deploy/pytorch_to_onnx.py](https://github.com/OFA-Sys/Chinese-CLIP/blob/master/cn_clip/deploy/pytorch_to_onnx.py)

## 实现

### 数据入库

这里作者的标准文件是一个std.json文件，各自的读取方式可能不同，需要修改。

```python
from transformers import AutoTokenizer
import torch
import onnxruntime
import numpy as np
import json
import faiss


def load_standard_points(json_file_path):
    """
    读取 standard_points JSON 文件

    Args:
        json_file_path: JSON 文件路径

    Returns:
        dict: 键为 name，值为 displayName 的字典
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    names = []
    display_names = []
    display_type_names = []
    for point in data.get('standard_points', []):
        name = point.get('name')
        display_name = point.get('displayName')
        display_type_name = point.get('deviceTypeId')
        names.append(name)
        display_names.append(display_name)
        display_type_names.append(display_type_name)
    return names, display_names, display_type_names


def text_encode(model_name, text, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # 设置固定长度为52,padding到最大长度,截断超过部分
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        max_length=52,
        truncation=True
    ).to(device)
    return inputs


def create_faiss_index(embeddings, std_names, std_displaynames, std_displaytypenames, output_path):
    dimension = len(embeddings[0])
    index = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIDMap(index)
    vectors = np.array(embeddings).astype(np.float32)
    # Add vectors to the index with IDs
    index.add_with_ids(vectors, np.array(range(len(embeddings))))
    # Save the index
    faiss.write_index(index, output_path)
    print(f"Index created and saved to {output_path}")
    # Save image paths
    with open(output_path + '.names', 'w') as f:
        for std_name in std_names:
            f.write(std_name + '\n')
    with open(output_path + '.displaynames', 'w') as f:
        for std_displayname in std_displaynames:
            f.write(std_displayname + '\n')
    with open(output_path + '.displaytypenames', 'w') as f:
        for std_displaytypename in std_displaytypenames:
            f.write(std_displaytypename + '\n')
    return index


if __name__ == "__main__":
    model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
    model_path = "/home/workspace/rag/output.txt.fp32.onnx"
    json_path = r"/home/workspace/rag/standard_points_202604291727.json"
    output_path = r"/home/workspace/rag/database_stdpts/stdpts_name.index"

    std_names, std_displaynames, std_displaytypenames = load_standard_points(json_path)
    if (len(std_names) != len(std_displaynames) or len(std_displaynames) != len(std_displaytypenames)):
        print("std_names and std_displaynames length not equal")
        exit()

    embeddings = []
    i = 1
    length = len(std_names)
    session = onnxruntime.InferenceSession(model_path)
    text_model_inputs = session.get_inputs()[0].name
    text_model_outputs = session.get_outputs()[0].name
    for name, display_name, display_type_name in zip(std_names, std_displaynames, std_displaytypenames):
        print(i, "/", length)
        inputs = text_encode(model_name, display_name, torch.device("cpu"))
        text = np.array(inputs["input_ids"])
        tensor = session.run([text_model_outputs], {text_model_inputs: text})[0]
        tensor = np.squeeze(tensor)
        tensor = tensor / np.linalg.norm(tensor)
        embeddings.append(tensor)
        i = i + 1
    index = create_faiss_index(embeddings, std_names, std_displaynames, std_displaytypenames, output_path)
```

### 数据查询

```python
from transformers import AutoTokenizer
import torch
import onnxruntime
import numpy as np
import faiss


def text_encode(model_name, text, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # 设置固定长度为52,padding到最大长度,截断超过部分
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        max_length=52,
        truncation=True
    ).to(device)
    return inputs


def load_faiss_index(index_path):
    index = faiss.read_index(index_path)
    with open(index_path + '.names', 'r') as f:
        names = [line.strip() for line in f]
    with open(index_path + '.displaynames', 'r') as f:
        display_names = [line.strip() for line in f]
    with open(index_path + '.displaytypenames', 'r') as f:
        display_typenames = [line.strip() for line in f]
    return index, names, display_names, display_typenames


if __name__ == "__main__":
    model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
    model_path = "/home/workspace/rag/output.txt.fp32.onnx"
    index_path = r"/home/workspace/rag/database_stdpts/stdpts_name.index"
    need_search_text = "运行状态"
    topk = 15

    session = onnxruntime.InferenceSession(model_path)
    text_model_inputs = session.get_inputs()[0].name
    text_model_outputs = session.get_outputs()[0].name
    inputs = text_encode(model_name, need_search_text, torch.device("cpu"))
    text = np.array(inputs["input_ids"])
    tensor = session.run([text_model_outputs], {text_model_inputs: text})[0]
    tensor = tensor / np.linalg.norm(tensor)

    index, names, display_names, display_typenames = load_faiss_index(index_path)
    distances, indices = index.search(tensor, topk)
    for i in range(topk):
        print(names[indices[0][i]], display_names[indices[0][i]], display_typenames[indices[0][i]], distances[0][i])
```