本文件演示如何從Milvus將Collection資料全量匯出,並適配遷移至DashVector。方案的主要流程包括:
- 首先,升級Milvus版本,目前Milvus只有在最新版本(v2.3.x)中支援全量匯出
- 其次,將Milvus Collection的Schema資訊和資料資訊匯出到具體的檔案中
- 最後,以匯出的檔案作為輸入來構建DashVector Collection並匯入資料
下面,將詳細闡述遷移方案的具體操作細節。
Milvus升級2.3.x版本
本文中,我們將藉助Milvus的[query_iterator](https://milvus.io/docs/with_iterators.md)來全量匯出資料([query](https://milvus.io/docs/query.md)介面無法匯出完整資料),由於該介面目前只在v2.3.x版本中支援,所以在匯出資料前,需要先將Milvus版本升級到該版本。Milvus版本升級的詳細操作請參考[Milvus使用者文件](https://milvus.io/docs/upgrade_milvus_standalone-operator.md)。
:::info
注意:在進行Milvus Upgrade時需要注意資料的備份安全問題。
:::
Milvus全量資料匯出
資料的匯出包含Schema以及資料記錄,Schema主要用於完備地定義Collection,資料記錄對應於每個Partition下的全量資料,這兩部分涵蓋了需要匯出的全部資料。下文展示如何將單個Milvus Collection全量匯出。
Schema匯出
DashVector和Milvus在Schema的設計上有一些區別,DashVector向使用者透出的介面非常簡單,Milvus則更加詳盡。從Milvus遷移至DashVector時會涉及到部分Schema引數的刪除(例如Collection的index_param引數),只會保留DashVector構建Collection的必要引數,以下為一個Schema轉換的簡單示例(其中,Collection已有的資料參考[Milvus示例程式碼](https://raw.githubusercontent.com/zilliztech/milvus-backup/main/example/prepare_data.py)寫入)。
from pymilvus import (
connections,
utility,
Collection,
DataType
)
import os
import json
from pathlib import Path
# Banner template: the message is formatted into a 30-column field between
# "===" markers.
fmt = "\n=== {:30} ===\n"

print(fmt.format("start connecting to Milvus"))
# Host and port are read from the environment so the script can target another
# deployment without being edited; the fallbacks are the values that were
# previously hard-coded here.
host = os.environ.get('MILVUS_HOST', "localhost")
port = os.environ.get('MILVUS_PORT', "19530")
print(fmt.format(f"Milvus host: {host}"))
connections.connect("default", host=host, port=port)
# Translation table from the Milvus index metric-type name to the metric name
# written into the exported schema file.
metrics_map = dict(
    COSINE='cosine',
    L2='euclidean',
    IP='dotproduct',
)
# Translation table from Milvus scalar field types to the type names written
# into the exported schema file. Several Milvus types collapse onto the same
# exported name, so they are grouped by target name.
dtype_map = {
    DataType.BOOL: 'bool',
    **dict.fromkeys(
        (DataType.INT8, DataType.INT16, DataType.INT32, DataType.INT64),
        'int',
    ),
    **dict.fromkeys((DataType.FLOAT, DataType.DOUBLE), 'float'),
    **dict.fromkeys((DataType.STRING, DataType.VARCHAR), 'str'),
}
def load_collection(collection_name: str) -> "Collection | None":
    """Look up a Milvus collection by name and call ``load()`` on it.

    Args:
        collection_name: Name of the collection to look up.

    Returns:
        The ``Collection`` object after ``load()`` has been called, or
        ``None`` when ``utility.has_collection`` reports that no collection
        with this name exists.
    """
    has = utility.has_collection(collection_name)
    # Report the collection that was actually requested, not a fixed example
    # name.
    print(f"Does collection {collection_name} exist in Milvus: {has}")
    if not has:
        return None
    collection = Collection(collection_name)
    collection.load()
    return collection
def export_collection_schema(collection, file: str):
    """Write a reduced schema description of ``collection`` to a JSON file.

    The output dictionary has four keys: ``metrics`` (translated through
    ``metrics_map``), ``dtype`` and ``dimension`` (taken from the indexed
    vector field), and ``fields_schema`` (non-primary, non-vector field names
    mapped through ``dtype_map``). The primary-key field is skipped.

    Args:
        collection: Milvus collection object; its ``schema`` and first entry
            of ``indexes`` are read.
        file: Path of the JSON file to write.

    Raises:
        Exception: If the index metric type, the vector field type, or a
            scalar field type has no supported translation.
    """
    schema = collection.schema.to_dict()
    # Only the first index is consulted; the field it covers is treated as
    # the vector field of the collection.
    index = collection.indexes[0].to_dict()
    export_schema = dict()

    milvus_metric_type = index['index_param']['metric_type']
    try:
        export_schema['metrics'] = metrics_map[milvus_metric_type]
    except KeyError as err:
        raise Exception(f"milvus metrics_type{milvus_metric_type} not supported") from err

    export_schema['fields_schema'] = {}
    for field in schema['fields']:
        # The primary key is exported with the data, not as a schema field.
        if 'is_primary' in field and field['is_primary']:
            continue

        if field['name'] == index['field']:
            # vector
            if field['type'] == DataType.FLOAT_VECTOR:
                export_schema['dtype'] = 'float'
                export_schema['dimension'] = field['params']['dim']
            else:
                raise Exception(f"milvus dtype{field['type']} not supported yet")
        else:
            # non-vector
            try:
                exported_type = dtype_map[field['type']]
            except KeyError as err:
                raise Exception(f"milvus dtype{field['type']} not supported yet") from err
            export_schema['fields_schema'][field['name']] = exported_type

    # A separate name for the handle keeps the ``file`` path parameter intact.
    with open(file, 'w') as schema_fp:
        json.dump(export_schema, schema_fp, indent=4)
if __name__ == "__main__":
    collection_name = "YOUR_MILVUS_COLLECTION_NAME"
    collection = load_collection(collection_name)
    # load_collection returns None for a missing collection; stop with a clear
    # message instead of failing later on an attribute lookup.
    if collection is None:
        raise SystemExit(f"collection {collection_name} does not exist in Milvus")

    # All export artifacts go into "<collection_name>.dump/".
    dump_path_str = collection_name+'.dump'
    dump_path = Path(dump_path_str)
    dump_path.mkdir(parents=True, exist_ok=True)

    schema_file = dump_path_str + "/schema.json"
    export_collection_schema(collection, schema_file)
以下是一個可用於建立DashVector Collection的schema檔案示例。
{
"metrics": "euclidean",
"fields_schema": {
"random": "float",
"var": "str"
},
"dtype": "float",
"dimension": 8
}
Data匯出
DashVector和Milvus在設計上都有Partition的概念,所以向量以及其他資料進行匯出時,需要注意按照Partition粒度進行匯出。此外,DashVector的主鍵型別為str,而Milvus設計其為自定義型別,所以在匯出時需要考慮主鍵型別的轉換。以下為一個基於[query_iterator](https://milvus.io/docs/with_iterators.md)介面匯出的簡單程式碼示例:
from pymilvus import (
connections,
utility,
Collection,
DataType
)
import os
import json
import numpy as np
from pathlib import Path
# Banner template: the message is formatted into a 30-column field between
# "===" markers.
fmt = "\n=== {:30} ===\n"

print(fmt.format("start connecting to Milvus"))
# Host and port are read from the environment so the script can target another
# deployment without being edited; the fallbacks are the values that were
# previously hard-coded here.
host = os.environ.get('MILVUS_HOST', "localhost")
port = os.environ.get('MILVUS_PORT', "19530")
print(fmt.format(f"Milvus host: {host}"))
connections.connect("default", host=host, port=port)

# Module-level defaults for the primary-key and vector field names. The
# __main__ section rebinds both from the collection before exporting.
pk = "pk"
vector_field_name = "vector"
def load_collection(collection_name: str) -> "Collection | None":
    """Look up a Milvus collection by name and call ``load()`` on it.

    Args:
        collection_name: Name of the collection to look up.

    Returns:
        The ``Collection`` object after ``load()`` has been called, or
        ``None`` when ``utility.has_collection`` reports that no collection
        with this name exists.
    """
    has = utility.has_collection(collection_name)
    # Report the collection that was actually requested, not a fixed example
    # name.
    print(f"Does collection {collection_name} exist in Milvus: {has}")
    if not has:
        return None
    collection = Collection(collection_name)
    collection.load()
    return collection
def export_partition_data(collection, partition_name, file: str, batch_size: int = 10):
    """Dump every record of one partition to a JSON-lines file.

    Each output line is a JSON object with three keys: ``pk`` (the primary
    key converted to ``str``), ``vector`` (the vector values converted to
    ``float``), and ``fields`` (all remaining returned fields, unchanged).
    Which returned keys count as primary key and vector is decided by the
    module-level names ``pk`` and ``vector_field_name``.

    Args:
        collection: Milvus collection object to read from.
        partition_name: Name of the partition to export.
        file: Path of the output file; it is overwritten.
        batch_size: Number of records requested per iterator batch.
    """
    # Ask for every field declared in the schema rather than a fixed list of
    # example field names, so the export works for any collection layout.
    output_fields = [field.name for field in collection.schema.fields]
    query_iter = collection.query_iterator(
        batch_size=batch_size,
        output_fields=output_fields,
        partition_names=[partition_name],
    )
    try:
        with open(file, 'w') as export_file:
            while True:
                docs = query_iter.next()
                # An empty batch signals that the iterator is exhausted.
                if len(docs) == 0:
                    break
                for doc in docs:
                    new_doc = {}
                    new_doc_fields = {}
                    for k, v in doc.items():
                        if k == pk:
                            # primary key
                            new_doc['pk'] = str(v)
                        elif k == vector_field_name:
                            new_doc['vector'] = [float(component) for component in v]
                        else:
                            new_doc_fields[k] = v
                    new_doc['fields'] = new_doc_fields
                    json.dump(new_doc, export_file)
                    export_file.write('\n')
    finally:
        # Close the iterator even when reading or writing fails part-way.
        query_iter.close()
if __name__ == "__main__":
    collection_name = "YOUR_MILVUS_COLLECTION_NAME"
    collection = load_collection(collection_name)
    # load_collection returns None for a missing collection; stop with a clear
    # message instead of failing later on an attribute lookup.
    if collection is None:
        raise SystemExit(f"collection {collection_name} does not exist in Milvus")

    # Rebind the module-level names that export_partition_data reads.
    pk = collection.schema.primary_field.name
    vector_field_name = collection.indexes[0].field_name

    dump_path_str = collection_name+'.dump'
    dump_path = Path(dump_path_str)
    dump_path.mkdir(parents=True, exist_ok=True)

    for partition in collection.partitions:
        partition_name = partition.name
        # The '_default' partition is written as 'default.txt'; every other
        # partition keeps its own name as the file stem.
        if partition_name == '_default':
            export_path = dump_path_str + '/default.txt'
        else:
            export_path = dump_path_str + '/' + partition_name + ".txt"
        export_partition_data(collection, partition_name, export_path)
上述示例程式碼會將Milvus Collection的各個Partition分別進行資料匯出,匯出後的檔案結構如下所示:
# collection_name = hello_milvus
hello_milvus.dump/
├── default.txt
└── schema.json
將資料匯入DashVector
建立Cluster
參考DashVector官方[使用者手冊](https://help.aliyun.com/document_detail/2631966.html?spm=a2c4g.2631965.0.0.33485425aqhYvz)構建Cluster。
建立Collection
根據2.1章節中匯出的Schema資訊以及參考DashVector官方[使用者手冊](https://help.aliyun.com/document_detail/2568085.html?spm=a2c4g.2631966.0.0.153c1afcNYc6rW)來建立Collection。下面的示例程式碼會根據2.1章節中匯出的schema.json來建立一個DashVector的Collection。
from dashvector import Client, DashVectorException
from pydantic import BaseModel
from typing import Dict, Type
import json
# Maps the type names stored in the exported schema file back to the Python
# builtin types of the same name.
dtype_convert = {
    builtin_type.__name__: builtin_type
    for builtin_type in (int, float, bool, str)
}
class Schema(BaseModel):
    """Typed view of the exported ``schema.json`` file."""

    # Metric name as stored in the schema file.
    metrics: str
    # Python type of the vector elements.
    dtype: Type
    # Vector dimension.
    dimension: int
    # Field name -> Python type of that field.
    fields_schema: Dict[str, Type]

    @classmethod
    def from_dict(cls, json_data):
        """Build a ``Schema`` from the parsed JSON dictionary.

        Type names in ``dtype`` and ``fields_schema`` are translated to
        Python types through ``dtype_convert``; an unknown type name or a
        missing key raises ``KeyError``.
        """
        metric_name = json_data['metrics']
        vector_type = dtype_convert[json_data['dtype']]
        vector_dim = json_data['dimension']

        field_types = {}
        for field_name, type_name in json_data['fields_schema'].items():
            field_types[field_name] = dtype_convert[type_name]

        return cls(
            metrics=metric_name,
            dtype=vector_type,
            dimension=vector_dim,
            fields_schema=field_types,
        )
def read_schema(schema_path) -> Schema:
    """Parse the JSON file at ``schema_path`` and return it as a ``Schema``."""
    with open(schema_path) as schema_fp:
        parsed = json.load(schema_fp)
        return Schema.from_dict(parsed)
if __name__ == "__main__":
    # Dump directory is "<collection name>.dump"; replace the placeholder with
    # the real collection name. A plain string is used because no variable
    # named YOUR_MILVUS_COLLECTION_NAME exists to interpolate.
    milvus_dump_path = "YOUR_MILVUS_COLLECTION_NAME.dump"
    milvus_dump_scheme_path = milvus_dump_path + "/schema.json"
    schema = read_schema(milvus_dump_scheme_path)

    # Only the name `Client` is imported from dashvector, so it is called
    # directly rather than through the module.
    client = Client(
        api_key='YOUR_API_KEY',
        endpoint='YOUR_CLUSTER_ENDPOINT'
    )

    # create collection
    rsp = client.create(name="YOUR_DASHVECTOR_COLLECTION_NAME",
                        dimension=schema.dimension,
                        metric=schema.metrics,
                        dtype=schema.dtype,
                        fields_schema=schema.fields_schema)
    # A falsy response is treated as a failed request.
    if not rsp:
        raise DashVectorException(rsp.code, reason=rsp.message)
匯入Data
根據2.2章節中匯出的資料以及參考DashVector官方[使用者手冊](https://help.aliyun.com/document_detail/2510249.html?spm=a2c4g.2510248.0.0.49ef7738NuI0kM#aa59e950508ld)來批次插入Doc。下面的示例程式碼會依次解析各個Partition匯出的資料,然後依次建立DashVector下的Partition並匯入資料。
from dashvector import Client, DashVectorException, Doc
from pydantic import BaseModel
from typing import Dict, Type
import json
import glob
from pathlib import Path
def insert_data(collection, partition_name, partition_file):
    """Insert every record of one exported partition file into DashVector.

    For any partition other than ``'default'`` the partition is created
    first and each document is inserted into it. Records named ``'default'``
    are inserted without naming a partition.

    Args:
        collection: DashVector collection object to insert into.
        partition_name: Target partition name (the stem of the export file).
        partition_file: Path of the JSON-lines file with one record per line,
            each carrying ``pk``, ``vector`` and ``fields`` keys.

    Raises:
        DashVectorException: If creating the partition or any insert returns
            a falsy response.
    """
    insert_kwargs = {}
    if partition_name != 'default':
        rsp = collection.create_partition(partition_name)
        if not rsp:
            raise DashVectorException(rsp.code, reason=rsp.message)
        # Route the documents into the partition just created; without this
        # the insert call does not name a partition at all.
        insert_kwargs['partition'] = partition_name

    with open(partition_file) as f:
        for line in f:
            # Skip blank lines in the export file.
            if not line.strip():
                continue
            json_data = json.loads(line)
            rsp = collection.insert(
                [
                    Doc(id=json_data['pk'], vector=json_data['vector'], fields=json_data['fields'])
                ],
                **insert_kwargs
            )
            if not rsp:
                raise DashVectorException(rsp.code, reason=rsp.message)
if __name__ == "__main__":
    # Dump directory is "<collection name>.dump"; replace the placeholder with
    # the real collection name. A plain string is used because no variable
    # named YOUR_MILVUS_COLLECTION_NAME exists to interpolate.
    milvus_dump_path = "YOUR_MILVUS_COLLECTION_NAME.dump"

    # Only the name `Client` is imported from dashvector, so it is called
    # directly rather than through the module.
    client = Client(
        api_key='YOUR_API_KEY',
        endpoint='YOUR_CLUSTER_ENDPOINT'
    )

    # fetch the existing collection
    collection = client.get("YOUR_DASHVECTOR_COLLECTION_NAME")

    partition_files = glob.glob(milvus_dump_path+'/*.txt', recursive=False)
    for partition_file in partition_files:
        # The file stem is the partition name; insert_data creates the
        # partition when it is not 'default'.
        partition_name = Path(partition_file).stem
        insert_data(collection, partition_name, partition_file)