MongoShake: A MongoDB Migration Tool

Published by lff1530983327 on 2022-11-28

1. Introduction to the MongoShake tool

Like typical synchronization tools, it supports three sync modes: all, full, or incr (full plus incremental, full only, or incremental only).
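A minimal collector.conf sketch of how the mode is chosen (the connection string below is a placeholder, not from the original post):

# all = full + incremental sync, full = full sync only, incr = incremental sync only
sync_mode = all
# source MongoDB connection string(s), in standard mongodb:// URI format
mongo_urls = mongodb://username:password@10.0.0.1:27017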

2. Parameters

# Tunnel mode.
tunnel = direct
# tunnel target resource url
# for rpc. this is remote receiver socket address
# for tcp. this is remote receiver socket address
# for file. this is the file path, for instance "data"
# for kafka. this is the topic and brokers address which are split by a comma, for
# instance: topic@brokers1,brokers2; the default topic is "mongoshake"
# for mock. this is useless
# for direct. this is target mongodb address which format is the same as `mongo_urls`. If
# the target is sharding, this should be the mongos address.
# The direct mode writes directly into MongoDB; the other modes are used for analysis or long-distance transmission scenarios.
# Note: for non-direct modes, the data must be parsed by a receiver; see the FAQ document for details.

# Configure the tunnel address here; the format is the same as mongo_urls.
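# Illustrative example (the address is a placeholder added here, not part of the original post):
# with tunnel = direct, this is the target MongoDB (or mongos) connection string, same format as mongo_urls.
tunnel.address = mongodb://username:password@10.0.0.2:27017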


# the number of collections fetched concurrently; for example, 6 means shake pulls
# at most 6 collections at the same time.
full_sync.reader.collection_parallel = 8
# the number of document writer threads per collection; for example, 8 means
# 8 threads write concurrently to the same collection.
full_sync.reader.write_document_parallel = 8
# number of documents per batch insert on the target; for example, 128 means one writer
# thread aggregates 128 documents and writes them in a single batch.
full_sync.reader.document_batch_size = 128
# number of documents read by a single reader thread, an optimization for a single skewed (very large) collection.
# The default 0 means the collection is fetched by a single thread; a non-zero value must be >= 10000.
# For example, with 50000 documents in the collection and this set to 10000, fetching is split across 5 reader threads
# (1-32 concurrent threads are recommended).
# Note: for a given collection this is only supported when all _id values are of the same type;
# do not enable it when _id has more than one type, e.g., ObjectId and string!
full_sync.reader.read_document_count = 0

# drop the collection of the same name in the destination MongoDB during full synchronization
# If the collection already exists on the target: true drops it before syncing, false leaves it in place.
full_sync.collection_exist_drop = false

# create index option.
# none: do not create indexes.
# foreground: create indexes when data sync finishes in the full sync stage.
# background: create indexes when starting.
# Whether to create indexes after the data sync of the full stage completes: none creates no indexes,
# foreground creates foreground indexes, background creates background indexes.
full_sync.create_index = background

# convert insert to update when duplicate key found
# If the _id already exists in the target database, whether to rewrite the insert as an update.
full_sync.executor.insert_on_dup_update = false
# filter orphan documents when the source is a sharded cluster.
full_sync.executor.filter.orphan_document = false
# enable majority write on the writer side in the full sync stage.
# the performance will degrade if enabled.
full_sync.executor.majority_enable = false

# --------------------------- incremental sync configuration ---------------------------
# fetch method:
# oplog: fetch oplog from source mongodb (default)
# change_stream: use change streams to receive change events from the source MongoDB; requires MongoDB >= 4.0
incr_sync.mongo_fetch_method = change_stream

# For updated documents: set to false to receive only the updated fields, or to true to receive the full document content.
# This only takes effect when incr_sync.mongo_fetch_method = change_stream, and performance drops somewhat when it is enabled.
incr_sync.change_stream.watch_full_document = false

# global id. used in active-active replication.
# this parameter is not supported on current open-source version.
# gid is used in active-active replication to prevent replication loops. It is currently only used by MongoDB on Alibaba Cloud;
# if you want to enable gid for mutual synchronization between Alibaba Cloud instances, contact Alibaba Cloud after-sales support.
# For sharding, separate multiple gids with semicolons (;).
incr_sync.oplog.gids =

# distribute data to different worker by hash key to run in parallel.
# [auto]        decide based on whether the collections have a unique index:
#               use `collection` if there is a unique index, otherwise use `id`.
# [id]          shard by ObjectId. handle oplogs in sequence by unique _id
# [collection]  shard by ns. handle oplogs in sequence by unique ns
# Hashing mode: id hashes by document, collection hashes by collection (ns), auto picks the type automatically.
# If the collections have no unique indexes, choosing id gives much higher sync throughput; otherwise choose collection.
incr_sync.shard_key = collection
#incr_sync.shard_key = auto
# if shard_key is collection, and users want to improve performance when some collections
# do not have unique key.
# For collection hashing, collections that have no unique index can instead be hashed by _id to increase parallelism.
# The user must confirm that no unique index will ever be created on these collections; once a unique index is detected, the process crashes and exits immediately.
# For example: db1.collection1;db2.collection2. Specifying only a db is not supported.
incr_sync.shard_by_object_id_whitelist =

# number of internal oplog transmit (sender) workers; if the machine has enough capacity, this can be raised.
# if the source is a sharded cluster, the worker number must equal the number of shards.
incr_sync.worker = 16
# batched oplogs carry a block-level checksum computed with the crc32 algorithm,
# and a compressor can be used to compress the content of each oplog entry.
# supported compressors are: gzip, zlib, deflate
# Do not enable this option when tunnel type is "direct"
# For non-direct tunnels, compression can be enabled to reduce network bandwidth consumption.
incr_sync.worker.oplog_compressor = none

# set the sync delay just like mongodb secondary slaveDelay parameter. unit second.
# Sets a delay on the target, e.g., lagging 20 minutes behind the source, similar to MongoDB's own slaveDelay replication parameter. Unit: seconds.
# 0 disables the delay.
incr_sync.target_delay = 0
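# Illustrative example (added here, not in the original config): to lag the target 20 minutes
# behind the source, set 20 * 60 = 1200 seconds:
# incr_sync.target_delay = 1200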

# memory queue configuration; please see the FAQ document for more details.
# do not modify these variables if the current performance and resource usage meet your needs.
incr_sync.worker.batch_queue_size = 64
incr_sync.adaptive.batching_max_size = 1024
incr_sync.fetcher.buffer_capacity = 256

# --- direct tunnel only begin ---
# if tunnel type is direct, all the variables below should be set.

# oplog Update is converted to Insert when the update target does not exist (_id or unique index)
# i.e., if the _id does not exist in the target database, whether to rewrite the update as an insert.
incr_sync.executor.upsert = false
# oplog Insert is converted to Update when the insert hits a duplicate key (_id or unique index)
# i.e., if the _id already exists in the target database, whether to rewrite the insert as an update.
incr_sync.executor.insert_on_dup_update = false
# db: write conflicting documents to the mongoshake_conflict database.
# sdk: write conflicting documents to the sdk.
# When a write conflict occurs, this controls where the conflicting documents are recorded.
incr_sync.conflict_write_to = none

# enable majority write on the writer side in incremental sync.
# the performance will degrade if enabled.
incr_sync.executor.majority_enable = false


incr_sync.mongo_fetch_method

The oplog method can fetch noop oplog entries, while the change_stream method cannot (MongoDB itself filters them out). As a result, the sync position keeps advancing under the oplog method but not under change_stream. This behavior is kept as-is for now, pending a finalized optimization.


3. AWS DocumentDB is not supported

Because DocumentDB does not support the following cursor option:

cursor.noCursorTimeout()

Instructs the server to avoid closing a cursor automatically after a period of inactivity
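For reference, the kind of long-lived cursor at issue looks roughly like the mongo shell sketch below (the database and collection names are made up for illustration):

// open a cursor that the server should not close after its usual idle timeout
var cursor = db.getSiblingDB("mydb").mycoll.find().noCursorTimeout();
while (cursor.hasNext()) {
    printjson(cursor.next());
}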

4. Bug: orphan documents during full sync


The cause is that MongoShake currently uses secondaryPreferred during the full stage, so it reads from secondaries first. Read concern is not configured, so reads from the secondary use the available read concern; unlike local, available does not filter out orphan documents.

  • For versions below 3.6, it is recommended to change readPreference to primary.
  • For 3.6+, readConcern needs to default to local. 3.6 added a sharding_filter stage for filtering documents: after reading a document, the mongod checks whether it belongs to the current mongod and filters it out if not, thereby avoiding orphan documents.

MongoShake's recommendations and optimizations here:

  • For versions below 3.6, users adjust the configuration themselves: mongo_connect_mode = primary. This is because if MongoShake forced this change directly, it might not match user expectations and could drive up the load on the primary node.
  • For 3.6+, the default readConcern is changed to local.

Of course, users can also handle this with full_sync.executor.insert_on_dup_update = true, but the approaches above are recommended as more appropriate.
Another option is to clean up orphan documents before the full sync starts, as shown in the sketch below.
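A minimal mongo shell sketch of such a cleanup (my own illustration, not a command given in the post; the namespace is hypothetical, and cleanupOrphaned behaves differently across MongoDB versions, so check the documentation for your version first). It must be run on the primary mongod of each shard, not on mongos; on versions before 4.4 the command cleans one chunk range per call and returns stoppedAtKey, so it is usually looped:

// run on the PRIMARY mongod of each shard, for each sharded namespace
var ns = "db1.collection1";   // hypothetical namespace
var nextKey = {};
var result;
while (nextKey != null) {
    result = db.adminCommand({ cleanupOrphaned: ns, startingFromKey: nextKey });
    if (result.ok != 1) {
        print("cleanupOrphaned failed or timed out");
        break;
    }
    nextKey = result.stoppedAtKey;
}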

Reference:


Personal testing found that on version 4.0 orphan-document filtering is still incomplete: even with readConcern=local/majority and readPreference=primary configured, orphan documents can still appear, in which case manual cleanup is recommended. If you still have questions, please reopen the current issue.


5. Memory calculation


  • FetcherBufferCapacity = 256
  • AdaptiveBatchingMaxSize = 16384. Since v2.0.7, the default value is set to 1024 to lower memory usage. If the tunnel is direct, choosing a small value won't reduce performance much, but for other tunnels such as tcp, rpc, and kafka, setting a larger value improves transmission performance.
  • WorkerBatchQueueSize = 64
  • Worker = 8
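Based on these four parameters, a rough back-of-the-envelope upper bound (my own sketch, not an official MongoShake formula) on the oplog entries buffered in memory is FetcherBufferCapacity + Worker * WorkerBatchQueueSize * AdaptiveBatchingMaxSize. With Worker = 8, WorkerBatchQueueSize = 64 and the old default AdaptiveBatchingMaxSize = 16384, that is about 8 * 64 * 16384 ≈ 8.4 million entries; assuming an average oplog entry of roughly 1 KB (a made-up figure for illustration), memory usage is on the order of 8 GB, while the newer default of 1024 brings the same estimate down to roughly 0.5 GB, consistent with lowering the default to reduce memory usage.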


From the ITPUB blog. Link: http://blog.itpub.net/30018455/viewspace-2925486/. Please credit the source when reposting; otherwise legal liability will be pursued.
