# Tunnel mode. The modes described in this file are: rpc, tcp, file, kafka, mock, direct.
tunnel = direct
# tunnel target resource url
# for rpc: the remote receiver socket address
# for tcp: the remote receiver socket address
# for file: the file path, for instance "data"
# for kafka: the topic and the broker addresses (brokers separated by commas), for
# instance: topic@brokers1,brokers2; the default topic is "mongoshake"
# for mock: this value is unused
# for direct: the target MongoDB address, in the same format as `mongo_urls`. If
# the target is sharded, this should be the mongos address.
# direct mode writes straight into MongoDB; the other modes are meant for analysis
# or long-distance transfer scenarios.
# Note: in non-direct modes the data must be parsed by a receiver; see the FAQ document.
# Configure the tunnel address here, in the same format as `mongo_urls`.
# NOTE(review): no address key follows this comment in the visible part of the file
# (presumably `tunnel.address`) -- confirm that it is set elsewhere.
# Number of collections pulled concurrently during full sync.
# For example, 6 means that at most 6 collections are being pulled at the same moment.
full_sync.reader.collection_parallel = 8
# Number of document writer threads for each collection.
# For example, 8 means that 8 writer threads write concurrently for the same collection.
full_sync.reader.write_document_parallel = 8
# Number of documents in one batch insert on the destination side.
# For example, 128 means that a thread aggregates 128 documents and then writes them at once.
full_sync.reader.document_batch_size = 128
# Maximum number of documents read by a single reader thread; an optimization for
# a single, skewed (very large) collection.
# The default 0 means the collection is pulled by a single thread; a non-zero value must be >= 10000.
# For example, if a collection holds 50000 documents, a value of 10000 splits the read
# into 5 threads (a concurrency of 1-32 threads is recommended).
# Note: within one collection this is only supported when every _id value has the same
# type. Do NOT enable it when _id has more than one type (e.g. ObjectId and string).
full_sync.reader.read_document_count = 0
# Whether to drop a same-named collection that already exists in the destination
# MongoDB before full synchronization.
# true: drop it first, then sync. false: do not drop.
full_sync.collection_exist_drop = false
# Index creation option for the full sync stage.
# none: do not create indexes.
# foreground: create indexes after the data sync of the full sync stage finishes.
# background: create indexes when starting.
# (The original Chinese note words this as: whether to create indexes once full-sync
# data copy is complete -- none creates nothing, foreground creates foreground indexes,
# background creates background indexes.)
full_sync.create_index = background
# Convert an insert into an update when a duplicate key is found,
# i.e. when the _id already exists in the destination.
full_sync.executor.insert_on_dup_update = false
# Whether to filter out orphan documents when the source is a sharded cluster.
full_sync.executor.filter.orphan_document = false
# Whether the writer uses majority write during the full sync stage.
# Performance degrades when this is enabled.
full_sync.executor.majority_enable = false
# --------------------------- incremental sync configuration ---------------------------
# fetch method:
# oplog: fetch the oplog from the source MongoDB (default)
# change_stream: use a change stream to receive change events from the source MongoDB;
# requires MongoDB >= 4.0
incr_sync.mongo_fetch_method = change_stream
# What to receive after a document is updated:
# false: only the fields that were updated.
# true: the full content of the document.
# Only takes effect when incr_sync.mongo_fetch_method = change_stream, and it
# lowers performance somewhat.
incr_sync.change_stream.watch_full_document = false
# Global id (gid), used in active-active replication to prevent circular replication.
# This parameter is not supported in the current open-source version; it is currently
# only used for MongoDB on Alibaba Cloud. To enable gid for synchronization between
# Alibaba Cloud instances, contact Alibaba Cloud after-sales support.
# For a sharded cluster with several gids, separate them with semicolons (;).
incr_sync.oplog.gids =
# Distribute data to different workers by hash key so they can run in parallel.
# [auto] decide based on whether the collections have a unique index:
# use `collection` if there is a unique index, otherwise use `id`.
# [id] shard by ObjectId; oplogs with the same _id are handled in sequence.
# [collection] shard by namespace (ns); oplogs with the same ns are handled in sequence.
# In short: id hashes by document, collection hashes by collection, auto chooses
# the hash type automatically. Without such an index, `id` is recommended for very
# high sync performance; otherwise choose `collection`.
incr_sync.shard_key = collection
# Alternative value, left disabled:
#incr_sync.shard_key = auto
# When shard_key is `collection`, collections that have no unique key can be listed
# here to be hashed by _id instead, which raises concurrency and improves performance.
# The user must make sure these collections never get a unique index: as soon as one
# is detected, the process crashes and exits immediately.
# Format example: db1.collection1;db2.collection2
# Specifying only a database name is not supported.
incr_sync.shard_by_object_id_whitelist =
# Number of concurrent oplog transmit workers.
# If the source is a sharded cluster, the worker count must equal the number of shards.
# If the machine is powerful enough, the number of workers can be increased.
incr_sync.worker = 16
# Batched oplogs carry a block-level checksum computed with the crc32 algorithm,
# and a compressor can be applied to the content of the oplog entries.
# Supported compressors: gzip, zlib, deflate
# Do not enable this option when the tunnel type is "direct".
# In non-direct modes, compression can be chosen to reduce network bandwidth usage.
incr_sync.worker.oplog_compressor = none
# Sync delay of the destination, similar to the slaveDelay parameter of a MongoDB
# secondary; for example, keep the destination 20 minutes behind the source.
# Unit: seconds. 0 disables the delay.
incr_sync.target_delay = 0
# Memory queue configuration; see the FAQ document for details.
# Do not modify these values if the current performance and resource usage
# already meet your needs.
incr_sync.worker.batch_queue_size = 64
incr_sync.adaptive.batching_max_size = 1024
incr_sync.fetcher.buffer_capacity = 256
# --- direct tunnel only begin ---
# The variables below are only used when the tunnel type is direct; in that case
# all of them should be set.
# Convert an update into an insert when the document to update (matched by _id or
# unique index) does not exist in the destination.
incr_sync.executor.upsert = false
# Convert an insert into an update when the insert hits a duplicate key (_id or
# unique index), i.e. when the _id already exists in the destination.
incr_sync.executor.insert_on_dup_update = false
# Where to record conflicting documents when a write conflict occurs.
# db: write the duplicated logs to mongoshake_conflict
# sdk: write the duplicated logs to the sdk
# NOTE(review): `none` presumably disables conflict recording -- confirm.
incr_sync.conflict_write_to = none
# Whether the writer uses majority write during the incremental sync stage.
# Performance degrades when this is enabled.
incr_sync.executor.majority_enable = false
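# NOTE(review): the text below looks like an issue-tracker reply pasted into this
# file; it is kept as comments so that the file still parses.
# The cause is that MongoShake currently uses secondaryPreferred in the full sync
# stage, so it reads from a secondary first. The read concern is not configured, so
# reads from the secondary use the `available` level; unlike `local`, `available`
# does not filter out orphan documents.
# In hands-on testing on version 4.0, orphan document filtering was incomplete: even
# with readConcern=local/majority and readPreference=primary, orphan documents can
# still show up. In that case, cleaning them up manually is recommended. If you still
# have questions, please reopen the current issue.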