線上的一個叢集，從庫的 IO 執行緒自動停止了，看了下錯誤日誌：

2022-07-29T19:10:25.749310Z 8032277 [Note] Slave I/O thread killed while reading event for channel ''
2022-07-29T19:10:25.749343Z 8032277 [Note] Slave I/O thread exiting for channel '', read up to log 'mysql-bin.006098', position 640239162
2022-07-29T19:10:25.797015Z 8136896 [Note] Slave I/O thread: Start semi-sync replication to master 'xx@xxx:xxx' in log 'mysql-bin.006098' at position 640239162
2022-07-29T19:10:25.797086Z 8136896 [Warning] Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.

可以看到在讀取 event 的時候，IO 執行緒被殺了。出問題的時間點，MySQL 的吞吐與負載和正常情況相比沒有太大變化，主庫上則有 failed on flush_net() 的報錯。下面看了下從庫報錯的相關原始碼，有疑問的地方是：日誌中 IO 執行緒退出了，但緊接著又啟動了半同步複製；然而從程式碼來看，讀 event 時報錯會直接跳到 err，並沒有自動重啟的部分，有知道的可以指點下。下面貼下程式碼：

在 handle_slave_io 函式中，我們可以看到：

if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
reading event"))
goto err;
DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_EVENT",
if (!retry_count_event)
{
retry_count_event++;
sql_print_information("Forcing to reconnect slave I/O thread%s",
mi->get_for_channel_str());
if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
reconnect_messages[SLAVE_RECON_ACT_EVENT]))
goto err;
goto connected;
});

被殺時會直接 goto err，而 err 部分就是一些清理的動作：

err:
  // print the current replication position
sql_print_information("Slave I/O thread exiting%s, read up to log '%s', position %s",
mi->get_for_channel_str(), mi->get_io_rpl_log_name(),
llstr(mi->get_master_log_pos(), llbuff));
  /* At this point the I/O thread will not try to reconnect anymore. */
mi->is_stopping.atomic_set(1);
(void) RUN_HOOK(binlog_relay_io, thread_stop, (thd, mi));
  /*
    Pause the IO thread and wait for 'continue_to_stop_io_thread'
    signal to continue to shutdown the IO thread.
  */
DBUG_EXECUTE_IF("pause_after_io_thread_stop_hook",
{
const char act[]= "now SIGNAL reached_stopping_io_thread "
"WAIT_FOR continue_to_stop_io_thread";
DBUG_ASSERT(!debug_sync_set_action(thd,
STRING_WITH_LEN(act)));
};);
thd->reset_query();
thd->reset_db(NULL_CSTR);
if (mysql)
{
    /*
      Here we need to clear the active VIO before closing the
      connection with the master.  The reason is that THD::awake()
      might be called from terminate_slave_thread() because somebody
      issued a STOP SLAVE.  If that happends, the shutdown_active_vio()
      can be called in the middle of closing the VIO associated with
      the 'mysql' object, causing a crash.
    */
thd->clear_active_vio();
mysql_close(mysql);
mi->mysql=0;
}
mysql_mutex_lock(&mi->data_lock);
write_ignored_events_info_to_relay_log(thd, mi, false/* force_flush_mi_info */);
mysql_mutex_unlock(&mi->data_lock);
THD_STAGE_INFO(thd, stage_waiting_for_slave_mutex_on_exit);
mysql_mutex_lock(&mi->run_lock);
  /*
    Clean information used to start slave in order to avoid
    security issues.
  */
mi->reset_start_info();
  /* Forget the relay log's format */
mysql_mutex_lock(&mi->data_lock);
mi->set_mi_description_event(NULL);
mysql_mutex_unlock(&mi->data_lock);
  // destructor will not free it, because net.vio is 0
thd->get_protocol_classic()->end_net();
thd->release_resources();
THD_CHECK_SENTRY(thd);
if (thd_added)
thd_manager->remove_thd(thd);
mi->abort_slave= 0;
mi->slave_running= 0;
mi->is_stopping.atomic_set(0);
mysql_mutex_lock(&mi->info_thd_lock);
mi->info_thd= NULL;
mysql_mutex_unlock(&mi->info_thd_lock);

有興趣學習原始碼的加群一起學習啊 QQ： 700072075

MySQL 網路導致的複製報錯案例

相關文章