Syncing MySQL data to Redis via the MySQL binlog
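Streaming row events with pymysqlreplication only works if the MySQL server has the binary log enabled and writes it in ROW format, and the account used for replication can read the binlog. Below is a minimal pre-flight sketch (not part of the original script) that checks those server variables before starting the sync; the connection values mirror the placeholder mysql_settings used further down and should be adjusted for your environment.

# Pre-flight check: confirm the server produces row-based binlogs.
# Connection parameters are placeholders, matching mysql_settings below.
import pymysql

conn = pymysql.connect(host='xxx', port=3306, user='root', password='xxx')
try:
    with conn.cursor() as cursor:
        cursor.execute("SHOW VARIABLES WHERE Variable_name IN ('log_bin', 'binlog_format')")
        settings = dict(cursor.fetchall())
    # pymysqlreplication decodes row events, so binlog_format must be ROW.
    assert settings.get('log_bin') == 'ON', 'binary logging is disabled'
    assert settings.get('binlog_format') == 'ROW', 'binlog_format must be ROW'
    print('binlog prerequisites OK:', settings)
finally:
    conn.close()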
import json
from urllib.parse import quote

import pymysql
import redis
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent

# Processing flow:
# 1. Update the mysql_to_redis_sign column of the MySQL table to the current timestamp.
#    This touch forces fresh row-update binlog events to be written; only after they are
#    generated does the data get streamed into Redis.
# 2. Flush all existing data from the target Redis database.
# 3. Replay the binlog events and write the rows into Redis.

mysql_settings = {'host': 'xxx', 'port': 3306, 'user': 'root', 'password': 'xxx'}
binlog_position_file = 'binlog_position.json'
schema = 'analyze'
table = 'web_link_data'
redis_settings = {'host': 'xxx', 'port': 6379, 'db': 9, 'password': 'xxx'}

# Column order of analyze.web_link_data as it appears in the binlog
# (the reader reports columns as UNKNOWN_COLn when it cannot resolve column names).
COLUMNS = {
    'id': 'UNKNOWN_COL0',
    'host': 'UNKNOWN_COL1',
    'url': 'UNKNOWN_COL2',
    'class': 'UNKNOWN_COL3',
    'class_sec': 'UNKNOWN_COL4',
    'url_host': 'UNKNOWN_COL5',
    'http': 'UNKNOWN_COL6',
    'title': 'UNKNOWN_COL7',
    'ip': 'UNKNOWN_COL8',
    'ip_area': 'UNKNOWN_COL9',
    'jump_url': 'UNKNOWN_COL10',
    'import_source': 'UNKNOWN_COL11',
    'is_gather': 'UNKNOWN_COL12',
    'import_time': 'UNKNOWN_COL13',
    'gather_time': 'UNKNOWN_COL15',
}

# Fields that get a secondary index (one Redis set per field value).
INDEX_FIELDS = ['host', 'url', 'class', 'class_sec', 'url_host', 'http', 'title',
                'ip', 'ip_area', 'jump_url', 'import_source', 'is_gather',
                'import_time', 'gather_time']


def row_to_record(values):
    """Map binlog column values to a Redis hash, replacing None with ''."""
    record = {field: values.get(col) if values.get(col) is not None else ''
              for field, col in COLUMNS.items()}
    # URL-encode the URL fields before storing them.
    record['url'] = quote(str(record['url']))
    record['jump_url'] = quote(str(record['jump_url']))
    return record


# Connect to MySQL and touch every row so that fresh binlog events are produced.
connection = pymysql.connect(host=mysql_settings['host'],
                             user=mysql_settings['user'],
                             password=mysql_settings['password'],
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Lock the web_link_data table while the touch update runs.
        cursor.execute("LOCK TABLES `analyze`.web_link_data WRITE")
        update_query = """
            UPDATE `analyze`.web_link_data
            SET mysql_to_redis_sign = UNIX_TIMESTAMP()
        """
        print(update_query)
        cursor.execute(update_query)
        connection.commit()
        print(f"Updated rows: {cursor.rowcount}")
        # Unlock the table.
        cursor.execute("UNLOCK TABLES")
finally:
    # Close the database connection.
    connection.close()

stream = None
try:
    r = redis.Redis(**redis_settings)
    # Flush the target Redis database (redis_settings['db']) before the full resync.
    r.flushdb()
    # Use a pipeline to batch writes for better performance.
    pipeline = r.pipeline()

    # Resume from the last saved binlog position, if any.
    try:
        with open(binlog_position_file, 'r') as f:
            saved_position = json.load(f)
        binlog_file = saved_position['binlog_file']
        binlog_position = saved_position['binlog_position']
    except FileNotFoundError:
        binlog_file = None
        binlog_position = None

    stream = BinLogStreamReader(connection_settings=mysql_settings,
                                server_id=100,
                                blocking=True,
                                resume_stream=True,
                                only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                log_file=binlog_file,
                                log_pos=binlog_position)

    for binlogevent in stream:
        if binlogevent.schema != schema or binlogevent.table != table:
            continue

        for row in binlogevent.rows:
            if isinstance(binlogevent, WriteRowsEvent):
                print('Inserted row:', row)
                record = row_to_record(row['values'])
                record_id = record['id']
                pipeline.hset(f"record:{record_id}", mapping=record)
                # Add the record id to the index set of every non-empty field.
                for field in INDEX_FIELDS:
                    if record.get(field):
                        pipeline.sadd(f"{field}:{record[field]}", record_id)
                pipeline.execute()

            elif isinstance(binlogevent, DeleteRowsEvent):
                print('Deleted row:', row)
                record_id = row['values'][COLUMNS['id']]
                # Fetch the old record so its index entries can be removed.
                old_data = r.hgetall(f"record:{record_id}")
                if old_data:
                    # Delete the record hash and its index entries.
                    pipeline.delete(f"record:{record_id}")
                    for field in INDEX_FIELDS:
                        if old_data.get(field.encode()):
                            pipeline.srem(f"{field}:{old_data[field.encode()].decode()}", record_id)
                    # Execute all queued operations.
                    pipeline.execute()

            elif isinstance(binlogevent, UpdateRowsEvent):
                print('Updated row:', row)
                record_id = row['before_values'][COLUMNS['id']]
                old_data = r.hgetall(f"record:{record_id}")
                # Rewrite the record hash with the new values.
                new_data = row_to_record(row['after_values'])
                pipeline.hset(f"record:{record_id}", mapping=new_data)
                # Drop the old index entries and add the new ones.
                for field in INDEX_FIELDS:
                    if old_data.get(field.encode()):
                        pipeline.srem(f"{field}:{old_data[field.encode()].decode()}", record_id)
                    if new_data[field]:
                        pipeline.sadd(f"{field}:{new_data[field]}", record_id)
                pipeline.execute()

        # Persist the binlog position after each processed event so the stream
        # can resume from here on restart (the blocking loop above never exits).
        with open(binlog_position_file, 'w') as f:
            json.dump({'binlog_file': stream.log_file, 'binlog_position': stream.log_pos}, f)
finally:
    if stream:
        stream.close()
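Once the record hashes and per-field index sets are in Redis, consumers can intersect the index sets and then fetch the matching hashes. The sketch below is illustrative only; it assumes the record:{id} / {field}:{value} key layout built above, and the concrete field values ('class:news', 'url_host:example.com') are placeholders, not values from the original data.

# Illustrative lookup against the index layout built by the sync script.
import redis

r = redis.Redis(host='xxx', port=6379, db=9, password='xxx', decode_responses=True)

# Intersect two field indexes, e.g. all records of one class under one host.
matching_ids = r.sinter('class:news', 'url_host:example.com')
for record_id in matching_ids:
    record = r.hgetall(f"record:{record_id}")
    print(record_id, record.get('title'), record.get('url'))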