Syncing MySQL data to Redis

Posted by 阿久丶xin on 2024-05-13

The sync is driven by the MySQL binlog: row events are read with python-mysql-replication and replayed into Redis.
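
Reading row events this way requires the MySQL server to have the binlog enabled with binlog_format=ROW. On MySQL 8.0 the binlog_row_metadata variable can additionally be set to FULL so the replication stream carries real column names; with the default MINIMAL metadata the columns show up as the UNKNOWN_COL* placeholders handled in the script below. A small pre-flight check, sketched under the assumption that the mysql_settings dict from the script is reused:

import pymysql

def check_binlog_settings(settings):
    """Print warnings if the server is not set up for row-based binlog streaming."""
    conn = pymysql.connect(host=settings['host'], port=settings['port'],
                           user=settings['user'], password=settings['password'])
    try:
        with conn.cursor() as cur:
            cur.execute("SHOW VARIABLES LIKE 'binlog_format'")
            fmt = cur.fetchone()
            if fmt is None or fmt[1] != 'ROW':
                print("binlog_format should be ROW for row events")
            cur.execute("SHOW VARIABLES LIKE 'binlog_row_metadata'")
            meta = cur.fetchone()  # this variable only exists on MySQL 8.0+
            if meta and meta[1] != 'FULL':
                print("binlog_row_metadata is not FULL; columns will appear as UNKNOWN_COL*")
    finally:
        conn.close()

Calling check_binlog_settings(mysql_settings) before starting the stream gives an early warning instead of a confusing empty sync.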

import pymysql
import redis
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
import json
from urllib.parse import quote

# Processing flow:
# 1. Update the mysql_to_redis_sign column of every row in the MySQL table to the
#    current timestamp. This forces an update binlog event for each existing row,
#    which is what gets the data pushed into Redis.
# 2. Flush all data in the target Redis database (db 9 in the settings below).
# 3. Replay the binlog row events into Redis.

# Connection settings (the 'xxx' values are placeholders)
mysql_settings = {'host': 'xxx', 'port': 3306, 'user': 'root', 'password': 'xxx'}
binlog_position_file = 'binlog_position.json'
schema = 'analyze'
table = 'web_link_data'
redis_settings = {'host': 'xxx', 'port': 6379, 'db': 9, 'password': 'xxx'}

# Connect to the MySQL database
connection = pymysql.connect(host=mysql_settings['host'],
                             port=mysql_settings['port'],
                             user=mysql_settings['user'],
                             password=mysql_settings['password'],
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Lock the web_link_data table
        cursor.execute("LOCK TABLES analyze.web_link_data WRITE")
        # Build the update statement: touching every row forces an update
        # binlog event for each existing record
        update_query = """
        UPDATE analyze.web_link_data
        SET mysql_to_redis_sign = UNIX_TIMESTAMP()
        """
        print(update_query)
        # Run the update
        cursor.execute(update_query)
        # Commit the change
        connection.commit()
        print(f"Updated rows: {cursor.rowcount}")
        # Unlock the table
        cursor.execute("UNLOCK TABLES")
finally:
    # Close the database connection
    connection.close()


# Make sure the finally block can always reference the stream
stream = None
try:
    r = redis.Redis(**redis_settings)
    # Flush the target Redis database
    r.flushdb()
    # Use a pipeline to batch writes for better performance
    pipeline = r.pipeline()
    # Resume from the last saved binlog position if one exists
    try:
        with open(binlog_position_file, 'r') as f:
            saved_position = json.load(f)
            binlog_file = saved_position['binlog_file']
            binlog_position = saved_position['binlog_position']
    except FileNotFoundError:
        binlog_file = None
        binlog_position = None

    stream = BinLogStreamReader(connection_settings=mysql_settings, server_id=100, blocking=True,
                                resume_stream=True,
                                only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                log_file=binlog_file, log_pos=binlog_position)
    for binlogevent in stream:
        if binlogevent.schema == schema and binlogevent.table == table:
            for row in binlogevent.rows:
                row = dict(row)
                if isinstance(binlogevent, WriteRowsEvent):
                    print('insert row:', row)
                    # The replication stream only exposes positional column names
                    # (UNKNOWN_COL0, UNKNOWN_COL1, ...), so map them to field names here
                    column_map = {
                        'id': 'UNKNOWN_COL0',
                        'host': 'UNKNOWN_COL1',
                        'url': 'UNKNOWN_COL2',
                        'class': 'UNKNOWN_COL3',
                        'class_sec': 'UNKNOWN_COL4',
                        'url_host': 'UNKNOWN_COL5',
                        'http': 'UNKNOWN_COL6',
                        'title': 'UNKNOWN_COL7',
                        'ip': 'UNKNOWN_COL8',
                        'ip_area': 'UNKNOWN_COL9',
                        'jump_url': 'UNKNOWN_COL10',
                        'import_source': 'UNKNOWN_COL11',
                        'is_gather': 'UNKNOWN_COL12',
                        'import_time': 'UNKNOWN_COL13',
                        'gather_time': 'UNKNOWN_COL15',
                    }
                    values = row['values']
                    # Replace NULLs with empty strings so every field can be written to Redis
                    row = {name: (values[col] if values.get(col) is not None else '')
                           for name, col in column_map.items()}

                    id = row['id']
                    # URL-encode the URL fields
                    row['url'] = quote(row['url'])
                    row['jump_url'] = quote(row['jump_url'])
                    # Store the record as a hash keyed by id
                    pipeline.hset(f"record:{id}", mapping=row)
                    fields = [
                        'host', 'url', 'class', 'class_sec', 'url_host', 'http', 'title',
                        'ip', 'ip_area', 'jump_url', 'import_source', 'is_gather', 'import_time', 'gather_time']
                    # Add a set-based index entry for every non-empty field
                    for field in fields:
                        if row.get(field):  # only index fields that exist and have a value
                            pipeline.sadd(f"{field}:{row[field]}", row['id'])
                    pipeline.execute()
                elif isinstance(binlogevent, DeleteRowsEvent):
                    print('delete row:', row)
                    id = row['values']['UNKNOWN_COL0']
                    # Fetch the old record
                    old_data = r.hgetall(f"record:{id}")
                    if old_data:
                        # Delete the stored record
                        pipeline.delete(f"record:{id}")
                        # Remove the id from the old index sets
                        fields = ['host', 'url', 'class', 'class_sec', 'url_host', 'http', 'title', 'ip', 'ip_area',
                                  'jump_url', 'import_source', 'is_gather', 'import_time', 'gather_time']
                        for field in fields:
                            if old_data.get(field.encode()):
                                pipeline.srem(f"{field}:{old_data[field.encode()].decode()}", id)
                        # Run all queued operations
                        pipeline.execute()
                elif isinstance(binlogevent, UpdateRowsEvent):
                    print('update row:', row)
                    before_values = row['before_values']
                    after_values = row['after_values']
                    id = before_values['UNKNOWN_COL0']
                    old_data = r.hgetall(f"record:{id}")
                    # Build the new record from the after-image of the row
                    new_data = {
                        'id': after_values.get('UNKNOWN_COL0', ''),
                        'host': after_values.get('UNKNOWN_COL1', ''),
                        'url': quote(str(after_values.get('UNKNOWN_COL2', ''))),
                        'class': after_values.get('UNKNOWN_COL3', ''),
                        'class_sec': after_values.get('UNKNOWN_COL4', ''),
                        'url_host': after_values.get('UNKNOWN_COL5', ''),
                        'http': after_values.get('UNKNOWN_COL6', ''),
                        'title': after_values.get('UNKNOWN_COL7', ''),
                        'ip': after_values.get('UNKNOWN_COL8', ''),
                        'ip_area': after_values.get('UNKNOWN_COL9', ''),
                        'jump_url': quote(str(after_values.get('UNKNOWN_COL10', ''))),
                        'import_source': after_values.get('UNKNOWN_COL11', ''),
                        'is_gather': after_values.get('UNKNOWN_COL12', ''),
                        'import_time': after_values.get('UNKNOWN_COL13', ''),
                        'gather_time': after_values.get('UNKNOWN_COL15', '')
                    }
                    # Overwrite the stored record
                    pipeline.hset(f"record:{id}", mapping=new_data)
                    # Rebuild the field indexes
                    fields = [
                        'host', 'url', 'class', 'class_sec', 'url_host', 'http', 'title',
                        'ip', 'ip_area', 'jump_url', 'import_source', 'is_gather', 'import_time', 'gather_time'
                    ]
                    # Drop the old index entries and add the new ones
                    for field in fields:
                        if old_data.get(field.encode()):
                            pipeline.srem(f"{field}:{old_data[field.encode()].decode()}", id)
                        if new_data[field]:
                            pipeline.sadd(f"{field}:{new_data[field]}", id)
                    pipeline.execute()
        # Persist the current binlog position so the next run can resume from here
        with open(binlog_position_file, 'w') as f:
            json.dump({'binlog_file': stream.log_file, 'binlog_position': stream.log_pos}, f)

finally:
    if stream:
        stream.close()
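
After the sync has run, Redis holds one hash per row (record:{id}) plus one set per indexed field value ({field}:{value}) containing the matching ids. A minimal lookup sketch against that layout; the host and class values below are made-up examples:

import redis

r = redis.Redis(host='xxx', port=6379, db=9, password='xxx', decode_responses=True)

# ids of rows whose host matches a hypothetical value
ids = r.smembers("host:example.com")

# intersect two field indexes, e.g. host AND class (both values are placeholders)
matching = r.sinter("host:example.com", "class:news")

# fetch the full record for each matching id
for record_id in matching:
    print(r.hgetall(f"record:{record_id}"))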
