將Mysql資料匯入到ElasticSearch叢集

eff666發表於2016-09-04

  一星期沒有寫博文了,只是因為最近領導交代了一項艱鉅的任務,讓我無暇顧及其他。將mysql資料庫中的資料搬到我們的ES叢集中。mysql資料是我們從阿里那邊拉過來臨時存放資料的地方,現在要將其儲存到我們的叢集中。說道這裡,可能讀者會覺得,這有什麼難度。是的,如果資料量很少,幾百萬或者上千萬確實多花點時間傳送,就可以了。可是,如果是二十幾億的資料量呢,怎麼辦?領導給了我一星期時間,包括我寫程式碼,除錯速度,開始拉資料。為此,我被這二十幾億資料折磨得體無完膚。

  接下來就是我寫程式碼,我用了半天時間寫好了程式碼,進行了測試,除錯了速度,開始部署,程式是跑起來了,我鬆了一口氣。緊接著,我將其部署到正式環境,在ES中建好index,在mysql中建了一個測試表,插入了幾十萬資料,將mysql資料中的資料開始匯入到叢集,似乎還可以的樣子。這時候一天時間已經過去了。

  然後我就將其開始正式操作。這時候問題來了,首先拋開速度不說,mysql中每張表一億多條資料,資料裡面未知的東西太多太多,資料格式、空、各種奇葩字元、特殊字元等。我之前沒有想到的異常處理,改善了程式碼。然後程式可以跑通了。我從mysql中找了一張表,決定先插入100萬資料,試試。問題又來了,從mysql中查詢了100萬資料,但是最終插入到叢集中的卻只有99萬多,還有幾千條資料去哪了,縱使我考慮了各種異常情況,還是有這麼多資料量的丟失。老大給我的要求是資料量丟失控制在百萬分之一啊。無奈,我再次改善程式碼,我列印了更多的log資訊,記錄下插入到叢集中失敗的資料。然後分析了這些資料,再次改進了程式碼,這次100萬資料全部插入進去了。我終於緩了一緩。

  最後,就是除錯速度了,採用多執行緒,如何從mysql中查詢,查詢出來的資料是放到記憶體中然後直接讀取插入到叢集中還是寫入檔案後在讀取檔案插入叢集中,取決於自己了。個人推薦第一種。當然這其中還有很多問題需要考慮,比如連線如果斷掉,失敗資料記錄,磁碟負載等之類,對於大資料量的匯入,需要注意的問題很多。下面我將貼上我最開始的大概程式碼,供大家參考。

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.PropertyNamingStrategy;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.elasticsearch.action.bulk.*;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;

import java.io.*;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.sql.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Date;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Created by xxx on 2016/08/30.
 */
public class FileToEsOrderTest {

    static ConcurrentLinkedQueue<String> queues = new ConcurrentLinkedQueue<String>();
    static AtomicBoolean isInsert = new AtomicBoolean(true);
    static TransportClient client = null;

    public static void main(String[] agrs) throws Exception {
        Settings settings = Settings.settingsBuilder()
                .put("cluster.name", "elasticsearch-cluster").build();
        client = TransportClient.builder().settings(settings).build();

        try {
            client.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName("xxxxx"), 9500));
        } catch (UnknownHostException error) {
            System.out.print(error.getMessage());
        }
        final long aa = System.currentTimeMillis();

        final ConcurrentHashMap<String, Boolean> hashMap = new ConcurrentHashMap();
        for (int i = 0; i < 20; i++) {
            new Thread(new Runnable() {
                Integer num  = 1;
                public void run() {
                    //Add transport addresses and do something with the client...
                    hashMap.put(Thread.currentThread().getName(), Boolean.FALSE);
                    final BulkProcessor bulkProcessor = BulkProcessor.builder(
                            client,
                            new BulkProcessor.Listener() {

                                //批量成功後執行
                                public void afterBulk(long l, BulkRequest bulkRequest, 
                                                    BulkResponse bulkResponse) {
                                      System.out.println("請求數量是:" + 
                                                  bulkRequest.numberOfActions());
                                    if (bulkResponse.hasFailures()) {
                                        for (BulkItemResponse item : 
                                                    bulkResponse.getItems()) {
                                            if (item.isFailed()) {
                                                System.out.println("失敗資訊:--------" + 
                                                item.getFailureMessage());
                                            }
                                        }
                                    }
                                }

                                //批量提交之前執行
                                public void beforeBulk(long executionId,
                                                       BulkRequest request) {
                                }

                                //批量失敗後執行
                                public void afterBulk(long executionId,
                                                      BulkRequest request,
                                                      Throwable failure) {
                                    System.out.println("happen fail = " +
                                     failure.getMessage() + " ,
                                     cause = " + failure.getCause());
                                }
                            })
                            .setBulkActions(10000)
                            .setBulkSize(new ByteSizeValue(100, ByteSizeUnit.MB))
                            .setBackoffPolicy(
                                    BackoffPolicy.exponentialBackoff(
                                            TimeValue.timeValueMillis(100), 3))
                            .setConcurrentRequests(1)
                            .build();
                    while (true) {
                        if (!queues.isEmpty()) {
                            try {
                                String json = queues.poll();
                                if (json == null) continue;
                                int index1 = json.indexOf("checksum");
                                int index2 = json.indexOf("}", index1);
                                index1 += 10;
                                String id = json.substring(index1 + 1, index2 - 1);

                                int index3 = json.indexOf("dp_id");
                                int index4 = json.indexOf(",", index3);
                                index3 += 7;
                                String routing = json.substring(index3 + 1, index4 - 1);
                                count++;
                                bulkProcessor.add(new IndexRequest("xxxx", 
                                    "xxxxx").id(id).routing(routing).source(json));
                            } catch (Exception e) {
                                System.out.print(e.getMessage());
                            }
                        }
                        if (queues.isEmpty() && !isInsert.get()) {
                            bulkProcessor.flush();
                            long jjj = System.currentTimeMillis() - aa;
                            System.out.print("   " + Thread.currentThread().getName()
                                         + ":" + jjj + "   ");
                            hashMap.put(Thread.currentThread().getName(), Boolean.TRUE);
                            while (hashMap.values().contains(Boolean.FALSE)) {
                                try {
                                    Thread.currentThread().sleep(1 * 1000);
                                } catch (Exception e) {
                                    e.printStackTrace(System.out);
                                }
                            }
                            bulkProcessor.close();
                            break;
                        }
                    }
                }
            }).start();
        }


//        File file = new File("/test/rfm/rfm_data.txt");
//        FileOutputStream fileOutputStream = new FileOutputStream((file));
//        OutputStreamWriter outputStreamWriter = 
//                      new OutputStreamWriter(fileOutputStream);
//        bufferedWriter = new BufferedWriter(outputStreamWriter);
        for(int i = 2; i <= 23; i++){
            WriteData("xxx" + i);
        }
//        WriteData("rfm_1");
//        bufferedWriter.close();
//        outputStreamWriter.close();
//        fileOutputStream.close();
        System.out.println("資料寫入完畢");
    }

    // 寫資料
    public static void WriteData(String tableName) throws IOException {
        Connection conn = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        Integer count = 1;
        List<String> columnName = Arrays.asList("trade_last_interval","trade_first_interval");
        List<String> columnDateName = Arrays.asList("modify","trade_first_time","trade_last_time");
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://xxxxxxxxx";
            conn = DriverManager.getConnection(url, "xxxx", "xxxx");
            System.out.println("寫入資料開始,成功連線MySQL:" + tableName);

            String sql = "select * from " + tableName;
            ps = conn.prepareStatement(sql,
                    ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
            ps.setFetchSize(Integer.MIN_VALUE);
            rs = ps.executeQuery();

            ResultSetMetaData rsmd = rs.getMetaData();
            int colCount = rsmd.getColumnCount();
            ObjectMapper objectMapper = new ObjectMapper()
                    .setSerializationInclusion(JsonInclude.Include.NON_NULL)
                    .setDateFormat(new SimpleDateFormat("yyyy-MM-dd"))
                    .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
                    .disable(SerializationFeature.FAIL_ON_EMPTY_BEANS)
                    .setPropertyNamingStrategy(
                    PropertyNamingStrategy.CAMEL_CASE_TO_LOWER_CASE_WITH_UNDERSCORES);

            while(rs.next())  {  //while控制行數
                Map<String, String> map = new LinkedHashMap<>();
                 //StringBuilder buffer = new StringBuilder();
                for(int i = 1; i <= colCount; i++  )   {
                    String name = rsmd.getColumnName(i);
                    if(!columnName.contains(name)) {
                        String value = rs.getString(i);
                        boolean flag = true;
                        if(columnDateName.contains(name)){
                            try {
                                dateFormat.parse(value);
                            } catch (Exception e){
                                flag = false;
                            }
                        } else if("buyer_nick".equalsIgnoreCase(name)){
                            value = encrypt(value);
                        }
                        if (flag && value != null && !"".equals(value.trim()) && value.trim().length() > 0) {
                        //buffer.append("\"" + name + "\":\"" + value + "\"");
                        //buffer.append(",");
                            map.put(name, value);
                        }
                    }
                }
                count++;

                if(map != null && map.size() > 0){
                    queues.add(objectMapper.writeValueAsString(map));
                }

                if(count % 200000 == 0){
                    int number = queues.size();
                    int jj = number/200000;
                    System.out.println("index: " + count + ", 
                                        jj: " + jj + ", number: " + number);
                    while(jj > 0){
                        try {
                            Thread.sleep(2000*jj);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                        int number2 = queues.size();
                        jj = number2 / 200000;
                        System.out.println("index2: " + count + ",
                                         jj: " + jj + ", number2: " + number2);
                    }
                }
            }
            isInsert = new AtomicBoolean(false);

        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }catch (SQLException e) {
            e.printStackTrace();
        } finally {
            try {
                if(rs != null) {
                    rs.close();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
            try {
                if(ps != null) {
                    ps.close();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
            try {
                if(conn != null) {
                    conn.close();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        System.out.println(tableName + "資料寫入完畢,共有資料:" + count);
        try {
            Thread.sleep(100000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}

相關文章